## Required Libraries

In [4]:
%pip install tensorboard

Collecting tensorboard
  Using cached tensorboard-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Collecting absl-py>=0.4 (from tensorboard)
  Downloading absl_py-2.3.1-py3-none-any.whl.metadata (3.3 kB)
Collecting grpcio>=1.48.2 (from tensorboard)
  Downloading grpcio-1.73.1-cp313-cp313-win_amd64.whl.metadata (4.0 kB)
Collecting markdown>=2.6.8 (from tensorboard)
  Downloading markdown-3.8.2-py3-none-any.whl.metadata (5.1 kB)
Collecting protobuf!=4.24.0,>=3.19.6 (from tensorboard)
  Downloading protobuf-6.31.1-cp310-abi3-win_amd64.whl.metadata (593 bytes)
Collecting tensorboard-data-server<0.8.0,>=0.7.0 (from tensorboard)
  Using cached tensorboard_data_server-0.7.2-py3-none-any.whl.metadata (1.1 kB)
Collecting werkzeug>=1.0.1 (from tensorboard)
  Using cached werkzeug-3.1.3-py3-none-any.whl.metadata (3.7 kB)
Using cached tensorboard-2.19.0-py3-none-any.whl (5.5 MB)
Downloading absl_py-2.3.1-py3-none-any.whl (135 kB)
Downloading grpcio-1.73.1-cp313-cp313-win_amd64.whl (4.3 MB)
   ----------


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# Dataset PreProcessing

In [2]:
from datasets import Dataset
from transformers import AutoTokenizer

# Load data - FIXED splitting method
with open(r"C:\Users\adars\OneDrive\Desktop\Dataset.txt", "r", encoding="utf-8") as f:
    # Try both single and double newlines as separators
    content = f.read()
    tweets = [p.strip() for p in content.splitlines() if p.strip()]  # Split by any newline
    
    # Verify we got multiple tweets
    print(f"Loaded {len(tweets)} tweets")  # Debug output

# Ensure we have enough data
if len(tweets) < 2:
    raise ValueError(f"Only {len(tweets)} tweets found - need at least 2 for train/test split")

dataset = Dataset.from_dict({"text": tweets})

# Adjusted splitting - ensure minimum 1 sample in each set
test_size = max(1, int(0.1 * len(tweets)))  # At least 1 sample for test
dataset = dataset.train_test_split(
    test_size=test_size,
    train_size=len(tweets)-test_size,  # Explicit sizes
    seed=42
)

# Tokenizer setup
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(example):
    return tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=64  # Reduced for tweets
    )

tokenized_datasets = dataset.map(tokenize_function, batched=True)

  from .autonotebook import tqdm as notebook_tqdm


Loaded 158 tweets


Map: 100%|██████████| 143/143 [00:00<00:00, 18306.79 examples/s]
Map: 100%|██████████| 15/15 [00:00<00:00, 3690.00 examples/s]


# Fine Tuning GPT2 Model

In [11]:
# Install packages without version constraints
!pip install torch transformers datasets

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import Dataset

# 1. Load and prepare dataset
with open("Dataset.txt", "r", encoding="utf-8") as f:
    tweets = [line.strip() for line in f.readlines() if line.strip()]

print(f"Loaded {len(tweets)} tweets")
if len(tweets) < 10:
    raise ValueError(f"Insufficient data: only {len(tweets)} tweets found. Need at least 10.")

dataset = Dataset.from_dict({"text": tweets})
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)

# 2. Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    # Remove return_tensors="pt" for compatibility with datasets.map
    tokens = tokenizer(
        examples["text"],
        truncation=True,
        max_length=64,
        padding="max_length"
    )
    # Add labels for causal LM: labels = input_ids
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

tokenized_train = split_dataset["train"].map(tokenize_function, batched=True)
tokenized_eval = split_dataset["test"].map(tokenize_function, batched=True)

# 3. Initialize model
model = AutoModelForCausalLM.from_pretrained("gpt2")
model.config.pad_token_id = tokenizer.pad_token_id

# 4. Configure training arguments (universal parameters)
base_args = {
    "output_dir": "./results",
    "per_device_train_batch_size": 4,
    "per_device_eval_batch_size": 4,
    "num_train_epochs": 50,
    "logging_steps": 100,
    "save_total_limit": 2,
    "learning_rate": 5e-5,
    "weight_decay": 0.01,
    "gradient_accumulation_steps": 2,
}

# Add version-specific parameters dynamically
try:
    training_args = TrainingArguments(
        **base_args,
        evaluation_strategy="steps",
        eval_steps=500,
        save_strategy="steps",
        save_steps=500,
        fp16=torch.cuda.is_available(),
    )
except TypeError:
    try:
        training_args = TrainingArguments(
            **base_args,
            evaluate_during_training=True,
            save_steps=500,
            fp16=torch.cuda.is_available(),
        )
    except TypeError:
        training_args = TrainingArguments(
            **base_args,
            fp16=torch.cuda.is_available(),
        )

# 5. Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
)

# 6. Train the model
print("Starting training...")
trainer.train()

# 7. Save model
trainer.save_model("./trained_model")
tokenizer.save_pretrained("./trained_model")

# 8. Generation function
def generate_tweet(prompt, max_length=50):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(
        inputs.input_ids,
        max_length=max_length,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
        pad_token_id=tokenizer.pad_token_id
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test generation
print("\nGenerated Tweet Examples:")
print(generate_tweet("I'm not saying I'm a good cook, but"))
print(generate_tweet("My dog is so smart,"))


Loaded 158 tweets



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: C:\Users\adars\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip
Map: 100%|██████████| 126/126 [00:00<00:00, 18050.49 examples/s]
Map: 100%|██████████| 32/32 [00:00<00:00, 6721.98 examples/s]


Starting training...




Step,Training Loss
100,0.7142
200,0.1898
300,0.1259
400,0.1074
500,0.098
600,0.0943
700,0.0885
800,0.0886





Generated Tweet Examples:
I'm not saying I'm a good cook, but my fire extinguisher has never been used. As a paperweight.
My dog is so smart, he has no limits.
