# Fine-Tuning GPT-2 on Movie Dialogue

In [None]:
# Install required libraries
!pip install transformers datasets torch

# Import necessary modules
import ast
import getpass
import json
import os

from datasets import Dataset, DatasetDict
from google.colab import drive
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, pipeline



In [None]:
# Obtain the Hugging Face token without hard-coding it in the notebook:
# prefer an existing HF_TOKEN environment variable, otherwise ask for it
# through a hidden prompt so the secret never lands in the saved file.
HF_TOKEN = os.environ.get("HF_TOKEN", "").strip()
if not HF_TOKEN:
    HF_TOKEN = getpass.getpass("Hugging Face token: ").strip()

# Fail early with a clear message instead of sending a blank token to the Hub
if not HF_TOKEN:
    raise ValueError("No Hugging Face token provided; set HF_TOKEN or enter it at the prompt.")

# Log in to Hugging Face
login(token=HF_TOKEN)

# Store the token in environment variables so later cells can read it
os.environ["HF_TOKEN"] = HF_TOKEN

print("✅ Successfully logged in to Hugging Face!")


✅ Successfully logged in to Hugging Face!


In [None]:
# Read the token stored by the login cell; a blank/whitespace-only value is
# treated as "no token" so it is never forwarded as a credential.
hf_token = (os.getenv("HF_TOKEN") or "").strip() or None

# Load GPT-2 model with authentication.
# `token=` is the current name of the deprecated `use_auth_token=` argument.
model = AutoModelForCausalLM.from_pretrained("gpt2", token=hf_token)

# Load tokenizer and set a padding token
tokenizer = AutoTokenizer.from_pretrained("gpt2", token=hf_token)
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 does not have a padding token by default

# Keep the embedding matrix sized to the tokenizer. Reusing EOS as PAD adds no
# new token, so this should be a no-op here; it only matters if tokens are added.
model.resize_token_embeddings(len(tokenizer))

# Record the padding id on the model config as well, so code that reads it
# from the model (e.g. generation) agrees with the tokenizer.
model.config.pad_token_id = tokenizer.pad_token_id

print("✅ GPT-2 Model and Tokenizer Loaded Successfully!")




✅ GPT-2 Model and Tokenizer Loaded Successfully!


In [None]:
# Attach Google Drive; the corpus files read later live under this mount point
mount_point = '/content/drive'
drive.mount(mount_point)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Folder holding the raw corpus files and the field separator they use
DATA_DIR = "/content/drive/MyDrive/archive (3)"
FIELD_SEP = " +++$+++ "

# Map every line ID to its rendered "CHARACTER: text" form
movie_lines = {}

# Read movie_lines.txt and store character dialogues
with open(os.path.join(DATA_DIR, "movie_lines.txt"), "r", encoding="utf-8", errors="ignore") as f:
    for line in f:
        parts = line.strip().split(FIELD_SEP)
        # Only rows with all five fields are usable; anything else is skipped
        if len(parts) == 5:
            line_id, character, text = parts[0], parts[3], parts[4]
            movie_lines[line_id] = f"{character}: {text}"  # Format as "Character: Dialogue"

# One entry per conversation, each a single space-joined string of its lines
conversations = []

# Read movie_conversations.txt and match dialogues
with open(os.path.join(DATA_DIR, "movie_conversations.txt"), "r", encoding="utf-8", errors="ignore") as f:
    for line in f:
        parts = line.strip().split(FIELD_SEP)
        if len(parts) != 4:
            continue

        # The last field is a Python-style list literal of line IDs. Parse it
        # with ast.literal_eval, which only accepts literals, instead of
        # eval(), which would execute arbitrary code found in the file.
        try:
            line_ids = ast.literal_eval(parts[3])
        except (ValueError, SyntaxError):
            continue  # malformed row: skip it rather than abort the whole run
        if not isinstance(line_ids, (list, tuple)):
            continue

        dialogue = [movie_lines[line_id] for line_id in line_ids if line_id in movie_lines]

        # Skip conversations where no line could be resolved; an empty string
        # would otherwise become a blank training sample.
        if dialogue:
            conversations.append(" ".join(dialogue))  # Join lines into a single conversation

# Save formatted conversations to a text file, one conversation per paragraph
with open("/content/formatted_conversations.txt", "w", encoding="utf-8") as f:
    f.write("\n\n".join(conversations))  # Save as plain text for fine-tuning

print("✅ Preprocessing complete! Check formatted_conversations.txt")

✅ Preprocessing complete! Check formatted_conversations.txt


In [None]:
# Path of the preprocessed corpus and how many characters of it to show
file_path = "/content/formatted_conversations.txt"
preview_length = 1000

try:
    with open(file_path, mode="r", encoding="utf-8", errors="ignore") as fh:
        # Load the whole file, then print just its leading slice as a preview
        file_content = fh.read()
        print(file_content[0:preview_length])

except FileNotFoundError:
    # Missing file gets its own message that names the path
    print(f"Error: File not found at {file_path}")
except Exception as err:
    # Any other failure is reported rather than raised
    print(f"An error occurred: {err}")


BIANCA: Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again. CAMERON: Well, I thought we'd start with pronunciation, if that's okay with you. BIANCA: Not the hacking and gagging and spitting part.  Please. CAMERON: Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?

BIANCA: You're asking me out.  That's so cute. What's your name again? CAMERON: Forget it.

BIANCA: No, no, it's my fault -- we didn't have a proper introduction --- CAMERON: Cameron. BIANCA: The thing is, Cameron -- I'm at the mercy of a particularly hideous breed of loser.  My sister.  I can't date until she does. CAMERON: Seems like she could get a date easy enough...

CAMERON: Why? BIANCA: Unsolved mystery.  She used to be really popular when she started high school, then it was just like she got sick of it or something. CAMERON: That's a shame.

BIANCA: Gosh, if only we could find Kat a boyfriend... CAMERON: Let me s

In [None]:
# Read the preprocessed corpus back; a blank line separates two conversations
with open("/content/formatted_conversations.txt", mode="r", encoding="utf-8") as corpus_file:
    corpus_text = corpus_file.read()
conversations = corpus_text.split("\n\n")

# Wrap the list in a Hugging Face Dataset with a single "text" column
dataset = Dataset.from_dict(dict(text=conversations))

# Show the first conversation as a sanity check
print("✅ Dataset Loaded Successfully! Sample:")
print(dataset[0]["text"])

✅ Dataset Loaded Successfully! Sample:
BIANCA: Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again. CAMERON: Well, I thought we'd start with pronunciation, if that's okay with you. BIANCA: Not the hacking and gagging and spitting part.  Please. CAMERON: Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?


In [None]:
# Subsample first, then split exactly once, so that:
#   * the sizes printed below are the sizes actually used for training, and
#   * no data is discarded by an intermediate split.
# Both steps are seeded so a fresh run reproduces the same train/eval sets.
MAX_SAMPLES = 5000  # cap on the number of conversations used for fine-tuning
SPLIT_SEED = 42

# Never ask for more rows than exist (select() raises on out-of-range indices)
subset_size = min(MAX_SAMPLES, len(dataset))
dataset = dataset.shuffle(seed=SPLIT_SEED).select(range(subset_size))

# Split the reduced dataset into 90% train and 10% validation
# NOTE(review): `dataset` is rebound from a flat dataset to a train/test dict
# here, so re-running this cell alone will likely fail — re-run the loading
# cell first.
dataset = dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)

# Print dataset sizes
print(f"✅ Dataset Split Successfully! Train size: {len(dataset['train'])}, Eval size: {len(dataset['test'])}")


✅ Dataset Split Successfully! Train size: 74787, Eval size: 8310


In [None]:
# Batch tokenizer used by `dataset.map`
def tokenize_function(examples):
    """Tokenize the "text" column of a batch to fixed-length sequences.

    Every sequence is truncated or padded to exactly 128 tokens. No "labels"
    field is produced here; a later cell in this notebook redefines this
    function to add one.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, max_length=128, padding="max_length", truncation=True)
    return encoded

# Run the tokenizer over the dataset in batches
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Pull out the "train" and "test" splits under the names used by later cells
train_dataset, eval_dataset = (tokenized_datasets[split] for split in ("train", "test"))

print("✅ Dataset Tokenized Successfully!")

Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

✅ Dataset Tokenized Successfully!


In [None]:
# Hyper-parameters and bookkeeping options for the fine-tuning run
training_args = TrainingArguments(
    output_dir="/content/gpt2-finetuned",
    # `eval_strategy` is the current name of the deprecated
    # `evaluation_strategy` argument.
    eval_strategy="epoch",  # Evaluate at the end of each epoch
    save_strategy="epoch",  # Save the model at the end of each epoch
    per_device_train_batch_size=1,  # Reduce if Colab crashes
    num_train_epochs=2,  # Training epochs
    logging_dir="/content/logs",
    weight_decay=0.01,
    save_total_limit=2  # Keep only the last 2 checkpoints
)

print("✅ Training Arguments Set!")

✅ Training Arguments Set!




In [None]:
def tokenize_function(examples):
    """Tokenize a batch of conversations and build causal-LM labels.

    Each text gets the tokenizer's EOS token appended so the model sees an
    explicit end-of-conversation marker, then is truncated/padded to 128
    tokens. Labels mirror `input_ids` on real tokens and are -100 on padded
    positions.

    Args:
        examples: Batch mapping with a "text" entry holding a list of strings.

    Returns:
        The tokenizer output with an added "labels" entry (list of lists of
        ints, same shape as "input_ids").
    """
    # PAD is the same token as EOS in this notebook, so mark the true end of
    # each conversation with a real (attended) EOS before padding is added.
    texts = [text + tokenizer.eos_token for text in examples["text"]]

    tokenized_inputs = tokenizer(
        texts, truncation=True, padding="max_length", max_length=128
    )

    # Labels are a copy of input_ids (the model shifts them internally), except
    # that padded positions (attention_mask == 0) are set to -100 so they are
    # excluded from the loss instead of being trained on as real tokens.
    tokenized_inputs["labels"] = [
        [token_id if attended else -100 for token_id, attended in zip(ids, mask)]
        for ids, mask in zip(
            tokenized_inputs["input_ids"], tokenized_inputs["attention_mask"]
        )
    ]

    return tokenized_inputs

# Re-run tokenization in batches, this time with the label-producing function
tokenized_datasets = dataset.map(function=tokenize_function, batched=True)

# Expose the two splits under the names the Trainer cell expects
train_dataset, eval_dataset = tokenized_datasets["train"], tokenized_datasets["test"]

print("✅ Dataset Tokenized with Labels Successfully!")


Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

✅ Dataset Tokenized with Labels Successfully!


In [None]:
# Bundle the model, the training configuration and both data splits
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # Training dataset
    eval_dataset=eval_dataset,  # Validation dataset
)

# Announce the start BEFORE the blocking call: trainer.train() only returns
# once training has finished, so printing afterwards would be misleading.
print("🔥 Training Started!")

# Start fine-tuning
trainer.train()

print("✅ Training Complete!")


[34m[1mwandb[0m: Currently logged in as: [33mbheemesh-1[0m ([33mbheemesh-1-prom-iit-rajasthan[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss
1,1.5666,1.438999
2,1.3252,1.441769


🔥 Training Started!


In [None]:
# Write the fine-tuned model and the tokenizer into the same directory
save_dir = "/content/gpt2-finetuned"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)

print("✅ Model Saved Successfully!")


✅ Model Saved Successfully!


In [None]:
# Load the fine-tuned model from disk into a text-generation pipeline
generator = pipeline("text-generation", model="/content/gpt2-finetuned", tokenizer=tokenizer)

# Generate a conversation
prompt = "BIANCA: Hey, how was your day?"
response = generator(
    prompt,
    # Bound the number of *generated* tokens. `max_length` also counts the
    # prompt, and appears to be what triggered the tokenizer truncation
    # warning in this cell's recorded output.
    max_new_tokens=100,
    num_return_sequences=1,
    # GPT-2 has no dedicated pad token; state explicitly that EOS is used
    pad_token_id=tokenizer.eos_token_id,
)

# Print the generated dialogue
print("🎬 Movie Dialogue Generated:")
print(response[0]['generated_text'])


Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


🎬 Movie Dialogue Generated:
BIANCA: Hey, how was your day? RUSSIAN: Very good.
