<a href="https://colab.research.google.com/github/Ace007-0/Option-3.-Generative-AI---Togo-AI-Labs/blob/main/gpt2_finetune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install Dependencies
# Uninstall default torch to avoid version conflicts
!pip uninstall -y torch torchvision torchaudio

# Install CUDA-compatible PyTorch
!pip install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cu121

# Install required NLP libraries
!pip install transformers==4.44.2 datasets evaluate nltk pandas peft==0.13.2 sentence-transformers


In [None]:
import pandas as pd
import torch, tqdm, re
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer, set_seed, get_linear_schedule_with_warmup
from peft import LoraConfig, get_peft_model
from google.colab import files
import nltk, evaluate

# Fetch the NLTK data packages this notebook asks for
for nltk_resource in ("punkt", "wordnet"):
    nltk.download(nltk_resource)

# Fix the random seed, then prefer the GPU whenever one is visible
set_seed(42)
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")


In [None]:
"""
Upload Dataset
Prepare a CSV named dialogs.csv with two columns:
question → user input
answer → bot response
"""

print("Upload dialogs.csv (CSV dataset)")
uploaded = files.upload()
# Nothing uploaded (e.g. the dialog was dismissed): fail with an actionable
# message instead of an IndexError from indexing an empty key list.
if not uploaded:
    raise RuntimeError("No file was uploaded; re-run this cell and select dialogs.csv.")
# Only the first uploaded file is used as the dataset.
dataset_path = next(iter(uploaded))
print(f"Uploaded dataset: {dataset_path}")



In [None]:
"""
Dataset Class
This class processes the dataset into conversational format for training:
"""
class ConversationData(Dataset):
    """Conversation dataset built from a question/answer CSV.

    Consecutive CSV rows are grouped into multi-turn conversations, rendered
    as ``<start> Human: ... <turn> Bot: ... <turn> <end>`` strings and
    tokenized once in the constructor.

    Args:
        path: CSV file containing ``question`` and ``answer`` columns.
        tokenizer: Callable applied to the list of conversation strings; its
            result must expose ``input_ids`` and ``attention_mask``.
        max_samples: Maximum number of conversations kept.
        min_turns: A conversation is closed once it holds at least this many
            turns (every CSV row contributes two: one Human, one Bot).

    Each item is an ``(input_ids, attention_mask)`` pair for one conversation.
    """

    def __init__(self, path, tokenizer, max_samples=1000, min_turns=6):
        # Load + clean dataset
        df = pd.read_csv(path, sep=',', on_bad_lines='skip')
        # Drop missing cells BEFORE casting to str: astype(str) would turn NaN
        # into the literal text "nan", which dropna can no longer detect and
        # which would then be trained on as if it were a real utterance.
        df = df.dropna(subset=["question", "answer"]).copy()
        df["question"] = df["question"].astype(str).str.strip()
        df["answer"] = df["answer"].astype(str).str.strip()
        df = df[(df["question"] != "") & (df["answer"] != "")]

        # Build conversations
        convs, conv = [], []
        for question, answer in zip(df["question"], df["answer"]):
            conv.append(("Human", question))
            conv.append(("Bot", answer))
            if len(conv) >= min_turns:
                formatted = self.format_conversation(conv)
                if formatted:
                    convs.append(formatted)
                conv = []
        # Leftover rows that did not fill a whole conversation are still kept.
        if conv:
            formatted = self.format_conversation(conv)
            if formatted:
                convs.append(formatted)

        self.X = convs[:max_samples]
        print("Dataset size:", len(self.X))

        # Fallback if dataset too small
        if len(self.X) == 0:
            print("Warning: No conversations meet min_turns criteria.")
            self.X = ["<start> Human: Hello <turn> Bot: Hi <end>"] * 10

        # Tokenization: every sample is padded/truncated to 512 tokens.
        enc = tokenizer(self.X, max_length=512, truncation=True,
                        padding="max_length", return_tensors="pt")
        self.input_ids, self.attn_mask = enc.input_ids, enc.attention_mask

    def format_conversation(self, conv):
        """Render ``(role, message)`` pairs as one ``<start> ... <end>`` string."""
        text = "<start> "
        for r, msg in conv:
            text += f"{r}: {msg} <turn> "
        return text.strip() + " <end>"

    def __len__(self): return len(self.X)
    def __getitem__(self, i): return self.input_ids[i], self.attn_mask[i]


In [None]:
# Tokenizer Setup
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# The pad token is taken from the checkpoint's EOS token *before* EOS is
# replaced below, so padding and the "<end>" marker remain distinct tokens.
tokenizer.pad_token = tokenizer.eos_token
# Register the conversation markers used by the training text format.
tokenizer.add_special_tokens(dict(
    bos_token="<start>",
    eos_token="<end>",
    additional_special_tokens=["<turn>"],
))


In [None]:
"""
Few-Shot Baseline (No Training)
Before fine-tuning, test GPT-2 directly with handcrafted examples:
"""
# Stock tokenizer (no added special tokens) paired with an unmodified GPT-2
# checkpoint; this pair serves as the no-training baseline.
vanilla_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
vanilla_tokenizer.pad_token = vanilla_tokenizer.eos_token
model_fewshot = GPT2LMHeadModel.from_pretrained("gpt2")
model_fewshot = model_fewshot.to(device)

In [None]:
# Helper function for response generation:
def generate_response(model, prompt, tokenizer):
    """Sample a continuation of ``prompt`` and return the cleaned, decoded text.

    Args:
        model: Causal LM exposing ``generate``; may currently be in train mode.
        prompt: Text to continue (tokenized with truncation at 512 tokens).
        tokenizer: Tokenizer matching ``model``.

    Returns:
        The decoded first output sequence with special tokens removed, runs
        of five or more digits/whitespace characters deleted, and surrounding
        whitespace stripped. Callers split this text on ``"Bot:"`` /
        ``"Human:"`` to isolate the reply.
    """
    # Follow the model's own placement instead of a notebook-level global.
    model_device = next(model.parameters()).device
    inputs = tokenizer(prompt, return_tensors="pt",
                       truncation=True, max_length=512).to(model_device)

    # Prefer a dedicated pad token; fall back to EOS when none is configured.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id

    # Generation must run without dropout. The model is left in train mode
    # after a training loop, so switch to eval and restore the mode afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            output = model.generate(
                **inputs, max_new_tokens=80, do_sample=True,
                temperature=0.9, top_k=50, top_p=0.92,
                repetition_penalty=1.5, no_repeat_ngram_size=3,
                # Stop on the tokenizer's EOS (e.g. a custom "<end>" token),
                # which the model's own generation config may not know about.
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=pad_id
            )
    finally:
        model.train(was_training)

    decoded = tokenizer.decode(output[0], skip_special_tokens=True)
    return re.sub(r'[\d\s]{5,}', '', decoded).strip()


In [None]:
# Run few-shot chat:
# Fixed user utterances fed, in this order, to every chat function that
# iterates over `fixed_inputs`, so the models are compared on identical inputs.
fixed_inputs = [
    "Hi!!!!! It's nice to see you again!",
    "Looking for interesting podcasts",
    "what do you usually do for fun?",
    "First time with wifi on a plane and oh god is it glorious",
    "I've always wanted to go to England."
]

def few_shot_chat():
    """Chat with the untouched GPT-2 baseline using a hand-written exemplar.

    For every entry of ``fixed_inputs`` a prompt is assembled from the
    exemplar dialogue, the turns exchanged so far and the new human turn; the
    model's reply is printed and appended to the running history. Nothing is
    returned.
    """
    exemplar = """Human: What kind of phone(s) do you guys have?
Bot: I have a pixel. It's pretty great. Much better than what I had before.
Human: Does it really charge all the way in 15 min?
Bot: Pretty fast. I've never timed it, but it's under half an hour.
Human: so how have you been?
Bot: i've been great. what about you?"""

    history = []
    print("\n=== FEW-SHOT CHAT (Baseline GPT-2, no fine-tune) ===")
    for u in fixed_inputs:
        print(f"Input: {u}")
        prompt = exemplar + "".join(history) + f"\nHuman: {u}\nBot:"
        out = generate_response(model_fewshot, prompt, vanilla_tokenizer)
        # The decoded output starts with the prompt, so the reply is the
        # segment right after the prompt's final "Bot:". Taking the LAST
        # segment would pick up any extra "Human:/Bot:" exchange the model
        # invents after its answer.
        segments = out.split("Bot:")
        reply_index = prompt.count("Bot:")
        raw_reply = segments[reply_index] if reply_index < len(segments) else segments[-1]
        resp = raw_reply.split("Human:")[0].strip() or "Interesting! Tell me more."
        # Each entry starts with a newline so it attaches cleanly to the
        # exemplar (which has no trailing newline) and to previous entries.
        history.append(f"\nHuman: {u}\nBot: {resp}")
        print(f"Output: {resp}")

# Run the baseline chat over fixed_inputs; replies are printed, nothing is returned.
few_shot_chat()


In [None]:
# Fine-Tuning Setup (Full vs LoRA)
# Two separate copies of the same checkpoint are loaded so that full
# fine-tuning and LoRA fine-tuning each start from the pretrained weights.
# Full GPT-2
model_base = GPT2LMHeadModel.from_pretrained(model_name)
# The tokenizer gained extra special tokens, so the embedding matrix is
# resized to the tokenizer's current vocabulary size.
model_base.resize_token_embeddings(len(tokenizer))
model_base = model_base.to(device)

# LoRA GPT-2
model_lora = GPT2LMHeadModel.from_pretrained(model_name)
model_lora.resize_token_embeddings(len(tokenizer))
# Adapters (rank 32, alpha 64, dropout 0.05) are attached to the modules named
# c_attn and c_proj.
# NOTE(review): the embedding rows added by the resize above are presumably
# frozen once the model is wrapped by PEFT, in which case the new special
# tokens would keep their initial embeddings — confirm, and consider
# `modules_to_save` if that is the case.
lora_cfg = LoraConfig(r=32, lora_alpha=64,
                      target_modules=["c_attn", "c_proj"],
                      lora_dropout=0.05, task_type="CAUSAL_LM")
model_lora = get_peft_model(model_lora, lora_cfg).to(device)

# Dataset + DataLoader
# ConversationData tokenizes everything up front and pads each sample to 512
# tokens; the loader then serves shuffled batches of 4 conversations.
dataset = ConversationData(dataset_path, tokenizer, max_samples=1000, min_turns=6)
loader = DataLoader(dataset, batch_size=4, shuffle=True)


In [None]:
# Training function:
def train_model(dataloader, model, optimizer, scheduler, epochs=5, save_name="model_ckpt"):
    """Fine-tune ``model`` as a causal LM and checkpoint it after every epoch.

    Args:
        dataloader: Iterable of ``(input_ids, attention_mask)`` batches.
        model: Causal LM whose forward accepts ``attention_mask`` and
            ``labels`` and returns an object with a ``loss`` attribute.
        optimizer: Optimizer over the model's trainable parameters.
        scheduler: LR scheduler, stepped once per batch.
        epochs: Number of passes over ``dataloader``.
        save_name: Checkpoint prefix; the state dict is written to
            ``{save_name}_epoch{N}.pt`` after each epoch.

    The model is left in eval mode on return so that subsequent generation or
    evaluation does not run with dropout enabled.
    """
    # Follow the model's own placement instead of a notebook-level global.
    model_device = next(model.parameters()).device
    # Guard the average against an empty dataloader.
    num_batches = max(len(dataloader), 1)

    model.train()
    for e in range(epochs):
        total_loss = 0.0
        progress_bar = tqdm.tqdm(dataloader, desc=f"Epoch {e+1}")
        for X, mask in progress_bar:
            X, mask = X.to(model_device), mask.to(model_device)
            # Exclude padding from the loss: positions labelled -100 are
            # ignored by the LM loss. With labels=X the (mostly padded)
            # sequences would make the model learn to predict pad tokens.
            labels = X.masked_fill(mask == 0, -100)
            out = model(X, attention_mask=mask, labels=labels)
            loss = out.loss
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step(); scheduler.step(); optimizer.zero_grad()
            total_loss += loss.item()
            progress_bar.set_postfix(loss=loss.item())
        print(f"Epoch {e+1}, avg loss={total_loss/num_batches:.4f}")
        torch.save(model.state_dict(), f"{save_name}_epoch{e+1}.pt")
    model.eval()


In [None]:
# Train models:
# Full fine-tuning: every weight is updated, so a small learning rate is used.
print("Training Full GPT-2")
opt_base = torch.optim.AdamW(params=model_base.parameters(), lr=5e-6)
sch_base = get_linear_schedule_with_warmup(
    opt_base,
    num_warmup_steps=10,
    num_training_steps=len(loader) * 5,
)
train_model(
    dataloader=loader,
    model=model_base,
    optimizer=opt_base,
    scheduler=sch_base,
    epochs=5,
    save_name="full_gpt2",
)

# LoRA fine-tuning: larger learning rate and twice as many epochs.
print("Training LoRA GPT-2")
opt_lora = torch.optim.AdamW(params=model_lora.parameters(), lr=2e-4)
sch_lora = get_linear_schedule_with_warmup(
    opt_lora,
    num_warmup_steps=10,
    num_training_steps=len(loader) * 10,
)
train_model(
    dataloader=loader,
    model=model_lora,
    optimizer=opt_lora,
    scheduler=sch_lora,
    epochs=10,
    save_name="lora_gpt2",
)


In [None]:
# Tuned Chats
def tuned_chat(model, title="model"):
    """Chat with a fine-tuned model over ``fixed_inputs`` and print each reply.

    Prompts follow the training text format: turns are joined by ``<turn>``
    after a leading ``<start>``, and the prompt ends with an open ``Bot:``
    turn for the model to complete.

    Args:
        model: Fine-tuned causal LM passed through to ``generate_response``.
        title: Label printed in the section header.
    """
    history = []
    print(f"\n=== Chat with {title} ===")
    for u in fixed_inputs:
        print(f"Input: {u}")
        # Join history and the new human turn with the same separator;
        # otherwise the previous reply runs straight into the next "Human:"
        # ("...Bot: replyHuman: ...").
        turns = history + [f"Human: {u}"]
        prompt = "<start> " + " <turn> ".join(turns) + " <turn> Bot:"
        out = generate_response(model, prompt, tokenizer)
        # The decoded output starts with the prompt, so the reply is the
        # segment right after the prompt's final "Bot:". Taking the LAST
        # segment would pick up any extra exchange the model invents.
        segments = out.split("Bot:")
        reply_index = prompt.count("Bot:")
        raw_reply = segments[reply_index] if reply_index < len(segments) else segments[-1]
        resp = raw_reply.split("Human:")[0].strip() or "Hmm, can you expand?"
        history.append(f"Human: {u} <turn> Bot: {resp}")
        print(f"Output: {resp}")

# Run the same fixed inputs through both fine-tuned models for a side-by-side read.
tuned_chat(model_base, title="Full Fine-Tuned GPT-2")
tuned_chat(model_lora, title="LoRA Fine-Tuned GPT-2")


In [None]:
# Evaluation (BLEU + Perplexity)
# Load the BLEU metric via the `evaluate` library; the resulting object is
# used by the evaluation code through its compute(predictions=..., references=...) call.
bleu = evaluate.load("bleu")

def evaluate_model(model, test_prompt, reference, tokenizer):
    """Generate one response and score it with perplexity and BLEU.

    Args:
        model: Causal LM to evaluate; may currently be in train mode.
        test_prompt: Prompt handed to ``generate_response``.
        reference: Reference text the generated text is compared to for BLEU.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        Dict with ``"response"`` (the text returned by ``generate_response``,
        or ``"Default response"`` if that is empty), ``"perplexity"`` (exp of
        the model's LM loss on that same text) and ``"BLEU"``.
    """
    response = generate_response(model, test_prompt, tokenizer) or "Default response"
    # Follow the model's own placement instead of a notebook-level global.
    model_device = next(model.parameters()).device
    inputs = tokenizer(response, return_tensors="pt").to(model_device)

    # Score without dropout: the model is still in train mode right after a
    # training loop, which would make the loss noisy and inflated. The
    # previous mode is restored afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            loss = model(**inputs, labels=inputs["input_ids"]).loss
    finally:
        model.train(was_training)

    ppl = torch.exp(loss).item()
    bleu_score = bleu.compute(predictions=[response], references=[[reference]])["bleu"]
    return {"response": response, "perplexity": ppl, "BLEU": bleu_score}

print("\n=== Evaluation Comparison Table ===")
ref = "Human: hi, how are you doing? Bot: I'm fine, thanks for asking."
prompt_few = "Human: hi, how are you doing? Bot:"
prompt_tuned = "<start> Human: hi, how are you doing? <turn> Bot:"

# Few-shot
# Scored through the same evaluate_model() path as the fine-tuned models so
# all three rows are computed identically (including the fallback used when
# generation comes back empty, which the previous inline copy lacked).
few_shot_eval = evaluate_model(model_fewshot, prompt_few, ref, vanilla_tokenizer)
few_shot_response = few_shot_eval["response"]
few_shot_ppl = few_shot_eval["perplexity"]
few_shot_bleu = few_shot_eval["BLEU"]

# Fine-tuned models
full_eval = evaluate_model(model_base, prompt_tuned, ref, tokenizer)
lora_eval = evaluate_model(model_lora, prompt_tuned, ref, tokenizer)

# One row per model. Each row is based on a single sampled generation, so
# the numbers vary with the random seed.
results = pd.DataFrame([
    {"Model": "Few-Shot GPT-2", "Perplexity": few_shot_ppl, "BLEU": few_shot_bleu},
    {"Model": "Full Fine-Tuned GPT-2", "Perplexity": full_eval["perplexity"], "BLEU": full_eval["BLEU"]},
    {"Model": "LoRA Fine-Tuned GPT-2", "Perplexity": lora_eval["perplexity"], "BLEU": lora_eval["BLEU"]},
])
print(results.to_string(index=False))
