In [13]:
# Install required packages
!pip install transformers datasets torch pandas numpy tqdm wandb gradio

# Import necessary libraries
import pandas as pd
import numpy as np
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from datasets import Dataset
import os
import re
import random
import logging
import math
from tqdm.notebook import tqdm

# Set seed for reproducibility
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: C:\Users\Dilun\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [15]:
# Load the dataset
df = pd.read_csv('../data/qa_dataset.csv')

# Display basic information
print(f"Dataset shape: {df.shape}")
print("\nSample data:")
df.head()

# Check for missing values
print("Missing values:")
df.isnull().sum()

# Preprocess the data
def preprocess_text(text):
    if isinstance(text, str):
        # Basic cleaning
        text = text.strip()
        # Remove excessive whitespace
        text = re.sub(r'\s+', ' ', text)
        return text
    return ""

# Apply preprocessing
df['Question'] = df['Question'].apply(preprocess_text)
df['Answer'] = df['Answer'].apply(preprocess_text)

# Create formatted text for GPT-2 fine-tuning
# Format: "Question: {question} Answer: {answer}"
df['formatted_text'] = 'Question: ' + df['Question'] + ' Answer: ' + df['Answer'] + '<|endoftext|>'

# Split the data into training and validation sets (90-10 split)
train_df = df.sample(frac=0.9, random_state=seed)
val_df = df.drop(train_df.index)

print(f"Training examples: {len(train_df)}")
print(f"Validation examples: {len(val_df)}")

# Create text files for training and validation
with open('train.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(train_df['formatted_text'].tolist()))
    
with open('val.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(val_df['formatted_text'].tolist()))

Dataset shape: (995, 2)

Sample data:
Missing values:
Training examples: 896
Validation examples: 99


In [16]:
# Load GPT-2 tokenizer and model
model_name = "gpt2"  # We're using the base GPT-2 model
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Move model to the appropriate device
model.to(device)

# Adjust the tokenizer
# GPT-2 doesn't have a padding token by default, so we'll set it
tokenizer.pad_token = tokenizer.eos_token

# Create Dataset objects for training and validation
def load_dataset(file_path, tokenizer, block_size=512):
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size
    )

train_dataset = load_dataset('train.txt', tokenizer)
val_dataset = load_dataset('val.txt', tokenizer)

# Create data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, 
    mlm=False
)



In [18]:
# Set up training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-ctse-chatbot",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    eval_steps=400,
    save_steps=800,
    warmup_steps=500,
    # Removed evaluation_strategy as it is not recognized
    save_total_limit=2,
    prediction_loss_only=True,
    fp16=torch.cuda.is_available(),  # Use mixed precision if available
    logging_dir='./logs',
    logging_steps=100,
    report_to="none"  # Disable reporting to wandb or other services by default
)

# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Fine-tune the model
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

# Save the fine-tuned model and tokenizer
model_path = "./gpt2-ctse-chatbot-final"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)
print(f"Model and tokenizer saved to {model_path}")

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


Perplexity: 11.53
Model and tokenizer saved to ./gpt2-ctse-chatbot-final


In [None]:
# Load the fine-tuned model and tokenizer
model_path = "./gpt2-ctse-chatbot-final"
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)
model.to(device)

# Set the pad token
tokenizer.pad_token = tokenizer.eos_token

def generate_answer(question, model, tokenizer, max_length=150):
    """
    Generate an answer for a given question using the fine-tuned GPT-2 model
    """
    # Format the input text
    input_text = f"Question: {question.strip()} Answer:"
    
    # Tokenize the input
    inputs = tokenizer(input_text, return_tensors="pt").to(device)
    
    # Generate the output
    output = model.generate(
        inputs["input_ids"],
        max_length=max_length,
        num_return_sequences=1,
        do_sample=True,
        top_p=0.95,
        temperature=0.7,
        no_repeat_ngram_size=2,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id
    )
    
    # Decode the output
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    
    # Extract the answer part
    try:
        answer = generated_text.split("Answer:")[1].strip()
        # Remove any additional questions that might be generated
        if "Question:" in answer:
            answer = answer.split("Question:")[0].strip()
    except IndexError:
        answer = generated_text  # Return the whole text if we can't extract just the answer
        
    return answer

# Create a simple chatbot interface
def ctse_chatbot():
    print("\n=== CTSE Lecture Notes Chatbot ===\n")
    print("Ask questions about CTSE topics. Type 'exit' to quit.\n")
    
    while True:
        question = input("\nYou: ")
        
        if question.lower() in ["exit", "quit", "bye"]:
            print("\nChatbot: Thank you for using the CTSE chatbot. Goodbye!")
            break
            
        if not question.strip():
            print("\nChatbot: Please ask a question about CTSE topics.")
            continue
            
        # Generate the answer
        answer = generate_answer(question, model, tokenizer)
        print(f"\nChatbot: {answer}")

# Run the chatbot
if __name__ == "__main__":
    ctse_chatbot()


=== CTSE Lecture Notes Chatbot ===

Ask questions about CTSE topics. Type 'exit' to quit.

