In [1]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Explicitly set device to CUDA if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Load model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
model = GPT2LMHeadModel.from_pretrained('distilgpt2')

# Move model to GPU if available
model.to(device)

# Set padding tokens
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

# Example inference with input moved to GPU
prompt = "Hello, how can I help you?"
inputs = tokenizer(prompt, return_tensors="pt", padding=True)
inputs = {k: v.to(device) for k, v in inputs.items()}  # Move input tensors to GPU

with torch.no_grad():
    outputs = model(**inputs)
    print(outputs)


Using device: cpu
CausalLMOutputWithCrossAttentions(loss=None, logits=tensor([[[-33.1717, -31.6326, -33.7443,  ..., -44.9420, -44.7888, -33.9273],
         [-67.7578, -67.2971, -67.5085,  ..., -73.7265, -72.1330, -66.4876],
         [-62.5467, -63.9545, -67.9195,  ..., -70.8844, -71.9354, -65.5986],
         ...,
         [-60.1651, -62.7217, -65.9464,  ..., -69.6623, -73.2838, -64.5467],
         [-64.0370, -67.2485, -70.6526,  ..., -76.0229, -78.1052, -68.5395],
         [-64.1197, -63.1584, -63.7898,  ..., -72.9823, -74.0720, -59.1741]]]), past_key_values=((tensor([[[[-8.7863e-01,  2.6339e+00,  7.7920e-01,  ..., -1.2441e+00,
           -1.5730e-01,  1.6261e+00],
          [-1.6237e+00,  2.7957e+00,  1.6042e+00,  ..., -9.6159e-01,
           -1.8298e+00,  2.1775e+00],
          [-2.0097e+00,  1.9176e+00,  2.7019e+00,  ...,  6.3615e-02,
           -1.4335e+00,  1.8514e+00],
          ...,
          [-1.9784e+00,  1.1833e+00,  2.2219e+00,  ..., -5.4541e-01,
           -2.0770e+00,  2.3

In [2]:
import pandas as pd
import json

print("Loading campus FAQ dataset...")

# Load the dataset and extract questions and answers
data_path = '../data/campus_faq.json'

try:
    with open(data_path) as f:
        data = json.load(f)
    print("✓ Dataset loaded successfully!")
except FileNotFoundError:
    print("❌ Error: Could not find the dataset file. Please check the path.")
    raise

# Extract questions and answers from the nested structure
questions = []
answers = []

for item in data['faq']:
    questions.append(item['question'])
    answers.append(item['answer'])

# Create DataFrame for easier data manipulation
df = pd.DataFrame({
    'question': questions,
    'answer': answers
})

print(f"Dataset contains {len(df)} question-answer pairs")
print("\nFirst 3 examples:")
for i in range(min(3, len(df))):
    print(f"\nQ: {df.iloc[i]['question']}")
    print(f"A: {df.iloc[i]['answer']}")

print(f"\nDataset shape: {df.shape}")

Loading campus FAQ dataset...
✓ Dataset loaded successfully!
Dataset contains 30 question-answer pairs

First 3 examples:

Q: What are the library hours?
A: The library is open from 8 AM to 10 PM, Monday to Friday.

Q: How can I access the campus Wi-Fi?
A: Connect to the 'CampusWiFi' network and use your student credentials to log in.

Q: Where can I find the computer labs?
A: Computer labs are located in the main building, on the second floor.

Dataset shape: (30, 2)


In [3]:
print("Splitting data into training and validation sets...")
from sklearn.model_selection import train_test_split

# Split data: 80% for training, 20% for validation
train_questions, val_questions, train_answers, val_answers = train_test_split(
    df['question'].tolist(),
    df['answer'].tolist(),# Add this cell right after the training cell to debug the generation issue

    test_size=0.2,
    random_state=42  # For reproducible results
)

print(f"Training examples: {len(train_questions)}")
print(f"Validation examples: {len(val_questions)}")

Splitting data into training and validation sets...
Training examples: 24
Validation examples: 6


In [4]:
from transformers import Trainer, TrainingArguments

class QADataset(torch.utils.data.Dataset):
    """
    A simple dataset class for question-answer pairs.
    
    This class converts our Q&A data into a format the model can learn from.
    """
    
    def __init__(self, questions, answers, tokenizer, max_length=512):
        self.questions = questions
        self.answers = answers
        self.tokenizer = tokenizer
        self.max_length = max_length
        
        print(f"Created dataset with {len(questions)} examples")

    def __len__(self):
        """Return the number of examples in our dataset"""
        return len(self.questions)

    def __getitem__(self, idx):
        # Get the question and answer for the given index
        question = self.questions[idx]
        answer   = self.answers[idx]
        
        # Combine question and answer into a single string, following the prompt format used for training
        full_text = f"Question: {question} Answer: {answer}{self.tokenizer.eos_token}"
        
        # Tokenize the combined text, pad/truncate to max_length, and return as tensors
        enc = self.tokenizer(
            full_text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        # Squeeze to remove the batch dimension
        input_ids      = enc['input_ids'].squeeze()
        attention_mask = enc['attention_mask'].squeeze()
        
        # Create labels for language modeling
        labels = input_ids.clone()
        
        # Mask out the question part so that loss is only computed on the answer
        question_prefix = self.tokenizer.encode(f"Question: {question} Answer:")
        labels[:len(question_prefix)] = -100  # -100 is ignored by PyTorch loss
        
        # Mask out padding tokens in the labels as well
        labels[labels == self.tokenizer.pad_token_id] = -100
        
        # Return a dictionary of tensors for the Trainer
        return {
            'input_ids':      input_ids,
            'attention_mask': attention_mask,
            'labels':         labels,
        }

# Create dataset objects
train_dataset = QADataset(train_questions, train_answers, tokenizer)
val_dataset = QADataset(val_questions, val_answers, tokenizer)

Created dataset with 24 examples
Created dataset with 6 examples


In [5]:
from transformers import Trainer, TrainingArguments

print("Setting up training parameters...")

training_args = TrainingArguments(
    output_dir='./results',           # Where to save the model
    num_train_epochs=10,              # How many times to go through all data
    per_device_train_batch_size=2,    # How many examples to process at once
    per_device_eval_batch_size=2,     # Batch size for validation
    warmup_steps=100,                 # Gradual learning rate increase
    weight_decay=0.01,                # Regularization to prevent overfitting
    logging_dir='./logs',             # Where to save training logs
    logging_steps=5,                  # Log progress every 5 steps
    evaluation_strategy='epoch',      # <-- Correct argument for evaluation frequency
    save_strategy='epoch',            # Save model after each epoch
    load_best_model_at_end=True,      # Load the best model when done
    metric_for_best_model='eval_loss',# Use validation loss to pick best model
    report_to=None,                   # Don't report to wandb/tensorboard
)

print("Training configuration set up!")
print(f"Will train for {training_args.num_train_epochs} epochs")
print(f"Batch size: {training_args.per_device_train_batch_size}")


Setting up training parameters...
Training configuration set up!
Will train for 10 epochs
Batch size: 2




In [6]:
print("Creating trainer...")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

print("Starting training... This might take a few minutes.")
print("You'll see training progress below:")

# Start training
training_result = trainer.train()

print("Training completed! 🎉")
print(f"Final training loss: {training_result.training_loss:.4f}")

Creating trainer...
Starting training... This might take a few minutes.
You'll see training progress below:


  0%|          | 0/120 [00:00<?, ?it/s]

{'loss': 3.359, 'grad_norm': 21.57236099243164, 'learning_rate': 2.5e-06, 'epoch': 0.42}
{'loss': 3.2068, 'grad_norm': 19.24810218811035, 'learning_rate': 5e-06, 'epoch': 0.83}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 3.3054311275482178, 'eval_runtime': 2.7299, 'eval_samples_per_second': 2.198, 'eval_steps_per_second': 1.099, 'epoch': 1.0}
{'loss': 3.4023, 'grad_norm': 28.36723518371582, 'learning_rate': 7.5e-06, 'epoch': 1.25}
{'loss': 3.0532, 'grad_norm': 17.192794799804688, 'learning_rate': 1e-05, 'epoch': 1.67}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 3.0853207111358643, 'eval_runtime': 2.6986, 'eval_samples_per_second': 2.223, 'eval_steps_per_second': 1.112, 'epoch': 2.0}
{'loss': 2.7337, 'grad_norm': 22.863454818725586, 'learning_rate': 1.25e-05, 'epoch': 2.08}
{'loss': 2.6597, 'grad_norm': 27.611772537231445, 'learning_rate': 1.5e-05, 'epoch': 2.5}
{'loss': 2.5884, 'grad_norm': 24.569175720214844, 'learning_rate': 1.75e-05, 'epoch': 2.92}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 2.9346275329589844, 'eval_runtime': 2.8253, 'eval_samples_per_second': 2.124, 'eval_steps_per_second': 1.062, 'epoch': 3.0}
{'loss': 2.5226, 'grad_norm': 19.878374099731445, 'learning_rate': 2e-05, 'epoch': 3.33}
{'loss': 2.263, 'grad_norm': 21.896852493286133, 'learning_rate': 2.25e-05, 'epoch': 3.75}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 2.84531307220459, 'eval_runtime': 2.7267, 'eval_samples_per_second': 2.2, 'eval_steps_per_second': 1.1, 'epoch': 4.0}
{'loss': 1.9563, 'grad_norm': 16.938678741455078, 'learning_rate': 2.5e-05, 'epoch': 4.17}
{'loss': 1.8734, 'grad_norm': 22.406373977661133, 'learning_rate': 2.7500000000000004e-05, 'epoch': 4.58}
{'loss': 1.7936, 'grad_norm': 19.788949966430664, 'learning_rate': 3e-05, 'epoch': 5.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 2.847160577774048, 'eval_runtime': 2.7036, 'eval_samples_per_second': 2.219, 'eval_steps_per_second': 1.11, 'epoch': 5.0}
{'loss': 1.617, 'grad_norm': 17.35465431213379, 'learning_rate': 3.2500000000000004e-05, 'epoch': 5.42}
{'loss': 1.5263, 'grad_norm': 20.785640716552734, 'learning_rate': 3.5e-05, 'epoch': 5.83}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 2.931520462036133, 'eval_runtime': 2.6502, 'eval_samples_per_second': 2.264, 'eval_steps_per_second': 1.132, 'epoch': 6.0}
{'loss': 1.0974, 'grad_norm': 17.6064510345459, 'learning_rate': 3.7500000000000003e-05, 'epoch': 6.25}
{'loss': 0.9752, 'grad_norm': 16.102073669433594, 'learning_rate': 4e-05, 'epoch': 6.67}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 3.0855133533477783, 'eval_runtime': 2.5861, 'eval_samples_per_second': 2.32, 'eval_steps_per_second': 1.16, 'epoch': 7.0}
{'loss': 1.0216, 'grad_norm': 12.247735023498535, 'learning_rate': 4.25e-05, 'epoch': 7.08}
{'loss': 0.7475, 'grad_norm': 21.594221115112305, 'learning_rate': 4.5e-05, 'epoch': 7.5}
{'loss': 0.7957, 'grad_norm': 24.18516731262207, 'learning_rate': 4.75e-05, 'epoch': 7.92}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 3.5499675273895264, 'eval_runtime': 2.7123, 'eval_samples_per_second': 2.212, 'eval_steps_per_second': 1.106, 'epoch': 8.0}
{'loss': 0.4664, 'grad_norm': 15.003941535949707, 'learning_rate': 5e-05, 'epoch': 8.33}
{'loss': 0.5083, 'grad_norm': 16.47370719909668, 'learning_rate': 3.7500000000000003e-05, 'epoch': 8.75}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 3.7636146545410156, 'eval_runtime': 2.7577, 'eval_samples_per_second': 2.176, 'eval_steps_per_second': 1.088, 'epoch': 9.0}
{'loss': 0.3768, 'grad_norm': 11.325638771057129, 'learning_rate': 2.5e-05, 'epoch': 9.17}
{'loss': 0.3782, 'grad_norm': 8.987375259399414, 'learning_rate': 1.25e-05, 'epoch': 9.58}
{'loss': 0.3649, 'grad_norm': 14.052441596984863, 'learning_rate': 0.0, 'epoch': 10.0}


  0%|          | 0/3 [00:00<?, ?it/s]

{'eval_loss': 3.7962684631347656, 'eval_runtime': 2.9024, 'eval_samples_per_second': 2.067, 'eval_steps_per_second': 1.034, 'epoch': 10.0}


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


{'train_runtime': 489.888, 'train_samples_per_second': 0.49, 'train_steps_per_second': 0.245, 'train_loss': 1.7203019618988038, 'epoch': 10.0}
Training completed! 🎉
Final training loss: 1.7203


In [7]:
def generate_answer(question, max_new_tokens=100, temperature=0.7):
    """
    Generate an answer for a given question using our fine-tuned model.
    
    Args:
        question: The question to answer
        max_new_tokens: Maximum length of the answer
        temperature: Controls randomness (0.1 = focused, 1.0 = creative)
    """
    # Format the question the same way we did during training
    prompt = f"Question: {question} Answer:"
    
    # Convert text to model input
    inputs = tokenizer.encode_plus(
        prompt,
        return_tensors='pt',
        truncation=True,
        return_attention_mask=True
    )
    
    # Move to same device as model
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)
    
    # Generate answer
    model.eval()  # Set model to evaluation mode
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,  # Mask to avoid attending to padding tokens
            max_new_tokens=max_new_tokens,  # Maximum number of tokens to generate
            min_new_tokens=5,               # Don't allow EOS until at least 5 tokens
            do_sample=True,                 # If True, sample from the distribution (more creative); if False, use greedy decoding
            temperature=temperature,        # Controls randomness of sampling
            num_beams=3,                    # Number of beams for beam search (higher = more thorough search, but slower)
            pad_token_id=tokenizer.eos_token_id,  # Token ID used for padding (set to EOS for GPT-2)
        )
    # Extract only the generated part (remove the input prompt)
    generated_tokens = outputs[0][len(input_ids[0]):]
    answer = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    
    # Clean up the answer
    answer = answer.strip()
    if answer.startswith("Answer:"):
        answer = answer[7:].strip()
    
    return answer

In [8]:
print("Testing the fine-tuned model with training examples:")
print("=" * 60)

training_questions = [
    "What are the library hours?",
    "How can I access campus Wi-Fi?",
    "Where is the cafeteria located?"
]

for question in training_questions:
    print(f"❓ Question: {question}")
    answer = generate_answer(question)
    print(f"🤖 Answer: {answer}")
    print("-" * 40)

# Test with new questions (not in training data)
print("\nTesting with NEW questions (not in training data):")
print("=" * 60)

new_questions = [
    "What time does the gym open?",
    "How do I contact the IT help desk?",
    "Where can I study late at night?"
]

for question in new_questions:
    print(f"❓ Question: {question}")
    answer = generate_answer(question)
    print(f"🤖 Answer: {answer}")
    print("-" * 40)

Testing the fine-tuned model with training examples:
❓ Question: What are the library hours?
🤖 Answer: The library is open every day from 6:00 a.m. to 8:00 p.m. The library is open every day from 6:00 a.m. to 8:00 p.m. The library is open every day from 6:00 a.m. to 8:00 p.m. The library is open every day from 6:00 a.m. to 8:00 p.m. The library is open every day from 6:00 a.
----------------------------------------
❓ Question: How can I access campus Wi-Fi?
🤖 Answer: You can access campus Wi-Fi through the campus Wi-Fi portal.
----------------------------------------
❓ Question: Where is the cafeteria located?
🤖 Answer: The cafeteria is located in the cafeteria area of the cafeteria area of the cafeteria area of the cafeteria area of the cafeteria area of the cafeteria area of the cafeteria area of the cafeteria area of the cafeteria area of the cafeteria area of the cafeteria area of the cafeteria area of the cafeteria area of the cafeteria area of the cafeteria area of the cafeteria 

In [None]:
print("🔍 Debugging model generation...")

# Test the base model generation first
def debug_generation(question):
    """Debug function to see what's happening during generation"""
    prompt = f"Question: {question} Answer:"
    print(f"Input prompt: '{prompt}'")

    # Tokenize
    inputs = tokenizer.encode_plus(
        prompt,
        return_tensors='pt',
        return_attention_mask=True
    )
    
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)
    
    print(f"Input token IDs shape: {input_ids.shape}")
    print(f"Input tokens: {input_ids[0].tolist()}")
    print(f"Decoded input: '{tokenizer.decode(input_ids[0])}'")
    
    # Generate with simpler parameters first
    model.eval()
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=20,
            min_new_tokens=5,           # don’t allow EOS until at least 5 tokens
            do_sample=False,            # you can mix with or without sampling
            num_beams=3,
            pad_token_id=tokenizer.eos_token_id,
        )

    
    print(f"Output shape: {outputs.shape}")
    print(f"Full output tokens: {outputs[0].tolist()}")
    print(f"Full decoded output: '{tokenizer.decode(outputs[0])}'")
    
    # Extract generated part
    generated_tokens = outputs[0][len(input_ids[0]):]
    print(f"Generated tokens only: {generated_tokens.tolist()}")
    answer = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    print(f"Generated answer: '{answer}'")
    
    return answer

# Test with a simple question
test_answer = debug_generation("What are the library hours?")
print(f"\nFinal answer: '{test_answer}'")

🔍 Debugging model generation...
Input prompt: 'Question: What are the library hours? Answer:'
Input token IDs shape: torch.Size([1, 10])
Input tokens: [24361, 25, 1867, 389, 262, 5888, 2250, 30, 23998, 25]
Decoded input: 'Question: What are the library hours? Answer:'
Output shape: torch.Size([1, 30])
Full output tokens: [24361, 25, 1867, 389, 262, 5888, 2250, 30, 23998, 25, 383, 5888, 2250, 389, 1280, 284, 262, 2276, 1171, 319, 262, 7611, 13, 383, 5888, 2250, 389, 1280, 284, 262]
Full decoded output: 'Question: What are the library hours? Answer: The library hours are open to the general public on the campus. The library hours are open to the'
Generated tokens only: [383, 5888, 2250, 389, 1280, 284, 262, 2276, 1171, 319, 262, 7611, 13, 383, 5888, 2250, 389, 1280, 284, 262]
Generated answer: ' The library hours are open to the general public on the campus. The library hours are open to the'

Final answer: ' The library hours are open to the general public on the campus. The library hou