In [None]:
import pandas as pd
import torch
from datasets import Dataset
from torch.utils.data import DataLoader
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)


In [None]:
# Load the dataset and keep only what training needs.
dataset_path = '../data.csv'  # Adjust path if necessary

# Build the frame in one non-mutating chain so re-running this cell always
# starts from the CSV and never modifies a column selection in place.
df = (
    pd.read_csv(dataset_path)[['text']]  # Focus on the 'text' column
    .dropna(subset=['text'])             # Remove rows with null 'text'
    .head(60000)                         # Keep only the first 60,000 rows
    # dropna/head leave gaps in the index; a fresh default index keeps
    # Dataset.from_pandas from storing the old one as an extra column.
    .reset_index(drop=True)
)

# To train on a random sample instead of the first rows, replace
# `.head(60000)` above with `.sample(n=60000, random_state=42)`.

# Convert to Hugging Face dataset
dataset = Dataset.from_pandas(df)

# Print first few rows to verify the data
print("Dataset loaded and preprocessed. Sample data:\n", df.head())

print("Dataset Length: ",len(dataset))


In [None]:
# Load the distilGPT-2 tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")

# Reuse the EOS token as the padding token so batches can be padded
tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    """Tokenize one batch of rows for `Dataset.map(..., batched=True)`.

    Args:
        examples: Batch mapping supplied by `Dataset.map`; its 'text' entry
            is passed to the tokenizer.

    Returns:
        The tokenizer output for the batch, truncated but not padded.
    """
    # Padding is deliberately left out: padding="max_length" without an
    # explicit max_length pads every row to the model maximum, which bloats
    # the stored dataset and every training batch. The language-modeling
    # data collator used for training pads each batch instead.
    # return_tensors is also omitted because `map` stores plain lists.
    return tokenizer(examples['text'], truncation=True)


# Apply tokenization
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Print a sample tokenized data
print("Tokenization complete. Sample tokenized data:\n", tokenized_dataset[0])


In [None]:
# Split into train and validation sets (80/20). The split is shuffled, so a
# fixed seed is passed to get the same partition on every run.
split_datasets = tokenized_dataset.train_test_split(test_size=0.2, seed=42)

# Pick the splits by key instead of relying on the order of .values()
train_dataset = split_datasets["train"]
val_dataset = split_datasets["test"]

# Print split info
print(f"Data split into {len(train_dataset)} training samples and {len(val_dataset)} validation samples.")


In [None]:
# Load the pretrained distilGPT-2 checkpoint that serves as the teacher model
model = AutoModelForCausalLM.from_pretrained("distilgpt2")

# Number of leading transformer blocks whose weights are excluded from training
freeze_layers = 3
for frozen_block in model.transformer.h[:freeze_layers]:
    for weight in frozen_block.parameters():
        weight.requires_grad = False

# Explicitly mark the output head's parameters as trainable
for weight in model.lm_head.parameters():
    weight.requires_grad = True

# Confirm the model is ready
print("DistilGPT-2 model loaded successfully.")


In [None]:
# Function to compute metrics
def compute_metrics(p):
    """Compute next-token perplexity from evaluation logits and labels.

    Args:
        p: A `(logits, labels)` pair, such as the prediction object the
            Trainer passes to `compute_metrics`. Both may be NumPy arrays or
            torch tensors. `logits` has the vocabulary on its last axis and
            the sequence on the axis before it; `labels` has the sequence on
            its last axis. Label positions equal to -100 are ignored
            (the default `ignore_index` of `CrossEntropyLoss`).

    Returns:
        dict: `{'perplexity': float}`, the exponential of the mean
        cross-entropy over the scored positions.
    """
    logits, labels = p
    # If the model returned several outputs, the logits are taken to be first.
    if isinstance(logits, tuple):
        logits = logits[0]

    # Accept NumPy arrays as well as tensors; compute the loss in float32.
    logits = torch.as_tensor(logits).float()
    labels = torch.as_tensor(labels).long()

    # Causal LM: the logits at position t are scored against the token at
    # position t + 1, so drop the last logit and the first label.
    shift_logits = logits[..., :-1, :]
    shift_labels = labels[..., 1:]

    # Compute perplexity (standard for language models)
    loss = torch.nn.CrossEntropyLoss()(
        shift_logits.reshape(-1, shift_logits.size(-1)),
        shift_labels.reshape(-1),
    )
    perplexity = torch.exp(loss).item()

    # Print perplexity for tracking
    print(f"Perplexity: {perplexity}")

    # Return the dictionary of metrics
    return {
        'perplexity': perplexity,
    }


In [None]:
# Training configuration; the epoch count is kept low for quick test runs
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="./gpt2_shakespeare",
    logging_dir="./logs",
    # Checkpointing: save every 500 steps, keep at most 2 checkpoints
    save_steps=500,
    save_total_limit=2,
    # Log every 50 steps and evaluate once per epoch
    logging_steps=50,
    evaluation_strategy="epoch",
    # Optimisation settings
    learning_rate=5e-5,
    weight_decay=0.01,
    num_train_epochs=3,  # temporarily 3 epochs
    # Batches of 16 per device, with gradients accumulated over 2 steps
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    # Mixed precision stays off for CPU training
    fp16=False,
)


In [None]:
# DataLoader and DataCollatorForLanguageModeling are imported in the
# notebook's top import cell, so a fresh run surfaces missing dependencies
# before any expensive work starts.

# Initialize DataCollator for Language Modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # Set to False for causal language modeling (GPT-2)
)

# Initialize Trainer with the model, the train/validation splits, the
# perplexity metric and the collator that batches the examples
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator  # Use the DataCollator here
)

# Start training and track progress
print("Starting model training...")

# Training loop with progress
trainer.train()

# Confirm completion of training
print("Model training complete.")


In [None]:
# Save the trained teacher model together with its tokenizer, so the output
# directory holds everything needed to reload it (including the pad-token
# setting applied to the tokenizer earlier in the notebook).
teacher_dir = "./gpt2_shakespeare_teacher"
model.save_pretrained(teacher_dir)
tokenizer.save_pretrained(teacher_dir)

# Print confirmation
print("Teacher model saved successfully.")
