# ***Synthetic Model Training with GPT-2***

This notebook demonstrates the process of fine-tuning a GPT-2 model for text generation, specifically for creating email content. It covers data loading, preprocessing, model training, and saving the fine-tuned model. The dataset used contains email text labeled as either spam or ham.

# **Import Required Libraries**

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import torch

# **1. Load dataset**

In [None]:
# Mount Google Drive so the CSV stored there is readable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# pandas is already imported as `pd` in the imports cell at the top of the notebook,
# so it is not re-imported here.
file_path = "/content/drive/MyDrive/CleanedDataset.csv"
# low_memory=False: parse the file in one pass instead of in chunks, which avoids
# columns ending up with mixed inferred dtypes.
df = pd.read_csv(file_path, low_memory=False)




# **2. Inspect dataset**

## Shape Of The Dataset

In [None]:

# Report the (rows, columns) dimensions of the loaded frame.
n_rows, n_cols = df.shape
print((n_rows, n_cols))



The dataset has 83448 rows and 2 columns.

## First 5 rows

In [None]:
# Leave the frame as the cell's last expression so Jupyter renders it as a table
# instead of the plain-text dump that print() produces.
df.head()

## Info

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so wrapping
# it in print() only appends a stray "None" line to the output.
df.info()


## Handle missing values and duplicate rows

In [None]:

# Impute missing values with the most frequent value of each column.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on `df[col]`: that form mutates an intermediate object,
# which recent pandas versions warn about and which does not update `df` at all
# when Copy-on-Write is enabled.
most_frequent_text = df['text'].mode()[0]
df['text'] = df['text'].fillna(most_frequent_text)

most_frequent_label = df['label'].mode()[0]
df['label'] = df['label'].fillna(most_frequent_label)

# Remove exact duplicate rows. Rows whose text was imputed above now repeat an
# existing text, so they are dropped here whenever their label matches as well.
df = df.drop_duplicates()

print("Dataset shape after handling nulls and duplicates:", df.shape)
print("Number of missing text values:", df['text'].isna().sum())
print("Number of missing labels:", df['label'].isna().sum())


# **3. Separate spam and ham**


In [None]:
# Partition the emails by class label: 1 = spam, 0 = ham.
is_spam = df['label'] == 1
is_ham = df['label'] == 0
spam_df = df[is_spam]
ham_df = df[is_ham]

print("Spam samples:", len(spam_df))
print("Ham samples:", len(ham_df))

##  Save as text files for training

In [None]:

# Destination files on Google Drive, one per class.
spam_file = "/content/drive/MyDrive/spam.txt"
ham_file = "/content/drive/MyDrive/ham.txt"

# Export the `text` column, one email per CSV record with no header or index.
# The original read a `cleaned_text` column, but every other cell in this notebook
# uses `text`, and the dataset is reported to have only two columns, so that lookup
# would raise a KeyError.
spam_df['text'].to_csv(spam_file, index=False, header=False)
ham_df['text'].to_csv(ham_file, index=False, header=False)

print(" Files saved:")
print(f"Spam -> {spam_file}, {len(spam_df)} samples")
print(f"Ham  -> {ham_file}, {len(ham_df)} samples")

# **4. GPT-2 fine-tuning**

## Install dependencies

In [None]:
!pip install transformers datasets


##   Convert to HuggingFace Dataset

In [None]:

# Build a Hugging Face Dataset from the two columns used downstream.
# Duplicate rows were dropped earlier, so the frame's index is no longer a plain
# 0..n-1 range; preserve_index=False keeps that leftover index from being stored
# as an extra `__index_level_0__` column in the dataset.
dataset = Dataset.from_pandas(df[['text', 'label']], preserve_index=False)


# Split train/test

In [None]:

# Hold out 10% of the examples as a test split; the fixed seed makes the
# shuffle-and-split repeatable across runs.
dataset_split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset, test_dataset = dataset_split['train'], dataset_split['test']

print("Train size:", len(train_dataset))
print("Test size:", len(test_dataset))

#  Load GPT-2 and Tokenizer

In [None]:


# Name of the pretrained checkpoint shared by the tokenizer and the model.
base_checkpoint = "gpt2"

tokenizer = GPT2Tokenizer.from_pretrained(base_checkpoint)
# GPT-2 ships without a padding token, so the end-of-sequence token doubles as one.
tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained(base_checkpoint)

# Tokenize Dataset

In [None]:

def tokenize(batch):
    """Tokenize the `text` field of a batch of examples.

    Every sequence is truncated and padded to exactly 128 tokens so that all
    examples have the same length.

    Args:
        batch: A batch of examples with a 'text' entry holding the email texts.

    Returns:
        The tokenizer output for the batch.
    """
    texts = batch['text']
    encoded = tokenizer(texts, truncation=True, max_length=128, padding="max_length")
    return encoded


# Same batching settings for both splits: 128 examples per tokenizer call.
map_options = {"batched": True, "batch_size": 128}
tokenized_train = train_dataset.map(tokenize, **map_options)
tokenized_test = test_dataset.map(tokenize, **map_options)

#  Data Collator

In [None]:

# Collator for causal language modelling: GPT-2 predicts the next token, so the
# masked-LM objective is switched off (mlm=False).
collator_options = {
    "tokenizer": tokenizer,
    "mlm": False,
}
data_collator = DataCollatorForLanguageModeling(**collator_options)


#  Training Arguments

In [None]:
# Fine-tuning hyperparameters.
training_args = TrainingArguments(
    output_dir="./gen_model",
    overwrite_output_dir=True,
    num_train_epochs=3,                # Can increase for better accuracy
    per_device_train_batch_size=8,     # Adjust based on GPU
    gradient_accumulation_steps=2,     # 8 x 2 = 16 examples per optimizer step per device
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=2,                # Keep only the two most recent checkpoints
    logging_steps=200,
    # Enable mixed precision only when a CUDA GPU is present; an unconditional
    # fp16=True makes the notebook unusable on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    eval_strategy="no",                # No evaluation during training
    eval_steps=1000,                   # Only takes effect if eval_strategy is set to "steps"
)

#  Trainer Setup

In [None]:
# The spam/ham `label` column is classification metadata, not a language-modelling
# target (the collator builds the LM targets from the input ids), so it is removed
# from BOTH splits. The original dropped it from the training split only, leaving
# the eval split inconsistent should evaluation ever be run.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train.remove_columns(["label"]),
    eval_dataset=tokenized_test.remove_columns(["label"]),
    data_collator=data_collator,
)

#  Start Training

In [None]:

# Run fine-tuning; the object returned by train() is shown as the cell output.
train_output = trainer.train()
train_output

#  Save Fine-tuned Model

In [None]:
# Write the fine-tuned weights and the tokenizer files into the same directory so
# the pair can be reloaded together.
export_dir = "./finetuned_gpt2_emails"
for artifact in (model, tokenizer):
    artifact.save_pretrained(export_dir)

print("Training complete and model saved!")

In [None]:
import shutil
from google.colab import files

# Base name of the archive; make_archive appends the ".zip" extension itself.
archive_stem = "finetuned_gpt2_emails"

# Zip the folder containing model & tokenizer.
shutil.make_archive(base_name=archive_stem, format="zip", root_dir="./finetuned_gpt2_emails")

# Hand the zip file to the browser for download.
files.download(f"{archive_stem}.zip")
