# **Generating the synthetic data**

# Mount Google Drive

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the model folder and CSV files used by
# this notebook are addressed through paths under /content/drive/MyDrive.
drive.mount('/content/drive')


# Load your fine-tuned GPT-2 model & tokenizer

In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Folder in Drive that holds the fine-tuned GPT-2 weights and tokenizer files.
model_path = "/content/drive/MyDrive/finetuned_gpt2_emails"  # path to the folder in Drive
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# Use the GPU when the runtime provides one and fall back to the CPU otherwise.
# The generation code moves its inputs to `model.device`, so it follows this
# choice automatically.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Generate synthetic emails

In [None]:
from tqdm import tqdm

def generate_unique_emails(prompt, target_count=1000, max_length=80, batch_size=20,
                           max_stale_batches=25):
    """Sample distinct texts from the module-level ``model`` for one prompt.

    The prompt is fed to ``model.generate`` with nucleus sampling
    (``temperature=0.7``, ``top_p=0.9``) in batches until ``target_count``
    distinct, non-empty decoded strings have been collected.

    Args:
        prompt: Text the generation is conditioned on (e.g. a class tag).
        target_count: Number of unique emails wanted. Values <= 0 yield ``[]``.
        max_length: Maximum total length in tokens (prompt included) passed to
            ``model.generate``.
        batch_size: Maximum number of sequences sampled per ``generate`` call.
        max_stale_batches: Stop early after this many consecutive batches that
            contributed no new unique text, so a model that keeps repeating
            itself cannot loop forever. ``None`` disables the guard.

    Returns:
        A list of unique emails in the order they were first generated. It has
        ``target_count`` entries unless the stale-batch guard stopped the loop.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    Note:
        Relies on the module-level ``tokenizer`` and ``model`` objects.
        NOTE(review): decoding uses ``skip_special_tokens=True``; whether the
        prompt tag is stripped from the output depends on whether it is
        registered as a special token in the tokenizer -- confirm.
    """
    import warnings

    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # The prompt never changes, so tokenize it once instead of once per batch.
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

    # A dict is used as an insertion-ordered set so the returned order is
    # the generation order rather than hash order.
    unique_emails = {}
    stale_batches = 0

    while len(unique_emails) < target_count:
        n = min(batch_size, target_count - len(unique_emails))
        # A single prompt row with num_return_sequences=n yields exactly n
        # samples. Repeating the prompt n times as well would yield n * n.
        outputs = model.generate(
            input_ids,
            max_length=max_length,
            num_return_sequences=n,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

        count_before = len(unique_emails)
        for sequence in outputs:
            email = tokenizer.decode(sequence, skip_special_tokens=True).strip()
            if email:
                unique_emails.setdefault(email, None)

        if len(unique_emails) > count_before:
            stale_batches = 0
            continue

        stale_batches += 1
        if max_stale_batches is not None and stale_batches >= max_stale_batches:
            warnings.warn(
                f"Stopped after {stale_batches} consecutive batches without a new "
                f"unique sample; returning {len(unique_emails)} of {target_count}."
            )
            break

    return list(unique_emails)

# Sample the two synthetic classes one after the other: 3500 unique texts for
# the spam tag first, then 3500 unique texts for the ham tag.
synthetic_spam = generate_unique_emails(prompt="<SPAM>", target_count=3500)
synthetic_ham = generate_unique_emails(prompt="<HAM>", target_count=3500)


In [None]:
import pandas as pd

# Read the real dataset from Drive.
real_df = pd.read_csv("/content/drive/MyDrive/CleanedDataset.csv", low_memory=False)
print(" Real dataset loaded with:", real_df.shape)

# Two-column frame for the generated emails: all spam rows first, then all ham
# rows, each paired with its class label.
synthetic_df = pd.DataFrame(
    {
        "text": [*synthetic_spam, *synthetic_ham],
        "label": len(synthetic_spam) * ["spam"] + len(synthetic_ham) * ["ham"],
    }
)
print(" Synthetic dataset created with:", synthetic_df.shape)

# Stack the real rows ahead of the synthetic ones, then keep only the first
# occurrence of every text, so a real row wins over a synthetic duplicate.
augmented_df = pd.concat(
    [real_df, synthetic_df],
    ignore_index=True,
).drop_duplicates(subset=["text"])

# Persist the augmented dataset next to the source data in Drive.
augmented_df.to_csv("/content/drive/MyDrive/augmented_emails.csv", index=False)

print(f" Augmented dataset ready with {len(augmented_df)} samples and saved as 'augmented_emails.csv'")
