In [8]:
import os
import warnings
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # Disables TensorFlow logging
warnings.filterwarnings("ignore", category=DeprecationWarning, module="tensorflow")

In [9]:
# Updated Cell 1 (data loading and preprocessing)
import pandas as pd

# Load only first 5k entries
df = pd.read_csv('emails.csv', names=['file', 'content'], header=None, nrows=21280497)  # <-- LIMIT HERE

def parse_email(content):
    headers = {}
    body = []
    in_headers = True
    for line in content.split('\n'):
        if in_headers:
            if line.strip() == '':
                in_headers = False
            else:
                if ':' in line:
                    key, value = line.split(':', 1)
                    headers[key.strip()] = value.strip()
        else:
            body.append(line)
    return headers, '\n'.join(body)

df[['headers', 'body']] = df['content'].apply(lambda x: pd.Series(parse_email(x)))
df['To'] = df['headers'].apply(lambda x: x.get('To', ''))
df['Subject'] = df['headers'].apply(lambda x: x.get('Subject', ''))

In [10]:
# Combine To, Subject, and body into a single training string
df['training_text'] = 'To: ' + df['To'] + '\nSubject: ' + df['Subject'] + '\n\n' + df['body']

In [11]:
# Updated Cell 3 (dataset preparation)
from transformers import GPT2Tokenizer
from torch.utils.data import Dataset

class EmailDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=512):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        # Explicitly create labels for language modeling
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': encoding['input_ids'].flatten().clone()  # THIS IS CRUCIAL
        }

# Load tokenizer with GPU support
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

train_dataset = EmailDataset(df['training_text'].tolist(), tokenizer)

In [None]:
# Updated Cell 4 (model training)
from transformers import GPT2LMHeadModel, Trainer, TrainingArguments
import torch

# Check GPU availability
print(f"GPU available: {torch.cuda.is_available()}")
print(f"GPU device: {torch.cuda.get_device_name(0)}")

# Initialize model with explicit device map
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)

# Optimized training args for GPU
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,  # Better GPU memory utilization
    fp16=True,
    logging_dir='./logs',
    logging_steps=100,
    learning_rate=5e-5,
    optim="adamw_torch_fused",
    report_to="none",  # Disables external services
    save_strategy="no",  # Reduces file I/O
    disable_tqdm=False  # Ensures progress bars work
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

# Start training
trainer.train()

# Save model
model.save_pretrained('./enron-email-generator')
tokenizer.save_pretrained('./enron-email-generator')

GPU available: True
GPU device: NVIDIA GeForce RTX 3060 Laptop GPU


Step,Training Loss
100,2.2779
200,1.5803
300,1.5588
400,1.5452
500,1.5249
600,1.4154
700,1.4383
800,1.4157
900,1.4383
1000,1.3992


In [None]:
def generate_email(to, subject, max_length=600):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    prompt = f"To: {to}\nSubject: {subject}\n\nBody:\n"  # Explicit body section
    
    # Configure tokenizer for generation
    tokenizer.padding_side = 'left'
    tokenizer.pad_token = tokenizer.eos_token
    
    inputs = tokenizer(prompt, return_tensors='pt', return_attention_mask=True).to(device)
    
    # Generation parameters
    outputs = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=max_length,
        num_return_sequences=1,
        temperature=0.9,  # More creative
        top_p=0.92,       # Nucleus sampling
        repetition_penalty=1.2,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.encode("<|endoftext|>")[0],
        num_beams=3,      # Beam search for better coherence
        early_stopping=True
    )
    
    # Decode and clean output
    full_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Extract content after Body: and before end token
    try:
        body_start = full_text.index("Body:\n") + 6
        body_end = full_text.index("\n\n<|endoftext|>")
        return full_text[:body_start] + full_text[body_start:body_end]
    except ValueError:
        return full_text.split("Body:\n")[-1].split("<|endoftext|>")[0]

# Example usage
print(generate_email("client@company.com", "Project Update"))