In [1]:
import torch
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
import re

In [2]:
# Check if GPU is available for faster training (optional)

# Load and sample the email dataset (you can use a smaller subset for faster testing)
emails_df = pd.read_csv("./Downloads/email_data.zip").sample(5000)

In [3]:
import re

def clean_email(raw_email):
    # Replace standard metadata fields
    clean_email = re.sub(r'(?i)^Message-ID:.*\n', '[MESSAGE_ID]', raw_email)
    clean_email = re.sub(r'(?i)^Date:.*\n', '[DATE]', clean_email)
    clean_email = re.sub(r'(?i)^From:.*\n', '[FROM]', clean_email)
    clean_email = re.sub(r'(?i)^To:.*\n', '[TO]', clean_email)
    clean_email = re.sub(r'(?i)^Subject:.*\n', '[SUBJECT LINE]', clean_email)

    # Remove email headers that don't need to be retained
    clean_email = re.sub(r'(?i)^Mime-Version:.*\n', '', clean_email)
    clean_email = re.sub(r'(?i)^Content-Type:.*\n', '', clean_email)
    clean_email = re.sub(r'(?i)^Content-Transfer-Encoding:.*\n', '', clean_email)
    clean_email = re.sub(r'(?i)^X-.*\n', '', clean_email)
    clean_email = re.sub(r'(?i)^FYI.*\n', '', clean_email)
    clean_email = re.sub(r'(?i)^----- Forwarded by.*\n', '', clean_email)

    # Replace email addresses
    clean_email = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b', '[EMAIL]', clean_email)

    # Replace phone numbers (various formats)
    clean_email = re.sub(r'\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b', '[PHONE_NUMBER]', clean_email)
    clean_email = re.sub(r'\+\d{1,3}[-.\s]?\d{1,4}[-.\s]?\d{3,4}[-.\s]?\d{3,4}', '[PHONE_NUMBER]', clean_email)

    # Replace personal names (basic pattern, can be improved with NLP)
    clean_email = re.sub(r'\b[A-Z][a-z]+(?:\s[A-Z][a-z]+){0,2}\b', '[NAME]', clean_email)

    # Replace company names (basic approach)
    clean_email = re.sub(r'\b(?:Enron|ExxonMobil|Amazon|Google|Microsoft|Facebook|Tesla|Apple)\b', '[COMPANY]', clean_email)

    # Replace any identifiers (contract numbers, transaction IDs, etc.)
    clean_email = re.sub(r'\b[A-Z0-9]{5,}\b', '[IDENTIFIER]', clean_email)

    # Remove excess whitespace
    clean_email = re.sub(r'\n+', ' ', clean_email)
    clean_email = clean_email.strip()
    
    return clean_email

# Clean the dataset
emails_df['cleaned_message'] = emails_df['message'].apply(clean_email)

In [4]:
# Convert cleaned DataFrame to Hugging Face dataset
df_email_data = emails_df[['cleaned_message']].copy()
email_dataset = Dataset.from_pandas(df_email_data)

# Initialize the T5 tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [5]:
# Tokenize the dataset
def tokenize_function(examples):
    # Tokenizing input and assigning labels as input ids
    model_inputs = tokenizer(examples['cleaned_message'], padding="max_length", truncation=True, max_length=64)
    model_inputs['labels'] = model_inputs['input_ids'].copy()  # T5 uses input IDs as labels
    return model_inputs

tokenized_dataset = email_dataset.map(tokenize_function, batched=True)

# Split the dataset into training and evaluation datasets (80/20 split)
# Split the dataset into training and evaluation datasets (80/20 split)
train_dataset, eval_dataset = tokenized_dataset.train_test_split(test_size=0.2).values()

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [6]:
# Set up faster training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,                   # Keep it minimal for quick training
    per_device_train_batch_size=2,        # Small batch size for faster training
    per_device_eval_batch_size=2,         # Small eval batch size
    warmup_steps=0,                       # No warmup for faster start
    weight_decay=0.01,
    logging_dir='./logs',
    gradient_accumulation_steps=1,         # No gradient accumulation
    logging_steps=1000,                     # Log less frequently
    evaluation_strategy="no",               # No evaluation during training
    save_strategy="no",                     # Skip saving model to save time
    fp16=True,                              # Use mixed precision (faster)
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Start training
trainer.train()

# Save the fine-tuned model
model.save_pretrained('./fine_tuned_model_3')
tokenizer.save_pretrained('./fine_tuned_model_3')

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
1000,0.0189
2000,0.0023


('./fine_tuned_model_3\\tokenizer_config.json',
 './fine_tuned_model_3\\special_tokens_map.json',
 './fine_tuned_model_3\\spiece.model',
 './fine_tuned_model_3\\added_tokens.json')

In [7]:
def generate_synthetic_email_with_context(prompt="Form of Memorandum of Option Attached is a copy of our proposed Memorandum of Option that we would like to  use for our land options.", max_length=300):
    # Tokenize the input prompt
    input_ids = tokenizer.encode(prompt, return_tensors='pt')

    # Generate text with more randomness and diversity
    output_ids = model.generate(
        input_ids,
        max_length=max_length,    # Allow the generation to be longer if necessary
        num_beams=1,              # Use random sampling instead of greedy search
        top_p=0.9,                # Nucleus sampling: top 90% probability mass
        top_k=50,                 # Restrict to top 50 tokens for sampling
        no_repeat_ngram_size=2,   # Avoid repeating the same n-grams
        temperature=1.0,          # Higher temperature for more randomness
        do_sample=True,           # Enable sampling for randomness
        num_return_sequences=5    # Generate multiple variations of the email
    )

    # Decode the generated ids back into text
    generated_emails = []
    for output in output_ids:
        email_text = tokenizer.decode(output, skip_special_tokens=True)
        generated_emails.append(email_text)

    return generated_emails

# Generate 5 synthetic emails
generated_emails = generate_synthetic_email_with_context(prompt="""| william.giuliani@enron.com | andrew.fastow@enron.com | 2001-06-07 07:48:00 |
Subject: Approval of the DPR Accelerated Put transaction
Dear Andrew,

Attached is the DASH for the approval of the DPR Accelerated Put transaction. This
partial divestiture allows us to put $11 million of our equity interest back to DPR
Holding Company, LLC, and its subsidiary, Dakota, LLC. Both entities are controlled
by Chris Cline.
In addition to redeeming part of our equity interest, the deal provides us with
900,000 tons of coal priced below market, an option which could lead to a very
profitable synfuel project, and the potential for more marketing fees from other
Cline entities.
The DASH has been approved and signed by RAC and JEDI II and is now awaiting
Mark Haedicke’s review and approval. I wanted to give you the opportunity to review
the DASH and become familiar with the provisions of the deal.
If you have any questions on the transaction, feel free to contact me at (412) 490-
9048. Others familiar with the deal are Mike Beyer, George McClellan, and Wayne
Gresham.
Thank you.
Best regards,
Bill Giuliani""")

# Print the generated emails
for i, email in enumerate(generated_emails, 1):
    print(f"Generated Email {i}:\n")
    print(email)
    print("\n" + "-"*50 + "\n")

Generated Email 1:

onII IIison the DASH on youichad.: Approval of the. Attached is the DEC on approval of DPR accelerated Put transaction Dear Andrew, Attachd.i@enron.com | andrew.fastow@enra.co.uk | 2001-06-07 07:48:00 | Subject: Approposition of RED for the approval LLC to them. Congratulations., and

--------------------------------------------------

Generated Email 2:

– the DASH, the purchase of the agreement is now awaiting Mark Haedicke’s review and approval. Thanks. I wanted to give you any questions on the transaction and become familiar with the provisions of that deal

--------------------------------------------------

Generated Email 3:

the DASH approval process, which means that the deal can take 900 000 tons of coal priced below market. Both entities are controlled by Chris Cline. Each entity is entitled to a full membership in the commercial agreement - as well as re-purchased's dash. All rights reserved. RR.

--------------------------------------------------

Gener