In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer
from datasets import load_dataset
from transformers import T5ForConditionalGeneration, Trainer
from transformers import TrainingArguments
import evaluate




In [3]:
df = pd.read_csv("/Users/abhishekwaghchaure/Desktop/Datasets/email/preprocessed_emails.csv")

df = df.dropna(subset=['processed_body', 'subject', 'response'])

In [4]:
df.head()

Unnamed: 0,file,headers,body,subject,from,to,clean_body,processed_body,date,response
0,semperger-c/deleted_items/46.,Message-ID: <30978077.1075841544706.JavaMail.e...,Time is running very short. Is your company p...,!!! OATI Etag 1.7 Minimum Requirements !!!,frank.billington@oatiinc.com,cara.semperger@enron.com,Time is running very short Is your company pre...,"[['Time', 'running', 'short', 'Is', 'company',...",2002-02-01 11:25:00-08:00,"[['Time', 'running', 'short', 'Is', 'company',..."
1,king-j/deleted_items/19.,Message-ID: <15244269.1075840797931.JavaMail.e...,Time is running very short. Is your company p...,!!! OATI Etag 1.7 Minimum Requirements !!!,frank.billington@oatiinc.com,jeff.king@enron.com,Time is running very short Is your company pre...,"[['Time', 'running', 'short', 'Is', 'company',...",2002-02-01 11:28:51-08:00,"[['Time', 'running', 'short', 'Is', 'company',..."
2,platter-p/inbox/43.,Message-ID: <394365.1075841413683.JavaMail.eva...,Time is running very short. Is your company p...,!!! OATI Etag 1.7 Minimum Requirements !!!,frank.billington@oatiinc.com,phillip.platter@enron.com,Time is running very short Is your company pre...,"[['Time', 'running', 'short', 'Is', 'company',...",2002-02-01 11:32:28-08:00,"[['Time', 'running', 'short', 'Is', 'company',..."
3,salisbury-h/inbox/196.,Message-ID: <19201127.1075841505530.JavaMail.e...,Immediately delete and DO NOT OPEN email \n \n...,!!!!!!!!!!!GONE.SCR VIRUS Warning!!!!!!!!!!!11,david.steiner@enron.com,center.dl-portland@enron.com,Immediately delete and DO NOT OPEN email From ...,"[['Immediately', 'delete', 'DO', 'NOT', 'OPEN'...",2001-12-04 11:49:46-08:00,"[['Immediately', 'delete', 'DO', 'NOT', 'OPEN'..."
4,kaminski-v/all_documents/1055.,Message-ID: <8575423.1075856206811.JavaMail.ev...,HENWOOD ANNOUNCES A MAJOR NEW RELEASE AND FUNC...,""" Henwood's Rationalizing Midwest Power Market...",cfarrell@hesinet.com,vkamins@ect.enron.com,HENWOOD ANNOUNCES A MAJOR NEW RELEASE AND FUNC...,"[['HENWOOD', 'ANNOUNCES', 'A', 'MAJOR', 'NEW',...",2001-03-19 03:17:00-08:00,"[['HENWOOD', 'ANNOUNCES', 'A', 'MAJOR', 'NEW',..."


In [5]:
df["input_text"] = "Email Body: " + df["processed_body"] + " Subject: " + df["subject"]
df["target_text"] = df["response"]

In [6]:
df['input_text'].head()

0    Email Body: [['Time', 'running', 'short', 'Is'...
1    Email Body: [['Time', 'running', 'short', 'Is'...
2    Email Body: [['Time', 'running', 'short', 'Is'...
3    Email Body: [['Immediately', 'delete', 'DO', '...
4    Email Body: [['HENWOOD', 'ANNOUNCES', 'A', 'MA...
Name: input_text, dtype: object

In [7]:
df['target_text'].head()

0    [['Time', 'running', 'short', 'Is', 'company',...
1    [['Time', 'running', 'short', 'Is', 'company',...
2    [['Time', 'running', 'short', 'Is', 'company',...
3    [['Immediately', 'delete', 'DO', 'NOT', 'OPEN'...
4    [['HENWOOD', 'ANNOUNCES', 'A', 'MAJOR', 'NEW',...
Name: target_text, dtype: object

In [8]:
# Split into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Save as Hugging Face Dataset format
train_df.to_csv("/Users/abhishekwaghchaure/Desktop/Datasets/email/train.csv", index=False)
test_df.to_csv("/Users/abhishekwaghchaure/Desktop/Datasets/email/test.csv", index=False)

In [9]:
train_data = load_dataset("csv", data_files="/Users/abhishekwaghchaure/Desktop/Datasets/email/train.csv")["train"]
test_data = load_dataset("csv", data_files="/Users/abhishekwaghchaure/Desktop/Datasets/email/test.csv")["train"]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [10]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Tokenize function
def preprocess_function(examples):
    inputs = tokenizer(examples["input_text"], max_length=512, truncation=True, padding="max_length")
    targets = tokenizer(examples["target_text"], max_length=128, truncation=True, padding="max_length")
    inputs["labels"] = targets["input_ids"]
    return inputs


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [11]:
test_data

Dataset({
    features: ['file', 'headers', 'body', 'subject', 'from', 'to', 'clean_body', 'processed_body', 'date', 'response', 'input_text', 'target_text'],
    num_rows: 67872
})

In [12]:
# Apply tokenization
train_data = train_data.map(preprocess_function, batched=True)
test_data = test_data.map(preprocess_function, batched=True)

# Remove unused columns
train_data = train_data.remove_columns(["input_text", "target_text"])
test_data = test_data.remove_columns(["input_text", "target_text"])

train_data.set_format("torch")
test_data.set_format("torch")

Map:   0%|          | 0/271484 [00:00<?, ? examples/s]

Map:   0%|          | 0/67872 [00:00<?, ? examples/s]

In [13]:
# Load metrics
rouge = evaluate.load("rouge")

# Define custom evaluation function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)

    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Calculate ROUGE scores
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    return {
        "rouge1": result["rouge1"].mid.fmeasure,
        "rouge2": result["rouge2"].mid.fmeasure,
        "rougeL": result["rougeL"].mid.fmeasure,
    }


In [14]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir="./logs"
)



In [15]:
# Load model
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics  # Use the custom evaluation function
)

  trainer = Trainer(


: 

In [None]:
trainer.train()

  0%|          | 0/101808 [00:00<?, ?it/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


In [None]:
model.save_pretrained("./email_response_model")
tokenizer.save_pretrained("./email_response_model")
print("Model and tokenizer saved successfully!")

## Generate Response

In [None]:
def generate_response(input_text):
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding="max_length", max_length=512)
    outputs = model.generate(inputs["input_ids"], max_length=128, num_beams=4, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test the model
test_input = "Email Body: Let's schedule a meeting. Subject: Meeting Request"
response = generate_response(test_input)
print("Generated Response:", response)