In [42]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer
import pandas as pd
from tqdm import tqdm

In [29]:
class EmailDataset(Dataset):
    """Map-style dataset that turns one dataframe row into tokenized tensors.

    Each row must provide the columns ``processed_body``, ``subject`` and
    ``response``. The body and subject are merged into a single prompt string;
    the response is the training target.

    Args:
        dataframe: Table of emails, indexed positionally via ``.iloc``.
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., truncation=True,
            padding="max_length", return_tensors="pt")`` that returns a mapping
            with ``"input_ids"`` and ``"attention_mask"`` entries.
        max_len: Length passed to the tokenizer as ``max_length`` for both the
            prompt and the target.
    """

    def __init__(self, dataframe, tokenizer, max_len=512):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the wrapped dataframe."""
        return len(self.data)

    def _encode(self, text):
        """Tokenize ``text`` with the shared truncation/padding settings."""
        return self.tokenizer(
            text,
            max_length=self.max_len,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

    def __getitem__(self, index):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for row ``index``.

        The leading dimension of each tokenizer output is removed with
        ``squeeze(0)`` before it is returned.
        """
        record = self.data.iloc[index]
        prompt = f"Email Body: {record['processed_body']} Subject: {record['subject']}"

        # Prompt first, target second (same call order as the tokenizer sees in training).
        encoded_prompt = self._encode(prompt)
        encoded_target = self._encode(record['response'])

        sample = {}
        sample["input_ids"] = encoded_prompt["input_ids"].squeeze(0)
        sample["attention_mask"] = encoded_prompt["attention_mask"].squeeze(0)
        sample["labels"] = encoded_target["input_ids"].squeeze(0)
        return sample


In [30]:
# Load the preprocessed email table and keep only rows where every column
# that EmailDataset reads (body, subject, response) is present.
REQUIRED_COLUMNS = ['processed_body', 'subject', 'response']
df = pd.read_csv(
    "/Users/abhishekwaghchaure/Desktop/Datasets/email/preprocessed_emails.csv"
).dropna(subset=REQUIRED_COLUMNS)

In [31]:
# Hold out 20% of the rows as a test split; the fixed random_state makes the
# partition repeatable across runs.
splits = train_test_split(df, random_state=42, test_size=0.2)
train_df, test_df = splits

# Tokenizer loaded from the pretrained "t5-small" checkpoint.
tokenizer = T5Tokenizer.from_pretrained("t5-small")



In [32]:
# Wrap each split in an EmailDataset (default max_len) and batch it.
BATCH_SIZE = 8

train_dataset, test_dataset = (
    EmailDataset(frame, tokenizer) for frame in (train_df, test_df)
)

# Only the training loader is shuffled; the test loader keeps row order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [33]:
# Sanity check: number of examples in the train and test splits, one per line.
print(len(train_dataset), len(test_dataset), sep="\n")

271484
67872


## Custom Seq2Seq Encoder Decoder

In [34]:
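# NOTE: earlier draft of EmailResponseGenerator, kept commented out for reference only.
# The active implementation is the uncommented class defined in the next cell.
# class EmailResponseGenerator(nn.Module):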
#     def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
#         super(EmailResponseGenerator, self).__init__()
#         self.embedding = nn.Embedding(vocab_size, embedding_dim)
#         self.encoder = nn.GRU(
#             input_size=embedding_dim, hidden_size=hidden_dim, num_layers=num_layers, batch_first=True, bidirectional=True
#         )
#         self.decoder = nn.GRU(
#             input_size=embedding_dim, hidden_size=hidden_dim * 2, num_layers=num_layers, batch_first=True
#         )
#         self.fc = nn.Linear(hidden_dim * 2, vocab_size)

#     # def forward(self, input_ids, attention_mask, labels=None):
#     #     embedded_inputs = self.embedding(input_ids)
#     #     encoder_outputs, hidden = self.encoder(embedded_inputs)
#     #     decoder_inputs = self.embedding(labels) if labels is not None else encoder_outputs[:, 0:1, :]
#     #     decoder_outputs, _ = self.decoder(decoder_inputs, hidden)
#     #     logits = self.fc(decoder_outputs)
#     #     return logits
    
#         def forward(self, input_ids, attention_mask, labels=None):
#             # Embedding Layer
#             embedded_inputs = self.embedding(input_ids)  # (batch_size, seq_len, embedding_dim)
            
#             # Encoder
#             encoder_outputs, hidden = self.encoder(embedded_inputs)  # (batch_size, seq_len, hidden_dim * 2)

#             # Fix hidden state for decoder (merge bidirectional states)
#             if self.encoder.bidirectional:
#                 hidden = hidden.view(self.encoder.num_layers, 2, input_ids.size(0), -1)  # (num_layers, 2, batch_size, hidden_dim)
#                 hidden = torch.cat((hidden[:, 0], hidden[:, 1]), dim=-1)  # (num_layers, batch_size, hidden_dim)

#             # Decoder Input
#             decoder_inputs = self.embedding(labels) if labels is not None else encoder_outputs[:, 0:1, :]  # (batch_size, 1, embedding_dim)
            
#             # Decoder
#             decoder_outputs, _ = self.decoder(decoder_inputs, hidden)  # (batch_size, seq_len, hidden_dim * 2)

#             # Output Layer
#             logits = self.fc(decoder_outputs)  # (batch_size, seq_len, vocab_size)
#             return logits

In [38]:
class EmailResponseGenerator(nn.Module):
    """GRU encoder-decoder that maps prompt token ids to response-token logits.

    The encoder is a bidirectional GRU; its final forward and backward hidden
    states are concatenated per layer and used as the initial hidden state of a
    unidirectional decoder GRU whose hidden size is ``2 * hidden_dim``. Encoder
    and decoder share a single embedding table.

    Args:
        vocab_size: Number of rows in the embedding table and number of output
            logits per position.
        embedding_dim: Width of the token embeddings (decoder/encoder input size).
        hidden_dim: Hidden size of each encoder direction.
        num_layers: Number of stacked GRU layers in both encoder and decoder.
        decoder_start_token_id: Token id fed to the decoder at the first step.
            Defaults to 0.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, decoder_start_token_id=0):
        super().__init__()
        # Plain attribute (not a parameter/buffer), so the state_dict layout is unchanged.
        self.decoder_start_token_id = decoder_start_token_id

        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # Encoder: bidirectional GRU
        self.encoder = nn.GRU(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True
        )

        # Decoder: GRU whose hidden size matches the concatenated encoder directions
        self.decoder = nn.GRU(
            input_size=embedding_dim,
            hidden_size=hidden_dim * 2,
            num_layers=num_layers,
            batch_first=True
        )

        # Projects decoder states to vocabulary logits
        self.fc = nn.Linear(hidden_dim * 2, vocab_size)

    def _merge_directions(self, hidden):
        """Reshape encoder h_n from (layers*2, B, H) to (layers, B, 2H)."""
        if not self.encoder.bidirectional:
            return hidden
        batch_size = hidden.size(1)
        hidden = hidden.view(self.encoder.num_layers, 2, batch_size, -1)
        return torch.cat((hidden[:, 0], hidden[:, 1]), dim=-1)

    def _shift_right(self, labels):
        """Prepend the start token and drop the last label: [start, y0, ..., y(n-2)]."""
        start = labels.new_full((labels.size(0), 1), self.decoder_start_token_id)
        return torch.cat((start, labels[:, :-1]), dim=1)

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Compute vocabulary logits for each decoder position.

        Args:
            input_ids: Integer tensor of prompt token ids, ``(batch, src_len)``.
            attention_mask: Accepted so batches can be passed through unchanged,
                but not used; padded prompt positions are encoded like any
                other token.
            labels: Optional integer tensor of target token ids,
                ``(batch, tgt_len)``. When given, the decoder is teacher-forced
                on the labels shifted right by one position.

        Returns:
            Logits of shape ``(batch, tgt_len, vocab_size)`` when ``labels`` is
            given, otherwise ``(batch, 1, vocab_size)`` for the first decoding
            step.
        """
        embedded_inputs = self.embedding(input_ids)  # (batch, src_len, embedding_dim)

        # Only the final hidden state is needed to initialise the decoder.
        _, hidden = self.encoder(embedded_inputs)
        hidden = self._merge_directions(hidden)  # (num_layers, batch, hidden_dim * 2)

        if labels is not None:
            # Feeding labels[t] while predicting labels[t] would let the decoder
            # simply copy its input; shift so step t only sees labels[:t].
            decoder_input_ids = self._shift_right(labels)
        else:
            # No targets: run a single step from the start token. (Raw encoder
            # outputs cannot be used here, their width is 2 * hidden_dim while
            # the decoder expects embedding_dim.)
            decoder_input_ids = input_ids.new_full(
                (input_ids.size(0), 1), self.decoder_start_token_id
            )

        decoder_inputs = self.embedding(decoder_input_ids)  # (batch, steps, embedding_dim)
        decoder_outputs, _ = self.decoder(decoder_inputs, hidden)  # (batch, steps, hidden_dim * 2)

        return self.fc(decoder_outputs)  # (batch, steps, vocab_size)

In [40]:
# Hyperparameters for the custom seq2seq model and the training run.
vocab_size = tokenizer.vocab_size  # size of the embedding table and of the output layer
embedding_dim, hidden_dim, num_layers = 256, 512, 2
epochs = 1

model = EmailResponseGenerator(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
)

In [39]:
# Select the compute device. Apple-silicon MPS is preferred (the original
# setting), but fall back to CUDA and then CPU so the notebook also runs on
# machines without an MPS backend instead of failing in `model.to(...)`.
_mps_backend = getattr(torch.backends, "mps", None)  # absent on older torch builds
if _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
# Padding positions in the targets contribute nothing to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)


In [47]:
# Training loop: one optimizer step per batch, tracking loss and token-level
# accuracy over non-padding target positions.
for epoch in range(epochs):
    model.train()
    total_loss = 0.0   # Sum of per-batch losses for this epoch
    num_batches = 0    # Batches processed, used to report the mean loss
    correct = 0        # Correctly predicted non-padding tokens
    total_tokens = 0   # Total number of valid tokens (non-padding)

    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}", colour="blue")

    for batch in progress_bar:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

        # Flatten to (batch_size * seq_length, vocab_size) / (batch_size * seq_length,)
        logits = outputs.reshape(-1, outputs.size(-1))
        target_labels = labels.reshape(-1)

        loss = criterion(logits, target_labels)
        loss.backward()
        optimizer.step()

        # Metrics only: no gradients needed.
        with torch.no_grad():
            predictions = logits.argmax(dim=-1)
            mask = target_labels != tokenizer.pad_token_id  # Ignore padding tokens
            correct += (predictions == target_labels)[mask].sum().item()
            total_tokens += mask.sum().item()

        batch_loss = loss.item()  # Single host sync, reused below
        total_loss += batch_loss
        num_batches += 1

        # Guard against a batch stream with no non-padding targets so far.
        running_accuracy = correct / total_tokens if total_tokens else 0.0
        progress_bar.set_postfix(loss=f"{batch_loss:.4f}", accuracy=f"{running_accuracy:.4f}")

    # Epoch statistics: report the mean batch loss rather than the raw sum,
    # so the number is comparable across dataset sizes.
    accuracy = correct / total_tokens if total_tokens else 0.0
    mean_loss = total_loss / num_batches if num_batches else 0.0
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {mean_loss:.4f}, Accuracy: {accuracy:.4f}")

Epoch 1/1:   0%|[34m          [0m| 33/33936 [03:52<66:17:47,  7.04s/it, accuracy=0.0000, loss=10.3588]


KeyboardInterrupt: 

In [None]:
def generate_response(input_text, max_new_tokens=100):
    """Greedily decode a response for a single prompt string.

    Uses the module-level ``model``, ``tokenizer`` and ``device``. The prompt is
    tokenized with the same length/padding settings as the training data,
    encoded once, and then decoded one token at a time, always picking the
    highest-scoring token.

    Args:
        input_text: Prompt string to respond to.
        max_new_tokens: Upper bound on the number of decoding steps.

    Returns:
        The decoded response text with special tokens removed. Decoding stops
        early when the tokenizer's EOS token is produced.
    """
    model.eval()
    inputs = tokenizer(
        input_text, return_tensors="pt", max_length=512, truncation=True, padding="max_length"
    ).to(device)
    input_ids = inputs["input_ids"]

    with torch.no_grad():
        embedded_inputs = model.embedding(input_ids)
        _, hidden = model.encoder(embedded_inputs)

        # The encoder returns (num_layers * 2, batch, hidden_dim) when it is
        # bidirectional, but the decoder expects (num_layers, batch, hidden_dim * 2):
        # concatenate the two directions per layer, as the model's forward() does.
        if model.encoder.bidirectional:
            hidden = hidden.view(model.encoder.num_layers, 2, input_ids.size(0), -1)
            hidden = torch.cat((hidden[:, 0], hidden[:, 1]), dim=-1)

        # Start decoding from a dedicated start token rather than from the first
        # prompt token. Use the model's own start id when it defines one,
        # otherwise the tokenizer's pad id.
        start_id = getattr(model, "decoder_start_token_id", tokenizer.pad_token_id)
        next_token = torch.full(
            (input_ids.size(0), 1), start_id, dtype=torch.long, device=input_ids.device
        )

        generated_ids = []
        for _ in range(max_new_tokens):
            decoder_outputs, hidden = model.decoder(model.embedding(next_token), hidden)
            logits = model.fc(decoder_outputs)
            next_token = torch.argmax(logits, dim=-1)  # (1, 1)
            token_id = next_token.item()
            if token_id == tokenizer.eos_token_id:
                break
            generated_ids.append(token_id)

    # A plain list of ints decodes correctly for zero, one or many tokens.
    return tokenizer.decode(generated_ids, skip_special_tokens=True)


In [None]:
# Smoke test: generate a reply for a hand-written prompt in the training format.
test_input = (
    "Email Body: Let's schedule a meeting. "
    "Subject: Meeting Request"
)
response = generate_response(test_input)
print(f"Generated Response: {response}")

In [None]:
## Save Model
# Only the parameters (state_dict) are written, not the module object itself,
# so loading requires constructing EmailResponseGenerator first.
model_path = "email_response_generator.pt"
weights = model.state_dict()
torch.save(weights, model_path)
print("Model saved to", model_path)