In [22]:
import pandas as pd
import numpy as np
import torchtext
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset
from collections import Counter
from torch.nn.utils.rnn import pad_sequence
from torchtext.data import Field, LabelField
from torchtext.data import TabularDataset, BucketIterator
import torch.nn.functional as F
import re

### Load Dataset

In [23]:
df = pd.read_csv("email.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [24]:
df.isna().sum()

Category    0
Message     0
dtype: int64

In [25]:
df.duplicated().sum()

415

In [26]:
df.drop_duplicates(inplace=True)
df.duplicated().sum()

0

### Preprocess Data

In [27]:
label_encoder = LabelEncoder()
df["Category"] = label_encoder.fit_transform(df["Category"])

In [28]:
# Tokenize the text
def tokenize(text):
    return [word for word in re.findall(r'\b\w+\b', text.lower())]

df['tokens'] = df['Message'].apply(tokenize)

In [29]:
# Build Vocabulary
all_tokens = [token for sublist in df['tokens'] for token in sublist]
vocab = {word: idx + 1 for idx, (word, _) in enumerate(Counter(all_tokens).items())}

In [30]:
# Convert token to indices
df['tokens_indices'] = df['tokens'].apply(lambda x: [vocab[word] for word in x])

In [31]:
# Pad sequence
max_len = 50
padded_token_indices = pad_sequence([torch.tensor(x) for x in df['tokens_indices']], batch_first=True, padding_value=0)
padded_token_indices = padded_token_indices[:, :max_len]

In [32]:
# Convert to tensor
input_ids = torch.tensor(padded_token_indices, dtype=torch.long)
labels = torch.tensor(df['Category'].values, dtype=torch.long)

  input_ids = torch.tensor(padded_token_indices, dtype=torch.long)


In [33]:
# Split data
train_inputs, test_inputs, train_labels, val_labels = train_test_split(input_ids, labels, test_size=0.1, random_state=42)

In [34]:
# Create DataLoader
BATCH_SIZE = 32

train_data = TensorDataset(train_inputs, train_labels)
train_dataloader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)

val_data = TensorDataset(test_inputs, val_labels)
val_dataloader = DataLoader(dataset=val_data, batch_size=BATCH_SIZE, shuffle=False)

### Define Model

In [35]:
import torch.nn as nn

class SpamClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers=2, bidirectional=True, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout= nn.Dropout(dropout)
    
    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        out = self.dropout(lstm_out[:, -1, :])
        out = self.fc(out)
        return out

vocab_size = len(vocab) + 1
embedding_dim = 50
hidden_dim = 128
output_dim = len(df['Category'].unique())

model = SpamClassifier(vocab_size, embedding_dim, hidden_dim, output_dim)

# Loss function
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [36]:
# Move the model to GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

SpamClassifier(
  (embedding): Embedding(8752, 50)
  (lstm): LSTM(50, 128, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=256, out_features=3, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [37]:
# Training Loop
from tqdm import tqdm

epochs = 10

for epoch in tqdm(range(epochs)):
    total_loss = 0
    for batch in train_dataloader:
        b_input_ids, b_labels = tuple(t.to(device) for t in batch)
        
        optimizer.zero_grad()
        
        output = model(b_input_ids)
        
        # Ensure labels are in long format
        loss = loss_fn(output, b_labels.long())
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    avg_train_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {avg_train_loss}')

 10%|█         | 1/10 [00:01<00:10,  1.14s/it]

Epoch 1/10, Loss: 0.4039271040628218


 20%|██        | 2/10 [00:02<00:07,  1.02it/s]

Epoch 2/10, Loss: 0.3826319929977802


 30%|███       | 3/10 [00:02<00:06,  1.09it/s]

Epoch 3/10, Loss: 0.38809258046827905


 40%|████      | 4/10 [00:03<00:05,  1.13it/s]

Epoch 4/10, Loss: 0.38175219578081615


 50%|█████     | 5/10 [00:04<00:04,  1.17it/s]

Epoch 5/10, Loss: 0.3799057262503121


 60%|██████    | 6/10 [00:05<00:03,  1.21it/s]

Epoch 6/10, Loss: 0.3803317364152164


 70%|███████   | 7/10 [00:06<00:02,  1.24it/s]

Epoch 7/10, Loss: 0.3777688991737692


 80%|████████  | 8/10 [00:06<00:01,  1.26it/s]

Epoch 8/10, Loss: 0.37900563395799025


 90%|█████████ | 9/10 [00:07<00:00,  1.28it/s]

Epoch 9/10, Loss: 0.3695071603949756


100%|██████████| 10/10 [00:08<00:00,  1.20it/s]

Epoch 10/10, Loss: 0.28547760053244353





In [38]:
# Evaluate Model
model.eval()

eval_loss = 0
eval_acc = 0

for batch in val_dataloader:
    b_input_ids, b_labels = tuple(t.to(device) for t in batch)
    
    with torch.inference_mode():
        output = model(b_input_ids)

        loss = loss_fn(output, b_labels.long())
        eval_loss += loss.item()

        preds = torch.argmax(output, dim=1).flatten()
        eval_acc += (preds == b_labels).cpu().numpy().mean()

avg_eval_loss = eval_loss / len(val_dataloader)
avg_eval_acc = eval_acc / len(val_dataloader)
print(f'Loss: {avg_eval_loss}, Accuracy: {avg_eval_acc}')

Loss: 0.24414510761990266, Accuracy: 0.8897058823529411


In [39]:
# Save Model
torch.save(model.state_dict(), 'spam_classifier.pt')