### Imports

In [52]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
import pandas as pd
from collections import Counter
from sklearn.metrics import classification_report

### Dataset class

In [53]:
class SpamOrHamDataset(Dataset):

    def __init__(self, csv_file, seq_length):
        self.seq_length = seq_length
        
        # Read CSV file and split into emails and labels
        data = pd.read_csv(csv_file)
        self.emails = data["email"].astype(str).tolist()
        self.labels = data["label"].astype(int).tolist()

        self.class_counts = Counter(self.labels)
        print(f"\nClass distribution: {self.class_counts}")
        print(f"Spam percentage: {self.class_counts[1]/len(self.labels)*100:.1f}%")
        print(f"Ham percentage: {self.class_counts[0]/len(self.labels)*100:.1f}%")

        # Tokenize emails
        self.tokenized_emails = [email.split() for email in self.emails]
        
        # Create vocabulary from all words in emails
        self.vocab = self.obtainUniqueWords(self.tokenized_emails)
        self.id2word = {i: w for i, w in enumerate(self.vocab)}
        self.word2id = {w: i for i, w in enumerate(self.vocab)}
        self.listOfIds = [self.word2id[w] for w in self.vocab]

        # Encode tokenized emails
        self.encoded_emails = [self.encode(email) for email in self.tokenized_emails]


    def obtainUniqueWords(self, tokenized_texts):
        # Find all unique words
        wordCounts = Counter()
        for words in tokenized_texts:
            wordCounts.update(words)

        unique_words = sorted(wordCounts, key=wordCounts.get, reverse=True)
        # Add padding token and unknown token
        return ["<PAD>", "<UNK>"] + unique_words
    
    def encode(self, text):
        ids = [self.word2id.get(token, 1) for token in text]

        # Pad or truncate to seq_length size
        if len(ids) > self.seq_length:
            ids = ids[:self.seq_length]
        else:
            ids += [0] * (self.seq_length - len(ids))

        return torch.tensor(ids, dtype=torch.long)

    def __len__(self):
        return len(self.emails)

    def __getitem__(self, index):
        return (
            self.encoded_emails[index],
            torch.tensor(self.labels[index], dtype=torch.long)
            )

### LSTM Model

In [54]:
class LSTM_Model(nn.Module):

    def __init__(self, num_embeddings, embedding_dim, hidden_dim, output_dim, num_layers):
        super(LSTM_Model, self).__init__()
        self.embedding = nn.Embedding(num_embeddings, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        x = self.embedding(x)
        output, (hidden, cell) = self.lstm(x)
        last_hidden = hidden[-1, :, :]
        out = self.fc(last_hidden)
        return out

### Train and test functions

In [55]:
def train_model(model, dataloader, epochs):
    model.train()

    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(epochs):
        epoch_loss = 0
        num_batches = 0

        for x, target in dataloader:
            optimizer.zero_grad()

            output = model(x)
            loss = criterion(output.squeeze(), target.float())
            
            # Backpropagation
            loss.backward()
            
            optimizer.step()

            epoch_loss += loss.item()
            num_batches += 1
        
        avg_loss = epoch_loss / num_batches
        print(f"Epoch [{epoch+1}/{epochs}] completed. Average Loss: {avg_loss:.4f}")

def test_model(model, dataloader):
    model.eval()
    predictions = []
    labels = []

    with torch.no_grad():
        for x, target in dataloader:
            output = model(x)
            pred = (torch.sigmoid(output) > 0.5).int()
            
            predictions.extend(pred)
            labels.extend(target)

    print(classification_report(labels, predictions, target_names=['Ham', 'Spam']))

### Set hyperparameters

In [56]:
EMBEDDING_DIM = 100
HIDDEN_DIM = 128
OUTPUT_DIM = 1  # For binary classification
NUM_LAYERS = 2
BATCH_SIZE = 32
EPOCHS = 25
SEQ_LENGTH = 75

### Train and test model

In [57]:
# Load dataset
dataset = SpamOrHamDataset("spam_or_not_spam_SMS.csv", seq_length=SEQ_LENGTH)

# Get vocabulary size from dataset
NUM_EMBEDDINGS = len(dataset.vocab)
print("\nDataset:")
print(f"Vocabulary size: {NUM_EMBEDDINGS}")
print(f"Number of samples: {len(dataset)}")

# Split into train and test data
train_size = int(0.7 * len(dataset))
test_size = (len(dataset) - train_size)
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Initialize model using hyperparameters
model = LSTM_Model(
    num_embeddings=NUM_EMBEDDINGS,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    num_layers=NUM_LAYERS
)

# Print model architecture
print("\nModel Architecture:")
print(model)
print(f"\nTrainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")

# Train model
print(f"\nTraining for {EPOCHS} epochs...")
train_model(model, train_loader, EPOCHS)

# Test model
print("\nTesting results:")
test_model(model, test_loader)


Class distribution: Counter({0: 4827, 1: 747})
Spam percentage: 13.4%
Ham percentage: 86.6%

Dataset:
Vocabulary size: 15735
Number of samples: 5574

Model Architecture:
LSTM_Model(
  (embedding): Embedding(15735, 100)
  (lstm): LSTM(100, 128, num_layers=2, batch_first=True)
  (fc): Linear(in_features=128, out_features=1, bias=True)
)

Trainable parameters: 1,823,485

Training for 25 epochs...
Epoch [1/25] completed. Average Loss: 0.4008
Epoch [2/25] completed. Average Loss: 0.3896
Epoch [3/25] completed. Average Loss: 0.3899
Epoch [4/25] completed. Average Loss: 0.3886
Epoch [5/25] completed. Average Loss: 0.3879
Epoch [6/25] completed. Average Loss: 0.3890
Epoch [7/25] completed. Average Loss: 0.3870
Epoch [8/25] completed. Average Loss: 0.3890
Epoch [9/25] completed. Average Loss: 0.3876
Epoch [10/25] completed. Average Loss: 0.3878
Epoch [11/25] completed. Average Loss: 0.3881
Epoch [12/25] completed. Average Loss: 0.3864
Epoch [13/25] completed. Average Loss: 0.3873
Epoch [14/25]