In [1]:
import pandas as pd
df = pd.read_csv("processed.csv")
df = df.drop(columns=['Unnamed: 0'])

In [2]:
from collections import Counter

In [3]:
df

Unnamed: 0,Text,Spam
0,subject naturally irresistible your corporate ...,1
1,subject the stock trading gunslinger fanny is ...,1
2,subject unbelievable new homes made easy im wa...,1
3,subject 4 color printing special request addit...,1
4,subject do not have money get software cds fro...,1
...,...,...
5723,subject re research and development charges to...,0
5724,subject re receipts from visit jim thanks agai...,0
5725,subject re enron case study update wow all on ...,0
5726,subject re interest david please call shirley ...,0


In [4]:
def build_vocab(texts,max_vocab_size=10000):
    words = " ".join(texts).split()
    freq = Counter(words)
    vocab = {'<PAD>':0,"<UNK>":1}
    for idx, (word,_) in enumerate(freq.most_common(max_vocab_size - 2),start = 2)  :
        vocab[word] = idx
    return vocab

In [5]:
vocab = build_vocab(df['Text'].tolist()) 

In [6]:
def encode_text(text, vocab, max_len=50):
    tokens = text.split()
    ids = [vocab.get(tok, vocab["<UNK>"]) for tok in tokens]
    if len(ids) < max_len:
        ids += [vocab["<PAD>"]] * (max_len - len(ids))
    else:
        ids = ids[:max_len]
    return ids

In [7]:
max_len = 50
df['encoded'] = df['Text'].apply(lambda x: encode_text(x, vocab, max_len))

In [8]:
len(df['encoded'][1])

50

In [9]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [10]:
X = torch.tensor(df['encoded'].tolist(), dtype=torch.long)
y = torch.tensor(df['Spam'].tolist(), dtype=torch.long)

In [11]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=3)

In [12]:
class SpamDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [13]:
train_ds = SpamDataset(X_train, y_train)
test_ds = SpamDataset(X_test, y_test)

In [14]:
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=32)

In [15]:
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)
    
    def forward(self, x):
        x = self.embedding(x)
        _, (h_n, _) = self.lstm(x)
        out = self.fc(h_n[-1])
        return out

In [16]:
vocab_size = len(vocab)
embed_dim = 128
hidden_dim = 128
num_classes = 2

In [17]:
model = LSTMClassifier(vocab_size, embed_dim, hidden_dim, num_classes)

In [18]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

epochs = 20
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}")

Epoch 1/20, Loss: 0.4191
Epoch 2/20, Loss: 0.1828
Epoch 3/20, Loss: 0.1064
Epoch 4/20, Loss: 0.0500
Epoch 5/20, Loss: 0.0327
Epoch 6/20, Loss: 0.0247
Epoch 7/20, Loss: 0.0317
Epoch 8/20, Loss: 0.0127
Epoch 9/20, Loss: 0.0174
Epoch 10/20, Loss: 0.0218
Epoch 11/20, Loss: 0.0790
Epoch 12/20, Loss: 0.0243
Epoch 13/20, Loss: 0.0090
Epoch 14/20, Loss: 0.0176
Epoch 15/20, Loss: 0.0116
Epoch 16/20, Loss: 0.0042
Epoch 17/20, Loss: 0.0046
Epoch 18/20, Loss: 0.0014
Epoch 19/20, Loss: 0.0012
Epoch 20/20, Loss: 0.0011


In [19]:
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        outputs = model(X_batch)
        preds = torch.argmax(outputs, dim=1)
        correct += (preds == y_batch).sum().item()
        total += y_batch.size(0)

print(f"Test Accuracy: {correct/total:.4f}")

Test Accuracy: 0.9782
