In [2]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from pathlib import Path
import numpy as np


In [3]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

class TextCleaner:
    '''Class for cleaning Text'''
    def __init__(self, currency_symbols, stop_words=None, lemmatizer=None):
        self.currency_symbols = currency_symbols
        
        if stop_words is None:
            self.stop_words = set(stopwords.words('english'))
        else:
            self.stop_words = stop_words
        
        if lemmatizer is None:
            self.lemmatizer = WordNetLemmatizer()
        else:
            self.lemmatizer = lemmatizer
    

    # functions for removing punctuations
    def remove_punctuation(self,text):
        return text.translate(str.maketrans('', '', string.punctuation))
    

    # Functions for cleaning text
    def clean_text(self, text):
        text = text.lower()
        text = re.sub(self.currency_symbols, 'currency', text)
        text = self.remove_punctuation(text)
        text = re.compile('<.*?>').sub('', text)
        text = text.replace('_', '')
        text = re.sub(r'[^\w\s]', '', text)
        text = re.sub(r'\d', ' ', text)
        text = re.sub(r'\s+', ' ', text).strip()
        text = ' '.join(word for word in text.split() if word not in self.stop_words)
        text = ' '.join(self.lemmatizer.lemmatize(word) for word in text.split())
        
        return text

In [4]:
train_path = Path("../ML Engineer/train.csv")
df = pd.read_csv(train_path)
print(df.head())

                                    id_  source  \
0  5732aa7f-0c44-4a4f-877a-0488aed0d1f7       2   
1  4d3c392d-a4f0-465d-baa3-2c15f1560f07       2   
2  d47e95c0-4909-41b8-aec8-a3fb953fa18f       4   
3  658a83eb-689c-480a-ae31-d622dc83f9f8       6   
4  179d10b7-1c43-4e10-a0be-18d205b0fe24       4   

                                               email     class  
0  Subject: is the supply rebound beginning ? an ...  not_spam  
1  Subject: email list - 100 million addresses $ ...      spam  
2  Subject: alley dodecahedra suicide\nare you re...      spam  
3  Subject: ibuyit project\ni wanted to share som...  not_spam  
4  Subject: cheap vicodin online - us fda pharmac...      spam  


In [5]:
encoder = LabelEncoder()
# Fit and transform the labels to numeric values
df['target'] = encoder.fit_transform(df['class'])
df.head()

Unnamed: 0,id_,source,email,class,target
0,5732aa7f-0c44-4a4f-877a-0488aed0d1f7,2,Subject: is the supply rebound beginning ? an ...,not_spam,0
1,4d3c392d-a4f0-465d-baa3-2c15f1560f07,2,Subject: email list - 100 million addresses $ ...,spam,1
2,d47e95c0-4909-41b8-aec8-a3fb953fa18f,4,Subject: alley dodecahedra suicide\nare you re...,spam,1
3,658a83eb-689c-480a-ae31-d622dc83f9f8,6,Subject: ibuyit project\ni wanted to share som...,not_spam,0
4,179d10b7-1c43-4e10-a0be-18d205b0fe24,4,Subject: cheap vicodin online - us fda pharmac...,spam,1


In [6]:
# clean the training text
currency_symbols = r'[\$\£\€\¥\₹\¢\₽\₩\₪]'  
text_cleaner = TextCleaner(currency_symbols)
df['clean_text'] = df['email'].apply(lambda x: text_cleaner.clean_text(x))
print(df.head())

                                    id_  source  \
0  5732aa7f-0c44-4a4f-877a-0488aed0d1f7       2   
1  4d3c392d-a4f0-465d-baa3-2c15f1560f07       2   
2  d47e95c0-4909-41b8-aec8-a3fb953fa18f       4   
3  658a83eb-689c-480a-ae31-d622dc83f9f8       6   
4  179d10b7-1c43-4e10-a0be-18d205b0fe24       4   

                                               email     class  target  \
0  Subject: is the supply rebound beginning ? an ...  not_spam       0   
1  Subject: email list - 100 million addresses $ ...      spam       1   
2  Subject: alley dodecahedra suicide\nare you re...      spam       1   
3  Subject: ibuyit project\ni wanted to share som...  not_spam       0   
4  Subject: cheap vicodin online - us fda pharmac...      spam       1   

                                          clean_text  
0  subject supply rebound beginning update cera o...  
1  subject email list million address currency ja...  
2  subject alley dodecahedra suicide ready lomse ...  
3  subject ibuyit project wa

In [7]:
X = df['clean_text'].to_list()
y = df['target'].to_list()

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:
# Tokenize the text data (basic tokenization, use more advanced tokenizers for better performance)
def tokenize(text):
    return text.split()

In [9]:
# Build vocabulary
vocab = set(token for text in X_train for token in tokenize(text))
print(vocab)



In [10]:
vocab = {word: i+1 for i, word in enumerate(vocab)}  # index 0 is reserved for padding
print(vocab)



In [11]:
# Convert text to sequence of indices
def text_to_sequence(text, vocab):
    return [vocab.get(token, 0) for token in tokenize(text)]

X_train_seq = [text_to_sequence(text, vocab) for text in X_train]
X_test_seq = [text_to_sequence(text, vocab) for text in X_test]


In [12]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, max_len=1000):
        self.texts = texts
        self.labels = labels
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        # Pad sequences
        if len(text) < self.max_len:
            text = text + [0] * (self.max_len - len(text))
        else:
            text = text[:self.max_len]
        
        return torch.tensor(text, dtype=torch.long), torch.tensor(label, dtype=torch.long)
    

In [13]:
class TextClassificationModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super(TextClassificationModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.fc = nn.Linear(embed_dim, num_class)
    
    def forward(self, text):
        embedded = self.embedding(text)
        pooled = torch.mean(embedded, dim=1)
        return self.fc(pooled)

In [17]:
import torch
import torch.nn as nn

class TextClassificationModelwithLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_class, num_layers):
        super(TextClassificationModelwithLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        
        # Add bidirectional LSTM layer
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, bidirectional=True, batch_first=True)
        
        # Update the fully connected layer input size due to bidirectional LSTM
        self.fc = nn.Linear(hidden_dim * 2, num_class)
    
    def forward(self, text):
        embedded = self.embedding(text)
        
        # LSTM layer
        lstm_out, _ = self.lstm(embedded)
        
        # Use the output from the last LSTM unit for classification
        final_out = lstm_out[:, -1, :]
        
        return self.fc(final_out)


In [18]:
# Parameters
vocab_size = len(vocab) + 1  # +1 for padding token
embed_dim = 128
num_class = len(encoder.classes_)

# Hyperparameters
learning_rate = 0.001
batch_size = 16
epochs = 10

print(vocab_size, num_class)
# Model instance
model = TextClassificationModel(vocab_size, embed_dim, num_class)

83992 2


In [19]:
# Parameters
vocab_size = len(vocab) + 1  # +1 for padding token
embed_dim = 128
num_class = len(encoder.classes_)

# Hyperparameters
learning_rate = 0.001
batch_size = 16
epochs = 10

print(vocab_size, num_class)


model = TextClassificationModelwithLSTM(vocab_size, embed_dim, hidden_dim=256, num_class=num_class, num_layers=2)

83992 2


In [20]:
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# DataLoader
train_dataset = TextDataset(X_train_seq, y_train,max_len=150)
test_dataset = TextDataset(X_test_seq, y_test,max_len=150)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


model.train()

TextClassificationModelwithLSTM(
  (embedding): Embedding(83992, 128)
  (lstm): LSTM(128, 256, num_layers=2, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=512, out_features=2, bias=True)
)

In [21]:
(train_dataset[120][0])

tensor([22084, 44248, 67193,  2058, 52139, 20973, 34725, 16083, 32957, 28886,
        34835, 20973, 41815, 55352, 56136,  3907, 29770, 52544, 34835, 15127,
         6424, 12393, 23144, 34835, 11627, 17487, 52702, 17487, 70237, 42969,
        60568, 58647, 49175, 58605, 13549, 43946, 32957, 52234, 28886,  2190,
        23212, 32957, 17651, 28886, 37307, 17487, 43672, 80790, 52234, 28886,
        15671, 17487, 70237, 42969, 58605, 45954, 50729, 29449, 58605, 72432,
        55403, 20973, 76214, 17487, 34835, 11942, 34049, 52234, 28886, 72424,
        17487, 66334, 13549, 32957, 52234, 28886, 34835, 17651, 74289, 13919,
        32957, 28886, 24882,  7313,  4049,  7313, 34896,  7313, 83102,  7313,
        52234,  7313, 28886, 53279,  1727, 34171,  7313, 48645,  7313, 50412,
         7313, 62805,  7313, 52234,  7313, 28886,  1727,  8233, 53279,  7313,
        15341,  7313, 11734,  7313,  6423,  7313, 52234,  7313, 28886,  8233,
        14987, 83102,  7313, 63593,  7313,  4049,  7313, 34896, 

In [22]:
assert len(X_train_seq) == len(y_train), "Mismatch between number of training samples and labels"


In [23]:
train_dataset[11306]

(tensor([22084, 65813, 50505, 23564, 75460, 18747, 51061, 52352, 81833, 70858,
         83156, 44863, 21325, 55938, 78065, 82001, 69317, 29412,  9388, 57006,
         55419, 39810, 35523, 60279, 15166, 46688, 80946, 36311, 75875,  8487,
         60224, 76005, 69087, 66412, 36604, 31625, 48786, 54308, 23045,  2384,
         19337, 67801, 47920, 37702, 52521, 49412, 83527, 37093, 20487, 29311,
         66671,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

In [24]:
for epoch in range(epochs):
    total_loss = 0
    for texts, labels in tqdm(train_loader):
        optimizer.zero_grad()
        output = model(texts)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}')

100%|██████████| 750/750 [16:40<00:00,  1.33s/it]  


Epoch 1/10, Loss: 0.4671


100%|██████████| 750/750 [05:00<00:00,  2.50it/s]


Epoch 2/10, Loss: 0.0876


100%|██████████| 750/750 [04:55<00:00,  2.54it/s]


Epoch 3/10, Loss: 0.0300


100%|██████████| 750/750 [04:53<00:00,  2.55it/s]


Epoch 4/10, Loss: 0.0112


100%|██████████| 750/750 [05:05<00:00,  2.45it/s]


Epoch 5/10, Loss: 0.0019


100%|██████████| 750/750 [04:58<00:00,  2.51it/s]


Epoch 6/10, Loss: 0.0100


100%|██████████| 750/750 [04:56<00:00,  2.53it/s]


Epoch 7/10, Loss: 0.0079


100%|██████████| 750/750 [04:58<00:00,  2.51it/s]


Epoch 8/10, Loss: 0.0021


100%|██████████| 750/750 [05:01<00:00,  2.48it/s]


Epoch 9/10, Loss: 0.0004


100%|██████████| 750/750 [05:05<00:00,  2.45it/s]

Epoch 10/10, Loss: 0.0000





In [68]:
model.eval()
y_pred = []
y_true = []

with torch.no_grad():
    for texts, labels in test_loader:
        outputs = model(texts)
        _, predicted = torch.max(outputs, 1)
        y_pred.extend(predicted.tolist())
        y_true.extend(labels.tolist())

accuracy = accuracy_score(y_true, y_pred)
print(f'Accuracy: {accuracy:.4f}')

Accuracy: 0.9857


In [69]:
def predict(text, model, vocab, max_len=50):
    model.eval()
    text_seq = text_to_sequence(text, vocab)
    if len(text_seq) < max_len:
        text_seq = text_seq + [0] * (max_len - len(text_seq))
    else:
        text_seq = text_seq[:max_len]
    text_tensor = torch.tensor([text_seq], dtype=torch.long)
    
    with torch.no_grad():
        output = model(text_tensor)
        _, predicted = torch.max(output, 1)
    
    return encoder.inverse_transform(predicted.tolist())[0]

# Example prediction
print(predict("I enjoy learning new things", model, vocab))

spam
