In [24]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from pathlib import Path
import numpy as np


In [2]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

class TextCleaner:
    '''Class for cleaning Text'''
    def __init__(self, currency_symbols, stop_words=None, lemmatizer=None):
        self.currency_symbols = currency_symbols
        
        if stop_words is None:
            self.stop_words = set(stopwords.words('english'))
        else:
            self.stop_words = stop_words
        
        if lemmatizer is None:
            self.lemmatizer = WordNetLemmatizer()
        else:
            self.lemmatizer = lemmatizer
    

    # functions for removing punctuations
    def remove_punctuation(self,text):
        return text.translate(str.maketrans('', '', string.punctuation))
    

    # Functions for cleaning text
    def clean_text(self, text):
        text = text.lower()
        text = re.sub(self.currency_symbols, 'currency', text)
        text = self.remove_punctuation(text)
        text = re.compile('<.*?>').sub('', text)
        text = text.replace('_', '')
        text = re.sub(r'[^\w\s]', '', text)
        text = re.sub(r'\d', ' ', text)
        text = re.sub(r'\s+', ' ', text).strip()
        text = ' '.join(word for word in text.split() if word not in self.stop_words)
        text = ' '.join(self.lemmatizer.lemmatize(word) for word in text.split())
        
        return text

In [3]:
train_path = Path("../ML Engineer/train.csv")
df = pd.read_csv(train_path)
print(df.head())

                                    id_  source  \
0  5732aa7f-0c44-4a4f-877a-0488aed0d1f7       2   
1  4d3c392d-a4f0-465d-baa3-2c15f1560f07       2   
2  d47e95c0-4909-41b8-aec8-a3fb953fa18f       4   
3  658a83eb-689c-480a-ae31-d622dc83f9f8       6   
4  179d10b7-1c43-4e10-a0be-18d205b0fe24       4   

                                               email     class  
0  Subject: is the supply rebound beginning ? an ...  not_spam  
1  Subject: email list - 100 million addresses $ ...      spam  
2  Subject: alley dodecahedra suicide\nare you re...      spam  
3  Subject: ibuyit project\ni wanted to share som...  not_spam  
4  Subject: cheap vicodin online - us fda pharmac...      spam  


In [4]:
encoder = LabelEncoder()
# Fit and transform the labels to numeric values
df['target'] = encoder.fit_transform(df['class'])
df.head()

Unnamed: 0,id_,source,email,class,target
0,5732aa7f-0c44-4a4f-877a-0488aed0d1f7,2,Subject: is the supply rebound beginning ? an ...,not_spam,0
1,4d3c392d-a4f0-465d-baa3-2c15f1560f07,2,Subject: email list - 100 million addresses $ ...,spam,1
2,d47e95c0-4909-41b8-aec8-a3fb953fa18f,4,Subject: alley dodecahedra suicide\nare you re...,spam,1
3,658a83eb-689c-480a-ae31-d622dc83f9f8,6,Subject: ibuyit project\ni wanted to share som...,not_spam,0
4,179d10b7-1c43-4e10-a0be-18d205b0fe24,4,Subject: cheap vicodin online - us fda pharmac...,spam,1


In [5]:
# clean the training text
currency_symbols = r'[\$\£\€\¥\₹\¢\₽\₩\₪]'  
text_cleaner = TextCleaner(currency_symbols)
df['clean_text'] = df['email'].apply(lambda x: text_cleaner.clean_text(x))
print(df.head())

                                    id_  source  \
0  5732aa7f-0c44-4a4f-877a-0488aed0d1f7       2   
1  4d3c392d-a4f0-465d-baa3-2c15f1560f07       2   
2  d47e95c0-4909-41b8-aec8-a3fb953fa18f       4   
3  658a83eb-689c-480a-ae31-d622dc83f9f8       6   
4  179d10b7-1c43-4e10-a0be-18d205b0fe24       4   

                                               email     class  target  \
0  Subject: is the supply rebound beginning ? an ...  not_spam       0   
1  Subject: email list - 100 million addresses $ ...      spam       1   
2  Subject: alley dodecahedra suicide\nare you re...      spam       1   
3  Subject: ibuyit project\ni wanted to share som...  not_spam       0   
4  Subject: cheap vicodin online - us fda pharmac...      spam       1   

                                          clean_text  
0  subject supply rebound beginning update cera o...  
1  subject email list million address currency ja...  
2  subject alley dodecahedra suicide ready lomse ...  
3  subject ibuyit project wa

In [55]:
X = df['clean_text'].to_list()
y = df['target'].to_list()

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [56]:
# Tokenize the text data (basic tokenization, use more advanced tokenizers for better performance)
def tokenize(text):
    return text.split()

In [57]:
# Build vocabulary
vocab = set(token for text in X_train for token in tokenize(text))
print(vocab)



In [58]:
vocab = {word: i+1 for i, word in enumerate(vocab)}  # index 0 is reserved for padding
print(vocab)



In [59]:
# Convert text to sequence of indices
def text_to_sequence(text, vocab):
    return [vocab.get(token, 0) for token in tokenize(text)]

X_train_seq = [text_to_sequence(text, vocab) for text in X_train]
X_test_seq = [text_to_sequence(text, vocab) for text in X_test]


In [60]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, max_len=1000):
        self.texts = texts
        self.labels = labels
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        # Pad sequences
        if len(text) < self.max_len:
            text = text + [0] * (self.max_len - len(text))
        else:
            text = text[:self.max_len]
        
        return torch.tensor(text, dtype=torch.long), torch.tensor(label, dtype=torch.long)
    

In [61]:
class TextClassificationModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super(TextClassificationModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.fc = nn.Linear(embed_dim, num_class)
    
    def forward(self, text):
        embedded = self.embedding(text)
        pooled = torch.mean(embedded, dim=1)
        return self.fc(pooled)

In [62]:
# Parameters
vocab_size = len(vocab) + 1  # +1 for padding token
embed_dim = 128
num_class = len(encoder.classes_)

# Hyperparameters
learning_rate = 0.001
batch_size = 16
epochs = 10

print(vocab_size, num_class)
# Model instance
model = TextClassificationModel(vocab_size, embed_dim, num_class)

83992 2


In [63]:
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# DataLoader
train_dataset = TextDataset(X_train_seq, y_train,max_len=150)
test_dataset = TextDataset(X_test_seq, y_test,max_len=150)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


model.train()

TextClassificationModel(
  (embedding): Embedding(83992, 128)
  (fc): Linear(in_features=128, out_features=2, bias=True)
)

In [64]:
(train_dataset[120][0])

tensor([67365, 73948, 37645, 55504, 38027, 22783, 28580, 55207, 71081, 78717,
        52215, 22783, 34733, 11206,  4245, 45237,  3667, 17480, 52215, 28996,
        27603, 44078, 35736, 52215, 33710, 64137, 50273, 64137, 47057, 24964,
        46258, 65354, 14000, 57669, 51959, 78211, 71081,  8190, 78717, 27504,
        60780, 71081, 80260, 78717, 24997, 64137,  9646,  9929,  8190, 78717,
        40889, 64137, 47057, 24964, 57669, 56508, 71470, 69357, 57669, 21449,
        62748, 22783, 22802, 64137, 52215, 36243,  6346,  8190, 78717, 25853,
        64137, 82208, 51959, 71081,  8190, 78717, 52215, 80260, 35599, 79664,
        71081, 78717, 72162, 25222, 46351, 25222, 75092, 25222, 19516, 25222,
         8190, 25222, 78717, 55779, 35326, 74816, 25222, 42083, 25222, 74808,
        25222, 14416, 25222,  8190, 25222, 78717, 35326, 57066, 55779, 25222,
         9374, 25222, 71301, 25222, 49878, 25222,  8190, 25222, 78717, 57066,
        50544, 19516, 25222, 83175, 25222, 46351, 25222, 75092, 

In [65]:
assert len(X_train_seq) == len(y_train), "Mismatch between number of training samples and labels"


In [66]:
train_dataset[11306]

(tensor([67365,  9655, 25405, 62898, 12113, 46871, 67954, 41752, 28923, 52853,
         17610,  9026, 14454, 45128, 22345, 83293, 43764, 20254, 83068, 19153,
         24387, 73981, 55028, 42147, 17140,   773, 66980, 66891, 50464, 62236,
         76253, 33857,  9508, 45645, 71521, 46237, 83496, 29863, 24985, 74309,
         38986, 66450, 41728, 79450, 30228, 77729, 78149, 78820, 71698,  6550,
         17271,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

In [67]:
for epoch in range(epochs):
    total_loss = 0
    for texts, labels in tqdm(train_loader):
        optimizer.zero_grad()
        output = model(texts)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}')

100%|██████████| 750/750 [00:52<00:00, 14.20it/s]


Epoch 1/10, Loss: 0.4348


100%|██████████| 750/750 [00:52<00:00, 14.15it/s]


Epoch 2/10, Loss: 0.1410


100%|██████████| 750/750 [00:53<00:00, 14.06it/s]


Epoch 3/10, Loss: 0.0780


100%|██████████| 750/750 [00:52<00:00, 14.23it/s]


Epoch 4/10, Loss: 0.0526


100%|██████████| 750/750 [00:50<00:00, 14.90it/s]


Epoch 5/10, Loss: 0.0380


100%|██████████| 750/750 [00:53<00:00, 14.11it/s]


Epoch 6/10, Loss: 0.0292


100%|██████████| 750/750 [00:53<00:00, 13.94it/s]


Epoch 7/10, Loss: 0.0233


100%|██████████| 750/750 [00:53<00:00, 13.94it/s]


Epoch 8/10, Loss: 0.0187


100%|██████████| 750/750 [00:55<00:00, 13.61it/s]


Epoch 9/10, Loss: 0.0156


100%|██████████| 750/750 [00:55<00:00, 13.50it/s]

Epoch 10/10, Loss: 0.0133





In [68]:
model.eval()
y_pred = []
y_true = []

with torch.no_grad():
    for texts, labels in test_loader:
        outputs = model(texts)
        _, predicted = torch.max(outputs, 1)
        y_pred.extend(predicted.tolist())
        y_true.extend(labels.tolist())

accuracy = accuracy_score(y_true, y_pred)
print(f'Accuracy: {accuracy:.4f}')

Accuracy: 0.9857


In [69]:
def predict(text, model, vocab, max_len=50):
    model.eval()
    text_seq = text_to_sequence(text, vocab)
    if len(text_seq) < max_len:
        text_seq = text_seq + [0] * (max_len - len(text_seq))
    else:
        text_seq = text_seq[:max_len]
    text_tensor = torch.tensor([text_seq], dtype=torch.long)
    
    with torch.no_grad():
        output = model(text_tensor)
        _, predicted = torch.max(output, 1)
    
    return encoder.inverse_transform(predicted.tolist())[0]

# Example prediction
print(predict("I enjoy learning new things", model, vocab))

spam
