In [94]:
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from torch import nn

In [95]:
# define parameters
SEED = 1234       # single seed shared by every random number generator below
BATCH_SIZE = 64   # examples per mini-batch for the iterators

# Seed each RNG the notebook draws from (Python, NumPy, PyTorch) up front so a
# fresh-kernel "Run All" starts from the same random state every time.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

## Data Loading and Preprocessing

In [96]:
# Tab-separated file read without a header row; the two columns are named here.
url = 'https://raw.githubusercontent.com/ShresthaSudip/SMS_Spam_Detection_DNN_LSTM_BiLSTM/master/SMSSpamCollection'
messages = pd.read_csv(
    url,
    delimiter='\t',
    names=["label", "message"],
)
messages.head()

In [97]:
# Summary statistics of the remaining columns, computed separately per label.
label_summary = messages.groupby("label").describe()
print(label_summary)

In [98]:
# Store each message's character count in a new column, then compare the
# length distributions of the two labels in one histogram.
messages["Message Length"] = messages["message"].map(len)
fig = plt.figure(figsize=(12, 8))
sns.histplot(data=messages, x="Message Length", hue="label")
plt.title("ham & spam messege length comparision")
plt.show()

In [99]:
# Boolean row masks for the two labels.
is_ham = messages["label"] == "ham"
is_spam = messages["label"] == "spam"

# Descriptive statistics of message length for each label.
ham_desc = messages.loc[is_ham, "Message Length"].describe()
spam_desc = messages.loc[is_spam, "Message Length"].describe()

print("Ham Messege Length Description:\n", ham_desc)
print("************************************")
print("Spam Message Length Description:\n", spam_desc)

In [100]:
# Bar chart of how many rows carry each label.
sns.countplot(x="label", data=messages)
plt.title("ham vs spam")
plt.show()

### Data Cleaning (undersampling for imbalanced data)

In [101]:
# Balance the classes by randomly undersampling the "ham" rows down to the
# number of "spam" rows. Every random step is seeded with SEED so the balanced
# frame (and the df.csv written from it) is the same on every run.

# number of rows in each class
minority_len = len(messages[messages["label"] == "spam"])
majority_len = len(messages[messages["label"] == "ham"])

# row labels (index values) of each class
minority_indices = messages[messages["label"] == "spam"].index
majority_indices = messages[messages["label"] == "ham"].index

# Draw, without replacement, as many "ham" row labels as there are "spam" rows.
# A dedicated seeded generator keeps the draw independent of whatever else has
# consumed NumPy's global random state earlier in the session.
undersample_rng = np.random.default_rng(SEED)
random_majority_indices = undersample_rng.choice(
    majority_indices.to_numpy(),
    size=minority_len,
    replace=False,
)

# row labels of the balanced frame: all "spam" rows plus the sampled "ham" rows
undersampled_indices = np.concatenate([minority_indices, random_majority_indices])

# Select the rows, shuffle them reproducibly, and replace the now-scrambled
# index with a fresh 0..n-1 range (drop=True discards the old index).
df = (
    messages.loc[undersampled_indices]
    .sample(frac=1, random_state=SEED)
    .reset_index(drop=True)
)
print("df shape: ", df.shape)
df['label'].value_counts()

In [102]:
# Numeric copy of the string label: "ham" becomes 0 and "spam" becomes 1.
label_to_id = {"ham": 0, "spam": 1}
df["Label"] = df["label"].map(label_to_id)

In [103]:
# Write the balanced frame to disk without its index column.
df.to_csv(path_or_buf="df.csv", index=False)

### Data Processing and Tokenization

In [104]:
import torch
from torchtext import data
import warnings as wrn
import spacy
wrn.filterwarnings("ignore")

spacy_en = spacy.load('en_core_web_sm')


def _spacy_tokenize(sentence):
    """Return the text of every spaCy token found in `sentence`."""
    return [token.text for token in spacy_en(sentence)]


# Text field: spaCy-tokenized, batch dimension first, and it also yields the
# sequence lengths alongside the padded batch.
text = data.Field(
    tokenize=_spacy_tokenize,
    batch_first=True,
    include_lengths=True,
)
# Label field stored as float tensors.
label = data.LabelField(dtype=torch.float, batch_first=True)

# Fields are listed in the order they are paired with the CSV columns.
fields = [("label", label), ('text', text)]

# load data from df (the header row of df.csv is skipped)
training_data = data.TabularDataset(
    path="df.csv", format="csv", fields=fields, skip_header=True
)
print(vars(training_data.examples[1]))

In [105]:
import random

# random.seed() returns None, so it must not be passed as `random_state`.
# Seed first, then hand the resulting generator state to split() explicitly;
# this yields the same 80/20 split on every run by construction rather than
# through the side effect of evaluating the argument.
random.seed(SEED)
train_data, test_data = training_data.split(split_ratio=0.8, random_state=random.getstate())

# Vocabularies are built from the training split only; tokens are kept once
# they reach min_freq occurrences, and 100-d GloVe vectors are attached.
text.build_vocab(train_data, min_freq=3, vectors="glove.6B.100d")
label.build_vocab(train_data)
print("Size of text vocab: ", len(text.vocab))
print("Size of label vocab: ", len(label.vocab))

In [106]:
# The ten tokens with the highest counts in the text vocabulary.
text.vocab.freqs.most_common(n=10)

## Model Building & Data Loading

In [107]:
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


def _text_length(example):
    """Sort key for bucketing: length of the example's `text` attribute."""
    return len(example.text)


# One iterator per split; examples inside each batch are sorted by the key.
train_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, test_data),
    batch_size=BATCH_SIZE,
    sort_key=_text_length,
    sort_within_batch=True,
    device=device,
)

In [108]:
import torch.nn as nn
class LSTM(nn.Module):
    """Embedding -> (bi)directional multi-layer LSTM -> linear -> sigmoid classifier.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of sigmoid outputs per example.
        n_layers: number of stacked LSTM layers.
        bidirectional: run the LSTM in both directions when True.
        dropout: dropout probability, applied to the embeddings, between LSTM
            layers, and to the final hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super().__init__()
        # remembered so forward() knows how many final states to combine
        self.bidirectional = bidirectional
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            # inter-layer dropout does nothing on a single layer and only
            # triggers a warning, so it is switched off in that case
            dropout=dropout if n_layers > 1 else 0.0,
        )
        # the classifier sees one hidden vector per direction
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.act = nn.Sigmoid()

    def forward(self, text, text_lengths):
        """Return sigmoid outputs of shape (batch, output_dim).

        Args:
            text: padded token-id tensor laid out batch-first.
            text_lengths: true (unpadded) length of every sequence in `text`.
        """
        embedded = self.dropout(self.embedding(text))
        # Pack so the LSTM skips padding; enforce_sorted=False lets callers pass
        # batches that are not already ordered by decreasing length.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, text_lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        # Only the final hidden state is needed; the per-step outputs and the
        # cell state are discarded.
        _, (hidden, _) = self.lstm(packed_embedded)
        if self.bidirectional:
            # last layer: forward direction is hidden[-2], backward is hidden[-1]
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            hidden = hidden[-1, :, :]
        hidden = self.dropout(hidden)
        # No squeeze here: the batch dimension must survive for a batch of one,
        # otherwise the caller's .squeeze(1) fails.
        return self.act(self.fc(hidden))

In [109]:
# training hyperparameters

# -- optimisation schedule --
NUM_EPOCHS = 15                   # full passes over the training iterator

# -- model shape --
SIZE_OF_VOCAB = len(text.vocab)   # one embedding row per vocabulary entry
EMBEDDING_DIM = 100               # same width as the glove.6B.100d vectors
HIDDEN_DIM = 256                  # LSTM hidden size
OUTPUT_DIM = 1                    # a single output per message
N_LAYERS = 2                      # stacked LSTM layers
BIDIRECTIONAL = True
DROPOUT = 0.2

## Training

In [110]:
model = LSTM(SIZE_OF_VOCAB, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)

# build_vocab(..., vectors="glove.6B.100d") attached pretrained vectors to the
# vocabulary, but they only take effect once copied into the embedding layer;
# without this the embeddings start from random values.
pretrained_embeddings = getattr(text.vocab, "vectors", None)
if pretrained_embeddings is not None:
    model.embedding.weight.data.copy_(pretrained_embeddings)

# Move the model to its device before building the optimizer, so the optimizer
# is created over the parameters in their final location.
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.BCELoss()  # expects probabilities, which the model's sigmoid provides
model

In [111]:
def binary_accuracy(preds, y):
    """Fraction of predictions that equal the targets after rounding.

    `preds` is rounded to the nearest integer and compared element-wise with
    `y`; the result is the number of matches divided by the length of the
    first dimension, returned as a tensor.
    """
    matches = torch.eq(torch.round(preds), y).float()
    return matches.sum() / len(matches)

In [112]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over `iterator`.

    For every batch the gradients are reset, the model is evaluated on the
    batch's text and lengths, the loss is back-propagated and the optimizer
    takes a step. Returns (mean per-batch loss, mean per-batch accuracy).
    """
    model.train()
    loss_total, acc_total = 0, 0
    for batch in iterator:
        tokens, lengths = batch.text
        targets = batch.label
        optimizer.zero_grad()
        scores = model(tokens, lengths).squeeze(1)
        batch_loss = criterion(scores, targets)
        batch_acc = binary_accuracy(scores, targets)
        batch_loss.backward()
        optimizer.step()
        loss_total += batch_loss.item()
        acc_total += batch_acc.item()
    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

def evaluate(model, iterator, criterion):
    """Score the model on `iterator` without updating it.

    The model is put in eval mode and gradients are disabled. Returns
    (mean per-batch loss, mean per-batch accuracy).
    """
    model.eval()
    loss_total, acc_total = 0, 0
    with torch.no_grad():
        for batch in iterator:
            tokens, lengths = batch.text
            targets = batch.label
            scores = model(tokens, lengths).squeeze(1)
            loss_total += criterion(scores, targets).item()
            acc_total += binary_accuracy(scores, targets).item()
    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [113]:
# One training pass followed by one evaluation pass per epoch, with both sets
# of metrics printed after each epoch.
for epoch in range(NUM_EPOCHS):
    train_loss, train_acc = train(
        model=model,
        iterator=train_iterator,
        optimizer=optimizer,
        criterion=criterion,
    )
    test_loss, test_acc = evaluate(
        model=model,
        iterator=test_iterator,
        criterion=criterion,
    )
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\tTest Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

In [114]:
# Persist only the learned parameters (the state dict), not the module object.
trained_weights = model.state_dict()
torch.save(trained_weights, 'LSTM-UCIspam-model.pt')

## Fine-tuning with our own data

In [115]:
# Fine-tune the saved model on fine-tune-dataset.csv.
import pandas as pd

# Loaded for inspection only. No manual tokenization is done on this frame:
# TabularDataset below re-reads the CSV itself and tokenizes it through the
# `text` field.
ft_df = pd.read_csv("fine-tune-dataset.csv")

# use all data to train
# NOTE(review): `fields` pairs CSV columns by position (label first, text
# second) — confirm fine-tune-dataset.csv uses that column order and the same
# label strings as the original training data.
ft_data = data.TabularDataset(
    path="fine-tune-dataset.csv",
    format="csv",
    fields=fields,
    skip_header=True,
)

# The vocabularies are deliberately NOT rebuilt here. The checkpoint's
# embedding rows and output meaning are tied to the token and label indices
# assigned when the model was first trained; rebuilding from the fine-tune set
# would reassign those indices (and change the vocabulary size), so the loaded
# weights would be looked up with the wrong, possibly out-of-range, ids.

# load model; map_location lets a checkpoint saved on one device load on another
model.load_state_dict(torch.load('LSTM-UCIspam-model.pt', map_location=device))
model = model.to(device)

# training
# A separate name keeps the original train_iterator intact, so re-running the
# earlier training cell still trains on the original data.
ft_iterator = data.BucketIterator(
    ft_data,
    batch_size=BATCH_SIZE,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True,
    device=device)

for epoch in range(NUM_EPOCHS):
    train_loss, train_acc = train(model, ft_iterator, optimizer, criterion)
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')


## Testing with some examples

In [None]:
# Two hand-written sample messages for trying the model out.
# NOTE(review): presumably t1 is meant as a spam-like example and t2 as a
# ham-like one — confirm against the cell that consumes them.
t1 = "Limited time offer! Buy one, get one free on all online courses!"
t2 = "Just finished my final exams! Looking forward to a relaxing summer break. What are some places you recommend for a vacation?"

