In [None]:
# Imports
import csv
import re
import string
import warnings
from collections import Counter

import numpy as np
import pandas as pd
import spacy
import torch
import torch.nn as nn
import tqdm
from IPython.display import clear_output
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

warnings.filterwarnings('ignore')

In [None]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip

!unzip /content/smsspamcollection.zip
!rm /content/readme
!rm !rm /content/smsspamcollection.zip

clear_output()

In [None]:
# Downloading the GloVe embeddings database
!wget https://nlp.stanford.edu/data/glove.6B.zip
!unzip /content/glove.6B.zip
!rm -rf /content/glove.6B.zip
!rm /content/glove.6B.100d.txt
!rm /content/glove.6B.200d.txt
!rm /content/glove.6B.300d.txt

--2024-05-22 13:26:25--  https://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... ^C
unzip:  cannot find or open /content/glove.6B.zip, /content/glove.6B.zip.zip or /content/glove.6B.zip.ZIP.
rm: cannot remove '/content/glove.6B.100d.txt': No such file or directory
rm: cannot remove '/content/glove.6B.200d.txt': No such file or directory
rm: cannot remove '/content/glove.6B.300d.txt': No such file or directory


In [None]:
# Read the tab-separated corpus into a DataFrame: the first field is the class
# ("ham" / "spam"), the second the raw message text.
# quoting=csv.QUOTE_NONE is deliberate: messages may contain stray double-quote
# characters, and with the default quoting rules the parser can treat one as the start
# of a quoted field and merge several consecutive lines into a single record.
sms = pd.read_csv(
    "/content/SMSSpamCollection",
    sep="\t",
    header=None,
    names=["label", "Text"],
    quoting=csv.QUOTE_NONE,
    encoding="utf-8",
)

# Convert labels to binary (1 for spam, 0 for not spam/ham)
sms['label'] = sms['label'].map({'spam': 1, 'ham': 0})

In [None]:
# Add a column holding the character count of every raw message
###########YOUR CODE HERE###########
sms['Text_Length'] = sms['Text'].map(len)  # len() of each message string
sms

Unnamed: 0,label,Text,Text_Length
0,0,"Go until jurong point, crazy.. Available only ...",111
1,0,Ok lar... Joking wif u oni...,29
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,0,U dun say so early hor... U c already then say...,49
4,0,"Nah I don't think he goes to usf, he lives aro...",61
...,...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...,160
5568,0,Will ü b going to esplanade fr home?,36
5569,0,"Pity, * was in mood for that. So...any other s...",57
5570,0,The guy did some bitching but I acted like i'd...,125


In [None]:
spacy_tokenizer = spacy.load('en_core_web_sm')


def tokenize(text):
    """Lower-case `text`, strip non-ASCII characters and punctuation, and tokenize it.

    Args:
        text: raw message string.

    Returns:
        list[str]: lower-cased spaCy tokens. Whitespace-only tokens and tokens that
        consist solely of punctuation characters (".", "..", "...", "!!") are dropped;
        tokens that merely contain punctuation (e.g. "n't") are kept.
    """
    # Filter non-ASCII characters word by word. A word that ends up empty (e.g. "ü")
    # is discarded here, otherwise the join below would produce a double space that
    # spaCy turns into a whitespace token.
    ascii_words = []
    for word in text.split():
        kept = ''.join(ch for ch in word.lower() if ord(ch) < 127)
        if kept:
            ascii_words.append(kept)
    ascii_text = ' '.join(ascii_words)

    punctuation = set(string.punctuation)
    tokens = []
    # make_doc runs only the tokenizer; the remaining pipeline components
    # (tagger, parser, NER, ...) are not needed to obtain token texts.
    for token in spacy_tokenizer.make_doc(ascii_text):
        piece = token.text
        if not piece.strip():
            continue  # whitespace token
        if all(ch in punctuation for ch in piece):
            continue  # pure punctuation of any length
        tokens.append(piece)
    return tokens


In [None]:
# Run every raw message through tokenize() and keep the token lists in a new column
###########YOUR CODE HERE###########
sms["Tokenized_Text"] = sms['Text'].map(tokenize)
sms

Unnamed: 0,label,Text,Text_Length,Tokenized_Text
0,0,"Go until jurong point, crazy.. Available only ...",111,"[go, until, jurong, point, crazy, .., availabl..."
1,0,Ok lar... Joking wif u oni...,29,"[ok, lar, joking, wif, u, oni]"
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,155,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,0,U dun say so early hor... U c already then say...,49,"[u, dun, say, so, early, hor, u, c, already, t..."
4,0,"Nah I don't think he goes to usf, he lives aro...",61,"[nah, i, do, n't, think, he, goes, to, usf, he..."
...,...,...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...,160,"[this, is, the, 2nd, time, we, have, tried, 2,..."
5568,0,Will ü b going to esplanade fr home?,36,"[will, , b, going, to, esplanade, fr, home]"
5569,0,"Pity, * was in mood for that. So...any other s...",57,"[pity, was, in, mood, for, that, so, any, othe..."
5570,0,The guy did some bitching but I acted like i'd...,125,"[the, guy, did, some, bitching, but, i, acted,..."


In [None]:
def load_GloVe_embeddings(glove_file):
    """Load GloVe vectors from a text file into a {word: vector} dictionary.

    Each non-empty line is expected to hold a word followed by its
    whitespace-separated vector components; blank lines are skipped.

    Args:
        glove_file: path to the GloVe text file (read as UTF-8).

    Returns:
        dict[str, np.ndarray]: maps each word to a 1-D float32 vector. float32
        matches the zero vectors that embed_text uses for unknown tokens and padding.
    """
    word2embedding = {}  # word -> GloVe vector
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                continue  # tolerate blank lines instead of raising IndexError
            word2embedding[values[0]] = np.asarray(values[1:], dtype='float32')
    return word2embedding

In [None]:
def embed_text(tokenized_text, word_embeddings, max_text_length=70, embedding_size = 50):
    """Convert a token sequence into a fixed-length list of embedding vectors.

    Args:
        tokenized_text: iterable of string tokens.
        word_embeddings: mapping of lower-case word -> embedding vector.
        max_text_length: number of vectors in the result; longer inputs are
            truncated, shorter ones are padded with zero vectors.
        embedding_size: length of the zero vector used for unknown tokens and
            padding; it should equal the dimensionality of `word_embeddings`.

    Returns:
        list: exactly `max_text_length` vectors (for non-negative `max_text_length`).
    """
    embedded_text = []
    for token in tokenized_text:
        if len(embedded_text) >= max_text_length:
            break  # anything beyond max_text_length would be truncated anyway
        # Look up the same (lower-cased) key that is tested for membership; indexing
        # with the raw token raised KeyError for any token containing upper case.
        vector = word_embeddings.get(token.lower())
        if vector is None:
            # If the token is not in the word embeddings, use a zero vector
            vector = np.zeros(embedding_size, dtype='float32')
        embedded_text.append(vector)

    # Pad with independent zero vectors so no two padding slots share one array
    missing = max_text_length - len(embedded_text)
    embedded_text.extend(np.zeros(embedding_size, dtype='float32') for _ in range(missing))

    return embedded_text

In [None]:
# Build the {word: vector} lookup from the 50-dimensional GloVe file
word2embeddings = load_GloVe_embeddings(glove_file="/content/glove.6B.50d.txt")

In [None]:
###########YOUR CODE HERE###########
# Turn each token list into its padded/truncated sequence of GloVe vectors
sms["Embedded_Text"] = sms['Tokenized_Text'].map(
    lambda tokens: embed_text(tokens, word_embeddings=word2embeddings)
)
sms


Unnamed: 0,label,Text,Text_Length,Tokenized_Text,Embedded_Text
0,0,"Go until jurong point, crazy.. Available only ...",111,"[go, until, jurong, point, crazy, .., availabl...","[[0.14828, 0.17761, 0.42346, -0.31489, 0.32273..."
1,0,Ok lar... Joking wif u oni...,29,"[ok, lar, joking, wif, u, oni]","[[-0.53646, -0.072432, 0.24182, 0.099021, 0.18..."
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,155,"[free, entry, in, 2, a, wkly, comp, to, win, f...","[[-0.41183, 0.4528, 0.02825, -0.28702, 0.03702..."
3,0,U dun say so early hor... U c already then say...,49,"[u, dun, say, so, early, hor, u, c, already, t...","[[-0.25676, 0.8549, 1.1003, 0.95363, 0.36585, ..."
4,0,"Nah I don't think he goes to usf, he lives aro...",61,"[nah, i, do, n't, think, he, goes, to, usf, he...","[[0.50959, 1.2707, -0.078318, -1.4834, -0.3478..."
...,...,...,...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...,160,"[this, is, the, 2nd, time, we, have, tried, 2,...","[[0.53074, 0.40117, -0.40785, 0.15444, 0.47782..."
5568,0,Will ü b going to esplanade fr home?,36,"[will, , b, going, to, esplanade, fr, home]","[[0.81544, 0.30171, 0.5472, 0.46581, 0.28531, ..."
5569,0,"Pity, * was in mood for that. So...any other s...",57,"[pity, was, in, mood, for, that, so, any, othe...","[[-0.052489, 0.30524, -0.33187, -0.43559, 0.53..."
5570,0,The guy did some bitching but I acted like i'd...,125,"[the, guy, did, some, bitching, but, i, acted,...","[[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -..."


In [None]:
"""Complete the below code for the Dataloader class"""
class load_dataset(Dataset):
    """Torch dataset pairing embedded messages with one-hot class labels."""

    def __init__(self, X, Y):
        """
            X: indexable collection; X[i] is the embedded form of sample i
               (a sequence of equally sized per-token vectors)
            Y: indexable collection of labels; a value of 1 is encoded as the
               second class (spam), any other value as the first (not spam)
        """
        self.X = X
        self.y = Y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        """Return (features, one_hot_label) for sample `idx` as float32 tensors."""
        ###########YOUR CODE HERE###########
        embeddings = self.X[idx]

        # One-hot encoding for the classes: [1, 0] = not spam, [0, 1] = spam
        one_hot_labels = [0, 1] if self.y[idx] == 1 else [1, 0]

        # Stack the per-token vectors into one ndarray first: building a tensor
        # straight from a Python list of ndarrays is a known slow path in PyTorch.
        features = torch.tensor(np.asarray(embeddings, dtype=np.float32), dtype=torch.float32)
        labels = torch.tensor(one_hot_labels, dtype=torch.float32)
        return features, labels

In [None]:
import torch
import torch.nn as nn

class RNN(nn.Module):
    """Many-to-one LSTM classifier over pre-embedded token sequences.

    The final hidden state of the top LSTM layer is passed through a linear layer
    and a softmax, yielding a probability distribution over `output_dim` classes.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers):
        """Define the layers.

        Args:
            vocab_size: unused; kept in the signature because no embedding layer is
                built (inputs arrive already embedded).
            embedding_dim: size of each input token vector.
            hidden_dim: size of the LSTM hidden state.
            output_dim: number of classes.
            num_layers: number of stacked LSTM layers.
        """
        super(RNN, self).__init__()

        # An LSTM is used instead of a vanilla RNN; batch_first=True means inputs
        # are laid out as (batch, sequence, feature).
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True)

        # Projects the hidden state to the output space (= number of classes)
        self.fc = nn.Linear(hidden_dim, output_dim)
        # Normalise over the class axis explicitly; omitting `dim` relies on a
        # deprecated implicit choice that only emits a warning.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return class probabilities for the batch of embedded sequences `x`."""
        _, (hidden, _) = self.lstm(x)

        # hidden stacks the final state of every layer; the last entry belongs to
        # the top layer (many-to-one: only the final time step is used).
        last_layer_hidden = hidden[-1]

        return self.softmax(self.fc(last_layer_hidden))

In [None]:
def train_model(num_epochs, train_loader, model, criterion, optimizer):
    """Train `model` and print the mean loss and accuracy after every epoch.

    For each batch: forward pass, loss computation, backpropagation and an
    optimizer step.

    Args:
        num_epochs: number of passes over `train_loader`.
        train_loader: sized iterable of (inputs, labels) batches; `labels` are
            one-hot rows (the true class is recovered with argmax over dim 1).
        model: module mapping `inputs` to per-class scores of shape (batch, classes).
        criterion: loss called as `criterion(outputs, labels)`.
        optimizer: optimizer over the model's parameters.

    Raises:
        ValueError: if `train_loader` contains no batches.
    """
    if len(train_loader) == 0:
        # Fail with a clear message instead of a ZeroDivisionError when averaging
        raise ValueError("train_loader is empty; nothing to train on")

    for epoch in range(num_epochs):
        model.train()

        epoch_loss = 0.0
        correct = 0
        total = 0

        for inputs, labels in train_loader:
            optimizer.zero_grad()

            # Forward pass and loss
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            # Bookkeeping; detach so the accuracy computation stays out of autograd
            epoch_loss += loss.item()
            predicted = torch.argmax(outputs.detach(), dim=1)
            actual = torch.argmax(labels, dim=1)  # class index from the one-hot row
            total += labels.size(0)
            correct += (predicted == actual).sum().item()

        epoch_loss /= len(train_loader)  # mean of the per-batch losses
        accuracy = 100 * correct / total

        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, Accuracy: {accuracy:.2f}%')


In [None]:
import torch.optim as optim

In [None]:
"""
1. Write code to split your available data into training and testing splits
2. Define the model
3. Set up hyper-parameters such as learning rate, number of epochs, batch size
4. Train the model by using the function you defined above
5. Check the model accuracy by running the model on the testing split
6. Save the model as a .pth file
"""

###########YOUR CODE HERE###########
# Reproducibility: the split below is seeded; seed torch as well so that weight
# initialisation and batch shuffling repeat from run to run.
SEED = 42
torch.manual_seed(SEED)

# Split Data
X_train, X_test, y_train, y_test = train_test_split(sms['Embedded_Text'], sms['label'], test_size=0.2, random_state=SEED)

# Defining Model
vocab_size = len(word2embeddings)  # required by RNN's signature; the model does not use it
num_layers = 2  # number of stacked LSTM layers
num_classes = 2
model = RNN(vocab_size, embedding_dim=50, hidden_dim=100, output_dim=num_classes, num_layers=num_layers)

# Setting Hyperparameters
learning_rate = 0.01
num_epochs = 15
batch_size = 128  # mini-batches average out the noise of individual training examples

# Training Model
criterion = nn.BCELoss()  # Binary Cross Entropy over the two one-hot outputs
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
train_dataset = load_dataset(X_train.values, y_train.values)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
train_model(num_epochs, train_loader, model, criterion, optimizer)

# Checking Accuracy over test dataset
model.eval()  # Set the model to evaluation mode
test_dataset = load_dataset(X_test.values, y_test.values)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Only about 10% of the messages are spam, so precision and recall are more
# informative than accuracy; collect the full confusion matrix (spam = positive).
tp = fp = tn = fn = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        predicted = torch.argmax(model(inputs), dim=1)
        actual = torch.argmax(labels, dim=1)
        tp += ((predicted == 1) & (actual == 1)).sum().item()  # predicted spam, is spam
        tn += ((predicted == 0) & (actual == 0)).sum().item()  # predicted ham, is ham
        fp += ((predicted == 1) & (actual == 0)).sum().item()  # predicted spam, is ham
        fn += ((predicted == 0) & (actual == 1)).sum().item()  # predicted ham, is spam

correct = tp + tn
total = tp + tn + fp + fn
accuracy = 100 * correct / total
# Guard the ratios: a model that never predicts spam has tp + fp == 0
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
beta = 0.5  # beta < 1 weights precision more heavily than recall
fbeta_denominator = precision * beta**2 + recall
fscore = ((1 + beta**2) * precision * recall) / fbeta_denominator if fbeta_denominator else 0.0
print(f'Testing Accuracy: {accuracy:.2f}%, Precision: {precision:.2f}, Recall= {recall:.2f}, F0.5Score: {fscore:.2f}')

#Saving Model
torch.save(model.state_dict(), 'rnn_model.pth')

Epoch [1/15], Loss: 0.4104, Accuracy: 84.52%
Epoch [2/15], Loss: 0.3942, Accuracy: 86.58%
Epoch [3/15], Loss: 0.3930, Accuracy: 86.58%
Epoch [4/15], Loss: 0.3832, Accuracy: 86.58%
Epoch [5/15], Loss: 0.2563, Accuracy: 89.05%
Epoch [6/15], Loss: 0.1790, Accuracy: 93.58%
Epoch [7/15], Loss: 0.1577, Accuracy: 94.95%
Epoch [8/15], Loss: 0.1185, Accuracy: 95.92%
Epoch [9/15], Loss: 0.1061, Accuracy: 96.28%
Epoch [10/15], Loss: 0.0832, Accuracy: 97.33%
Epoch [11/15], Loss: 0.0714, Accuracy: 97.73%
Epoch [12/15], Loss: 0.0563, Accuracy: 98.27%
Epoch [13/15], Loss: 0.0527, Accuracy: 98.41%
Epoch [14/15], Loss: 0.0518, Accuracy: 98.14%
Epoch [15/15], Loss: 0.0438, Accuracy: 98.59%
Testing Accuracy: 98.30%, Precision: 0.94, Recall= 0.93, F0.5Score: 0.94


THANK YOU FOR GOING THROUGH THE CODE.
PEACE.
