In [23]:
import numpy as np
from gensim.models import Word2Vec
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [24]:
data = pd.read_csv(r"SMS-Spam-Classification\spam.csv", encoding='ISO-8859-1')
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [25]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
data["target"] = encoder.fit_transform(data[['v1']])
data = data.drop(['v1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)
data.head()

  y = column_or_1d(y, warn=True)


Unnamed: 0,v2,target
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [26]:
sentences = [text.split() for text in data['v2']]

# Train Word2Vec model

In [27]:
word2vec_model = Word2Vec(sentences, vector_size=50, window=5, min_count=1, workers=4)

# Vocabulary and padding

In [28]:
word_idx = {word: i+1 for i, word in enumerate(word2vec_model.wv.index_to_key)}
max_len = max(len(s) for s in data["v2"])

In [29]:
# COnvert sentences to sequences of indeces 
# Here i will give only a single sentence as sentence as the input

def sentence_to_sequence(sentences, word_idx, max_len):
    word = sentences.split()
    idx = [word_idx.get(wd) for wd in word]
    padded = idx + [0] * (max_len - len(idx))

    return padded

In [30]:
Target = data['target']
Feature = data.drop(['target'], axis=1)

In [32]:
x = np.array([sentence_to_sequence(text, word_idx, max_len) for text in data["v2"]])
y = np.array(Target)

In [33]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [34]:
# Example of how the below code works gets the size of the word vectors then get the size of the vocab and then makes a new zero vector with the size (vocab_len, vector_size)

# embedding_matrix =
# [[0.0, 0.0, 0.0],      # index 0 (padding)
#  [0.1, 0.2, 0.3],      # "king"
#  [0.4, 0.5, 0.6],      # "queen"
#  [0.7, 0.8, 0.9],      # "man"
#  [1.0, 1.1, 1.2]]      # "woman"

In [None]:
# Manual embedding matrix creation
embedding_dim = word2vec_model.vector_size
vocab_size = len(word_idx) + 1  # +1 for padding token
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, idx in word_idx.items():
    embedding_matrix[idx] = word2vec_model.wv[word]

converting a pandas dataframe gives a object datatype not int or float 

In [36]:
# Converts the data in the format of the pytorch suitable data

x_train = torch.tensor(x_train, dtype=torch.long)
x_test = torch.tensor(x_test, dtype=torch.long)
y_train = torch.tensor(y_train, dtype=torch.float)
y_test = torch.tensor(y_test, dtype=torch.float)

In [38]:
class RNNClassifier(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim, num_classes):
        super().__init__()
        vocab_size, embedding_dim = embedding_matrix.shape
        # Inbuilt function or embedding in pytorch
        self.embedding = nn.Embedding.from_pretrained(
            torch.tensor(embedding_matrix, dtype=torch.float), 
            freeze=False
        )
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)  

    def forward(self, text):
        # When the text comes in embedding layer it looks up for the corresponding vector in the embedding matrix and returns the vector
        embedded = self.embedding(text) # [batch_size, seq_len, embedding_dim]
        output, hidden = self.rnn(embedded) # hidden: [1, batch_size, hidden_dim]
        return self.fc(hidden.squeeze(0))   # [batch_size, output_dim]
    
# Initialize the model parameters
hidden_dim = 20
output_dim = 1 # 1 for the binary classification

# Create the model instance
model = RNNClassifier(embedding_matrix, hidden_dim, output_dim)
criteria = nn.BCEWithLogitsLoss() # Loss Fucntion Binary Cross Entropy with logits
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 10
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    predictions = model(x_train).squeeze(1) # Removes the extra dimension
    loss = criteria(predictions, y_train)
    loss.backward()  # Backpropagation
    optimizer.step() # Update the weights

    # Validation
    model.eval()
    with torch.no_grad():
        test_preds = model(x_test).squeeze(1)
        test_loss = criteria(test_preds, y_test)
        test_acc = ((test_preds > 0).float() == y_test).float().mean()
    print(f"Epoch {epoch}: Train Loss = {loss.item():.4f}, Test Loss = {test_loss.item():.4f}, Test Acc = {test_acc:.4f}")

new_sms = "win a free trip"
new_indices = torch.tensor([sentence_to_sequence(new_sms, word_idx, max_len)], dtype=torch.long)
model.eval()
with torch.no_grad():
    pred = model(new_indices).squeeze(1)
    pred_class = (pred > 0).float().item()
print(f"Prediction for '{new_sms}': {'spam' if pred_class == 1 else 'not spam'}")

Epoch 0: Train Loss = 0.7430, Test Loss = 0.7353, Test Acc = 0.1345
Epoch 1: Train Loss = 0.7354, Test Loss = 0.7278, Test Acc = 0.1345
Epoch 2: Train Loss = 0.7278, Test Loss = 0.7203, Test Acc = 0.1345
Epoch 3: Train Loss = 0.7203, Test Loss = 0.7128, Test Acc = 0.1345
Epoch 4: Train Loss = 0.7129, Test Loss = 0.7054, Test Acc = 0.1345
Epoch 5: Train Loss = 0.7054, Test Loss = 0.6981, Test Acc = 0.1345
Epoch 6: Train Loss = 0.6981, Test Loss = 0.6908, Test Acc = 0.8655
Epoch 7: Train Loss = 0.6908, Test Loss = 0.6835, Test Acc = 0.8655
Epoch 8: Train Loss = 0.6835, Test Loss = 0.6763, Test Acc = 0.8655
Epoch 9: Train Loss = 0.6763, Test Loss = 0.6692, Test Acc = 0.8655
Prediction for 'win a free trip': not spam


In [None]:
# embedded = self.embedding(text)

# Let’s say text (i.e. x_train) is shaped like:
# [batch_size, seq_len] = [4, 10]

# So text is a bunch of word indices, e.g.:

# tensor([[12, 56, 89, 0, 0, 0, 0, 0, 0, 0],
#         [24, 67, 0, 0, 0, 0, 0, 0, 0, 0],
#         ...
#        ])