In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, TensorDataset

In [None]:
# Load the spam dataset from CSV.
# Only the first two columns of the file are read; the first line of the file is
# skipped and the two columns are named 'label' and 'message'.
df = pd.read_csv(
    "/content/spam.csv",
    encoding="ISO-8859-1",
    usecols=[0, 1],
    skiprows=1,
    names=["label", "message"],
)
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Encode the string labels as integer class ids (ham -> 0, spam -> 1).
# Series.map with a dict turns every value that is missing from the dict into
# NaN, so a plain re-run of this cell on already-encoded labels would wipe the
# column. Passing values that are not keys through unchanged keeps the cell
# idempotent, and the explicit check below fails loudly on anything unexpected.
label_to_id = {"ham": 0, "spam": 1}
encoded_labels = df["label"].map(lambda value: label_to_id.get(value, value))
unexpected_labels = set(encoded_labels.unique()) - set(label_to_id.values())
if unexpected_labels:
    raise ValueError(f"Unexpected label values: {sorted(map(str, unexpected_labels))}")
df["label"] = encoded_labels.astype("int64")

In [None]:
# Preprocess the text data
# Lower-case every message, then replace each non-word character (anything
# matched by \W) with a single space.
non_word_char = re.compile(r'\W')
lowercased = df['message'].str.lower()
df['message'] = lowercased.map(lambda message: non_word_char.sub(' ', message))
df.head()

Unnamed: 0,label,message
0,0,go until jurong point crazy available only ...
1,0,ok lar joking wif u oni
2,1,free entry in 2 a wkly comp to win fa cup fina...
3,0,u dun say so early hor u c already then say
4,0,nah i don t think he goes to usf he lives aro...


In [None]:
# Tokenization and building vocabulary
# Every distinct whitespace-separated token receives the next free integer id,
# in order of first appearance across all rows of df['message'].
# NOTE(review): this runs on the full frame, i.e. before the train/test split
# further down, so test-set words are part of the vocabulary.
vocabulary = {}
for message in df['message']:
    for token in message.split():
        # len(vocabulary) is evaluated before a possible insert, so a new token
        # gets the current size as its id; known tokens are left untouched.
        vocabulary.setdefault(token, len(vocabulary))

In [None]:
# Create BoW vectors
def text_to_bow(text, vocab):
    """Count how often each vocabulary word occurs in ``text``.

    Args:
        text: String that is split on whitespace into tokens.
        vocab: Mapping from word to its position in the output list.

    Returns:
        A list of ``len(vocab)`` integers; entry ``vocab[word]`` holds the number
        of occurrences of ``word`` in ``text``. Tokens absent from ``vocab`` are
        ignored.
    """
    counts = [0] * len(vocab)
    known_slots = (vocab[token] for token in text.split() if token in vocab)
    for slot in known_slots:
        counts[slot] += 1
    return counts

In [None]:
# Turn every message into its bag-of-words count vector, then stack the rows
# into one float32 tensor with one row per message and one column per
# vocabulary word.
bow_rows = [text_to_bow(message, vocabulary) for message in df['message']]
X = torch.tensor(bow_rows, dtype=torch.float32)
X

tensor([[1., 1., 1.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 1., 1., 0.],
        [0., 0., 0.,  ..., 0., 0., 1.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [None]:
# Labels
# Values of the 'label' column as a 64-bit integer tensor.
label_values = df['label'].to_numpy()
y = torch.tensor(label_values, dtype=torch.long)
y

tensor([0, 0, 1,  ..., 0, 0, 0])

In [None]:
# Split the data
# Hold out 20% of the rows for testing; the fixed random_state makes the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Create DataLoader
# Each dataset pairs a feature row with its label; both loaders serve batches
# of 32. Only the training loader shuffles; the test loader keeps row order.
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [None]:
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class RNN(nn.Module):
    """Single-step Elman-style recurrent cell followed by a linear classifier.

    ``forward`` returns raw class scores (logits), not probabilities.
    ``nn.CrossEntropyLoss`` applies log-softmax itself, so feeding it
    softmax outputs squashes the scores into [0, 1] and, for two classes, bounds
    the per-sample loss below by log(1 + e^-1) ~= 0.313, which stalls training.
    Taking the argmax of the logits gives the same predicted class as taking
    the argmax of the softmax probabilities.

    Args:
        input_size: Number of features in each input row.
        hidden_size: Width of the hidden state.
        output_size: Number of classes scored by the output layer.
    """

    def __init__(self, input_size: int, hidden_size: int, output_size: int) -> None:
        super(RNN, self).__init__()

        self.hidden_size = hidden_size

        self.i2h = nn.Linear(input_size, hidden_size)
        self.h2h = nn.Linear(hidden_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, output_size)
        # Kept as an attribute for callers that want probabilities
        # (``model.softmax(logits)``); it is deliberately NOT applied in forward.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input: torch.Tensor, hidden: torch.Tensor):
        """Run one recurrence step.

        Args:
            input: Tensor whose last dimension is ``input_size``.
            hidden: Previous hidden state; a ``(1, hidden_size)`` tensor such as
                the one from ``initHidden`` broadcasts over the batch.

        Returns:
            ``(output, hidden)`` where ``output`` holds the unnormalized class
            scores and ``hidden`` is the new hidden state.
        """
        hidden = torch.tanh(self.i2h(input) + self.h2h(hidden))
        output = self.h2o(hidden)
        return output, hidden

    def initHidden(self) -> torch.Tensor:
        """Return an all-zero ``(1, hidden_size)`` hidden state on the model's device."""
        return torch.zeros(1, self.hidden_size, device=self.i2h.weight.device)

# Model parameters
input_size = len(vocabulary)  # one input feature per vocabulary word
hidden_size = 128
output_size = 2  # Number of classes (ham or spam)

# Seed PyTorch's global RNG before any weights are drawn so that a fresh
# Restart & Run All reproduces the same initialisation (and the same shuffling
# for loaders that draw from the global generator). 42 matches the
# random_state used for the train/test split.
torch.manual_seed(42)

# Initialize the RNN
rnn = RNN(input_size, hidden_size, output_size)

In [None]:
# Train the model
# Loss function and optimizer
# nn.CrossEntropyLoss applies log-softmax internally and is defined on raw
# (unnormalized) class scores.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(rnn.parameters(), lr=0.01)

# Training loop
n_epochs = 10

for epoch in range(n_epochs):
    # evaluate() puts the model into eval mode; switch back explicitly so that
    # re-running this cell after an evaluation still trains in training mode.
    rnn.train()
    total_loss = 0.0
    for inputs, labels in train_loader:
        # Every sample is a single bag-of-words vector, so the recurrence runs
        # for exactly one step starting from a fresh zero hidden state.
        hidden = rnn.initHidden()

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs, hidden = rnn(inputs, hidden)

        # Compute the loss and backpropagate
        loss = criterion(outputs, labels)
        loss.backward()

        # Update the parameters
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    print(f'Epoch {epoch+1}/{n_epochs}, Loss: {total_loss/len(train_loader)}')

# Save the trained model
torch.save(rnn.state_dict(), 'spam_rnn_model.pth')

Epoch 1/10, Loss: 0.3343365837420736
Epoch 2/10, Loss: 0.32102747431823186
Epoch 3/10, Loss: 0.3182699435523578
Epoch 4/10, Loss: 0.3170607822281974
Epoch 5/10, Loss: 0.3170570154275213
Epoch 6/10, Loss: 0.3170567552958216
Epoch 7/10, Loss: 0.31705665247780934
Epoch 8/10, Loss: 0.3170565692441804
Epoch 9/10, Loss: 0.3170565243278231
Epoch 10/10, Loss: 0.3170564843075616


In [None]:
# Evaluation function
def evaluate(model, data_loader):
    """Return the fraction of samples in ``data_loader`` classified correctly.

    The model is switched to eval mode (and left in it). For each batch the
    model is called with a fresh hidden state from ``model.initHidden()``, and
    the predicted class is the index of the largest score in each output row.

    Args:
        model: Module exposing ``initHidden()`` and returning
            ``(outputs, hidden)`` when called with ``(inputs, hidden)``.
        data_loader: Iterable of ``(inputs, labels)`` batches.

    Returns:
        ``correct / total`` as a float.
    """
    model.eval()
    n_correct, n_seen = 0, 0

    with torch.no_grad():
        for batch_inputs, batch_labels in data_loader:
            scores, _ = model(batch_inputs, model.initHidden())
            predictions = scores.argmax(dim=1)
            n_seen += batch_labels.size(0)
            n_correct += (predictions == batch_labels).sum().item()

    return n_correct / n_seen

# Evaluate on the test set
# evaluate() returns a fraction in [0, 1]; it is shown as a percentage below.
test_accuracy = evaluate(model=rnn, data_loader=test_loader)
print(f'Test Accuracy: {test_accuracy * 100:.2f}%')

Test Accuracy: 97.85%
