#### A

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

import re
# Run on the first CUDA device when one is visible; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

import urllib.request, zipfile, io

def download_and_unzip(url, extract_to='.'):
    """Download a zip archive from a URL and unpack it into a directory.

    The whole archive is read into memory; only the extracted members are
    written to disk.

    Args:
        url: Location of the zip archive, handed to ``urllib.request.urlopen``.
        extract_to: Directory the archive members are extracted into.
    """
    # Pull the complete response body into memory.
    with urllib.request.urlopen(url) as response:
        payload = response.read()

    # ZipFile wants a file-like object, so wrap the raw bytes before opening.
    archive_buffer = io.BytesIO(payload)
    with zipfile.ZipFile(archive_buffer) as archive:
        archive.extractall(extract_to)

# Call the function to download and unzip the file
download_and_unzip(url='https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip', extract_to='.')

# Load and prepare data. Lower case, no punctuation.
# The file is opened in a `with` block with an explicit encoding so the handle
# is closed promptly and decoding does not depend on the platform default.
# Undecodable bytes are replaced rather than raising; the replacement character
# is non-alphanumeric, so the clean-up below removes it anyway.
with open('./SMSSpamCollection', encoding='utf-8', errors='replace') as raw_file:
    raw_lines = [ln.strip() for ln in raw_file]

# Every run of non-alphanumeric characters collapses to a single space, which
# already rules out repeated spaces -- no second whitespace pass is needed.
# After cleaning, the ham/spam label is simply the first token of each line.
data = [re.sub(r'[^A-Za-z0-9]+', ' ', line).lower() for line in raw_lines]

# Show a small sample instead of dumping the whole corpus into the output.
data[:10]


['ham go until jurong point crazy available only in bugis n great world la e buffet cine there got amore wat ',
 'ham ok lar joking wif u oni ',
 'spam free entry in 2 a wkly comp to win fa cup final tkts 21st may 2005 text fa to 87121 to receive entry question std txt rate t c s apply 08452810075over18 s',
 'ham u dun say so early hor u c already then say ',
 'ham nah i don t think he goes to usf he lives around here though',
 'spam freemsg hey there darling it s been 3 week s now and no word back i d like some fun you up for it still tb ok xxx std chgs to send 1 50 to rcv',
 'ham even my brother is not like to speak with me they treat me like aids patent ',
 'ham as per your request melle melle oru minnaminunginte nurungu vettam has been set as your callertune for all callers press 9 to copy your friends callertune',
 'spam winner as a valued network customer you have been selected to receivea 900 prize reward to claim call 09061701461 claim code kl341 valid 12 hours only ',
 'spam h

#### B

In [2]:
# Hold out 20% of the messages for evaluation.
# `random_state` pins the shuffle so a fresh Restart & Run All reproduces the
# same split; stratifying on the label (the line's leading 'spam'/'ham' token)
# keeps the spam/ham ratio the same in both parts.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    stratify=[line.startswith('spam') for line in data],
)

#### C

In [3]:
def tokenize(data):
    """Split every line into whitespace-separated tokens.

    Args:
        data: Iterable of strings, one message line per element.

    Returns:
        A list holding one list of tokens per input line, in input order.
    """
    token_lists = []
    for line in data:
        token_lists.append(line.split())
    return token_lists

# Tokenise the training split; the vocabulary is built from these lines only.
tokenized_data = tokenize(train_data)

# Vocabulary budget: at most 10,000 entries, two of them reserved for /UNK and /PAD.
vocab_size = 10000

# Count every token except the first one on each line (the ham/spam label).
word_counts = Counter()
for tokens in tokenized_data:
    word_counts.update(tokens[1:])

# Real words get indices 2, 3, ... in order of decreasing frequency.
most_common_words = [entry[0] for entry in word_counts.most_common(vocab_size - 2)]
word_to_idx = {word: idx for idx, word in enumerate(most_common_words, start=2)}

# Indices 0 and 1 belong to the two special tokens.
word_to_idx['/UNK'] = 0
word_to_idx['/PAD'] = 1

print(f'Vocabulary size: {len(word_to_idx)}')


Vocabulary size: 7763


#### D

In [4]:
# Convert words to indices
def encode_message(message, word_to_idx, max_length=30):
    """Turn one labelled message line into a fixed-length list of word indices.

    The first whitespace-separated token of ``message`` is skipped (it is the
    ham/spam label on the lines this notebook passes in). The remaining tokens
    are looked up in ``word_to_idx``.

    The indices used for unknown words and for padding are taken from the
    vocabulary's ``'/UNK'`` and ``'/PAD'`` entries, falling back to 0 and 1
    when the vocabulary does not define them, so the function stays correct
    if the special tokens are ever assigned different indices.

    Args:
        message: A line of the form "<label> <word> <word> ...".
        word_to_idx: Mapping from word to integer index.
        max_length: Exact length of the returned list.

    Returns:
        A list of ``max_length`` integers. Short messages are padded on the
        left; long messages keep only their first ``max_length`` tokens.
    """
    unk_idx = word_to_idx.get('/UNK', 0)
    pad_idx = word_to_idx.get('/PAD', 1)

    tokens = message.split()[1:]  # drop the leading label token
    token_indices = [word_to_idx.get(word, unk_idx) for word in tokens]

    missing = max_length - len(token_indices)
    if missing > 0:
        # Pad at the front so the real words sit at the end of the sequence.
        return [pad_idx] * missing + token_indices
    return token_indices[:max_length]

# Create a custom dataset class
class SMSDataset(Dataset):
    """Dataset over cleaned SMS lines whose first token is the ham/spam label.

    Each item is a pair ``(indices, label)``: a long tensor with the encoded
    message (``max_length`` entries) and a float scalar tensor that is 1 for
    lines starting with 'spam' and 0 otherwise.
    """

    def __init__(self, data, word_to_idx, max_length=30):
        """Store the lines, the vocabulary and the target sequence length."""
        self.max_length = max_length
        self.word_to_idx = word_to_idx
        self.data = data

    def __len__(self):
        """Number of message lines in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode the line at position ``idx`` and return it with its label."""
        raw_line = self.data[idx]
        is_spam = raw_line.startswith('spam')
        indices = encode_message(raw_line, self.word_to_idx, self.max_length)
        features = torch.tensor(indices, dtype=torch.long)
        target = torch.tensor(int(is_spam), dtype=torch.float)
        return features, target

# Wrap both splits in datasets that share the vocabulary built from the training data.
train_dataset = SMSDataset(data=train_data, word_to_idx=word_to_idx)
test_dataset = SMSDataset(data=test_data, word_to_idx=word_to_idx)

# Mini-batches of 32: shuffled for training, fixed order for evaluation.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

print(f'Train batches: {len(train_loader)}, Test batches: {len(test_loader)}')


Train batches: 140, Test batches: 35


#### E

In [5]:
# Define the RNN model
class SimpleRNNClassifier(nn.Module):
    """Binary text classifier: embedding -> vanilla RNN -> linear -> sigmoid.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each word embedding.
        hidden_dim: Size of the RNN hidden state.
        num_layers: Number of stacked RNN layers.
    """

    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=128, num_layers=1):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.rnn = nn.RNN(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=1)

    def forward(self, x):
        """Return one probability per sequence in the batch.

        ``x`` holds word indices with the batch dimension first. Only the
        final hidden state of the last RNN layer feeds the output layer.
        """
        embedded = self.embedding(x)
        _, final_hidden = self.rnn(embedded)
        logits = self.fc(final_hidden[-1])
        # Squash to (0, 1) and drop the trailing singleton dimension.
        return torch.sigmoid(logits).squeeze(1)

# Initialize the model (the loss and the optimizer are created next to the training loop).
# Seed PyTorch's global RNG first so the initial weights -- and later draws from
# that RNG -- are the same on every Restart & Run All.
torch.manual_seed(0)
vocab_size = len(word_to_idx)
# Reuse the notebook-level `device` instead of re-deriving it with a different spelling.
model = SimpleRNNClassifier(vocab_size).to(device)


#### F

In [8]:
import torch.optim as optim
from sklearn.metrics import accuracy_score

# Training parameters
# `device` is chosen once at the top of the notebook and reused here, rather than
# being redefined with a different spelling ('cuda' vs 'cuda:0').
model.to(device)
criterion = nn.BCELoss()  # the model's forward() already applies the sigmoid
optimizer = optim.Adam(model.parameters(), lr=0.001)
num_epochs = 20

# Training loop
for epoch in range(num_epochs):
    model.train()
    for messages, labels in train_loader:
        messages, labels = messages.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(messages)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    # Evaluate on test data
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for messages, labels in test_loader:
            # Only the inputs need to be on the model's device; the labels are
            # just collected for scoring, so they are not moved there and back.
            preds = model(messages.to(device)).round()
            # Store plain Python floats instead of per-element numpy scalars.
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.tolist())

    acc = accuracy_score(all_labels, all_preds)
    print(f'Epoch [{epoch + 1}/{num_epochs}], Accuracy: {acc:.4f}')

Epoch [1/20], Accuracy: 0.9695
Epoch [2/20], Accuracy: 0.9794
Epoch [3/20], Accuracy: 0.9794
Epoch [4/20], Accuracy: 0.9830
Epoch [5/20], Accuracy: 0.9758
Epoch [6/20], Accuracy: 0.9794
Epoch [7/20], Accuracy: 0.9821
Epoch [8/20], Accuracy: 0.9794
Epoch [9/20], Accuracy: 0.9848
Epoch [10/20], Accuracy: 0.9848
Epoch [11/20], Accuracy: 0.9839
Epoch [12/20], Accuracy: 0.9857
Epoch [13/20], Accuracy: 0.9848
Epoch [14/20], Accuracy: 0.9857
Epoch [15/20], Accuracy: 0.9848
Epoch [16/20], Accuracy: 0.9857
Epoch [17/20], Accuracy: 0.9857
Epoch [18/20], Accuracy: 0.9848
Epoch [19/20], Accuracy: 0.9848
Epoch [20/20], Accuracy: 0.9848


#### G

In [9]:
# Print first 10 predictions on the test set
# Invert the vocabulary once so every token is decoded with a single dict lookup,
# instead of rebuilding and scanning the key/value lists for each word.
idx_to_word = {index: word for word, index in word_to_idx.items()}
pad_idx = word_to_idx.get('/PAD', 1)

model.eval()
with torch.no_grad():
    for messages, labels in test_loader:
        messages, labels = messages.to(device), labels.to(device)
        preds = model(messages).round()

        # A batch may hold fewer than 10 messages, so never index past its end.
        for j in range(min(10, len(messages))):
            # Skip padding; anything without a vocabulary entry is shown as /UNK.
            words = [idx_to_word.get(token, '/UNK') for token in messages[j].tolist() if token != pad_idx]
            decoded_message = ' '.join(words)
            label_text = 'spam' if labels[j] == 1 else 'ham'
            pred_text = 'spam' if preds[j] == 1 else 'ham'
            print(f'Message: {decoded_message}')
            print(f'Prediction: {pred_text}, Ground Truth: {label_text}\n')

        break  # only the first batch is displayed


Message: 8007 free for 1st week no1 nokia tone 4 ur mob every week just txt nokia to 8007 get txting and tell ur mates www getzed co uk pobox 36504
Prediction: spam, Ground Truth: spam

Message: we re on the /UNK side from where we dropped you off
Prediction: ham, Ground Truth: ham

Message: ok thanx
Prediction: ham, Ground Truth: ham

Message: going to join tomorrow
Prediction: ham, Ground Truth: ham

Message: did i forget to tell you i want you i need you i crave you but most of all i love you my sweet /UNK /UNK mmmmmm yummy
Prediction: ham, Ground Truth: ham

Message: oh k k but he is not a big /UNK anyway good
Prediction: ham, Ground Truth: ham

Message: goodmorning today i am late for /UNK
Prediction: ham, Ground Truth: ham

Message: you have come into my life and brought the sun /UNK down on me /UNK my heart putting a /UNK smile on my face making me feel loved and cared
Prediction: ham, Ground Truth: ham

Message: when i was born god said oh no another idiot when you were born go