<a href="https://colab.research.google.com/github/AkashKoley012/Deep-Learning-Projects/blob/main/Spam%20Email%20Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Collecting Data

In [25]:
!kaggle datasets download -d uciml/sms-spam-collection-dataset

Dataset URL: https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset
License(s): unknown
sms-spam-collection-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)


In [26]:
import zipfile
zip_ref = zipfile.ZipFile('/content/sms-spam-collection-dataset.zip', 'r')
zip_ref.extractall('/content')
zip_ref.close()

# Imports liberies

In [27]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam

# Data Preprocessing

In [28]:
df = pd.read_csv('/content/spam.csv', encoding='latin-1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [29]:
df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [30]:
label_encoder = LabelEncoder()
df['v1'] = label_encoder.fit_transform(df['v1'])
df.head()

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


# Tokenize

In [31]:
def tokenize(text):
  text = text.lower()
  text = text.replace('.','')
  return text.split()

In [32]:
len(tokenize(df['v2'][0]))

20

In [33]:
vocab = {'<UNK>':0}

for sentence in df['v2']:
  for word in tokenize(sentence):
    if word not in vocab:
      vocab[word] = len(vocab)

In [34]:
max_len = max([len(tokenize(df['v2'][i])) for i in range(len(df))])
max_len

171

In [35]:
def pad_sequence(seq, max_len=100):
  return seq[:max_len] + [0] * (max_len - len(seq))

In [36]:
def vectorize(sentence):
  vector = []
  for word in tokenize(sentence):
    if word in vocab:
      vector.append(vocab[word])
    else:
      vector.append(vocab['<UNK>'])
  return pad_sequence(vector)

In [37]:
df['v2'] = df['v2'].apply(vectorize)
df.head()

Unnamed: 0,v1,v2
0,0,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
1,0,"[21, 22, 23, 24, 25, 26, 0, 0, 0, 0, 0, 0, 0, ..."
2,1,"[27, 28, 8, 29, 30, 31, 32, 33, 34, 35, 36, 37..."
3,0,"[25, 50, 51, 52, 53, 54, 25, 55, 56, 57, 51, 0..."
4,0,"[58, 59, 60, 61, 62, 63, 33, 64, 62, 65, 66, 6..."


# Train Test Split

In [38]:
X_train, X_test, y_train, y_test = train_test_split(df['v2'], df['v1'], test_size=0.2, random_state=42)

In [39]:
type(X_train)

In [40]:
np.array(X_train.tolist()).shape

(4457, 100)

# Dataset & DataLoader

In [41]:
class SpamDataset(Dataset):

  def __init__(self, X, y):
    self.X = torch.tensor(X, dtype=torch.long)
    self.y = torch.tensor(y, dtype=torch.long)

  def __len__(self):
    return len(self.y)

  def __getitem__(self, idx):
    return self.X[idx], self.y[idx]

In [42]:
# train_dataset = SpamDataset(X_train, y_train)
# test_dataset = SpamDataset(X_test, y_test)
train_dataset = SpamDataset(np.array(X_train.tolist()), np.array(y_train.tolist()))
test_dataset = SpamDataset(np.array(X_test.tolist()), np.array(y_test.tolist()))

In [43]:
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=True)

In [44]:
for feature, label in train_dataloader:
  print(feature, label)

tensor([[4388, 4389,  279,  ...,    0,    0,    0],
        [5463,  762,  101,  ...,    0,    0,    0],
        [ 165,    8, 3373,  ...,    0,    0,    0],
        ...,
        [  61,  475,  299,  ...,    0,    0,    0],
        [ 418,  140,  207,  ...,    0,    0,    0],
        [2290,   70, 2291,  ...,    0,    0,    0]]) tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0])
tensor([[3345,   58, 5562,  ...,    0,    0,    0],
        [ 680, 5767,  101,  ...,    0,    0,    0],
        [1491,   59,  382,  ...,    0,    0,    0],
        ...,
        [3358,  269, 1318,  ...,    0,    0,    0],
        [ 254,  707,  338,  ...,    0,    0,    0],
        [ 332,   59,  852,  ...,    0,    0,    0]]) tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0

# Model

In [45]:
class RNN(nn.Module):

  def __init__(self, vocab_size):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, 50)
    self.rnn = nn.RNN(50, 64, batch_first=True)
    self.fc = nn.Linear(64, 2)

  def forward(self, x):
    x = self.embedding(x)
    _, hidden = self.rnn(x)
    x = self.fc(hidden.squeeze(0))
    return x

# Train & Evaluation

In [51]:
lr = 0.001
epochs = 10

In [53]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = RNN(len(vocab)).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=lr)
print(device)

cuda


In [54]:
for epoch in range(epochs):
  total_loss = 0
  for feature, label in train_dataloader:
    feature, label = feature.to(device), label.to(device)
    optimizer.zero_grad()
    output = model(feature)
    loss = criterion(output, label)
    loss.backward()
    optimizer.step()
    total_loss += loss.item()
  print(f'Epoch: {epoch+1}, Loss: {total_loss/len(train_dataloader)}')

Epoch: 1, Loss: 0.4016817554831505
Epoch: 2, Loss: 0.3955520348889487
Epoch: 3, Loss: 0.39582941745008743
Epoch: 4, Loss: 0.39591106814997534
Epoch: 5, Loss: 0.3948737810764994
Epoch: 6, Loss: 0.3966163728918348
Epoch: 7, Loss: 0.3953001809971673
Epoch: 8, Loss: 0.3942330756357738
Epoch: 9, Loss: 0.3962723574468068
Epoch: 10, Loss: 0.3937445851308959
Epoch: 11, Loss: 0.3948189109563828
Epoch: 12, Loss: 0.3946747049689293
Epoch: 13, Loss: 0.3943984180688858
Epoch: 14, Loss: 0.39435939214059285
Epoch: 15, Loss: 0.39473830035754615
Epoch: 16, Loss: 0.3960753519620214
Epoch: 17, Loss: 0.3946407254253115
Epoch: 18, Loss: 0.3933553716966084
Epoch: 19, Loss: 0.3936884858778545
Epoch: 20, Loss: 0.39358684739896227
Epoch: 21, Loss: 0.3942023417779377
Epoch: 22, Loss: 0.3955491772719792
Epoch: 23, Loss: 0.3948347972972052
Epoch: 24, Loss: 0.3962248725550515
Epoch: 25, Loss: 0.3962325285587992
Epoch: 26, Loss: 0.39414997696876525
Epoch: 27, Loss: 0.39421790476356233
Epoch: 28, Loss: 0.39435897873

In [55]:
model.eval()

total = 0
correct = 0

with torch.no_grad():
  for feature, label in test_dataloader:
    feature, label = feature.to(device), label.to(device)
    output = model(feature)
    _, pred = torch.max(output, 1)
    total += label.size(0)
    correct += (pred == label).sum().item()

print(f'Accuracy: {correct/total}')

Accuracy: 0.8654708520179372


In [56]:
model.eval()

total = 0
correct = 0

with torch.no_grad():
  for feature, label in train_dataloader:
    feature, label = feature.to(device), label.to(device)
    output = model(feature)
    _, pred = torch.max(output, 1)
    total += label.size(0)
    correct += (pred == label).sum().item()

print(f'Accuracy: {correct/total}')

Accuracy: 0.8660533991474085
