### **STEP 1**: Load and explore the dataset.

Link to dataset: https://www.kaggle.com/datasets/jackksoncsie/spam-email-dataset


In [81]:
import numpy as np
import pandas as pd
import os, csv, pathlib

We access the Google Drive folder containing the dataset to read it.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [97]:
file_path = '/content/drive/MyDrive/AML/Assignment 1'

In [83]:
df = pd.read_csv(file_path + "/emails.csv")
print(df.head(10))

                                                text  spam
0  Subject: naturally irresistible your corporate...     1
1  Subject: the stock trading gunslinger  fanny i...     1
2  Subject: unbelievable new homes made easy  im ...     1
3  Subject: 4 color printing special  request add...     1
4  Subject: do not have money , get software cds ...     1
5  Subject: great nnews  hello , welcome to medzo...     1
6  Subject: here ' s a hot play in motion  homela...     1
7  Subject: save your money buy getting this thin...     1
8  Subject: undeliverable : home based business f...     1
9  Subject: save your money buy getting this thin...     1


In [84]:
df.shape

(5728, 2)

In [85]:
df.spam.value_counts()

0    4360
1    1368
Name: spam, dtype: int64

### **STEP 2**: Preprocess the dataset.

First, we ensure that there are no missing values.

In [86]:
df.isna().sum()

text    0
spam    0
dtype: int64

Next, we split the data into X (features, i.e.'text') and y (target, i.e. 'spam').

In [87]:
X = df['text'].copy()
y = df['spam'].copy()

Next, we remove punctuation marks from the email text as these typically do not contain much information in spam detection.

Also, since every entry of X begins with 'Subject:', we will be removing that too as it is redundant and does not help in spam detection.

In [88]:
def remove_punctuation(text):
  new_text = ''
  for char in text[8:]:
    if char.isalnum() or char == ' ':
      new_text += char
  return new_text

In [89]:
X = X.apply(remove_punctuation)
print(X.head())

0     naturally irresistible your corporate identit...
1     the stock trading gunslinger  fanny is merril...
2     unbelievable new homes made easy  im wanting ...
3     4 color printing special  request additional ...
4     do not have money  get software cds from here...
Name: text, dtype: object


Next, we tokenize the text using TfidfVectorizer.

In [90]:
from sklearn.feature_extraction.text import TfidfVectorizer
t_vectorizer = TfidfVectorizer()

X_new = t_vectorizer.fit_transform(X)
X_new.shape

(5728, 37330)

In [91]:
X_transformed = pd.DataFrame(X_new.toarray())
X_transformed.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,37320,37321,37322,37323,37324,37325,37326,37327,37328,37329
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### **STEP 3**: Split the dataset into train/validation/test.

We split the transformed dataset into three parts:
- 70% for training
- 15% for validation
- 15% for testing

In [92]:
from sklearn.model_selection import train_test_split

train_X, remaining_X, train_y, remaining_y = train_test_split(X_transformed, y, train_size = 0.7, random_state = 2024)
val_X, test_X, val_y, test_y = train_test_split(remaining_X, remaining_y, test_size = 0.5, random_state = 2024)

### **STEP 4**: Store the splits at train.csv/validation.csv/test.csv

We first combine the feature and target arrays to create the training datasets.

In [93]:
df_train = pd.concat([train_X, train_y], axis = 1)
df_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,37321,37322,37323,37324,37325,37326,37327,37328,37329,spam
2261,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3042,0.054163,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2200,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3658,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2971,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


We repeat the same to obtain validation and testing datasets.

In [94]:
df_val = pd.concat([val_X, val_y], axis = 1)

In [95]:
df_test = pd.concat([test_X, test_y], axis = 1)

Then, we save the datasets as csv files in the Google drive folder containing the .ipynb file.

In [98]:
df_train.to_csv(file_path + '/train.csv', index = False)

In [99]:
df_val.to_csv(file_path + '/validation.csv', index = False)

In [100]:
df_test.to_csv(file_path + '/test.csv', index = False)