# **Import Dataset**

In [None]:
import os
import tarfile
import urllib.request
import pandas as pd
import email
import string
from string import punctuation
import spacy
from bs4 import BeautifulSoup
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier

In [None]:
# Ham archives of the public SpamAssassin corpus.
DOWNLOAD_HAM = [
    "https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham.tar.bz2",
    "https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham_2.tar.bz2",
    "https://spamassassin.apache.org/old/publiccorpus/20030228_hard_ham.tar.bz2",
]

# Spam archives of the same corpus.
DOWNLOAD_SPAM = [
    "https://spamassassin.apache.org/old/publiccorpus/20050311_spam_2.tar.bz2",
    "https://spamassassin.apache.org/old/publiccorpus/20030228_spam.tar.bz2",
]


def fetch_tar_file(url,extract_path,file_name):
  """Download the tar archive at ``url`` and unpack it into ``extract_path``.

  The archive itself is saved as ``datasets/<file_name>`` (relative to the
  current working directory); its members are extracted under
  ``extract_path``.

  Args:
    url: Location of the archive (any scheme ``urllib.request`` can open).
    extract_path: Directory receiving the extracted members; created if missing.
    file_name: Name under which the downloaded archive is stored.
  """
  download_dir = "datasets"
  # The archive directory is independent of extract_path, so make sure both
  # exist; previously only extract_path was created and the download failed
  # whenever the two differed.
  os.makedirs(download_dir, exist_ok=True)
  os.makedirs(extract_path, exist_ok=True)

  tgz_path = os.path.join(download_dir, file_name)
  urllib.request.urlretrieve(url, tgz_path)

  # The context manager closes the archive even if extraction raises.
  # NOTE(review): extractall() trusts the member paths stored in the archive;
  # only use this with archives from a trusted source.
  with tarfile.open(tgz_path) as file_tgz:
    file_tgz.extractall(extract_path)

# Download and unpack every archive; each one is stored as "ham<N>" or
# "spam<N>", where N is its position in the corresponding URL list.
for prefix, urls in (("ham", DOWNLOAD_HAM), ("spam", DOWNLOAD_SPAM)):
  for index, url in enumerate(urls):
    fetch_tar_file(url, "datasets", prefix + str(index))



In [None]:
def load_text_data(path,df):
  """Read every file in the immediate sub-folders of ``path`` into a DataFrame.

  Each sub-folder whose own name contains "ham" yields label 0; every other
  sub-folder yields label 1. Files are decoded as UTF-8 with undecodable bytes
  replaced.

  Args:
    path: Directory whose sub-folders hold one message per file.
    df: Existing two-column frame (text, label); its rows are kept after the
      newly loaded ones.

  Returns:
    A DataFrame with the columns of ``df``: the newly read rows (most recently
    read first) followed by the rows of ``df``. ``df`` itself is returned when
    no file is found.
  """
  records = []
  # Sorted traversal makes the row order reproducible across file systems.
  subfolders = sorted(entry.path for entry in os.scandir(path) if entry.is_dir())
  for folder in subfolders:
    # Look only at the folder's own name so that "ham" appearing in a parent
    # directory cannot turn a spam folder into ham.
    folder_name = os.path.basename(os.path.normpath(folder))
    label = 0 if "ham" in folder_name else 1
    # entry.path already contains the folder; joining it to the folder again
    # produced non-existent paths whenever `path` was relative.
    file_paths = sorted(entry.path for entry in os.scandir(folder) if entry.is_file())
    for file_path in file_paths:
      with open(file_path,"r",encoding="utf-8",errors="replace") as handle:
        records.append([handle.read(),label])

  if not records:
    return df

  # Rows used to be prepended one at a time; reversing once keeps that order
  # while building the frame in a single step instead of a quadratic concat.
  records.reverse()
  loaded = pd.DataFrame(records, columns=df.columns)
  if df.empty:
    return loaded
  return pd.concat([loaded, df], ignore_index=True)

In [None]:
# Start from an empty frame with the two columns the loader fills in.
df = pd.DataFrame(columns=["text","label"])

# Resolve the "datasets" directory written by the download step against the
# current working directory instead of hard-coding the Colab-only
# "/content/datasets" location (identical when running in /content).
df = load_text_data(os.path.abspath("datasets"),df)

In [None]:
# Preview the first rows of the freshly loaded frame (raw message text and label).
df.head()

Unnamed: 0,text,label
0,From ilug-admin@linux.ie Tue Sep 10 18:15:29 ...,1
1,From fholland@bigfoot.com Wed Sep 11 19:43:52...,1
2,From safety33o@l8.newnamedns.com Mon Aug 26 2...,1
3,From aig@insiq.us Tue Sep 24 23:55:56 2002\nR...,1
4,From k_v_g20022002@yahoo.fr Mon Sep 2 17:00:...,1


In [None]:
# Summarise row count, non-null counts and column dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6051 entries, 0 to 6050
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    6051 non-null   object
 1   label   6051 non-null   object
dtypes: object(2)
memory usage: 94.7+ KB


In [None]:
# Count the rows carrying label 1 (the non-"ham" folders), per column.
spam_mask = df["label"] == 1
df[spam_mask].count()

text     1898
label    1898
dtype: int64

# **Data Cleaning**

In [None]:
# Regular expressions are great for their intended purpose: finding highly variable needles in highly variable haystacks.
# They are very slow for this task, however, so we parse the messages with the parser from Python's standard-library `email` package instead.

In [None]:
def parse_text(text):
  """Return the body of a raw e-mail message, without any headers.

  For a single-part message the payload string is returned unchanged. For a
  multipart message the payloads of all leaf parts are joined with newlines;
  the MIME headers of the individual parts are not included. Payloads are
  returned as stored, i.e. content-transfer-encodings are not decoded.

  Args:
    text: Full message source (headers followed by body) as a string.

  Returns:
    The body text as a single string ("" when the message has no body).
  """
  # message_from_string only needs the top-level `email` import, whereas
  # `email.parser` is a sub-module that is not guaranteed to be loaded.
  message = email.message_from_string(text)

  # walk() yields the message itself and, recursively, every sub-part;
  # container parts carry no text of their own and are skipped.
  leaf_payloads = [
      part.get_payload() for part in message.walk() if not part.is_multipart()
  ]

  return "\n".join(payload for payload in leaf_payloads if isinstance(payload, str))

In [None]:
# Replace each raw message with the body extracted by parse_text.
df['text'] = df['text'].map(parse_text)

In [None]:
# Preview the frame again now that the "text" column holds the parsed bodies.
df.head()

Unnamed: 0,text,label
0,"Dear Partner to be,\n\nFirst, I must apologise...",1
1,<html>\n<head>\n</head>\n<center>\n<h1>\n<b><f...,1
2,Protect your financial well-being.\nPurchase a...,1
3,"Content-Type: text/plain;\n\tcharset=""Windows-...",1
4,Content-Type: text/plain; charset=iso-8859-1\n...,1


# **Pre-Processing**

In [None]:
# Fetch the small English spaCy model that is loaded in the next cell.
!python -m spacy download en_core_web_sm

2022-08-29 13:12:46.562642: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 5.0 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [None]:
# Only token flags and lemmas are read downstream, so the dependency parser and
# the named-entity recogniser are switched off; they are the most expensive
# pipeline components and their output is never used in this notebook.
preprocessing_model = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [None]:
def is_ok(token):
  """Tell whether a token should be kept as a feature.

  A token is rejected when it is a stop word, has a length of three or less,
  is punctuation, or is whitespace.

  Args:
    token: Object exposing ``is_stop``, ``is_punct``, ``is_space`` and ``len()``.

  Returns:
    True when none of the rejection criteria applies, otherwise False.
  """
  rejected = (
      token.is_stop
      or len(token) <= 3
      or token.is_punct
      or token.is_space
  )
  return not rejected

In [None]:
def preprocessing(text):
  """Convert a message body into a list of lower-cased lemmas.

  Steps: strip HTML markup, replace every digit and ASCII punctuation
  character with a space, run the spaCy pipeline, and keep the lemma of each
  token accepted by ``is_ok``.

  Args:
    text: Message body, possibly containing HTML.

  Returns:
    List of lower-cased lemma strings.
  """
  # Name the parser explicitly: without it BeautifulSoup picks whichever
  # parser happens to be installed (and warns), so results could differ
  # between environments.
  text_without_tags = BeautifulSoup(text, "html.parser").get_text()

  # One pass over the text instead of one str.replace() call per character,
  # which was quadratic on long messages; the resulting string is the same.
  cleaned_text = "".join(
      " " if (char.isdigit() or char in punctuation) else char
      for char in text_without_tags
  )

  doc = preprocessing_model(cleaned_text)

  return [token.lemma_.lower() for token in doc if is_ok(token)]

In [None]:
# Replace each message body with its list of lemmas.
df["text"] = df["text"].map(preprocessing)

In [None]:
# A fixed seed makes the train/test partition, and therefore every score
# reported below, identical on each Restart & Run All.
shuffler = StratifiedShuffleSplit(n_splits=1,test_size=0.2,random_state=42)

# split() yields positional row numbers, so select with iloc; loc would look
# them up as index labels and only matches on a default RangeIndex.
train_index, test_index = next(shuffler.split(df,df['label']))
train_set = df.iloc[train_index]
test_set = df.iloc[test_index]

In [None]:
# Separate the token lists (features) from the labels for both partitions.
train_x, train_y = train_set["text"], train_set["label"]
test_x, test_y = test_set["text"], test_set["label"]

In [None]:
def dummy(doc):
  """Identity function: hand the document back untouched."""
  return doc

# The documents are already lists of tokens, so both the preprocessing and the
# tokenising hooks are replaced by the identity function and the regex token
# pattern is switched off.
vectorizer_options = dict(
    analyzer="word",
    preprocessor=dummy,
    tokenizer=dummy,
    token_pattern=None,
)
tfidf = TfidfVectorizer(**vectorizer_options)

# Learn vocabulary and IDF weights on the training split only, then reuse
# them to encode the test split.
train_x = tfidf.fit_transform(train_x)
test_x = tfidf.transform(test_x)

In [None]:
# Cast the label columns to integers before handing them to the estimators.
train_y = train_y.astype(int)
test_y = test_y.astype(int)

# **Training and Testing**

**Logistic Regression**

In [None]:
# Baseline model: logistic regression with default settings.
lr = LogisticRegression()

lr.fit(X=train_x, y=train_y)

LogisticRegression()

In [None]:
# Predict labels for the held-out test matrix with the fitted logistic regression.
predictions = lr.predict(test_x)

In [None]:
# Score the logistic-regression predictions against the true test labels.
acc_score = accuracy_score(y_true=test_y, y_pred=predictions)
f1_score_ = f1_score(y_true=test_y, y_pred=predictions)

In [None]:
# Report the test-set metrics computed in the previous cell.
print("Accuracy:",acc_score)
print("F1 Score:",f1_score_)

Accuracy: 0.9587118084227911
F1 Score: 0.9318801089918256


**SVM Classifier (with GridSearch Fine Tuning)**

In [None]:
# 5-fold cross-validated search over the regularisation strength and kernel.
svc = SVC()
param_grid = {'C':[1,0.1,0.01],'kernel':['rbf','linear']}
gs = GridSearchCV(
    estimator=svc,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    return_train_score=True,
)

gs.fit(train_x,train_y)


GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': [1, 0.1, 0.01], 'kernel': ['rbf', 'linear']},
             return_train_score=True, scoring='accuracy')

In [None]:
# Show the parameter combination with the best cross-validated accuracy.
gs.best_params_

{'C': 1, 'kernel': 'linear'}

In [None]:
# Build the final SVM from the parameters the grid search selected instead of
# copying them in by hand, so this cell stays correct if the search result
# changes (for example with a different train/test split).
svc = SVC(**gs.best_params_)
svc.fit(train_x,train_y)
predictions = svc.predict(test_x)

In [None]:
# Score the SVM predictions against the true test labels.
acc_score = accuracy_score(y_true=test_y, y_pred=predictions)
f1_score_ = f1_score(y_true=test_y, y_pred=predictions)

In [None]:
# Report the test-set metrics computed in the previous cell.
print("Accuracy:",acc_score)
print("F1 Score:",f1_score_)

Accuracy: 0.9867877786952931
F1 Score: 0.978891820580475


**XGBoost (with GridSearch Fine Tuning)**

In [None]:
# 5-fold cross-validated search over tree depth and minimum child weight at a
# fixed learning rate.
param_grid = {'learning_rate':[0.1],'min_child_weight':[1,2,0.5],'max_depth':[3,5,8]}
xgbc = XGBClassifier()

gs = GridSearchCV(estimator=xgbc, param_grid=param_grid, cv=5, scoring='accuracy')
gs.fit(train_x,train_y)

GridSearchCV(cv=5, estimator=XGBClassifier(),
             param_grid={'learning_rate': [0.1], 'max_depth': [3, 5, 8],
                         'min_child_weight': [1, 2, 0.5]},
             scoring='accuracy')

In [None]:
# Show the parameter combination with the best cross-validated accuracy.
gs.best_params_

{'learning_rate': 0.1, 'max_depth': 8, 'min_child_weight': 2}

In [None]:
# Take the tuned parameters straight from the grid search rather than copying
# them in by hand, so they cannot go stale; the final model keeps its larger
# ensemble of 400 trees.
xgbc = XGBClassifier(n_estimators=400, **gs.best_params_)
xgbc.fit(train_x,train_y)
predictions = xgbc.predict(test_x)

In [None]:
# Score the XGBoost predictions against the true test labels.
acc_score = accuracy_score(y_true=test_y, y_pred=predictions)
f1_score_ = f1_score(y_true=test_y, y_pred=predictions)

In [None]:
# Report the test-set metrics computed in the previous cell.
print("Accuracy:",acc_score)
print("F1 Score:",f1_score_)

Accuracy: 0.9777043765483072
F1 Score: 0.9639519359145527
