In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Install Spacy


In [None]:
!pip install -U pip setuptools wheel
!pip install -U 'spacy[cuda-autodetect,transformers,lookups]'
!python -m spacy download en_core_web_sm



Import modules

In [4]:
import spacy
import pandas as pd
import nltk
import random
import string
import numpy as np
import seaborn as sns
from spacy.util import minibatch
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from spacy.lang.en import English
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score, plot_confusion_matrix, classification_report, accuracy_score, f1_score, confusion_matrix
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn import metrics
%matplotlib inline




In [5]:
#load data without metadata

data_1 = pd.read_csv("/content/drive/MyDrive/Uni/emails.csv", delimiter=',')
data_1['text'] = data_1['text'].str.replace('Subject:', '') # remove "Subject: from column"
data_1.head()


Unnamed: 0,text,spam
0,naturally irresistible your corporate identit...,1
1,the stock trading gunslinger fanny is merril...,1
2,unbelievable new homes made easy im wanting ...,1
3,4 color printing special request additional ...,1
4,"do not have money , get software cds from her...",1


In [6]:
# load data with metadata to later test it on the model
data_2 = pd.read_csv("/content/drive/MyDrive/emails_mit_Metadaten.csv", delimiter=',')
data_2 = data_2.drop("Unnamed: 0", axis=1)
data_2['text'] = data_2['content'] + ' ' + data_2['metadata'] #concetenate columns to fit to the model
data_2.head()

Unnamed: 0,metadata,content,spam,text
0,From 12a1mailbot1@web.de Thu Aug 22 13:17:22 ...,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Tr...",False,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Tr..."
1,From ilug-admin@linux.ie Thu Aug 22 13:27:39 ...,1) Fight The Risk of Cancer!\nhttp://www.adcli...,False,1) Fight The Risk of Cancer!\nhttp://www.adcli...
2,From sabrina@mx3.1premio.com Thu Aug 22 14:44...,1) Fight The Risk of Cancer!\nhttp://www.adcli...,False,1) Fight The Risk of Cancer!\nhttp://www.adcli...
3,From wsup@playful.com Thu Aug 22 16:17:00 200...,##############################################...,False,##############################################...
4,From social-admin@linux.ie Thu Aug 22 16:37:3...,I thought you might like these:\n1) Slim Down ...,False,I thought you might like these:\n1) Slim Down ...


Data exploration

In [7]:

display(data_1.info(),data_1.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5728 entries, 0 to 5727
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5728 non-null   object
 1   spam    5728 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 89.6+ KB


None

Unnamed: 0,text,spam
0,naturally irresistible your corporate identit...,1
1,the stock trading gunslinger fanny is merril...,1
2,unbelievable new homes made easy im wanting ...,1
3,4 color printing special request additional ...,1
4,"do not have money , get software cds from her...",1


Less spam emails than in the dataset, which creates a heavy bias -> detecting the spam emails will be a chellenge


In [8]:
import plotly.express as px
df= data_1['spam'].value_counts()
fig = px.bar(df,hover_data=[df.index],width=500,height = 300)
fig.update_traces(marker_color='rgb(158,202,225)', marker_line_color='rgb(8,48,107)',
                   marker_line_width=1.5, opacity=0.6)
fig.update_layout(title='Spam 0 or 1')
fig.show()
              

*Data Preprocessing*

1. **Tokenization**

---


---


Tokenization function that takes in a sentence as input and returns a list of tokens (lemmatized, lowercased, and without stop words or punctuation)

In [9]:
#Create list of punctuation marks
punctuations = string.punctuation

# Create list of stopwords
#nlp = spacy.load("en_core_web_sm")
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Load the English model and create a new parser object
parser = spacy.load("en_core_web_sm")

# Define the spacy_tokenizer function as before
def spacy_tokenizer(sentence):
    # Tokenize the input sentence using the parser object
    mytokens = parser(sentence)

    # Perform lemmatization and lowercasing
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]

    # Remove stop words and punctuations
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # Return the preprocessed list of tokens
    return mytokens


Custom transformer class that takes in a list of texts (X) and returns a list of cleaned texts. The cleaning process consists of stripping leading and trailing spaces and converting the text to lowercase.

In [10]:
# Custom transformer using spaCy
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        # Cleaning Text
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}
# Basic function to clean the text
def clean_text(text):
    # Removing spaces and converting text into lowercase
    return text.strip().lower()

In [11]:
# creates a CountVectorizer object with the specified tokenizer function
# the output is a matrix of token counts, where each row represents a document and each column represents a token
bow_vector = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1))


In [12]:
# creates a TfidfVectorizer object (statistical measure that weights the importance of each word in a document)
# the output is a matrix of TF-IDF features, where each row represents a document and each column represents a token
tfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer)

**2. Create Pipeline, train and evaluate models**

---



---



In [13]:
# split into train and test data
X = data_1['text'] # features to analyze
ylabels = data_1['spam'] # labels
X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.3)

In [14]:
# split into train an test data
X_test_2_split = data_2['text']
ylabels_2 = data_2['spam']
X_train_2, X_test_2_split, y_train_2, y_test_2 = train_test_split(X_test_2_split, ylabels_2, test_size=0.5)

Create a pipeline with three components: a cleaner, a vectorizer, and a classifier. Then use several algorithm to train the models.

In [15]:
# Logistic Regression Classifier (bow_vector)
classifier = LogisticRegression()

# Create pipeline
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', bow_vector),
                 ('classifier', classifier)])

# model training
pipe.fit(X_train,y_train)


Pipeline(steps=[('cleaner', <__main__.predictors object at 0x7f0bed783a90>),
                ('vectorizer',
                 CountVectorizer(tokenizer=<function spacy_tokenizer at 0x7f0bf029d160>)),
                ('classifier', LogisticRegression())])

In [16]:
# Predicting using the test dataset
predicted = pipe.predict(X_test)


# Model Accuracy
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted, zero_division=0)) 
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted, zero_division=0))

Logistic Regression Accuracy: 0.9924374636416521
Logistic Regression Precision: 0.9901234567901235
Logistic Regression Recall: 0.9780487804878049


In [17]:
# Predicting using the foreign test dataset with metadata & text  
predicted = pipe.predict(X_test_2_split)



# Model Accuracy
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test_2, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test_2, predicted, zero_division=0)) 
print("Logistic Regression Recall:",metrics.recall_score(y_test_2, predicted, zero_division=0))

Logistic Regression Accuracy: 0.737221494102228
Logistic Regression Precision: 0.8216654384672071
Logistic Regression Recall: 0.8751962323390895


In [18]:
# Random Forest Classifier (bow_vector)
classifier_1 = classifier = RandomForestClassifier(random_state=42)

# Create pipeline 
pipe_1 = Pipeline([("cleaner", predictors()),
                 ('vectorizer', bow_vector),
                 ('classifier', classifier_1)])

# model training
pipe_1.fit(X_train,y_train)

Pipeline(steps=[('cleaner', <__main__.predictors object at 0x7f0bea2007f0>),
                ('vectorizer',
                 CountVectorizer(tokenizer=<function spacy_tokenizer at 0x7f0bf029d160>)),
                ('classifier', RandomForestClassifier(random_state=42))])

In [19]:
# Predict usingg the test dataset
predicted_1 = pipe_1.predict(X_test)


# Model Accuracy
print("Random Forest Accuracy:",metrics.accuracy_score(y_test, predicted_1))
print("Random Forest Precision:",metrics.precision_score(y_test, predicted_1, zero_division=0)) #however using zero_divsion no value in the output!!
print("Random Forest Recall:",metrics.recall_score(y_test, predicted_1, zero_division=0)) #however using zero_divsion no value in the output!!

Random Forest Accuracy: 0.9819662594531704
Random Forest Precision: 0.9922077922077922
Random Forest Recall: 0.9317073170731708


In [25]:
# Predicting using the foreign test dataset with metadata & text  
predicted_1 = pipe_1.predict(X_test_2_split)



# Model Accuracy
print("Random Forest Accuracy:",metrics.accuracy_score(y_test_2, predicted_1))
print("Random Forest Accuracy:",metrics.precision_score(y_test_2, predicted_1, zero_division=0)) 
print("Random Forest Accuracy:",metrics.recall_score(y_test_2, predicted_1, zero_division=0))

Random Forest Accuracy: 0.6127129750982963
Random Forest Accuracy: 0.805183199285076
Random Forest Accuracy: 0.707221350078493


In [20]:
# Logistic Regression Classifier (tfidf_vector)
classifier = LogisticRegression()

# Create pipeline 
pipe_2 = Pipeline([("cleaner", predictors()),
                 ('vectorizer', tfidf_vector),
                 ('classifier', classifier)])

# model training 
pipe_2.fit(X_train,y_train)

Pipeline(steps=[('cleaner', <__main__.predictors object at 0x7f0be9f84c40>),
                ('vectorizer',
                 TfidfVectorizer(tokenizer=<function spacy_tokenizer at 0x7f0bf029d160>)),
                ('classifier', LogisticRegression())])

In [21]:
# Predict using the test dataset
predicted_2 = pipe_2.predict(X_test)


# Model Accuracy
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted_2))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted_2, zero_division=0)) #however using zero_divsion no value in the output!!
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted_2, zero_division=0)) #however using zero_divsion no value in the output!!

Logistic Regression Accuracy: 0.9866201279813845
Logistic Regression Precision: 0.9948849104859335
Logistic Regression Recall: 0.948780487804878


In [26]:
# Predicting using the foreign test dataset with metadata & text 
predicted_2 = pipe_2.predict(X_test_2_split)



# Model Accuracy
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test_2, predicted_2))
print("Logistic Regression Accuracy:",metrics.precision_score(y_test_2, predicted_2, zero_division=0)) 
print("Logistic Regression Accuracy:",metrics.recall_score(y_test_2, predicted_2, zero_division=0))

Logistic Regression Accuracy: 0.39384010484927917
Logistic Regression Accuracy: 0.722860791826309
Logistic Regression Accuracy: 0.44427001569858715


In [22]:
# Random Forest Classifier (tfidf_vector)
classifier_1 = classifier = RandomForestClassifier(random_state=42)

# Create pipeline 
pipe_3 = Pipeline([("cleaner", predictors()),
                 ('vectorizer', tfidf_vector),
                 ('classifier', classifier_1)])

# model training
pipe_3.fit(X_train,y_train)

Pipeline(steps=[('cleaner', <__main__.predictors object at 0x7f0be9f84fa0>),
                ('vectorizer',
                 TfidfVectorizer(tokenizer=<function spacy_tokenizer at 0x7f0bf029d160>)),
                ('classifier', RandomForestClassifier(random_state=42))])

In [23]:
# Predict usning the test dataset
predicted_3 = pipe_3.predict(X_test)


# Model Accuracy
print("Random Forest Accuracy:",metrics.accuracy_score(y_test, predicted_3))
print("Random Forest Precision:",metrics.precision_score(y_test, predicted_3, zero_division=0)) #however using zero_divsion no value in the output!!
print("Random Forest Recall:",metrics.recall_score(y_test, predicted_3, zero_division=0)) #however using zero_divsion no value in the output!!

Random Forest Accuracy: 0.985456660849331
Random Forest Precision: 0.9974160206718347
Random Forest Recall: 0.9414634146341463


In [27]:
# Predicting using the foreign test dataset with metadata & text 
predicted_3 = pipe_3.predict(X_test_2_split)

# Model Accuracy
print("Random Forest Accuracy:",metrics.accuracy_score(y_test_2, predicted_3))
print("Random Forest Accuracy:",metrics.precision_score(y_test_2, predicted_3, zero_division=0)) 
print("Random Forest Accuracy:",metrics.recall_score(y_test_2, predicted_3, zero_division=0))

Random Forest Accuracy: 0.2601572739187418
Random Forest Accuracy: 0.6600441501103753
Random Forest Accuracy: 0.23469387755102042
