#Spacy models trained on dataset with metadata

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Install Spacy


In [None]:
!pip install -U pip setuptools wheel
!pip install -U 'spacy[cuda-autodetect,transformers,lookups]'
!python -m spacy download en_core_web_sm



Import modules

In [8]:
import spacy
import pandas as pd
import nltk
import random
import string
import numpy as np
import seaborn as sns
from spacy.util import minibatch
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from spacy.lang.en import English
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score, plot_confusion_matrix, classification_report, accuracy_score, f1_score, confusion_matrix
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn import metrics
%matplotlib inline


In [56]:
#load data without metadata

data_1 = pd.read_csv("/content/drive/MyDrive/emails_mit_Metadaten.csv", delimiter=',')
data_1 = data_1.drop("Unnamed: 0", axis=1)
data_1.head()

Unnamed: 0,metadata,content,spam
0,From 12a1mailbot1@web.de Thu Aug 22 13:17:22 ...,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Tr...",False
1,From ilug-admin@linux.ie Thu Aug 22 13:27:39 ...,1) Fight The Risk of Cancer!\nhttp://www.adcli...,False
2,From sabrina@mx3.1premio.com Thu Aug 22 14:44...,1) Fight The Risk of Cancer!\nhttp://www.adcli...,False
3,From wsup@playful.com Thu Aug 22 16:17:00 200...,##############################################...,False
4,From social-admin@linux.ie Thu Aug 22 16:37:3...,I thought you might like these:\n1) Slim Down ...,False


In [68]:
#load data without metadata (to later test the models)

data_2 = pd.read_csv("/content/drive/MyDrive/Uni/emails.csv", delimiter=',')
data_2['text'] = data_2['text'].str.replace('Subject:', '') # remove "Subject: from column"
data_2.rename(columns={'text': 'content'}, inplace=True)
data_2["metadata"] = None
data_2['spam'] = [True if x == 1 else False for x in data_2['spam']]
data_2.head()

Unnamed: 0,content,spam,metadata
0,naturally irresistible your corporate identit...,True,
1,the stock trading gunslinger fanny is merril...,True,
2,unbelievable new homes made easy im wanting ...,True,
3,4 color printing special request additional ...,True,
4,"do not have money , get software cds from her...",True,


Data exploration

Less spam emails than in the dataset, which creates a heavy bias -> detecting the spam emails will be a chellenge


In [11]:
import plotly.express as px
df= data_1['spam'].value_counts()
fig = px.bar(df,hover_data=[df.index],width=500,height = 300)
fig.update_traces(marker_color='rgb(158,202,225)', marker_line_color='rgb(8,48,107)',
                   marker_line_width=1.5, opacity=0.6)
fig.update_layout(title='Spam 0 or 1')
fig.show()
              

*Data Preprocessing*

1. **Tokenization**

---


---


Tokenization function using spacy and takes in a sentence as input and returns a list of tokens (lemmatized, lowercased, and without stop words or punctuation)

In [12]:
#Create list of punctuation marks
punctuations = string.punctuation

# Create list of stopwords
#nlp = spacy.load("en_core_web_sm")
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Load the English model and create a new parser object
parser = spacy.load("en_core_web_sm")

# Define the spacy_tokenizer function as before
def spacy_tokenizer(sentence):
    # Tokenize the input sentence using the parser object
    mytokens = parser(sentence)

    # Perform lemmatization and lowercasing
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]

    # Remove stop words and punctuations
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # Return the preprocessed list of tokens
    return mytokens


Custom predictor class that takes in a list of texts (X) and returns a list of cleaned texts (consists of stripping leading and trailing spaces and converting the text to lowercase).

In [13]:
# Custom transformer using spaCy
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        # Cleaning Text
        return [clean_text(text) for text in X['content']]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}
# Clean the text
def clean_text(text):
    if type(text) != str:
        text = str(text)
    # Removing spaces and converting text into lowercase
    return text.strip().lower()

In [14]:
# creates a CountVectorizer object with the specified tokenizer function
# the output is a matrix of token counts, where each row represents a document and each column represents a token
bow_vector = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1))


In [15]:
# creates a TfidfVectorizer object (statistical measure that weights the importance of each word in a document)
# the output is a matrix of TF-IDF features, where each row represents a document and each column represents a token
tfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer)

**2. Create Pipeline, train and evaluate models**

---



---



In [60]:
# split into train and test data
X = data_1[['content', 'metadata']] # features to analyze
ylabels = data_1['spam'] # labels
X_train, X_test, y_train, y_test = train_test_split(X[['content', 'metadata']], ylabels, test_size=0.3)





In [78]:
# split into train and test data (for testing)
X_test_2_split = data_2[['content', 'metadata']]
ylabels_2 = data_2['spam'] # labels
X_train_2, X_test_2_split, y_train_2, y_test_2 = train_test_split(X_test_2_split[['content', 'metadata']], ylabels_2, test_size=0.5)




Create a  pipeline which takes the predefined spacy-based components: a cleaner, a vectorizer, and a classifier. Then use several algorithms to train the models.

In [17]:
# Logistic Regression Classifier (bow_vector)
classifier = LogisticRegression()
pipe = Pipeline([('cleaner', predictors()),
                  ('vectorizer', bow_vector),
                  ('classifier', classifier)])
pipe.fit(X_train, y_train)




Pipeline(steps=[('cleaner', <__main__.predictors object at 0x7f27dcd11550>),
                ('vectorizer',
                 CountVectorizer(tokenizer=<function spacy_tokenizer at 0x7f27df832430>)),
                ('classifier', LogisticRegression())])

In [18]:
# Predicting using the test dataset with text & metadata
predicted = pipe.predict(X_test)


# Model Accuracy
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted, zero_division=0)) 
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted, zero_division=0))

Logistic Regression Accuracy: 0.9890829694323144
Logistic Regression Precision: 0.9884169884169884
Logistic Regression Recall: 0.9986996098829649


In [80]:
# Predicting using the foreign test dataset with only text 
predicted = pipe.predict(X_test_2_split)


# Model Accuracy
# Model Accuracy
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test_2, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test_2, predicted, zero_division=0)) 
print("Logistic Regression Recall:",metrics.recall_score(y_test_2, predicted, zero_division=0))



Logistic Regression Accuracy: 0.20705307262569833
Logistic Regression Precision: 0.16547066272688898
Logistic Regression Recall: 0.5714285714285714


In [82]:
# Random Forest Classifier (bow_vector)
classifier_1 = classifier = RandomForestClassifier(random_state=42)

# Create pipeline 
pipe_1 = Pipeline([("cleaner", predictors()),
                 ('vectorizer', bow_vector),
                 ('classifier', classifier_1)])

# model training
pipe_1.fit(X_train,y_train)

Pipeline(steps=[('cleaner', <__main__.predictors object at 0x7f27d5696c10>),
                ('vectorizer',
                 CountVectorizer(tokenizer=<function spacy_tokenizer at 0x7f27df832430>)),
                ('classifier', RandomForestClassifier(random_state=42))])

In [83]:
# Predict using the test dataset
predicted_1 = pipe_1.predict(X_test)


# Model Accuracy
print("Random Forest Accuracy:",metrics.accuracy_score(y_test, predicted_1))
print("Random Forest Precision:",metrics.precision_score(y_test, predicted_1, zero_division=0)) #however using zero_divsion no value in the output!!
print("Random Forest Recall:",metrics.recall_score(y_test, predicted_1, zero_division=0)) #however using zero_divsion no value in the output!!

Random Forest Accuracy: 0.962882096069869
Random Forest Precision: 0.9569620253164557
Random Forest Recall: 1.0


In [85]:
# Predicting using the foreign test dataset with only text 
predicted_1 = pipe_1.predict(X_test_2_split)


# Model Accuracy
# Model Accuracy
print("Random Forest Accuracy:",metrics.accuracy_score(y_test_2, predicted_1))
print("Random Forest Accuracy:",metrics.precision_score(y_test_2, predicted_1, zero_division=0)) 
print("Random Forest Accuracy:",metrics.recall_score(y_test_2, predicted_1, zero_division=0))

Random Forest Accuracy: 0.21822625698324022
Random Forest Accuracy: 0.22138500179404377
Random Forest Accuracy: 0.8994169096209913


In [None]:
# Logistic Regression Classifier (tfidf_vector)
classifier = LogisticRegression()

if 'content' not in X.columns:
    raise ValueError("X must have a 'content' column")

# Create pipeline 
pipe_2 = Pipeline([("cleaner", predictors()),
                 ('vectorizer', tfidf_vector),
                 ('classifier', classifier)])

# model training 
pipe_2.fit(X_train,y_train)

Pipeline(steps=[('cleaner', <__main__.predictors object at 0x7f5d60e9bdf0>),
                ('vectorizer',
                 TfidfVectorizer(tokenizer=<function spacy_tokenizer at 0x7f5d5eed94c0>)),
                ('classifier', LogisticRegression())])

In [None]:
# Predict using the test dataset
predicted_2 = pipe_2.predict(X_test)


# Model Accuracy
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted_2))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted_2, zero_division=0)) #however using zero_divsion no value in the output!!
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted_2, zero_division=0)) #however using zero_divsion no value in the output!!

Logistic Regression Accuracy: 0.9366812227074236
Logistic Regression Precision: 0.928921568627451
Logistic Regression Recall: 1.0


In [None]:
# Predicting using the foreign test dataset with only text 
predicted_2 = pipe_2.predict(X_test_2_split)


# Model Accuracy
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test_2, predicted_2))
print("Logistic Regression Precision:",metrics.precision_score(y_test_2, predicted_2, zero_division=0)) 
print("Logistic Regression Recall:",metrics.recall_score(y_test_2, predicted_2, zero_division=0)

In [None]:
# Random Forest Classifier (tfidf_vector)
classifier_1 = classifier = RandomForestClassifier(random_state=42)

# Create pipeline 
pipe_3 = Pipeline([("cleaner", predictors()),
                 ('vectorizer', tfidf_vector),
                 ('classifier', classifier_1)])

# model training
pipe_3.fit(X_train,y_train)

Pipeline(steps=[('cleaner', <__main__.predictors object at 0x7f5d5b22b3d0>),
                ('vectorizer',
                 TfidfVectorizer(tokenizer=<function spacy_tokenizer at 0x7f5d5eed94c0>)),
                ('classifier', RandomForestClassifier(random_state=42))])

In [None]:
# Predict usinf the test dataset
predicted_3 = pipe_3.predict(X_test)


# Model Accuracy
print("Random Forest Accuracy:",metrics.accuracy_score(y_test, predicted_3))
print("Random Forest Precision:",metrics.precision_score(y_test, predicted_3, zero_division=0)) #however using zero_divsion no value in the output!!
print("Random Forest Recall:",metrics.recall_score(y_test, predicted_3, zero_division=0)) #however using zero_divsion no value in the output!!

Random Forest Accuracy: 0.9596069868995634
Random Forest Precision: 0.9557522123893806
Random Forest Recall: 0.9973614775725593


In [None]:
# Predicting using the foreign test dataset with only text 
predicted_3 = pipe_3.predict(X_test_2_split)


# Model Accuracy
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test_2, predicted_3))
print("Logistic Regression Precision:",metrics.precision_score(y_test_2, predicted_3, zero_division=0)) 
print("Logistic Regression Recall:",metrics.recall_score(y_test_2, predicted_3, zero_division=0)