# Classification of emails into spam and ham
In this project we have a dataset containing different emails and we have to classify them into spam and ham

## Importing libraries

In [1]:
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# nltk.download('punkt')
# nltk.download('wordnet')


import sklearn
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier,AdaBoostClassifier,ExtraTreesClassifier,RandomForestClassifier,GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import Perceptron,SGDClassifier,PassiveAggressiveClassifier
from sklearn.metrics import precision_score,recall_score, f1_score, accuracy_score

#pre-settings 
np.random.seed(42)

## Importing datasets

In [2]:
imported_datasets =pd.read_csv('ham-spam/spamhamdata.csv',sep='\t',header=[0])

In [3]:
imported_datasets.head()

Unnamed: 0,Type,Email
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Data preprocessing

In [4]:
imported_datasets.describe()

Unnamed: 0,Type,Email
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [5]:
imported_datasets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Type    5572 non-null   object
 1   Email   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [6]:

punctuations=string.punctuation
stop_words=stopwords.words('english')
lemmatizer=WordNetLemmatizer()

In [7]:
punctuations=string.punctuation
stop_words=stopwords.words('english')
lemmatizer=WordNetLemmatizer()
#convert spam and ham to 0/1
#remove stop words
    # tokenize the text
    # convert text to lowercase
    # remove punctuation
    # remove numerical values
    # stem or lemmatize the words
    # return the cleaned text
def preprocess(obj):
    words = word_tokenize(obj)
    corpus=[word.lower() for word in words if word not in stop_words and word not in punctuations]
    corpus=[lemmatizer.lemmatize(words) for words in corpus]
    corpus=' '.join(corpus)
    return corpus

In [8]:
imported_datasets['Email']=imported_datasets['Email'].apply(preprocess)

In [9]:
imported_datasets['IsSpam']=imported_datasets['Type'].apply(lambda x:1 if x=='spam' else 0)

In [10]:
imported_datasets=imported_datasets.drop('Type',axis=1)

In [11]:
imported_datasets.head(3)

Unnamed: 0,Email,IsSpam
0,go jurong point crazy .. available bugis n gre...,0
1,ok lar ... joking wif u oni ...,0
2,free entry 2 wkly comp win fa cup final tkts 2...,1


## Data Transformation

### Vectorzation of data

In [28]:
count_vectorizer=CountVectorizer()
tfidf=TfidfVectorizer()
Emails=imported_datasets['Email']
Emails_tfidf=tfidf.fit_transform(Emails)
Emails_count_vectorizer=count_vectorizer.fit_transform(Emails)
print(len(tfidf.get_feature_names_out()))
print(count_vectorizer.get_feature_names_out())

8186
['00' '000' '000pes' ... 'èn' 'ú1' '〨ud']


In [13]:
Emails_tfidf[0].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

## Splitting of data and Model Training

In [14]:
x=Emails_tfidf
y=imported_datasets['IsSpam']
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.2)

In [15]:
# models
# from sklearn.ensemble import BaggingClassifier,AdaBoostClassifier,ExtraTreesClassifier,RandomForestClassifier,GradientBoostingClassifier
# from sklearn.naive_bayes import MultinomialNB,BernoulliNB
# from sklearn.neural_network import MLPClassifier
# from sklearn.linear_model import Perceptron,SGDClassifier,PassiveAggressiveClassifier

#ensemble
bagging=BaggingClassifier()
ada=AdaBoostClassifier()
extra_trees=ExtraTreesClassifier()
gradient=GradientBoostingClassifier()
random_forest=RandomForestClassifier()

#naive bayes
multinomial=MultinomialNB()
bernoulli=BernoulliNB()

#neural network
mlp=MLPClassifier()

#linear model
perceptron=Perceptron()
sgd=SGDClassifier()
passive_aggressive=PassiveAggressiveClassifier()

In [16]:
clfs={
    'Bagging':bagging,
    'AdaBoost':ada,
    'ExtraTrees':extra_trees,
    'GradientBoosting':gradient,
    'RandomForest':random_forest,
    'MultinomialNB':multinomial,
    'BernoulliNB':bernoulli,
    'MLP':mlp,
    'Perceptron':perceptron,
    'SGD':sgd,
    'PassiveAggressiveClassifier':passive_aggressive
}

In [17]:
import sklearn.metrics

for name,clf in clfs.items():
    clf.fit(xtrain,ytrain)
    print(f'{name} trained')
    ypred=clf.predict(xtest)
    print(f'{name} accuracy: {sklearn.metrics.accuracy_score(ytest,ypred)}')
    print(f'{name} precision: {precision_score(ytest,ypred)}')
    print(f'{name} recall: {recall_score(ytest,ypred)}')
    print(f'{name} f1-score: {f1_score(ytest,ypred)}')
    print(f'{name} confusion matrix: ')
    print(sklearn.metrics.confusion_matrix(ytest,ypred))

Bagging trained
Bagging accuracy: 0.97847533632287
Bagging precision: 0.9699248120300752
Bagging recall: 0.8657718120805369
Bagging f1-score: 0.9148936170212766
Bagging confusion matrix: 
[[962   4]
 [ 20 129]]




AdaBoost trained
AdaBoost accuracy: 0.9811659192825112
AdaBoost precision: 0.9507042253521126
AdaBoost recall: 0.9060402684563759
AdaBoost f1-score: 0.9278350515463918
AdaBoost confusion matrix: 
[[959   7]
 [ 14 135]]
ExtraTrees trained
ExtraTrees accuracy: 0.9820627802690582
ExtraTrees precision: 0.9849624060150376
ExtraTrees recall: 0.8791946308724832
ExtraTrees f1-score: 0.9290780141843972
ExtraTrees confusion matrix: 
[[964   2]
 [ 18 131]]
GradientBoosting trained
GradientBoosting accuracy: 0.9775784753363229
GradientBoosting precision: 0.984375
GradientBoosting recall: 0.8456375838926175
GradientBoosting f1-score: 0.9097472924187726
GradientBoosting confusion matrix: 
[[964   2]
 [ 23 126]]
RandomForest trained
RandomForest accuracy: 0.9811659192825112
RandomForest precision: 1.0
RandomForest recall: 0.8590604026845637
RandomForest f1-score: 0.924187725631769
RandomForest confusion matrix: 
[[966   0]
 [ 21 128]]
MultinomialNB trained
MultinomialNB accuracy: 0.9739910313901345
M

## Tuning the models

In [18]:
features=len(count_vectorizer.get_feature_names_out())

In [19]:
nEstimators=int(features*0.5)
print(nEstimators)

4093


In [20]:
#ensemble
bagging=BaggingClassifier(n_estimators=nEstimators)
ada=AdaBoostClassifier(n_estimators=nEstimators)
extra_trees=ExtraTreesClassifier(n_estimators=nEstimators)
gradient=GradientBoostingClassifier(n_estimators=nEstimators)
random_forest=RandomForestClassifier(n_estimators=nEstimators)

#naive bayes
multinomial=MultinomialNB()
bernoulli=BernoulliNB()

#neural network
mlp=MLPClassifier(learning_rate='adaptive',verbose=True)

#linear model
perceptron=Perceptron()
sgd=SGDClassifier()
passive_aggressive=PassiveAggressiveClassifier()

In [21]:
for name,clf in clfs.items():
    clf.fit(xtrain,ytrain)
    print(f'{name} trained')
    ypred=clf.predict(xtest)
    print(f'{name} accuracy: {sklearn.metrics.accuracy_score(ytest,ypred)}')
    print(f'{name} precision: {precision_score(ytest,ypred)}')
    print(f'{name} recall: {recall_score(ytest,ypred)}')
    print(f'{name} f1-score: {f1_score(ytest,ypred)}')
    print(f'{name} confusion matrix: ')
    print(sklearn.metrics.confusion_matrix(ytest,ypred))

Bagging trained
Bagging accuracy: 0.9757847533632287
Bagging precision: 0.9420289855072463
Bagging recall: 0.87248322147651
Bagging f1-score: 0.9059233449477352
Bagging confusion matrix: 
[[958   8]
 [ 19 130]]




AdaBoost trained
AdaBoost accuracy: 0.9811659192825112
AdaBoost precision: 0.9507042253521126
AdaBoost recall: 0.9060402684563759
AdaBoost f1-score: 0.9278350515463918
AdaBoost confusion matrix: 
[[959   7]
 [ 14 135]]
ExtraTrees trained
ExtraTrees accuracy: 0.9838565022421525
ExtraTrees precision: 1.0
ExtraTrees recall: 0.8791946308724832
ExtraTrees f1-score: 0.9357142857142857
ExtraTrees confusion matrix: 
[[966   0]
 [ 18 131]]
GradientBoosting trained
GradientBoosting accuracy: 0.9766816143497757
GradientBoosting precision: 0.984251968503937
GradientBoosting recall: 0.8389261744966443
GradientBoosting f1-score: 0.9057971014492754
GradientBoosting confusion matrix: 
[[964   2]
 [ 24 125]]
RandomForest trained
RandomForest accuracy: 0.9802690582959641
RandomForest precision: 1.0
RandomForest recall: 0.8523489932885906
RandomForest f1-score: 0.9202898550724637
RandomForest confusion matrix: 
[[966   0]
 [ 22 127]]
MultinomialNB trained
MultinomialNB accuracy: 0.9739910313901345
Multin

## Exporting model
We can see that SGDClassifier is best

In [24]:
saved_model=pickle.dump(mlp,open('model.pkl','wb'))

In [25]:
loaded_model=pickle.load(open('model.pkl','rb'))
loaded_model.predict(x)

NotFittedError: This MLPClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [29]:
# Retrain
mlp = SGDClassifier()
sgd.fit(xtrain, ytrain)

# Save
with open('model.pkl', 'wb') as file:
    pickle.dump(sgd, file)

# Load and predict
loaded_model = pickle.load(open('model.pkl', 'rb'))
    
predictions = loaded_model.predict(x)


In [30]:
pickle.dump(tfidf,open('tfidf.pkl','wb'))