In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
import re

In [None]:
cols = ['label', 'text']
df = pd.read_csv(r'dataset/spam.csv', encoding='latin-1')
df = df[['v1', 'v2']]
df.columns = cols
df['label']= df['label'].map({'ham': 0, 'spam': 1})
df.head()
print(len(df[df['label'] == 0]))
print(len(df[df['label'] == 1]))

4825
747


In [5]:
#Removing punctuation, numbers, and converting to lowercase
def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    return text

df['text'] = df['text'].apply(clean_text)
df.head()

Unnamed: 0,label,text
0,0,go until jurong point crazy available only in ...
1,0,ok lar joking wif u oni
2,1,free entry in a wkly comp to win fa cup final...
3,0,u dun say so early hor u c already then say
4,0,nah i dont think he goes to usf he lives aroun...


In [6]:
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['text'])

In [7]:
from sklearn.model_selection import train_test_split
x = X
y = df['label']
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.25, random_state=42)

metrics = []

In [8]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(max_depth=10, random_state=42, class_weight='balanced')
# dt = DecisionTreeClassifier(max_depth=10, random_state=42)
dt.fit(xtrain, ytrain)
ypred = dt.predict(xtest)
report = classification_report(ytest,ypred, output_dict=True)
metrics.append(['Decision Tree', report['0']['precision'], report['1']['precision'], report['0']['recall'], report['1']['recall'], accuracy_score(ytest, ypred), report['weighted avg']['f1-score']])
print(classification_report(ytest,ypred))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97      1202
           1       0.90      0.69      0.78       191

    accuracy                           0.95      1393
   macro avg       0.92      0.84      0.87      1393
weighted avg       0.94      0.95      0.94      1393



In [9]:
from sklearn.ensemble import RandomForestClassifier
# rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42, class_weight='balanced')
rf.fit(xtrain, ytrain)
ypred = rf.predict(xtest)
report = classification_report(ytest,ypred, output_dict=True)
metrics.append(['Random Forest', report['0']['precision'], report['1']['precision'], report['0']['recall'], report['1']['recall'], accuracy_score(ytest, ypred), report['weighted avg']['f1-score']])
print(classification_report(ytest,ypred))

              precision    recall  f1-score   support

           0       0.96      0.99      0.98      1202
           1       0.95      0.73      0.82       191

    accuracy                           0.96      1393
   macro avg       0.95      0.86      0.90      1393
weighted avg       0.96      0.96      0.95      1393



In [10]:
from sklearn.svm import SVC
# svm = SVC()
svm = SVC(class_weight='balanced')
svm = svm.fit(xtrain, ytrain)
ypred = svm.predict(xtest)
report = classification_report(ytest,ypred, output_dict=True)
metrics.append(['Support Vector Machine', report['0']['precision'], report['1']['precision'], report['0']['recall'], report['1']['recall'], accuracy_score(ytest, ypred), report['weighted avg']['f1-score']])
print(classification_report(ytest,ypred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1202
           1       0.99      0.79      0.87       191

    accuracy                           0.97      1393
   macro avg       0.98      0.89      0.93      1393
weighted avg       0.97      0.97      0.97      1393



In [11]:
from sklearn.ensemble import GradientBoostingClassifier
gbr = GradientBoostingClassifier(n_estimators=100, max_depth=10, random_state=42)
gbr.fit(xtrain, ytrain)
ypred = gbr.predict(xtest)
report = classification_report(ytest,ypred, output_dict=True)
metrics.append(['Gradient Boosting', report['0']['precision'], report['1']['precision'], report['0']['recall'], report['1']['recall'], accuracy_score(ytest, ypred), report['weighted avg']['f1-score']])
print(classification_report(ytest,ypred))

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1202
           1       0.93      0.78      0.85       191

    accuracy                           0.96      1393
   macro avg       0.95      0.89      0.91      1393
weighted avg       0.96      0.96      0.96      1393



In [12]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(xtrain, ytrain)
ypred = knn.predict(xtest)
report = classification_report(ytest,ypred, output_dict=True)
metrics.append(['K-Nearest Neighbors', report['0']['precision'], report['1']['precision'], report['0']['recall'], report['1']['recall'], accuracy_score(ytest, ypred), report['weighted avg']['f1-score']])
print(classification_report(ytest,ypred))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97      1202
           1       0.99      0.62      0.77       191

    accuracy                           0.95      1393
   macro avg       0.97      0.81      0.87      1393
weighted avg       0.95      0.95      0.94      1393



In [13]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb = nb.fit(xtrain, ytrain)
ypred = nb.predict(xtest)
report = classification_report(ytest,ypred, output_dict=True)
metrics.append(['Naive Bayes', report['0']['precision'], report['1']['precision'], report['0']['recall'], report['1']['recall'], accuracy_score(ytest, ypred), report['weighted avg']['f1-score']])
print(classification_report(ytest,ypred))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1202
           1       1.00      0.72      0.84       191

    accuracy                           0.96      1393
   macro avg       0.98      0.86      0.91      1393
weighted avg       0.96      0.96      0.96      1393



In [14]:
from sklearn.linear_model import LogisticRegression
# lgr = LogisticRegression()
lgr = LogisticRegression(class_weight='balanced')
lgr= lgr.fit(xtrain, ytrain)
ypred = lgr.predict(xtest)
report = classification_report(ytest,ypred, output_dict=True)
metrics.append(['Logistic Regression', report['0']['precision'], report['1']['precision'], report['0']['recall'], report['1']['recall'], accuracy_score(ytest, ypred), report['weighted avg']['f1-score']])
print(classification_report(ytest,ypred))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98      1202
           1       0.94      0.85      0.89       191

    accuracy                           0.97      1393
   macro avg       0.96      0.92      0.94      1393
weighted avg       0.97      0.97      0.97      1393



In [18]:
metrics_df  = pd.DataFrame(metrics, columns= ['Model', 'Precision(0)', 'Precision(1)', 'Recall(0)', 'Recall(1)', 'Accuracy', 'F1 Score'], index = [1,2,3,4,5,6,7]).drop_duplicates()
for col in metrics_df.columns[1:]:
    metrics_df[col] = metrics_df[col].apply(lambda x: round(x, 3))

metrics_df.sort_values(by='F1 Score', ascending=False, inplace=True)
metrics_df.reset_index(drop=True, inplace=True)
metrics_df.index += 1
metrics_df

Unnamed: 0,Model,Precision(0),Precision(1),Recall(0),Recall(1),Accuracy,F1 Score
1,Logistic Regression,0.977,0.937,0.991,0.853,0.972,0.971
2,Support Vector Machine,0.967,0.987,0.998,0.785,0.969,0.968
3,Gradient Boosting,0.966,0.931,0.991,0.78,0.962,0.961
4,Naive Bayes,0.957,1.0,1.0,0.717,0.961,0.958
5,Random Forest,0.958,0.946,0.993,0.728,0.957,0.955
6,Decision Tree,0.952,0.897,0.988,0.686,0.946,0.943
7,K-Nearest Neighbors,0.943,0.992,0.999,0.623,0.948,0.942
