In [4]:
# Pandas and numpy for data wrangling
import pandas as pd
import numpy as np

# Seaborn / matplotlib for visualization 
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt
%matplotlib inline

# Import the trees from sklearn
from sklearn import tree

# Helper function to split our data
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [5]:
# Raw-GitHub location of the preprocessed English dataset (CSV).
data = "https://raw.githubusercontent.com/AliceLiu17/csc448_final/main/data/preprocessed_dataset/preprocessed_english.csv"

# Download the CSV into a DataFrame, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer=data)
df.head(n=5)

Unnamed: 0,label,email,processed_email
0,0,"Go until jurong point, crazy.. Available only ...","['go', 'jurong', 'point', 'crazy', '..', 'avai..."
1,1,Free entry in 2 a wkly comp to win FA Cup fina...,"['free', 'entry', '2', 'wkly', 'comp', 'win', ..."
2,0,U dun say so early hor... U c already then say...,"['u', 'dun', 'say', 'early', 'hor', '...', 'u'..."
3,0,"Nah I don't think he goes to usf, he lives aro...","['nah', ""n't"", 'think', 'goes', 'usf', 'lives'..."
4,1,FreeMsg Hey there darling it's been 3 week's n...,"['freemsg', 'hey', 'darling', ""'s"", '3', 'week..."


In [6]:
# Plain token-count vectorizer, left at its default settings.
cv = CountVectorizer()

# TF-IDF vectorizer; `max_features` caps the vocabulary at 3000 terms.
tfid = TfidfVectorizer(max_features=3000)

In [19]:
# Target vector: the `label` column as a NumPy array.
y = df['label'].values

# Feature matrix: TF-IDF weights learned from the `processed_email` column.
# NOTE(review): the vectorizer is fitted on every row here, i.e. before the
# train/test split, so its vocabulary and IDF weights also see the rows that
# later end up in the test set. Consider fitting on the training rows only.
X = tfid.fit_transform(raw_documents=df['processed_email'])

In [25]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the
# partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=45,
)

In [26]:
# Candidate classifiers to compare on the TF-IDF features.
# Every estimator that accepts a seed and draws random numbers during fitting
# gets random_state=2 so that re-running the notebook yields the same models.
svc = SVC(kernel='linear')
mnb = MultinomialNB()
# Decision trees permute the features at each split, so ties between equally
# good splits are broken randomly unless the seed is fixed.
dtc = DecisionTreeClassifier(max_depth=5, random_state=2)
# The liblinear solver shuffles the data, hence the explicit seed.
lrc = LogisticRegression(solver='liblinear', penalty='l1', random_state=2)
rfc = RandomForestClassifier(n_estimators=50, random_state=2)
gbdt = GradientBoostingClassifier(n_estimators=50, random_state=2)

In [27]:
# Short display name -> estimator instance, in the order they are evaluated.
models = dict([
    ('SVC', svc),
    ('NB', mnb),
    ('DT', dtc),
    ('LR', lrc),
    ('RF', rfc),
    ('GBDT', gbdt),
])

In [28]:
def train_classifier(clfs, X_train, y_train, X_test, y_test):
    models.fit(X_train,y_train)
    y_pred = models.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    return accuracy , precision

In [29]:
accuracy_scores = []
precision_scores = []
for name, models in models.items():
    current_accuracy, current_precision = train_classifier(models, X_train, y_train, X_test, y_test)
    print()
    print("For: ", name)
    print("Accuracy: ", current_accuracy)
    print("Precision: ", current_precision)
    
    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)


For:  SVC
Accuracy:  0.9774086378737542
Precision:  0.9595375722543352

For:  NB
Accuracy:  0.9727574750830564
Precision:  0.9810126582278481

For:  DT
Accuracy:  0.9348837209302325
Precision:  0.8518518518518519

For:  LR
Accuracy:  0.9674418604651163
Precision:  0.9556962025316456

For:  RF
Accuracy:  0.9774086378737542
Precision:  0.9649122807017544

For:  GBDT
Accuracy:  0.9554817275747508
Precision:  0.9565217391304348
