In [1]:
import pandas as pd
import numpy as np

# Helper function to split our data
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# for model evaluation 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, r2_score, precision_score, recall_score, f1_score, roc_auc_score
import matplotlib.pyplot as plt
from sklearn import preprocessing

# for cross validation
from sklearn.model_selection import KFold

# for MSE

from sklearn.metrics import mean_squared_error

# matrix inverse (NOTE(review): this import was labelled "log loss" — confirm where `inv` is actually used)
from numpy.linalg import inv

In [2]:
# Location of the preprocessed English e-mail dataset (a CSV hosted on GitHub).
url = "https://raw.githubusercontent.com/AliceLiu17/csc448_final/main/data/preprocessed_dataset/preprocessed_english.csv"

# Read the remote CSV straight into a DataFrame.
df = pd.read_csv(filepath_or_buffer=url)

In [3]:
# Preview the first rows to sanity-check the loaded columns
# (an index column, `label`, `email` and `processed_email`).
df.head()

Unnamed: 0,label,email,processed_email
0,0,"Go until jurong point, crazy.. Available only ...","['go', 'jurong', 'point', 'crazy', '..', 'avai..."
1,1,Free entry in 2 a wkly comp to win FA Cup fina...,"['free', 'entry', '2', 'wkly', 'comp', 'win', ..."
2,0,U dun say so early hor... U c already then say...,"['u', 'dun', 'say', 'early', 'hor', '...', 'u'..."
3,0,"Nah I don't think he goes to usf, he lives aro...","['nah', ""n't"", 'think', 'goes', 'usf', 'lives'..."
4,1,FreeMsg Hey there darling it's been 3 week's n...,"['freemsg', 'hey', 'darling', ""'s"", '3', 'week..."


In [4]:
# Vectorise the processed e-mails, hold out 20% of the rows for testing,
# and register the candidate classifiers under short display names.
cv = CountVectorizer()
tfid = TfidfVectorizer(max_features=3000)

# NOTE(review): the vectorizer is fitted on every row *before* the split, so
# its vocabulary and IDF weights are learned from rows that later land in the
# test set. Consider fitting on the training rows only.
y = df['label'].values
X = tfid.fit_transform(df['processed_email'])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=45,
)

# Both ensembles use 50 estimators and a fixed seed for repeatable results.
rfc = RandomForestClassifier(n_estimators=50, random_state=2)
gbdt = GradientBoostingClassifier(n_estimators=50, random_state=2)

# Display name -> estimator; insertion order (RF, then GBDT) is the order
# in which the models are iterated.
models = dict(
    RF=rfc,      # random forest
    GBDT=gbdt,   # gradient boosting
)

# train the model and compute model evaluation
def train_classifier(clfs, X_train, y_train, X_test, y_test):
    """Fit one classifier and score it on the held-out split.

    Parameters
    ----------
    clfs : estimator
        Object exposing ``fit`` and ``predict`` and, optionally,
        ``decision_function`` or ``predict_proba``. It is fitted in place.
    X_train, y_train
        Training features and labels passed to ``clfs.fit``.
    X_test, y_test
        Held-out features and labels used for every metric.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1, auc_roc)``. ``auc_roc`` is
        ``None`` when the estimator offers neither ``decision_function``
        nor ``predict_proba``.
    """
    # Use the estimator that was passed in. The body used to call the global
    # name `models` instead, which only worked while a caller happened to
    # have rebound that global to a single estimator.
    clfs.fit(X_train, y_train)
    y_pred = clfs.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # AUC-ROC needs a continuous score rather than hard predictions:
    # prefer decision_function, fall back to predict_proba.
    if hasattr(clfs, 'decision_function'):
        y_scores = clfs.decision_function(X_test)
        auc_roc = roc_auc_score(y_test, y_scores)
    elif hasattr(clfs, 'predict_proba'):
        # Column 1 holds the probability of the second class in the
        # estimator's class order.
        y_probs = clfs.predict_proba(X_test)[:, 1]
        auc_roc = roc_auc_score(y_test, y_probs)
    else:
        auc_roc = None  # AUC-ROC not available for this model

    return accuracy, precision, recall, f1, auc_roc

In [5]:
# Evaluate every registered model, print its metrics, and collect each metric
# in a parallel list (entry i of every list belongs to the i-th model).
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []
auc_roc_scores = []

# Keep a handle on the registry dict. The loop variable below reuses the name
# `models`, as this cell always has, so each estimator is bound to that global
# name while it is being evaluated. Without the handle, the dict would be lost
# after the first run and the cell could not be executed again.
model_registry = models

try:
    for name, models in model_registry.items():
        (current_accuracy, current_precision, current_recall,
         current_f1, current_aucroc) = train_classifier(models, X_train, y_train, X_test, y_test)
        print()
        print("For: ", name)
        print("Accuracy: ", current_accuracy)
        print("Precision: ", current_precision)
        print("Recall: ", current_recall)
        print("F1: ", current_f1)
        print("AUC-ROC: ", current_aucroc)

        accuracy_scores.append(current_accuracy)
        precision_scores.append(current_precision)
        # These three lists were initialised above but never filled in.
        recall_scores.append(current_recall)
        f1_scores.append(current_f1)
        auc_roc_scores.append(current_aucroc)
finally:
    # Re-bind `models` to the registry dict (even if a model failed) so this
    # cell can be re-run and later cells still see the dict.
    models = model_registry


For:  RF
Accuracy:  0.9774086378737542
Precision:  0.9649122807017544
Recall:  0.8549222797927462
F1:  0.9065934065934066
AUC-ROC:  0.9829078731201821

For:  GBDT
Accuracy:  0.9554817275747508
Precision:  0.9565217391304348
Recall:  0.6839378238341969
F1:  0.797583081570997
AUC-ROC:  0.9631243681283962
