# Cluebot - Modeling - Bayes-based Methods

In [8]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [9]:
import os
from pathlib import Path

import pandas as pd
from feature_engineer import preprocessor

# Directory that holds train_data.csv. Defaults to the original author's local
# checkout so existing runs are unaffected; set the CLUEBOT_DATA_DIR environment
# variable to run the notebook from any other machine or location.
DATA_DIR = Path(os.environ.get('CLUEBOT_DATA_DIR', '/Users/robin/Documents/GitHub/Cluebot'))

train_data = pd.read_csv(DATA_DIR / 'train_data.csv')
# Replace whatever index came from the CSV with a fresh 0..n-1 range.
train_data = train_data.reset_index(drop=True)
# The return value is discarded, so the engineered columns are presumably added
# to train_data in place -- TODO confirm against feature_engineer.preprocessor.
preprocessor.preprocessor(train_data)
# Fixed seed so the preview shows the same five rows on every run
# (display only; nothing downstream depends on this sample).
train_data.sample(5, random_state=498)

Unnamed: 0,EditType,EditID,comment,user,user_edit_count,user_distinct_pages,user_warns,user_reg_time,prev_user,common,...,previous_timestamp,deleted_lines,isvandalism,num_edits_5d_before,is_person,comment_empty,account_age,is_IP,word_count_added,word_count_deleted
21251,change,328101808,,A.h. king,9990,0,3,1199035761,Jack1956,,...,1256026867,,False,0,0,True,697,False,9,1
238,change,231739380,/* Plot synopsis */,151.204.63.206,1,1,0,20080813191400,92.226.201.245,,...,1218453070,"""Lost in the forest, they find a house made of...",True,1,0,False,1,True,251,249
19432,change,233018229,,68.217.67.68,3,3,0,20080820005958,MER-C,,...,1218892282,[[Image:Leavessnipedale.jpg||thumb|right|200px...,True,2,0,True,1,True,13,12
12166,change,329092083,Quick-adding category [[:Category:Women's foot...,Djln,272831,121713,10,1130953933,Djln,,...,1259695938,,False,2,0,False,1490,False,8,1
8921,change,235929138,/* Rastafari */,121.217.155.141,2,1,1,20080903002653,Jwillbur,,...,1219986630,"""The most internationally known aspect of Jama...",True,1,0,False,1,True,8,215


In [11]:
from sklearn.model_selection import StratifiedKFold

# Number of outer cross-validation folds.
n_splits = 5

# Stratified splitter: rows are shuffled before splitting, with a fixed seed so
# the fold assignment is identical on every run.
kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=498)

# Input columns for the Bayes-based classifiers. The vandalism score is left out
# on purpose: for this family of models it contributes almost nothing.
features = [
    'user_edit_count',
    'user_warns',
    'user_distinct_pages',
    'num_recent_edits',
    'num_recent_reversions',
    'current_minor',
    'num_edits_5d_before',
    'is_person',
    'is_IP',
    'account_age',
    'comment_empty',
    'word_count_added',
    'word_count_deleted',
]

In [40]:
# Bayes-based Methods

from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.model_selection import TunedThresholdClassifierCV
from sklearn.metrics import make_scorer

# Record accuracy, precision, recall, f1 score and the tuned decision threshold.
# Each table has one row per outer CV fold and one column per model:
# column 0 = LDA, column 1 = QDA, column 2 = Gaussian naive Bayes.
bayes_accs = np.zeros((n_splits, 3))
bayes_precision = np.zeros((n_splits, 3))
bayes_recall = np.zeros((n_splits, 3))
bayes_f1 = np.zeros((n_splits, 3))
bayes_threshold = np.zeros((n_splits, 3))

# Tune decision threshold to maximize f1 score of the positive class
# (isvandalism == True).
pos_label = True
scorer = make_scorer(f1_score, pos_label=pos_label)


def _evaluate_on_fold(base_estimator, fold, column, fit_df, holdout_df):
    """Tune, fit and score one classifier on one outer fold.

    Wraps ``base_estimator`` in a ``TunedThresholdClassifierCV`` that selects the
    decision threshold with ``scorer``, fits it on ``fit_df``, predicts on
    ``holdout_df`` and writes the hold-out metrics plus the chosen threshold
    into row ``fold`` / column ``column`` of the ``bayes_*`` tables.

    Parameters
    ----------
    base_estimator : unfitted scikit-learn classifier to wrap.
    fold : int, row index in the metric tables (outer fold number).
    column : int, column index in the metric tables (model slot).
    fit_df : DataFrame used for fitting and threshold tuning.
    holdout_df : DataFrame used only for scoring.

    Returns
    -------
    (tuned, predictions) : the fitted threshold-tuned model and its
        predictions on ``holdout_df``.
    """
    tuned = TunedThresholdClassifierCV(base_estimator, scoring=scorer)
    tuned.fit(fit_df[features], fit_df['isvandalism'])
    predictions = tuned.predict(holdout_df[features])

    y_true = holdout_df['isvandalism']
    bayes_accs[fold, column] = accuracy_score(y_true, predictions)
    # pos_label is passed explicitly so the hold-out metrics score the same
    # positive class as the threshold-tuning scorer above.
    bayes_precision[fold, column] = precision_score(y_true, predictions, pos_label=pos_label)
    bayes_recall[fold, column] = recall_score(y_true, predictions, pos_label=pos_label)
    bayes_f1[fold, column] = f1_score(y_true, predictions, pos_label=pos_label)
    bayes_threshold[fold, column] = tuned.best_threshold_
    return tuned, predictions


for i, (train_index, test_index) in enumerate(kfold.split(train_data, train_data['isvandalism'])):
    # edits_tt: rows used for fitting/tuning in this fold; edits_ho: hold-out rows.
    edits_tt = train_data.iloc[train_index]
    edits_ho = train_data.iloc[test_index]

    ## Linear Discriminant Analysis
    base_lda = LinearDiscriminantAnalysis()
    lda, lda_pred = _evaluate_on_fold(base_lda, i, 0, edits_tt, edits_ho)

    ## Quadratic Discriminant Analysis
    base_qda = QuadraticDiscriminantAnalysis(store_covariance = True)
    qda, qda_pred = _evaluate_on_fold(base_qda, i, 1, edits_tt, edits_ho)

    ## Gaussian Naive Bayes
    base_nb = GaussianNB()
    nb, nb_pred = _evaluate_on_fold(base_nb, i, 2, edits_tt, edits_ho)

# Mean over folds; each printed row is ordered [LDA, QDA, GaussianNB].
# Rows, in order: accuracy, precision, recall, f1, tuned threshold.
# NOTE(review): in the recorded run the mean tuned threshold for QDA and
# GaussianNB is ~0.99, which may indicate saturated (over-confident) predicted
# probabilities -- worth checking calibration before relying on these thresholds.
print(np.mean(bayes_accs, axis=0))
print(np.mean(bayes_precision, axis=0))
print(np.mean(bayes_recall, axis=0))
print(np.mean(bayes_f1, axis=0))
print(np.mean(bayes_threshold, axis=0))

[0.82062184 0.81218629 0.78334912]
[0.75321923 0.73620467 0.693987  ]
[0.93877546 0.95692478 0.99201431]
[0.83581235 0.83210469 0.81665814]
[0.73148481 0.98989899 0.98989899]
