# Cluebot - Modeling - kNN with PCA Features

In [1]:
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
import pandas as pd
from feature_engineer import preprocessor

train_data = pd.read_csv('/Users/robin/Documents/GitHub/Cluebot/train_data.csv')
train_data = train_data.reset_index(drop=True)
preprocessor.preprocessor(train_data)
train_data.sample(5)

Unnamed: 0,EditType,EditID,comment,user,user_edit_count,user_distinct_pages,user_warns,user_reg_time,prev_user,common,...,previous_timestamp,deleted_lines,isvandalism,num_edits_5d_before,is_person,comment_empty,account_age,is_IP,word_count_added,word_count_deleted
1663,change,326863825,/* Early life */,89.195.193.140,5,3,0,20091120033351,Tide rolls,,...,1258598031,"""McAdams was born in [[London, Ontario]], [[Ca...",False,2,1,False,1,True,304,301
9737,change,327062989,/* History */,122.169.141.28,6,2,0,20091121052513,203.99.212.224,,...,1258625429,,True,1,0,False,1,True,40,1
12290,change,232992033,,69.65.227.235,10,8,3,20080819220435,220.255.179.136,,...,1217162152,"""'''Cool''', [[California]], [[United States|U...",True,0,0,True,1,True,57,103
11951,change,327619423,/* Healers */,NightBear,167,94,0,1201197383,NightBear,,...,1259043958,"""* [[Druid (character class)|Druid]]: A priest...",False,3,0,False,669,False,96,94
6189,change,327026174,,81.149.113.132,92,52,0,20091121003126,SGGH,,...,1254153256,,False,0,1,True,1,True,21,1


In [3]:
# PCA + kNN

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
from feature_engineer import vandalism_scorer as vs
from sklearn.model_selection import TunedThresholdClassifierCV
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score

features = ['user_edit_count', 'user_warns', 'user_distinct_pages', 'num_recent_edits', 'num_recent_reversions', 'current_minor', \
            'num_edits_5d_before', 'is_person', \
            'is_IP', 'account_age', 'comment_empty', 'word_count_added', 'word_count_deleted', \
            'added_lines', 'deleted_lines', 'EditID'] # 

neighbors = range(1, 51)
comps = range(2, 6)

n_splits_cv = 5

kfold = StratifiedKFold(n_splits_cv,
                           shuffle=True,
                           random_state=498)

pca_accs = np.zeros((n_splits_cv, len(comps), len(neighbors)))
pca_precision = np.zeros((n_splits_cv, len(comps), len(neighbors)))
pca_recall = np.zeros((n_splits_cv, len(comps), len(neighbors)))
pca_f1 = np.zeros((n_splits_cv, len(comps), len(neighbors)))

# Tune decision threshold to maximize f1 score
# pos_label = True
# scorer = make_scorer(f1_score, pos_label=pos_label)

for i,(train_index, test_index) in enumerate(kfold.split(train_data, train_data['isvandalism'])):
    print("CV Split", i)
    edits_tt = train_data.iloc[train_index]
    edits_ho = train_data.iloc[test_index]
    
    for j, n_comps in enumerate(comps):
        pca_pipe = Pipeline([('scorer', vs.VandalismScorer(n_splits = 5)), ('scale', StandardScaler()), ('pca', PCA(n_components=n_comps))])
        pca_pipe.fit(edits_tt[features], edits_tt['isvandalism'])

        pca_tt = np.hstack((pca_pipe.transform(edits_tt[features])[:,0].reshape(-1, 1), \
                          pca_pipe.transform(edits_tt[features])[:,1].reshape(-1, 1), \
                            np.array(edits_tt['isvandalism']).reshape(-1, 1)))
        pca_ho = np.hstack((pca_pipe.transform(edits_ho[features])[:,0].reshape(-1, 1), \
                          pca_pipe.transform(edits_ho[features])[:,1].reshape(-1, 1), \
                            np.array(edits_ho['isvandalism']).reshape(-1, 1)))
        
        for k, n_neighbors in enumerate(neighbors):
            # base_knn = KNeighborsClassifier(n_neighbors)
            knn = KNeighborsClassifier(n_neighbors) # TunedThresholdClassifierCV(base_knn, scoring=scorer)
            knn.fit(pca_tt[:, :2], pca_tt[:, 2])
            pred = knn.predict(pca_ho[:, :2])

            pca_accs[i,j,k] = accuracy_score(pca_ho[:, 2], pred)
            pca_precision[i,j,k] = precision_score(edits_ho['isvandalism'], pred)
            pca_recall[i,j,k] = recall_score(edits_ho['isvandalism'], pred)
            pca_f1[i,j,k] = f1_score(edits_ho['isvandalism'], pred)

CV Split 0
CV Split 1
CV Split 2
CV Split 3
CV Split 4


In [6]:
max_index = np.unravel_index(np.argmax(np.mean(pca_accs, axis=0), axis=None), 
                                       np.mean(pca_accs, axis=0).shape)


print(f"The pair with the highest AVG CV Accuracy was k = {neighbors[max_index[1]]} and number of components = {comps[max_index[0]]:.1f}")
print(f"The highest AVG CV Accuracy was {np.max(np.mean(pca_accs, axis=0)):.3f}")
print(f"The highest AVG CV Precision was {np.max(np.mean(pca_precision, axis=0)):.3f}")
print(f"The highest AVG CV Recall was {np.max(np.mean(pca_recall, axis=0)):.3f}")
print(f"The highest AVG CV F1-score was {np.max(np.mean(pca_f1, axis=0)):.3f}")

The pair with the highest AVG CV Accuracy was k = 37 and number of components = 2.0
The highest AVG CV Accuracy was 0.862
The highest AVG CV Precision was 0.889
The highest AVG CV Recall was 0.859
The highest AVG CV F1-score was 0.858


In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

n_splits = 5

kfold = StratifiedKFold(n_splits,
                           shuffle=True,
                           random_state=498)

from feature_engineer import vandalism_scorer as vs

# scorer = vs.VandalismScorer(n_splits=4, random_state=42)
# scorer.fit(train_data, train_data['isvandalism'])

bayes_accs = np.zeros((n_splits, 3))

for i, (train_index, test_index) in enumerate(kfold.split(train_data, train_data['isvandalism'])):
    edits_tt = train_data.iloc[train_index]
    edits_ho = train_data.iloc[test_index]

    model_pipe = Pipeline([('scorer', vs.VandalismScorer(n_splits = 5)), ('scaler', StandardScaler()), ('nb', LogisticRegression())])
    
    ## Gaussian Naive Bayes
    model_pipe.fit(edits_tt[features], edits_tt['isvandalism'])
    
    nb_pred = model_pipe.predict(edits_ho[features])
    
    bayes_accs[i, 2] = accuracy_score(edits_ho['isvandalism'], nb_pred)

np.mean(bayes_accs, axis=0)