In [None]:
import sys
import os
import pandas as pd

# Automatically add the project root (1 level up) to the Python path
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:
    sys.path.append(project_root)

from feature_engineer import preprocessor

In [14]:
from sklearn.model_selection import TunedThresholdClassifierCV, StratifiedKFold
from sklearn.metrics import precision_recall_curve, make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

In [15]:
df = pd.read_csv(project_root+"/Data/train.csv")
preprocessor(df)
df = df.replace(np.nan, '').reset_index(drop=True)

In [16]:
vectorizer = CountVectorizer()
vectorizer.fit(pd.concat([df['added_lines'], df['deleted_lines']]))
df_added = vectorizer.transform(df['added_lines'])
df_deleted = vectorizer.transform(df['deleted_lines'])
df_diff = (df_added - df_deleted).maximum(0)
df_diff.eliminate_zeros()

In [11]:
nb = MultinomialNB()

In [17]:
nb.fit(df_diff, df['isvandalism'])

In [33]:
preds = pd.Series(nb.predict(df_diff))

In [5]:
n_splits = 5
random_state = 412

kfold = StratifiedKFold(n_splits, shuffle=True, random_state=random_state)

In [34]:
# Record accuracy, precision, recall, and f1 score with no threshold tuning
base_acc = accuracy_score(df['isvandalism'], preds)
base_prec = precision_score(df['isvandalism'], preds)
base_rec = recall_score(df['isvandalism'], preds)
base_f1 = f1_score(df['isvandalism'], preds)
print([base_acc, base_prec, base_rec, base_f1])

[0.8830029888312096, 0.9477292202227935, 0.8040387722132472, 0.8699908228816152]


In [37]:
# Record accuracy, precision, recall, and f1 score
tuned_accs = np.zeros((n_splits, ))
tuned_precs = np.zeros((n_splits, ))
tuned_recs = np.zeros((n_splits, ))
tuned_f1s = np.zeros((n_splits, ))

# Tune decision threshold to maximize f1 score
pos_label = True
scorer = make_scorer(f1_score, pos_label=pos_label)

for i, (train_index, test_index) in enumerate(kfold.split(df, df['isvandalism'])):
    edits_tt = df_diff[train_index]
    labels_tt = df['isvandalism'].iloc[train_index]
    edits_ho = df_diff[test_index]
    labels_ho = df['isvandalism'].iloc[train_index]
    
    base_model = nb
    tuned_model = TunedThresholdClassifierCV(base_model, scoring=scorer)
    tuned_model.fit(edits_tt, labels_tt)
    tuned_pred = tuned_model.predict(edits_tt)
    
    tuned_accs[i] = accuracy_score(labels_ho, tuned_pred)
    tuned_precs[i] = precision_score(labels_ho, tuned_pred)
    tuned_recs[i] = recall_score(labels_ho, tuned_pred)
    tuned_f1s[i] = f1_score(labels_ho, tuned_pred)

print(np.mean(tuned_accs, axis=0))
print(np.mean(tuned_precs, axis=0))
print(np.mean(tuned_recs, axis=0))
print(np.mean(tuned_f1s, axis=0))
print(tuned_model.best_score_)
print(tuned_model.best_threshold_)

0.8847825648935042
0.9504737923294243
0.8053513731825526
0.871864992421977
0.7796955259911453
0.494949494949495
