# Cluebot - Modeling - Bayes-based Methods

In [2]:
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix

In [5]:
import pandas as pd
import sys, os
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:
    sys.path.append(project_root)

from feature_engineer import preprocessor


train_data = pd.read_csv('../Data/train.csv')
train_data = train_data.reset_index(drop=True)
preprocessor(train_data)

test_data = pd.read_csv('../Data/test.csv')

train_data.sample(5)

Unnamed: 0,EditType,EditID,comment,user,user_edit_count,user_distinct_pages,user_warns,user_reg_time,prev_user,common,...,previous_timestamp,deleted_lines,isvandalism,num_edits_5d_before,is_person,comment_empty,account_age,is_IP,word_count_added,word_count_deleted
7497,change,327600301,Revert to revision 325173076 dated 2009-11-11 ...,Debivort,11002,3256,0,1116059808,142.104.9.37,,...,1259024011,"""Sandar are most common in [[Iceland]], where ...",False,1,0,False,1654,False,48,48
22875,change,253005674,Added Rio Ashcfroft to the list because thay i...,79.79.1.85,1,1,0,20081120163623,Gibson Flying V,,...,1225359235,"""""",True,0,0,False,1,True,5,0
1299,change,231355099,,96.234.150.172,1,1,0,20080812013501,Kirby13,,...,1218501868,| Birth_name = Paul Kevin Jonas II,True,24,1,True,1,True,5,5
18574,change,327229797,[[WP:UNDO|Undid]] revision 327150558 by [[Spec...,Goodraise,15957,4079,0,1182244481,89.134.154.238,,...,1258830793,"""A number of musical CDs have been created. Va...",False,14,0,False,886,False,62,64
16853,change,329049847,Delink dates ([[WP:MOSUNLINKDATES]]) using [[P...,Rich Farmbrough,1712760,0,14,1083530056,Numbo3-bot,,...,1254606641,"{{Infobox Planet ,""| discovered = [[September ...",False,0,0,False,2038,False,41,41


In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB

n_splits = 5

kfold = StratifiedKFold(n_splits,
                           shuffle=True,
                           random_state=498)

features = ['user_edit_count', 'user_warns', 'user_distinct_pages', 'num_recent_edits', 'num_recent_reversions', 'current_minor', \
            'num_edits_5d_before', 'is_person', \
            'is_IP', 'account_age', 'comment_empty', 'word_count_added', 'word_count_deleted', \
            'added_lines', 'deleted_lines', 'EditID']

preprocessor(train_data)
# train_data = train_data.reset_index(drop=True)
# train_data.sample(5)

from feature_engineer import vandalism_scorer as vs

# scorer = vs.VandalismScorer(n_splits=4, random_state=42)
# scorer.fit(train_data, train_data['isvandalism'])

bayes_accs = np.zeros((n_splits, 3))

for i, (train_index, test_index) in enumerate(kfold.split(train_data, train_data['isvandalism'])):
    edits_tt = train_data.iloc[train_index]
    edits_ho = train_data.iloc[test_index]

    model_pipe = Pipeline([('scorer', vs.VandalismScorer(n_splits = n_splits-1, random_state=498)), ('nb', GaussianNB())])

    print(edits_tt.columns)
    
    ## Gaussian Naive Bayes
    model_pipe.fit(edits_tt[features], edits_tt['isvandalism'])
    
    nb_pred = model_pipe.predict(edits_ho[features])
    
    bayes_accs[i, 2] = accuracy_score(edits_ho['isvandalism'], nb_pred)

np.mean(bayes_accs, axis=0)

Index(['EditType', 'EditID', 'comment', 'user', 'user_edit_count',
       'user_distinct_pages', 'user_warns', 'user_reg_time', 'prev_user',
       'common', 'current', 'previous', 'page_made_time', 'title', 'namespace',
       'creator', 'num_recent_edits', 'num_recent_reversions', 'current_minor',
       'current_timestamp', 'added_lines', 'previous_timestamp',
       'deleted_lines', 'isvandalism', 'num_edits_5d_before', 'is_person',
       'comment_empty', 'account_age', 'is_IP', 'word_count_added',
       'word_count_deleted'],
      dtype='object')
Index(['EditType', 'EditID', 'comment', 'user', 'user_edit_count',
       'user_distinct_pages', 'user_warns', 'user_reg_time', 'prev_user',
       'common', 'current', 'previous', 'page_made_time', 'title', 'namespace',
       'creator', 'num_recent_edits', 'num_recent_reversions', 'current_minor',
       'current_timestamp', 'added_lines', 'previous_timestamp',
       'deleted_lines', 'isvandalism', 'num_edits_5d_before', 'is_person'

array([0.        , 0.        , 0.77658478])

In [None]:
# Bayes-based Methods

from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

features = ['user_edit_count', 'user_warns', 'user_distinct_pages', 'num_recent_edits', 'num_recent_reversions', 'current_minor', \
            'num_edits_5d_before', 'is_person', \
            'is_IP', 'account_age', 'comment_empty', 'word_count_added', 'word_count_deleted']

bayes_accs = np.zeros((n_splits, 3))
print(train_data.columns)

for i, (train_index, test_index) in enumerate(kfold.split(train_data, train_data['isvandalism'])):
    edits_tt = train_data.iloc[train_index]
    edits_ho = train_data.iloc[test_index]

    ## Linear Discriminant Analysis
    lda = LinearDiscriminantAnalysis()
    
    lda.fit(edits_tt[features], edits_tt['isvandalism'])
    lda_pred = lda.predict(edits_ho[features])
    
    bayes_accs[i, 0] = accuracy_score(edits_ho['isvandalism'], lda_pred)
    
    ## Quadratic Discriminant Analysis
    qda = QuadraticDiscriminantAnalysis(store_covariance = True)
    
    qda.fit(edits_tt[features], edits_tt['isvandalism'])
    
    qda_pred = qda.predict(edits_ho[features])
    
    bayes_accs[i, 1] = accuracy_score(edits_ho['isvandalism'], qda_pred)
    
    
    ## Gaussian Naive Bayes
    nb = GaussianNB()
    
    
    nb.fit(edits_tt[features], edits_tt['isvandalism'])
    
    nb_pred = nb.predict(edits_ho[features])
    
    bayes_accs[i, 2] = accuracy_score(edits_ho['isvandalism'], nb_pred)

np.mean(bayes_accs, axis=0)

Index(['EditType', 'EditID', 'comment', 'user', 'user_edit_count',
       'user_distinct_pages', 'user_warns', 'user_reg_time', 'prev_user',
       'common', 'current', 'previous', 'page_made_time', 'title', 'namespace',
       'creator', 'num_recent_edits', 'num_recent_reversions', 'current_minor',
       'current_timestamp', 'added_lines', 'previous_timestamp',
       'deleted_lines', 'isvandalism', 'num_edits_5d_before', 'is_person',
       'comment_empty', 'account_age', 'is_IP', 'word_count_added',
       'word_count_deleted'],
      dtype='object')


array([0.81673741, 0.79510761, 0.77658478])

In [ ]:
model_pipe['nb'].theta_

array([[1.30527820e+05, 4.02490660e+00, 3.82973529e+04, 6.25538845e-02,
        1.24533001e-03, 2.65255293e-01, 8.56672095e+00, 2.33259891e-01,
        3.02136220e-01, 6.77528307e+02, 2.15729476e-01, 1.10793946e+02,
        1.01950762e+02, 2.59322172e-01],
       [1.16486167e+02, 3.57936187e+00, 4.77551494e+01, 9.50121163e-02,
        5.24030695e-02, 6.36106624e-03, 1.06901252e+01, 2.29907108e-01,
        9.44163974e-01, 6.87520194e+00, 3.70759289e-01, 8.49193255e+01,
        3.79863691e+02, 6.93946014e-01]])

In [20]:
model_pipe['nb'].var_

array([[1.82599304e+11, 2.16226333e+02, 2.92661190e+10, 9.80187100e+01,
        9.79538409e+01, 9.81474920e+01, 6.88950133e+02, 9.81314468e+01,
        9.81634470e+01, 5.08075044e+05, 9.81217874e+01, 1.42393372e+05,
        4.60397538e+05, 9.80146294e+01],
       [6.20286876e+06, 1.43559990e+02, 8.13662686e+05, 9.81270310e+01,
        9.80075045e+01, 9.79589177e+01, 9.41522063e+02, 9.81296469e+01,
        9.80053155e+01, 5.25088916e+03, 9.81858939e+01, 1.90512252e+05,
        1.83880162e+06, 9.80634178e+01]])

In [17]:
import random
import numpy as np
import re
from collections import defaultdict

# Do a cross validation split manually
total_indices = list(range(len(train_data))) # 
random.shuffle(total_indices)
fold = [[], [], [], [], []]
[fold[0], fold[1], fold[2], fold[3], fold[4]] = np.array_split(total_indices, 5)
for i in range(5):
    fold[i] = list(fold[i])

# Get the difference between two strings and return a set of words
def get_difference(s1, s2):
    w1 = set(re.sub(r"[^\w\s]", " ", str(s1)).lower().split())
    w2 = set(re.sub(r"[^\w\s]", " ", str(s2)).lower().split())
    return w1 - w2

# Get a list of word probabilities using edits with given indices
def get_word_prob(indices):
    # Count the number of appearances of each word in vandalism/constructive edits
    vandalism_words_count = defaultdict(int)
    constructive_words_count = defaultdict(int)
    
    for i in indices:
        for word in get_difference(str(train_data.loc[i, 'added_lines']), str(train_data.loc[i, 'deleted_lines'])):
            if train_data['isvandalism'][i] == True:
                vandalism_words_count[word] += 1
            else:
                constructive_words_count[word] += 1

    # Combine all unique words
    all_words = set(vandalism_words_count) | set(constructive_words_count)

    # Compute smoothed probabilities
    word_probs = {
        word: (vandalism_words_count[word] + 1) / 
              (vandalism_words_count[word] + constructive_words_count[word] + 2)
        for word in all_words
    }
    return word_probs

# Compute vandalism scores for edits with given indices using a given list of word probabilities 
def compute_vandalism_scores(word_probs, indices):
    for i in indices:
        words = get_difference(train_data['added_lines'][i], train_data['deleted_lines'][i])
        probs_of_words = [word_probs.get(w, 0.5) for w in words]

        product_p = 1
        product_1_minus_p = 1
        for p in probs_of_words:
            product_p *= p
            product_1_minus_p *= (1-p)
        
        score = product_p / (product_p + product_1_minus_p) if (product_p + product_1_minus_p) != 0 else 1
        train_data.loc[i, 'vandalism_score'] = score

In [None]:
# Modified Stacking Classifier Method

accuracy_scores = []

for i in range(5):
    i_0 = i % 5
    i_1 = (i + 1) % 5
    i_2 = (i + 2) % 5
    i_3 = (i + 3) % 5
    i_4 = (i + 4) % 5

    word_probs = get_word_prob(fold[i_1] + fold[i_2] + fold[i_3])
    compute_vandalism_scores(word_probs, fold[i_0])

    word_probs = get_word_prob(fold[i_0] + fold[i_2] + fold[i_3])
    compute_vandalism_scores(word_probs, fold[i_1])

    word_probs = get_word_prob(fold[i_0] + fold[i_1] + fold[i_3])
    compute_vandalism_scores(word_probs, fold[i_2])

    word_probs = get_word_prob(fold[i_0] + fold[i_1] + fold[i_2])
    compute_vandalism_scores(word_probs, fold[i_3])

    word_probs = get_word_prob(fold[i_0] + fold[i_1] + fold[i_2] + fold[i_3])
    compute_vandalism_scores(word_probs, fold[i_4])

    nb = GaussianNB()
    nb.fit(train_data.loc[fold[i_0] + fold[i_1] + fold[i_2] + fold[i_3], features], train_data.loc[fold[i_0] + fold[i_1] + fold[i_2] + fold[i_3], 'isvandalism'])
    pred = nb.predict(train_data.loc[fold[i_4], features])

    accuracy_scores.append(np.sum(pred == train_data.loc[fold[i_4], 'isvandalism'])/len(train_data.loc[fold[i_4], 'isvandalism']))
    print(confusion_matrix(train_data.loc[fold[i_4], 'isvandalism'], pred))

print(np.average(accuracy_scores))

[[1497 1117]
 [  15 2468]]
[[1550 1164]
 [  22 2362]]
[[1473 1127]
 [  18 2480]]
[[1532 1091]
 [  13 2462]]
[[1417 1123]
 [  16 2541]]
0.7761300291796965
