# Cluebot - Modeling - XGBoost

In [1]:
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
import pandas as pd

train_data = pd.read_csv('/Users/robin/Documents/GitHub/Cluebot/train_data.csv')
test_data = pd.read_csv('/Users/robin/Documents/GitHub/Cluebot/test_data.csv')

train_data.sample(5)

Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,EditType,EditID,comment,user,user_edit_count,user_distinct_pages,...,added_lines,previous_timestamp,deleted_lines,isvandalism,vandalism_score,account_age,is_IP,pca_coeff1,user_credit,user_credit_2
11473,11473,11473,11473,11473,change,328090153,/* Godfather Buried Alive */,Unak78,5000,901,...,"""Shyne's second album was released while servi...",1259202290,"""Shyne's second album was released while servi...",False,0.5,1209,False,1.090472,1.090472,-1.565284
6442,6442,6442,6442,6442,change,328530736,/* Conference Championships */,66.190.68.216,1,1,...,| [[2009 in sports|2009]]|| Mountain West Conf...,1259445144,| [[2009 in sports|2009]]|| Mountain West Conf...,False,0.047878,1,True,-0.972478,-0.972478,0.377077
8910,8910,8910,8910,8910,change,234700067,,82.23.58.140,4,3,...,"''Jodan Sparks,Shes Great !* x''",1219889365,,True,0.971404,1,True,-0.972468,-0.972468,0.377089
1901,1901,1901,1901,1901,change,262180418,/* Sources */,208.57.75.81,1,1,...,[[tr:İxtab]]568ru8tryjurtyuityutyeuteutrutyrut...,1228883635,[[tr:İxtab]],True,0.980569,1,True,-0.972478,-0.972478,0.377077
4948,4948,4948,4948,4948,change,327269575,/* Chengdu Military Region */ test,Buckshot06,117882,32171,...,| Chongqing Air Base || {{coord|29|29|43|...,1258836633,| Chongqing Air Base || {{coord|29|29|43|...,False,0.333333,1205,False,1.597714,1.597714,-1.039822


In [3]:
import random
import numpy as np
import re
from collections import defaultdict

# Do a cross validation split manually
total_indices = list(range(len(train_data))) # 
random.shuffle(total_indices)
fold = [[], [], [], [], []]
[fold[0], fold[1], fold[2], fold[3], fold[4]] = np.array_split(total_indices, 5)
for i in range(5):
    fold[i] = list(fold[i])

# Get the difference between two strings and return a set of words
def get_difference(s1, s2):
    w1 = set(re.sub(r"[^\w\s]", " ", str(s1)).lower().split())
    w2 = set(re.sub(r"[^\w\s]", " ", str(s2)).lower().split())
    return w1 - w2

# Get a list of word probabilities using edits with given indices
def get_word_prob(indices):
    # Count the number of appearances of each word in vandalism/constructive edits
    vandalism_words_count = defaultdict(int)
    constructive_words_count = defaultdict(int)
    
    for i in indices:
        for word in get_difference(str(train_data['added_lines'][i]), str(train_data['deleted_lines'][i])):
            if train_data['isvandalism'][i] == True:
                vandalism_words_count[word] += 1
            else:
                constructive_words_count[word] += 1

    # Combine all unique words
    all_words = set(vandalism_words_count) | set(constructive_words_count)

    # Compute smoothed probabilities
    word_probs = {
        word: (vandalism_words_count[word] + 1) / 
              (vandalism_words_count[word] + constructive_words_count[word] + 2)
        for word in all_words
    }
    return word_probs

# Compute vandalism scores for edits with given indices using a given list of word probabilities 
def compute_vandalism_scores(word_probs, indices):
    for i in indices:
        words = get_difference(train_data['added_lines'][i], train_data['deleted_lines'][i])
        probs_of_words = [word_probs.get(w, 0.5) for w in words]

        product_p = 1
        product_1_minus_p = 1
        for p in probs_of_words:
            product_p *= p
            product_1_minus_p *= (1-p)
        
        score = product_p / (product_p + product_1_minus_p) if (product_p + product_1_minus_p) != 0 else 1
        train_data.loc[i, 'vandalism_score'] = score



In [None]:
# Cluebot method

accuracy_scores = []

for i in range(5):
    i_0 = i % 5
    i_1 = (i + 1) % 5
    i_2 = (i + 2) % 5
    i_3 = (i + 3) % 5
    i_4 = (i + 4) % 5

    word_probs = get_word_prob(fold[i_2] + fold[i_3])
    compute_vandalism_scores(word_probs, fold[i_0] + fold[i_1])

    word_probs = get_word_prob(fold[i_0] + fold[i_1] + fold[i_2] + fold[i_3])
    compute_vandalism_scores(word_probs, fold[i_4])

    features = ['is_IP', 'account_age', 'user_edit_count', 'user_warns', 'user_distinct_pages', 'num_recent_edits', 'num_recent_reversions', 'current_minor', 'vandalism_score']

    xgb_reg = xgboost.XGBClassifier(learning_rate=.1, max_depth=1, n_estimators=680)
    xgb_reg.fit(train_data.loc[fold[i_0] + fold[i_1], features], train_data.loc[fold[i_0] + fold[i_1], 'isvandalism'])
    pred = xgb_reg.predict(train_data.loc[fold[i_4], features])

    accuracy_scores.append(np.sum(pred == train_data.loc[fold[i_4], 'isvandalism'])/len(train_data.loc[fold[i_4], 'isvandalism']))
    # print(confusion_matrix(train_data.loc[fold[4], 'isvandalism'], pred))

print(np.average(accuracy_scores))

0.9101930281068265


In [None]:
# Original method

accuracy_scores = []

for i in range(5):
    i_0 = i % 5
    i_1 = (i + 1) % 5
    i_2 = (i + 2) % 5
    i_3 = (i + 3) % 5
    i_4 = (i + 4) % 5

    word_probs = get_word_prob(fold[i_0] + fold[i_1] + fold[i_2] + fold[i_3])
    compute_vandalism_scores(word_probs, fold[i_0] + fold[i_1] + fold[i_2] + fold[i_3] + fold[i_4])

    features = ['is_IP', 'account_age', 'user_edit_count', 'user_warns', 'user_distinct_pages', 'num_recent_edits', 'num_recent_reversions', 'current_minor', 'vandalism_score']

    xgb_reg = xgboost.XGBClassifier(learning_rate=.1, max_depth=1, n_estimators=680)
    xgb_reg.fit(train_data.loc[fold[i_0] + fold[i_1] + fold[i_2] + fold[i_3], features], train_data.loc[fold[i_0] + fold[i_1] + fold[i_2] + fold[i_3], 'isvandalism'])
    pred = xgb_reg.predict(train_data.loc[fold[i_4], features])

    accuracy_scores.append(np.sum(pred == train_data.loc[fold[i_4], 'isvandalism'])/len(train_data.loc[fold[i_4], 'isvandalism']))
    # print(confusion_matrix(train_data.loc[fold[4], 'isvandalism'], pred))

print(np.average(accuracy_scores))

0.8984228447521765


In [None]:
# Modified Stacking Classifier Method

accuracy_scores = []

for i in range(5):
    i_0 = i % 5
    i_1 = (i + 1) % 5
    i_2 = (i + 2) % 5
    i_3 = (i + 3) % 5
    i_4 = (i + 4) % 5

    word_probs = get_word_prob(fold[i_1] + fold[i_2] + fold[i_3])
    compute_vandalism_scores(word_probs, fold[i_0])

    word_probs = get_word_prob(fold[i_0] + fold[i_2] + fold[i_3])
    compute_vandalism_scores(word_probs, fold[i_1])

    word_probs = get_word_prob(fold[i_0] + fold[i_1] + fold[i_3])
    compute_vandalism_scores(word_probs, fold[i_2])

    word_probs = get_word_prob(fold[i_0] + fold[i_1] + fold[i_2])
    compute_vandalism_scores(word_probs, fold[i_3])

    features = ['is_IP', 'account_age', 'user_edit_count', 'user_warns', 'user_distinct_pages', 'num_recent_edits', 'num_recent_reversions', 'current_minor', 'vandalism_score']

    xgb_reg = xgboost.XGBClassifier(learning_rate=.1, max_depth=1, n_estimators=680)
    xgb_reg.fit(train_data.loc[fold[i_0] + fold[i_1] + fold[i_2] + fold[i_3], features], train_data.loc[fold[i_0] + fold[i_1] + fold[i_2] + fold[i_3], 'isvandalism'])
    pred = xgb_reg.predict(train_data.loc[fold[i_4], features])

    accuracy_scores.append(np.sum(pred == train_data.loc[fold[i_4], 'isvandalism'])/len(train_data.loc[fold[i_4], 'isvandalism']))
    # print(confusion_matrix(train_data.loc[fold[4], 'isvandalism'], pred))

print(np.average(accuracy_scores))


0.9165886317022922
