## Setup/preprocessing

In [1]:
import sys
import os

# Automatically add the project root (1 level up) to the Python path
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
import pandas as pd
import numpy as np
import itertools
import sklearn.feature_extraction as fe
from feature_engineer import VandalismScorer, preprocessor
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, confusion_matrix

In [3]:
REPO_ROOT = "../"

In [4]:
train = pd.read_csv(REPO_ROOT+"Data/train.csv")
train.dtypes

EditType                  object
EditID                     int64
comment                   object
user                      object
user_edit_count            int64
user_distinct_pages        int64
user_warns                 int64
user_reg_time              int64
prev_user                 object
common                   float64
current                  float64
previous                 float64
page_made_time             int64
title                     object
namespace                 object
creator                   object
num_recent_edits           int64
num_recent_reversions      int64
current_minor               bool
current_timestamp          int64
added_lines               object
previous_timestamp         int64
deleted_lines             object
isvandalism                 bool
num_edits_5d_before        int64
is_person                  int64
dtype: object

In [5]:
train.isvandalism.value_counts()

isvandalism
False    13091
True     12397
Name: count, dtype: int64

In [6]:
preprocessor(train)

train.isvandalism.value_counts()

isvandalism
False    13048
True     12380
Name: count, dtype: int64

In [7]:
train.columns

Index(['EditType', 'EditID', 'comment', 'user', 'user_edit_count',
       'user_distinct_pages', 'user_warns', 'user_reg_time', 'prev_user',
       'common', 'current', 'previous', 'page_made_time', 'title', 'namespace',
       'creator', 'num_recent_edits', 'num_recent_reversions', 'current_minor',
       'current_timestamp', 'added_lines', 'previous_timestamp',
       'deleted_lines', 'isvandalism', 'num_edits_5d_before', 'is_person',
       'comment_empty', 'account_age', 'is_IP', 'word_count_added',
       'word_count_deleted'],
      dtype='object')

In [8]:
train.dtypes

EditType                  object
EditID                     int64
comment                   object
user                      object
user_edit_count            int64
user_distinct_pages        int64
user_warns                 int64
user_reg_time              int64
prev_user                 object
common                   float64
current                  float64
previous                 float64
page_made_time             int64
title                     object
namespace                 object
creator                   object
num_recent_edits           int64
num_recent_reversions      int64
current_minor               bool
current_timestamp          int64
added_lines               object
previous_timestamp         int64
deleted_lines             object
isvandalism                 bool
num_edits_5d_before        int64
is_person                  int64
comment_empty               bool
account_age                int64
is_IP                       bool
word_count_added           int64
word_count

In [17]:
cv = StratifiedKFold(shuffle=True, random_state=42)
cv.split(train, train["isvandalism"])

<generator object _BaseKFold.split at 0x15672abd0>

Let's make a list of the different subsets that StratifiedKFold divides the dataset into. Just the test indices in the `cv.split()` should suffice. The positional semantics of the list `cv_splits` is that the first member is the set of indices in the first `test_indices` of the `cv.split()`. If we want to switch to considering the first `test_indices` as the _last_ fold, we can reverse the list `cv_splits`.

In [10]:
cv_splits = [set()]*5

In [19]:
def _get_cv_splits(df, labels, cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)):
    cv_splits = [set()]*cv.n_splits
    for i, (_, test_idx) in enumerate(cv.split(df, labels)):
        cv_splits[i] = set(test_idx)
    return cv_splits

In [20]:
cv_splits = _get_cv_splits(train, train['isvandalism'], cv = cv)

Let's confirm that the resulting subsets are disjoint,

In [21]:
[set1.intersection(set2) for (set1, set2) in itertools.product(cv_splits, cv_splits) if set1 is not set2]

[set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set()]

and that their union covers the entire training set.

In [22]:
total = 0
for size in map(len, cv_splits):
    total += size
assert(total == train.shape[0]) # ensure that the size of the union of all cv_sets == the number of rows in train

In [38]:
train.loc[list(cv_splits[0])]

KeyError: '[903, 1219, 7988, 8430, 8501, 9122, 13435, 14589, 14701, 16295, 16352, 17829, 18699, 22677, 24108, 24613] not in index'

Ultimately we need the output to be able to function as a list of indices. Let's redefine

Ultimately we need the output to be able to function as a list of indices. Let's redefine

## Bag-of-words vectorization of `added_lines` and `deleted_lines`

In [23]:
bowVectorizer = fe.text.CountVectorizer()

In [24]:
train = train.replace(np.nan, '')
bowVectorizer.fit(pd.concat([train['added_lines'], train['deleted_lines']], axis=0))

In [25]:
for i, item in enumerate(bowVectorizer.vocabulary_):
    if i > 10:
        break
    print(i)
    print(item)
    print()

0
leadership

1
through

2
emotion

3
is

4
much

5
more

6
than

7
managing

8
tasks

9
and

10
barking



In [26]:
train_bow_added = pd.DataFrame.sparse.from_spmatrix(bowVectorizer.transform(train['added_lines']), columns=bowVectorizer.vocabulary_)
train_bow_deleted = pd.DataFrame.sparse.from_spmatrix(bowVectorizer.transform(train['deleted_lines']), columns=bowVectorizer.vocabulary_)

In [27]:
train_bow_diff = (train_bow_added - train_bow_deleted).clip(lower=0)


In [28]:
train_bow_diff.shape[0]

25428

## Use MultinomialNB to generate vandalism_score using BoWVectorizer

In [29]:
from sklearn.naive_bayes import MultinomialNB

In [30]:
nb = MultinomialNB(fit_prior=False)
nb.fit(train_bow_diff, train["isvandalism"])

In [32]:
train['vandalism_score'] = nb.predict_proba(train_bow_diff)[:, nb.classes_]
# nb.classes_ is a list of the classes seen by nb.fit, in the order it saw them.
# The only two class labels are True and False, so this indexing selects the column
# of predict_proba with the probabilities for True, irrespective of whether nb saw
# True first or False first.
train['nb_prediction'] = nb.predict(train_bow_diff)

In [34]:
train

Unnamed: 0,EditType,EditID,comment,user,user_edit_count,user_distinct_pages,user_warns,user_reg_time,prev_user,common,...,isvandalism,num_edits_5d_before,is_person,comment_empty,account_age,is_IP,word_count_added,word_count_deleted,vandalism_score,nb_prediction
0,change,329595189,,Nryan30,66,13,0,1259891940,219.78.124.42,,...,False,1,0,True,0,False,131,1,3.269814e-32,False
1,change,232199357,/* Penis */,89.242.200.212,4,2,2,20080815230001,66.75.235.255,,...,True,4,1,False,1,True,4,202,9.886503e-01,True
2,change,329877752,Reverted edits by [[Special:Contributions/71.2...,Chamal N,18697,0,2,1208605428,71.208.113.72,,...,False,3,0,False,595,False,34,50,2.774292e-01,False
3,change,253129486,,Animaldudeyay1009,3,1,2,1227241317,J.delanoy,,...,True,2,0,True,0,False,94,836,1.000000e+00,True
4,change,394520551,Adding Persondata using [[Project:AWB|AWB]] (7...,RjwilmsiBot,1602950,1309238,0,1257977968,LobãoV,,...,False,0,1,False,356,False,34,0,2.047170e-24,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25482,change,327368981,Reverted 1 edit by [[Special:Contributions/68....,TreyGeek,15458,4978,2,1203859836,68.40.112.72,,...,False,1,0,False,637,False,26,26,5.000000e-01,False
25484,change,234810735,/* History */,59.180.151.222,1,1,0,20080828164140,68.50.79.137,,...,True,0,0,False,1,True,86,79,9.877731e-01,True
25485,change,329132348,/* In Tamil Nadu */,66.184.61.179,1,1,0,20091201230141,RJFJR,,...,False,0,0,False,1,True,340,337,6.863960e-04,False
25486,change,240599711,/* Biography */,75.157.130.175,6,1,2,20080924030549,J.delanoy,,...,True,11,1,False,1,True,2,209,9.962893e-01,True


In [35]:
confusion_matrix(train['isvandalism'], train['nb_prediction'])

array([[12479,   569],
       [ 2384,  9996]])

## Using the stored `cv_splits` to compute `vandalism_score`

In [None]:
scorer = VandalismScorer()
for i, (train_idx, test_idx) in cv.split(train, train["isvandalism"]):
    

SyntaxError: invalid syntax (2978527451.py, line 1)

## BoW vectorization of added_lines

In [22]:
bowVectorizer = fe.text.CountVectorizer()

In [44]:
train = train.replace(np.nan, '')
bowVectorizer.fit(train['added_lines'])

In [45]:
for i, item in enumerate(bowVectorizer.vocabulary_):
    if i > 10:
        break
    print(i)
    print(item)
    print()

0
leadership

1
through

2
emotion

3
is

4
much

5
more

6
than

7
managing

8
tasks

9
and

10
barking



In [47]:
bowVectorizer.fit(train['added_lines'])
train_bow = pd.DataFrame.sparse.from_spmatrix(bowVectorizer.transform(train['added_lines']), columns=['Added_'+word for word in bowVectorizer.vocabulary_])

In [48]:
train_bow

Unnamed: 0,Added_leadership,Added_through,Added_emotion,Added_is,Added_much,Added_more,Added_than,Added_managing,Added_tasks,Added_and,...,Added_0226731375,Added_brahmanism,Added_yogendra,Added_traditionalism,Added_p117,Added_0691116571,Added_jayalalitha,Added_kaṇēcan,Added_amma,Added_kirkjan
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25423,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25424,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25425,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25426,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
