## Classification Model using activity data from all namespaces in the English Wikipedia

#### Using revision activity stats aggregated on a daily basis over a 2 week window to predict whether a user is blocked or not

In [1]:
# import necessary packages
import os
import pandas as pd
import numpy as np
import re
import sys
sys.path.append('/home/ec2-user/anaconda3/envs/JupyterSystemEnv/lib/python3.6/site-packages')

# set options
pd.options.display.max_colwidth = 50
pd.set_option('display.max_colwidth', -1) 
pd.options.mode.chained_assignment = None  # default='warn'

### Read in block/nonblock activity data

In [2]:
#read in blocked users data
df_block = pd.read_csv('/home/ec2-user/SageMaker/bucket/wiki_trust/revisions_data/cr4zy_data/blk_agg_daily.txt', sep = '\t')
df_block.shape

(9667, 36)

In [3]:
#read in non-blocked users data
df_nonblock = pd.read_csv('/home/ec2-user/SageMaker/bucket/wiki_trust/revisions_data/cr4zy_data/nonblk_agg_daily.txt', sep = '\t')
df_nonblock.shape

(18863, 36)

In [4]:
df_nonblock.columns

Index(['username', 'rev_count_0', 'rev_count_1', 'rev_count_2', 'rev_count_3',
       'rev_count_4', 'rev_count_5', 'rev_count_6', 'rev_count_7',
       'rev_count_8', 'rev_count_9', 'rev_count_10', 'rev_count_11',
       'rev_count_12', 'rev_count_13', 'rev_count_14', 'rev_avglen_0',
       'rev_avglen_1', 'rev_avglen_2', 'rev_avglen_3', 'rev_avglen_4',
       'rev_avglen_5', 'rev_avglen_6', 'rev_avglen_7', 'rev_avglen_8',
       'rev_avglen_9', 'rev_avglen_10', 'rev_avglen_11', 'rev_avglen_12',
       'rev_avglen_13', 'rev_avglen_14', '2wkactivedays', 'blocked',
       'minor_count_norm', 'dlt_count_norm', 'registered'],
      dtype='object')

In [5]:
df_block.columns = ['username', 'rev_count_0', 'rev_count_1', 'rev_count_2', 'rev_count_3',
       'rev_count_4', 'rev_count_5', 'rev_count_6', 'rev_count_7',
       'rev_count_8', 'rev_count_9', 'rev_count_10', 'rev_count_11',
       'rev_count_12', 'rev_count_13', 'rev_count_14', 'rev_avglen_0',
       'rev_avglen_1', 'rev_avglen_2', 'rev_avglen_3', 'rev_avglen_4',
       'rev_avglen_5', 'rev_avglen_6', 'rev_avglen_7', 'rev_avglen_8',
       'rev_avglen_9', 'rev_avglen_10', 'rev_avglen_11', 'rev_avglen_12',
       'rev_avglen_13', 'rev_avglen_14', '2wkactivedays', 'blocked',
       'minor_count_norm', 'dlt_count_norm', 'registered']

In [6]:
df_act = pd.concat([df_block,df_nonblock])
df_act.shape

(28530, 36)

In [7]:
df_act = df_act.sample(frac=1).reset_index(drop=True)

In [8]:
df_act.head()

Unnamed: 0,username,rev_count_0,rev_count_1,rev_count_2,rev_count_3,rev_count_4,rev_count_5,rev_count_6,rev_count_7,rev_count_8,...,rev_avglen_10,rev_avglen_11,rev_avglen_12,rev_avglen_13,rev_avglen_14,2wkactivedays,blocked,minor_count_norm,dlt_count_norm,registered
0,Jbribeiro1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,180,0,0.0,0.0,1
1,110.93.240.148,0.026667,0.066667,0.04,0.24,0.226667,0.213333,0.106667,0.0,0.013333,...,17615.0,0.0,0.0,0.0,0.0,10,0,0.0,0.0,0
2,109.69.8.99,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,6,0,0.0,0.0,0
3,Cipher99,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1,1,1.0,0.0,1
4,Danikraki,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1,1,0.4,0.8,1


In [9]:
df_act.columns

Index(['username', 'rev_count_0', 'rev_count_1', 'rev_count_2', 'rev_count_3',
       'rev_count_4', 'rev_count_5', 'rev_count_6', 'rev_count_7',
       'rev_count_8', 'rev_count_9', 'rev_count_10', 'rev_count_11',
       'rev_count_12', 'rev_count_13', 'rev_count_14', 'rev_avglen_0',
       'rev_avglen_1', 'rev_avglen_2', 'rev_avglen_3', 'rev_avglen_4',
       'rev_avglen_5', 'rev_avglen_6', 'rev_avglen_7', 'rev_avglen_8',
       'rev_avglen_9', 'rev_avglen_10', 'rev_avglen_11', 'rev_avglen_12',
       'rev_avglen_13', 'rev_avglen_14', '2wkactivedays', 'blocked',
       'minor_count_norm', 'dlt_count_norm', 'registered'],
      dtype='object')

In [10]:
# dump to use wih abuse score data -- mute whole if only running model
# this is activity stats over last 2 weeks of a user

# save file as .csv
header = ['username', 'rev_count_0', 'rev_count_1', 'rev_count_2', 'rev_count_3',
       'rev_count_4', 'rev_count_5', 'rev_count_6', 'rev_count_7',
       'rev_count_8', 'rev_count_9', 'rev_count_10', 'rev_count_11',
       'rev_count_12', 'rev_count_13', 'rev_count_14', 'rev_avglen_0',
       'rev_avglen_1', 'rev_avglen_2', 'rev_avglen_3', 'rev_avglen_4',
       'rev_avglen_5', 'rev_avglen_6', 'rev_avglen_7', 'rev_avglen_8',
       'rev_avglen_9', 'rev_avglen_10', 'rev_avglen_11', 'rev_avglen_12',
       'rev_avglen_13', 'rev_avglen_14', '2wkactivedays', 'blocked',
       'minor_count_norm', 'dlt_count_norm', 'registered']
df_act.to_csv('/home/ec2-user/SageMaker/bucket/wiki_trust/revisions_data/cr4zy_data/daily_act_data_final.txt', sep = '\t',encoding='utf-8',header = True,index=False)

In [11]:
df_data = df_act
df_data.columns[16:31]

Index(['rev_avglen_0', 'rev_avglen_1', 'rev_avglen_2', 'rev_avglen_3',
       'rev_avglen_4', 'rev_avglen_5', 'rev_avglen_6', 'rev_avglen_7',
       'rev_avglen_8', 'rev_avglen_9', 'rev_avglen_10', 'rev_avglen_11',
       'rev_avglen_12', 'rev_avglen_13', 'rev_avglen_14'],
      dtype='object')

In [12]:
# standardize revision length and the calculate variance of length and counts
df_data_std = df_data

# standardization length columns
from sklearn.preprocessing import StandardScaler

df_data_std[['rev_avglen_0', 'rev_avglen_1', 'rev_avglen_2', 'rev_avglen_3',
       'rev_avglen_4', 'rev_avglen_5', 'rev_avglen_6', 'rev_avglen_7',
       'rev_avglen_8', 'rev_avglen_9', 'rev_avglen_10', 'rev_avglen_11',
       'rev_avglen_12', 'rev_avglen_13', 'rev_avglen_14']] = StandardScaler().fit_transform(df_data_std[['rev_avglen_0', 'rev_avglen_1', 'rev_avglen_2', 'rev_avglen_3',
       'rev_avglen_4', 'rev_avglen_5', 'rev_avglen_6', 'rev_avglen_7',
       'rev_avglen_8', 'rev_avglen_9', 'rev_avglen_10', 'rev_avglen_11',
       'rev_avglen_12', 'rev_avglen_13', 'rev_avglen_14']])


In [13]:
df_data_std.columns[16:31]

Index(['rev_avglen_0', 'rev_avglen_1', 'rev_avglen_2', 'rev_avglen_3',
       'rev_avglen_4', 'rev_avglen_5', 'rev_avglen_6', 'rev_avglen_7',
       'rev_avglen_8', 'rev_avglen_9', 'rev_avglen_10', 'rev_avglen_11',
       'rev_avglen_12', 'rev_avglen_13', 'rev_avglen_14'],
      dtype='object')

In [14]:

# calculae variance of count and length
df_data_std['revcount_var'] = df_data_std.iloc[:,1:16].var(axis=1)
df_data_std['revlen_var'] = df_data_std.iloc[:,16:31].var(axis=1)
df_data.columns

Index(['username', 'rev_count_0', 'rev_count_1', 'rev_count_2', 'rev_count_3',
       'rev_count_4', 'rev_count_5', 'rev_count_6', 'rev_count_7',
       'rev_count_8', 'rev_count_9', 'rev_count_10', 'rev_count_11',
       'rev_count_12', 'rev_count_13', 'rev_count_14', 'rev_avglen_0',
       'rev_avglen_1', 'rev_avglen_2', 'rev_avglen_3', 'rev_avglen_4',
       'rev_avglen_5', 'rev_avglen_6', 'rev_avglen_7', 'rev_avglen_8',
       'rev_avglen_9', 'rev_avglen_10', 'rev_avglen_11', 'rev_avglen_12',
       'rev_avglen_13', 'rev_avglen_14', '2wkactivedays', 'blocked',
       'minor_count_norm', 'dlt_count_norm', 'registered', 'revcount_var',
       'revlen_var'],
      dtype='object')

In [15]:
df_data.head()

Unnamed: 0,username,rev_count_0,rev_count_1,rev_count_2,rev_count_3,rev_count_4,rev_count_5,rev_count_6,rev_count_7,rev_count_8,...,rev_avglen_12,rev_avglen_13,rev_avglen_14,2wkactivedays,blocked,minor_count_norm,dlt_count_norm,registered,revcount_var,revlen_var
0,Jbribeiro1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.234414,-0.244914,-0.235185,180,0,0.0,0.0,1,0.066667,0.004238
1,110.93.240.148,0.026667,0.066667,0.04,0.24,0.226667,0.213333,0.106667,0.0,0.013333,...,-0.234414,-0.244914,-0.235185,10,0,0.0,0.0,0,0.007898,0.411144
2,109.69.8.99,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.234414,-0.244914,-0.235185,6,0,0.0,0.0,0,0.066667,0.019151
3,Cipher99,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.234414,-0.244914,-0.235185,1,1,1.0,0.0,1,0.066667,0.000803
4,Danikraki,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.234414,-0.244914,-0.235185,1,1,0.4,0.8,1,0.066667,0.009721


In [16]:
df_data.shape

(28530, 38)

In [17]:
df_data = df_data.dropna()
df_data.shape

(28530, 38)

In [18]:
df_data.drop(columns = ['username'],inplace = True)

### Splitting data

In [19]:
from sklearn import model_selection
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report, confusion_matrix,accuracy_score,roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from xgboost.sklearn import XGBClassifier
from sklearn.ensemble.forest import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

In [20]:
df_data_model = df_data
X = df_data_model.drop('blocked', axis=1) # columns to drop add here
y = df_data_model['blocked']
X.head()

Unnamed: 0,rev_count_0,rev_count_1,rev_count_2,rev_count_3,rev_count_4,rev_count_5,rev_count_6,rev_count_7,rev_count_8,rev_count_9,...,rev_avglen_11,rev_avglen_12,rev_avglen_13,rev_avglen_14,2wkactivedays,minor_count_norm,dlt_count_norm,registered,revcount_var,revlen_var
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.244951,-0.234414,-0.244914,-0.235185,180,0.0,0.0,1,0.066667,0.004238
1,0.026667,0.066667,0.04,0.24,0.226667,0.213333,0.106667,0.0,0.013333,0.0,...,-0.244951,-0.234414,-0.244914,-0.235185,10,0.0,0.0,0,0.007898,0.411144
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.244951,-0.234414,-0.244914,-0.235185,6,0.0,0.0,0,0.066667,0.019151
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.244951,-0.234414,-0.244914,-0.235185,1,1.0,0.0,1,0.066667,0.000803
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.244951,-0.234414,-0.244914,-0.235185,1,0.4,0.8,1,0.066667,0.009721


In [21]:
# implementing train-test-split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.40, random_state=0)

### XGBoosting

In [22]:
# XGB
xgb = XGBClassifier(n_jobs=-1, n_estimators=500, silent=0,max_depth=5,reg_alpha=0.05)
xgb.fit(X_train,y_train)

predictions_xgb = xgb.predict(X_test)
predictions = [round(value) for value in predictions_xgb]

# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
auc = roc_auc_score(y_test, predictions_xgb)

[23:36:06] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 28 extra nodes, 0 pruned nodes, max_depth=5
[23:36:06] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=5
[23:36:06] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 0 pruned nodes, max_depth=5
[23:36:06] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 0 pruned nodes, max_depth=5
[23:36:06] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 34 extra nodes, 0 pruned nodes, max_depth=5
[23:36:06] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=5
[23:36:06] /opt/conda/conda-bld/xgboost_

[23:36:07] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=5
[23:36:07] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 26 extra nodes, 0 pruned nodes, max_depth=5
[23:36:07] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 34 extra nodes, 0 pruned nodes, max_depth=5
[23:36:07] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 28 extra nodes, 0 pruned nodes, max_depth=5
[23:36:07] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 26 extra nodes, 0 pruned nodes, max_depth=5
[23:36:07] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 24 extra nodes, 0 pruned nodes, max_depth=5
[23:36:07] /opt/conda/conda-bld/xgboost_

[23:36:08] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 26 extra nodes, 0 pruned nodes, max_depth=5
[23:36:08] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 26 extra nodes, 0 pruned nodes, max_depth=5
[23:36:08] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 24 extra nodes, 0 pruned nodes, max_depth=5
[23:36:08] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 40 extra nodes, 0 pruned nodes, max_depth=5
[23:36:08] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 28 extra nodes, 0 pruned nodes, max_depth=5
[23:36:08] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 24 extra nodes, 0 pruned nodes, max_depth=5
[23:36:08] /opt/conda/conda-bld/xgboost_

[23:36:08] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 34 extra nodes, 0 pruned nodes, max_depth=5
[23:36:09] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 24 extra nodes, 0 pruned nodes, max_depth=5
[23:36:09] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 44 extra nodes, 0 pruned nodes, max_depth=5
[23:36:09] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 24 extra nodes, 0 pruned nodes, max_depth=5
[23:36:09] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 0 pruned nodes, max_depth=5
[23:36:09] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=5
[23:36:09] /opt/conda/conda-bld/xgboost_

[23:36:09] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 26 extra nodes, 0 pruned nodes, max_depth=5
[23:36:09] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 0 pruned nodes, max_depth=5
[23:36:09] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 48 extra nodes, 0 pruned nodes, max_depth=5
[23:36:09] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 28 extra nodes, 0 pruned nodes, max_depth=5
[23:36:09] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=5
[23:36:09] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 48 extra nodes, 0 pruned nodes, max_depth=5
[23:36:09] /opt/conda/conda-bld/xgboost_

[23:36:10] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 0 pruned nodes, max_depth=5
[23:36:10] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 46 extra nodes, 0 pruned nodes, max_depth=5
[23:36:10] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=5
[23:36:10] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=5
[23:36:10] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 48 extra nodes, 0 pruned nodes, max_depth=5
[23:36:10] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 26 extra nodes, 0 pruned nodes, max_depth=5
[23:36:10] /opt/conda/conda-bld/xgboost_

[23:36:10] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 22 extra nodes, 0 pruned nodes, max_depth=5
[23:36:10] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 24 extra nodes, 0 pruned nodes, max_depth=5
[23:36:10] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=5
[23:36:10] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 36 extra nodes, 0 pruned nodes, max_depth=5
[23:36:10] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 0 pruned nodes, max_depth=5
[23:36:10] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 0 pruned nodes, max_depth=5
[23:36:10] /opt/conda/conda-bld/xgboost_

[23:36:11] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 36 extra nodes, 0 pruned nodes, max_depth=5
[23:36:11] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 26 extra nodes, 0 pruned nodes, max_depth=5
[23:36:11] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 34 extra nodes, 0 pruned nodes, max_depth=5
[23:36:11] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 36 extra nodes, 0 pruned nodes, max_depth=5
[23:36:11] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 18 extra nodes, 0 pruned nodes, max_depth=5
[23:36:11] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 28 extra nodes, 0 pruned nodes, max_depth=5
[23:36:11] /opt/conda/conda-bld/xgboost_

[23:36:12] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 28 extra nodes, 0 pruned nodes, max_depth=5
[23:36:12] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 28 extra nodes, 0 pruned nodes, max_depth=5
[23:36:12] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 22 extra nodes, 0 pruned nodes, max_depth=5
[23:36:12] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 20 extra nodes, 0 pruned nodes, max_depth=5
[23:36:12] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 16 extra nodes, 0 pruned nodes, max_depth=5
[23:36:12] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 36 extra nodes, 0 pruned nodes, max_depth=5
[23:36:12] /opt/conda/conda-bld/xgboost_

[23:36:12] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 0 pruned nodes, max_depth=5
[23:36:12] /opt/conda/conda-bld/xgboost_1541449690605/work/src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 26 extra nodes, 0 pruned nodes, max_depth=5


  if diff:


In [23]:
# evaluate predictions mute once executed
accuracy = accuracy_score(y_test, predictions)
auc = roc_auc_score(y_test, predictions_xgb)

In [24]:
# evaluation results
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print('\n')
print("AUC: %.2f%%" %(auc* 100.0))
print('\n')
print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, predictions_xgb))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test, predictions_xgb))
print('\n')

Accuracy: 98.24%


AUC: 97.87%


=== Confusion Matrix ===
[[7454   74]
 [ 127 3757]]


=== Classification Report ===
             precision    recall  f1-score   support

          0       0.98      0.99      0.99      7528
          1       0.98      0.97      0.97      3884

avg / total       0.98      0.98      0.98     11412





In [25]:
feature_importances = pd.DataFrame(xgb.feature_importances_,
                                   index = X_train.columns,
                                    columns=['importance']).sort_values('importance',ascending=False)
feature_importances[0:10]

Unnamed: 0,importance
rev_avglen_0,0.213211
revlen_var,0.179098
2wkactivedays,0.101411
rev_count_0,0.065126
revcount_var,0.057683
rev_avglen_1,0.038921
registered,0.035199
rev_count_1,0.026361
minor_count_norm,0.023259
dlt_count_norm,0.022329


### Linear SVM

In [26]:
#SVM
lin_clf = LinearSVC(penalty='l1',dual=False)

params = {'C': [0.05, 0.1, 1, 5, 10, 50]}
gs_lin_clf = GridSearchCV(estimator=lin_clf,scoring='accuracy',
                  param_grid=params,
                  n_jobs=-1,
                  cv=2,
                  verbose=3)

gs_lin_clf.fit(X_train, y_train)

best_parameters = gs_lin_clf.best_params_  
print("best parameters are",best_parameters) 

best_result = gs_lin_clf.best_score_  
print("Best Score is",best_result)

# refit with best parameter
predictions_lin_svc = gs_lin_clf.best_estimator_.predict(X_test) 

# evaluate
auc = roc_auc_score(y_test, predictions_lin_svc)
accuracy = accuracy_score(y_test, predictions_lin_svc)

Fitting 2 folds for each of 6 candidates, totalling 12 fits
[CV] C=0.05 ..........................................................
[CV] C=0.05 ..........................................................
[CV] C=0.1 ...........................................................
[CV] C=0.1 ...........................................................
[CV] ................. C=0.05, score=0.9754615564384201, total=   0.5s
[CV] C=1 .............................................................
[CV] ................. C=0.05, score=0.9712616822429907, total=   0.6s
[CV] C=1 .............................................................
[CV] .................. C=0.1, score=0.9754615564384201, total=   0.6s
[CV] C=5 .............................................................
[CV] .................. C=0.1, score=0.9720794392523364, total=   0.8s
[CV] C=5 .............................................................
[CV] .................... C=5, score=0.9754615564384201, total=   0.3s
[CV] C=10 .......

[Parallel(n_jobs=-1)]: Done  10 out of  12 | elapsed:    2.0s remaining:    0.4s


[CV] .................... C=1, score=0.9756952559009114, total=   1.8s


[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    2.5s finished


best parameters are {'C': 10}
Best Score is 0.9749970790980255


In [27]:
# evaluation results
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print('\n')
print("AUC: %.2f%%" %(auc * 100.0))
print('\n')
print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, predictions_lin_svc))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test,predictions_lin_svc))
print('\n')

Accuracy: 97.63%


AUC: 97.13%


=== Confusion Matrix ===
[[7428  100]
 [ 171 3713]]


=== Classification Report ===
             precision    recall  f1-score   support

          0       0.98      0.99      0.98      7528
          1       0.97      0.96      0.96      3884

avg / total       0.98      0.98      0.98     11412





### Logistic Classifier

In [28]:
# Logistic Regression
clf = LogisticRegression(penalty='l1')
clf.fit(X_train, y_train)

params = {'C':[1, 5, 10, 15, 20, 50]}
#C is inverse of regularization strength
gs = GridSearchCV(estimator=clf,
                  param_grid=params,
                  scoring='accuracy',
                  n_jobs=-1,
                  cv=2,
                  verbose=5)
gs.fit(X_train, y_train)

best_parameters = gs.best_params_  
print("best parameters are",best_parameters) 

best_result = gs.best_score_  
print("Best Score is",best_result)

# refit with best parameter
predictions_log = gs.best_estimator_.predict(X_test)

# evaluate
auc = roc_auc_score(y_test, predictions_log)
accuracy = accuracy_score(y_test, predictions_log)

Fitting 2 folds for each of 6 candidates, totalling 12 fits
[CV] C=1 .............................................................
[CV] C=1 .............................................................
[CV] C=5 .............................................................
[CV] C=5 .............................................................
[CV] .................... C=1, score=0.9752336448598131, total=   0.2s
[CV] ..................... C=5, score=0.974766355140187, total=   0.2s
[CV] C=10 ............................................................
[CV] C=10 ............................................................
[CV] .................... C=1, score=0.9787333489132976, total=   0.3s
[CV] C=15 ............................................................
[CV] ................... C=10, score=0.9751168224299065, total=   0.2s
[CV] ................... C=10, score=0.9780322505258238, total=   0.2s
[CV] C=15 ............................................................
[CV] C=20 .......

[Parallel(n_jobs=-1)]: Done   8 out of  12 | elapsed:    0.7s remaining:    0.4s


[CV] ................... C=50, score=0.9782659499883151, total=   0.2s
[CV] ................... C=50, score=0.9751168224299065, total=   0.3s


[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    1.0s finished


best parameters are {'C': 1}
Best Score is 0.9769832924407057


In [29]:
# evaluation results
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print('\n')
print("AUC: %.2f%%" %(auc * 100.0))
print('\n')
print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, predictions_log))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test,predictions_log))
print('\n')

Accuracy: 97.84%


AUC: 97.45%


=== Confusion Matrix ===
[[7427  101]
 [ 146 3738]]


=== Classification Report ===
             precision    recall  f1-score   support

          0       0.98      0.99      0.98      7528
          1       0.97      0.96      0.97      3884

avg / total       0.98      0.98      0.98     11412





### Random Forest

In [30]:
# Random Forest
rf = RandomForestClassifier(oob_score='True')
params = {'max_depth': [100], 'n_estimators':[500]}
gs_w = GridSearchCV(estimator=rf,scoring='accuracy',
                  param_grid=params,
                  n_jobs=-1,
                  cv=2,
                  verbose=5)

gs_w.fit(X_train, y_train)

best_parameters = gs_w.best_params_  
print("best parameters are",best_parameters) 

best_result = gs_w.best_score_  
print("Best Score is",best_result)

# refit with best parameter
predictions_rf_w = gs_w.best_estimator_.predict(X_test)

# evaluate
auc = roc_auc_score(y_test, predictions_rf_w)
accuracy = accuracy_score(y_test, predictions_rf_w)

Fitting 2 folds for each of 1 candidates, totalling 2 fits
[CV] max_depth=100, n_estimators=500 .................................
[CV] max_depth=100, n_estimators=500 .................................
[CV]  max_depth=100, n_estimators=500, score=0.9769806029446132, total=   6.2s
[CV]  max_depth=100, n_estimators=500, score=0.9759345794392523, total=   6.3s


[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    6.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    6.9s finished


best parameters are {'max_depth': 100, 'n_estimators': 500}
Best Score is 0.9764575300852903


In [31]:
# evaluation results
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print('\n')
print("AUC: %.2f%%" %(auc * 100.0))
print('\n')
print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, predictions_rf_w))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test,predictions_rf_w))
print('\n')

Accuracy: 97.63%


AUC: 97.29%


=== Confusion Matrix ===
[[7405  123]
 [ 147 3737]]


=== Classification Report ===
             precision    recall  f1-score   support

          0       0.98      0.98      0.98      7528
          1       0.97      0.96      0.97      3884

avg / total       0.98      0.98      0.98     11412



