In [1]:
# Enable the autoreload extension; mode 2 re-imports all loaded modules before
# each cell executes, so edits to imported project code take effect without a
# kernel restart.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import operator
import pickle
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from treeinterpreter import treeinterpreter as ti
from sklearn.metrics import f1_score

# Put the parent directory on the import path so the `src` package resolves.
# NOTE(review): ".." is relative to the current working directory — presumably
# the notebook's own folder; confirm if the notebook is launched from elsewhere.
sys.path.append("..")
from src.models import model_params

# Default size (width, height in inches) for figures created in this notebook.
plt.rcParams['figure.figsize'] = 12, 8

In [3]:
# Load the gzip-compressed test sample, parsing the two timestamp columns to
# datetimes at read time.
# `infer_datetime_format` is deliberately not passed: it is deprecated since
# pandas 2.0 (a stricter form of that inference is now the default) and only
# emits a warning there. The parsed values are the same without it.
df_test = pd.read_csv(
    '../data/final/gambit_sample_test.csv.gz',
    compression='gzip',
    parse_dates=['most_recent_signup', 'most_recent_post'],
)

In [4]:
# Separate the label column from the remaining columns of the loaded frame.
y_test = df_test['target']
X_test = df_test.drop('target', axis='columns')

In [5]:
# Load the pickled model from disk. The context manager closes the file handle
# as soon as loading finishes; the bare open() call left it open until garbage
# collection.
# NOTE: pickle.load can execute arbitrary code during deserialisation — only
# load model files that come from a trusted source.
with open('../models/rf-20.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [6]:
# The `feature_pipeline` object exposed by the project's model_params module.
pipeline = model_params.feature_pipeline

In [7]:
# Run the test features through the project pipeline.
# The input is rebuilt from df_test instead of being read back from X_test, so
# the cell is idempotent: re-running it never pushes already-transformed data
# through the pipeline a second time.
X_test = pipeline.transform(df_test.drop(columns=['target']))

In [8]:
# Print every feature whose importance in the loaded model exceeds 0.01,
# pairing importances with the transformed frame's column names by position.
for feature, importance in zip(X_test.columns, model.feature_importances_):
    if not importance > 0.01:
        continue
    print('{}: {} \n'.format(feature, importance))

num_signups: 0.34747251706896753 

Discrimination: 0.05191074605696599 

Education: 0.09975791089760863 

Environment: 0.038294446111639374 

Mental Health: 0.03426427235957284 

Physical Health: 0.07257670727647275 

Violence: 0.010477389847597336 

activity_level_less_recent_action: 0.2299361344977013 

activity_level_recent_action: 0.06809389846280552 



In [9]:
# Build the scorecard: explanations from treeinterpreter with the true label
# attached as TRUE_CLASS, ordered from highest to lowest SCORE.
# NOTE(review): the rendered output shows REASON1..REASON5, so the literal 5 is
# presumably the number of reasons requested per row — confirm against the
# predict_explain implementation.
scorecard = (
    ti.predict_explain(model, X_test, 5)
    .assign(TRUE_CLASS=y_test)
    .sort_values(by='SCORE', ascending=False)
)

In [10]:
# First ten rows of the scorecard (it is sorted by SCORE, descending).
scorecard.iloc[:10]

Unnamed: 0,SCORE,REASON1,REASON2,REASON3,REASON4,REASON5,TRUE_CLASS
44748,0.99987,6.50 < num_signups <= 9.50,Education > 0.50,activity_level_recent_action > 0.50,Discrimination > 0.50,Physical Health > 0.50,1
29413,0.999721,num_signups > 6.50,Education > 0.50,activity_level_recent_action > 0.50,Discrimination > 0.50,Physical Health > 0.50,1
119569,0.999662,5.50 < num_signups <= 9.50,Education > 0.50,Discrimination > 0.50,activity_level_recent_action > 0.50,Physical Health > 0.50,1
23236,0.999548,6.50 < num_signups <= 8.50,Education > 0.50,Physical Health > 0.50,Discrimination > 0.50,activity_level_recent_action > 0.50,1
48340,0.999502,5.50 < num_signups <= 6.50,Education > 0.50,Discrimination > 0.50,activity_level_recent_action > 0.50,Mental Health > 0.50,1
104487,0.999459,4.50 < num_signups <= 9.50,Education > 0.50,Discrimination > 0.50,Environment > 0.50,Physical Health > 0.50,1
46728,0.999433,6.50 < num_signups <= 9.50,Education > 0.50,Physical Health > 0.50,Discrimination > 0.50,activity_level_recent_action > 0.50,1
78015,0.9994,4.50 < num_signups <= 9.50,Education > 0.50,Discrimination > 0.50,Physical Health > 0.50,activity_level_recent_action > 0.50,1
105365,0.999355,5.50 < num_signups <= 9.50,Education > 0.50,Physical Health > 0.50,Discrimination > 0.50,activity_level_recent_action > 0.50,1
53851,0.999336,6.50 < num_signups <= 11.50,Education > 0.50,Physical Health > 0.50,Discrimination > 0.50,activity_level_recent_action > 0.50,1


In [11]:
# 20 most frequent REASON1 values among rows with SCORE >= 0.5 and TRUE_CLASS == 1.
(
    scorecard
    .loc[scorecard['SCORE'].ge(0.5) & scorecard['TRUE_CLASS'].eq(1), 'REASON1']
    .value_counts()
    .head(20)
)

activity_level_less_recent_action > 0.50    8143
0.50 < num_signups <= 1.50                  2431
1.50 < num_signups <= 2.50                   691
1.50 < num_signups <= 3.50                   502
Education > 0.50                             416
2.50 < num_signups <= 3.50                   387
num_signups > 0.50                           340
0.50 < num_signups <= 4.50                   310
0.50 < num_signups <= 5.50                   272
1.50 < num_signups <= 4.50                   259
1.50 < num_signups <= 5.50                   232
1.50 < num_signups <= 8.50                   225
2.50 < num_signups <= 5.50                   220
3.50 < num_signups <= 5.50                   186
Physical Health > 0.50                       184
2.50 < num_signups <= 4.50                   183
3.50 < num_signups <= 4.50                   168
num_signups > 1.50                           167
2.50 < num_signups <= 6.50                   152
sms_status_less > 0.50                       151
Name: REASON1, dtype

In [12]:
# 20 most frequent REASON1 values among rows with SCORE >= 0.8 and TRUE_CLASS == 1.
(
    scorecard
    .loc[scorecard['SCORE'].ge(0.8) & scorecard['TRUE_CLASS'].eq(1), 'REASON1']
    .value_counts()
    .head(20)
)

activity_level_less_recent_action > 0.50    8081
0.50 < num_signups <= 1.50                  2430
1.50 < num_signups <= 2.50                   651
1.50 < num_signups <= 3.50                   499
2.50 < num_signups <= 3.50                   386
num_signups > 0.50                           337
Education > 0.50                             326
0.50 < num_signups <= 4.50                   310
0.50 < num_signups <= 5.50                   272
1.50 < num_signups <= 4.50                   259
1.50 < num_signups <= 5.50                   232
1.50 < num_signups <= 8.50                   225
2.50 < num_signups <= 5.50                   220
3.50 < num_signups <= 5.50                   185
2.50 < num_signups <= 4.50                   183
Physical Health > 0.50                       180
3.50 < num_signups <= 4.50                   168
num_signups > 1.50                           167
3.50 < num_signups <= 6.50                   149
2.50 < num_signups <= 6.50                   149
Name: REASON1, dtype

In [13]:
# 20 most frequent REASON2 values among rows with SCORE >= 0.8 and TRUE_CLASS == 1.
(
    scorecard
    .loc[scorecard['SCORE'].ge(0.8) & scorecard['TRUE_CLASS'].eq(1), 'REASON2']
    .value_counts()
    .head(20)
)

activity_level_less_recent_action > 0.50    3339
Education > 0.50                            3199
0.50 < num_signups <= 1.50                  3133
num_signups > 0.50                          1321
activity_level_recent_action > 0.50          998
Physical Health > 0.50                       770
1.50 < num_signups <= 3.50                   486
0.50 < num_signups <= 9.00                   463
1.50 < num_signups <= 6.50                   408
1.50 < num_signups <= 5.50                   282
2.50 < num_signups <= 3.50                   280
2.50 < num_signups <= 6.50                   191
Environment > 0.50                           176
0.50 < num_signups <= 2.50                   155
sms_status_pending <= 0.50                   143
1.50 < num_signups <= 2.50                   140
0.50 < num_signups <= 8.50                   139
3.50 < num_signups <= 6.50                   138
2.50 < num_signups <= 5.50                   125
1.50 < num_signups <= 8.50                   111
Name: REASON2, dtype

In [14]:
# 20 most frequent REASON3 values among rows with SCORE >= 0.8 and TRUE_CLASS == 1.
(
    scorecard
    .loc[scorecard['SCORE'].ge(0.8) & scorecard['TRUE_CLASS'].eq(1), 'REASON3']
    .value_counts()
    .head(20)
)

Physical Health > 0.50                      3279
activity_level_recent_action > 0.50         3028
Discrimination > 0.50                       3015
Education > 0.50                            2524
Environment > 0.50                          2082
Mental Health > 0.50                        1898
Violence > 0.50                              560
0.50 < num_rbs <= 1.50                       141
sms_status_active <= 0.50                    137
activity_level_less_recent_action > 0.50     103
Bullying > 0.50                               67
num_signups > 0.50                            38
Week Of Action > 0.50                         33
0.50 < num_signups <= 1.50                    29
1.50 < num_signups <= 4.50                    28
sms_status_pending <= 0.50                    22
3.50 < num_signups <= 4.50                    17
Homelessness > 0.50                           12
age <= 18.50                                  10
2.50 < num_signups <= 4.50                     9
Name: REASON3, dtype

In [15]:
# 20 most frequent REASON4 values among rows with SCORE >= 0.8 and TRUE_CLASS == 1.
(
    scorecard
    .loc[scorecard['SCORE'].ge(0.8) & scorecard['TRUE_CLASS'].eq(1), 'REASON4']
    .value_counts()
    .head(20)
)

voter_registration_status_registered <= 0.50    3199
Education > 0.50                                2845
Discrimination > 0.50                           2148
Mental Health > 0.50                            1465
num_rbs <= 0.50                                 1136
Physical Health > 0.50                          1050
activity_level_recent_action > 0.50              930
17.50 < age <= 18.50                             505
Environment > 0.50                               495
Bullying > 0.50                                  487
Bullying <= 0.50                                 462
source_phoenix <= 0.50                           381
Violence > 0.50                                  360
source_phoenix-oauth <= 0.50                     274
sms_status_pending <= 0.50                       260
13.00 < age <= 18.50                             143
Discrimination <= 0.50                           141
Week Of Action > 0.50                            117
0.50 < num_rbs <= 2.50                        

In [16]:
# 20 most frequent REASON5 values among rows with SCORE >= 0.8 and TRUE_CLASS == 1.
(
    scorecard
    .loc[scorecard['SCORE'].ge(0.8) & scorecard['TRUE_CLASS'].eq(1), 'REASON5']
    .value_counts()
    .head(20)
)

num_rbs <= 0.50                                 4154
voter_registration_status_registered <= 0.50    2843
Physical Health > 0.50                          1982
source_phoenix <= 0.50                          1431
source_phoenix-oauth <= 0.50                     776
Discrimination > 0.50                            716
Mental Health > 0.50                             688
Discrimination <= 0.50                           585
sms_status_pending <= 0.50                       530
Environment > 0.50                               498
activity_level_recent_action > 0.50              419
Bullying > 0.50                                  336
Violence > 0.50                                  333
19.50 < age <= 21.50                             308
0.50 < num_rbs <= 1.50                           242
17.50 < age <= 18.50                             236
Bullying <= 0.50                                 235
source_sms <= 0.50                                99
Week Of Action > 0.50                         