In [1]:
import pandas as pd
import numpy as np

import patsy as pt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

import plotly.express as px

from interpret import show
from interpret.blackbox import ShapKernel, PartialDependence



### Data Prep

In [2]:
data = pd.read_csv('./DATA/pakistanClean2.csv')

In [3]:
#valid_train_Values = ['0','1']
#work_data = data[data.TTP.isin(valid_train_Values)] 
work_data = data[(data['TTP']==0) | (data['TTP'] ==1)].copy()
test_data = data[pd.isna(data['TTP'])].copy()
work_data = work_data.fillna(-99)

Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
NumExpr defaulting to 8 threads.


In [4]:
# Create regression arrays
Y, X = pt.dmatrices("TTP ~ C(iyear) + C(provstate) + multiple + success + suicide + C(attacktype1_txt) + C(targtype1_txt) + C(targsubtype1_txt) + C(weaptype1_txt) + C(weapsubtype1_txt) + nkill + nkillus + nkillter + nwound + nwoundus + nwoundte + C(Month)", data = work_data, return_type='dataframe')

In [5]:
names = X.columns
names = [i.replace('[', '').replace(']', '').replace(' ', '').replace(',', '') for i in names]

In [6]:
# Randomly create train and test data
x, xt, y, yt = train_test_split(X, Y, test_size = 0.25,random_state=35)

### Model

In [7]:
# Generate the random forest model
writingForest = RandomForestClassifier(n_estimators=110, n_jobs = -1, random_state=35)
# Fit the model to the training data
fclf = writingForest.fit(x, y)

  fclf = writingForest.fit(x, y)


### Generate predictions and evaluate

In [8]:
# Make predictions
fpred = fclf.predict(xt)
# Print the accuracy score of the fitted model
print("The random forest has an accuracy of : %s\n" % str(accuracy_score(fpred, yt)))

The random forest has an accuracy of : 0.8240620957309185



### Explain model

#### Feature importances

In [9]:
imp_df = pd.DataFrame({'Feature':names,'Importance':fclf.feature_importances_})
imp_df.sort_values(by='Importance', ascending=False, inplace=True)

In [10]:
px.bar(x='Importance', y='Feature', data_frame=imp_df.head(20), orientation='h')

#### Shapley Values Intro

Shapley values can help us understand what and how features contributed to our final prediction. The goal is to understand how a model got to the prediction it did. In the above plot we see that the number of terrorists killed, along with a couple location features, pushed our TTP probabibility over 50%. While there were other features that would lead us to believe it was not a TTP attack, they were not enough to bring us below 50%. 

In [11]:
shap = ShapKernel(predict_fn=writingForest.predict_proba, data=x[:100])

In [12]:
shap_local = shap.explain_local(pd.DataFrame(xt)[:25], pd.DataFrame(yt)[:25])

show(shap_local)

  0%|          | 0/25 [00:00<?, ?it/s]num_full_subsets = 1
remaining_weight_vector = [0.12004298 0.08080563 0.06119838 0.04944344 0.0416149  0.03603022
 0.03184814 0.0286013  0.02600931 0.02389372 0.02213558 0.02065256
 0.01938582 0.01829226 0.01733954 0.01650293 0.01576322 0.01510523
 0.01451683 0.0139882  0.01351133 0.01307959 0.01268747 0.01233034
 0.0120043  0.01170602 0.01143266 0.01118179 0.01095129 0.01073933
 0.01054432 0.01036486 0.01019973 0.01004786 0.00990831 0.00978023
 0.0096629  0.00955566 0.00945793 0.00936921 0.00928904 0.00921703
 0.00915284 0.00909615 0.00904672 0.00900431 0.00896873 0.00893983
 0.00891748 0.00890158 0.00889207 0.00888891]
num_paired_subset_sizes = 52
weight_left = 0.8071930411864512
np.sum(w_aug) = 106.00000000000001
np.sum(self.kernelWeights) = 1.0
phi = [ 0.         -0.00596636  0.          0.          0.          0.
  0.          0.          0.005641    0.          0.          0.10665806
 -0.03179146  0.         -0.04049072  0.00594479  0.0031187

#### Partial Dependence Plot Intro
Partial dependence plots reveal the dependence between our target variable and any given feature.  

In [13]:
pdp = PartialDependence(predict_fn=writingForest.predict_proba, data=x, num_points=200)

In [14]:
pdp_global = pdp.explain_global()

show(pdp_global)

No overall plot to display: -1|ShapKernel_0
Generating mini dash
Generated mini dash


### Generate predictions on true test dataset

In [15]:
test_data.fillna(-99, inplace=True)

In [16]:
# Create regression arrays
_, Xtest = pt.dmatrices("TTP ~ C(iyear) + C(provstate) + multiple + success + suicide + attacktype1 + C(targtype1) + C(targsubtype1) + weaptype1 + weapsubtype1 + nkill + nkillus + nkillter + nwound + nwoundus + nwoundte + C(Month)", data = test_data, return_type='dataframe')

In [17]:
remove_cols = set(Xtest.columns) - set(X.columns)
add_cols = set(X.columns) - set(Xtest.columns)

In [18]:
for col in remove_cols:
    del Xtest[col]
for col in add_cols:
    Xtest[col] = 0

In [19]:
# Make predictions
fpred = fclf.predict(Xtest)

In [20]:
pred_df = pd.DataFrame({'eventid':test_data['eventid'], 'TTP':fpred})
pred_df.head()

Unnamed: 0,eventid,TTP
0,200712030005,0.0
1,200712040005,0.0
3,200712080003,0.0
4,200712090002,0.0
5,200712090004,1.0


In [21]:
pred_df.to_csv('test_preds.csv')

In [22]:
pred_df['TTP'].mean()

0.5810013117621338

In [23]:
shap_local = shap.explain_local(pd.DataFrame(Xtest)[:25])

show(shap_local)

  0%|          | 0/25 [00:00<?, ?it/s]num_full_subsets = 1
remaining_weight_vector = [0.12004298 0.08080563 0.06119838 0.04944344 0.0416149  0.03603022
 0.03184814 0.0286013  0.02600931 0.02389372 0.02213558 0.02065256
 0.01938582 0.01829226 0.01733954 0.01650293 0.01576322 0.01510523
 0.01451683 0.0139882  0.01351133 0.01307959 0.01268747 0.01233034
 0.0120043  0.01170602 0.01143266 0.01118179 0.01095129 0.01073933
 0.01054432 0.01036486 0.01019973 0.01004786 0.00990831 0.00978023
 0.0096629  0.00955566 0.00945793 0.00936921 0.00928904 0.00921703
 0.00915284 0.00909615 0.00904672 0.00900431 0.00896873 0.00893983
 0.00891748 0.00890158 0.00889207 0.00888891]
num_paired_subset_sizes = 52
weight_left = 0.8071930411864512
np.sum(w_aug) = 106.00000000000001
np.sum(self.kernelWeights) = 1.0000000000000002
phi = [-0.00449654 -0.00653066 -0.00474183 -0.00301625  0.          0.
  0.          0.         -0.00174202  0.         -0.00104422 -0.21762143
 -0.04858524  0.         -0.04415678 -0.0038

No overall plot to display: -1|PartialDependence_0
No overall plot to display: -1|ShapKernel_1
