In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from interpret import show
from interpret.glassbox import ExplainableBoostingClassifier


### Data Prep

In [2]:
# Read the CSV at ./DATA/pakistanClean2.csv (resolved against the current
# working directory) into the DataFrame every later cell works from.
data = pd.read_csv("./DATA/pakistanClean2.csv")

In [3]:
# Use the event identifier as the row index.
# The frame is rebound rather than mutated with inplace=True, and the guard
# makes the cell safe to re-run: once 'eventid' has moved into the index a
# second set_index('eventid') would raise KeyError.
if data.index.name != 'eventid':
    data = data.set_index('eventid')

In [4]:
# List the available columns before deciding which ones to exclude.
data.columns

Index(['iyear', 'iday', 'provstate', 'city', 'latitude', 'longitude',
       'location', 'summary', 'multiple', 'success', 'suicide', 'attacktype1',
       'attacktype1_txt', 'targtype1', 'targtype1_txt', 'targsubtype1',
       'targsubtype1_txt', 'corp1', 'target1', 'gname', 'claimed', 'claimmode',
       'claimmode_txt', 'weaptype1', 'weaptype1_txt', 'weapsubtype1',
       'weapsubtype1_txt', 'nkill', 'nkillus', 'nkillter', 'nwound',
       'nwoundus', 'nwoundte', 'Month', 'TTP', 'dayofweek'],
      dtype='object')

In [5]:
# Columns removed before modelling.
# NOTE(review): the coded columns (attacktype1, targtype1, targsubtype1,
# weaptype1, weapsubtype1) presumably duplicate the *_txt columns that are
# kept -- confirm against the dataset codebook.
exclude_cols = [
    'city', 'location', 'summary',
    'attacktype1', 'targtype1', 'targsubtype1',
    'corp1', 'target1', 'gname',
    'claimed', 'claimmode', 'claimmode_txt',
    'weaptype1', 'weapsubtype1',
]

# A single drop() is atomic: if any name is missing it raises KeyError before
# removing anything, whereas a per-column `del` loop would leave the frame
# half-pruned. It also avoids leaking a loop variable into the kernel namespace.
data = data.drop(columns=exclude_cols)

In [6]:
# Preview the first rows of the pruned frame (rendered as the cell output).
data.head()

Unnamed: 0_level_0,iyear,iday,provstate,latitude,longitude,multiple,success,suicide,attacktype1_txt,targtype1_txt,...,weapsubtype1_txt,nkill,nkillus,nkillter,nwound,nwoundus,nwoundte,Month,TTP,dayofweek
eventid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
200712030005,2007,3,Balochistan,30.80363,66.711752,0,1,0,Bombing/Explosion,Educational Institution,...,Unknown Explosive Type,6.0,0.0,0.0,5.0,0.0,0.0,Dec,,Monday
200712040005,2007,4,Khyber Pakhtunkhwa,34.006004,71.53743,0,1,1,Bombing/Explosion,Military,...,Suicide (carried bodily by human being),1.0,0.0,1.0,0.0,0.0,0.0,Dec,,Tuesday
200712060008,2007,6,Balochistan,29.034412,69.158661,0,1,0,Bombing/Explosion,Military,...,Remote Trigger,1.0,0.0,0.0,4.0,0.0,0.0,Dec,0.0,Thursday
200712080003,2007,8,Balochistan,27.809921,66.620956,0,0,0,Armed Assault,Police,...,Unknown Gun Type,0.0,0.0,0.0,0.0,0.0,0.0,Dec,,Saturday
200712090002,2007,9,Balochistan,28.458421,68.133223,0,1,0,Armed Assault,Violent Political Party,...,Unknown Gun Type,4.0,0.0,0.0,1.0,0.0,0.0,Dec,,Sunday


In [7]:
# Rows whose TTP value is 0 or 1 form the labelled set used for modelling;
# rows where TTP is missing are set aside as unlabelled, without the TTP column.
ttp_is_labelled = data['TTP'].isin([0, 1])
ttp_is_missing = data['TTP'].isna()

train_data = data.loc[ttp_is_labelled].copy()
test_data = data.loc[ttp_is_missing].drop(columns='TTP')

Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
NumExpr defaulting to 8 threads.


In [8]:
# Replace every remaining NaN with 0 in both frames.
# NOTE(review): this applies to all columns, so a text column with gaps would
# end up holding the integer 0 alongside strings -- confirm that is intended.
train_data = train_data.fillna(0)
test_data = test_data.fillna(0)

In [9]:
# Name of the target column; both the feature matrix and the label vector are
# derived from it so the two can never disagree (previously the feature
# selection hard-coded 'TTP', so changing ycol would have leaked the new
# target into X_data).
ycol = 'TTP'

# Features: every column except the target, in their original order.
X_data = train_data.drop(columns=ycol)
# Labels: the target column alone.
y_data = train_data[ycol]

In [10]:
# Disabled experiment, kept for reference: a per-row kill/wound ratio feature.
# As written it would be nkill / nwound capped at 10, with 10 used when
# nwound == 0 but nkill > 0, and 0 otherwise. Nothing below is executed.
# killwoundratios = []
# for row in X_data.itertuples():
#     nk = row.nkill
#     nw = row.nwound
#     if nw == 0:
#         if nk > 0:
#             rat = 10
#         else:
#             rat = 0
#     else:
#         rat = min(nk / nw, 10)
    
#     killwoundratios.append(rat)
# X_data['killwoundratio'] = killwoundratios

In [11]:
## Training/test creation
# Hold out a quarter of the labelled rows for evaluation; the fixed seed keeps
# the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.25,
    random_state=35,
)

### Model

In [12]:
# Build the Explainable Boosting classifier with min_samples_leaf=7
# (all other hyperparameters are left at the library defaults).
model = ExplainableBoostingClassifier(min_samples_leaf=7)
# Fit on the training split. This is the cell's last expression, so the value
# returned by fit() is what gets rendered as the cell output.
model.fit(X_train, y_train)

EBM lib loading.
Loading native on win32 | debug = False
Passing a numpy array to schema autogen when it should be dataframe.
Passing a numpy array to schema autogen when it should be dataframe.


ExplainableBoostingClassifier(feature_names=['iyear', 'iday', 'provstate',
                                             'latitude', 'longitude',
                                             'multiple', 'success', 'suicide',
                                             'attacktype1_txt', 'targtype1_txt',
                                             'targsubtype1_txt',
                                             'weaptype1_txt',
                                             'weapsubtype1_txt', 'nkill',
                                             'nkillus', 'nkillter', 'nwound',
                                             'nwoundus', 'nwoundte', 'Month',
                                             'dayofweek', 'iyear x provstate',
                                             'iyear x latitude',
                                             'latitude x Month',
                                             'lat...
                                             'categorical', 'categorical',
 

### Generate predictions and evaluate

In [13]:
# Predict labels for the held-out split; compared against y_test below.
y_preds = model.predict(X_test)

In [14]:
# Share of held-out rows whose predicted label matches the true label.
test_accuracy = accuracy_score(y_test, y_preds)
print(f"Accuracy: {test_accuracy}")

Accuracy: 0.8201811125485123


In [15]:
# Baseline for comparison: the accuracy obtained by predicting 0 for every
# held-out row.
always_zero_preds = [0] * len(y_preds)
print(f"Predict 0 accuracy: {accuracy_score(y_test, always_zero_preds)}")

Predict 0 accuracy: 0.553686934023286


### Explain model and predictions

In [16]:
# Ask the fitted model for its global explanation object ...
ebm_global = model.explain_global()
# ... and render it with interpret's show() (imported at the top of the notebook).
show(ebm_global)

Detected non-cloud environment.
The dash_html_components package is deprecated. Please replace
`import dash_html_components as html` with `from dash import html`
  import dash_html_components as html
The dash_core_components package is deprecated. Please replace
`import dash_core_components as dcc` with `from dash import dcc`
  import dash_core_components as dcc
The dash_table package is deprecated. Please replace
`import dash_table` with `from dash import dash_table`

Also, if you're using any of the table format helpers (e.g. Group), replace 
`from dash_table.Format import Group` with 
`from dash.dash_table.Format import Group`
  import dash_table as dt
Generating mini dash


Generated mini dash


In [17]:
# Explain individual predictions for the first ten held-out rows.
# .iloc makes the positional intent explicit: the index holds integer event
# ids, so a bare [:10] could be misread as a label-based slice.
local_features = X_test.iloc[:10]
local_labels = y_test.iloc[:10]
ebm_local = model.explain_local(local_features, local_labels)
show(ebm_local)

Generating mini dash
Generated mini dash


No overall plot to display: -1|ExplainableBoostingClassifier_1
