# GTD prediction model
Use attack type, weapons used, description of the attack, etc. to build a model that can predict what group may have been responsible for an incident.

* The prediction model used: random forest classifier
* special feature: spatial clustering to identify terrorism hotspots
    * terrorism can crop up in regions that are not well described by borders
    * used latitude and longitude of attacks to identify high density areas of attacks
    * assigned every attack to a hotspot
* data selection: 
    * excluded features with 30% or more missing values
    * combined parameters to reduce feature space
* memory workarounds:
    * predictive model iterated on subsets of data
    * optimized tree depth

  

In [None]:
# Load the library with the iris dataset
from sklearn.datasets import load_iris
# Load scikit's random forest classifier library
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
# Load pandas
import pandas as pd
# Load numpy
import numpy as np
# Set random seed
np.random.seed(0)
from sklearn.preprocessing import Imputer
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt 
%matplotlib inline

In [None]:
# Read the whitespace-separated column numbers to load. They are taken from
# the parsed column labels (the header row) of col_index.txt.
# `sep=r"\s+"` replaces the deprecated `delim_whitespace` flag.
mycol = pd.read_csv("rawdata/col_index.txt", sep=r"\s+")
# Shift the 1-based numbers to 0-based positions for `usecols`, then append
# positions 13 and 14 (latitude and longitude).
myindex = [int(label) - 1 for label in mycol.columns] + [13, 14]

In [None]:
# Load only the selected column positions from the GTD export, decoding the
# file as ISO-8859-1.
df = pd.read_csv(
    "rawdata/globalterrorismdb_0617dist.csv",
    usecols=myindex,
    encoding='ISO-8859-1',
)

# Data Inspection and Cleaning

## a) convert datatypes to category, numeric, integer 

In [None]:
# Columns cast to pandas 'category' dtype.
# "extended" is intentionally NOT listed here: it is cast to float below and
# later imputed as a numeric column, so a category cast would only be
# overwritten.
convert_to_cat = [
    "country_txt", "region_txt", "provstate",
    "crit1", "crit2", "crit3", "doubtterr",
    "suicide", "attacktype1_txt", "targtype1_txt", "natlty1_txt",
    "gname", "guncertain1", "individual", "weaptype1_txt",
    "property", "ishostkid",
    "INT_LOG", "INT_IDEO", "INT_MISC", "INT_ANY",
]

# Columns cast to float.
convert_to_float = ["extended", "nkill", "nwound", "latitude", "longitude"]

In [None]:
# Date parts are stored as integers.
date_parts = ['iday', 'imonth', 'iyear']
df[date_parts] = df[date_parts].astype(int)

# Apply the category casts first and the float casts second, one column at a
# time, so a column named in both lists ends up as float.
cast_plan = [(name, 'category') for name in convert_to_cat]
cast_plan += [(name, float) for name in convert_to_float]
for column_name, target_dtype in cast_plan:
    df[column_name] = df[column_name].astype(target_dtype)

## b) calculate new columns from data

In [None]:
# Assemble a datetime from the year/month/day columns. pd.to_datetime builds
# dates from a DataFrame whose columns are named 'year', 'month' and 'day';
# passing the three Series positionally would feed month and year into the
# `errors` and `dayfirst` parameters instead.
date_parts = df[["iyear", "imonth", "iday"]].rename(
    columns={"iyear": "year", "imonth": "month", "iday": "day"}
)
# errors="coerce": rows whose parts do not form a valid date become NaT
# rather than aborting the whole conversion.
# NOTE(review): presumably some records carry placeholder month/day values — confirm.
df["datetime"] = pd.to_datetime(date_parts, errors="coerce")
df = df.drop('iday', axis=1)

In [None]:
#df['imonth']=df['imonth'].astype('category')
#df['iyear']=df['iyear'].astype('category')

In [None]:
# Treat missing wounded/killed counts as 0 and store them as integers.
# The results must be assigned back: fillna/astype return new Series, so the
# bare expressions previously had no effect and NaN propagated into the sum.
df['nwound'] = df['nwound'].fillna(0).astype(int)
df['nkill'] = df['nkill'].fillna(0).astype(int)
# Total casualties per attack (column name spelling kept because later cells use it).
df['ncasualities'] = df['nkill'] + df['nwound']
# The two source columns are no longer needed once combined.
df = df.drop(['nwound', 'nkill'], axis=1)

## c) summarize data

In [None]:
# Describe the categorical columns and display the first ten columns of the
# summary.
dfsumcat = df.describe(include=['category'])
dfsumcat.iloc[:, :10]

In [None]:
# Display summary columns 10 through 19 of the categorical description.
dfsumcat.iloc[:,10:20]

In [None]:
# Describe the numeric columns and display the summary.
dfsumnum=df.describe(include=[np.number])
dfsumnum

### missing data analysis

In [None]:
# Fraction of missing values in each column.
missing_fraction = df.isnull().sum() / len(df)
# Check that at least 70 percent of each column is present. This expression
# is not the last one in the cell, so its result is evaluated but not shown.
missing_fraction < 0.30
# Displayed output: the per-column missing fraction.
missing_fraction

## d)  Data Imputation

In [None]:
# Impute the categorical columns, handled differently by number of categories.
# `.loc` is used for the column selection; the `.ix` indexer is deprecated and
# absent from current pandas releases.
cat_frame = df.select_dtypes(include=['category'])
# Boolean flag per categorical column: True when it has more than 12 distinct values.
col_many_cat = cat_frame.apply(lambda x: len(x.unique()) > 12)
# 1. data has many categories: label NA entries with an explicit 'missing' category
df_cat_hi = cat_frame.loc[:, col_many_cat]
df_cat_hi = df_cat_hi.apply(lambda x: x.cat.add_categories(['missing']).fillna('missing'))
# 2. data has few categories: impute NA entries with the most frequent value
df_cat_lw = cat_frame.loc[:, ~col_many_cat]
df_cat_lw = df_cat_lw.apply(lambda x: x.fillna(x.value_counts().index[0]))

In [None]:
# 3. numerical data: fill NA entries with the column median (some of the
#    data is skewed)
numeric_columns = ["extended", "ncasualities", "iyear", "imonth", "latitude", "longitude"]
df_num = df[numeric_columns].apply(lambda column: column.fillna(column.median()))

In [None]:
# Rebuild df from the three imputed pieces, side by side. Columns that are in
# none of the pieces are no longer part of df afterwards.
imputed_parts = [df_num, df_cat_hi, df_cat_lw]
df = pd.concat(imputed_parts, axis=1)

In [None]:
# Re-check the fraction of missing values per column after imputation.
df.isnull().sum()/len(df)

# modify data 

In [None]:
# Replace the casualty count by log10(count + 1).
casualty_counts = np.array(df["ncasualities"])
df["ncasualities"] = list(np.log10(casualty_counts + 1))
# Pair up the three criteria columns row by row. zip() is left unevaluated
# here; the next cell consumes it.
setcrit = zip(*(list(df[name]) for name in ("crit1", "crit2", "crit3")))

In [None]:
# Concatenate the string forms of the three criteria values of each row into
# a single combined label.
df["critall"] = ["".join(map(str, (s[0], s[1], s[2]))) for s in setcrit]

In [None]:
# Store the combined criteria label as a categorical column.
df['critall']=df['critall'].astype('category')

# Identify Spatial Terrorism Hot Zones 

In [None]:
from sklearn.cluster import DBSCAN

In [None]:
from geopy.distance import great_circle

In [None]:
from shapely.geometry import MultiPoint

In [None]:
# Define coordinates: a 2-column array of (latitude, longitude), one row per attack.
# Column selection + `.values` replaces DataFrame.as_matrix, which is
# deprecated and absent from current pandas releases.
coords = df[['latitude', 'longitude']].values

In [None]:
# Cluster on all coordinates (no subsetting is applied here).
subcoords=coords

In [None]:
# The haversine metric works in radians, so the 160 km neighbourhood radius
# (about 100 miles) is divided by this kilometres-per-radian factor.
kms_per_radian = 6371.0088
epsilon = 160 / kms_per_radian
# min_samples=1: a single point is enough to form a cluster.
clusterer = DBSCAN(
    eps=epsilon,
    min_samples=1,
    algorithm='ball_tree',
    metric='haversine',
)
coords_in_radians = np.radians(subcoords)
db = clusterer.fit(coords_in_radians)

In [None]:
# One cluster label per coordinate row.
cluster_labels = db.labels_
distinct_labels = set(cluster_labels)
num_clusters = len(distinct_labels)
# Collect the coordinates belonging to each label 0 .. num_clusters-1.
members_by_label = []
for label in range(num_clusters):
    members_by_label.append(subcoords[cluster_labels == label])
clusters = pd.Series(members_by_label)
print('Number of clusters: {}'.format(num_clusters))

In [None]:
# Attach each attack's cluster label as its hotspot id.
df["terrorism_hotspot"]=list(cluster_labels)

In [None]:
# Store the hotspot id as a categorical column rather than a number.
df["terrorism_hotspot"]=df["terrorism_hotspot"].astype('category')

### e) refine model and drop columns that are adding noise

In [None]:
# Remove the columns judged to add noise; latitude and longitude are dropped
# too, after the hotspot column has been derived from them.
columns_to_drop = [
    "individual",
    "crit1",
    "suicide",
    "ishostkid",
    "latitude",
    "longitude",
]
df = df.drop(columns_to_drop, axis=1)

# Create Training and Testing Data Sets

### a. splice out data that has no group assigned to the attack

In [None]:
# Split the attacks by whether a group name is recorded.
# `.loc` replaces the `.ix` indexer, which is deprecated and absent from
# current pandas releases.
group_is_unknown = df['gname'] == 'Unknown'
dfgu = df.loc[group_is_unknown, :]   # group unknown
dfgk = df.loc[~group_is_unknown, :]  # group known

In [None]:
# Sanity check: the known and unknown subsets together account for every row.
len(df)==len(dfgu)+len(dfgk) #check that this is all the data

### b. factorize categorical data

In [None]:
# Create a list of the feature column's names
dfgk_cat=dfgk.select_dtypes(include=['category']).apply(lambda x: pd.factorize(x)[0])


In [None]:
# Non-categorical columns are kept unchanged and joined, column-wise, to the
# right of the factorized categorical columns.
dfgk_num = dfgk.select_dtypes(exclude=['category'])
model_frames = [dfgk_cat, dfgk_num]
dfgk_fac = pd.concat(model_frames, axis=1)

### c. split data training and test  sets

In [None]:
# randomly label some rows as training and some as test data.
# A uniform draw in [0, 1) per row; rows drawing <= 0.70 become training rows.
dfgk_fac['is_train'] = np.random.uniform(0, 1, len(dfgk_fac)) <= .70 #is train? True/False (about 70% training)


In [None]:
# Partition the rows using the boolean is_train flag.
in_training_set = dfgk_fac['is_train']
train = dfgk_fac[in_training_set]
test = dfgk_fac[~in_training_set]

In [None]:
# The helper flag is not a feature: remove it from both sets, then report
# how many observations each set holds.
train, test = (part.drop("is_train", axis=1) for part in (train, test))
print('total attacks in the training data:', len(train))
print('total attacks in the test data:', len(test))


In [None]:
# Training target: the (factorized) group name of each training attack.
y=train["gname"]

In [None]:
# Feature names: every training column except the target 'gname'.
feature_frame = train.drop('gname', axis=1)
features = feature_frame.columns[:]

# Train Model

In [None]:
# Random forest classifier (clf): 30 trees of depth at most 15, fixed seed,
# 4 parallel jobs, no warm start and no out-of-bag scoring.
forest_settings = dict(
    n_estimators=30,
    max_depth=15,
    random_state=0,
    n_jobs=4,
    warm_start=False,
    oob_score=False,
)
clf = RandomForestClassifier(**forest_settings)
# Fit on the training features against the factorized group names.
clf.fit(train[features], y)

# Test Model on Data Piecewise to Avoid Memory Issues

### test on a subset of the held-out test data

pull off a subset of the test data

In [None]:
# Number of rows available in the test set.
len(test)

In [None]:
# Number of test rows to evaluate: at most 27500, and never more than the
# test set holds. The size is rounded down to a multiple of 20 because the
# rows are later reshaped into batches of exactly 20.
div20 = min(27500, len(test) // 20 * 20)
mytest = test.iloc[0:div20]

write a function that predicts the responsible group for the attacks at the given indices

In [None]:
def predset(index_attack):
    """Predict the group for the rows of ``mytest`` at the given positions.

    ``index_attack`` holds positional row indices into ``mytest``. The
    feature columns of those rows are passed to ``clf.predict`` and its
    result is returned unchanged.
    """
    batch_features = mytest[features].iloc[index_attack, :]
    return clf.predict(batch_features)

create an index set for attacks (20 in each)

In [None]:
# Row positions 0 .. div20-1 arranged as batches of 20 (one batch per row of
# `mysets`). Floor division keeps the batch count an integer: `/` yields a
# float under Python 3, which np.reshape rejects as a dimension.
mysets = np.reshape(np.arange(0, div20), (div20 // 20, 20))  # test 20 at a time
# One prediction slot per evaluated attack, filled in batch by batch.
preds = np.zeros(div20)

have the model predict the group responsible

In [None]:
# Predict one batch of row positions at a time and store each batch's
# predictions at the same positions of `preds`.
for batch_positions in mysets:
    batch_predictions = predset(batch_positions)
    preds[batch_positions] = batch_predictions

see how accurate the predictions were

# Evaluate Model

In [None]:
# Share of evaluated test attacks whose predicted group matches the recorded one.
accuracy = accuracy_score(y_true=mytest['gname'], y_pred=preds)

In [None]:
# Display the accuracy computed in the previous cell.
accuracy

In [None]:
# Confusion matrix of recorded vs. predicted group for the evaluated test rows.
plt.rcParams['figure.figsize'] = [12, 7]
conf = confusion_matrix(mytest['gname'], preds)
# Show only rows/columns 1 through 9 of the matrix as an image.
conf_window = conf[1:10, 1:10]
plt.imshow(conf_window, interpolation='None',)


In [None]:
#list(zip(train[features], clf.feature_importances_))

In [None]:
# Horizontal bar chart of the fitted forest's feature importances, sorted
# ascending so the largest importance gets the highest bar position.
importances = clf.feature_importances_
findex = np.argsort(importances)
bar_positions = range(len(findex))
plt.rcParams['figure.figsize'] = [12, 7]
plt.figure(1)
plt.title('The Importance of the Predictors')
plt.barh(bar_positions, importances[findex], color='g', align='center')
# Label each bar with the name of its feature.
plt.yticks(bar_positions, features[findex])
plt.xlabel('Relative Importance')

#### Hack to Give us OOB score

In [None]:
# generated a random forest classifer (clf)
clf = RandomForestClassifier(n_jobs=4,max_depth=15,random_state=0,warm_start=True,oob_score=True,n_estimators=1)
# training the classifer on the terrorist group names
clf.fit(train[features].iloc[0:2000],y[0:2000])

In [None]:
# One minus the out-of-bag score of the model fitted in the previous cell.
1-clf.oob_score_

This is close to what we found when we tested our model on data it had never seen before.