In [1]:
import pandas as pd

from category_encoders.one_hot import OneHotEncoder

In [2]:
# Load the Pakistan events CSV into a DataFrame and preview its first rows.
data = pd.read_csv(filepath_or_buffer="./DATA/pakistanClean.csv")
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,eventid,iyear,imonth,iday,region,region_txt,provstate,city,latitude,...,scite3,dbsource,INT_LOG,INT_IDEO,INT_MISC,INT_ANY,related,Date,Month,TTP
0,1,200712030005,2007,12,3,6,South Asia,Balochistan,Qilla Abdullah district,30.80363,...,,CETIS,-9,-9,0,-9,,2007-12-03,Dec,
1,2,200712040005,2007,12,4,6,South Asia,Khyber Pakhtunkhwa,Peshawar,34.006004,...,,CETIS,-9,-9,0,-9,,2007-12-04,Dec,
2,3,200712060008,2007,12,6,6,South Asia,Balochistan,Dera Bugti,29.034412,...,,CETIS,0,0,0,0,,2007-12-06,Dec,0.0
3,4,200712080003,2007,12,8,6,South Asia,Balochistan,Khuzdar,27.809921,...,,CETIS,-9,-9,0,-9,,2007-12-08,Dec,
4,5,200712090002,2007,12,9,6,South Asia,Balochistan,Nasirabad,28.458421,...,,CETIS,-9,-9,0,-9,,2007-12-09,Dec,


In [3]:
# Load the data dictionary spreadsheet (variable names and their definitions)
# and preview its first rows.
data_dct = pd.read_excel(io='./DATA/PakistanDataDictionary.xlsx')
data_dct.head(n=5)

Unnamed: 0,Variable Name,Variable Definition
0,eventid,Unique event identifier
1,iyear,Year of event
2,imonth,Month of event
3,iday,Day of event
4,region,Region within Pakistan where event took place (numeric id)


Columns that are not in the data dictionary need to be removed (the derived `Month` and `TTP` columns are kept).

In [4]:
# Collect the columns that appear in the data but not in the data dictionary;
# 'Month' and 'TTP' are excluded from this removal set so they are retained.
extra_cols = set(data.columns).difference(data_dct['Variable Name'], {'Month', 'TTP'})
len(extra_cols)

33

### Time Engineering

Add a column that tracks the day of the week. Terror groups may change their behavior depending on the day of the week.

In [5]:
# Parse the 'Date' values into timestamps, then store each event's weekday name.
event_dates = pd.to_datetime(data['Date'])
data['dayofweek'] = event_dates.dt.day_name()

### Remove bad/useless columns
All attacks happen in the same region, so `region` and `region_txt` carry no information. A "Month" column also exists, so `imonth` is no longer needed.

In [6]:
# Queue the redundant region and month columns for deletion together with the
# columns already collected in `extra_cols`.
remove_cols = ['region', 'region_txt', 'imonth']
extra_cols |= set(remove_cols)

In [7]:
# Remove every column collected in `extra_cols` with a single call.
# A per-column `del` loop would leave `data` partly pruned if one name were
# missing; here `data` is only rebound when the whole drop succeeds, and a
# missing name raises KeyError with the frame untouched.
data = data.drop(columns=list(extra_cols))

In [8]:
# Write the pruned frame to disk; the row index is left out of the file.
data.to_csv(path_or_buf='./DATA/pakistanClean2.csv', index=False)

### Categorical One Hot Encoding
Certain models require dummy variables. Below I convert the categorical variables into indicator flags.

In [12]:
# Categorical features to expand into one-hot indicator columns.
cat_cols = [
    'iyear', 'Month', 'iday', 'dayofweek',
    'provstate', 'attacktype1_txt', 'targtype1_txt', 'weaptype1_txt',
]

In [13]:
# Cast the year and day-of-month columns to strings
# (presumably so the encoder treats them as categorical values -- confirm).
data = data.astype({'iyear': str, 'iday': str})

In [14]:
# One-hot encoder configured so that output columns are named after the
# category values (e.g. "iyear_2007") rather than positional indices.
encoder = OneHotEncoder(
    use_cat_names=True,
    handle_unknown='ignore',
)

In [15]:
# Fit the encoder on the categorical columns and build their indicator
# columns in one step, then preview the result.
encoded_df = encoder.fit_transform(data.loc[:, cat_cols])
encoded_df.head(n=5)

Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
NumExpr defaulting to 8 threads.


Unnamed: 0,iyear_2007,iyear_2008,iyear_2009,iyear_2010,iyear_2011,iyear_2012,iyear_2013,iyear_2014,iyear_2015,iyear_2016,...,targtype1_txt_Other,targtype1_txt_Maritime,weaptype1_txt_Explosives,weaptype1_txt_Firearms,weaptype1_txt_Unknown,weaptype1_txt_Melee,weaptype1_txt_Incendiary,weaptype1_txt_Chemical,weaptype1_txt_Other,"weaptype1_txt_Vehicle (not to include vehicle-borne explosives, i.e., car or truck bombs)"
0,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [16]:
# Attach the indicator columns to the main frame, matching rows on the index,
# and preview the combined frame.
data = data.join(other=encoded_df, how='left')
data.head(n=5)

Unnamed: 0,eventid,iyear,iday,provstate,city,latitude,longitude,location,summary,multiple,...,targtype1_txt_Other,targtype1_txt_Maritime,weaptype1_txt_Explosives,weaptype1_txt_Firearms,weaptype1_txt_Unknown,weaptype1_txt_Melee,weaptype1_txt_Incendiary,weaptype1_txt_Chemical,weaptype1_txt_Other,"weaptype1_txt_Vehicle (not to include vehicle-borne explosives, i.e., car or truck bombs)"
0,200712030005,2007,3,Balochistan,Qilla Abdullah district,30.80363,66.711752,,12/03/2007: A bomb exploded in a madrassa in Pakistan killing six. Five students of the madrassa were also injured in the attack by unknown perpetrators.,0,...,0,0,1,0,0,0,0,0,0,0
1,200712040005,2007,4,Khyber Pakhtunkhwa,Peshawar,34.006004,71.53743,,"12/04/2007: A female suicide bomber detonated herself in Peshawar, Pakistan. No other casualties were reported in what was believed to have been the first female suicide attack in Pakistan's history.",0,...,0,0,1,0,0,0,0,0,0,0
2,200712060008,2007,6,Balochistan,Dera Bugti,29.034412,69.158661,,"12/06/2007: A remote controlled roadside bomb went off when two Pakistani military vehicles passed by in Dera Bugti, Balochistan. The blast killed a paramilitary soldier and wounded four others. The Baloch Republican Army claimed responsibility for the attack; however Pakistani officials denied the claim.",0,...,0,0,1,0,0,0,0,0,0,0
3,200712080003,2007,8,Balochistan,Khuzdar,27.809921,66.620956,,"12/08/2007: A police station was fired on in Khuzdar, Pakistan, by unknown attackers. No one was injured.",0,...,0,0,0,1,0,0,0,0,0,0
4,200712090002,2007,9,Balochistan,Nasirabad,28.458421,68.133223,,"12/09/2007: Thee members of Benazir Bhutto's Pakistan People's Party (PPP) were killed by attackers in Nasirabad, Pakistan. One attacker was killed in the incident. Ms. Bhutto herself would be killed on December 27, 2007.",0,...,0,0,0,1,0,0,0,0,0,0


In [17]:
# Remove the original categorical columns listed in `cat_cols` with a single call.
# A per-column `del` loop would leave `data` partly pruned if one name were
# missing; here `data` is only rebound when the whole drop succeeds, and a
# missing name raises KeyError with the frame untouched.
data = data.drop(columns=cat_cols)

In [16]:
# Write the one-hot encoded frame to disk; the row index is left out of the file.
data.to_csv(path_or_buf='./DATA/pakistanCleanOneHot.csv', index=False)