In [25]:
import pandas as pd

from category_encoders.one_hot import OneHotEncoder

In [26]:
# Load the previously cleaned event table and preview the first rows.
data = pd.read_csv(filepath_or_buffer="./DATA/pakistanClean.csv")
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,eventid,iyear,imonth,iday,region,region_txt,provstate,city,latitude,...,scite3,dbsource,INT_LOG,INT_IDEO,INT_MISC,INT_ANY,related,Date,Month,TTP
0,1,200712030005,2007,12,3,6,South Asia,Balochistan,Qilla Abdullah district,30.80363,...,,CETIS,-9,-9,0,-9,,2007-12-03,Dec,
1,2,200712040005,2007,12,4,6,South Asia,Khyber Pakhtunkhwa,Peshawar,34.006004,...,,CETIS,-9,-9,0,-9,,2007-12-04,Dec,
2,3,200712060008,2007,12,6,6,South Asia,Balochistan,Dera Bugti,29.034412,...,,CETIS,0,0,0,0,,2007-12-06,Dec,0.0
3,4,200712080003,2007,12,8,6,South Asia,Balochistan,Khuzdar,27.809921,...,,CETIS,-9,-9,0,-9,,2007-12-08,Dec,
4,5,200712090002,2007,12,9,6,South Asia,Balochistan,Nasirabad,28.458421,...,,CETIS,-9,-9,0,-9,,2007-12-09,Dec,


### Time Engineering

In [27]:
# Parse the 'Date' column as datetimes and store each row's weekday name
# in a 'dayofweek' column.
data = data.assign(dayofweek=pd.to_datetime(data['Date']).dt.day_name())

### Remove bad/useless columns
**TODO:** Revisit whether `related` should really be dropped — and possibly the `scite1`–`scite3` columns as well.

In [28]:
# Columns that are dropped before any further feature work.
remove_cols = ['Unnamed: 0', 'region', 'region_txt', 'scite1', 'scite2', 'scite3', 'related']

# Drop them in a single call. errors="ignore" keeps this cell re-runnable:
# on a second execution the columns are already gone, and a per-column
# `del data[col]` would raise KeyError.
data = data.drop(columns=remove_cols, errors="ignore")

In [29]:
# Checkpoint the trimmed frame to disk; the row index is not written.
data.to_csv(path_or_buf='./DATA/pakistanClean2.csv', index=False)

### Categorical One Hot Encoding

In [30]:
# Columns handed to the one-hot encoder, in the order the encoded
# output columns are generated.
cat_cols = [
    'iyear',
    'Month',
    'iday',
    'dayofweek',
    'provstate',
    'attacktype1_txt',
    'targtype1_txt',
    'weaptype1_txt',
    'propextent_txt',
    'divert',
    'kidhijcountry',
    'hostkidoutcome_txt',
    'dbsource',
]

In [31]:
# Cast the year and day-of-month columns to strings before they are
# passed to the encoder.
for numeric_col in ('iyear', 'iday'):
    data[numeric_col] = data[numeric_col].astype(str)

In [32]:
# One-hot encoder whose output columns are named "<column>_<category>"
# (use_cat_names=True).
# NOTE(review): recent category_encoders releases document handle_unknown
# as one of 'error', 'return_nan', 'value', 'indicator'; 'ignore' may only
# be meaningful for an older release — confirm against the installed version.
encoder = OneHotEncoder(
    use_cat_names=True,
    handle_unknown='ignore',
)

In [33]:
# Fit the encoder on the selected columns and build the encoded frame
# in one step, then preview it.
encoded_df = encoder.fit_transform(data.loc[:, cat_cols])
encoded_df.head(n=5)

Unnamed: 0,iyear_2007,iyear_2008,iyear_2009,iyear_2010,iyear_2011,iyear_2012,iyear_2013,iyear_2014,iyear_2015,iyear_2016,...,hostkidoutcome_txt_Hostage(s) released by perpetrators,hostkidoutcome_txt_Hostage(s) killed (not during rescue attempt),hostkidoutcome_txt_Combination,hostkidoutcome_txt_Successful Rescue,hostkidoutcome_txt_Attempted Rescue,hostkidoutcome_txt_Hostage(s) escaped (not during rescue attempt),dbsource_CETIS,dbsource_UMD Miscellaneous,dbsource_ISVG,dbsource_START Primary Collection
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [34]:
# Attach the encoded columns to the original frame, aligned on the row
# index (left join, which is DataFrame.join's default), then preview.
data = data.join(other=encoded_df, how='left')
data.head(n=5)

Unnamed: 0,eventid,iyear,imonth,iday,provstate,city,latitude,longitude,location,summary,...,hostkidoutcome_txt_Hostage(s) released by perpetrators,hostkidoutcome_txt_Hostage(s) killed (not during rescue attempt),hostkidoutcome_txt_Combination,hostkidoutcome_txt_Successful Rescue,hostkidoutcome_txt_Attempted Rescue,hostkidoutcome_txt_Hostage(s) escaped (not during rescue attempt),dbsource_CETIS,dbsource_UMD Miscellaneous,dbsource_ISVG,dbsource_START Primary Collection
0,200712030005,2007,12,3,Balochistan,Qilla Abdullah district,30.80363,66.711752,,12/03/2007: A bomb exploded in a madrassa in Pakistan killing six. Five students of the madrassa were also injured in the attack by unknown perpetrators.,...,0,0,0,0,0,0,1,0,0,0
1,200712040005,2007,12,4,Khyber Pakhtunkhwa,Peshawar,34.006004,71.53743,,"12/04/2007: A female suicide bomber detonated herself in Peshawar, Pakistan. No other casualties were reported in what was believed to have been the first female suicide attack in Pakistan's history.",...,0,0,0,0,0,0,1,0,0,0
2,200712060008,2007,12,6,Balochistan,Dera Bugti,29.034412,69.158661,,"12/06/2007: A remote controlled roadside bomb went off when two Pakistani military vehicles passed by in Dera Bugti, Balochistan. The blast killed a paramilitary soldier and wounded four others. The Baloch Republican Army claimed responsibility for the attack; however Pakistani officials denied the claim.",...,0,0,0,0,0,0,1,0,0,0
3,200712080003,2007,12,8,Balochistan,Khuzdar,27.809921,66.620956,,"12/08/2007: A police station was fired on in Khuzdar, Pakistan, by unknown attackers. No one was injured.",...,0,0,0,0,0,0,1,0,0,0
4,200712090002,2007,12,9,Balochistan,Nasirabad,28.458421,68.133223,,"12/09/2007: Thee members of Benazir Bhutto's Pakistan People's Party (PPP) were killed by attackers in Nasirabad, Pakistan. One attacker was killed in the incident. Ms. Bhutto herself would be killed on December 27, 2007.",...,0,0,0,0,0,0,1,0,0,0


In [35]:
# Persist the frame with its one-hot columns. index=False keeps the row
# index out of the file, matching the pakistanClean2.csv export; without it
# the index is written as an unnamed first column that comes back as
# 'Unnamed: 0' when the file is read again.
data.to_csv('./DATA/pakistanCleanOneHot.csv', index=False)