# Importing Data
This notebook might just be the most exciting one! We will import our data and save it to a pickle file for use throughout the rest of our project.

In [1]:
import datetime as dt
import glob

import numpy as np
import pandas as pd

In [2]:
# Collect the candidate CSV files with the standard library instead of the
# IPython-only `!ls` shell magic. Sorting keeps the choice of path[0]
# deterministic.
path = sorted(glob.glob('/data/augiedoebling/*.csv'))
# Fail here with a clear message; with `!ls`, a missing file would only surface
# later as a confusing read_csv error on ls's own error text.
assert path, 'no CSV files found in /data/augiedoebling/'

In [3]:
# Read the first CSV that was found. The file is decoded as ISO-8859-1 and
# parsed with low_memory=False.
fulldata = pd.read_csv(
    path[0],
    encoding='ISO-8859-1',
    low_memory=False,
)
# Preview the first rows of the raw frame.
fulldata.head()

Unnamed: 0,eventid,iyear,imonth,iday,approxdate,extended,resolution,country,country_txt,region,...,addnotes,scite1,scite2,scite3,dbsource,INT_LOG,INT_IDEO,INT_MISC,INT_ANY,related
0,197000000001,1970,0,0,,0,,58,Dominican Republic,2,...,,,,,PGIS,0,0,0,0,
1,197000000002,1970,0,0,,0,,130,Mexico,1,...,,,,,PGIS,0,1,1,1,
2,197001000001,1970,1,0,,0,,160,Philippines,5,...,,,,,PGIS,-9,-9,1,1,
3,197001000002,1970,1,0,,0,,78,Greece,8,...,,,,,PGIS,-9,-9,1,1,
4,197001000003,1970,1,0,,0,,101,Japan,4,...,,,,,PGIS,-9,-9,1,1,


In [4]:
# Number of rows in the raw data.
fulldata.shape[0]

156772

While there are 137 columns in this data set, the following table shows the columns I will use for this project. Each column is either copied directly or transformed from a column in the original data.

<table>
  <tr>
    <th>id</th>
    <th>date</th>
    <th>country</th>
    <th>region</th>
    <th>region_code</th>
    <th>city</th>
    <th>lat</th>
    <th>long</th>
    <th>summary</th>
    <th>PERS</th>
    <th>CIP</th>
    <th>outsidehumlaw</th>
    <th>doubt</th>
    <th>alternative</th>
    <th>multiple</th>
    <th>success</th>
    <th>suicide</th>
    <th>type</th>
    <th>nat1</th>
    <th>nat2</th>
    <th>nat3</th>
    <th>group</th>
    <th>attackercount</th>
    <th>claimed</th>
    <th>weapon</th>
    <th>fatalities</th>
    <th>fatalitiesus</th>
    <th>wounded</th>
    <th>woundedus</th>
    <th>propdamage</th>
    <th>propvalue</th>
    <th>hostkid</th>
    <th>international</th>
    <th>related</th>
    <th>inUS_yes</th>
    <th>byUScit_yes</th>
  </tr>
  <tr>
    <td>unique date-based identifier</td>
    <td>date event began</td>
    <td>country of location</td>
    <td>region of location</td>
    <td>numeric coding of region</td>
    <td>city of location</td>
    <td>latitude of location</td>
    <td>longitude of location</td>
    <td>summary of the attack</td>
    <td>Was the attack's goal Political, Economic, Religious or Social?</td>
    <td>Was the attack's goal to Coerce, Intimidate or Publicize to an audience?</td>
    <td>Was the attack outside the context of legitimate warfare activities?</td>
    <td>Is there any doubt the attack was an act of terrorism</td>
    <td>Other possible reason if there is doubt</td>
    <td>If the attack is a multiple-incident event</td>
    <td>Was the attack successful</td>
    <td>Was the attack a suicide attack</td>
    <td>The type of attack the incident was (could be multiple)</td>
    <td>Nationality of the attacker(s), could be multiple</td>
    <td>Nationality of the attacker(s), could be multiple</td>
    <td>Nationality of the attacker(s), could be multiple</td>
    <td>Name of group carrying out the attack</td>
    <td>Number of terrorists participating</td>
    <td>Whether responsibility was claimed</td>
    <td>Type of weapon used to carry out the attack, could be multiple</td>
    <td>Number of total confirmed fatalities for the attack</td>
    <td>Number of confirmed fatalities for the attack who were US citizens</td>
    <td>Number of total injured for the attack</td>
    <td>Number of total injured for the attack who were US citizens</td>
    <td>Was there property damage</td>
    <td>How much was the property worth at the time of attack</td>
    <td>Were there victims taken hostage or kidnapped</td>
    <td>Was the attack planned outside of the country it took place</td>
    <td>IDs of any attacks that were related</td>
    <td>One-hot encoding (1/0) of whether the attack took place in the US</td>
    <td>One-hot encoding (1/0) of whether a US national was involved in the attack</td>
  </tr>
</table>

In [5]:
# Start from an empty frame; the following cells add the selected columns to it.
data = pd.DataFrame()

In [6]:
# Copy the raw event identifier unchanged.
data['id'] = fulldata['eventid']

In [7]:
# `np.nan in array` is always False because NaN never compares equal to itself,
# so the original membership test could not fail. Check for missing ids directly.
assert data['id'].notna().all(), 'id contains missing values'
# Every event must carry a non-zero id.
assert (data['id'] != 0).all(), 'id contains 0'

In [8]:
# A month or day of 0 cannot be parsed as a calendar date, so map it to 1
# before the date column is built. (Presumably 0 marks an unknown month/day;
# confirm against the dataset's codebook.)
# Note: this overwrites the raw columns in `fulldata`.
fulldata[['imonth', 'iday']] = fulldata[['imonth', 'iday']].replace(0, 1)

In [9]:
# Pack year/month/day into one YYYYMMDD number, render it as text and parse it.
yyyymmdd = fulldata['iyear'] * 10000 + fulldata['imonth'] * 100 + fulldata['iday']
data['date'] = pd.to_datetime(yyyymmdd.astype(str), format='%Y%m%d')

In [10]:
# Sanity-check that every parsed date falls within 1970-2015.
event_years = data['date'].dt.year
assert 1970 <= event_years.min()
assert event_years.max() <= 2015

In [11]:
# Use the textual country name rather than the numeric `country` code.
data['country'] = fulldata['country_txt']

In [12]:
# `np.nan in array` is always False because NaN never compares equal to itself,
# so the original membership test could not fail. Check for missing values directly.
assert data['country'].notna().all(), 'country contains missing values'
assert (data['country'] != "").all(), 'country contains empty strings'

In [13]:
# Keep both representations of the region: the text label and its numeric code.
data['region'], data['region_code'] = fulldata['region_txt'], fulldata['region']

In [14]:
# `np.nan in array` is always False because NaN never compares equal to itself,
# so the original membership tests could not fail. Check for missing values directly.
assert data['region'].notna().all(), 'region contains missing values'
assert (data['region'] != "").all(), 'region contains empty strings'
assert data['region_code'].notna().all(), 'region_code contains missing values'

In [15]:
# Copy the city name unchanged.
data['city'] = fulldata['city']

In [16]:
# NOTE(review): the NaN check below can never fail. `x in array` compares with
# `==`, and NaN is not equal to itself, so missing cities go undetected. If city
# must always be present, use `assert data.city.notna().all()`; confirm against
# the data first, since a real check may abort the run.
assert np.nan not in data.city.unique()
# No city may be an empty string.
assert "" not in data.city.unique()

In [17]:
# Copy the latitude unchanged under a shorter name.
data['lat'] = fulldata['latitude']

In [18]:
# Copy the longitude unchanged under a shorter name.
data['long'] = fulldata['longitude']

In [19]:
# NOTE(review): these checks can never fail. `x in array` compares with `==`,
# and NaN is not equal to itself, so missing coordinates go undetected. If
# coordinates must always be present, use `.notna().all()`; confirm against the
# data first, since a real check may abort the run.
assert np.nan not in data.lat.unique()
assert np.nan not in data.long.unique()

In [20]:
# Copy the free-text summary unchanged.
data['summary'] = fulldata['summary']

In [21]:
# NOTE(review): the NaN check below can never fail. `x in array` compares with
# `==`, and NaN is not equal to itself, so missing summaries go undetected. If a
# summary must always be present, use `assert data.summary.notna().all()`;
# confirm against the data first, since a real check may abort the run.
assert np.nan not in data.summary.unique()
# No summary may be an empty string.
assert "" not in data.summary.unique()

In [22]:
# Rename the three raw criterion columns to descriptive names.
criteria_columns = {'PERS': 'crit1', 'CIP': 'crit2', 'outsidehumlaw': 'crit3'}
for new_name, raw_name in criteria_columns.items():
    data[new_name] = fulldata[raw_name]

In [23]:
# `np.nan in array` is always False because NaN never compares equal to itself,
# so the original membership tests could not fail. Check for missing values directly.
assert data['PERS'].notna().all(), 'PERS contains missing values'
assert data['CIP'].notna().all(), 'CIP contains missing values'
assert data['outsidehumlaw'].notna().all(), 'outsidehumlaw contains missing values'

In [24]:
# Turn the -9 sentinel into a proper missing value.
data['doubt'] = fulldata['doubtterr'].replace({-9: np.nan})

In [25]:
# The -9 sentinel must be gone after the replacement.
assert not (data['doubt'] == -9).any()

In [26]:
# Store the '.' placeholder as a proper missing value (NaN).
data['alternative'] = fulldata['alternative_txt'].replace('.', np.nan)
# Neither the placeholder nor an empty string may survive.
assert (data['alternative'] != '.').all()
assert (data['alternative'] != '').all()
# There is deliberately no "no NaN" assertion: NaN is the missing-value marker
# written just above. The original `assert np.nan not in ...unique()` could
# never fail (NaN != NaN) and contradicted that replacement.

In [27]:
data['multiple'] = fulldata['multiple']
# The flag must take exactly the values 0 and 1. Comparing as a set avoids
# depending on which value happens to appear first in the data, because
# `unique()` returns values in order of first appearance.
assert set(data['multiple'].unique()) == {0, 1}

In [28]:
data['success'] = fulldata['success']
# The flag must take exactly the values 0 and 1. Comparing as a set avoids
# depending on which value happens to appear first in the data, because
# `unique()` returns values in order of first appearance.
assert set(data['success'].unique()) == {0, 1}

In [29]:
data['suicide'] = fulldata['suicide']
# The flag must take exactly the values 0 and 1. Comparing as a set avoids
# depending on which value happens to appear first in the data, because
# `unique()` returns values in order of first appearance.
assert set(data['suicide'].unique()) == {0, 1}

In [30]:
# Store the '.' placeholder as a proper missing value (NaN).
data['type'] = fulldata['attacktype1_txt'].replace('.', np.nan)
# Neither the placeholder nor an empty string may survive.
assert (data['type'] != '.').all()
assert (data['type'] != '').all()
# There is deliberately no "no NaN" assertion: NaN is the missing-value marker
# written just above. The original `assert np.nan not in ...unique()` could
# never fail (NaN != NaN) and contradicted that replacement.

In [31]:
# Build nat1..nat3 from natlty1_txt..natlty3_txt, storing the '.' placeholder
# as a proper missing value (NaN).
for slot in ('1', '2', '3'):
    nat_column = 'nat' + slot
    data[nat_column] = fulldata['natlty' + slot + '_txt'].replace('.', np.nan)
    # Neither the placeholder nor an empty string may survive.
    assert (data[nat_column] != '.').all(), nat_column + " still contains '.'"
    assert (data[nat_column] != '').all(), nat_column + ' contains empty strings'
# There is deliberately no "no NaN" assertion: NaN is the missing-value marker
# written above. The original `assert np.nan not in ...unique()` checks could
# never fail (NaN != NaN) and contradicted that replacement.

In [32]:
# Copy the group name unchanged.
data['group'] = fulldata['gname']

# The group name may be neither the '.' placeholder nor an empty string.
assert '.' not in data.group.unique()
assert '' not in data.group.unique()
# NOTE(review): the NaN check below can never fail. `x in array` compares with
# `==`, and NaN is not equal to itself. If a group name must always be present,
# use `assert data.group.notna().all()`; confirm against the data first.
assert np.nan not in data.group.unique()

In [33]:
# Both -9 and -99 are sentinel values; store them as missing (NaN).
data['attackercount'] = fulldata['nperps'].replace({-9: np.nan, -99: np.nan})
# Neither sentinel may survive the replacement.
assert not (data['attackercount'] == -9).any()
assert not (data['attackercount'] == -99).any()

In [34]:
# Turn the -9 sentinel into a proper missing value.
data['claimed'] = fulldata['claimed'].replace({-9: np.nan})
# The sentinel must be gone after the replacement.
assert not (data['claimed'] == -9).any()

In [35]:
# Treat both the '.' placeholder and the literal 'Unknown' as missing (NaN).
data['weapon'] = fulldata['weaptype1_txt'].replace({'.': np.nan, 'Unknown': np.nan})
# Neither the placeholder nor an empty string may survive.
assert not (data['weapon'] == '.').any()
assert not (data['weapon'] == '').any()

In [36]:
# Missing counts are stored as 0.
data['fatalities'] = fulldata['nkill'].fillna(0)
# `np.nan in array` is always False (NaN != NaN), so the original membership
# test could not fail. Verify directly that the fill left nothing missing.
assert data['fatalities'].notna().all()
# Neither sentinel value may appear among the counts.
assert not data['fatalities'].isin([-9, -99]).any()

In [37]:
# Missing counts are stored as 0.
data['fatalitiesus'] = fulldata['nkillus'].fillna(0)
# `np.nan in array` is always False (NaN != NaN), so the original membership
# test could not fail. Verify directly that the fill left nothing missing.
assert data['fatalitiesus'].notna().all()
# Neither sentinel value may appear among the counts.
assert not data['fatalitiesus'].isin([-9, -99]).any()

In [38]:
# Missing counts are stored as 0.
data['wounded'] = fulldata['nwound'].fillna(0)
# `np.nan in array` is always False (NaN != NaN), so the original membership
# test could not fail. Verify directly that the fill left nothing missing.
assert data['wounded'].notna().all()
# Neither sentinel value may appear among the counts.
assert not data['wounded'].isin([-9, -99]).any()

In [39]:
# Missing counts are stored as 0.
data['woundedus'] = fulldata['nwoundus'].fillna(0)
# `np.nan in array` is always False (NaN != NaN), so the original membership
# test could not fail. Verify directly that the fill left nothing missing.
assert data['woundedus'].notna().all()
# Neither sentinel value may appear among the counts.
assert not data['woundedus'].isin([-9, -99]).any()

In [40]:
# Turn the -9 sentinel into a proper missing value.
data['propdamage'] = fulldata['property'].replace({-9: np.nan})
# The sentinel must be gone after the replacement.
assert not (data['propdamage'] == -9).any()

In [41]:
# Missing values and sentinels are both stored as 0. The original neutralised
# only -9; -99 is handled too, matching the sentinels treated elsewhere in this
# notebook, so that no negative "value" leaks into the cleaned data.
# NOTE(review): confirm against the codebook that -99 is a sentinel for propvalue.
data['propvalue'] = fulldata['propvalue'].fillna(0).replace([-9, -99], 0)
assert not data['propvalue'].isin([-9, -99]).any()
# `np.nan in array` is always False (NaN != NaN), so the original membership
# test could not fail. Verify directly that the fill left nothing missing.
assert data['propvalue'].notna().all()

In [42]:
# Turn the -9 sentinel into a proper missing value.
data['hostkid'] = fulldata['ishostkid'].replace(-9, np.nan)
assert not (data['hostkid'] == -9).any()
# The original `3 == len(unique())` counted NaN as one of the values and never
# checked which values were present. Check the non-missing values explicitly.
assert set(data['hostkid'].dropna().unique()) == {0, 1}

In [43]:
# Turn the -9 sentinel into a proper missing value.
data['international'] = fulldata['INT_ANY'].replace(-9, np.nan)
assert not (data['international'] == -9).any()
# The original `3 == len(unique())` counted NaN as one of the values and never
# checked which values were present. Check the non-missing values explicitly.
assert set(data['international'].dropna().unique()) == {0, 1}

In [44]:
# Split the comma-and-space separated text into a list of entries per row.
related_raw = fulldata['related']
data['related'] = related_raw.str.split(pat=', ')

In [45]:
# US country code = 217
# 1 where the event's numeric country code is the US code, otherwise 0.
data['inUS_yes'] = np.where(fulldata['country'] == 217, 1, 0)
# Compare as a set: `unique()` returns values in order of first appearance, so
# the original list comparison depended on which value came first in the data.
assert set(data['inUS_yes'].unique()) == {0, 1}

In [46]:
# Flag, per nationality column, the rows whose value is 'United States'.
nat1 = np.where(data['nat1'] == 'United States', 1, 0)
nat2 = np.where(data['nat2'] == 'United States', 1, 0)
nat3 = np.where(data['nat3'] == 'United States', 1, 0)
# A row is flagged when any of the three nationality columns matches.
data['byUScit_yes'] = nat1 | nat2 | nat3
# Compare as a set: `unique()` returns values in order of first appearance, so
# the original list comparison depended on which value came first in the data.
assert set(data['byUScit_yes'].unique()) == {0, 1}

In [47]:
# Persist the cleaned frame as a pickle file.
data.to_pickle(path='/data/augiedoebling/pickledGTD')