# Preprocess Natural Disasters

http://www.unitedstateszipcodes.org/zip-code-database/

- Download free
- Download csv

## Reading in the data

In [57]:
import pandas as pd

disasters = pd.read_csv('DisasterDeclarationsSummaries.csv', usecols=[5,7,8,9,10,11,13,14], dtype=object,
                       header=0, names=['state_abbr', 'disaster_type', 'incident_type', 
                                        'incident_title', 'date_started', 'date_ended', 
                                        'county', 'fip_code'])
print disasters.info()
disasters.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46304 entries, 0 to 46303
Data columns (total 8 columns):
state_abbr        46304 non-null object
disaster_type     46304 non-null object
incident_type     46304 non-null object
incident_title    46304 non-null object
date_started      46304 non-null object
date_ended        45932 non-null object
county            46098 non-null object
fip_code          46087 non-null object
dtypes: object(8)
memory usage: 2.8+ MB
None


Unnamed: 0,state_abbr,disaster_type,incident_type,incident_title,date_started,date_ended,county,fip_code
0,LA,DR,Flood,FLOOD,1953-05-29T00:00:00 -04:00,1953-05-29T00:00:00 -04:00,,
1,GA,DR,Tornado,TORNADO,1953-05-02T00:00:00 -04:00,1953-05-02T00:00:00 -04:00,,
2,MI,DR,Tornado,TORNADO,1953-06-02T00:00:00 -04:00,1953-06-02T00:00:00 -04:00,,
3,CA,DR,Flood,FLOOD & EROSION,1954-02-05T00:00:00 -05:00,1954-02-05T00:00:00 -05:00,,
4,CT,DR,Hurricane,HURRICANES,1954-09-17T00:00:00 -04:00,1954-09-17T00:00:00 -04:00,,


## Updating the dates

In [58]:
# get dates into a datetime
disasters['date_started'] = disasters['date_started'].str.replace('T.*', '')
disasters['date_ended'] = disasters['date_ended'].str.replace('T.*', '')

# get the difference in days
disasters['days_lasted'] = pd.to_datetime(disasters['date_ended']) - pd.to_datetime(disasters['date_started'])

# http://pandas.pydata.org/pandas-docs/stable/timedeltas.html
# change to int
disasters['days_lasted'] = disasters['days_lasted'].dt.days

disasters.head()

Unnamed: 0,state_abbr,disaster_type,incident_type,incident_title,date_started,date_ended,county,fip_code,days_lasted
0,LA,DR,Flood,FLOOD,1953-05-29,1953-05-29,,,0.0
1,GA,DR,Tornado,TORNADO,1953-05-02,1953-05-02,,,0.0
2,MI,DR,Tornado,TORNADO,1953-06-02,1953-06-02,,,0.0
3,CA,DR,Flood,FLOOD & EROSION,1954-02-05,1954-02-05,,,0.0
4,CT,DR,Hurricane,HURRICANES,1954-09-17,1954-09-17,,,0.0


## Subsetting data

In [59]:
# check the unique values from incident type
disasters['incident_type'].unique()

array(['Flood', 'Tornado', 'Hurricane', 'Fire', 'Other', 'Severe Storm(s)',
       'Earthquake', 'Typhoon', 'Drought', 'Volcano', 'Dam/Levee Break',
       'Toxic Substances', 'Snow', 'Severe Ice Storm', 'Freezing',
       'Coastal Storm', 'Fishing Losses', 'Mud/Landslide', 'Human Cause',
       'Terrorist', 'Chemical', 'Tsunami'], dtype=object)

In [60]:
# We want only specific natural disasters and up to 2013
dtype = ['Tornado', 'Hurricane', 'Fire', 'Earthquake', 
         'Typhoon', 'Volcanoe', 'Coastal Storm', 'Tsunami']

cond1 = disasters['incident_type'].isin(dtype)
cond2 = disasters['county'].notnull()           # need a county to geocode

disasters = disasters[cond1 & cond2]

### Check if any states not in the state table

In [61]:
# cond = disasters['state_abbr'].isin(state_table['abbreviation'])
# print disasters.loc[~cond].shape
# disasters.loc[~cond].head()

We can see some irrelevant cases such as American Samoa (AS) that we don't need

## Cleaning the FIP codes

Here I grabbed FIP codes from 2010 from the census bureau 

https://www.census.gov/geo/reference/codes/cou.html

Download -> United States -> Go (opens .txt file in broswer and was downloaded)

In [62]:
fips = pd.read_csv('national_county.txt', header=0, dtype=object,
                   names=['state_abbr', 'state_fip', 'county_fip', 'county', 'fip_class_code'])
fips.head()

Unnamed: 0,state_abbr,state_fip,county_fip,county,fip_class_code
0,AL,1,3,Baldwin County,H1
1,AL,1,5,Barbour County,H1
2,AL,1,7,Bibb County,H1
3,AL,1,9,Blount County,H1
4,AL,1,11,Bullock County,H1


In [63]:
fips['county_fip'] = fips['state_abbr'] + '-' + fips['county_fip']
fips.head()

Unnamed: 0,state_abbr,state_fip,county_fip,county,fip_class_code
0,AL,1,AL-003,Baldwin County,H1
1,AL,1,AL-005,Barbour County,H1
2,AL,1,AL-007,Bibb County,H1
3,AL,1,AL-009,Blount County,H1
4,AL,1,AL-011,Bullock County,H1


Because the FIPs CAN repeat per state, then we can make a unique version by attaching the state abbreviation and the FIP code itself per county. We can also see that in the disasters dataset the FIP codes have 2 digits before the actual code which will need to be removed, this way when we try and compare plots of UFO sightings county names wont overlap and we can use FIP codes

In [64]:
disasters['fip_code'] = disasters['fip_code'].str.replace('^[0-9]{2}', '')

disasters['fip_code'] = disasters['state_abbr'] + '-' + disasters['fip_code']

## Cleaning and Merging data

In [65]:
state_table = pd.read_csv('state_table.csv')
stble_cols = ['abbreviation', 'name', 'census_region_name', 'census_division_name']

disasters = disasters.merge(state_table[stble_cols], how='inner', left_on='state_abbr', right_on='abbreviation')

del disasters['abbreviation']

# set the index to the start/end_date and get up to 2013
disasters = disasters.set_index(['date_started', 'date_ended'])
disasters = disasters.sort_index()
disasters = disasters.loc[:'2014']

# rename the columns added
disasters = disasters.rename(columns={'name':'state', 'census_region_name':'region_name', 'census_division_name':'division_name'})

# reorder DF 
reorder_cols = ['disaster_type','incident_type','incident_title', 'days_lasted', 'county', 'state', 
                'state_abbr','division_name','region_name', 'fip_code']

disasters = disasters[reorder_cols]

# have the title in title form
disasters['incident_title'] = disasters['incident_title'].str.title()

In [66]:
# types of indexing
# disasters.loc['1965-04':'1965-05']
# disasters.loc[(slice(None), slice('1965-04', '1965-07')),:]

### Here we check to see which disasters FIP code arent in the .txt file

In [67]:
cond = disasters['fip_code'].isin(fips['county_fip'])
# cond2 = fips['county_fip'].isin(disasters['fip_code'])
print disasters.loc[~cond].shape
disasters.loc[~cond]

(253, 10)


Unnamed: 0_level_0,Unnamed: 1_level_0,disaster_type,incident_type,incident_title,days_lasted,county,state,state_abbr,division_name,region_name,fip_code
date_started,date_ended,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1965-09-10,1965-09-10,DR,Hurricane,Hurricane Betsy,0.0,Ward 9 (Police Jury Ward),Louisiana,LA,West South Central,South,LA-440
1965-09-14,1965-09-14,DR,Hurricane,Hurricane Betsy,0.0,Dade (County),Florida,FL,South Atlantic,South,FL-025
1969-08-19,1969-08-19,DR,Hurricane,Hurricane Camille,0.0,Ward 9 (Police Jury Ward),Louisiana,LA,West South Central,South,LA-440
1974-09-23,1974-09-23,DR,Hurricane,Hurricane Carmen,0.0,Ward 9 (Police Jury Ward),Louisiana,LA,West South Central,South,LA-440
1975-12-18,1975-12-18,DR,Fire,Fire & Freezing,0.0,Tanana (ANV/ANVSA),Alaska,AK,Pacific,West,AK-165
1979-04-21,1979-04-21,DR,Tornado,"Tornadoes, Torrential Rain & Flooding",0.0,St. Louis,Missouri,MO,West North Central,Midwest,MO-000
1985-10-26,1985-11-08,DR,Hurricane,Hurricane Juan,13.0,Ward 9 (Police Jury Ward),Louisiana,LA,West South Central,South,LA-440
1987-11-15,1987-11-19,DR,Tornado,Tornadoes & Flooding,4.0,Ward 9 (Police Jury Ward),Louisiana,LA,West South Central,South,LA-440
1988-02-10,1988-02-10,DR,Fire,Fire,0.0,Tanana (ANV/ANVSA),Alaska,AK,Pacific,West,AK-165
1989-06-07,1989-06-08,DR,Tornado,Severe Storms & Tornadoes,1.0,Ward 9 (Police Jury Ward),Louisiana,LA,West South Central,South,LA-440


In [68]:
# merging the fips dataset to this disasters to check if county names match up
fips_col = ['county', 'county_fip']

disasters = disasters.reset_index().merge(fips[fips_col], how='inner', left_on='fip_code', right_on='county_fip').set_index(['date_started', 'date_ended'])

In [69]:
disasters.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,disaster_type,incident_type,incident_title,days_lasted,county_x,state,state_abbr,division_name,region_name,fip_code,county_y,county_fip
date_started,date_ended,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1965-04-14,1965-04-14,DR,Tornado,Tornadoes & Severe Storms,0.0,Howard (County),Indiana,IN,East North Central,Midwest,IN-067,Howard County,IN-067
2005-08-29,2005-10-01,EM,Hurricane,Hurricane Katrina Evacuation,33.0,Howard (County),Indiana,IN,East North Central,Midwest,IN-067,Howard County,IN-067
1965-04-14,1965-04-14,DR,Tornado,Tornadoes & Severe Storms,0.0,Grant (County),Indiana,IN,East North Central,Midwest,IN-053,Grant County,IN-053
1974-04-04,1974-04-04,DR,Tornado,Tornadoes,0.0,Grant (County),Indiana,IN,East North Central,Midwest,IN-053,Grant County,IN-053
2005-08-29,2005-10-01,EM,Hurricane,Hurricane Katrina Evacuation,33.0,Grant (County),Indiana,IN,East North Central,Midwest,IN-053,Grant County,IN-053


In [70]:
del disasters['county_x']
del disasters['county_fip']

disasters = disasters.rename(columns={'county_y':'county'})

reorder_cols = ['disaster_type','incident_type','incident_title', 'days_lasted', 'county', 'state', 
                'state_abbr','division_name','region_name', 'fip_code']

disasters = disasters[reorder_cols]

In [71]:
disasters.shape

(12183, 10)

In [72]:
disasters.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,disaster_type,incident_type,incident_title,days_lasted,county,state,state_abbr,division_name,region_name,fip_code
date_started,date_ended,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1965-04-14,1965-04-14,DR,Tornado,Tornadoes & Severe Storms,0.0,Howard County,Indiana,IN,East North Central,Midwest,IN-067
2005-08-29,2005-10-01,EM,Hurricane,Hurricane Katrina Evacuation,33.0,Howard County,Indiana,IN,East North Central,Midwest,IN-067
1965-04-14,1965-04-14,DR,Tornado,Tornadoes & Severe Storms,0.0,Grant County,Indiana,IN,East North Central,Midwest,IN-053
1974-04-04,1974-04-04,DR,Tornado,Tornadoes,0.0,Grant County,Indiana,IN,East North Central,Midwest,IN-053
2005-08-29,2005-10-01,EM,Hurricane,Hurricane Katrina Evacuation,33.0,Grant County,Indiana,IN,East North Central,Midwest,IN-053


In [73]:
disasters.to_csv('disasters.csv')