In [2]:
# This code takes three NFIRS datasets located in 03_geocodes of the Red Cross Google Drive
# and combines them into one dataset.
# - Kelson Shilling-Scrivo 2019

In [3]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv




In [4]:
# Switch to the raw-data folder only when the geocoded input file is not already
# in the working directory. Edit the path below to point at your copy of the data.
# The guard also keeps this cell safe to re-run: repeating a relative chdir would
# otherwise move to a different directory or fail.
if not os.path.exists('NFIRS_2009_2016_geocoded_with_tract.csv'):
    os.chdir('../Data/Raw')

In [5]:
# Display the current working directory; the relative file names used below are resolved against it.
os.getcwd()

'/Users/kelson/Documents/DataKind/Red_Cross_Fire/Data/Raw'

In [6]:
# Load the data.
#
# The input files are in the Google Drive under 03_geocodes:
# https://drive.google.com/open?id=1n939Tckv0SFBSiABwIH64M6FSDJHt0eF
#
#   NFIR      - automatically geocoded data
#   NFIR_2    - handcoded data from Jake
#   NFIR_orig - the original NFIRS dataset

# All three files are read with the same options: latin_1 text, the first
# column as the index, and whole-file (not chunked) parsing.
csv_read_options = dict(encoding='latin_1', index_col=0, low_memory=False)

NFIR = pd.read_csv('NFIRS_2009_2016_geocoded_with_tract.csv', **csv_read_options)

NFIR_2 = pd.read_csv('NFIRS_2009_2016_FIXED_unique_geocodes_census_tracts.csv', **csv_read_options)

NFIR_orig = pd.read_csv('NFIRS_2009_2016.csv', **csv_read_options)



  mask |= (ar1 == a)


In [7]:
def _count_csv_rows(path):
    """Return the number of rows csv.reader yields for `path`, header row included."""
    # latin_1 matches the encoding passed to pd.read_csv for these files, and
    # newline='' lets the csv module do its own newline handling. The `with`
    # block closes the file handle once counting is done.
    with open(path, encoding='latin_1', newline='') as handle:
        return sum(1 for _ in csv.reader(handle))


row_count = _count_csv_rows('NFIRS_2009_2016_geocoded_with_tract.csv')
row_count2 = _count_csv_rows('NFIRS_2009_2016_FIXED_unique_geocodes_census_tracts.csv')

# Sanity check that the whole file was loaded: every row except the header line
# should have become a DataFrame row. (A bare `assert(difference)` would pass for
# any non-zero difference, so the expected value is spelled out.)
assert row_count - NFIR.shape[0] == 1, (
    'NFIR: file has %d rows (incl. header) but %d were loaded' % (row_count, NFIR.shape[0])
)
assert row_count2 - NFIR_2.shape[0] == 1, (
    'NFIR_2: file has %d rows (incl. header) but %d were loaded' % (row_count2, NFIR_2.shape[0])
)

row_count

1959106

In [8]:
## Cleaning

# Keep only the rows of the automatically geocoded data that have no missing
# value in any column.
NFIR = NFIR.dropna()

# Duplicate checking: an ID must not appear in both geocoded sources.
# NFIR is indexed by ID, while NFIR_2 still carries its IDs in the 'ID' column
# (it is only re-indexed by ID in a later cell), so compare against that column.
# Falling back to the index keeps the check valid if NFIR_2 was already re-indexed.
nfir2_ids = NFIR_2['ID'] if 'ID' in NFIR_2.columns else NFIR_2.index
dup_flag = set(NFIR.index) & set(nfir2_ids)

assert dup_flag == set(), (
    '%d IDs appear in both the automatic and the handcoded geocodes' % len(dup_flag)
)


NFIR_orig.head()

Unnamed: 0,state,fdid,inc_no,city,street,zip5,lat,lon,fips,inc_date,...,det_type,det_power,det_operat,det_effect,det_fail,aes_pres,aes_type,aes_oper,no_spr_op,aes_fail
1,AK,11100,211,JUNEAU,1700 ANGUS WAY,99801,,,,1/3/2009 0:00:00,...,1.0,1.0,2.0,1.0,,N,,,,
2,AK,11100,470,JUNEAU,3221 PIONEER AVE,99801,,,,1/6/2009 0:00:00,...,,,,,,,,,,
3,AK,11100,556,JUNEAU,10231 HERON WAY,99801,,,,1/6/2009 0:00:00,...,,,,,,N,,,,
4,AK,11100,1315,JUNEAU,6590 GLACIER HWY,99801,,,,1/13/2009 0:00:00,...,,,,,,,,,,
5,AK,11100,2151,JUNEAU,6590 GLACIER HWY,99801,,,,1/21/2009 0:00:00,...,,,,,,N,,,,


In [9]:
# munging

# Recreate the original address category ("Address, City, STATE, ZIP") so the
# handcoded rows are compatible with the ORIGINAL_ADDRESS column of NFIR.
NFIR_2['ORIGINAL_ADDRESS'] = (
    NFIR_2['Address'] + ', ' + NFIR_2['City'] + ', ' + NFIR_2['STATE'] + ', ' + NFIR_2['ZIP']
)

# Set ID as the index variable. Guarded so that re-running this cell does not
# raise a KeyError once 'ID' has already been moved into the index.
if 'ID' in NFIR_2.columns:
    NFIR_2 = NFIR_2.set_index('ID')

# Columns of the handcoded data that are carried over into the combined frame.
handcoded_columns = ['ORIGINAL_ADDRESS',
                     'X', 'Y', 'STATE',
                     'state_fips', 'COUNTYFP',
                     'TRACTCE', 'GEOID']

# Stack the matching NFIR_2 data under NFIR to create the full geocoded NFIRS
# dataset. pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# sort=True orders the combined columns alphabetically, and columns missing
# from NFIR_2 are filled with NaN for its rows.
NFIR_all = pd.concat([NFIR, NFIR_2.loc[:, handcoded_columns]], sort=True)

In [10]:
# Display the last rows of the combined geocoded frame.
NFIR_all.tail()

Unnamed: 0_level_0,COUNTYFP,FUNCSTAT,GEOID,MATCH,MATCHED_ADDRESS,MATCH_TYPE,MTFCC,NAME,NAMELSAD,ORIGINAL_ADDRESS,STATE,TIGER_LINE_ID,TIGER_LINE_SIDE,TRACTCE,X,Y,state_fips
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
rcp2_id1753662,11.0,,42011010000.0,,,,,,,"ZIEGLERS RD, BERNTOWNSHIP OF, PA, 19605",PA,,,12700.0,-75.93677,40.3907,42.0
rcp2_id1332366,215.0,,29215480000.0,,,,,,,"ZIMMERMAN, CABOOL, MO, 65689",MO,,,480400.0,-92.09128,37.15987,29.0
rcp2_id1332364,215.0,,29215480000.0,,,,,,,"ZIMMERMANN, CABOOL, MO, 65689",MO,,,480400.0,-92.09128,37.15987,29.0
rcp2_id1532504,275.0,,2275000000.0,,,,,,,"ZIMOVIA, WRANGELL, AK, 99929",AK,,,300.0,-132.38707,56.47183,2.0
rcp2_id1924543,167.0,,39167020000.0,,,,,,,"ZION RIDGE RD, NEWPORT, OH, 45768",OH,,,21200.0,-81.26043,39.39918,39.0


In [11]:
# Build the ID column by prefixing 'rcp2_id' to the string form of NFIR_orig's current index.
# NOTE: run this once per load - after NFIR_orig is re-indexed by ID, re-running would prefix the IDs again.
NFIR_orig['ID'] = 'rcp2_id' + NFIR_orig.index.astype('str')

In [12]:
# Use the ID column as the index of the original dataset so that both frames
# can be joined on their indexes.
NFIR_orig = NFIR_orig.set_index('ID')


# Left-merge the geocoded data onto the original dataset: every original row is
# kept, and rows without a geocode get NaN in the geocode columns.
NFIR_merged_final = pd.merge(
    NFIR_orig,
    NFIR_all,
    how='left',
    left_index=True,
    right_index=True,
)

In [13]:
# row duplication check: a left merge keeps every row of NFIR_orig, so the
# result can only have a different row count if an ID is repeated in NFIR_all
# and fans out into several output rows.
assert NFIR_merged_final.shape[0] == NFIR_orig.shape[0], (
    'merge produced %d rows from %d original rows: duplicate IDs in the geocoded data'
    % (NFIR_merged_final.shape[0], NFIR_orig.shape[0])
)

NFIR_merged_final.head()



Unnamed: 0_level_0,state,fdid,inc_no,city,street,zip5,lat,lon,fips,inc_date,...,NAME,NAMELSAD,ORIGINAL_ADDRESS,STATE,TIGER_LINE_ID,TIGER_LINE_SIDE,TRACTCE,X,Y,state_fips
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
rcp2_id1,AK,11100,211,JUNEAU,1700 ANGUS WAY,99801,,,,1/3/2009 0:00:00,...,5.0,Census Tract 5,"1700 ANGUS WAY, JUNEAU, AK, 99801",AK,190964136.0,R,500.0,-134.4244,58.306507,2.0
rcp2_id2,AK,11100,470,JUNEAU,3221 PIONEER AVE,99801,,,,1/6/2009 0:00:00,...,6.0,Census Tract 6,"3221 PIONEER AVE, JUNEAU, AK, 99801",AK,190966361.0,R,600.0,-134.43314,58.292686,2.0
rcp2_id3,AK,11100,556,JUNEAU,10231 HERON WAY,99801,,,,1/6/2009 0:00:00,...,1.0,Census Tract 1,"10231 HERON WAY, JUNEAU, AK, 99801",AK,190962538.0,R,100.0,-134.6033,58.40261,2.0
rcp2_id4,AK,11100,1315,JUNEAU,6590 GLACIER HWY,99801,,,,1/13/2009 0:00:00,...,4.0,Census Tract 4,"6590 GLACIER HWY, JUNEAU, AK, 99801",AK,190963741.0,R,400.0,-134.51932,58.358955,2.0
rcp2_id5,AK,11100,2151,JUNEAU,6590 GLACIER HWY,99801,,,,1/21/2009 0:00:00,...,4.0,Census Tract 4,"6590 GLACIER HWY, JUNEAU, AK, 99801",AK,190963741.0,R,400.0,-134.51932,58.358955,2.0


In [None]:
# Write the combined dataset to the sibling "Transformed" folder (relative to
# the current working directory), creating the folder first so that to_csv
# does not fail when it does not exist yet.
output_dir = os.path.join('..', 'Transformed')
os.makedirs(output_dir, exist_ok=True)
NFIR_merged_final.to_csv(os.path.join(output_dir, 'NFIRS_2009_2016_Combined_Census_Tract.csv'))
