TO DO:
1. ~Fix counties for unclaimed~
2. ~Fix counties for unidentified~
3. ~Add line to state_centroids with south pole coordinates and nonsense FIPS code~
4. ~Re-export state-level json (now that county fields have been updated within the databases and American Samoa has been removed)~
    * ~MAKE SURE TO MAP NAs (cases with no state) TO THE NONSENSE FIPS CODE~
5. Add 55 lines to county_centroids with south pole coordinates and nonsense county FIPS codes (state only?)
6. Format county data - name and county FIPS code
7. Export county-level json - 
    * any records with no county get pulled (either into separate geojson, or to separate state FIPS with no county key and nonsense coordinates [south pole])
    * make sure state name included as field with each database, not just FIPS code
8. ~Re-export summary count JSON (as a few cases have been deleted)~
    * ~Address 21 NAs for Unclaimed?~
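9. Assess how messy the city data is
    * will likely need to lowercase all city names (e.g. `.str.lower()`), since the same city appears with mixed capitalization ("Fairhope" / "fairhope")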

In [1]:
# import necessary packages
import pandas as pd
import geopandas as gpd
import numpy as np
import json
import geojson

Notes:
1. Edits to Missing_04182020.csv prior to import (state names corrected so they match the NAME column of the state centroid tables):
  * Virgin Islands (to United States Virgin Islands)
  * Tennesse (to Tennessee)
  * Northern Mariana Islands (to Commonwealth of the Northern Mariana Islands)
  * Address all county nulls
2. Edits to Unidentified_04182020.csv prior to import
  * Virgin Islands (to United States Virgin Islands)
  * Address all county nulls

In [2]:
# Read in csvs
# City table; city_df is not referenced again in the cells visible below
city_df = pd.read_csv('cities.csv')

In [3]:
# County centroid table; the file is decoded as Windows-1252 instead of pandas' default UTF-8
county_centroids_df = pd.read_csv('county_centroids.csv', encoding='Windows-1252')

In [4]:
# State centroid table; its NAME and STATEFP columns are used in Part 2 to build state_dict
state_centroids_df = pd.read_csv('state_centroids.csv')

In [5]:
# Alternate centroids: same table plus a "None" placeholder row with FIPS 99,
# used for cases that have no state assigned. summary_df is built from this version.
state_centroids_v2_df = pd.read_csv('state_centroids_v2.csv')

In [6]:
# Missing-persons case export (manually edited before import, see the notes above)
missing_df = pd.read_csv('Missing_04182020.csv')

In [7]:
# Unclaimed-persons case export
unclaimed_df = pd.read_csv('Unclaimed_04182020.csv')

In [8]:
# Unidentified-persons case export (manually edited before import, see the notes above)
unidentified_df = pd.read_csv('Unidentified_04182020.csv')

In [9]:
# Preview the first rows of the state centroid table to confirm it loaded as expected
state_centroids_df.head()

Unnamed: 0,OBJECTID,STATEFP,STATENS,AFFGEOID,GEOID,STUSPS,NAME,LSAD,ALAND,AWATER,ORIG_FID,Lon_dd,Lat_dd
0,1,28,1779790,0400000US28,28,MS,Mississippi,0,122000000000.0,3926920000.0,1,-89.665425,32.751473
1,2,37,1027616,0400000US37,37,NC,North Carolina,0,126000000000.0,13466070000.0,2,-79.380052,35.542238
2,3,40,1102857,0400000US40,40,OK,Oklahoma,0,178000000000.0,3374588000.0,3,-97.508293,35.583486
3,4,51,1779803,0400000US51,51,VA,Virginia,0,102000000000.0,8528532000.0,4,-78.883285,37.512967
4,5,54,1779805,0400000US54,54,WV,West Virginia,0,62266470000.0,489028500.0,5,-80.613731,38.642587


# PART 1: Summary data (count for all 3 databases, by state)

### 1/4: Get count of missing person cases

In [10]:
# Per-state tally of the non-null values in every column of the missing-persons table
missing_count = missing_df.groupby(by='State').count()
len(missing_count)
missing_count.head()

Unnamed: 0_level_0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,Sex,Race / Ethnicity,Date Modified
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Alabama,238,238,238,238,238,238,238,238,238,238
Alaska,582,582,582,582,582,574,582,582,582,582
Arizona,916,916,916,916,916,915,916,916,916,916
Arkansas,280,280,280,280,280,280,280,280,280,280
California,2511,2511,2511,2511,2510,2510,2511,2511,2511,2511


In [11]:
# Keep only the 'Case Number' tally (used as the case count); discard the other per-column tallies
missing_count = missing_count.drop(
    labels=['DLC', 'Last Name', 'First Name', 'Missing Age', 'City',
            'County', 'Sex', 'Race / Ethnicity', 'Date Modified'],
    axis='columns',
)

In [12]:
# groupby('State') left the state names only in the index; copy them into a regular
# column so the dictionary built a few cells below can read them by name
missing_count['State'] = missing_count.index

In [13]:
# Give the case-number tally a descriptive name; rename returns a new frame by default
missing_count = missing_count.rename(columns={'Case Number': 'Missing_CaseCount'})

In [14]:
# Preview the result: one row per state with Missing_CaseCount and the State column
missing_count.head()

Unnamed: 0_level_0,Missing_CaseCount,State
State,Unnamed: 1_level_1,Unnamed: 2_level_1
Alabama,238,Alabama
Alaska,582,Alaska
Arizona,916,Arizona
Arkansas,280,Arkansas
California,2511,California


In [15]:
# Build a {state name: missing-person case count} lookup and report how many states it holds
missing_dict = dict(zip(missing_count['State'], missing_count['Missing_CaseCount']))
len(missing_dict)

55

In [16]:
# Number of missing-person records (non-null 'Case Number') whose State is null
mis_null_series = missing_df[missing_df['State'].isna()].count()
mis_null_ct = mis_null_series.loc['Case Number']
mis_null_ct

0

### 2/4: Get count of unclaimed persons

In [17]:
# Per-state tally of the non-null values in every column of the unclaimed-persons table
unclaimed_count = unclaimed_df.groupby(by='State').count()
unclaimed_count.head()

Unnamed: 0_level_0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,Date Modified
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama,63,63,63,63,63,63,63,63,63
Alaska,1,1,1,1,1,1,1,1,1
Arizona,21,20,21,21,21,21,20,21,21
Arkansas,16,16,16,16,16,16,16,16,16
California,1680,1046,1678,1678,769,775,691,1652,1680


In [18]:
# Keep only the 'Case Number' tally (used as the case count); discard the other per-column tallies
unclaimed_count = unclaimed_count.drop(
    labels=['DBF', 'Last Name', 'First Name', 'Sex', 'Race / Ethnicity',
            'City', 'County', 'Date Modified'],
    axis='columns',
)

In [19]:
# groupby('State') left the state names only in the index; copy them into a regular
# column so the dictionary built a few cells below can read them by name
unclaimed_count['State'] = unclaimed_count.index

In [20]:
# Give the case-number tally a descriptive name; rename returns a new frame by default
unclaimed_count = unclaimed_count.rename(columns={'Case Number': 'Unclaimed_CaseCount'})

In [21]:
# Preview the result: one row per state with Unclaimed_CaseCount and the State column
unclaimed_count.head()

Unnamed: 0_level_0,Unclaimed_CaseCount,State
State,Unnamed: 1_level_1,Unnamed: 2_level_1
Alabama,63,Alabama
Alaska,1,Alaska
Arizona,21,Arizona
Arkansas,16,Arkansas
California,1680,California


In [22]:
# Build a {state name: unclaimed case count} lookup and report how many states it holds
unclaimed_dict = dict(zip(unclaimed_count['State'], unclaimed_count['Unclaimed_CaseCount']))
len(unclaimed_dict)

43

In [23]:
# Number of unclaimed records (non-null 'Case Number') whose State is null
unc_null_series = unclaimed_df[unclaimed_df['State'].isna()].count()
unc_null_ct = unc_null_series.loc['Case Number']
unc_null_ct

21

### 3/4: Get count of unidentified persons

In [24]:
# Per-state tally of the non-null values in every column of the unidentified-persons table
unidentified_count = unidentified_df.groupby(by='State').count()
unidentified_count.head()

Unnamed: 0_level_0,Case Number,DBF,Age From,Age To,City,County,Sex,Race / Ethnicity,Date Modified
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama,56,56,46,46,48,55,56,56,56
Alaska,56,56,23,23,50,53,56,56,56
Arizona,1773,1772,1670,1670,1427,1773,1772,1767,1773
Arkansas,100,100,95,95,94,100,100,100,100
California,2741,2740,2426,2426,2218,2741,2741,2737,2741


In [25]:
# Keep only the 'Case Number' tally (used as the case count); discard the other per-column tallies
unidentified_count = unidentified_count.drop(
    labels=['DBF', 'Age From', 'Age To', 'City', 'County',
            'Sex', 'Race / Ethnicity', 'Date Modified'],
    axis='columns',
)

In [26]:
# groupby('State') left the state names only in the index; copy them into a regular
# column so the dictionary built a few cells below can read them by name
unidentified_count['State'] = unidentified_count.index

In [27]:
# Give the case-number tally a descriptive name; rename returns a new frame by default
unidentified_count = unidentified_count.rename(columns={'Case Number': 'Unidentified_CaseCount'})

In [28]:
# Preview the result: one row per state with Unidentified_CaseCount and the State column
unidentified_count.head()

Unnamed: 0_level_0,Unidentified_CaseCount,State
State,Unnamed: 1_level_1,Unnamed: 2_level_1
Alabama,56,Alabama
Alaska,56,Alaska
Arizona,1773,Arizona
Arkansas,100,Arkansas
California,2741,California


In [29]:
# Build a {state name: unidentified case count} lookup and report how many states it holds
unidentified_dict = dict(zip(unidentified_count['State'], unidentified_count['Unidentified_CaseCount']))
len(unidentified_dict)

54

In [30]:
# Number of unidentified records (non-null 'Case Number') whose State is null
uni_null_series = unidentified_df[unidentified_df['State'].isna()].count()
uni_null_ct = uni_null_series.loc['Case Number']
uni_null_ct

0

### 4/4: Make summary dataframe

In [31]:
# Start the summary table from the v2 state centroids (which include the FIPS 99
# placeholder row), ordered by state FIPS code. sort_values returns a new frame, so
# columns added to summary_df later do not modify state_centroids_v2_df.
summary_df = state_centroids_v2_df.sort_values(by='STATEFP')
summary_df.head()

Unnamed: 0,OBJECTID,STATEFP,STATENS,AFFGEOID,GEOID,STUSPS,NAME,LSAD,ALAND,AWATER,ORIG_FID,Lon_dd,Lat_dd
17,18,1,1779775,0400000US01,1,AL,Alabama,0,131000000000.0,4593327000.0,18,-86.828092,32.790364
27,28,2,1785533,0400000US02,2,AK,Alaska,0,1480000000000.0,245000000000.0,28,-152.680813,64.530206
35,36,4,1779777,0400000US04,4,AZ,Arizona,0,294000000000.0,1027338000.0,36,-111.664418,34.29311
51,53,5,68085,0400000US05,5,AR,Arkansas,0,135000000000.0,2962860000.0,53,-92.439268,34.899745
16,17,6,1779778,0400000US06,6,CA,California,0,404000000000.0,20463870000.0,17,-119.270414,36.373627


In [32]:
# Row count of the summary table (56 in the recorded run)
len(summary_df)

56

#### 4a: Add Missing Person count for each state

In [33]:
# Look up each row's state name (NAME) in missing_dict to get its missing-person case count.
# Names with no entry in the dictionary map to NaN, so the new column is float.
summary_df['Missing_Count'] = summary_df['NAME'].map(missing_dict)
summary_df.head()

Unnamed: 0,OBJECTID,STATEFP,STATENS,AFFGEOID,GEOID,STUSPS,NAME,LSAD,ALAND,AWATER,ORIG_FID,Lon_dd,Lat_dd,Missing_Count
17,18,1,1779775,0400000US01,1,AL,Alabama,0,131000000000.0,4593327000.0,18,-86.828092,32.790364,238.0
27,28,2,1785533,0400000US02,2,AK,Alaska,0,1480000000000.0,245000000000.0,28,-152.680813,64.530206,582.0
35,36,4,1779777,0400000US04,4,AZ,Arizona,0,294000000000.0,1027338000.0,36,-111.664418,34.29311,916.0
51,53,5,68085,0400000US05,5,AR,Arkansas,0,135000000000.0,2962860000.0,53,-92.439268,34.899745,280.0
16,17,6,1779778,0400000US06,6,CA,California,0,404000000000.0,20463870000.0,17,-119.270414,36.373627,2511.0


#### 4b: Add Unclaimed Person count for each state

In [34]:
# Look up each row's state name (NAME) in unclaimed_dict to get its unclaimed case count.
# Names with no entry in the dictionary map to NaN, so the new column is float.
summary_df['Unclaimed_Count'] = summary_df['NAME'].map(unclaimed_dict)
summary_df.head()

Unnamed: 0,OBJECTID,STATEFP,STATENS,AFFGEOID,GEOID,STUSPS,NAME,LSAD,ALAND,AWATER,ORIG_FID,Lon_dd,Lat_dd,Missing_Count,Unclaimed_Count
17,18,1,1779775,0400000US01,1,AL,Alabama,0,131000000000.0,4593327000.0,18,-86.828092,32.790364,238.0,63.0
27,28,2,1785533,0400000US02,2,AK,Alaska,0,1480000000000.0,245000000000.0,28,-152.680813,64.530206,582.0,1.0
35,36,4,1779777,0400000US04,4,AZ,Arizona,0,294000000000.0,1027338000.0,36,-111.664418,34.29311,916.0,21.0
51,53,5,68085,0400000US05,5,AR,Arkansas,0,135000000000.0,2962860000.0,53,-92.439268,34.899745,280.0,16.0
16,17,6,1779778,0400000US06,6,CA,California,0,404000000000.0,20463870000.0,17,-119.270414,36.373627,2511.0,1680.0


In [35]:
# List the states that received no unclaimed count (NaN). These are expected:
# they simply have no rows in the unclaimed table.
unclaimed_null_df = summary_df[summary_df['Unclaimed_Count'].isna()]
unclaimed_null_df

Unnamed: 0,OBJECTID,STATEFP,STATENS,AFFGEOID,GEOID,STUSPS,NAME,LSAD,ALAND,AWATER,ORIG_FID,Lon_dd,Lat_dd,Missing_Count,Unclaimed_Count
26,27,20,481813,0400000US20,20,KS,Kansas,0,212000000000.0,1344141000.0,27,-98.380213,38.484708,93.0,
39,41,24,1714934,0400000US24,24,MD,Maryland,0,25151100000.0,6979967000.0,41,-76.774792,39.039764,193.0,
53,55,27,662849,0400000US27,27,MN,Minnesota,0,206000000000.0,18945220000.0,55,-94.309126,46.316468,181.0,
31,32,30,767982,0400000US30,30,MT,Montana,0,377000000000.0,3869209000.0,32,-109.645144,47.03347,86.0,
10,11,31,1779792,0400000US31,31,NE,Nebraska,0,199000000000.0,1371829000.0,11,-99.810809,41.527149,73.0,
45,47,44,1219835,0400000US44,44,RI,Rhode Island,0,2677780000.0,1323670000.0,47,-71.589051,41.694954,21.0,
14,15,46,1785534,0400000US46,46,SD,South Dakota,0,196000000000.0,3382720000.0,15,-100.230512,44.436159,29.0,
30,31,50,1779802,0400000US50,50,VT,Vermont,0,23874180000.0,1030417000.0,31,-72.662649,44.075196,56.0,
43,45,66,1802705,0400000US66,66,GU,Guam,0,543555800.0,934337500.0,45,144.702724,13.357731,2.0,
44,46,69,1779809,0400000US69,69,MP,Commonwealth of the Northern Mariana Islands,0,472292500.0,4644252000.0,46,145.753615,15.188891,2.0,


In [36]:
# The FIPS 99 placeholder row has no state name, so the name-based lookup leaves its
# Unclaimed_Count as NaN. Overwrite it with the number of cases in the *unclaimed*
# database that have no state assigned (unc_null_ct).
index_Series = summary_df.loc[summary_df['STATEFP'] == 99]
# index label of the placeholder row (raises IndexError if no FIPS 99 row exists)
index_None = index_Series.index[0]
summary_df.loc[index_None, 'Unclaimed_Count'] = unc_null_ct
# check value (single .loc lookup instead of chained indexing)
summary_df.loc[index_None, 'Unclaimed_Count']

21.0

#### 4b: Add Unidentified Person count for each state

In [37]:
# Look up each row's state name (NAME) in unidentified_dict to get its unidentified case count.
# Names with no entry in the dictionary map to NaN, so the new column is float.
summary_df['Unidentified_Count'] = summary_df['NAME'].map(unidentified_dict)
summary_df.head()

Unnamed: 0,OBJECTID,STATEFP,STATENS,AFFGEOID,GEOID,STUSPS,NAME,LSAD,ALAND,AWATER,ORIG_FID,Lon_dd,Lat_dd,Missing_Count,Unclaimed_Count,Unidentified_Count
17,18,1,1779775,0400000US01,1,AL,Alabama,0,131000000000.0,4593327000.0,18,-86.828092,32.790364,238.0,63.0,56.0
27,28,2,1785533,0400000US02,2,AK,Alaska,0,1480000000000.0,245000000000.0,28,-152.680813,64.530206,582.0,1.0,56.0
35,36,4,1779777,0400000US04,4,AZ,Arizona,0,294000000000.0,1027338000.0,36,-111.664418,34.29311,916.0,21.0,1773.0
51,53,5,68085,0400000US05,5,AR,Arkansas,0,135000000000.0,2962860000.0,53,-92.439268,34.899745,280.0,16.0,100.0
16,17,6,1779778,0400000US06,6,CA,California,0,404000000000.0,20463870000.0,17,-119.270414,36.373627,2511.0,1680.0,2741.0


In [38]:
# List the rows that received no unidentified count (NaN). These are expected:
# they have no rows in the unidentified table.
unidentified_null_df = summary_df[summary_df['Unidentified_Count'].isna()]
unidentified_null_df

Unnamed: 0,OBJECTID,STATEFP,STATENS,AFFGEOID,GEOID,STUSPS,NAME,LSAD,ALAND,AWATER,ORIG_FID,Lon_dd,Lat_dd,Missing_Count,Unclaimed_Count,Unidentified_Count
44,46,69,1779809,0400000US69,69,MP,Commonwealth of the Northern Mariana Islands,0,472292529.0,4644252000.0,46,145.753615,15.188891,2.0,,
55,99,99,9999999,9999999US99,99,,,0,0.0,0.0,99,45.0,-90.0,,21.0,


#### 4c: Add column for total count of cases in all three databases for each state

In [39]:
# Preview the summary table with the three count columns in place, before totalling
summary_df.head()

Unnamed: 0,OBJECTID,STATEFP,STATENS,AFFGEOID,GEOID,STUSPS,NAME,LSAD,ALAND,AWATER,ORIG_FID,Lon_dd,Lat_dd,Missing_Count,Unclaimed_Count,Unidentified_Count
17,18,1,1779775,0400000US01,1,AL,Alabama,0,131000000000.0,4593327000.0,18,-86.828092,32.790364,238.0,63.0,56.0
27,28,2,1785533,0400000US02,2,AK,Alaska,0,1480000000000.0,245000000000.0,28,-152.680813,64.530206,582.0,1.0,56.0
35,36,4,1779777,0400000US04,4,AZ,Arizona,0,294000000000.0,1027338000.0,36,-111.664418,34.29311,916.0,21.0,1773.0
51,53,5,68085,0400000US05,5,AR,Arkansas,0,135000000000.0,2962860000.0,53,-92.439268,34.899745,280.0,16.0,100.0
16,17,6,1779778,0400000US06,6,CA,California,0,404000000000.0,20463870000.0,17,-119.270414,36.373627,2511.0,1680.0,2741.0


In [42]:
# Inspect the third-from-last column by position. Which column this is depends on how
# many columns have been appended so far (Missing_Count in the recorded output).
summary_df.iloc[:, -3]

17     238.0
27     582.0
35     916.0
51     280.0
16    2511.0
21     311.0
54     210.0
42      58.0
36      42.0
9     1493.0
18     293.0
41     157.0
8      112.0
29     375.0
52     202.0
32      94.0
26      93.0
46     266.0
5      388.0
40     135.0
39     193.0
7      147.0
6      595.0
53     181.0
0      134.0
20     363.0
31      86.0
10      73.0
28     207.0
34      41.0
38     321.0
12     175.0
25     688.0
1      372.0
50      29.0
47     371.0
2      505.0
49     472.0
19     448.0
45      21.0
33     217.0
14      29.0
23     554.0
15    1423.0
22     107.0
30      56.0
3      243.0
11     696.0
4      132.0
48     168.0
24      45.0
43       2.0
44       2.0
13      23.0
37      16.0
55       NaN
Name: Missing_Count, dtype: float64

In [47]:
# Total cases per state across the three databases (NaN counts are skipped by sum).
# The columns are selected by name rather than by position: a negative positional
# slice shifts by one as soon as Total_Count is appended, so it would cover different
# columns on the first run than on a re-run.
summary_df['Total_Count'] = summary_df[
    ['Missing_Count', 'Unclaimed_Count', 'Unidentified_Count']
].sum(axis=1)
summary_df.head(56)

Unnamed: 0,OBJECTID,STATEFP,STATENS,AFFGEOID,GEOID,STUSPS,NAME,LSAD,ALAND,AWATER,ORIG_FID,Lon_dd,Lat_dd,Missing_Count,Unclaimed_Count,Unidentified_Count,Total_Count
17,18,1,1779775,0400000US01,1,AL,Alabama,0,131000000000.0,4593327000.0,18,-86.828092,32.790364,238.0,63.0,56.0,357.0
27,28,2,1785533,0400000US02,2,AK,Alaska,0,1480000000000.0,245000000000.0,28,-152.680813,64.530206,582.0,1.0,56.0,639.0
35,36,4,1779777,0400000US04,4,AZ,Arizona,0,294000000000.0,1027338000.0,36,-111.664418,34.29311,916.0,21.0,1773.0,2710.0
51,53,5,68085,0400000US05,5,AR,Arkansas,0,135000000000.0,2962860000.0,53,-92.439268,34.899745,280.0,16.0,100.0,396.0
16,17,6,1779778,0400000US06,6,CA,California,0,404000000000.0,20463870000.0,17,-119.270414,36.373627,2511.0,1680.0,2741.0,6932.0
21,22,8,1779779,0400000US08,8,CO,Colorado,0,268000000000.0,1181622000.0,22,-105.547825,38.998552,311.0,6.0,83.0,400.0
54,56,9,1779780,0400000US09,9,CT,Connecticut,0,12542500000.0,1815618000.0,56,-72.725467,41.620559,210.0,100.0,41.0,351.0
42,44,10,1779781,0400000US10,10,DE,Delaware,0,5045926000.0,1399986000.0,44,-75.50036,38.986599,58.0,1.0,30.0,89.0
36,37,11,1702382,0400000US11,11,DC,District of Columbia,0,158340400.0,18687200.0,37,-77.016296,38.904742,42.0,20.0,29.0,91.0
9,10,12,294478,0400000US12,12,FL,Florida,0,139000000000.0,31361100000.0,10,-81.932259,28.630893,1493.0,45.0,900.0,2438.0


#### 4c: Convert to geodataframe and export as GeoJSON

In [48]:
# Final check of summary_df (centroid columns plus the three counts and Total_Count)
# before it is converted to a GeoDataFrame
summary_df.head()

Unnamed: 0,OBJECTID,STATEFP,STATENS,AFFGEOID,GEOID,STUSPS,NAME,LSAD,ALAND,AWATER,ORIG_FID,Lon_dd,Lat_dd,Missing_Count,Unclaimed_Count,Unidentified_Count,Total_Count
17,18,1,1779775,0400000US01,1,AL,Alabama,0,131000000000.0,4593327000.0,18,-86.828092,32.790364,238.0,63.0,56.0,357.0
27,28,2,1785533,0400000US02,2,AK,Alaska,0,1480000000000.0,245000000000.0,28,-152.680813,64.530206,582.0,1.0,56.0,639.0
35,36,4,1779777,0400000US04,4,AZ,Arizona,0,294000000000.0,1027338000.0,36,-111.664418,34.29311,916.0,21.0,1773.0,2710.0
51,53,5,68085,0400000US05,5,AR,Arkansas,0,135000000000.0,2962860000.0,53,-92.439268,34.899745,280.0,16.0,100.0,396.0
16,17,6,1779778,0400000US06,6,CA,California,0,404000000000.0,20463870000.0,17,-119.270414,36.373627,2511.0,1680.0,2741.0,6932.0


In [49]:
# Build point geometries from the centroid longitude (x) and latitude (y) columns
# and wrap the summary table in a GeoDataFrame
summary_gdf = gpd.GeoDataFrame(
    summary_df,
    geometry=gpd.points_from_xy(summary_df['Lon_dd'], summary_df['Lat_dd']),
)
summary_gdf.head()

Unnamed: 0,OBJECTID,STATEFP,STATENS,AFFGEOID,GEOID,STUSPS,NAME,LSAD,ALAND,AWATER,ORIG_FID,Lon_dd,Lat_dd,Missing_Count,Unclaimed_Count,Unidentified_Count,Total_Count,geometry
17,18,1,1779775,0400000US01,1,AL,Alabama,0,131000000000.0,4593327000.0,18,-86.828092,32.790364,238.0,63.0,56.0,357.0,POINT (-86.82809 32.79036)
27,28,2,1785533,0400000US02,2,AK,Alaska,0,1480000000000.0,245000000000.0,28,-152.680813,64.530206,582.0,1.0,56.0,639.0,POINT (-152.68081 64.53021)
35,36,4,1779777,0400000US04,4,AZ,Arizona,0,294000000000.0,1027338000.0,36,-111.664418,34.29311,916.0,21.0,1773.0,2710.0,POINT (-111.66442 34.29311)
51,53,5,68085,0400000US05,5,AR,Arkansas,0,135000000000.0,2962860000.0,53,-92.439268,34.899745,280.0,16.0,100.0,396.0,POINT (-92.43927 34.89975)
16,17,6,1779778,0400000US06,6,CA,California,0,404000000000.0,20463870000.0,17,-119.270414,36.373627,2511.0,1680.0,2741.0,6932.0,POINT (-119.27041 36.37363)


In [51]:
# Write the state-level summary (one point feature per row) as UTF-8 GeoJSON
# NOTE(review): assumes the JSON/ output directory already exists - confirm
summary_gdf.to_file("JSON/summary_counts.json", driver="GeoJSON", encoding='utf-8')

# Part 2 - Get state-level GeoJSON with data from all 3 databases

### 2a: Add in State FIPS column to each database dataframe

In [52]:
# Build a {state name: state FIPS code} lookup from the original centroid table
# (the one without the FIPS 99 placeholder) and report its size
state_dict = dict(zip(state_centroids_df['NAME'], state_centroids_df['STATEFP']))
len(state_dict)

55

#### 1/3: add state FIPS codes to missing persons dataframe

In [53]:
# Preview the missing-persons table before the State_FIPS column is added
missing_df.head()

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified
0,MP2341,5/8/1999,Reynolds,James,28,Opp,Autauga,Alabama,Male,Black / African American,2/14/2020
1,MP8522,1/15/2010,Walker,Jeremy,32,Prattville,Autauga,Alabama,Male,White / Caucasian,7/27/2015
2,MP50314,5/16/2018,Keszthelyi,Steve,79,Elberta,Baldwin,Alabama,Male,White / Caucasian,11/13/2019
3,MP43187,2/20/2018,johnson,abram,60,fairhope,Baldwin,Alabama,Male,White / Caucasian,2/13/2020
4,MP23019,11/5/2012,Shroyer,James,40,Fairhope,Baldwin,Alabama,Male,White / Caucasian,2/3/2016


In [54]:
# Look up each row's State name in state_dict to get its FIPS code.
# A state name missing from the dictionary would produce NaN here.
missing_df['State_FIPS'] = missing_df['State'].map(state_dict)
missing_df.head()

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS
0,MP2341,5/8/1999,Reynolds,James,28,Opp,Autauga,Alabama,Male,Black / African American,2/14/2020,1
1,MP8522,1/15/2010,Walker,Jeremy,32,Prattville,Autauga,Alabama,Male,White / Caucasian,7/27/2015,1
2,MP50314,5/16/2018,Keszthelyi,Steve,79,Elberta,Baldwin,Alabama,Male,White / Caucasian,11/13/2019,1
3,MP43187,2/20/2018,johnson,abram,60,fairhope,Baldwin,Alabama,Male,White / Caucasian,2/13/2020,1
4,MP23019,11/5/2012,Shroyer,James,40,Fairhope,Baldwin,Alabama,Male,White / Caucasian,2/3/2016,1


In [55]:
# List the distinct FIPS codes assigned; an integer dtype with no nan means every state name matched
missing_df['State_FIPS'].unique()

array([ 1,  2,  4,  5,  6,  8, 69,  9, 10, 11, 12, 13, 66, 15, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
       36, 37, 38, 39, 40, 41, 42, 72, 44, 45, 46, 47, 48, 78, 49, 50, 51,
       53, 54, 55, 56], dtype=int64)

In [56]:
## Rows whose state name did not map to a FIPS code. Empty after the CSV edits
## listed in the notes where the csvs are imported.
mis_test_df = missing_df[missing_df['State_FIPS'].isna()]
mis_test_df.head()

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS


In [57]:
# Re-check the missing-persons table now that State_FIPS has been added
missing_df.head()
# len(missing_df)

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS
0,MP2341,5/8/1999,Reynolds,James,28,Opp,Autauga,Alabama,Male,Black / African American,2/14/2020,1
1,MP8522,1/15/2010,Walker,Jeremy,32,Prattville,Autauga,Alabama,Male,White / Caucasian,7/27/2015,1
2,MP50314,5/16/2018,Keszthelyi,Steve,79,Elberta,Baldwin,Alabama,Male,White / Caucasian,11/13/2019,1
3,MP43187,2/20/2018,johnson,abram,60,fairhope,Baldwin,Alabama,Male,White / Caucasian,2/13/2020,1
4,MP23019,11/5/2012,Shroyer,James,40,Fairhope,Baldwin,Alabama,Male,White / Caucasian,2/3/2016,1


#### 2/3: add state FIPS codes to unclaimed persons dataframe

In [58]:
# Preview the unclaimed-persons table before the State_FIPS column is added
unclaimed_df.head()
# len(unclaimed_df)

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified
0,UCP68231,4/15/2020,Sanders,Stephen,Male,White / Caucasian,Joliet,Will,Illinois,4/16/2020
1,UCP68248,4/13/2020,Brookshire,Renee,Female,White / Caucasian,Joliet,Will,Illinois,4/16/2020
2,UCP68242,4/9/2020,Smith,Victoria,Female,White / Caucasian,Manhattan,New York,New York,4/16/2020
3,UCP68244,4/9/2020,Bellamy,Eldred,Male,Black / African American,Brooklyn,Kings,New York,4/16/2020
4,UCP68228,4/9/2020,Rodriguez,Elido,Male,Hispanic / Latino,Manhattan,New York,New York,4/15/2020


In [59]:
# Look up each row's State name in state_dict to get its FIPS code.
# Rows with no state get NaN, which is why the column comes out as float here.
unclaimed_df['State_FIPS'] = unclaimed_df['State'].map(state_dict)
unclaimed_df.head()

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS
0,UCP68231,4/15/2020,Sanders,Stephen,Male,White / Caucasian,Joliet,Will,Illinois,4/16/2020,17.0
1,UCP68248,4/13/2020,Brookshire,Renee,Female,White / Caucasian,Joliet,Will,Illinois,4/16/2020,17.0
2,UCP68242,4/9/2020,Smith,Victoria,Female,White / Caucasian,Manhattan,New York,New York,4/16/2020,36.0
3,UCP68244,4/9/2020,Bellamy,Eldred,Male,Black / African American,Brooklyn,Kings,New York,4/16/2020,36.0
4,UCP68228,4/9/2020,Rodriguez,Elido,Male,Hispanic / Latino,Manhattan,New York,New York,4/15/2020,36.0


In [60]:
# List the distinct FIPS codes assigned; a nan entry means some rows did not map to a state
unclaimed_df['State_FIPS'].unique()

array([17., 36., 25.,  6., 53., 15., 32., 16., 48., 49., 47., 11., 33.,
       35., 40., 42., 39., 29.,  5., 13.,  1., 34.,  4., 12., 55.,  9.,
        8., 26., 22., 18., 19., 41., 23., nan, 45., 56., 54., 51.,  2.,
       28., 21., 37., 10., 38.])

In [61]:
## Rows with no FIPS code. These cannot be fixed at the source:
## the cases have no state or county assigned.
unc_test_df = unclaimed_df[unclaimed_df['State_FIPS'].isna()]
unc_test_df

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS
3380,UCP5339,12/3/2016,Williams,Barbara,Female,Asian,,,,5/3/2018,
4774,UCP2489,3/26/2015,Lupien,Daryl,Male,,,,,8/18/2016,
5023,UCP4001,11/2/2014,Brown,David,Male,White / Caucasian,,,,5/11/2017,
5883,UCP3962,3/22/2010,Gonzalez,David,Male,Hispanic / Latino,,,,5/11/2017,
5950,UCP3937,12/7/2009,Togovnick,Bernice,Female,White / Caucasian,,,,5/11/2017,
6183,UCP3901,1/21/2009,Carney,John,Male,White / Caucasian,,,,5/11/2017,
6341,UCP3689,6/5/2008,Guzman,Ralph,Male,Hispanic / Latino,,,,5/2/2017,
6560,UCP3824,7/2/2007,Vasquez,Rafael,Male,Hispanic / Latino,,,,5/4/2017,
6698,UCP3672,9/22/2006,Barajas,Epifanio,Male,Hispanic / Latino,,,,5/2/2017,
6719,UCP3668,8/5/2006,Huff,Paul,Male,White / Caucasian,,,,5/2/2017,


In [62]:
# Cases with no city, county or state have a NaN FIPS code: give them the placeholder
# code 99 (the "none" row in state_centroids_v2.csv), then cast the column back to int
unclaimed_df['State_FIPS'] = unclaimed_df['State_FIPS'].fillna(99).astype(int)
unclaimed_df.head()

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS
0,UCP68231,4/15/2020,Sanders,Stephen,Male,White / Caucasian,Joliet,Will,Illinois,4/16/2020,17
1,UCP68248,4/13/2020,Brookshire,Renee,Female,White / Caucasian,Joliet,Will,Illinois,4/16/2020,17
2,UCP68242,4/9/2020,Smith,Victoria,Female,White / Caucasian,Manhattan,New York,New York,4/16/2020,36
3,UCP68244,4/9/2020,Bellamy,Eldred,Male,Black / African American,Brooklyn,Kings,New York,4/16/2020,36
4,UCP68228,4/9/2020,Rodriguez,Elido,Male,Hispanic / Latino,Manhattan,New York,New York,4/15/2020,36


In [63]:
## Confirm the re-assignment: these should be the rows that previously had no FIPS code
unc_test_df2 = unclaimed_df[unclaimed_df['State_FIPS'].eq(99)]
unc_test_df2

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS
3380,UCP5339,12/3/2016,Williams,Barbara,Female,Asian,,,,5/3/2018,99
4774,UCP2489,3/26/2015,Lupien,Daryl,Male,,,,,8/18/2016,99
5023,UCP4001,11/2/2014,Brown,David,Male,White / Caucasian,,,,5/11/2017,99
5883,UCP3962,3/22/2010,Gonzalez,David,Male,Hispanic / Latino,,,,5/11/2017,99
5950,UCP3937,12/7/2009,Togovnick,Bernice,Female,White / Caucasian,,,,5/11/2017,99
6183,UCP3901,1/21/2009,Carney,John,Male,White / Caucasian,,,,5/11/2017,99
6341,UCP3689,6/5/2008,Guzman,Ralph,Male,Hispanic / Latino,,,,5/2/2017,99
6560,UCP3824,7/2/2007,Vasquez,Rafael,Male,Hispanic / Latino,,,,5/4/2017,99
6698,UCP3672,9/22/2006,Barajas,Epifanio,Male,Hispanic / Latino,,,,5/2/2017,99
6719,UCP3668,8/5/2006,Huff,Paul,Male,White / Caucasian,,,,5/2/2017,99


In [64]:
# Row count of the full unclaimed table (8335 in the recorded run)
len(unclaimed_df)

8335

#### 3/3: add state FIPS codes to unidentified persons dataframe

In [65]:
# Preview the unidentified-persons table before the State_FIPS column is added
unidentified_df.head()

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified
0,UP51903,3/30/1986,,,Marbury,Autauga,Alabama,Female,White / Caucasian,1/28/2020
1,UP5000,9/29/2007,40.0,60.0,Prattville,Autauga,Alabama,Female,Black / African American,8/12/2019
2,UP6519,1/6/2006,45.0,59.0,Elberta,Baldwin,Alabama,Male,White / Caucasian,10/24/2018
3,UP55409,3/26/2000,,,,Baldwin,Alabama,Male,"White / Caucasian, Asian, American Indian / Al...",2/13/2020
4,UP13483,3/27/1961,14.0,17.0,,Bibb,Alabama,Male,White / Caucasian,3/25/2019


In [66]:
# Look up each row's State name in state_dict to get its FIPS code.
# A state name missing from the dictionary would produce NaN here.
unidentified_df['State_FIPS'] = unidentified_df['State'].map(state_dict)
unidentified_df.head()

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS
0,UP51903,3/30/1986,,,Marbury,Autauga,Alabama,Female,White / Caucasian,1/28/2020,1
1,UP5000,9/29/2007,40.0,60.0,Prattville,Autauga,Alabama,Female,Black / African American,8/12/2019,1
2,UP6519,1/6/2006,45.0,59.0,Elberta,Baldwin,Alabama,Male,White / Caucasian,10/24/2018,1
3,UP55409,3/26/2000,,,,Baldwin,Alabama,Male,"White / Caucasian, Asian, American Indian / Al...",2/13/2020,1
4,UP13483,3/27/1961,14.0,17.0,,Bibb,Alabama,Male,White / Caucasian,3/25/2019,1


In [67]:
# List the distinct FIPS codes assigned; an integer dtype with no nan means every state name matched
unidentified_df['State_FIPS'].unique()

array([ 1,  2,  4,  5,  6,  8,  9, 10, 11, 12, 13, 66, 15, 16, 17, 18, 19,
       20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
       37, 38, 39, 40, 41, 42, 72, 44, 45, 46, 47, 48, 78, 49, 50, 51, 53,
       54, 55, 56], dtype=int64)

In [68]:
## Rows whose state name did not map to a FIPS code. Empty after the CSV edits
## listed in the notes where the csvs are imported.
uni_test_df = unidentified_df[unidentified_df['State_FIPS'].isna()]
uni_test_df.head()

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS


### Construct GeoJSON with correct structure

Goal format:
{
    "type": "Feature",
    "name": "Wisconsin",
    "properties": {
        "missing": [ ],
        "unclaimed": [ ],
        "unidentified": [ ],
        "filtered": [ ]
    },
    "geometry": {
        "type": "Point",
        "coordinates": [
            -117.79750667,
            36.03755926
        ]
    }
}
* each array will be a list of dictionaries. Each dictionary = one case. Keys = headers

In [69]:
# Collect the column names of each database; they become the keys of the per-case dictionaries
missing_header = missing_df.columns.tolist()
unclaimed_header = unclaimed_df.columns.tolist()
unidentified_header = unidentified_df.columns.tolist()

print("missing header:", missing_header)
print("unclaimed header:", unclaimed_header)
print("unidentified header:", unidentified_header)

missing header: ['Case Number', 'DLC', 'Last Name', 'First Name', 'Missing Age', 'City', 'County', 'State', 'Sex', 'Race / Ethnicity', 'Date Modified', 'State_FIPS']
unclaimed header: ['Case Number', 'DBF', 'Last Name', 'First Name', 'Sex', 'Race / Ethnicity', 'City', 'County', 'State', 'Date Modified', 'State_FIPS']
unidentified header: ['Case Number', 'DBF', 'Age From', 'Age To', 'City', 'County', 'State', 'Sex', 'Race / Ethnicity', 'Date Modified', 'State_FIPS']


In [70]:
# Check the unclaimed table: State_FIPS should now be shown as integers
unclaimed_df.head()

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS
0,UCP68231,4/15/2020,Sanders,Stephen,Male,White / Caucasian,Joliet,Will,Illinois,4/16/2020,17
1,UCP68248,4/13/2020,Brookshire,Renee,Female,White / Caucasian,Joliet,Will,Illinois,4/16/2020,17
2,UCP68242,4/9/2020,Smith,Victoria,Female,White / Caucasian,Manhattan,New York,New York,4/16/2020,36
3,UCP68244,4/9/2020,Bellamy,Eldred,Male,Black / African American,Brooklyn,Kings,New York,4/16/2020,36
4,UCP68228,4/9/2020,Rodriguez,Elido,Male,Hispanic / Latino,Manhattan,New York,New York,4/15/2020,36


In [71]:
# Check the type of the State_FIPS value at index label 0 (numpy.int32 in the recorded run)
type(unclaimed_df['State_FIPS'][0])

numpy.int32

In [72]:
# Check the v2 centroids. Only the last expression of a cell is echoed, so the
# head() result is discarded and just the row count (56 in the recorded run) is shown.
state_centroids_v2_df.head()
len(state_centroids_v2_df)

56

In [73]:
# Order the state centroids by state FIPS code
state_centroids_v2_df = state_centroids_v2_df.sort_values('STATEFP')
state_centroids_v2_df.head()

Unnamed: 0,OBJECTID,STATEFP,STATENS,AFFGEOID,GEOID,STUSPS,NAME,LSAD,ALAND,AWATER,ORIG_FID,Lon_dd,Lat_dd
17,18,1,1779775,0400000US01,1,AL,Alabama,0,131000000000.0,4593327000.0,18,-86.828092,32.790364
27,28,2,1785533,0400000US02,2,AK,Alaska,0,1480000000000.0,245000000000.0,28,-152.680813,64.530206
35,36,4,1779777,0400000US04,4,AZ,Arizona,0,294000000000.0,1027338000.0,36,-111.664418,34.29311
51,53,5,68085,0400000US05,5,AR,Arkansas,0,135000000000.0,2962860000.0,53,-92.439268,34.899745
16,17,6,1779778,0400000US06,6,CA,California,0,404000000000.0,20463870000.0,17,-119.270414,36.373627


In [74]:
# Order the missing-persons cases by state FIPS code.
# sort_values keeps each row's original index label (visible in the preview below)
missing_df = missing_df.sort_values(by='State_FIPS')
missing_df.head()

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS
0,MP2341,5/8/1999,Reynolds,James,28,Opp,Autauga,Alabama,Male,Black / African American,2/14/2020,1
151,MP5734,2/15/2002,Brown,Tamborah,22,Mobile,Mobile,Alabama,Male,Black / African American,5/7/2019,1
152,MP33530,2/10/1999,Brown,Anthony,21,Mobile,Mobile,Alabama,Male,Black / African American,3/4/2020,1
153,MP35096,4/4/1996,Powe,Edgar,31,Mobile,Mobile,Alabama,Male,Black / African American,3/4/2020,1
154,MP64666,1/16/2020,Edwards,Katrina,16,Mobile,Mobile,Alabama,Female,White / Caucasian,3/4/2020,1


In [75]:
# Order the unclaimed-persons cases by state FIPS code (index labels are kept as-is)
unclaimed_df = unclaimed_df.sort_values(by='State_FIPS')
unclaimed_df.head()

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS
2516,UCP5127,10/28/2017,Elmore,Jeanne,Female,White / Caucasian,Birmingham,Jefferson,Alabama,1/11/2019,1
5382,UCP2769,5/26/2013,Black,Billy,Male,White / Caucasian,Birmingham,Jefferson,Alabama,9/20/2018,1
2989,UCP4416,4/22/2017,Harris,Scott,Male,White / Caucasian,Birmingham,Jefferson,Alabama,1/11/2019,1
5095,UCP2760,8/7/2014,Allison,Larry,Male,White / Caucasian,Birmingham,Jefferson,Alabama,9/20/2018,1
5406,UCP2770,4/11/2013,Williams,Darryl,Male,Black / African American,Birmingham,Jefferson,Alabama,9/20/2018,1


In [76]:
# Order the unidentified-persons cases by state FIPS code (index labels are kept as-is)
unidentified_df = unidentified_df.sort_values(by='State_FIPS')
unidentified_df.head()

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS
0,UP51903,3/30/1986,,,Marbury,Autauga,Alabama,Female,White / Caucasian,1/28/2020,1
30,UP15332,9/5/2004,18.0,99.0,Mountain Brook,Jefferson,Alabama,Male,Black / African American,3/18/2020,1
31,UP9834,1/28/2012,3.0,7.0,Opelika,Lee,Alabama,Female,Black / African American,4/8/2020,1
32,UP13161,11/3/1982,29.0,32.0,,Lee,Alabama,Female,Black / African American,7/10/2015,1
33,UP2747,5/7/2008,45.0,60.0,Huntsville,Limestone,Alabama,Male,White / Caucasian,5/30/2012,1


In [77]:
## Create state array
def _records_by_state(cases_df, header):
    """Group the rows of one case database by state FIPS code.

    Args:
        cases_df: dataframe with a 'State_FIPS' column and every column
            named in ``header``.
        header: list of column names to copy into each case record.

    Returns:
        dict mapping each non-null State_FIPS value to a list of case
        dictionaries (keys = ``header``, values = ``str()`` of the cell).
    """
    grouped = {}
    # Visit rows in index-label order so each state's cases come out in the
    # same order as a label-by-label scan, without requiring the index to be
    # exactly 0..n-1.
    ordered = cases_df.sort_index()
    rows = ordered[header].itertuples(index=False, name=None)
    for fips, row in zip(ordered['State_FIPS'], rows):
        if pd.isna(fips):
            # a null FIPS can never equal a centroid's STATEFP
            continue
        record = {column: str(value) for column, value in zip(header, row)}
        grouped.setdefault(fips, []).append(record)
    return grouped


def _build_state_features(centroids_df, case_tables):
    """Build one GeoJSON-style Feature dictionary per state centroid row.

    Args:
        centroids_df: dataframe with NAME, STUSPS, STATEFP, Lon_dd and Lat_dd
            columns.
        case_tables: iterable of (property_key, dataframe, header) triples;
            each dataframe's cases are stored under
            ``feature["properties"][property_key]``.

    Returns:
        list of feature dictionaries, in index-label order of ``centroids_df``.
    """
    # Group every case table once up front instead of rescanning it per state
    grouped_tables = [
        (key, _records_by_state(cases_df, header))
        for key, cases_df, header in case_tables
    ]
    centroids = centroids_df.sort_index()
    centroid_rows = zip(
        centroids['NAME'],
        centroids['STUSPS'],
        centroids['STATEFP'],
        centroids['Lon_dd'],
        centroids['Lat_dd'],
    )
    features = []
    for name, abbr, fips, lon, lat in centroid_rows:
        # copy the list so no two features ever share the same list object
        properties = {
            key: list(grouped.get(fips, [])) for key, grouped in grouped_tables
        }
        properties['filtered'] = []
        features.append({
            "type": "Feature",
            "name": name,
            "name_abbr": abbr,
            "state_FIPS": str(fips),
            "properties": properties,
            "geometry": {"type": "Point", "coordinates": [lon, lat]},
        })
    return features


# The helpers keep loop temporaries out of the notebook's global namespace;
# only state_array is (re)bound by this cell.
state_array = _build_state_features(
    state_centroids_v2_df,
    [
        ('missing', missing_df, missing_header),
        ('unclaimed', unclaimed_df, unclaimed_header),
        ('unidentified', unidentified_df, unidentified_header),
    ],
)
# state_array

In [81]:
# Count the features in the state array (compare with the centroid row count)
len(state_array)
# state_array[55]['properties']['unclaimed']

56

In [82]:
# Wrap the per-state features in a GeoJSON FeatureCollection
state_feature_collection = {
    "type": "FeatureCollection",
    "features": state_array,
}
# state_feature_collection

In [83]:
# Serialize the FeatureCollection to a JSON-formatted string
state_geojson = geojson.dumps(state_feature_collection)
# check the type to make sure the conversion was successful (expect str)
print(type(state_geojson))

<class 'str'>


In [84]:
# Save the JSON-formatted FeatureCollection string as a new UTF-8 encoded JSON file
with open('JSON/state_geojson.json', 'w', encoding='utf-8') as json_file:
    json_file.write(state_geojson)

## Part 2 - OLD METHOD: Get state-level JSONs for all 3 databases

### 2b: Join each database dataframe to the state_centroids dataframe

In [None]:
# Preview the first rows of the original state centroids dataframe (state_centroids_df)
state_centroids_df.head()

#### 1/4: Join state centroids dataframe to missing persons dataframe
##### (assign each row the coordinates of a state centroid based on the state fips code)

In [None]:
# Preview the first rows of the missing-persons dataframe
missing_df.head()

In [None]:
# Row count of missing_df; the joined dataframe should have the same length
len(missing_df)

In [None]:
# Left-join the state centroid columns onto every missing-persons row by state FIPS code.
# validate='many_to_one' makes pandas raise a MergeError if STATEFP is duplicated in
# state_centroids_df, so the result is guaranteed to keep the length of missing_df
missing_states_df = pd.merge(
    left=missing_df,
    right=state_centroids_df,
    how='left',
    left_on='State_FIPS',
    right_on='STATEFP',
    validate='many_to_one',
)
# len(missing_states_df)
missing_states_df.head()

#### 2/4: Join state centroids dataframe to unclaimed persons dataframe
##### (assign each row the coordinates of a state centroid based on the state fips code)

In [None]:
# Preview the first rows of the unclaimed-persons dataframe
unclaimed_df.head()

In [None]:
# Row count of unclaimed_df; the joined dataframe should have the same length
len(unclaimed_df)

In [None]:
# Left-join the state centroid columns onto every unclaimed-persons row by state FIPS code.
# validate='many_to_one' makes pandas raise a MergeError if STATEFP is duplicated in
# state_centroids_df, so the result is guaranteed to keep the length of unclaimed_df
unclaimed_states_df = pd.merge(
    left=unclaimed_df,
    right=state_centroids_df,
    how='left',
    left_on='State_FIPS',
    right_on='STATEFP',
    validate='many_to_one',
)
# len(unclaimed_states_df)
unclaimed_states_df.head()

In [None]:
# make sure int STATEFP joined ok to float State_FIPS [ NOTE: it joined fine]
# unclaimed_states_df[['State_FIPS', 'STATEFP']]

In [None]:
## Double check rows w/ no FIPS to make sure the unclaimed_df data was retained [NOTE: it was!]
unc_states_test_df = unclaimed_states_df[unclaimed_states_df['State_FIPS'].isna()]
unc_states_test_df

#### 3/4: Join state centroids dataframe to unidentified persons dataframe
##### (assign each row the coordinates of a state centroid based on the state fips code)

In [None]:
# Preview the first rows of the unidentified-persons dataframe
unidentified_df.head()

In [None]:
# Row count of unidentified_df; the joined dataframe should have the same length
len(unidentified_df)

In [None]:
# Left-join the state centroid columns onto every unidentified-persons row by state FIPS code.
# validate='many_to_one' makes pandas raise a MergeError if STATEFP is duplicated in
# state_centroids_df, so the result is guaranteed to keep the length of unidentified_df
unidentified_states_df = pd.merge(
    left=unidentified_df,
    right=state_centroids_df,
    how='left',
    left_on='State_FIPS',
    right_on='STATEFP',
    validate='many_to_one',
)
# len(unidentified_states_df)
unidentified_states_df.head()

#### 4/4: Convert dataframes to geodataframes and export as GeoJSON files

##### 4a: missing_states_df

In [None]:
# Preview the joined missing-persons + state-centroid dataframe
missing_states_df.head()

In [None]:
# Turn the joined missing-persons table into a geodataframe, with one point
# per row built from the centroid longitude/latitude columns
missing_states_gdf = gpd.GeoDataFrame(
    missing_states_df,
    geometry=gpd.points_from_xy(
        x=missing_states_df['Lon_dd'],
        y=missing_states_df['Lat_dd'],
    ),
)
# len(missing_states_gdf)
missing_states_gdf.head()

In [None]:
# Write the missing-persons geodataframe to a GeoJSON file
missing_states_gdf.to_file("JSON/missing_states.json", driver="GeoJSON", encoding='utf-8')

##### 4b: unclaimed_states_df

In [None]:
# Preview the joined unclaimed-persons + state-centroid dataframe
unclaimed_states_df.head()

In [None]:
# Turn the joined unclaimed-persons table into a geodataframe, with one point
# per row built from the centroid longitude/latitude columns
unclaimed_states_gdf = gpd.GeoDataFrame(
    unclaimed_states_df,
    geometry=gpd.points_from_xy(
        x=unclaimed_states_df['Lon_dd'],
        y=unclaimed_states_df['Lat_dd'],
    ),
)
# len(unclaimed_states_gdf)
unclaimed_states_gdf.head()

In [None]:
# Write the unclaimed-persons geodataframe to a GeoJSON file
unclaimed_states_gdf.to_file("JSON/unclaimed_states.json", driver="GeoJSON", encoding='utf-8')

##### 4c: unidentified_states_df

In [None]:
# Preview the joined unidentified-persons + state-centroid dataframe
unidentified_states_df.head()

In [None]:
# Turn the joined unidentified-persons table into a geodataframe, with one point
# per row built from the centroid longitude/latitude columns
unidentified_states_gdf = gpd.GeoDataFrame(
    unidentified_states_df,
    geometry=gpd.points_from_xy(
        x=unidentified_states_df['Lon_dd'],
        y=unidentified_states_df['Lat_dd'],
    ),
)
# len(unidentified_states_gdf)
unidentified_states_gdf.head()

In [None]:
# Write the unidentified-persons geodataframe to a GeoJSON file
unidentified_states_gdf.to_file("JSON/unidentified_states.json", driver="GeoJSON", encoding='utf-8')

#### Check that GeoJSON files load properly

In [None]:
# Reload the exported missing-persons GeoJSON and count its rows to confirm the file parses
test_miss_df = gpd.read_file('JSON/missing_states.json')
len(test_miss_df)
# test_miss_df.head()

In [None]:
# Reload the exported unclaimed-persons GeoJSON and count its rows to confirm the file parses
test_unc_df = gpd.read_file('JSON/unclaimed_states.json')
len(test_unc_df)
# test_unc_df.head()

In [None]:
# Reload the exported unidentified-persons GeoJSON and count its rows to confirm the file parses
test_uni_df = gpd.read_file('JSON/unidentified_states.json')
len(test_uni_df)
# test_uni_df.head()

# Part 3: Get county-level JSONs for all 3 databases

# ADD IN STATE NAME TO COUNTY JSON

#### Add in County FIPS column to county centroids dataframe

In [None]:
# Count the rows in the county centroids dataframe
# county_centroids_df.head()
len(county_centroids_df)

In [None]:
# county_subset = county_centroids_df.loc[county_centroids_df['STATEFP']==35]
# county_subset

In [None]:
# Add a compound "<state FIPS>_<county name>" key column, so a county name
# is only ever matched within its own state
county_centroids_df['County_Key'] = county_centroids_df['STATEFP'].astype(str) + "_" + county_centroids_df['NAME']
county_centroids_df.head()

In [None]:
# Count the distinct County_Key values. If this is smaller than
# len(county_centroids_df), at least two centroid rows share the same key
county_key_list = county_centroids_df['County_Key'].unique()
len(county_key_list)

In [None]:
# Build the lookup from County_Key to county FIPS code (GEOID field).
# If a key occurs more than once, dict() keeps only the last GEOID seen for it
county_dict = dict(
    zip(county_centroids_df['County_Key'], county_centroids_df['GEOID'])
)
# county_dict

#### Add in County FIPS column to missing dataframe

In [None]:
# Preview missing_df before adding the county key columns
missing_df.head()

In [None]:
# Add a compound "<state FIPS>_<county name>" key column, built the same way
# as county_centroids_df['County_Key'] so the two can be matched
# NOTE(review): relies on State_FIPS being an integer dtype so astype(str)
# yields "17" rather than "17.0" — confirm
missing_df['State_County'] = missing_df['State_FIPS'].astype(str) + "_" + missing_df['County']
missing_df.head()

In [None]:
# Add a County_FIPS column by looking up each row's State_County key in county_dict
# (keys that are not in county_dict come back as NaN)
missing_df['County_FIPS'] = missing_df['State_County'].map(county_dict)
# Check unique values in new dataframe field
missing_df['County_FIPS'].unique()

In [None]:
# Collect the rows that received no county FIPS [NOTE: all ok - no city or county assigned]
missing_county_null_df = missing_df[missing_df['County_FIPS'].isna()]
# missing_county_null_df.shape
missing_county_null_df

In [None]:
# As needed, export nulls to address
# missing_county_null_df.to_csv('missing_county_nulls.csv', encoding='Windows-1252')

In [None]:
# Preview missing_df with the new State_County and County_FIPS columns
missing_df.head()

#### Add in County FIPS column to unclaimed dataframe

In [None]:
# Preview unclaimed_df before adding the county key columns
unclaimed_df.head()

In [None]:
# Add a compound "<state FIPS>_<county name>" key column, built the same way
# as county_centroids_df['County_Key'] so the two can be matched
# NOTE(review): relies on State_FIPS being an integer dtype so astype(str)
# yields "17" rather than "17.0" — confirm
unclaimed_df['State_County'] = unclaimed_df['State_FIPS'].astype(str) + "_" + unclaimed_df['County']
unclaimed_df.head()

In [None]:
# Add a County_FIPS column by looking up each row's State_County key in county_dict
# (keys that are not in county_dict come back as NaN)
unclaimed_df['County_FIPS'] = unclaimed_df['State_County'].map(county_dict)
# Check unique values in new dataframe field
unclaimed_df['County_FIPS'].unique()

In [None]:
# Collect the unclaimed rows that received no county FIPS
unclaimed_county_null_df = unclaimed_df[unclaimed_df['County_FIPS'].isna()]
# unclaimed_county_null_df.shape
unclaimed_county_null_df

In [None]:
# As needed, export nulls to address [NOTE: all have been addressed. 1241 cases have no county]
# unclaimed_county_null_df.to_csv('unclaimed_county_nulls.csv', encoding='Windows-1252')

In [None]:
# Preview unclaimed_df with the new State_County and County_FIPS columns
unclaimed_df.head()

#### Add in County FIPS column to unidentified dataframe

In [None]:
# Preview unidentified_df before adding the county key columns
unidentified_df.head()

In [None]:
# Add a compound "<state FIPS>_<county name>" key column, built the same way
# as county_centroids_df['County_Key'] so the two can be matched
# NOTE(review): relies on State_FIPS being an integer dtype so astype(str)
# yields "17" rather than "17.0" — confirm
unidentified_df['State_County'] = unidentified_df['State_FIPS'].astype(str) + "_" + unidentified_df['County']
unidentified_df.head()

In [None]:
# Add a County_FIPS column by looking up each row's State_County key in county_dict
# (keys that are not in county_dict come back as NaN)
unidentified_df['County_FIPS'] = unidentified_df['State_County'].map(county_dict)
# Check unique values in new dataframe field
unidentified_df['County_FIPS'].unique()

In [None]:
# Collect the unidentified rows that received no county FIPS
unidentified_county_null_df = unidentified_df[unidentified_df['County_FIPS'].isna()]
# unidentified_county_null_df.shape
# len(unidentified_county_null_df)
unidentified_county_null_df

In [None]:
# As needed, export nulls to address [NOTE - all have been addressed - 28 remain with no city or county]
# unidentified_county_null_df.to_csv('unidentified_county_nulls.csv', encoding='Windows-1252')

In [None]:
# Preview unidentified_df with the new State_County and County_FIPS columns
unidentified_df.head()