TO DO:
1. ~Fix counties for unclaimed~
2. ~Fix counties for unidentified~
3. ~Add line to state_centroids with south pole coordinates and nonsense FIPS code~
4. ~Re-export state-level json (now that county fields have been updated within the databases and American Samoa has been removed)~
    * ~MAKE SURE TO MAP NAS to nonsense FIPS~
5. ~Add to county centroids:~
    * ~55 lines with south pole coordinates and nonsense county FIPS codes (state only_999)~
    * ~1 line with south pole coordinates and nonsense FIPS code (99)~
6. Format county data - state name, and then name and county FIPS code, to get GEOID
7. Export county-level json - 
    * any records with no county get pulled (either into separate geojson, or to separate state FIPS with no county key and nonsense coordinates [south pole])
    * make sure state name included as field with each database, not just FIPS code
8. ~Re-export summary count JSON (as a few cases have been deleted)~
    * ~Address 21 NAs for Unclaimed?~
9. See how bad city data would be
    * likely need to make all city names .lower

In [1]:
# import necessary packages
import pandas as pd
import geopandas as gpd
import numpy as np
import json
import geojson

Notes:
1. Edits to Missing_04182020.csv prior to import
  * Virgin Islands (to United States Virgin Islands), 
  * Tennesse (to Tennessee), and 
  * Northern Mariana Islands (to Commonwealth of the Northern Mariana Islands)
  * Address all county nulls
2. Edits to Unclaimed_0418202.csv priort to import
  * Address all county nulls
3. Edits to Unidentified_04182020.csv prior to import
  * Virgin Islands (to United States Virgin Islands)
  * Address all county nulls
4. Edits to state_centroids_v2 prior to import
  * Add one row with south pole coordinates and nonsense FIPS code(99) - for cases w/ no city, county, or state
5. Edits to county_centroids_v2 prior to import
  * Add one row with south pole coordinates and nonsense FIPS code(99) - for cases w/ no city, county, or state
  * Add 55 rows with south pole coordinates and nonsense county FIPS codes(999) - for cases w/ no city or county

In [2]:
# Read in csvs
city_df = pd.read_csv('cities.csv')

In [3]:
state_centroids_df = pd.read_csv('state_centroids.csv')

In [4]:
# alternate centroids - with None option with FIPS 99
state_centroids_v2_df = pd.read_csv('state_centroids_v2.csv')

In [5]:
county_centroids_df = pd.read_csv('county_centroids.csv', encoding='Windows-1252')

In [6]:
# alternate centroids - with None options with count FIPS 999
county_centroids_v2_df = pd.read_csv('county_centroids_v2.csv', encoding='Windows-1252')

In [7]:
missing_df = pd.read_csv('Missing_04182020.csv')

In [8]:
unclaimed_df = pd.read_csv('Unclaimed_04182020.csv')

In [9]:
unidentified_df = pd.read_csv('Unidentified_04182020.csv')

In [10]:
# check dataframe
county_centroids_v2_df.head()

Unnamed: 0,OBJECTID,STATEFP,COUNTYFP,COUNTYNS,AFFGEOID,GEOID,NAME,LSAD,ALAND,AWATER,ORIG_FID,Lon_dd,Lat_dd
0,1,21,7,516850,0500000US21007,21007,Ballard,6,639387500.0,69473325,1,-88.999262,37.058489
1,2,21,17,516855,0500000US21017,21017,Bourbon,6,750439400.0,4829777,2,-84.217155,38.206742
2,3,21,31,516862,0500000US21031,21031,Butler,6,1103572000.0,13943044,3,-86.681628,37.207292
3,4,21,65,516879,0500000US21065,21065,Estill,6,655509900.0,6516335,4,-83.964316,37.692451
4,5,21,69,516881,0500000US21069,21069,Fleming,6,902727200.0,7182793,5,-83.69666,38.370126


# PART 1: Summary data (count for all 3 databases, by state)

### 1/4: Get count of missing person cases

In [None]:
missing_count = missing_df.groupby('State').count()
len(missing_count)
missing_count.head()

In [None]:
# Remove all columns except case number count
missing_count = missing_count.drop(columns=['DLC','Last Name', 'First Name', 'Missing Age', 'City', 'County', 'Sex', 'Race / Ethnicity', 'Date Modified'])

In [None]:
# add column for state (since state is now index)
missing_count['State'] = missing_count.index

In [None]:
# rename case number count column
missing_count = missing_count.rename(columns = {'Case Number': 'Missing_CaseCount'}, inplace = False)

In [None]:
# check dataframe
missing_count.head()

In [None]:
# make dictionary of states and missing person counts
missing_dict = dict(zip(missing_count.State, missing_count.Missing_CaseCount))
len(missing_dict)
# missing_dict

In [None]:
# Get count of values in missing database w/ no state assigned
mis_null_series = missing_df.loc[missing_df['State'].isnull()].count()
mis_null_ct = mis_null_series['Case Number']
mis_null_ct

### 2/4: Get count of unclaimed persons

In [None]:
unclaimed_count = unclaimed_df.groupby('State').count()
# len(unclaimed_count)
unclaimed_count.head()

In [None]:
# Remove all columns except case number count
unclaimed_count = unclaimed_count.drop(columns=['DBF','Last Name', 'First Name', 'Sex', 'Race / Ethnicity', 'City', 'County', 'Date Modified'])

In [None]:
# add column for state (since state is now index)
unclaimed_count['State'] = unclaimed_count.index

In [None]:
# rename case number count column
unclaimed_count = unclaimed_count.rename(columns = {'Case Number': 'Unclaimed_CaseCount'}, inplace = False)

In [None]:
# check dataframe
unclaimed_count.head()

In [None]:
# make dictionary of states and unclaimed case counts
unclaimed_dict = dict(zip(unclaimed_count.State, unclaimed_count.Unclaimed_CaseCount))
len(unclaimed_dict)
# unclaimed_dict

In [None]:
# Get count of values in unclaimed database w/ no state assigned
unc_null_series = unclaimed_df.loc[unclaimed_df['State'].isnull()].count()
unc_null_ct = unc_null_series['Case Number']
unc_null_ct

### 3/4: Get count of unidentified persons

In [None]:
unidentified_count = unidentified_df.groupby('State').count()
# len(unidentified_count)
unidentified_count.head()

In [None]:
# Remove all columns except case number count
unidentified_count = unidentified_count.drop(columns=['DBF','Age From', 'Age To', 'City', 'County', 'Sex', 'Race / Ethnicity', 'Date Modified'])

In [None]:
# add column for state (since state is now index)
unidentified_count['State'] = unidentified_count.index

In [None]:
# rename case number count column
unidentified_count = unidentified_count.rename(columns = {'Case Number': 'Unidentified_CaseCount'}, inplace = False)

In [None]:
# check dataframe
unidentified_count.head()

In [None]:
# make dictionary of states and unidentified case counts
unidentified_dict = dict(zip(unidentified_count.State, unidentified_count.Unidentified_CaseCount))
len(unidentified_dict)
# unidentified_dict

In [None]:
# Get count of values in unidentified database w/ no state assigned
uni_null_series = unidentified_df.loc[unidentified_df['State'].isnull()].count()
uni_null_ct = uni_null_series['Case Number']
uni_null_ct

### 4/4: Make summary dataframe

In [None]:
# make a new summary dataframe based on the state centroids
summary_df = state_centroids_v2_df
# sort by state name
summary_df = summary_df.sort_values(by=['STATEFP'])
summary_df.head()

In [None]:
len(summary_df)

#### 4a: Add Missing Person count for each state

In [None]:
# Add new column to dataframe, using the state name field as a key in the missing_dict, to pull the correct missing case count for each state
summary_df['Missing_Count'] = summary_df['NAME'].map(missing_dict)
summary_df.head()

#### 4b: Add Unclaimed Person count for each state

In [None]:
# Add new column to dataframe, using the state name field as a key in the unclaimed_dict, to pull the correct unclaimed case count for each state
summary_df['Unclaimed_Count'] = summary_df['NAME'].map(unclaimed_dict)
summary_df.head()

In [None]:
# Check null values [NOTE: seems fine to have nulls]
unclaimed_null_df = summary_df.loc[summary_df['Unclaimed_Count'].isnull()]
unclaimed_null_df

In [None]:
# # change NaN Unclaimed count for fake FIPS 99 to count of values in missing database w/ no state assigned (21)
index_Series = summary_df.loc[summary_df['STATEFP']==99]
index_None = index_Series.index[0]
summary_df.loc[index_None, 'Unclaimed_Count'] = unc_null_ct
# check value
summary_df['Unclaimed_Count'][index_None]

#### 4b: Add Unidentified Person count for each state

In [None]:
# Add new column to dataframe, using the state name field as a key in the unidentified_dict, to pull the correct unidentified case count for each state
summary_df['Unidentified_Count'] = summary_df['NAME'].map(unidentified_dict)
summary_df.head()

In [None]:
# Check null values [NOTE: seems fine to have nulls]
unidentified_null_df = summary_df.loc[summary_df['Unidentified_Count'].isnull()]
unidentified_null_df

#### 4c: Add column for total count of cases in all three databases for each state

In [None]:
summary_df.head()

In [None]:
summary_df.iloc[:, -3]

In [None]:
# Note - may need to change -4 to -3, run, then change back to -4 and re-run. DON'T KNOW WHY
summary_df['Total_Count'] = summary_df.iloc[:, -4:-1].sum(axis=1)
summary_df.head(56)

#### 4c: Convert to geodataframe and export as GeoJSON

In [None]:
# check final summary_df
summary_df.head()

In [None]:
# Convert dataframe to geodataframe
summary_gdf = gpd.GeoDataFrame(summary_df, geometry=gpd.points_from_xy(x=summary_df.Lon_dd, y=summary_df.Lat_dd))
summary_gdf.head()

In [None]:
# write to geoJSON
summary_gdf.to_file("JSON/summary_counts.json", driver="GeoJSON", encoding='utf-8')

# Part 2 - Get state-level GeoJSON with data from all 3 databases

### 2a: Add in State FIPS column to each database dataframe

In [11]:
# make dictionary of states and state FIPS code
state_dict = dict(zip(state_centroids_df.NAME, state_centroids_df.STATEFP))
# state_dict
# state_dict['Alaska']
len(state_dict)

55

#### 1/3: add state FIPS codes to missing persons dataframe

In [12]:
missing_df.head()

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified
0,MP2341,5/8/1999,Reynolds,James,28,Opp,Autauga,Alabama,Male,Black / African American,2/14/2020
1,MP8522,1/15/2010,Walker,Jeremy,32,Prattville,Autauga,Alabama,Male,White / Caucasian,7/27/2015
2,MP50314,5/16/2018,Keszthelyi,Steve,79,Elberta,Baldwin,Alabama,Male,White / Caucasian,11/13/2019
3,MP43187,2/20/2018,johnson,abram,60,fairhope,Baldwin,Alabama,Male,White / Caucasian,2/13/2020
4,MP23019,11/5/2012,Shroyer,James,40,Fairhope,Baldwin,Alabama,Male,White / Caucasian,2/3/2016


In [13]:
# Add new column to dataframe, using the state name field as a key in the state_dict, to pull the correct FIPS code for each row
missing_df['State_FIPS'] = missing_df['State'].map(state_dict)
missing_df.head()

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS
0,MP2341,5/8/1999,Reynolds,James,28,Opp,Autauga,Alabama,Male,Black / African American,2/14/2020,1
1,MP8522,1/15/2010,Walker,Jeremy,32,Prattville,Autauga,Alabama,Male,White / Caucasian,7/27/2015,1
2,MP50314,5/16/2018,Keszthelyi,Steve,79,Elberta,Baldwin,Alabama,Male,White / Caucasian,11/13/2019,1
3,MP43187,2/20/2018,johnson,abram,60,fairhope,Baldwin,Alabama,Male,White / Caucasian,2/13/2020,1
4,MP23019,11/5/2012,Shroyer,James,40,Fairhope,Baldwin,Alabama,Male,White / Caucasian,2/3/2016,1


In [14]:
# Check unique values in new dataframe field
missing_df['State_FIPS'].unique()

array([ 1,  2,  4,  5,  6,  8, 69,  9, 10, 11, 12, 13, 66, 15, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
       36, 37, 38, 39, 40, 41, 42, 72, 44, 45, 46, 47, 48, 78, 49, 50, 51,
       53, 54, 55, 56], dtype=int64)

In [15]:
## If any nulls, check to see what is producing null values [NOTE: fixed by making changes to csv fields, as noted where csvs are imported]
mis_test_df = missing_df.loc[missing_df['State_FIPS'].isnull()]
mis_test_df.head()

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS


In [16]:
# check missing df
missing_df.head()
# len(missing_df)

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS
0,MP2341,5/8/1999,Reynolds,James,28,Opp,Autauga,Alabama,Male,Black / African American,2/14/2020,1
1,MP8522,1/15/2010,Walker,Jeremy,32,Prattville,Autauga,Alabama,Male,White / Caucasian,7/27/2015,1
2,MP50314,5/16/2018,Keszthelyi,Steve,79,Elberta,Baldwin,Alabama,Male,White / Caucasian,11/13/2019,1
3,MP43187,2/20/2018,johnson,abram,60,fairhope,Baldwin,Alabama,Male,White / Caucasian,2/13/2020,1
4,MP23019,11/5/2012,Shroyer,James,40,Fairhope,Baldwin,Alabama,Male,White / Caucasian,2/3/2016,1


#### 2/3: add state FIPS codes to unclaimed persons dataframe

In [17]:
unclaimed_df.head()
# len(unclaimed_df)

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified
0,UCP68231,4/15/2020,Sanders,Stephen,Male,White / Caucasian,Joliet,Will,Illinois,4/16/2020
1,UCP68248,4/13/2020,Brookshire,Renee,Female,White / Caucasian,Joliet,Will,Illinois,4/16/2020
2,UCP68242,4/9/2020,Smith,Victoria,Female,White / Caucasian,Manhattan,New York,New York,4/16/2020
3,UCP68244,4/9/2020,Bellamy,Eldred,Male,Black / African American,Brooklyn,Kings,New York,4/16/2020
4,UCP68228,4/9/2020,Rodriguez,Elido,Male,Hispanic / Latino,Manhattan,New York,New York,4/15/2020


In [18]:
# Add new column to dataframe, using the state name field as a key in the state_dict, to pull the correct FIPS code for each row
unclaimed_df['State_FIPS'] = unclaimed_df['State'].map(state_dict)
unclaimed_df.head()

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS
0,UCP68231,4/15/2020,Sanders,Stephen,Male,White / Caucasian,Joliet,Will,Illinois,4/16/2020,17.0
1,UCP68248,4/13/2020,Brookshire,Renee,Female,White / Caucasian,Joliet,Will,Illinois,4/16/2020,17.0
2,UCP68242,4/9/2020,Smith,Victoria,Female,White / Caucasian,Manhattan,New York,New York,4/16/2020,36.0
3,UCP68244,4/9/2020,Bellamy,Eldred,Male,Black / African American,Brooklyn,Kings,New York,4/16/2020,36.0
4,UCP68228,4/9/2020,Rodriguez,Elido,Male,Hispanic / Latino,Manhattan,New York,New York,4/15/2020,36.0


In [19]:
# Check unique values in new dataframe field
unclaimed_df['State_FIPS'].unique()

array([17., 36., 25.,  6., 53., 15., 32., 16., 48., 49., 47., 11., 33.,
       35., 40., 42., 39., 29.,  5., 13.,  1., 34.,  4., 12., 55.,  9.,
        8., 26., 22., 18., 19., 41., 23., nan, 45., 56., 54., 51.,  2.,
       28., 21., 37., 10., 38.])

In [20]:
## If any nulls, check to see what is producing null values [NOTE: cannot be addressed, as these cases do not have a state or county assigned]
unc_test_df = unclaimed_df.loc[unclaimed_df['State_FIPS'].isnull()]
# len(unc_test_df)
unc_test_df

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS
3380,UCP5339,12/3/2016,Williams,Barbara,Female,Asian,,,,5/3/2018,
4774,UCP2489,3/26/2015,Lupien,Daryl,Male,,,,,8/18/2016,
5023,UCP4001,11/2/2014,Brown,David,Male,White / Caucasian,,,,5/11/2017,
5883,UCP3962,3/22/2010,Gonzalez,David,Male,Hispanic / Latino,,,,5/11/2017,
5950,UCP3937,12/7/2009,Togovnick,Bernice,Female,White / Caucasian,,,,5/11/2017,
6183,UCP3901,1/21/2009,Carney,John,Male,White / Caucasian,,,,5/11/2017,
6341,UCP3689,6/5/2008,Guzman,Ralph,Male,Hispanic / Latino,,,,5/2/2017,
6560,UCP3824,7/2/2007,Vasquez,Rafael,Male,Hispanic / Latino,,,,5/4/2017,
6698,UCP3672,9/22/2006,Barajas,Epifanio,Male,Hispanic / Latino,,,,5/2/2017,
6719,UCP3668,8/5/2006,Huff,Paul,Male,White / Caucasian,,,,5/2/2017,


In [21]:
# # change NaN FIPS codes (for cases with no city, county or state) to 99 (none option in state_centroids_v2.csv)
unclaimed_df['State_FIPS'] = unclaimed_df['State_FIPS'].fillna(99)
unclaimed_df['State_FIPS'] = unclaimed_df['State_FIPS'].astype(int)
unclaimed_df.head()

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS
0,UCP68231,4/15/2020,Sanders,Stephen,Male,White / Caucasian,Joliet,Will,Illinois,4/16/2020,17
1,UCP68248,4/13/2020,Brookshire,Renee,Female,White / Caucasian,Joliet,Will,Illinois,4/16/2020,17
2,UCP68242,4/9/2020,Smith,Victoria,Female,White / Caucasian,Manhattan,New York,New York,4/16/2020,36
3,UCP68244,4/9/2020,Bellamy,Eldred,Male,Black / African American,Brooklyn,Kings,New York,4/16/2020,36
4,UCP68228,4/9/2020,Rodriguez,Elido,Male,Hispanic / Latino,Manhattan,New York,New York,4/15/2020,36


In [22]:
## Double check re-assigned values
unc_test_df2 = unclaimed_df.loc[unclaimed_df['State_FIPS']==99]
unc_test_df2
# len(unc_test_df2)

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS
3380,UCP5339,12/3/2016,Williams,Barbara,Female,Asian,,,,5/3/2018,99
4774,UCP2489,3/26/2015,Lupien,Daryl,Male,,,,,8/18/2016,99
5023,UCP4001,11/2/2014,Brown,David,Male,White / Caucasian,,,,5/11/2017,99
5883,UCP3962,3/22/2010,Gonzalez,David,Male,Hispanic / Latino,,,,5/11/2017,99
5950,UCP3937,12/7/2009,Togovnick,Bernice,Female,White / Caucasian,,,,5/11/2017,99
6183,UCP3901,1/21/2009,Carney,John,Male,White / Caucasian,,,,5/11/2017,99
6341,UCP3689,6/5/2008,Guzman,Ralph,Male,Hispanic / Latino,,,,5/2/2017,99
6560,UCP3824,7/2/2007,Vasquez,Rafael,Male,Hispanic / Latino,,,,5/4/2017,99
6698,UCP3672,9/22/2006,Barajas,Epifanio,Male,Hispanic / Latino,,,,5/2/2017,99
6719,UCP3668,8/5/2006,Huff,Paul,Male,White / Caucasian,,,,5/2/2017,99


In [23]:
# check full dataframe
len(unclaimed_df)

8335

#### 3/3: add state FIPS codes to unidentified persons dataframe

In [24]:
unidentified_df.head()

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified
0,UP51903,3/30/1986,,,Marbury,Autauga,Alabama,Female,White / Caucasian,1/28/2020
1,UP5000,9/29/2007,40.0,60.0,Prattville,Autauga,Alabama,Female,Black / African American,8/12/2019
2,UP6519,1/6/2006,45.0,59.0,Elberta,Baldwin,Alabama,Male,White / Caucasian,10/24/2018
3,UP55409,3/26/2000,,,,Baldwin,Alabama,Male,"White / Caucasian, Asian, American Indian / Al...",2/13/2020
4,UP13483,3/27/1961,14.0,17.0,,Bibb,Alabama,Male,White / Caucasian,3/25/2019


In [25]:
# Add new column to dataframe, using the state name field as a key in the state_dict, to pull the correct FIPS code for each row
unidentified_df['State_FIPS'] = unidentified_df['State'].map(state_dict)
unidentified_df.head()

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS
0,UP51903,3/30/1986,,,Marbury,Autauga,Alabama,Female,White / Caucasian,1/28/2020,1
1,UP5000,9/29/2007,40.0,60.0,Prattville,Autauga,Alabama,Female,Black / African American,8/12/2019,1
2,UP6519,1/6/2006,45.0,59.0,Elberta,Baldwin,Alabama,Male,White / Caucasian,10/24/2018,1
3,UP55409,3/26/2000,,,,Baldwin,Alabama,Male,"White / Caucasian, Asian, American Indian / Al...",2/13/2020,1
4,UP13483,3/27/1961,14.0,17.0,,Bibb,Alabama,Male,White / Caucasian,3/25/2019,1


In [26]:
# Check unique values in new dataframe field
unidentified_df['State_FIPS'].unique()

array([ 1,  2,  4,  5,  6,  8,  9, 10, 11, 12, 13, 66, 15, 16, 17, 18, 19,
       20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
       37, 38, 39, 40, 41, 42, 72, 44, 45, 46, 47, 48, 78, 49, 50, 51, 53,
       54, 55, 56], dtype=int64)

In [27]:
## If any nulls, check to see what is producing null values [NOTE: fixed by making changes to csv fields, as noted where csvs are imported]
uni_test_df = unidentified_df.loc[unidentified_df['State_FIPS'].isnull()]
uni_test_df.head()

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS


### Construct GeoJSON with correct structure

Goal format:
{
"type": "Feature",
    "name": "Wisconsin",
    "properties": {
        "missing": [ ],
        "unclaimed": [ ],
        "unidentified": [ ],
        "filtered": [ ]
    }
    "geometry": {
          "type": "Point",
          "coordinates": [
            -117.79750667,
            36.03755926
          ]
}
* each array will be a list of dictionaries. Each dictionary = one case. Keys = headers

In [None]:
# get headers
missing_header = list(missing_df.columns.values)
print("missing header:", missing_header)
unclaimed_header = list(unclaimed_df.columns.values)
print("unclaimed header:", unclaimed_header)
unidentified_header = list(unidentified_df.columns.values)
print("unidentified header:", unidentified_header)

In [None]:
# check df
unclaimed_df.head()

In [None]:
# check data types
type(unclaimed_df['State_FIPS'][0])

In [None]:
# check centroids
state_centroids_v2_df.head()
len(state_centroids_v2_df)

In [None]:
# sort each database by state FIPS
# sort state centroids by state FIPS
state_centroids_v2_df = state_centroids_v2_df.sort_values(by=['STATEFP'])
state_centroids_v2_df.head()

In [None]:
# sort each database by state FIPS
# sort missing by state FIPS
missing_df = missing_df.sort_values(by=['State_FIPS'])
missing_df.head()

In [None]:
# sort unclaimed by state FIPS
unclaimed_df = unclaimed_df.sort_values(by=['State_FIPS'])
unclaimed_df.head()

In [None]:
# sort unidentified by state FIPS
unidentified_df = unidentified_df.sort_values(by=['State_FIPS'])
unidentified_df.head()

In [None]:
## Create state array
state_array = []
## for each state in state_centroids...
i = 0
while i < len(state_centroids_v2_df):
    state_dict = {}
    state_dict["type"] = "Feature"
    state_dict["name"] = state_centroids_v2_df["NAME"][i]
    state_dict["name_abbr"] = state_centroids_v2_df["STUSPS"][i]
    state_dict["state_FIPS"] = str(state_centroids_v2_df["STATEFP"][i])
    state_dict["properties"] = {}
    missing_array = []
    j = 0
    while j < len(missing_df):
        # check if state_fips matches
        if missing_df['State_FIPS'][j] == state_centroids_v2_df["STATEFP"][i]:
            missing_dict = {}
            for item in missing_header:
                missing_dict[item] = str(missing_df[item][j])
            # append dictionary to missing array
            missing_array.append(missing_dict)
        # increment j
        j += 1
    state_dict["properties"]['missing'] = missing_array     
    
    unclaimed_array = []
    k = 0
    while k < len(unclaimed_df):
        # check if state_fips matches
        if unclaimed_df['State_FIPS'][k] == state_centroids_v2_df["STATEFP"][i]:
            unclaimed_dict = {}
            for item in unclaimed_header:
                unclaimed_dict[item] = str(unclaimed_df[item][k])
            # append dictionary to unclaimed array
            unclaimed_array.append(unclaimed_dict)
        # increment k
        k += 1
    state_dict["properties"]['unclaimed'] = unclaimed_array
    
    unidentified_array = []
    l = 0
    while l < len(unidentified_df):
        # check if state_fips matches
        if unidentified_df['State_FIPS'][l] == state_centroids_v2_df["STATEFP"][i]:
            unidentified_dict = {}
            for item in unidentified_header:
                unidentified_dict[item] = str(unidentified_df[item][l])
            # append dictionary to unclaimed array
            unidentified_array.append(unidentified_dict)
        # increment l
        l += 1
    state_dict["properties"]['unidentified'] = unidentified_array
    
    state_dict["properties"]['filtered'] = []
    ## set geometry
    state_dict["geometry"] = {}
    state_dict["geometry"]["type"] = "Point"
    state_dict["geometry"]["coordinates"] = [state_centroids_v2_df['Lon_dd'][i], state_centroids_v2_df['Lat_dd'][i]]
    ## append state dictionary to array
    state_array.append(state_dict)
    # increment interator
    i += 1
# state_array

In [None]:
# check item in array
len(state_array)
# state_array[55]['properties']['unclaimed']

In [None]:
# Create FeatureCollection
state_feature_collection = {}
state_feature_collection["type"] = "FeatureCollection"
state_feature_collection["features"] = state_array
# state_feature_collection

In [None]:
# Convert FeatureCollection to JSON format
state_geojson = geojson.dumps(state_feature_collection)
# check type to make sure conversion was sucessful
print(type(state_geojson))

In [None]:
# Save JSON-formatted FeatureCollection as JSON file
# Save as new json file
with open('JSON/state_geojson.json', 'w', encoding='utf-8') as json_file:
    json_file.write(state_geojson)

## Part 2 - OLD METHOD: Get state-level JSONs for all 3 databases

### 2b: Join each database dataframe to the state_centroids dataframe

In [None]:
# check state centroids dataframe
state_centroids_df.head()

#### 1/4: Join state centroids dataframe to missing persons dataframe
##### (assign each row the coordinates of a state centroid based on the state fips code)

In [None]:
# check dataframe
missing_df.head()

In [None]:
# check dataframe length
len(missing_df)

In [None]:
# join state centroids dataframe to missing persons dataframe - MAKE SURE LENGTH MATCHES LENGTH OF MISSING_DF
missing_states_df = pd.merge(left = missing_df, right = state_centroids_df, how = 'left', left_on = 'State_FIPS', right_on = 'STATEFP')
# len(missing_states_df)
missing_states_df.head()

#### 1/4: Join state centroids dataframe to unclaimed persons dataframe
##### (assign each row the coordinates of a state centroid based on the state fips code)

In [None]:
# check dataframe
unclaimed_df.head()

In [None]:
# check dataframe length
len(unclaimed_df)

In [None]:
# join state centroids dataframe to unclaimed persons dataframe - MAKE SURE LENGTH MATCHES LENGTH OF UNCLAIMED_DF
unclaimed_states_df = pd.merge(left = unclaimed_df, right = state_centroids_df, how = 'left', left_on = 'State_FIPS', right_on = 'STATEFP')
# len(unclaimed_states_df)
unclaimed_states_df.head()

In [None]:
# make sure int STATEFP joined ok to float State_FIPS [ NOTE: it joined fine]
# unclaimed_states_df[['State_FIPS', 'STATEFP']]

In [None]:
## Double check rows w/ no FIPS to make sure missing_df data retained [NOTE: it was!]
unc_states_test_df = unclaimed_states_df.loc[unclaimed_states_df['State_FIPS'].isnull()]
unc_states_test_df

#### 1/4: Join state centroids dataframe to unidentified persons dataframe
##### (assign each row the coordinates of a state centroid based on the state fips code)

In [None]:
# check dataframe
unidentified_df.head()

In [None]:
# check dataframe length
len(unidentified_df)

In [None]:
# join state centroids dataframe to unidentified persons dataframe - MAKE SURE LENGTH MATCHES LENGTH OF UNIDENTIFIED_DF
unidentified_states_df = pd.merge(left = unidentified_df, right = state_centroids_df, how = 'left', left_on = 'State_FIPS', right_on = 'STATEFP')
# len(unidentified_states_df)
unidentified_states_df.head()

#### 4/4: Convert dataframes to geodataframes and export as GeoJSON files

##### 4a: missing_states_df

In [None]:
# check dataframe
missing_states_df.head()

In [None]:
# Convert missing persons dataframe to geodataframe
missing_states_gdf = gpd.GeoDataFrame(missing_states_df, geometry=gpd.points_from_xy(x=missing_states_df.Lon_dd, y=missing_states_df.Lat_dd))
# len(missing_states_gdf)
missing_states_gdf.head()

In [None]:
# write to geoJSON
missing_states_gdf.to_file("JSON/missing_states.json", driver="GeoJSON", encoding='utf-8')

##### 4b: unclaimed_states_df

In [None]:
# check dataframe
unclaimed_states_df.head()

In [None]:
# Convert unclaimed persons dataframe to geodataframe
unclaimed_states_gdf = gpd.GeoDataFrame(unclaimed_states_df, geometry=gpd.points_from_xy(x=unclaimed_states_df.Lon_dd, y=unclaimed_states_df.Lat_dd))
# len(unclaimed_states_gdf)
unclaimed_states_gdf.head()

In [None]:
# write to geoJSON
unclaimed_states_gdf.to_file("JSON/unclaimed_states.json", driver="GeoJSON", encoding='utf-8')

##### 4c: unidentified_states_df

In [None]:
# check dataframe
unidentified_states_df.head()

In [None]:
# Convert unidentified persons dataframe to geodataframe
unidentified_states_gdf = gpd.GeoDataFrame(unidentified_states_df, geometry=gpd.points_from_xy(x=unidentified_states_df.Lon_dd, y=unidentified_states_df.Lat_dd))
# len(unidentified_states_gdf)
unidentified_states_gdf.head()

In [None]:
# write to geoJSON
unidentified_states_gdf.to_file("JSON/unidentified_states.json", driver="GeoJSON", encoding='utf-8')

#### Check that GeoJSON files load properly

In [None]:
test_miss_df = gpd.read_file('JSON/missing_states.json')
len(test_miss_df)
# test_miss_df.head()

In [None]:
test_unc_df = gpd.read_file('JSON/unclaimed_states.json')
len(test_unc_df)
# test_unc_df.head()

In [None]:
test_uni_df = gpd.read_file('JSON/unidentified_states.json')
len(test_uni_df)
# test_uni_df.head()

# Part 3: Get county-level JSONs for all 3 databases

#### Add in State name and County_Key columns to county centroid v2 dataframe

In [28]:
# make dictionary of state FIPS codes and state names
state_FIPS_dict = dict(zip(state_centroids_v2_df.STATEFP, state_centroids_v2_df.NAME))
# state_FIPS_dict
# state_FIPS_dict['Alaska']
len(state_FIPS_dict)

56

In [29]:
# check county centroids v2 df
county_centroids_v2_df.head()

Unnamed: 0,OBJECTID,STATEFP,COUNTYFP,COUNTYNS,AFFGEOID,GEOID,NAME,LSAD,ALAND,AWATER,ORIG_FID,Lon_dd,Lat_dd
0,1,21,7,516850,0500000US21007,21007,Ballard,6,639387500.0,69473325,1,-88.999262,37.058489
1,2,21,17,516855,0500000US21017,21017,Bourbon,6,750439400.0,4829777,2,-84.217155,38.206742
2,3,21,31,516862,0500000US21031,21031,Butler,6,1103572000.0,13943044,3,-86.681628,37.207292
3,4,21,65,516879,0500000US21065,21065,Estill,6,655509900.0,6516335,4,-83.964316,37.692451
4,5,21,69,516881,0500000US21069,21069,Fleming,6,902727200.0,7182793,5,-83.69666,38.370126


In [30]:
# Add new column to dataframe, using the state name field as a key in the state_dict, to pull the correct FIPS code for each row
county_centroids_v2_df['STATE_NAME'] = county_centroids_v2_df['STATEFP'].map(state_FIPS_dict)
# check dataframe
county_centroids_v2_df.head()
# Check unique values in new dataframe field
# county_centroids_v2_df['STATE_NAME'].unique()

Unnamed: 0,OBJECTID,STATEFP,COUNTYFP,COUNTYNS,AFFGEOID,GEOID,NAME,LSAD,ALAND,AWATER,ORIG_FID,Lon_dd,Lat_dd,STATE_NAME
0,1,21,7,516850,0500000US21007,21007,Ballard,6,639387500.0,69473325,1,-88.999262,37.058489,Kentucky
1,2,21,17,516855,0500000US21017,21017,Bourbon,6,750439400.0,4829777,2,-84.217155,38.206742,Kentucky
2,3,21,31,516862,0500000US21031,21031,Butler,6,1103572000.0,13943044,3,-86.681628,37.207292,Kentucky
3,4,21,65,516879,0500000US21065,21065,Estill,6,655509900.0,6516335,4,-83.964316,37.692451,Kentucky
4,5,21,69,516881,0500000US21069,21069,Fleming,6,902727200.0,7182793,5,-83.69666,38.370126,Kentucky


In [31]:
# sort county centroids by state FIPS
county_centroids_v2_df = county_centroids_v2_df.sort_values(by=['STATEFP'])
county_centroids_v2_df.head()

Unnamed: 0,OBJECTID,STATEFP,COUNTYFP,COUNTYNS,AFFGEOID,GEOID,NAME,LSAD,ALAND,AWATER,ORIG_FID,Lon_dd,Lat_dd,STATE_NAME
1043,1044,1,61,161556,0500000US01061,1061,Geneva,6,1487898000.0,11578163,1044,-85.839096,31.095061,Alabama
1255,1256,1,17,161534,0500000US01017,1017,Chambers,6,1545086000.0,16971701,1256,-85.392035,32.914352,Alabama
3228,100000,1,999,9999100,9999999US100000,1999,,0,0.0,0,99991,45.0,-90.0,Alabama
471,472,1,49,161550,0500000US01049,1049,DeKalb,6,2012676000.0,4121543,472,-85.804114,34.459807,Alabama
472,473,1,63,161557,0500000US01063,1063,Greene,6,1675782000.0,33416141,473,-87.952209,32.853154,Alabama


In [32]:
# Add column with compound field key
county_centroids_v2_df['County_Key'] = county_centroids_v2_df['STATEFP'].astype(str) + "_" + county_centroids_v2_df['NAME']
county_centroids_v2_df.head()

Unnamed: 0,OBJECTID,STATEFP,COUNTYFP,COUNTYNS,AFFGEOID,GEOID,NAME,LSAD,ALAND,AWATER,ORIG_FID,Lon_dd,Lat_dd,STATE_NAME,County_Key
1043,1044,1,61,161556,0500000US01061,1061,Geneva,6,1487898000.0,11578163,1044,-85.839096,31.095061,Alabama,1_Geneva
1255,1256,1,17,161534,0500000US01017,1017,Chambers,6,1545086000.0,16971701,1256,-85.392035,32.914352,Alabama,1_Chambers
3228,100000,1,999,9999100,9999999US100000,1999,,0,0.0,0,99991,45.0,-90.0,Alabama,1_None
471,472,1,49,161550,0500000US01049,1049,DeKalb,6,2012676000.0,4121543,472,-85.804114,34.459807,Alabama,1_DeKalb
472,473,1,63,161557,0500000US01063,1063,Greene,6,1675782000.0,33416141,473,-87.952209,32.853154,Alabama,1_Greene


In [33]:
# check unique values and length
county_key_v2_list = county_centroids_v2_df['County_Key'].unique()
len(county_key_v2_list)

3283

In [35]:
# make dictionary of counties and county FIPS code (GEOID field)
county_v2_dict = dict(zip(county_centroids_v2_df.County_Key, county_centroids_v2_df.GEOID))
len(county_v2_dict)

3283

#### Add in County_key column to county centroids dataframe

In [36]:
# sort county centroids by state FIPS
county_centroids_df = county_centroids_df.sort_values(by=['STATEFP'])
county_centroids_df.head()

Unnamed: 0,OBJECTID,STATEFP,COUNTYFP,COUNTYNS,AFFGEOID,GEOID,NAME,LSAD,ALAND,AWATER,ORIG_FID,Lon_dd,Lat_dd
3211,3212,1,121,161586,0500000US01121,1121,Talladega,6,1908257000.0,60926366,3218,-86.16589,33.380086
1194,1195,1,93,161573,0500000US01093,1093,Marion,6,1922657000.0,3184076,1195,-87.88714,34.136562
1255,1256,1,17,161534,0500000US01017,1017,Chambers,6,1545086000.0,16971701,1256,-85.392035,32.914352
1215,1216,1,59,161555,0500000US01059,1059,Franklin,6,1641841000.0,32643981,1216,-87.84374,34.441699
2780,2781,1,127,161589,0500000US01127,1127,Walker,6,2048686000.0,36754696,2785,-87.297329,33.803318


In [37]:
# Add column with compound field key
county_centroids_df['County_Key'] = county_centroids_df['STATEFP'].astype(str) + "_" + county_centroids_df['NAME']
county_centroids_df.head()

Unnamed: 0,OBJECTID,STATEFP,COUNTYFP,COUNTYNS,AFFGEOID,GEOID,NAME,LSAD,ALAND,AWATER,ORIG_FID,Lon_dd,Lat_dd,County_Key
3211,3212,1,121,161586,0500000US01121,1121,Talladega,6,1908257000.0,60926366,3218,-86.16589,33.380086,1_Talladega
1194,1195,1,93,161573,0500000US01093,1093,Marion,6,1922657000.0,3184076,1195,-87.88714,34.136562,1_Marion
1255,1256,1,17,161534,0500000US01017,1017,Chambers,6,1545086000.0,16971701,1256,-85.392035,32.914352,1_Chambers
1215,1216,1,59,161555,0500000US01059,1059,Franklin,6,1641841000.0,32643981,1216,-87.84374,34.441699,1_Franklin
2780,2781,1,127,161589,0500000US01127,1127,Walker,6,2048686000.0,36754696,2785,-87.297329,33.803318,1_Walker


In [38]:
# check unique values and length
county_key_list = county_centroids_df['County_Key'].unique()
len(county_key_list)

3227

In [40]:
# make dictionary of counties and county FIPS code (GEOID field)
county_dict = dict(zip(county_centroids_df.County_Key, county_centroids_df.GEOID))
len(county_dict)

3227

#### Add in County FIPS column to missing dataframe

##### To identify null values, add in County FIPS column to missing dataframe using county_centroids_df

In [41]:
# check missing df
missing_df.head()

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS
0,MP2341,5/8/1999,Reynolds,James,28,Opp,Autauga,Alabama,Male,Black / African American,2/14/2020,1
1,MP8522,1/15/2010,Walker,Jeremy,32,Prattville,Autauga,Alabama,Male,White / Caucasian,7/27/2015,1
2,MP50314,5/16/2018,Keszthelyi,Steve,79,Elberta,Baldwin,Alabama,Male,White / Caucasian,11/13/2019,1
3,MP43187,2/20/2018,johnson,abram,60,fairhope,Baldwin,Alabama,Male,White / Caucasian,2/13/2020,1
4,MP23019,11/5/2012,Shroyer,James,40,Fairhope,Baldwin,Alabama,Male,White / Caucasian,2/3/2016,1


In [42]:
# Add column with compound field key
missing_df['State_County'] = missing_df['State_FIPS'].astype(str) + "_" + missing_df['County']
missing_df.head()

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County
0,MP2341,5/8/1999,Reynolds,James,28,Opp,Autauga,Alabama,Male,Black / African American,2/14/2020,1,1_Autauga
1,MP8522,1/15/2010,Walker,Jeremy,32,Prattville,Autauga,Alabama,Male,White / Caucasian,7/27/2015,1,1_Autauga
2,MP50314,5/16/2018,Keszthelyi,Steve,79,Elberta,Baldwin,Alabama,Male,White / Caucasian,11/13/2019,1,1_Baldwin
3,MP43187,2/20/2018,johnson,abram,60,fairhope,Baldwin,Alabama,Male,White / Caucasian,2/13/2020,1,1_Baldwin
4,MP23019,11/5/2012,Shroyer,James,40,Fairhope,Baldwin,Alabama,Male,White / Caucasian,2/3/2016,1,1_Baldwin


In [43]:
# Add new column to dataframe, using the State_County name field as a key in the county_dict, to pull the correct County FIPS code for each row
missing_df['County_FIPS'] = missing_df['State_County'].map(county_dict)
# Check unique values in new dataframe field
missing_df['County_FIPS'].unique()

array([ 1001.,  1003.,  1005., ..., 56035., 56037., 56039.])

In [44]:
# check null values [NOTE: all ok - no city or county assigned]
missing_county_null_df = missing_df.loc[missing_df['County_FIPS'].isnull()]
# missing_county_null_df.shape
missing_county_null_df

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS
14181,MP4569,5/5/1974,Colonna Aponte,John,12,,,Puerto Rico,Male,"White / Caucasian, Hispanic / Latino",5/15/2018,72,,
14182,MP54374,12/15/1984,Cruz,Marcos,2,,,Puerto Rico,Male,Black / African American,3/12/2020,72,,
14183,MP4568,5/4/1974,Colonna Aponte,Giannina,11,,,Puerto Rico,Female,"White / Caucasian, Hispanic / Latino",4/25/2018,72,,


In [45]:
# As needed, export nulls to address
# missing_county_null_df.to_csv('missing_county_nulls.csv', encoding='Windows-1252')

In [46]:
# check missing_df
missing_df.head()

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS
0,MP2341,5/8/1999,Reynolds,James,28,Opp,Autauga,Alabama,Male,Black / African American,2/14/2020,1,1_Autauga,1001.0
1,MP8522,1/15/2010,Walker,Jeremy,32,Prattville,Autauga,Alabama,Male,White / Caucasian,7/27/2015,1,1_Autauga,1001.0
2,MP50314,5/16/2018,Keszthelyi,Steve,79,Elberta,Baldwin,Alabama,Male,White / Caucasian,11/13/2019,1,1_Baldwin,1003.0
3,MP43187,2/20/2018,johnson,abram,60,fairhope,Baldwin,Alabama,Male,White / Caucasian,2/13/2020,1,1_Baldwin,1003.0
4,MP23019,11/5/2012,Shroyer,James,40,Fairhope,Baldwin,Alabama,Male,White / Caucasian,2/3/2016,1,1_Baldwin,1003.0


##### Replace null state values with 'None' and null county values with 'None', then re-write county_FIPS column using county_centroids_v2_dataframe

In [47]:
## Check if any rows with null value for 'State'
mis_state_test_df = missing_df.loc[missing_df['State'].isnull()]
# len(mis_state_test_df)
mis_state_test_df

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS


In [48]:
## Check if any rows with null value for 'City' and County'
mis_county_test_df = missing_df.loc[missing_df['County'].isnull()]
# len(mis_county_test_df)
mis_county_test_df

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS
14181,MP4569,5/5/1974,Colonna Aponte,John,12,,,Puerto Rico,Male,"White / Caucasian, Hispanic / Latino",5/15/2018,72,,
14182,MP54374,12/15/1984,Cruz,Marcos,2,,,Puerto Rico,Male,Black / African American,3/12/2020,72,,
14183,MP4568,5/4/1974,Colonna Aponte,Giannina,11,,,Puerto Rico,Female,"White / Caucasian, Hispanic / Latino",4/25/2018,72,,


In [49]:
# # change NaN County name (for cases with no city or county) to 'None'
missing_df['County'] = missing_df['County'].fillna('None')
missing_df.head()

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS
0,MP2341,5/8/1999,Reynolds,James,28,Opp,Autauga,Alabama,Male,Black / African American,2/14/2020,1,1_Autauga,1001.0
1,MP8522,1/15/2010,Walker,Jeremy,32,Prattville,Autauga,Alabama,Male,White / Caucasian,7/27/2015,1,1_Autauga,1001.0
2,MP50314,5/16/2018,Keszthelyi,Steve,79,Elberta,Baldwin,Alabama,Male,White / Caucasian,11/13/2019,1,1_Baldwin,1003.0
3,MP43187,2/20/2018,johnson,abram,60,fairhope,Baldwin,Alabama,Male,White / Caucasian,2/13/2020,1,1_Baldwin,1003.0
4,MP23019,11/5/2012,Shroyer,James,40,Fairhope,Baldwin,Alabama,Male,White / Caucasian,2/3/2016,1,1_Baldwin,1003.0


In [50]:
## Double check re-assigned values
mis_county_test_2_df = missing_df.loc[missing_df['County']=='None']
mis_county_test_2_df
# len(mis_county_test_2_df)

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS
14181,MP4569,5/5/1974,Colonna Aponte,John,12,,,Puerto Rico,Male,"White / Caucasian, Hispanic / Latino",5/15/2018,72,,
14182,MP54374,12/15/1984,Cruz,Marcos,2,,,Puerto Rico,Male,Black / African American,3/12/2020,72,,
14183,MP4568,5/4/1974,Colonna Aponte,Giannina,11,,,Puerto Rico,Female,"White / Caucasian, Hispanic / Latino",4/25/2018,72,,


In [51]:
# Re-Add column with compound field key
missing_df['State_County'] = missing_df['State_FIPS'].astype(str) + "_" + missing_df['County']
missing_df.head()

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS
0,MP2341,5/8/1999,Reynolds,James,28,Opp,Autauga,Alabama,Male,Black / African American,2/14/2020,1,1_Autauga,1001.0
1,MP8522,1/15/2010,Walker,Jeremy,32,Prattville,Autauga,Alabama,Male,White / Caucasian,7/27/2015,1,1_Autauga,1001.0
2,MP50314,5/16/2018,Keszthelyi,Steve,79,Elberta,Baldwin,Alabama,Male,White / Caucasian,11/13/2019,1,1_Baldwin,1003.0
3,MP43187,2/20/2018,johnson,abram,60,fairhope,Baldwin,Alabama,Male,White / Caucasian,2/13/2020,1,1_Baldwin,1003.0
4,MP23019,11/5/2012,Shroyer,James,40,Fairhope,Baldwin,Alabama,Male,White / Caucasian,2/3/2016,1,1_Baldwin,1003.0


In [52]:
## Double check re-assigned values
mis_county_test_3_df = missing_df.loc[missing_df['County']=='None']
mis_county_test_3_df
# len(mis_county_test_3_df)

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS
14181,MP4569,5/5/1974,Colonna Aponte,John,12,,,Puerto Rico,Male,"White / Caucasian, Hispanic / Latino",5/15/2018,72,72_None,
14182,MP54374,12/15/1984,Cruz,Marcos,2,,,Puerto Rico,Male,Black / African American,3/12/2020,72,72_None,
14183,MP4568,5/4/1974,Colonna Aponte,Giannina,11,,,Puerto Rico,Female,"White / Caucasian, Hispanic / Latino",4/25/2018,72,72_None,


In [53]:
# Re-add County FIPS column to dataframe, using the State_County name field as a key in the county_v2_dict, to pull the correct County FIPS code for each row
missing_df['County_FIPS'] = missing_df['State_County'].map(county_v2_dict)
# Check unique values in new dataframe field
missing_df['County_FIPS'].unique()

array([ 1001,  1003,  1005, ..., 56035, 56037, 56039], dtype=int64)

In [54]:
# check null values to make sure none are left [Note - all gone!]
missing_county_null_v2_df = missing_df.loc[missing_df['County_FIPS'].isnull()]
# missing_county_null_v2_df.shape
missing_county_null_v2_df

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS


In [55]:
## Double check re-assigned values
mis_county_test_4_df = missing_df.loc[missing_df['County']=='None']
mis_county_test_4_df
# len(mis_county_test_4_df)

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS
14181,MP4569,5/5/1974,Colonna Aponte,John,12,,,Puerto Rico,Male,"White / Caucasian, Hispanic / Latino",5/15/2018,72,72_None,72999
14182,MP54374,12/15/1984,Cruz,Marcos,2,,,Puerto Rico,Male,Black / African American,3/12/2020,72,72_None,72999
14183,MP4568,5/4/1974,Colonna Aponte,Giannina,11,,,Puerto Rico,Female,"White / Caucasian, Hispanic / Latino",4/25/2018,72,72_None,72999


#### Add in County FIPS column to unclaimed dataframe

##### To identify null values, add in County FIPS column to unclaimed dataframe using county_centroids_df

In [56]:
# check unclaimed df
unclaimed_df.head()

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS
0,UCP68231,4/15/2020,Sanders,Stephen,Male,White / Caucasian,Joliet,Will,Illinois,4/16/2020,17
1,UCP68248,4/13/2020,Brookshire,Renee,Female,White / Caucasian,Joliet,Will,Illinois,4/16/2020,17
2,UCP68242,4/9/2020,Smith,Victoria,Female,White / Caucasian,Manhattan,New York,New York,4/16/2020,36
3,UCP68244,4/9/2020,Bellamy,Eldred,Male,Black / African American,Brooklyn,Kings,New York,4/16/2020,36
4,UCP68228,4/9/2020,Rodriguez,Elido,Male,Hispanic / Latino,Manhattan,New York,New York,4/15/2020,36


In [57]:
# Add column with compound field key
unclaimed_df['State_County'] = unclaimed_df['State_FIPS'].astype(str) + "_" + unclaimed_df['County']
unclaimed_df.head()

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS,State_County
0,UCP68231,4/15/2020,Sanders,Stephen,Male,White / Caucasian,Joliet,Will,Illinois,4/16/2020,17,17_Will
1,UCP68248,4/13/2020,Brookshire,Renee,Female,White / Caucasian,Joliet,Will,Illinois,4/16/2020,17,17_Will
2,UCP68242,4/9/2020,Smith,Victoria,Female,White / Caucasian,Manhattan,New York,New York,4/16/2020,36,36_New York
3,UCP68244,4/9/2020,Bellamy,Eldred,Male,Black / African American,Brooklyn,Kings,New York,4/16/2020,36,36_Kings
4,UCP68228,4/9/2020,Rodriguez,Elido,Male,Hispanic / Latino,Manhattan,New York,New York,4/15/2020,36,36_New York


In [58]:
# Add new column to dataframe, using the state name field as a key in the state_dict, to pull the correct FIPS code for each row
unclaimed_df['County_FIPS'] = unclaimed_df['State_County'].map(county_dict)
# Check unique values in new dataframe field
unclaimed_df['County_FIPS'].unique()

array([17197., 36061., 36047., 36005., 36081., 25025.,  6073., 17063.,
       36085., 53053., 15003., 32031., 25021., 25027., 16001., 25009.,
       25017., 25003., 25013., 25023., 32027., 25015.,  6047., 48201.,
       49035., 47093., 53075., 53027., 11001., 25005., 47157., 33005.,
       35031., 53049., 53077., 40143., 42029., 47155., 47001., 39099.,
       29019.,  6075.,  5143.,  6093., 13121.,  1073., 34025., 34023.,
       25001.,  4019.,  4013.,  6023.,  6099., 34001., 12087., 47035.,
        6037., 40121., 53061., 55025.,  6071.,  9003., 48141., 48339.,
        9009., 42101., 12071., 53033., 17043., 34021.,  9011., 48215.,
        8087., 12051., 26163.,  9001., 47009., 17031., 22069.,  9005.,
       32005.,  9007., 47065., 34013.,    nan,  4007., 47013., 18067.,
       42001.,  6059., 19155.,  6083., 26025., 40101., 40041., 42091.,
       55057., 48167., 19153.,  6025., 26121., 40145., 25011.,  9015.,
        1115.,  5035., 53035., 53073., 41051., 16027., 23005., 47179.,
      

In [59]:
# check null values
unclaimed_county_null_df = unclaimed_df.loc[unclaimed_df['County_FIPS'].isnull()]
# unclaimed_county_null_df.shape
unclaimed_county_null_df

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS,State_County,County_FIPS
1996,UCP5438,4/30/2018,Flores,Victor,Male,Black / African American,,,New York,5/14/2019,36,,
2011,UCP5440,4/22/2018,Browne,Edward,Male,Hispanic / Latino,,,New York,5/27/2019,36,,
2028,UCP5297,4/15/2018,Davis,Thelma,Female,Black / African American,,,New York,5/20/2019,36,,
2031,UCP5296,4/14/2018,Fasulo,Timothy,Male,White / Caucasian,,,New York,4/25/2019,36,,
2033,UCP5439,4/14/2018,Reyes,Nivia,Female,Other,,,New York,5/27/2019,36,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7286,UCP990,11/6/1997,Pope,Leon,Male,White / Caucasian,,,Washington,4/14/2020,53,,
7339,UCP1999,10/22/1991,Robles,Salvador,Male,Other,,,Tennessee,1/10/2020,47,,
7344,UCP4522,10/18/1990,Way,Kenneth,Male,Black / African American,,,New York,5/11/2019,36,,
7372,UCP1548,6/17/1973,Seah,Tee Kee,Male,Asian,,,Nevada,5/28/2019,32,,


In [60]:
# As needed, export nulls to address [NOTE: all have been addressed. 1241 cases have no county]
# unclaimed_county_null_df.to_csv('unclaimed_county_nulls.csv', encoding='Windows-1252')

In [61]:
# check unclaimed_df
unclaimed_df.head()

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS,State_County,County_FIPS
0,UCP68231,4/15/2020,Sanders,Stephen,Male,White / Caucasian,Joliet,Will,Illinois,4/16/2020,17,17_Will,17197.0
1,UCP68248,4/13/2020,Brookshire,Renee,Female,White / Caucasian,Joliet,Will,Illinois,4/16/2020,17,17_Will,17197.0
2,UCP68242,4/9/2020,Smith,Victoria,Female,White / Caucasian,Manhattan,New York,New York,4/16/2020,36,36_New York,36061.0
3,UCP68244,4/9/2020,Bellamy,Eldred,Male,Black / African American,Brooklyn,Kings,New York,4/16/2020,36,36_Kings,36047.0
4,UCP68228,4/9/2020,Rodriguez,Elido,Male,Hispanic / Latino,Manhattan,New York,New York,4/15/2020,36,36_New York,36061.0


##### Replace null state values with 'None' and null county values with 'None', then re-write county_FIPS column using county_centroids_v2_dataframe

In [62]:
## Check if any rows with null value for 'State'
unc_state_test_df = unclaimed_df.loc[unclaimed_df['State'].isnull()]
# len(unc_state_test_df)
unc_state_test_df

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS,State_County,County_FIPS
3380,UCP5339,12/3/2016,Williams,Barbara,Female,Asian,,,,5/3/2018,99,,
4774,UCP2489,3/26/2015,Lupien,Daryl,Male,,,,,8/18/2016,99,,
5023,UCP4001,11/2/2014,Brown,David,Male,White / Caucasian,,,,5/11/2017,99,,
5883,UCP3962,3/22/2010,Gonzalez,David,Male,Hispanic / Latino,,,,5/11/2017,99,,
5950,UCP3937,12/7/2009,Togovnick,Bernice,Female,White / Caucasian,,,,5/11/2017,99,,
6183,UCP3901,1/21/2009,Carney,John,Male,White / Caucasian,,,,5/11/2017,99,,
6341,UCP3689,6/5/2008,Guzman,Ralph,Male,Hispanic / Latino,,,,5/2/2017,99,,
6560,UCP3824,7/2/2007,Vasquez,Rafael,Male,Hispanic / Latino,,,,5/4/2017,99,,
6698,UCP3672,9/22/2006,Barajas,Epifanio,Male,Hispanic / Latino,,,,5/2/2017,99,,
6719,UCP3668,8/5/2006,Huff,Paul,Male,White / Caucasian,,,,5/2/2017,99,,


In [63]:
# # change NaN State name (for cases with no city or county or state) to 'None'
unclaimed_df['State'] = unclaimed_df['State'].fillna('None')
unclaimed_df.head()

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS,State_County,County_FIPS
0,UCP68231,4/15/2020,Sanders,Stephen,Male,White / Caucasian,Joliet,Will,Illinois,4/16/2020,17,17_Will,17197.0
1,UCP68248,4/13/2020,Brookshire,Renee,Female,White / Caucasian,Joliet,Will,Illinois,4/16/2020,17,17_Will,17197.0
2,UCP68242,4/9/2020,Smith,Victoria,Female,White / Caucasian,Manhattan,New York,New York,4/16/2020,36,36_New York,36061.0
3,UCP68244,4/9/2020,Bellamy,Eldred,Male,Black / African American,Brooklyn,Kings,New York,4/16/2020,36,36_Kings,36047.0
4,UCP68228,4/9/2020,Rodriguez,Elido,Male,Hispanic / Latino,Manhattan,New York,New York,4/15/2020,36,36_New York,36061.0


In [64]:
## Double check re-assigned values
unc_state_test_2_df = unclaimed_df.loc[unclaimed_df['State']=='None']
unc_state_test_2_df
# len(unc_state_test_2_df)

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS,State_County,County_FIPS
3380,UCP5339,12/3/2016,Williams,Barbara,Female,Asian,,,,5/3/2018,99,,
4774,UCP2489,3/26/2015,Lupien,Daryl,Male,,,,,8/18/2016,99,,
5023,UCP4001,11/2/2014,Brown,David,Male,White / Caucasian,,,,5/11/2017,99,,
5883,UCP3962,3/22/2010,Gonzalez,David,Male,Hispanic / Latino,,,,5/11/2017,99,,
5950,UCP3937,12/7/2009,Togovnick,Bernice,Female,White / Caucasian,,,,5/11/2017,99,,
6183,UCP3901,1/21/2009,Carney,John,Male,White / Caucasian,,,,5/11/2017,99,,
6341,UCP3689,6/5/2008,Guzman,Ralph,Male,Hispanic / Latino,,,,5/2/2017,99,,
6560,UCP3824,7/2/2007,Vasquez,Rafael,Male,Hispanic / Latino,,,,5/4/2017,99,,
6698,UCP3672,9/22/2006,Barajas,Epifanio,Male,Hispanic / Latino,,,,5/2/2017,99,,
6719,UCP3668,8/5/2006,Huff,Paul,Male,White / Caucasian,,,,5/2/2017,99,,


In [65]:
## Check if any rows with null value for 'City' and County'
unc_county_test_df = unclaimed_df.loc[unclaimed_df['County'].isnull()]
# len(unc_county_test_df)
unc_county_test_df

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS,State_County,County_FIPS
1996,UCP5438,4/30/2018,Flores,Victor,Male,Black / African American,,,New York,5/14/2019,36,,
2011,UCP5440,4/22/2018,Browne,Edward,Male,Hispanic / Latino,,,New York,5/27/2019,36,,
2028,UCP5297,4/15/2018,Davis,Thelma,Female,Black / African American,,,New York,5/20/2019,36,,
2031,UCP5296,4/14/2018,Fasulo,Timothy,Male,White / Caucasian,,,New York,4/25/2019,36,,
2033,UCP5439,4/14/2018,Reyes,Nivia,Female,Other,,,New York,5/27/2019,36,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7286,UCP990,11/6/1997,Pope,Leon,Male,White / Caucasian,,,Washington,4/14/2020,53,,
7339,UCP1999,10/22/1991,Robles,Salvador,Male,Other,,,Tennessee,1/10/2020,47,,
7344,UCP4522,10/18/1990,Way,Kenneth,Male,Black / African American,,,New York,5/11/2019,36,,
7372,UCP1548,6/17/1973,Seah,Tee Kee,Male,Asian,,,Nevada,5/28/2019,32,,


In [66]:
# # change NaN County name (for cases with no city or county) to 'None'
unclaimed_df['County'] = unclaimed_df['County'].fillna('None')
unclaimed_df.head()

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS,State_County,County_FIPS
0,UCP68231,4/15/2020,Sanders,Stephen,Male,White / Caucasian,Joliet,Will,Illinois,4/16/2020,17,17_Will,17197.0
1,UCP68248,4/13/2020,Brookshire,Renee,Female,White / Caucasian,Joliet,Will,Illinois,4/16/2020,17,17_Will,17197.0
2,UCP68242,4/9/2020,Smith,Victoria,Female,White / Caucasian,Manhattan,New York,New York,4/16/2020,36,36_New York,36061.0
3,UCP68244,4/9/2020,Bellamy,Eldred,Male,Black / African American,Brooklyn,Kings,New York,4/16/2020,36,36_Kings,36047.0
4,UCP68228,4/9/2020,Rodriguez,Elido,Male,Hispanic / Latino,Manhattan,New York,New York,4/15/2020,36,36_New York,36061.0


In [67]:
## Double check re-assigned values
unc_county_test_2_df = unclaimed_df.loc[unclaimed_df['County']=='None']
unc_county_test_2_df
# len(unc_county_test_2_df)

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS,State_County,County_FIPS
1996,UCP5438,4/30/2018,Flores,Victor,Male,Black / African American,,,New York,5/14/2019,36,,
2011,UCP5440,4/22/2018,Browne,Edward,Male,Hispanic / Latino,,,New York,5/27/2019,36,,
2028,UCP5297,4/15/2018,Davis,Thelma,Female,Black / African American,,,New York,5/20/2019,36,,
2031,UCP5296,4/14/2018,Fasulo,Timothy,Male,White / Caucasian,,,New York,4/25/2019,36,,
2033,UCP5439,4/14/2018,Reyes,Nivia,Female,Other,,,New York,5/27/2019,36,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7286,UCP990,11/6/1997,Pope,Leon,Male,White / Caucasian,,,Washington,4/14/2020,53,,
7339,UCP1999,10/22/1991,Robles,Salvador,Male,Other,,,Tennessee,1/10/2020,47,,
7344,UCP4522,10/18/1990,Way,Kenneth,Male,Black / African American,,,New York,5/11/2019,36,,
7372,UCP1548,6/17/1973,Seah,Tee Kee,Male,Asian,,,Nevada,5/28/2019,32,,


In [68]:
# Re-Add column with compound field key
unclaimed_df['State_County'] = unclaimed_df['State_FIPS'].astype(str) + "_" + unclaimed_df['County']
unclaimed_df.head()

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS,State_County,County_FIPS
0,UCP68231,4/15/2020,Sanders,Stephen,Male,White / Caucasian,Joliet,Will,Illinois,4/16/2020,17,17_Will,17197.0
1,UCP68248,4/13/2020,Brookshire,Renee,Female,White / Caucasian,Joliet,Will,Illinois,4/16/2020,17,17_Will,17197.0
2,UCP68242,4/9/2020,Smith,Victoria,Female,White / Caucasian,Manhattan,New York,New York,4/16/2020,36,36_New York,36061.0
3,UCP68244,4/9/2020,Bellamy,Eldred,Male,Black / African American,Brooklyn,Kings,New York,4/16/2020,36,36_Kings,36047.0
4,UCP68228,4/9/2020,Rodriguez,Elido,Male,Hispanic / Latino,Manhattan,New York,New York,4/15/2020,36,36_New York,36061.0


In [69]:
## Double check re-assigned values
unc_county_test_3_df = unclaimed_df.loc[unclaimed_df['County']=='None']
unc_county_test_3_df
# len(unc_county_test_3_df)

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS,State_County,County_FIPS
1996,UCP5438,4/30/2018,Flores,Victor,Male,Black / African American,,,New York,5/14/2019,36,36_None,
2011,UCP5440,4/22/2018,Browne,Edward,Male,Hispanic / Latino,,,New York,5/27/2019,36,36_None,
2028,UCP5297,4/15/2018,Davis,Thelma,Female,Black / African American,,,New York,5/20/2019,36,36_None,
2031,UCP5296,4/14/2018,Fasulo,Timothy,Male,White / Caucasian,,,New York,4/25/2019,36,36_None,
2033,UCP5439,4/14/2018,Reyes,Nivia,Female,Other,,,New York,5/27/2019,36,36_None,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7286,UCP990,11/6/1997,Pope,Leon,Male,White / Caucasian,,,Washington,4/14/2020,53,53_None,
7339,UCP1999,10/22/1991,Robles,Salvador,Male,Other,,,Tennessee,1/10/2020,47,47_None,
7344,UCP4522,10/18/1990,Way,Kenneth,Male,Black / African American,,,New York,5/11/2019,36,36_None,
7372,UCP1548,6/17/1973,Seah,Tee Kee,Male,Asian,,,Nevada,5/28/2019,32,32_None,


In [70]:
# Re-add County FIPS column to dataframe, using the State_County name field as a key in the county_v2_dict, to pull the correct County FIPS code for each row
unclaimed_df['County_FIPS'] = unclaimed_df['State_County'].map(county_v2_dict)
# Check unique values in new dataframe field
unclaimed_df['County_FIPS'].unique()

array([17197, 36061, 36047, 36005, 36081, 25025,  6073, 17063, 36085,
       53053, 15003, 32031, 25021, 25027, 16001, 25009, 25017, 25003,
       25013, 25023, 32027, 25015,  6047, 48201, 49035, 47093, 53075,
       53027, 11001, 25005, 47157, 33005, 35031, 53049, 53077, 40143,
       42029, 47155, 47001, 39099, 29019,  6075,  5143,  6093, 13121,
        1073, 34025, 34023, 25001,  4019,  4013,  6023,  6099, 34001,
       12087, 47035,  6037, 40121, 53061, 55025,  6071,  9003, 48141,
       48339,  9009, 42101, 12071, 53033, 17043, 34021,  9011, 48215,
        8087, 12051, 26163,  9001, 47009, 17031, 22069,  9005, 32005,
        9007, 47065, 34013, 36999,  4007, 47013, 18067, 42001,  6059,
       19155,  6083, 26025, 40101, 40041, 42091, 55057, 17999, 48167,
       19153,  6025, 40999, 42999, 26121, 40145, 25011,  9015,  1115,
        5035, 53035, 53073, 41051, 16027, 23005, 47179, 48469, 40135,
       99999, 34029, 40097, 16013, 29051, 48229, 40115, 45007, 12075,
       35059, 53001,

In [71]:
# check null values to make sure none are left [Note - all gone!]
unclaimed_county_null_v2_df = unclaimed_df.loc[unclaimed_df['County_FIPS'].isnull()]
# unclaimed_county_null_v2_df.shape
unclaimed_county_null_v2_df

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS,State_County,County_FIPS


In [72]:
## Double check re-assigned values
unc_county_test_4_df = unclaimed_df.loc[unclaimed_df['County']=='None']
unc_county_test_4_df
# len(unc_county_test_4_df)

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS,State_County,County_FIPS
1996,UCP5438,4/30/2018,Flores,Victor,Male,Black / African American,,,New York,5/14/2019,36,36_None,36999
2011,UCP5440,4/22/2018,Browne,Edward,Male,Hispanic / Latino,,,New York,5/27/2019,36,36_None,36999
2028,UCP5297,4/15/2018,Davis,Thelma,Female,Black / African American,,,New York,5/20/2019,36,36_None,36999
2031,UCP5296,4/14/2018,Fasulo,Timothy,Male,White / Caucasian,,,New York,4/25/2019,36,36_None,36999
2033,UCP5439,4/14/2018,Reyes,Nivia,Female,Other,,,New York,5/27/2019,36,36_None,36999
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7286,UCP990,11/6/1997,Pope,Leon,Male,White / Caucasian,,,Washington,4/14/2020,53,53_None,53999
7339,UCP1999,10/22/1991,Robles,Salvador,Male,Other,,,Tennessee,1/10/2020,47,47_None,47999
7344,UCP4522,10/18/1990,Way,Kenneth,Male,Black / African American,,,New York,5/11/2019,36,36_None,36999
7372,UCP1548,6/17/1973,Seah,Tee Kee,Male,Asian,,,Nevada,5/28/2019,32,32_None,32999


#### Add in County FIPS column to unidentified dataframe

##### To identify null values, add in County FIPS column to unidentified dataframe using county_centroids_df

In [73]:
# check unidentified df
unidentified_df.head()

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS
0,UP51903,3/30/1986,,,Marbury,Autauga,Alabama,Female,White / Caucasian,1/28/2020,1
1,UP5000,9/29/2007,40.0,60.0,Prattville,Autauga,Alabama,Female,Black / African American,8/12/2019,1
2,UP6519,1/6/2006,45.0,59.0,Elberta,Baldwin,Alabama,Male,White / Caucasian,10/24/2018,1
3,UP55409,3/26/2000,,,,Baldwin,Alabama,Male,"White / Caucasian, Asian, American Indian / Al...",2/13/2020,1
4,UP13483,3/27/1961,14.0,17.0,,Bibb,Alabama,Male,White / Caucasian,3/25/2019,1


In [74]:
# Add column with compound field key
unidentified_df['State_County'] = unidentified_df['State_FIPS'].astype(str) + "_" + unidentified_df['County']
unidentified_df.head()

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County
0,UP51903,3/30/1986,,,Marbury,Autauga,Alabama,Female,White / Caucasian,1/28/2020,1,1_Autauga
1,UP5000,9/29/2007,40.0,60.0,Prattville,Autauga,Alabama,Female,Black / African American,8/12/2019,1,1_Autauga
2,UP6519,1/6/2006,45.0,59.0,Elberta,Baldwin,Alabama,Male,White / Caucasian,10/24/2018,1,1_Baldwin
3,UP55409,3/26/2000,,,,Baldwin,Alabama,Male,"White / Caucasian, Asian, American Indian / Al...",2/13/2020,1,1_Baldwin
4,UP13483,3/27/1961,14.0,17.0,,Bibb,Alabama,Male,White / Caucasian,3/25/2019,1,1_Bibb


In [75]:
# Add new column to dataframe, using the state name field as a key in the state_dict, to pull the correct FIPS code for each row
unidentified_df['County_FIPS'] = unidentified_df['State_County'].map(county_dict)
# Check unique values in new dataframe field
unidentified_df['County_FIPS'].unique()

array([ 1001.,  1003.,  1007., ..., 56021., 56033., 56037.])

In [76]:
# check null values
unidentified_county_null_df = unidentified_df.loc[unidentified_df['County_FIPS'].isnull()]
# unidentified_county_null_df.shape
# len(unidentified_county_null_df)
unidentified_county_null_df

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS
55,UP14028,12/21/1991,18.0,25.0,,,Alabama,Male,White / Caucasian,3/31/2017,1,,
109,UP13395,9/20/2004,,,,,Alaska,Male,Uncertain,4/14/2020,2,,
110,UP7281,4/30/2001,20.0,30.0,,,Alaska,Male,White / Caucasian,4/14/2020,2,,
111,UP13575,9/11/1997,,,,,Alaska,Male,Uncertain,7/2/2018,2,,
6392,UP4846,4/14/1988,37.0,52.0,,,Indiana,Female,White / Caucasian,6/3/2017,18,,
7676,UP5018,4/19/1996,49.0,59.0,,,Missouri,Female,Black / African American,10/23/2018,29,,
8536,UP17664,1/29/2018,,,,,New Mexico,Male,Uncertain,1/21/2020,35,,
8540,UP16690,5/3/2017,,,,,New Mexico,Unsure,Uncertain,2/8/2020,35,,
8541,UP5578,3/26/2003,50.0,60.0,,,New Mexico,Female,Black / African American,10/24/2018,35,,
10966,UP56112,10/10/2017,,,,,Puerto Rico,Male,Uncertain,2/10/2020,72,,


In [77]:
# As needed, export nulls to address [NOTE - all have been addressed - 28 remain with no city or county]
# unidentified_county_null_df.to_csv('unidentified_county_nulls.csv', encoding='Windows-1252')

In [78]:
# check unclaimed_df
unidentified_df.head()

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS
0,UP51903,3/30/1986,,,Marbury,Autauga,Alabama,Female,White / Caucasian,1/28/2020,1,1_Autauga,1001.0
1,UP5000,9/29/2007,40.0,60.0,Prattville,Autauga,Alabama,Female,Black / African American,8/12/2019,1,1_Autauga,1001.0
2,UP6519,1/6/2006,45.0,59.0,Elberta,Baldwin,Alabama,Male,White / Caucasian,10/24/2018,1,1_Baldwin,1003.0
3,UP55409,3/26/2000,,,,Baldwin,Alabama,Male,"White / Caucasian, Asian, American Indian / Al...",2/13/2020,1,1_Baldwin,1003.0
4,UP13483,3/27/1961,14.0,17.0,,Bibb,Alabama,Male,White / Caucasian,3/25/2019,1,1_Bibb,1007.0


##### Replace null state values with 'None' and null county values with 'None', then re-write county_FIPS column using county_centroids_v2_dataframe

In [79]:
## Check if any rows with null value for 'State'
uni_state_test_df = unidentified_df.loc[unidentified_df['State'].isnull()]
# len(uni_state_test_df)
uni_state_test_df

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS


In [80]:
## Check if any rows with null value for 'City' and County'
uni_county_test_df = unidentified_df.loc[unidentified_df['County'].isnull()]
# len(uni_county_test_df)
uni_county_test_df

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS
55,UP14028,12/21/1991,18.0,25.0,,,Alabama,Male,White / Caucasian,3/31/2017,1,,
109,UP13395,9/20/2004,,,,,Alaska,Male,Uncertain,4/14/2020,2,,
110,UP7281,4/30/2001,20.0,30.0,,,Alaska,Male,White / Caucasian,4/14/2020,2,,
111,UP13575,9/11/1997,,,,,Alaska,Male,Uncertain,7/2/2018,2,,
6392,UP4846,4/14/1988,37.0,52.0,,,Indiana,Female,White / Caucasian,6/3/2017,18,,
7676,UP5018,4/19/1996,49.0,59.0,,,Missouri,Female,Black / African American,10/23/2018,29,,
8536,UP17664,1/29/2018,,,,,New Mexico,Male,Uncertain,1/21/2020,35,,
8540,UP16690,5/3/2017,,,,,New Mexico,Unsure,Uncertain,2/8/2020,35,,
8541,UP5578,3/26/2003,50.0,60.0,,,New Mexico,Female,Black / African American,10/24/2018,35,,
10966,UP56112,10/10/2017,,,,,Puerto Rico,Male,Uncertain,2/10/2020,72,,


In [81]:
# # change NaN County name (for cases with no city or county) to 'None'
unidentified_df['County'] = unidentified_df['County'].fillna('None')
unidentified_df.head()

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS
0,UP51903,3/30/1986,,,Marbury,Autauga,Alabama,Female,White / Caucasian,1/28/2020,1,1_Autauga,1001.0
1,UP5000,9/29/2007,40.0,60.0,Prattville,Autauga,Alabama,Female,Black / African American,8/12/2019,1,1_Autauga,1001.0
2,UP6519,1/6/2006,45.0,59.0,Elberta,Baldwin,Alabama,Male,White / Caucasian,10/24/2018,1,1_Baldwin,1003.0
3,UP55409,3/26/2000,,,,Baldwin,Alabama,Male,"White / Caucasian, Asian, American Indian / Al...",2/13/2020,1,1_Baldwin,1003.0
4,UP13483,3/27/1961,14.0,17.0,,Bibb,Alabama,Male,White / Caucasian,3/25/2019,1,1_Bibb,1007.0


In [82]:
## Double check re-assigned values
uni_county_test_2_df = unidentified_df.loc[unidentified_df['County']=='None']
uni_county_test_2_df
# len(uni_county_test_2_df)

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS
55,UP14028,12/21/1991,18.0,25.0,,,Alabama,Male,White / Caucasian,3/31/2017,1,,
109,UP13395,9/20/2004,,,,,Alaska,Male,Uncertain,4/14/2020,2,,
110,UP7281,4/30/2001,20.0,30.0,,,Alaska,Male,White / Caucasian,4/14/2020,2,,
111,UP13575,9/11/1997,,,,,Alaska,Male,Uncertain,7/2/2018,2,,
6392,UP4846,4/14/1988,37.0,52.0,,,Indiana,Female,White / Caucasian,6/3/2017,18,,
7676,UP5018,4/19/1996,49.0,59.0,,,Missouri,Female,Black / African American,10/23/2018,29,,
8536,UP17664,1/29/2018,,,,,New Mexico,Male,Uncertain,1/21/2020,35,,
8540,UP16690,5/3/2017,,,,,New Mexico,Unsure,Uncertain,2/8/2020,35,,
8541,UP5578,3/26/2003,50.0,60.0,,,New Mexico,Female,Black / African American,10/24/2018,35,,
10966,UP56112,10/10/2017,,,,,Puerto Rico,Male,Uncertain,2/10/2020,72,,


In [83]:
# Re-Add column with compound field key
unidentified_df['State_County'] = unidentified_df['State_FIPS'].astype(str) + "_" + unidentified_df['County']
unidentified_df.head()

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS
0,UP51903,3/30/1986,,,Marbury,Autauga,Alabama,Female,White / Caucasian,1/28/2020,1,1_Autauga,1001.0
1,UP5000,9/29/2007,40.0,60.0,Prattville,Autauga,Alabama,Female,Black / African American,8/12/2019,1,1_Autauga,1001.0
2,UP6519,1/6/2006,45.0,59.0,Elberta,Baldwin,Alabama,Male,White / Caucasian,10/24/2018,1,1_Baldwin,1003.0
3,UP55409,3/26/2000,,,,Baldwin,Alabama,Male,"White / Caucasian, Asian, American Indian / Al...",2/13/2020,1,1_Baldwin,1003.0
4,UP13483,3/27/1961,14.0,17.0,,Bibb,Alabama,Male,White / Caucasian,3/25/2019,1,1_Bibb,1007.0


In [84]:
## Double check re-assigned values
uni_county_test_3_df = unidentified_df.loc[unidentified_df['County']=='None']
uni_county_test_3_df
# len(uni_county_test_3_df)

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS
55,UP14028,12/21/1991,18.0,25.0,,,Alabama,Male,White / Caucasian,3/31/2017,1,1_None,
109,UP13395,9/20/2004,,,,,Alaska,Male,Uncertain,4/14/2020,2,2_None,
110,UP7281,4/30/2001,20.0,30.0,,,Alaska,Male,White / Caucasian,4/14/2020,2,2_None,
111,UP13575,9/11/1997,,,,,Alaska,Male,Uncertain,7/2/2018,2,2_None,
6392,UP4846,4/14/1988,37.0,52.0,,,Indiana,Female,White / Caucasian,6/3/2017,18,18_None,
7676,UP5018,4/19/1996,49.0,59.0,,,Missouri,Female,Black / African American,10/23/2018,29,29_None,
8536,UP17664,1/29/2018,,,,,New Mexico,Male,Uncertain,1/21/2020,35,35_None,
8540,UP16690,5/3/2017,,,,,New Mexico,Unsure,Uncertain,2/8/2020,35,35_None,
8541,UP5578,3/26/2003,50.0,60.0,,,New Mexico,Female,Black / African American,10/24/2018,35,35_None,
10966,UP56112,10/10/2017,,,,,Puerto Rico,Male,Uncertain,2/10/2020,72,72_None,


In [85]:
# Re-add County FIPS column to dataframe, using the State_County name field as a key in the county_v2_dict, to pull the correct County FIPS code for each row
unidentified_df['County_FIPS'] = unidentified_df['State_County'].map(county_v2_dict)
# Check unique values in new dataframe field
unidentified_df['County_FIPS'].unique()

array([ 1001,  1003,  1007, ..., 56021, 56033, 56037], dtype=int64)

In [86]:
# check null values to make sure none are left [Note - all gone!]
unidentified_county_null_v2_df = unidentified_df.loc[unidentified_df['County_FIPS'].isnull()]
# unidentified_county_null_v2_df.shape
unidentified_county_null_v2_df

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS


In [87]:
## Double check re-assigned values
uni_county_test_4_df = unidentified_df.loc[unidentified_df['County']=='None']
uni_county_test_4_df
# len(uni_county_test_4_df)

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS
55,UP14028,12/21/1991,18.0,25.0,,,Alabama,Male,White / Caucasian,3/31/2017,1,1_None,1999
109,UP13395,9/20/2004,,,,,Alaska,Male,Uncertain,4/14/2020,2,2_None,2999
110,UP7281,4/30/2001,20.0,30.0,,,Alaska,Male,White / Caucasian,4/14/2020,2,2_None,2999
111,UP13575,9/11/1997,,,,,Alaska,Male,Uncertain,7/2/2018,2,2_None,2999
6392,UP4846,4/14/1988,37.0,52.0,,,Indiana,Female,White / Caucasian,6/3/2017,18,18_None,18999
7676,UP5018,4/19/1996,49.0,59.0,,,Missouri,Female,Black / African American,10/23/2018,29,29_None,29999
8536,UP17664,1/29/2018,,,,,New Mexico,Male,Uncertain,1/21/2020,35,35_None,35999
8540,UP16690,5/3/2017,,,,,New Mexico,Unsure,Uncertain,2/8/2020,35,35_None,35999
8541,UP5578,3/26/2003,50.0,60.0,,,New Mexico,Female,Black / African American,10/24/2018,35,35_None,35999
10966,UP56112,10/10/2017,,,,,Puerto Rico,Male,Uncertain,2/10/2020,72,72_None,72999


### Construct COUNTY-LEVEL GeoJSON with correct structure

Goal format:
{
"type": "Feature",
    "name": "Dane",
    "state": "Wisconsin",
    "state_FIPS": ,
    "county_FIPS": ,
    "properties": {
        "missing": [ ],
        "unclaimed": [ ],
        "unidentified": [ ],
        "filtered": [ ]
    }
    "geometry": {
          "type": "Point",
          "coordinates": [
            -117.79750667,
            36.03755926
          ]
}
* each array will be a list of dictionaries. Each dictionary = one case. Keys = headers

In [88]:
# get headers
missing_header = list(missing_df.columns.values)
print("missing header:", missing_header)
unclaimed_header = list(unclaimed_df.columns.values)
print("unclaimed header:", unclaimed_header)
unidentified_header = list(unidentified_df.columns.values)
print("unidentified header:", unidentified_header)

missing header: ['Case Number', 'DLC', 'Last Name', 'First Name', 'Missing Age', 'City', 'County', 'State', 'Sex', 'Race / Ethnicity', 'Date Modified', 'State_FIPS', 'State_County', 'County_FIPS']
unclaimed header: ['Case Number', 'DBF', 'Last Name', 'First Name', 'Sex', 'Race / Ethnicity', 'City', 'County', 'State', 'Date Modified', 'State_FIPS', 'State_County', 'County_FIPS']
unidentified header: ['Case Number', 'DBF', 'Age From', 'Age To', 'City', 'County', 'State', 'Sex', 'Race / Ethnicity', 'Date Modified', 'State_FIPS', 'State_County', 'County_FIPS']


In [89]:
# check df
unclaimed_df.head()

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS,State_County,County_FIPS
0,UCP68231,4/15/2020,Sanders,Stephen,Male,White / Caucasian,Joliet,Will,Illinois,4/16/2020,17,17_Will,17197
1,UCP68248,4/13/2020,Brookshire,Renee,Female,White / Caucasian,Joliet,Will,Illinois,4/16/2020,17,17_Will,17197
2,UCP68242,4/9/2020,Smith,Victoria,Female,White / Caucasian,Manhattan,New York,New York,4/16/2020,36,36_New York,36061
3,UCP68244,4/9/2020,Bellamy,Eldred,Male,Black / African American,Brooklyn,Kings,New York,4/16/2020,36,36_Kings,36047
4,UCP68228,4/9/2020,Rodriguez,Elido,Male,Hispanic / Latino,Manhattan,New York,New York,4/15/2020,36,36_New York,36061


In [90]:
# check data types
type(unclaimed_df['State_FIPS'][0])

numpy.int32

In [93]:
# check centroids
county_centroids_v2_df.head()
len(county_centroids_v2_df)

3283

In [94]:
# sort each database by county FIPS
# sort county centroids by county FIPS
county_centroids_v2_df = county_centroids_v2_df.sort_values(by=['GEOID'])
county_centroids_v2_df.head()

Unnamed: 0,OBJECTID,STATEFP,COUNTYFP,COUNTYNS,AFFGEOID,GEOID,NAME,LSAD,ALAND,AWATER,ORIG_FID,Lon_dd,Lat_dd,STATE_NAME,County_Key
16,17,1,1,161526,0500000US01001,1001,Autauga,6,1539602000.0,25706961,17,-86.642735,32.534929,Alabama,1_Autauga
2754,2755,1,3,161527,0500000US01003,1003,Baldwin,6,4117547000.0,1133055836,2759,-87.722569,30.727484,Alabama,1_Baldwin
17,18,1,5,161528,0500000US01005,1005,Barbour,6,2292145000.0,50538698,18,-85.393214,31.869587,Alabama,1_Barbour
2336,2337,1,7,161529,0500000US01007,1007,Bibb,6,1612167000.0,9602089,2340,-87.12648,32.998634,Alabama,1_Bibb
1020,1021,1,9,161530,0500000US01009,1009,Blount,6,1670104000.0,15015423,1021,-86.567385,33.980875,Alabama,1_Blount


In [95]:
# sort each database by county FIPS
# sort missing by county FIPS
missing_df = missing_df.sort_values(by=['County_FIPS'])
missing_df.head()

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS
0,MP2341,5/8/1999,Reynolds,James,28,Opp,Autauga,Alabama,Male,Black / African American,2/14/2020,1,1_Autauga,1001
1,MP8522,1/15/2010,Walker,Jeremy,32,Prattville,Autauga,Alabama,Male,White / Caucasian,7/27/2015,1,1_Autauga,1001
2,MP50314,5/16/2018,Keszthelyi,Steve,79,Elberta,Baldwin,Alabama,Male,White / Caucasian,11/13/2019,1,1_Baldwin,1003
3,MP43187,2/20/2018,johnson,abram,60,fairhope,Baldwin,Alabama,Male,White / Caucasian,2/13/2020,1,1_Baldwin,1003
4,MP23019,11/5/2012,Shroyer,James,40,Fairhope,Baldwin,Alabama,Male,White / Caucasian,2/3/2016,1,1_Baldwin,1003


In [96]:
# sort unclaimed by county FIPS
unclaimed_df = unclaimed_df.sort_values(by=['County_FIPS'])
unclaimed_df.head()

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS,State_County,County_FIPS
4956,UCP2755,12/26/2014,Knox,Barbara,Female,White / Caucasian,Birmingham,Jefferson,Alabama,1/11/2019,1,1_Jefferson,1073
2244,UCP54578,1/31/2018,Lovelady,Altha,Female,White / Caucasian,Bessemer,Jefferson,Alabama,3/18/2020,1,1_Jefferson,1073
1977,UCP54579,5/5/2018,Smith,Curtis,Male,White / Caucasian,Birmingham,Jefferson,Alabama,3/18/2020,1,1_Jefferson,1073
2239,UCP5131,2/1/2018,Martin,Michael,Male,White / Caucasian,Center Point,Jefferson,Alabama,1/11/2019,1,1_Jefferson,1073
4065,UCP2747,2/13/2016,Bragoz,John,Male,White / Caucasian,Birmingham,Jefferson,Alabama,1/11/2019,1,1_Jefferson,1073


In [97]:
# sort unidentified by county FIPS
unidentified_df = unidentified_df.sort_values(by=['County_FIPS'])
unidentified_df.head()

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS
0,UP51903,3/30/1986,,,Marbury,Autauga,Alabama,Female,White / Caucasian,1/28/2020,1,1_Autauga,1001
1,UP5000,9/29/2007,40.0,60.0,Prattville,Autauga,Alabama,Female,Black / African American,8/12/2019,1,1_Autauga,1001
2,UP6519,1/6/2006,45.0,59.0,Elberta,Baldwin,Alabama,Male,White / Caucasian,10/24/2018,1,1_Baldwin,1003
3,UP55409,3/26/2000,,,,Baldwin,Alabama,Male,"White / Caucasian, Asian, American Indian / Al...",2/13/2020,1,1_Baldwin,1003
4,UP13483,3/27/1961,14.0,17.0,,Bibb,Alabama,Male,White / Caucasian,3/25/2019,1,1_Bibb,1007


In [116]:
## Create county array 
county_array = []
## for each county in county_centroids_v2...
i = 0
while i < len(county_centroids_v2_df):
    county_dict = {}
    county_dict["type"] = "Feature"
    county_dict["name"] = str(county_centroids_v2_df["NAME"][i])
    county_dict["county_FIPS"] = str(county_centroids_v2_df["GEOID"][i])
    county_dict["state_name"] = str(county_centroids_v2_df["STATE_NAME"][i])
    county_dict["state_FIPS"] = str(county_centroids_v2_df["STATEFP"][i])
    county_dict["properties"] = {}
    missing_array = []
    j = 0
    while j < len(missing_df):
        # check if state_fips matches
        if missing_df['County_FIPS'][j] == county_centroids_v2_df["GEOID"][i]:
            missing_dict = {}
            for item in missing_header:
                missing_dict[item] = str(missing_df[item][j])
            # append dictionary to missing array
            missing_array.append(missing_dict)
        # increment j
        j += 1
    county_dict["properties"]['missing'] = missing_array     
    
    unclaimed_array = []
    k = 0
    while k < len(unclaimed_df):
        # check if state_fips matches
        if unclaimed_df['County_FIPS'][k] == county_centroids_v2_df["GEOID"][i]:
            unclaimed_dict = {}
            for item in unclaimed_header:
                unclaimed_dict[item] = str(unclaimed_df[item][k])
            # append dictionary to unclaimed array
            unclaimed_array.append(unclaimed_dict)
        # increment k
        k += 1
    county_dict["properties"]['unclaimed'] = unclaimed_array
    
    unidentified_array = []
    l = 0
    while l < len(unidentified_df):
        # check if state_fips matches
        if unidentified_df['County_FIPS'][l] == county_centroids_v2_df["GEOID"][i]:
            unidentified_dict = {}
            for item in unidentified_header:
                unidentified_dict[item] = str(unidentified_df[item][l])
            # append dictionary to unclaimed array
            unidentified_array.append(unidentified_dict)
        # increment l
        l += 1
    county_dict["properties"]['unidentified'] = unidentified_array
    
    county_dict["properties"]['filtered'] = []
    ## set geometry
    county_dict["geometry"] = {}
    county_dict["geometry"]["type"] = "Point"
    county_dict["geometry"]["coordinates"] = [county_centroids_v2_df['Lon_dd'][i], county_centroids_v2_df['Lat_dd'][i]]
    ## append county dictionary to array
    county_array.append(county_dict)
    # test statement
    print("added county", str(i), "of 3283")
    # increment interator
    i += 1
# county_array

added county 1000 of 3283
added county 1001 of 3283
added county 1002 of 3283
added county 1003 of 3283
added county 1004 of 3283
added county 1005 of 3283
added county 1006 of 3283
added county 1007 of 3283
added county 1008 of 3283
added county 1009 of 3283
added county 1010 of 3283
added county 1011 of 3283
added county 1012 of 3283
added county 1013 of 3283
added county 1014 of 3283
added county 1015 of 3283
added county 1016 of 3283
added county 1017 of 3283
added county 1018 of 3283
added county 1019 of 3283
added county 1020 of 3283
added county 1021 of 3283
added county 1022 of 3283
added county 1023 of 3283
added county 1024 of 3283
added county 1025 of 3283
added county 1026 of 3283
added county 1027 of 3283
added county 1028 of 3283
added county 1029 of 3283
added county 1030 of 3283
added county 1031 of 3283
added county 1032 of 3283
added county 1033 of 3283
added county 1034 of 3283
added county 1035 of 3283
added county 1036 of 3283
added county 1037 of 3283
added county

In [117]:
county_array

[{'type': 'Feature',
  'name': 'Anderson',
  'county_FIPS': '45007',
  'state_name': 'South Carolina',
  'state_FIPS': '45',
  'properties': {'missing': [{'Case Number': 'MP37734',
     'DLC': '10/2/2016',
     'Last Name': 'Garrett',
     'First Name': 'Charles',
     'Missing Age': '33',
     'City': 'Anderson',
     'County': 'Anderson',
     'State': 'South Carolina',
     'Sex': 'Male',
     'Race / Ethnicity': 'White / Caucasian',
     'Date Modified': '1/15/2020',
     'State_FIPS': '45',
     'State_County': '45_Anderson',
     'County_FIPS': '45007'},
    {'Case Number': 'MP29849',
     'DLC': '4/20/2015',
     'Last Name': 'Quarles',
     'First Name': 'Emmanuel',
     'Missing Age': '26',
     'City': 'Anderson',
     'County': 'Anderson',
     'State': 'South Carolina',
     'Sex': 'Male',
     'Race / Ethnicity': 'Black / African American',
     'Date Modified': '1/15/2020',
     'State_FIPS': '45',
     'State_County': '45_Anderson',
     'County_FIPS': '45007'},
    {'Ca

In [118]:
# check item in array
len(county_array)
# county_array[55]['properties']['unclaimed']

50

In [119]:
# Create FeatureCollection
county_feature_collection = {}
county_feature_collection["type"] = "FeatureCollection"
county_feature_collection["features"] = county_array
# county_feature_collection

In [120]:
# Convert FeatureCollection to JSON format
county_geojson = geojson.dumps(county_feature_collection)
# check type to make sure conversion was sucessful
print(type(county_geojson))

<class 'str'>


In [121]:
# Save JSON-formatted FeatureCollection as JSON file
# Save as new json file
with open('JSON/county_geojson.json', 'w', encoding='utf-8') as json_file:
    json_file.write(county_geojson)