TO DO:
1. ~Fix counties for unclaimed~
2. ~Fix counties for unidentified~
3. ~Add line to state_centroids with south pole coordinates and nonsense FIPS code~
4. ~Re-export state-level json (now that county fields have been updated within the databases and American Samoa has been removed)~
    * ~MAKE SURE TO MAP NAS to nonsense FIPS~
5. ~Add to county centroids:~
    * ~55 lines with south pole coordinates and nonsense county FIPS codes (state only_999)~
    * ~1 line with south pole coordinates and nonsense FIPS code (99)~
6. ~Format county data - state name, and then name and county FIPS code, to get GEOID~
7. ~Export county-level json~
    * ~any records with no county get pulled (to separate state FIPS with no county key and nonsense coordinates [south pole])~
    * ~make sure state name included as field with each database, not just FIPS code~
8. ~Re-export summary count JSON (as a few cases have been deleted)~
    * ~Address 21 NAs for Unclaimed?~
9. ~Clean null values for age and gender and sex and race / ethnicity~
10. Re-export state and county JSONS
11. See how bad city data would be
    * likely need to make all city names .lower

In [1]:
# import necessary packages
import pandas as pd
import geopandas as gpd
import numpy as np
import json
import geojson

Notes:
1. Edits to Missing_04182020.csv prior to import
  * Virgin Islands (to United States Virgin Islands), 
  * Tennesse (to Tennessee), and 
  * Northern Mariana Islands (to Commonwealth of the Northern Mariana Islands)
  * Address all county nulls
2. Edits to Unclaimed_04182020.csv prior to import
  * Address all county nulls
3. Edits to Unidentified_04182020.csv prior to import
  * Virgin Islands (to United States Virgin Islands)
  * Address all county nulls
4. Edits to state_centroids_v2 prior to import
  * Add one row with south pole coordinates and nonsense FIPS code(99) - for cases w/ no city, county, or state
5. Edits to county_centroids_v2 prior to import
  * Add one row with south pole coordinates and nonsense FIPS code(99) - for cases w/ no city, county, or state
  * Add 55 rows with south pole coordinates and nonsense county FIPS codes(999) - for cases w/ no city or county

In [2]:
# Load the cities table (path is relative to the working directory)
city_df = pd.read_csv(filepath_or_buffer='cities.csv')

In [3]:
# Load the state centroid table
state_centroids_df = pd.read_csv(filepath_or_buffer='state_centroids.csv')

In [4]:
# Alternate state centroids: this version carries the extra "None" row
# with the placeholder FIPS code 99
state_centroids_v2_df = pd.read_csv(filepath_or_buffer='state_centroids_v2.csv')

In [5]:
# Load the county centroid table; the file is decoded as Windows-1252 text
county_centroids_df = pd.read_csv(
    filepath_or_buffer='county_centroids.csv',
    encoding='Windows-1252',
)

In [6]:
# Alternate county centroids: this version carries the extra "None" rows
# with the placeholder county FIPS code 999
county_centroids_v2_df = pd.read_csv(
    filepath_or_buffer='county_centroids_v2.csv',
    encoding='Windows-1252',
)

In [7]:
# Load the missing persons export
missing_df = pd.read_csv(filepath_or_buffer='Missing_04182020.csv')

In [8]:
# Load the unclaimed persons export
unclaimed_df = pd.read_csv(filepath_or_buffer='Unclaimed_04182020.csv')

In [9]:
# Load the unidentified persons export
unidentified_df = pd.read_csv(filepath_or_buffer='Unidentified_04182020.csv')

In [10]:
# check dataframe: preview the first rows of the alternate county centroids
county_centroids_v2_df.head()

Unnamed: 0,OBJECTID,STATEFP,COUNTYFP,COUNTYNS,AFFGEOID,GEOID,NAME,LSAD,ALAND,AWATER,ORIG_FID,Lon_dd,Lat_dd
0,1,21,7,516850,0500000US21007,21007,Ballard,6,639387500.0,69473325,1,-88.999262,37.058489
1,2,21,17,516855,0500000US21017,21017,Bourbon,6,750439400.0,4829777,2,-84.217155,38.206742
2,3,21,31,516862,0500000US21031,21031,Butler,6,1103572000.0,13943044,3,-86.681628,37.207292
3,4,21,65,516879,0500000US21065,21065,Estill,6,655509900.0,6516335,4,-83.964316,37.692451
4,5,21,69,516881,0500000US21069,21069,Fleming,6,902727200.0,7182793,5,-83.69666,38.370126


# PART 1: Summary data (count for all 3 databases, by state)

### 1/4: Get count of missing person cases

In [None]:
# Per-state tally: for each state, count() reports the number of non-null
# entries in every other column
missing_count = missing_df.groupby(by='State').count()
len(missing_count)
missing_count.head()

In [None]:
# Keep only the case number count.
# The single wanted column is selected by name instead of dropping every
# other column from a hard-coded list, so an added, removed or renamed
# column in the CSV can neither raise a KeyError nor leak into the result.
missing_count = missing_count[['Case Number']].copy()

In [None]:
# groupby moved the state names into the index; copy them back into a
# regular 'State' column
missing_count = missing_count.assign(State=missing_count.index)

In [None]:
# Give the case number count a database-specific column name
missing_count = missing_count.rename({'Case Number': 'Missing_CaseCount'}, axis='columns')

In [None]:
# check dataframe: preview the first rows of the per-state missing counts
missing_count.head()

In [None]:
# Lookup table: state name -> number of missing person cases
missing_dict = {
    state: case_ct
    for state, case_ct in zip(missing_count['State'], missing_count['Missing_CaseCount'])
}
len(missing_dict)
# missing_dict

In [None]:
# Number of missing-database records that have no state assigned
# (counted via the non-null 'Case Number' entries of those rows)
mis_null_series = missing_df[missing_df['State'].isna()].count()
mis_null_ct = mis_null_series.loc['Case Number']
mis_null_ct

### 2/4: Get count of unclaimed persons

In [None]:
# Per-state tally: for each state, count() reports the number of non-null
# entries in every other column
unclaimed_count = unclaimed_df.groupby(by='State').count()
# len(unclaimed_count)
unclaimed_count.head()

In [None]:
# Keep only the case number count.
# The single wanted column is selected by name instead of dropping every
# other column from a hard-coded list, so an added, removed or renamed
# column in the CSV can neither raise a KeyError nor leak into the result.
unclaimed_count = unclaimed_count[['Case Number']].copy()

In [None]:
# groupby moved the state names into the index; copy them back into a
# regular 'State' column
unclaimed_count = unclaimed_count.assign(State=unclaimed_count.index)

In [None]:
# Give the case number count a database-specific column name
unclaimed_count = unclaimed_count.rename({'Case Number': 'Unclaimed_CaseCount'}, axis='columns')

In [None]:
# check dataframe: preview the first rows of the per-state unclaimed counts
unclaimed_count.head()

In [None]:
# Lookup table: state name -> number of unclaimed person cases
unclaimed_dict = {
    state: case_ct
    for state, case_ct in zip(unclaimed_count['State'], unclaimed_count['Unclaimed_CaseCount'])
}
len(unclaimed_dict)
# unclaimed_dict

In [None]:
# Number of unclaimed-database records that have no state assigned
# (counted via the non-null 'Case Number' entries of those rows)
unc_null_series = unclaimed_df[unclaimed_df['State'].isna()].count()
unc_null_ct = unc_null_series.loc['Case Number']
unc_null_ct

### 3/4: Get count of unidentified persons

In [None]:
# Per-state tally: for each state, count() reports the number of non-null
# entries in every other column
unidentified_count = unidentified_df.groupby(by='State').count()
# len(unidentified_count)
unidentified_count.head()

In [None]:
# Keep only the case number count.
# The single wanted column is selected by name instead of dropping every
# other column from a hard-coded list, so an added, removed or renamed
# column in the CSV can neither raise a KeyError nor leak into the result.
unidentified_count = unidentified_count[['Case Number']].copy()

In [None]:
# groupby moved the state names into the index; copy them back into a
# regular 'State' column
unidentified_count = unidentified_count.assign(State=unidentified_count.index)

In [None]:
# Give the case number count a database-specific column name
unidentified_count = unidentified_count.rename({'Case Number': 'Unidentified_CaseCount'}, axis='columns')

In [None]:
# check dataframe: preview the first rows of the per-state unidentified counts
unidentified_count.head()

In [None]:
# Lookup table: state name -> number of unidentified person cases
unidentified_dict = {
    state: case_ct
    for state, case_ct in zip(unidentified_count['State'], unidentified_count['Unidentified_CaseCount'])
}
len(unidentified_dict)
# unidentified_dict

In [None]:
# Number of unidentified-database records that have no state assigned
# (counted via the non-null 'Case Number' entries of those rows)
uni_null_series = unidentified_df[unidentified_df['State'].isna()].count()
uni_null_ct = uni_null_series.loc['Case Number']
uni_null_ct

### 4/4: Make summary dataframe

In [None]:
# Build the summary table from the alternate state centroids, ordered by
# state FIPS code (STATEFP). sort_values returns a new dataframe, so columns
# added to summary_df later do not modify state_centroids_v2_df.
summary_df = state_centroids_v2_df.sort_values(by='STATEFP')
summary_df.head()

In [None]:
# number of rows in the summary table
len(summary_df)

#### 4a: Add Missing Person count for each state

In [None]:
# Look up each state's missing case count by state name (NAME) in missing_dict;
# names that are not keys of the dict come back as NaN
summary_df = summary_df.assign(Missing_Count=summary_df['NAME'].map(missing_dict))
summary_df.head()

#### 4b: Add Unclaimed Person count for each state

In [None]:
# Look up each state's unclaimed case count by state name (NAME) in unclaimed_dict;
# names that are not keys of the dict come back as NaN
summary_df = summary_df.assign(Unclaimed_Count=summary_df['NAME'].map(unclaimed_dict))
summary_df.head()

In [None]:
# Rows of the summary that received no unclaimed count [NOTE: seems fine to have nulls]
unclaimed_null_df = summary_df[summary_df['Unclaimed_Count'].isna()]
unclaimed_null_df

In [None]:
# Replace the NaN Unclaimed count of the placeholder row (fake FIPS 99) with
# the number of unclaimed-database records that have no state assigned
index_Series = summary_df[summary_df['STATEFP'] == 99]
# index label of the first matching row; raises IndexError if no row has STATEFP 99
index_None = index_Series.index[0]
summary_df.loc[index_None, 'Unclaimed_Count'] = unc_null_ct
# check value
summary_df.loc[index_None, 'Unclaimed_Count']

#### 4b: Add Unidentified Person count for each state

In [None]:
# Look up each state's unidentified case count by state name (NAME) in unidentified_dict;
# names that are not keys of the dict come back as NaN
summary_df = summary_df.assign(Unidentified_Count=summary_df['NAME'].map(unidentified_dict))
summary_df.head()

In [None]:
# Rows of the summary that received no unidentified count [NOTE: seems fine to have nulls]
unidentified_null_df = summary_df[summary_df['Unidentified_Count'].isna()]
unidentified_null_df

#### 4c: Add column for total count of cases in all three databases for each state

In [None]:
# preview the summary table with the three count columns added
summary_df.head()

In [None]:
# show the third-from-last column, selected by position
summary_df.iloc[:, -3]

In [None]:
# Total cases per state = missing + unclaimed + unidentified (NaN counts are skipped by sum).
# The count columns are selected by name. The earlier positional slice
# iloc[:, -4:-1] depended on whether Total_Count already existed: on a first
# run it covered the column before Missing_Count and left out
# Unidentified_Count, which is why the cell had to be run twice with
# different offsets.
count_columns = ['Missing_Count', 'Unclaimed_Count', 'Unidentified_Count']
summary_df['Total_Count'] = summary_df[count_columns].sum(axis=1)
summary_df.head(56)

#### 4c: Convert to geodataframe and export as GeoJSON

In [None]:
# check final summary_df: preview the first rows before converting to a geodataframe
summary_df.head()

In [None]:
# Build one point per row from the Lon_dd / Lat_dd columns and attach it as
# the geometry of a new geodataframe
point_geometry = gpd.points_from_xy(x=summary_df['Lon_dd'], y=summary_df['Lat_dd'])
summary_gdf = gpd.GeoDataFrame(summary_df, geometry=point_geometry)
summary_gdf.head()

In [None]:
# Export the summary counts as GeoJSON (path is relative to the working directory)
summary_gdf.to_file(
    "JSON/summary_counts.json",
    driver="GeoJSON",
    encoding='utf-8',
)

# Part 2 - Prep data by replacing null attribute values [NOT CITY, COUNTY, OR STATE - Those are addressed later]

### Clean Missing dataframe

##### Check for null values, and replace as needed

In [11]:
# check dataframe: preview the first rows of the missing persons data
missing_df.head()

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified
0,MP2341,5/8/1999,Reynolds,James,28,Opp,Autauga,Alabama,Male,Black / African American,2/14/2020
1,MP8522,1/15/2010,Walker,Jeremy,32,Prattville,Autauga,Alabama,Male,White / Caucasian,7/27/2015
2,MP50314,5/16/2018,Keszthelyi,Steve,79,Elberta,Baldwin,Alabama,Male,White / Caucasian,11/13/2019
3,MP43187,2/20/2018,johnson,abram,60,fairhope,Baldwin,Alabama,Male,White / Caucasian,2/13/2020
4,MP23019,11/5/2012,Shroyer,James,40,Fairhope,Baldwin,Alabama,Male,White / Caucasian,2/3/2016


In [12]:
## Rows with no DLC value [NOTE: Leave as NaN]
missing_attr_df = missing_df[missing_df['DLC'].isna()]
# len(missing_attr_df)
missing_attr_df

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified
14160,MP56286,,DELAHAY,Glenora,,York,York,Pennsylvania,Female,White / Caucasian,4/8/2020
16310,MP54713,,Garcia,Ruben,,Austin,Travis,Texas,Male,Hispanic / Latino,4/3/2020


In [13]:
## Rows with no Missing Age value
missing_attr_1_df = missing_df[missing_df['Missing Age'].isna()]
# len(missing_attr_1_df)
missing_attr_1_df

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified
3293,MP35843,7/20/1990,Renteria,Martin,,Santa Ana,Orange,California,Male,Hispanic / Latino,11/21/2019
4756,MP59185,11/25/1970,Rindahl,Baby Boy,,Wheat Ridge,Jefferson,Colorado,Male,White / Caucasian,1/27/2020
5480,MP38301,8/13/2014,Weedon,Nehemiah,,Jacksonville,Duval,Florida,Male,Black / African American,4/13/2020
6208,MP38734,2/15/2016,Richardson,Baby,,Orlando,Orange,Florida,Female,Black / African American,4/9/2020
6883,MP566,7/7/1999,Murphy,Sarah,,Columbus,Muscogee,Georgia,Female,Black / African American,10/4/2011
7079,MP1284,4/21/2002,Martinez,Priscilla,,Honokowai,Maui,Hawaii,Female,"White / Caucasian, Hispanic / Latino",3/5/2020
7085,MP1282,7/29/2005,Amaral,Robert,,Lanai,Maui,Hawaii,Male,Other,3/5/2020
7826,MP17063,5/20/2003,Liendo,Fabian,,Wapello,Louisa,Iowa,Male,"White / Caucasian, Hispanic / Latino",10/4/2017
12607,MP14033,4/1/1965,Franks,Elizabeth,,Toledo,Lucas,Ohio,Female,White / Caucasian,6/1/2018
13678,MP20168,12/4/1974,Lloyd,Charles,,Banks,Washington,Oregon,Male,White / Caucasian,3/2/2020


In [14]:
# Replace null Missing Age entries with the placeholder text 'Unknown'
missing_df['Missing Age'] = missing_df['Missing Age'].fillna(value='Unknown')
missing_df.head()

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified
0,MP2341,5/8/1999,Reynolds,James,28,Opp,Autauga,Alabama,Male,Black / African American,2/14/2020
1,MP8522,1/15/2010,Walker,Jeremy,32,Prattville,Autauga,Alabama,Male,White / Caucasian,7/27/2015
2,MP50314,5/16/2018,Keszthelyi,Steve,79,Elberta,Baldwin,Alabama,Male,White / Caucasian,11/13/2019
3,MP43187,2/20/2018,johnson,abram,60,fairhope,Baldwin,Alabama,Male,White / Caucasian,2/13/2020
4,MP23019,11/5/2012,Shroyer,James,40,Fairhope,Baldwin,Alabama,Male,White / Caucasian,2/3/2016


In [15]:
## Double check: list every row whose Missing Age is now 'Unknown'
missing_age_test_df = missing_df[missing_df['Missing Age'].eq('Unknown')]
missing_age_test_df
# len(missing_age_test_df)

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified
3293,MP35843,7/20/1990,Renteria,Martin,Unknown,Santa Ana,Orange,California,Male,Hispanic / Latino,11/21/2019
4756,MP59185,11/25/1970,Rindahl,Baby Boy,Unknown,Wheat Ridge,Jefferson,Colorado,Male,White / Caucasian,1/27/2020
5480,MP38301,8/13/2014,Weedon,Nehemiah,Unknown,Jacksonville,Duval,Florida,Male,Black / African American,4/13/2020
6208,MP38734,2/15/2016,Richardson,Baby,Unknown,Orlando,Orange,Florida,Female,Black / African American,4/9/2020
6883,MP566,7/7/1999,Murphy,Sarah,Unknown,Columbus,Muscogee,Georgia,Female,Black / African American,10/4/2011
7079,MP1284,4/21/2002,Martinez,Priscilla,Unknown,Honokowai,Maui,Hawaii,Female,"White / Caucasian, Hispanic / Latino",3/5/2020
7085,MP1282,7/29/2005,Amaral,Robert,Unknown,Lanai,Maui,Hawaii,Male,Other,3/5/2020
7826,MP17063,5/20/2003,Liendo,Fabian,Unknown,Wapello,Louisa,Iowa,Male,"White / Caucasian, Hispanic / Latino",10/4/2017
12607,MP14033,4/1/1965,Franks,Elizabeth,Unknown,Toledo,Lucas,Ohio,Female,White / Caucasian,6/1/2018
13678,MP20168,12/4/1974,Lloyd,Charles,Unknown,Banks,Washington,Oregon,Male,White / Caucasian,3/2/2020


In [16]:
## Rows with no Race / Ethnicity value
missing_attr_2_df = missing_df[missing_df['Race / Ethnicity'].isna()]
# len(missing_attr_2_df)
missing_attr_2_df

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified
12167,MP55602,4/1/1955,Williams,James,27,Marion,McDowell,North Carolina,Male,,8/1/2019


In [17]:
# Replace null Race / Ethnicity entries with the placeholder text 'Uncertain'
missing_df['Race / Ethnicity'] = missing_df['Race / Ethnicity'].fillna(value='Uncertain')
missing_df.head()

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified
0,MP2341,5/8/1999,Reynolds,James,28,Opp,Autauga,Alabama,Male,Black / African American,2/14/2020
1,MP8522,1/15/2010,Walker,Jeremy,32,Prattville,Autauga,Alabama,Male,White / Caucasian,7/27/2015
2,MP50314,5/16/2018,Keszthelyi,Steve,79,Elberta,Baldwin,Alabama,Male,White / Caucasian,11/13/2019
3,MP43187,2/20/2018,johnson,abram,60,fairhope,Baldwin,Alabama,Male,White / Caucasian,2/13/2020
4,MP23019,11/5/2012,Shroyer,James,40,Fairhope,Baldwin,Alabama,Male,White / Caucasian,2/3/2016


In [19]:
## Double check: list every row whose Race / Ethnicity is 'Uncertain'.
## This matches any row holding that value, not only the nulls filled above.
missing_race_test_df = missing_df[missing_df['Race / Ethnicity'].eq('Uncertain')]
missing_race_test_df
# len(missing_race_test_df)

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified
252,MP55684,11/22/1979,Hoffman,Ronald,37,Dutch Harbor,Aleutians West,Alaska,Male,Uncertain,4/14/2020
259,MP34147,7/24/2004,Flores-Mata,Valeriano,32,St. Paul,Aleutians West,Alaska,Male,Uncertain,4/14/2020
306,MP57385,8/27/1985,Garcia,James,26,Anchorage,Anchorage,Alaska,Male,Uncertain,4/14/2020
307,MP43344,8/17/1981,Hawk,Robert,34,Anchorage,Anchorage,Alaska,Male,Uncertain,2/5/2020
414,MP4342,5/1/2004,Keys,Avi,19,Fairbanks,Fairbanks North Star,Alaska,Male,Uncertain,4/18/2018
462,MP66815,7/18/1981,Riddell,Randall,26,Juneau,Juneau,Alaska,Male,Uncertain,4/16/2020
463,MP67097,7/18/1981,Mourant,Rob,23,Juneau,Juneau,Alaska,Male,Uncertain,4/16/2020
465,MP66843,9/6/1967,Johnston,Kenneth,42,Juneau,Juneau,Alaska,Male,Uncertain,4/14/2020
539,MP66592,11/27/1979,Pipkin,Jay,17,Soldotna,Kenai Peninsula,Alaska,Male,Uncertain,4/14/2020
602,MP29019,5/28/1972,Torsen,Melvin,23,Kodiak,Kodiak Island,Alaska,Male,Uncertain,4/14/2020


##### Force all names to lowercase then capitalize


In [20]:
# check dataframe: preview the names before normalising their case
missing_df.head()

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified
0,MP2341,5/8/1999,Reynolds,James,28,Opp,Autauga,Alabama,Male,Black / African American,2/14/2020
1,MP8522,1/15/2010,Walker,Jeremy,32,Prattville,Autauga,Alabama,Male,White / Caucasian,7/27/2015
2,MP50314,5/16/2018,Keszthelyi,Steve,79,Elberta,Baldwin,Alabama,Male,White / Caucasian,11/13/2019
3,MP43187,2/20/2018,johnson,abram,60,fairhope,Baldwin,Alabama,Male,White / Caucasian,2/13/2020
4,MP23019,11/5/2012,Shroyer,James,40,Fairhope,Baldwin,Alabama,Male,White / Caucasian,2/3/2016


In [21]:
# Step 1 of 2 for last names: force everything to lowercase
missing_df = missing_df.assign(**{'Last Name': missing_df['Last Name'].str.lower()})
missing_df.head()

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified
0,MP2341,5/8/1999,reynolds,James,28,Opp,Autauga,Alabama,Male,Black / African American,2/14/2020
1,MP8522,1/15/2010,walker,Jeremy,32,Prattville,Autauga,Alabama,Male,White / Caucasian,7/27/2015
2,MP50314,5/16/2018,keszthelyi,Steve,79,Elberta,Baldwin,Alabama,Male,White / Caucasian,11/13/2019
3,MP43187,2/20/2018,johnson,abram,60,fairhope,Baldwin,Alabama,Male,White / Caucasian,2/13/2020
4,MP23019,11/5/2012,shroyer,James,40,Fairhope,Baldwin,Alabama,Male,White / Caucasian,2/3/2016


In [22]:
# Step 2 of 2 for last names: upper-case the first character only.
# NOTE(review): str.capitalize lower-cases every later character, so
# hyphenated or multi-part names keep just one capital letter.
missing_df = missing_df.assign(**{'Last Name': missing_df['Last Name'].str.capitalize()})
missing_df.head()

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified
0,MP2341,5/8/1999,Reynolds,James,28,Opp,Autauga,Alabama,Male,Black / African American,2/14/2020
1,MP8522,1/15/2010,Walker,Jeremy,32,Prattville,Autauga,Alabama,Male,White / Caucasian,7/27/2015
2,MP50314,5/16/2018,Keszthelyi,Steve,79,Elberta,Baldwin,Alabama,Male,White / Caucasian,11/13/2019
3,MP43187,2/20/2018,Johnson,abram,60,fairhope,Baldwin,Alabama,Male,White / Caucasian,2/13/2020
4,MP23019,11/5/2012,Shroyer,James,40,Fairhope,Baldwin,Alabama,Male,White / Caucasian,2/3/2016


In [23]:
# Step 1 of 2 for first names: force everything to lowercase
missing_df = missing_df.assign(**{'First Name': missing_df['First Name'].str.lower()})
missing_df.head()

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified
0,MP2341,5/8/1999,Reynolds,james,28,Opp,Autauga,Alabama,Male,Black / African American,2/14/2020
1,MP8522,1/15/2010,Walker,jeremy,32,Prattville,Autauga,Alabama,Male,White / Caucasian,7/27/2015
2,MP50314,5/16/2018,Keszthelyi,steve,79,Elberta,Baldwin,Alabama,Male,White / Caucasian,11/13/2019
3,MP43187,2/20/2018,Johnson,abram,60,fairhope,Baldwin,Alabama,Male,White / Caucasian,2/13/2020
4,MP23019,11/5/2012,Shroyer,james,40,Fairhope,Baldwin,Alabama,Male,White / Caucasian,2/3/2016


In [24]:
# Step 2 of 2 for first names: upper-case the first character only.
# NOTE(review): str.capitalize lower-cases every later character, so
# multi-part names keep just one capital letter.
missing_df = missing_df.assign(**{'First Name': missing_df['First Name'].str.capitalize()})
missing_df.head()

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified
0,MP2341,5/8/1999,Reynolds,James,28,Opp,Autauga,Alabama,Male,Black / African American,2/14/2020
1,MP8522,1/15/2010,Walker,Jeremy,32,Prattville,Autauga,Alabama,Male,White / Caucasian,7/27/2015
2,MP50314,5/16/2018,Keszthelyi,Steve,79,Elberta,Baldwin,Alabama,Male,White / Caucasian,11/13/2019
3,MP43187,2/20/2018,Johnson,Abram,60,fairhope,Baldwin,Alabama,Male,White / Caucasian,2/13/2020
4,MP23019,11/5/2012,Shroyer,James,40,Fairhope,Baldwin,Alabama,Male,White / Caucasian,2/3/2016


### Clean Unclaimed dataframe

##### Check for null values, and replace as needed

In [25]:
# check dataframe: preview the first rows of the unclaimed persons data
unclaimed_df.head()

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified
0,UCP68231,4/15/2020,Sanders,Stephen,Male,White / Caucasian,Joliet,Will,Illinois,4/16/2020
1,UCP68248,4/13/2020,Brookshire,Renee,Female,White / Caucasian,Joliet,Will,Illinois,4/16/2020
2,UCP68242,4/9/2020,Smith,Victoria,Female,White / Caucasian,Manhattan,New York,New York,4/16/2020
3,UCP68244,4/9/2020,Bellamy,Eldred,Male,Black / African American,Brooklyn,Kings,New York,4/16/2020
4,UCP68228,4/9/2020,Rodriguez,Elido,Male,Hispanic / Latino,Manhattan,New York,New York,4/15/2020


In [26]:
## Rows with no DBF value [NOTE: Leave as NaN]
unclaimed_attr_df = unclaimed_df[unclaimed_df['DBF'].isna()]
# len(unclaimed_attr_df)
unclaimed_attr_df

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified
7382,UCP66140,,Beverly,John,,,,Yakima,Washington,4/5/2020
7383,UCP66175,,Bradford,Mary,,,,Yakima,Washington,4/6/2020
7384,UCP66181,,Neal,Terry,,,,Yakima,Washington,4/6/2020
7385,UCP66087,,Dick,Donald,,,,Yakima,Washington,4/4/2020
7386,UCP66095,,Coyle,Edgar,,,,Yakima,Washington,4/4/2020
...,...,...,...,...,...,...,...,...,...,...
8330,UCP320,,Baskin,Clyde,Male,Black / African American,Spokane Valley,Spokane,Washington,4/14/2020
8331,UCP341,,Brotherton,Quilen,Male,,,Spokane,Washington,4/14/2020
8332,UCP326,,Bell,Phillip,Male,,,Spokane,Washington,4/14/2020
8333,UCP345,,Buonantony,Elena,Female,,,Spokane,Washington,4/14/2020


In [27]:
## Rows with no last name
unclaimed_attr_1_df = unclaimed_df[unclaimed_df['Last Name'].isna()]
# len(unclaimed_attr_1_df)
unclaimed_attr_1_df

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified
7911,UCP62756,,,,,,,San Bernardino,California,12/19/2019
8265,UCP62750,,,,,,,San Bernardino,California,12/19/2019


In [28]:
# Replace null last names with the placeholder text 'Unknown'
unclaimed_df['Last Name'] = unclaimed_df['Last Name'].fillna(value='Unknown')
unclaimed_df.head()

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified
0,UCP68231,4/15/2020,Sanders,Stephen,Male,White / Caucasian,Joliet,Will,Illinois,4/16/2020
1,UCP68248,4/13/2020,Brookshire,Renee,Female,White / Caucasian,Joliet,Will,Illinois,4/16/2020
2,UCP68242,4/9/2020,Smith,Victoria,Female,White / Caucasian,Manhattan,New York,New York,4/16/2020
3,UCP68244,4/9/2020,Bellamy,Eldred,Male,Black / African American,Brooklyn,Kings,New York,4/16/2020
4,UCP68228,4/9/2020,Rodriguez,Elido,Male,Hispanic / Latino,Manhattan,New York,New York,4/15/2020


In [29]:
## Double check: list every row whose last name is now 'Unknown'
unclaimed_lname_test_df = unclaimed_df[unclaimed_df['Last Name'].eq('Unknown')]
unclaimed_lname_test_df
# len(unclaimed_lname_test_df)

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified
7911,UCP62756,,Unknown,,,,,San Bernardino,California,12/19/2019
8265,UCP62750,,Unknown,,,,,San Bernardino,California,12/19/2019


In [30]:
## Rows with no first name
unclaimed_attr_2_df = unclaimed_df[unclaimed_df['First Name'].isna()]
# len(unclaimed_attr_2_df)
unclaimed_attr_2_df

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified
7911,UCP62756,,Unknown,,,,,San Bernardino,California,12/19/2019
8168,UCP66063,,Goldstein,,,,,Yakima,Washington,3/26/2020
8265,UCP62750,,Unknown,,,,,San Bernardino,California,12/19/2019


In [31]:
# Replace null first names with the placeholder text 'Unknown'
unclaimed_df['First Name'] = unclaimed_df['First Name'].fillna(value='Unknown')
unclaimed_df.head()

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified
0,UCP68231,4/15/2020,Sanders,Stephen,Male,White / Caucasian,Joliet,Will,Illinois,4/16/2020
1,UCP68248,4/13/2020,Brookshire,Renee,Female,White / Caucasian,Joliet,Will,Illinois,4/16/2020
2,UCP68242,4/9/2020,Smith,Victoria,Female,White / Caucasian,Manhattan,New York,New York,4/16/2020
3,UCP68244,4/9/2020,Bellamy,Eldred,Male,Black / African American,Brooklyn,Kings,New York,4/16/2020
4,UCP68228,4/9/2020,Rodriguez,Elido,Male,Hispanic / Latino,Manhattan,New York,New York,4/15/2020


In [32]:
## Double check: list every row whose first name is now 'Unknown'
unclaimed_fname_test_df = unclaimed_df[unclaimed_df['First Name'].eq('Unknown')]
unclaimed_fname_test_df
# len(unclaimed_fname_test_df)

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified
7911,UCP62756,,Unknown,Unknown,,,,San Bernardino,California,12/19/2019
8168,UCP66063,,Goldstein,Unknown,,,,Yakima,Washington,3/26/2020
8265,UCP62750,,Unknown,Unknown,,,,San Bernardino,California,12/19/2019


In [33]:
## Rows with no sex recorded
unclaimed_attr_3_df = unclaimed_df[unclaimed_df['Sex'].isna()]
# len(unclaimed_attr_3_df)
unclaimed_attr_3_df

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified
4134,UCP63503,1/22/2016,Vanderbeck,Robert,,,,San Bernardino,California,12/19/2019
4160,UCP63509,1/10/2016,Johnson,James,,,,San Bernardino,California,12/19/2019
4178,UCP63504,1/3/2016,McKenzie,Rexford,,,,San Bernardino,California,12/19/2019
4181,UCP63505,12/31/2015,Schafer,Julia,,,,San Bernardino,California,12/19/2019
4185,UCP63497,12/29/2015,Waldren,Hayward,,,,San Bernardino,California,12/19/2019
...,...,...,...,...,...,...,...,...,...,...
8288,UCP62761,,Sepulveda,Raymond,,,,San Bernardino,California,12/19/2019
8289,UCP62762,,Cruz,Sergio,,,,San Bernardino,California,12/19/2019
8290,UCP62738,,Hartman,William,,,,San Bernardino,California,12/19/2019
8314,UCP1027,,Miah,Mashuk,,,,Wayne,Michigan,3/19/2020


In [34]:
# Replace null Sex entries with the placeholder text 'Unknown'
unclaimed_df['Sex'] = unclaimed_df['Sex'].fillna(value='Unknown')
unclaimed_df.head()

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified
0,UCP68231,4/15/2020,Sanders,Stephen,Male,White / Caucasian,Joliet,Will,Illinois,4/16/2020
1,UCP68248,4/13/2020,Brookshire,Renee,Female,White / Caucasian,Joliet,Will,Illinois,4/16/2020
2,UCP68242,4/9/2020,Smith,Victoria,Female,White / Caucasian,Manhattan,New York,New York,4/16/2020
3,UCP68244,4/9/2020,Bellamy,Eldred,Male,Black / African American,Brooklyn,Kings,New York,4/16/2020
4,UCP68228,4/9/2020,Rodriguez,Elido,Male,Hispanic / Latino,Manhattan,New York,New York,4/15/2020


In [35]:
## Double check: list every row whose Sex is now 'Unknown'
unclaimed_sex_test_df = unclaimed_df[unclaimed_df['Sex'].eq('Unknown')]
unclaimed_sex_test_df
# len(unclaimed_sex_test_df)

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified
4134,UCP63503,1/22/2016,Vanderbeck,Robert,Unknown,,,San Bernardino,California,12/19/2019
4160,UCP63509,1/10/2016,Johnson,James,Unknown,,,San Bernardino,California,12/19/2019
4178,UCP63504,1/3/2016,McKenzie,Rexford,Unknown,,,San Bernardino,California,12/19/2019
4181,UCP63505,12/31/2015,Schafer,Julia,Unknown,,,San Bernardino,California,12/19/2019
4185,UCP63497,12/29/2015,Waldren,Hayward,Unknown,,,San Bernardino,California,12/19/2019
...,...,...,...,...,...,...,...,...,...,...
8288,UCP62761,,Sepulveda,Raymond,Unknown,,,San Bernardino,California,12/19/2019
8289,UCP62762,,Cruz,Sergio,Unknown,,,San Bernardino,California,12/19/2019
8290,UCP62738,,Hartman,William,Unknown,,,San Bernardino,California,12/19/2019
8314,UCP1027,,Miah,Mashuk,Unknown,,,Wayne,Michigan,3/19/2020


In [36]:
## Rows with no race / ethnicity recorded
unclaimed_attr_4_df = unclaimed_df[unclaimed_df['Race / Ethnicity'].isna()]
# len(unclaimed_attr_4_df)
unclaimed_attr_4_df

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified
2182,UCP5385,2/18/2018,Tharpe,Ronald,Male,,lakewood,Pierce,Washington,5/5/2018
2240,UCP5361,2/1/2018,Weigand,Michael,Male,,Tacoma,Pierce,Washington,5/5/2018
2241,UCP5384,2/1/2018,Weigand,Michael,Male,,Tacoma,Pierce,Washington,5/5/2018
2245,UCP5398,1/31/2018,Campbell,Mollie,Female,,Puyallup,Pierce,Washington,5/8/2018
2268,UCP5363,1/22/2018,Claxton,Sueann,Female,,Tacoma,Pierce,Washington,5/5/2018
...,...,...,...,...,...,...,...,...,...,...
8329,UCP1040,,Snell,Gerald,Male,,,Wayne,Michigan,3/19/2020
8331,UCP341,,Brotherton,Quilen,Male,,,Spokane,Washington,4/14/2020
8332,UCP326,,Bell,Phillip,Male,,,Spokane,Washington,4/14/2020
8333,UCP345,,Buonantony,Elena,Female,,,Spokane,Washington,4/14/2020


In [37]:
# Replace null Race / Ethnicity entries with the placeholder text 'Uncertain'
unclaimed_df['Race / Ethnicity'] = unclaimed_df['Race / Ethnicity'].fillna(value='Uncertain')
unclaimed_df.head()

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified
0,UCP68231,4/15/2020,Sanders,Stephen,Male,White / Caucasian,Joliet,Will,Illinois,4/16/2020
1,UCP68248,4/13/2020,Brookshire,Renee,Female,White / Caucasian,Joliet,Will,Illinois,4/16/2020
2,UCP68242,4/9/2020,Smith,Victoria,Female,White / Caucasian,Manhattan,New York,New York,4/16/2020
3,UCP68244,4/9/2020,Bellamy,Eldred,Male,Black / African American,Brooklyn,Kings,New York,4/16/2020
4,UCP68228,4/9/2020,Rodriguez,Elido,Male,Hispanic / Latino,Manhattan,New York,New York,4/15/2020


In [39]:
## Double check: list every row whose Race / Ethnicity is 'Uncertain'.
## This matches any row holding that value, not only the nulls filled above.
unclaimed_race_test_df = unclaimed_df[unclaimed_df['Race / Ethnicity'].eq('Uncertain')]
unclaimed_race_test_df
# len(unclaimed_race_test_df)

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified
66,UCP68060,3/21/2020,Moren,Carmen,Female,Uncertain,Bronx,Bronx,New York,4/11/2020
80,UCP67791,3/18/2020,Bekeova,Ilona,Female,Uncertain,Manhattan,New York,New York,4/3/2020
98,UCP68150,3/12/2020,Bell,Thomas,Male,Uncertain,Puyallup,Pierce,Washington,4/13/2020
99,UCP67863,3/11/2020,Olave,Rigoberto,Male,Uncertain,Queens,Queens,New York,4/5/2020
108,UCP68044,3/9/2020,Alarcon,Jose,Male,Uncertain,Bronx,Bronx,New York,4/10/2020
...,...,...,...,...,...,...,...,...,...,...
8329,UCP1040,,Snell,Gerald,Male,Uncertain,,Wayne,Michigan,3/19/2020
8331,UCP341,,Brotherton,Quilen,Male,Uncertain,,Spokane,Washington,4/14/2020
8332,UCP326,,Bell,Phillip,Male,Uncertain,,Spokane,Washington,4/14/2020
8333,UCP345,,Buonantony,Elena,Female,Uncertain,,Spokane,Washington,4/14/2020


##### Force all names to lowercase, then capitalize

In [40]:
# check dataframe: preview the names before normalising their case
unclaimed_df.head()

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified
0,UCP68231,4/15/2020,Sanders,Stephen,Male,White / Caucasian,Joliet,Will,Illinois,4/16/2020
1,UCP68248,4/13/2020,Brookshire,Renee,Female,White / Caucasian,Joliet,Will,Illinois,4/16/2020
2,UCP68242,4/9/2020,Smith,Victoria,Female,White / Caucasian,Manhattan,New York,New York,4/16/2020
3,UCP68244,4/9/2020,Bellamy,Eldred,Male,Black / African American,Brooklyn,Kings,New York,4/16/2020
4,UCP68228,4/9/2020,Rodriguez,Elido,Male,Hispanic / Latino,Manhattan,New York,New York,4/15/2020


In [41]:
# Step 1 of 2 for last names: force everything to lowercase
unclaimed_df = unclaimed_df.assign(**{'Last Name': unclaimed_df['Last Name'].str.lower()})
unclaimed_df.head()

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified
0,UCP68231,4/15/2020,sanders,Stephen,Male,White / Caucasian,Joliet,Will,Illinois,4/16/2020
1,UCP68248,4/13/2020,brookshire,Renee,Female,White / Caucasian,Joliet,Will,Illinois,4/16/2020
2,UCP68242,4/9/2020,smith,Victoria,Female,White / Caucasian,Manhattan,New York,New York,4/16/2020
3,UCP68244,4/9/2020,bellamy,Eldred,Male,Black / African American,Brooklyn,Kings,New York,4/16/2020
4,UCP68228,4/9/2020,rodriguez,Elido,Male,Hispanic / Latino,Manhattan,New York,New York,4/15/2020


In [42]:
# Step 2 of 2 for last names: upper-case the first character only.
# NOTE(review): str.capitalize lower-cases every later character, so a
# name such as "McKenzie" ends up as "Mckenzie".
unclaimed_df = unclaimed_df.assign(**{'Last Name': unclaimed_df['Last Name'].str.capitalize()})
unclaimed_df.head()

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified
0,UCP68231,4/15/2020,Sanders,Stephen,Male,White / Caucasian,Joliet,Will,Illinois,4/16/2020
1,UCP68248,4/13/2020,Brookshire,Renee,Female,White / Caucasian,Joliet,Will,Illinois,4/16/2020
2,UCP68242,4/9/2020,Smith,Victoria,Female,White / Caucasian,Manhattan,New York,New York,4/16/2020
3,UCP68244,4/9/2020,Bellamy,Eldred,Male,Black / African American,Brooklyn,Kings,New York,4/16/2020
4,UCP68228,4/9/2020,Rodriguez,Elido,Male,Hispanic / Latino,Manhattan,New York,New York,4/15/2020


In [43]:
# Step 1 of 2 for 'First Name': force every value to lowercase so all rows start from the same case
first_name_lower = unclaimed_df['First Name'].str.lower()
unclaimed_df['First Name'] = first_name_lower
unclaimed_df.head(n=5)

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified
0,UCP68231,4/15/2020,Sanders,stephen,Male,White / Caucasian,Joliet,Will,Illinois,4/16/2020
1,UCP68248,4/13/2020,Brookshire,renee,Female,White / Caucasian,Joliet,Will,Illinois,4/16/2020
2,UCP68242,4/9/2020,Smith,victoria,Female,White / Caucasian,Manhattan,New York,New York,4/16/2020
3,UCP68244,4/9/2020,Bellamy,eldred,Male,Black / African American,Brooklyn,Kings,New York,4/16/2020
4,UCP68228,4/9/2020,Rodriguez,elido,Male,Hispanic / Latino,Manhattan,New York,New York,4/15/2020


In [44]:
# Step 2 of 2 for 'First Name': upper-case the first letter of every word in the name.
# str.title() is used rather than str.capitalize() because capitalize() only upper-cases the
# first character of the whole string, so compound first names would come out as
# e.g. 'Mary ann' / 'Jean-paul' instead of 'Mary Ann' / 'Jean-Paul'. Null names stay null.
unclaimed_df['First Name'] = unclaimed_df['First Name'].str.title()
unclaimed_df.head()

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified
0,UCP68231,4/15/2020,Sanders,Stephen,Male,White / Caucasian,Joliet,Will,Illinois,4/16/2020
1,UCP68248,4/13/2020,Brookshire,Renee,Female,White / Caucasian,Joliet,Will,Illinois,4/16/2020
2,UCP68242,4/9/2020,Smith,Victoria,Female,White / Caucasian,Manhattan,New York,New York,4/16/2020
3,UCP68244,4/9/2020,Bellamy,Eldred,Male,Black / African American,Brooklyn,Kings,New York,4/16/2020
4,UCP68228,4/9/2020,Rodriguez,Elido,Male,Hispanic / Latino,Manhattan,New York,New York,4/15/2020


### Clean Unidentified dataframe

##### Check for null values, and replace as needed

In [45]:
# Preview the first five rows of the unidentified dataframe before handling null values
unidentified_df.head(n=5)

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified
0,UP51903,3/30/1986,,,Marbury,Autauga,Alabama,Female,White / Caucasian,1/28/2020
1,UP5000,9/29/2007,40.0,60.0,Prattville,Autauga,Alabama,Female,Black / African American,8/12/2019
2,UP6519,1/6/2006,45.0,59.0,Elberta,Baldwin,Alabama,Male,White / Caucasian,10/24/2018
3,UP55409,3/26/2000,,,,Baldwin,Alabama,Male,"White / Caucasian, Asian, American Indian / Al...",2/13/2020
4,UP13483,3/27/1961,14.0,17.0,,Bibb,Alabama,Male,White / Caucasian,3/25/2019


In [46]:
## List the rows that have a null DBF value [NOTE: Leave as NaN]
unidentified_attr_df = unidentified_df[unidentified_df['DBF'].isna()]
# use len(unidentified_attr_df) to get only the row count
unidentified_attr_df

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified
1621,UP54136,,,,,Pima,Arizona,,Uncertain,1/16/2020
3718,UP17837,,65.0,70.0,,Orange,California,Male,White / Caucasian,11/1/2018


In [47]:
## List the rows that have a null 'Age From' value
unidentified_attr_1_df = unidentified_df[unidentified_df['Age From'].isna()]
# use len(unidentified_attr_1_df) to get only the row count
unidentified_attr_1_df

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified
0,UP51903,3/30/1986,,,Marbury,Autauga,Alabama,Female,White / Caucasian,1/28/2020
3,UP55409,3/26/2000,,,,Baldwin,Alabama,Male,"White / Caucasian, Asian, American Indian / Al...",2/13/2020
34,UP51245,11/29/2017,,,Huntsville,Madison,Alabama,Male,Uncertain,12/27/2019
35,UP11867,12/8/2013,,,Huntsville,Madison,Alabama,Male,White / Caucasian,11/9/2018
36,UP51243,11/29/2017,,,Huntsville,Madison,Alabama,Female,Uncertain,12/27/2019
...,...,...,...,...,...,...,...,...,...,...
13206,UP52045,6/14/2018,,,Cyclone,Wyoming,West Virginia,Male,Uncertain,1/15/2020
13216,UP13695,11/9/1989,,,Paris,Kenosha,Wisconsin,Male,White / Caucasian,4/11/2017
13240,UP11611,6/5/2008,,,Baraboo,Sauk,Wisconsin,Male,Uncertain,11/22/2013
13248,UP7631,7/10/1973,,,Waukesha,Waukesha,Wisconsin,Male,White / Caucasian,11/9/2013


In [48]:
# Fill every missing 'Age From' value with 0
age_from_filled = unidentified_df['Age From'].fillna(value=0)
unidentified_df['Age From'] = age_from_filled
unidentified_df.head(n=5)

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified
0,UP51903,3/30/1986,0.0,,Marbury,Autauga,Alabama,Female,White / Caucasian,1/28/2020
1,UP5000,9/29/2007,40.0,60.0,Prattville,Autauga,Alabama,Female,Black / African American,8/12/2019
2,UP6519,1/6/2006,45.0,59.0,Elberta,Baldwin,Alabama,Male,White / Caucasian,10/24/2018
3,UP55409,3/26/2000,0.0,,,Baldwin,Alabama,Male,"White / Caucasian, Asian, American Indian / Al...",2/13/2020
4,UP13483,3/27/1961,14.0,17.0,,Bibb,Alabama,Male,White / Caucasian,3/25/2019


In [49]:
## Double check re-assigned values: show every row whose 'Age From' is now 0
## NOTE: this filter also returns rows whose 'Age From' was already 0 before the fill
unidentified_agefrom_test_df = unidentified_df[unidentified_df['Age From'].eq(0)]
unidentified_agefrom_test_df
# use len(unidentified_agefrom_test_df) to get only the row count

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified
0,UP51903,3/30/1986,0.0,,Marbury,Autauga,Alabama,Female,White / Caucasian,1/28/2020
3,UP55409,3/26/2000,0.0,,,Baldwin,Alabama,Male,"White / Caucasian, Asian, American Indian / Al...",2/13/2020
34,UP51245,11/29/2017,0.0,,Huntsville,Madison,Alabama,Male,Uncertain,12/27/2019
35,UP11867,12/8/2013,0.0,,Huntsville,Madison,Alabama,Male,White / Caucasian,11/9/2018
36,UP51243,11/29/2017,0.0,,Huntsville,Madison,Alabama,Female,Uncertain,12/27/2019
...,...,...,...,...,...,...,...,...,...,...
13236,UP13617,11/14/2014,0.0,1.0,West Allis,Milwaukee,Wisconsin,Male,Uncertain,4/20/2016
13240,UP11611,6/5/2008,0.0,,Baraboo,Sauk,Wisconsin,Male,Uncertain,11/22/2013
13248,UP7631,7/10/1973,0.0,,Waukesha,Waukesha,Wisconsin,Male,White / Caucasian,11/9/2013
13251,UP58757,12/16/1970,0.0,,Neenah,Winnebago,Wisconsin,Female,White / Caucasian,1/10/2020


In [50]:
## List the rows that have a null 'Age To' value
unidentified_attr_2_df = unidentified_df[unidentified_df['Age To'].isna()]
# use len(unidentified_attr_2_df) to get only the row count
unidentified_attr_2_df

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified
0,UP51903,3/30/1986,0.0,,Marbury,Autauga,Alabama,Female,White / Caucasian,1/28/2020
3,UP55409,3/26/2000,0.0,,,Baldwin,Alabama,Male,"White / Caucasian, Asian, American Indian / Al...",2/13/2020
34,UP51245,11/29/2017,0.0,,Huntsville,Madison,Alabama,Male,Uncertain,12/27/2019
35,UP11867,12/8/2013,0.0,,Huntsville,Madison,Alabama,Male,White / Caucasian,11/9/2018
36,UP51243,11/29/2017,0.0,,Huntsville,Madison,Alabama,Female,Uncertain,12/27/2019
...,...,...,...,...,...,...,...,...,...,...
13206,UP52045,6/14/2018,0.0,,Cyclone,Wyoming,West Virginia,Male,Uncertain,1/15/2020
13216,UP13695,11/9/1989,0.0,,Paris,Kenosha,Wisconsin,Male,White / Caucasian,4/11/2017
13240,UP11611,6/5/2008,0.0,,Baraboo,Sauk,Wisconsin,Male,Uncertain,11/22/2013
13248,UP7631,7/10/1973,0.0,,Waukesha,Waukesha,Wisconsin,Male,White / Caucasian,11/9/2013


In [51]:
# Fill every missing 'Age To' value with 120
age_to_filled = unidentified_df['Age To'].fillna(value=120)
unidentified_df['Age To'] = age_to_filled
unidentified_df.head(n=5)

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified
0,UP51903,3/30/1986,0.0,120.0,Marbury,Autauga,Alabama,Female,White / Caucasian,1/28/2020
1,UP5000,9/29/2007,40.0,60.0,Prattville,Autauga,Alabama,Female,Black / African American,8/12/2019
2,UP6519,1/6/2006,45.0,59.0,Elberta,Baldwin,Alabama,Male,White / Caucasian,10/24/2018
3,UP55409,3/26/2000,0.0,120.0,,Baldwin,Alabama,Male,"White / Caucasian, Asian, American Indian / Al...",2/13/2020
4,UP13483,3/27/1961,14.0,17.0,,Bibb,Alabama,Male,White / Caucasian,3/25/2019


In [52]:
## Double check re-assigned values: show every row whose 'Age To' is now 120
## NOTE: any row that already had an 'Age To' of 120 before the fill would be matched as well
unidentified_ageto_test_df = unidentified_df[unidentified_df['Age To'].eq(120)]
unidentified_ageto_test_df
# use len(unidentified_ageto_test_df) to get only the row count

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified
0,UP51903,3/30/1986,0.0,120.0,Marbury,Autauga,Alabama,Female,White / Caucasian,1/28/2020
3,UP55409,3/26/2000,0.0,120.0,,Baldwin,Alabama,Male,"White / Caucasian, Asian, American Indian / Al...",2/13/2020
34,UP51245,11/29/2017,0.0,120.0,Huntsville,Madison,Alabama,Male,Uncertain,12/27/2019
35,UP11867,12/8/2013,0.0,120.0,Huntsville,Madison,Alabama,Male,White / Caucasian,11/9/2018
36,UP51243,11/29/2017,0.0,120.0,Huntsville,Madison,Alabama,Female,Uncertain,12/27/2019
...,...,...,...,...,...,...,...,...,...,...
13206,UP52045,6/14/2018,0.0,120.0,Cyclone,Wyoming,West Virginia,Male,Uncertain,1/15/2020
13216,UP13695,11/9/1989,0.0,120.0,Paris,Kenosha,Wisconsin,Male,White / Caucasian,4/11/2017
13240,UP11611,6/5/2008,0.0,120.0,Baraboo,Sauk,Wisconsin,Male,Uncertain,11/22/2013
13248,UP7631,7/10/1973,0.0,120.0,Waukesha,Waukesha,Wisconsin,Male,White / Caucasian,11/9/2013


In [53]:
## List the rows that have a null 'Sex' value
unidentified_attr_3_df = unidentified_df[unidentified_df['Sex'].isna()]
# use len(unidentified_attr_3_df) to get only the row count
unidentified_attr_3_df

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified
1621,UP54136,,0.0,120.0,,Pima,Arizona,,Uncertain,1/16/2020


In [54]:
# Fill every missing 'Sex' value with the label 'Unknown'
sex_filled = unidentified_df['Sex'].fillna(value='Unknown')
unidentified_df['Sex'] = sex_filled
unidentified_df.head(n=5)

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified
0,UP51903,3/30/1986,0.0,120.0,Marbury,Autauga,Alabama,Female,White / Caucasian,1/28/2020
1,UP5000,9/29/2007,40.0,60.0,Prattville,Autauga,Alabama,Female,Black / African American,8/12/2019
2,UP6519,1/6/2006,45.0,59.0,Elberta,Baldwin,Alabama,Male,White / Caucasian,10/24/2018
3,UP55409,3/26/2000,0.0,120.0,,Baldwin,Alabama,Male,"White / Caucasian, Asian, American Indian / Al...",2/13/2020
4,UP13483,3/27/1961,14.0,17.0,,Bibb,Alabama,Male,White / Caucasian,3/25/2019


In [55]:
## Double check re-assigned values: show every row whose 'Sex' is now 'Unknown'
unidentified_sex_test_df = unidentified_df[unidentified_df['Sex'].eq('Unknown')]
unidentified_sex_test_df
# use len(unidentified_sex_test_df) to get only the row count

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified
1621,UP54136,,0.0,120.0,,Pima,Arizona,Unknown,Uncertain,1/16/2020


In [56]:
## List the rows that have a null 'Race / Ethnicity' value
unidentified_attr_4_df = unidentified_df[unidentified_df['Race / Ethnicity'].isna()]
# use len(unidentified_attr_4_df) to get only the row count
unidentified_attr_4_df

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified
369,UP1978,11/10/2001,0.0,0.0,Gila Bend,Maricopa,Arizona,Female,,7/26/2018
484,UP2024,12/1/2003,17.0,24.0,,Maricopa,Arizona,Male,,6/4/2014
499,UP1938,5/27/1996,22.0,43.0,,Maricopa,Arizona,Male,,6/4/2014
500,UP1936,2/27/1996,19.0,35.0,,Maricopa,Arizona,Male,,6/4/2014
506,UP1927,12/7/1991,45.0,60.0,,Maricopa,Arizona,Male,,4/4/2019
511,UP1914,9/23/1987,25.0,35.0,,Maricopa,Arizona,Male,,6/4/2014
4527,UP5903,5/9/1984,0.0,0.0,,San Francisco,California,Male,,7/23/2019
4528,UP5874,11/27/1981,0.0,0.0,,San Francisco,California,Male,,7/23/2019
4529,UP5841,3/7/1975,0.0,0.0,,San Francisco,California,Male,,7/23/2019
4656,UP53055,4/27/1986,0.0,120.0,Porterville,Tulare,California,Male,,11/29/2018


In [57]:
# Fill every missing 'Race / Ethnicity' value with the label 'Uncertain'
race_filled = unidentified_df['Race / Ethnicity'].fillna(value='Uncertain')
unidentified_df['Race / Ethnicity'] = race_filled
unidentified_df.head(n=5)

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified
0,UP51903,3/30/1986,0.0,120.0,Marbury,Autauga,Alabama,Female,White / Caucasian,1/28/2020
1,UP5000,9/29/2007,40.0,60.0,Prattville,Autauga,Alabama,Female,Black / African American,8/12/2019
2,UP6519,1/6/2006,45.0,59.0,Elberta,Baldwin,Alabama,Male,White / Caucasian,10/24/2018
3,UP55409,3/26/2000,0.0,120.0,,Baldwin,Alabama,Male,"White / Caucasian, Asian, American Indian / Al...",2/13/2020
4,UP13483,3/27/1961,14.0,17.0,,Bibb,Alabama,Male,White / Caucasian,3/25/2019


In [59]:
## Double check re-assigned values: show every row whose 'Race / Ethnicity' is now 'Uncertain'
## NOTE: this filter also returns rows that were already 'Uncertain' before the fill
unidentified_race_test_df = unidentified_df[unidentified_df['Race / Ethnicity'].eq('Uncertain')]
unidentified_race_test_df
# use len(unidentified_race_test_df) to get only the row count

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified
6,UP5011,7/22/2007,30.0,40.0,Titus,Elmore,Alabama,Female,Uncertain,4/13/2020
7,UP15338,4/12/2002,18.0,99.0,Adger,Jefferson,Alabama,Unsure,Uncertain,3/18/2020
11,UP14991,4/8/2016,20.0,60.0,Birmingham,Jefferson,Alabama,Male,Uncertain,3/18/2020
12,UP13987,12/23/2012,18.0,99.0,Birmingham,Jefferson,Alabama,Male,Uncertain,3/18/2020
15,UP14466,6/22/2004,17.0,30.0,Birmingham,Jefferson,Alabama,Male,Uncertain,3/18/2020
...,...,...,...,...,...,...,...,...,...,...
13243,UP12012,10/11/1979,20.0,35.0,East Troy,Walworth,Wisconsin,Male,Uncertain,7/18/2019
13245,UP7405,11/18/1984,30.0,55.0,Town of Mukwonago,Waukesha,Wisconsin,Female,Uncertain,10/11/2019
13246,UP7634,9/28/1975,18.0,99.0,Town of Oconomowoc,Waukesha,Wisconsin,Male,Uncertain,1/27/2012
13249,UP7548,4/3/1976,25.0,50.0,Waukesha,Waukesha,Wisconsin,Female,Uncertain,10/3/2019


# Part 3 - Prep data for state-level GeoJSON with data from all 3 databases

### 2a: Add in State FIPS column to each database dataframe

In [60]:
# Build a lookup keyed by state name (NAME) that returns the state FIPS code (STATEFP)
state_dict = dict(zip(state_centroids_df['NAME'], state_centroids_df['STATEFP']))
# spot checks, if needed: inspect state_dict, or look up one state such as state_dict['Alaska']
len(state_dict)

55

#### 1/3: add state FIPS codes to missing persons dataframe

In [61]:
# Preview the first five rows of the missing persons dataframe before adding the FIPS column
missing_df.head(n=5)

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified
0,MP2341,5/8/1999,Reynolds,James,28,Opp,Autauga,Alabama,Male,Black / African American,2/14/2020
1,MP8522,1/15/2010,Walker,Jeremy,32,Prattville,Autauga,Alabama,Male,White / Caucasian,7/27/2015
2,MP50314,5/16/2018,Keszthelyi,Steve,79,Elberta,Baldwin,Alabama,Male,White / Caucasian,11/13/2019
3,MP43187,2/20/2018,Johnson,Abram,60,fairhope,Baldwin,Alabama,Male,White / Caucasian,2/13/2020
4,MP23019,11/5/2012,Shroyer,James,40,Fairhope,Baldwin,Alabama,Male,White / Caucasian,2/3/2016


In [62]:
# Create the 'State_FIPS' column by looking up each row's state name in state_dict
# (a state name that is not a key in state_dict yields NaN)
missing_state_names = missing_df['State']
missing_df['State_FIPS'] = missing_state_names.map(state_dict)
missing_df.head(n=5)

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS
0,MP2341,5/8/1999,Reynolds,James,28,Opp,Autauga,Alabama,Male,Black / African American,2/14/2020,1
1,MP8522,1/15/2010,Walker,Jeremy,32,Prattville,Autauga,Alabama,Male,White / Caucasian,7/27/2015,1
2,MP50314,5/16/2018,Keszthelyi,Steve,79,Elberta,Baldwin,Alabama,Male,White / Caucasian,11/13/2019,1
3,MP43187,2/20/2018,Johnson,Abram,60,fairhope,Baldwin,Alabama,Male,White / Caucasian,2/13/2020,1
4,MP23019,11/5/2012,Shroyer,James,40,Fairhope,Baldwin,Alabama,Male,White / Caucasian,2/3/2016,1


In [63]:
# List the distinct State_FIPS values produced by the lookup
missing_df.loc[:, 'State_FIPS'].unique()

array([ 1,  2,  4,  5,  6,  8, 69,  9, 10, 11, 12, 13, 66, 15, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
       36, 37, 38, 39, 40, 41, 42, 72, 44, 45, 46, 47, 48, 78, 49, 50, 51,
       53, 54, 55, 56], dtype=int64)

In [64]:
## Isolate rows whose state name did not map to a FIPS code, to see what is producing nulls [NOTE: fixed by making changes to csv fields, as noted where csvs are imported]
mis_test_df = missing_df[missing_df['State_FIPS'].isna()]
mis_test_df.head(n=5)

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS


In [65]:
# Preview the missing persons dataframe with the new State_FIPS column
missing_df.head(n=5)
# use len(missing_df) to get the row count

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS
0,MP2341,5/8/1999,Reynolds,James,28,Opp,Autauga,Alabama,Male,Black / African American,2/14/2020,1
1,MP8522,1/15/2010,Walker,Jeremy,32,Prattville,Autauga,Alabama,Male,White / Caucasian,7/27/2015,1
2,MP50314,5/16/2018,Keszthelyi,Steve,79,Elberta,Baldwin,Alabama,Male,White / Caucasian,11/13/2019,1
3,MP43187,2/20/2018,Johnson,Abram,60,fairhope,Baldwin,Alabama,Male,White / Caucasian,2/13/2020,1
4,MP23019,11/5/2012,Shroyer,James,40,Fairhope,Baldwin,Alabama,Male,White / Caucasian,2/3/2016,1


#### 2/3: add state FIPS codes to unclaimed persons dataframe

In [66]:
# Preview the first five rows of the unclaimed dataframe before adding the FIPS column
unclaimed_df.head(n=5)
# use len(unclaimed_df) to get the row count

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified
0,UCP68231,4/15/2020,Sanders,Stephen,Male,White / Caucasian,Joliet,Will,Illinois,4/16/2020
1,UCP68248,4/13/2020,Brookshire,Renee,Female,White / Caucasian,Joliet,Will,Illinois,4/16/2020
2,UCP68242,4/9/2020,Smith,Victoria,Female,White / Caucasian,Manhattan,New York,New York,4/16/2020
3,UCP68244,4/9/2020,Bellamy,Eldred,Male,Black / African American,Brooklyn,Kings,New York,4/16/2020
4,UCP68228,4/9/2020,Rodriguez,Elido,Male,Hispanic / Latino,Manhattan,New York,New York,4/15/2020


In [67]:
# Create the 'State_FIPS' column by looking up each row's state name in state_dict
# (a state name that is not a key in state_dict, including a null state, yields NaN)
unclaimed_state_names = unclaimed_df['State']
unclaimed_df['State_FIPS'] = unclaimed_state_names.map(state_dict)
unclaimed_df.head(n=5)

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS
0,UCP68231,4/15/2020,Sanders,Stephen,Male,White / Caucasian,Joliet,Will,Illinois,4/16/2020,17.0
1,UCP68248,4/13/2020,Brookshire,Renee,Female,White / Caucasian,Joliet,Will,Illinois,4/16/2020,17.0
2,UCP68242,4/9/2020,Smith,Victoria,Female,White / Caucasian,Manhattan,New York,New York,4/16/2020,36.0
3,UCP68244,4/9/2020,Bellamy,Eldred,Male,Black / African American,Brooklyn,Kings,New York,4/16/2020,36.0
4,UCP68228,4/9/2020,Rodriguez,Elido,Male,Hispanic / Latino,Manhattan,New York,New York,4/15/2020,36.0


In [68]:
# List the distinct State_FIPS values produced by the lookup
unclaimed_df.loc[:, 'State_FIPS'].unique()

array([17., 36., 25.,  6., 53., 15., 32., 16., 48., 49., 47., 11., 33.,
       35., 40., 42., 39., 29.,  5., 13.,  1., 34.,  4., 12., 55.,  9.,
        8., 26., 22., 18., 19., 41., 23., nan, 45., 56., 54., 51.,  2.,
       28., 21., 37., 10., 38.])

In [69]:
## Isolate rows whose State_FIPS is null, to see what is producing nulls [NOTE: cannot be addressed, as these cases do not have a state or county assigned]
unc_test_df = unclaimed_df[unclaimed_df['State_FIPS'].isna()]
# use len(unc_test_df) to get only the row count
unc_test_df

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS
3380,UCP5339,12/3/2016,Williams,Barbara,Female,Asian,,,,5/3/2018,
4774,UCP2489,3/26/2015,Lupien,Daryl,Male,Uncertain,,,,8/18/2016,
5023,UCP4001,11/2/2014,Brown,David,Male,White / Caucasian,,,,5/11/2017,
5883,UCP3962,3/22/2010,Gonzalez,David,Male,Hispanic / Latino,,,,5/11/2017,
5950,UCP3937,12/7/2009,Togovnick,Bernice,Female,White / Caucasian,,,,5/11/2017,
6183,UCP3901,1/21/2009,Carney,John,Male,White / Caucasian,,,,5/11/2017,
6341,UCP3689,6/5/2008,Guzman,Ralph,Male,Hispanic / Latino,,,,5/2/2017,
6560,UCP3824,7/2/2007,Vasquez,Rafael,Male,Hispanic / Latino,,,,5/4/2017,
6698,UCP3672,9/22/2006,Barajas,Epifanio,Male,Hispanic / Latino,,,,5/2/2017,
6719,UCP3668,8/5/2006,Huff,Paul,Male,White / Caucasian,,,,5/2/2017,


In [70]:
# Replace NaN FIPS codes (cases with no city, county or state) with 99, the none option in
# state_centroids_v2.csv, then cast the column to int so codes no longer display as floats
unclaimed_df['State_FIPS'] = unclaimed_df['State_FIPS'].fillna(99).astype(int)
unclaimed_df.head(n=5)

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS
0,UCP68231,4/15/2020,Sanders,Stephen,Male,White / Caucasian,Joliet,Will,Illinois,4/16/2020,17
1,UCP68248,4/13/2020,Brookshire,Renee,Female,White / Caucasian,Joliet,Will,Illinois,4/16/2020,17
2,UCP68242,4/9/2020,Smith,Victoria,Female,White / Caucasian,Manhattan,New York,New York,4/16/2020,36
3,UCP68244,4/9/2020,Bellamy,Eldred,Male,Black / African American,Brooklyn,Kings,New York,4/16/2020,36
4,UCP68228,4/9/2020,Rodriguez,Elido,Male,Hispanic / Latino,Manhattan,New York,New York,4/15/2020,36


In [71]:
## Double check re-assigned values: show every row that now carries the placeholder FIPS code 99
unc_test_df2 = unclaimed_df[unclaimed_df['State_FIPS'].eq(99)]
unc_test_df2
# use len(unc_test_df2) to get only the row count

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS
3380,UCP5339,12/3/2016,Williams,Barbara,Female,Asian,,,,5/3/2018,99
4774,UCP2489,3/26/2015,Lupien,Daryl,Male,Uncertain,,,,8/18/2016,99
5023,UCP4001,11/2/2014,Brown,David,Male,White / Caucasian,,,,5/11/2017,99
5883,UCP3962,3/22/2010,Gonzalez,David,Male,Hispanic / Latino,,,,5/11/2017,99
5950,UCP3937,12/7/2009,Togovnick,Bernice,Female,White / Caucasian,,,,5/11/2017,99
6183,UCP3901,1/21/2009,Carney,John,Male,White / Caucasian,,,,5/11/2017,99
6341,UCP3689,6/5/2008,Guzman,Ralph,Male,Hispanic / Latino,,,,5/2/2017,99
6560,UCP3824,7/2/2007,Vasquez,Rafael,Male,Hispanic / Latino,,,,5/4/2017,99
6698,UCP3672,9/22/2006,Barajas,Epifanio,Male,Hispanic / Latino,,,,5/2/2017,99
6719,UCP3668,8/5/2006,Huff,Paul,Male,White / Caucasian,,,,5/2/2017,99


In [72]:
# Row count of the full unclaimed dataframe
unclaimed_df.shape[0]

8335

#### 3/3: add state FIPS codes to unidentified persons dataframe

In [73]:
# Preview the first five rows of the unidentified dataframe before adding the FIPS column
unidentified_df.head(n=5)

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified
0,UP51903,3/30/1986,0.0,120.0,Marbury,Autauga,Alabama,Female,White / Caucasian,1/28/2020
1,UP5000,9/29/2007,40.0,60.0,Prattville,Autauga,Alabama,Female,Black / African American,8/12/2019
2,UP6519,1/6/2006,45.0,59.0,Elberta,Baldwin,Alabama,Male,White / Caucasian,10/24/2018
3,UP55409,3/26/2000,0.0,120.0,,Baldwin,Alabama,Male,"White / Caucasian, Asian, American Indian / Al...",2/13/2020
4,UP13483,3/27/1961,14.0,17.0,,Bibb,Alabama,Male,White / Caucasian,3/25/2019


In [74]:
# Create the 'State_FIPS' column by looking up each row's state name in state_dict
# (a state name that is not a key in state_dict yields NaN)
unidentified_state_names = unidentified_df['State']
unidentified_df['State_FIPS'] = unidentified_state_names.map(state_dict)
unidentified_df.head(n=5)

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS
0,UP51903,3/30/1986,0.0,120.0,Marbury,Autauga,Alabama,Female,White / Caucasian,1/28/2020,1
1,UP5000,9/29/2007,40.0,60.0,Prattville,Autauga,Alabama,Female,Black / African American,8/12/2019,1
2,UP6519,1/6/2006,45.0,59.0,Elberta,Baldwin,Alabama,Male,White / Caucasian,10/24/2018,1
3,UP55409,3/26/2000,0.0,120.0,,Baldwin,Alabama,Male,"White / Caucasian, Asian, American Indian / Al...",2/13/2020,1
4,UP13483,3/27/1961,14.0,17.0,,Bibb,Alabama,Male,White / Caucasian,3/25/2019,1


In [75]:
# List the distinct State_FIPS values produced by the lookup
unidentified_df.loc[:, 'State_FIPS'].unique()

array([ 1,  2,  4,  5,  6,  8,  9, 10, 11, 12, 13, 66, 15, 16, 17, 18, 19,
       20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
       37, 38, 39, 40, 41, 42, 72, 44, 45, 46, 47, 48, 78, 49, 50, 51, 53,
       54, 55, 56], dtype=int64)

In [76]:
## Isolate rows whose state name did not map to a FIPS code, to see what is producing nulls [NOTE: fixed by making changes to csv fields, as noted where csvs are imported]
uni_test_df = unidentified_df[unidentified_df['State_FIPS'].isna()]
uni_test_df.head(n=5)

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS


# Part 4: Prep data for county-level JSONs for all 3 databases

#### Add in State name and County_Key columns to county centroid v2 dataframe

In [77]:
# Build the reverse lookup from the v2 state centroids: keyed by state FIPS code (STATEFP),
# returning the state name (NAME)
state_FIPS_dict = dict(zip(state_centroids_v2_df['STATEFP'], state_centroids_v2_df['NAME']))
# spot check, if needed: inspect state_FIPS_dict
len(state_FIPS_dict)

56

In [78]:
# Preview the first five rows of the v2 county centroids dataframe
county_centroids_v2_df.head(n=5)

Unnamed: 0,OBJECTID,STATEFP,COUNTYFP,COUNTYNS,AFFGEOID,GEOID,NAME,LSAD,ALAND,AWATER,ORIG_FID,Lon_dd,Lat_dd
0,1,21,7,516850,0500000US21007,21007,Ballard,6,639387500.0,69473325,1,-88.999262,37.058489
1,2,21,17,516855,0500000US21017,21017,Bourbon,6,750439400.0,4829777,2,-84.217155,38.206742
2,3,21,31,516862,0500000US21031,21031,Butler,6,1103572000.0,13943044,3,-86.681628,37.207292
3,4,21,65,516879,0500000US21065,21065,Estill,6,655509900.0,6516335,4,-83.964316,37.692451
4,5,21,69,516881,0500000US21069,21069,Fleming,6,902727200.0,7182793,5,-83.69666,38.370126


In [79]:
# Create the 'STATE_NAME' column by looking up each row's state FIPS code (STATEFP) in
# state_FIPS_dict, which returns the matching state name
county_state_codes = county_centroids_v2_df['STATEFP']
county_centroids_v2_df['STATE_NAME'] = county_state_codes.map(state_FIPS_dict)
# preview the result
county_centroids_v2_df.head(n=5)
# to list the distinct names instead: county_centroids_v2_df['STATE_NAME'].unique()

Unnamed: 0,OBJECTID,STATEFP,COUNTYFP,COUNTYNS,AFFGEOID,GEOID,NAME,LSAD,ALAND,AWATER,ORIG_FID,Lon_dd,Lat_dd,STATE_NAME
0,1,21,7,516850,0500000US21007,21007,Ballard,6,639387500.0,69473325,1,-88.999262,37.058489,Kentucky
1,2,21,17,516855,0500000US21017,21017,Bourbon,6,750439400.0,4829777,2,-84.217155,38.206742,Kentucky
2,3,21,31,516862,0500000US21031,21031,Butler,6,1103572000.0,13943044,3,-86.681628,37.207292,Kentucky
3,4,21,65,516879,0500000US21065,21065,Estill,6,655509900.0,6516335,4,-83.964316,37.692451,Kentucky
4,5,21,69,516881,0500000US21069,21069,Fleming,6,902727200.0,7182793,5,-83.69666,38.370126,Kentucky


In [80]:
# Order the v2 county centroids by state FIPS code (ascending)
county_centroids_v2_df = county_centroids_v2_df.sort_values('STATEFP')
county_centroids_v2_df.head(n=5)

Unnamed: 0,OBJECTID,STATEFP,COUNTYFP,COUNTYNS,AFFGEOID,GEOID,NAME,LSAD,ALAND,AWATER,ORIG_FID,Lon_dd,Lat_dd,STATE_NAME
1043,1044,1,61,161556,0500000US01061,1061,Geneva,6,1487898000.0,11578163,1044,-85.839096,31.095061,Alabama
1255,1256,1,17,161534,0500000US01017,1017,Chambers,6,1545086000.0,16971701,1256,-85.392035,32.914352,Alabama
3228,100000,1,999,9999100,9999999US100000,1999,,0,0.0,0,99991,45.0,-90.0,Alabama
471,472,1,49,161550,0500000US01049,1049,DeKalb,6,2012676000.0,4121543,472,-85.804114,34.459807,Alabama
472,473,1,63,161557,0500000US01063,1063,Greene,6,1675782000.0,33416141,473,-87.952209,32.853154,Alabama


In [81]:
# Create the compound 'County_Key' column in the form '<state FIPS>_<county name>', e.g. '1_Geneva'
state_prefix_v2 = county_centroids_v2_df['STATEFP'].astype(str)
county_centroids_v2_df['County_Key'] = state_prefix_v2 + "_" + county_centroids_v2_df['NAME']
county_centroids_v2_df.head(n=5)

Unnamed: 0,OBJECTID,STATEFP,COUNTYFP,COUNTYNS,AFFGEOID,GEOID,NAME,LSAD,ALAND,AWATER,ORIG_FID,Lon_dd,Lat_dd,STATE_NAME,County_Key
1043,1044,1,61,161556,0500000US01061,1061,Geneva,6,1487898000.0,11578163,1044,-85.839096,31.095061,Alabama,1_Geneva
1255,1256,1,17,161534,0500000US01017,1017,Chambers,6,1545086000.0,16971701,1256,-85.392035,32.914352,Alabama,1_Chambers
3228,100000,1,999,9999100,9999999US100000,1999,,0,0.0,0,99991,45.0,-90.0,Alabama,1_None
471,472,1,49,161550,0500000US01049,1049,DeKalb,6,2012676000.0,4121543,472,-85.804114,34.459807,Alabama,1_DeKalb
472,473,1,63,161557,0500000US01063,1063,Greene,6,1675782000.0,33416141,473,-87.952209,32.853154,Alabama,1_Greene


In [82]:
# Collect the distinct County_Key values and report how many there are
county_key_v2_list = county_centroids_v2_df.loc[:, 'County_Key'].unique()
len(county_key_v2_list)

3283

In [83]:
# Build a lookup keyed by County_Key that returns the county FIPS code (GEOID field)
county_v2_dict = dict(zip(county_centroids_v2_df['County_Key'], county_centroids_v2_df['GEOID']))
len(county_v2_dict)

3283

#### Add in County_Key column to county centroids dataframe

In [84]:
# Order the county centroids by state FIPS code (ascending)
county_centroids_df = county_centroids_df.sort_values('STATEFP')
county_centroids_df.head(n=5)

Unnamed: 0,OBJECTID,STATEFP,COUNTYFP,COUNTYNS,AFFGEOID,GEOID,NAME,LSAD,ALAND,AWATER,ORIG_FID,Lon_dd,Lat_dd
3211,3212,1,121,161586,0500000US01121,1121,Talladega,6,1908257000.0,60926366,3218,-86.16589,33.380086
1194,1195,1,93,161573,0500000US01093,1093,Marion,6,1922657000.0,3184076,1195,-87.88714,34.136562
1255,1256,1,17,161534,0500000US01017,1017,Chambers,6,1545086000.0,16971701,1256,-85.392035,32.914352
1215,1216,1,59,161555,0500000US01059,1059,Franklin,6,1641841000.0,32643981,1216,-87.84374,34.441699
2780,2781,1,127,161589,0500000US01127,1127,Walker,6,2048686000.0,36754696,2785,-87.297329,33.803318


In [85]:
# Create the compound 'County_Key' column in the form '<state FIPS>_<county name>', e.g. '1_Talladega'
state_prefix = county_centroids_df['STATEFP'].astype(str)
county_centroids_df['County_Key'] = state_prefix + "_" + county_centroids_df['NAME']
county_centroids_df.head(n=5)

Unnamed: 0,OBJECTID,STATEFP,COUNTYFP,COUNTYNS,AFFGEOID,GEOID,NAME,LSAD,ALAND,AWATER,ORIG_FID,Lon_dd,Lat_dd,County_Key
3211,3212,1,121,161586,0500000US01121,1121,Talladega,6,1908257000.0,60926366,3218,-86.16589,33.380086,1_Talladega
1194,1195,1,93,161573,0500000US01093,1093,Marion,6,1922657000.0,3184076,1195,-87.88714,34.136562,1_Marion
1255,1256,1,17,161534,0500000US01017,1017,Chambers,6,1545086000.0,16971701,1256,-85.392035,32.914352,1_Chambers
1215,1216,1,59,161555,0500000US01059,1059,Franklin,6,1641841000.0,32643981,1216,-87.84374,34.441699,1_Franklin
2780,2781,1,127,161589,0500000US01127,1127,Walker,6,2048686000.0,36754696,2785,-87.297329,33.803318,1_Walker


In [86]:
# Collect the distinct County_Key values and report how many there are
county_key_list = county_centroids_df.loc[:, 'County_Key'].unique()
len(county_key_list)

3227

In [87]:
# Build a lookup keyed by County_Key that returns the county FIPS code (GEOID field)
county_dict = dict(zip(county_centroids_df['County_Key'], county_centroids_df['GEOID']))
len(county_dict)

3227

#### Add in County FIPS column to missing dataframe

##### To identify null values, add in County FIPS column to missing dataframe using county_centroids_df

In [88]:
# Preview the first five rows of the missing persons dataframe before adding county fields
missing_df.head(n=5)

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS
0,MP2341,5/8/1999,Reynolds,James,28,Opp,Autauga,Alabama,Male,Black / African American,2/14/2020,1
1,MP8522,1/15/2010,Walker,Jeremy,32,Prattville,Autauga,Alabama,Male,White / Caucasian,7/27/2015,1
2,MP50314,5/16/2018,Keszthelyi,Steve,79,Elberta,Baldwin,Alabama,Male,White / Caucasian,11/13/2019,1
3,MP43187,2/20/2018,Johnson,Abram,60,fairhope,Baldwin,Alabama,Male,White / Caucasian,2/13/2020,1
4,MP23019,11/5/2012,Shroyer,James,40,Fairhope,Baldwin,Alabama,Male,White / Caucasian,2/3/2016,1


In [89]:
# Create the compound 'State_County' column in the form '<state FIPS>_<county name>', e.g. '1_Autauga'
# (rows with a null County end up with a null State_County)
missing_state_prefix = missing_df['State_FIPS'].astype(str)
missing_df['State_County'] = missing_state_prefix + "_" + missing_df['County']
missing_df.head(n=5)

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County
0,MP2341,5/8/1999,Reynolds,James,28,Opp,Autauga,Alabama,Male,Black / African American,2/14/2020,1,1_Autauga
1,MP8522,1/15/2010,Walker,Jeremy,32,Prattville,Autauga,Alabama,Male,White / Caucasian,7/27/2015,1,1_Autauga
2,MP50314,5/16/2018,Keszthelyi,Steve,79,Elberta,Baldwin,Alabama,Male,White / Caucasian,11/13/2019,1,1_Baldwin
3,MP43187,2/20/2018,Johnson,Abram,60,fairhope,Baldwin,Alabama,Male,White / Caucasian,2/13/2020,1,1_Baldwin
4,MP23019,11/5/2012,Shroyer,James,40,Fairhope,Baldwin,Alabama,Male,White / Caucasian,2/3/2016,1,1_Baldwin


In [90]:
# Create the 'County_FIPS' column by looking up each row's State_County key in county_dict
# (a key that is not in county_dict yields NaN)
missing_county_keys = missing_df['State_County']
missing_df['County_FIPS'] = missing_county_keys.map(county_dict)
# List the distinct County_FIPS values produced by the lookup
missing_df.loc[:, 'County_FIPS'].unique()

array([ 1001.,  1003.,  1005., ..., 56035., 56037., 56039.])

In [91]:
# Isolate the rows that did not receive a County_FIPS value [NOTE: all ok - no city or county assigned]
missing_county_null_df = missing_df[missing_df['County_FIPS'].isna()]
# use missing_county_null_df.shape to get the dimensions only
missing_county_null_df

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS
14181,MP4569,5/5/1974,Colonna aponte,John,12,,,Puerto Rico,Male,"White / Caucasian, Hispanic / Latino",5/15/2018,72,,
14182,MP54374,12/15/1984,Cruz,Marcos,2,,,Puerto Rico,Male,Black / African American,3/12/2020,72,,
14183,MP4568,5/4/1974,Colonna aponte,Giannina,11,,,Puerto Rico,Female,"White / Caucasian, Hispanic / Latino",4/25/2018,72,,


In [95]:
# As needed, export nulls to address [NOTE: all that can have been addressed]
# missing_county_null_df.to_csv('missing_county_nulls.csv', encoding='Windows-1252')

In [96]:
# Preview missing_df to confirm the State_County and County_FIPS columns are present
missing_df.head()

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS
0,MP2341,5/8/1999,Reynolds,James,28,Opp,Autauga,Alabama,Male,Black / African American,2/14/2020,1,1_Autauga,1001.0
1,MP8522,1/15/2010,Walker,Jeremy,32,Prattville,Autauga,Alabama,Male,White / Caucasian,7/27/2015,1,1_Autauga,1001.0
2,MP50314,5/16/2018,Keszthelyi,Steve,79,Elberta,Baldwin,Alabama,Male,White / Caucasian,11/13/2019,1,1_Baldwin,1003.0
3,MP43187,2/20/2018,Johnson,Abram,60,fairhope,Baldwin,Alabama,Male,White / Caucasian,2/13/2020,1,1_Baldwin,1003.0
4,MP23019,11/5/2012,Shroyer,James,40,Fairhope,Baldwin,Alabama,Male,White / Caucasian,2/3/2016,1,1_Baldwin,1003.0


##### Replace null state values with 'None' and null county values with 'None', then re-write county_FIPS column using county_centroids_v2_dataframe

In [97]:
## Collect any rows whose 'State' value is null
mis_state_test_df = missing_df[missing_df['State'].isna()]
# (len(mis_state_test_df) gives just the count)
mis_state_test_df

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS


In [98]:
## Collect any rows whose 'County' value is null (only County is tested here, not City)
mis_county_test_df = missing_df[missing_df['County'].isna()]
# (len(mis_county_test_df) gives just the count)
mis_county_test_df

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS
14181,MP4569,5/5/1974,Colonna aponte,John,12,,,Puerto Rico,Male,"White / Caucasian, Hispanic / Latino",5/15/2018,72,,
14182,MP54374,12/15/1984,Cruz,Marcos,2,,,Puerto Rico,Male,Black / African American,3/12/2020,72,,
14183,MP4568,5/4/1974,Colonna aponte,Giannina,11,,,Puerto Rico,Female,"White / Caucasian, Hispanic / Latino",4/25/2018,72,,


In [99]:
# Replace NaN County names (cases with no city or county) with the placeholder string 'None'
# so that a State_County key can be built for every row
missing_df['County'] = missing_df['County'].fillna('None')
missing_df.head()

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS
0,MP2341,5/8/1999,Reynolds,James,28,Opp,Autauga,Alabama,Male,Black / African American,2/14/2020,1,1_Autauga,1001.0
1,MP8522,1/15/2010,Walker,Jeremy,32,Prattville,Autauga,Alabama,Male,White / Caucasian,7/27/2015,1,1_Autauga,1001.0
2,MP50314,5/16/2018,Keszthelyi,Steve,79,Elberta,Baldwin,Alabama,Male,White / Caucasian,11/13/2019,1,1_Baldwin,1003.0
3,MP43187,2/20/2018,Johnson,Abram,60,fairhope,Baldwin,Alabama,Male,White / Caucasian,2/13/2020,1,1_Baldwin,1003.0
4,MP23019,11/5/2012,Shroyer,James,40,Fairhope,Baldwin,Alabama,Male,White / Caucasian,2/3/2016,1,1_Baldwin,1003.0


In [100]:
## Confirm the fill: list the rows whose County is now the 'None' placeholder
mis_county_test_2_df = missing_df[missing_df['County'].eq('None')]
mis_county_test_2_df
# (len(mis_county_test_2_df) gives just the count)

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS
14181,MP4569,5/5/1974,Colonna aponte,John,12,,,Puerto Rico,Male,"White / Caucasian, Hispanic / Latino",5/15/2018,72,,
14182,MP54374,12/15/1984,Cruz,Marcos,2,,,Puerto Rico,Male,Black / African American,3/12/2020,72,,
14183,MP4568,5/4/1974,Colonna aponte,Giannina,11,,,Puerto Rico,Female,"White / Caucasian, Hispanic / Latino",4/25/2018,72,,


In [101]:
# Rebuild the compound "<State_FIPS>_<County>" key now that null counties read 'None',
# so those rows get a "<State_FIPS>_None" key instead of NaN.
missing_df['State_County'] = missing_df['State_FIPS'].astype(str).str.cat(missing_df['County'], sep="_")
missing_df.head()

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS
0,MP2341,5/8/1999,Reynolds,James,28,Opp,Autauga,Alabama,Male,Black / African American,2/14/2020,1,1_Autauga,1001.0
1,MP8522,1/15/2010,Walker,Jeremy,32,Prattville,Autauga,Alabama,Male,White / Caucasian,7/27/2015,1,1_Autauga,1001.0
2,MP50314,5/16/2018,Keszthelyi,Steve,79,Elberta,Baldwin,Alabama,Male,White / Caucasian,11/13/2019,1,1_Baldwin,1003.0
3,MP43187,2/20/2018,Johnson,Abram,60,fairhope,Baldwin,Alabama,Male,White / Caucasian,2/13/2020,1,1_Baldwin,1003.0
4,MP23019,11/5/2012,Shroyer,James,40,Fairhope,Baldwin,Alabama,Male,White / Caucasian,2/3/2016,1,1_Baldwin,1003.0


In [102]:
## Confirm the rebuilt key: rows with the 'None' county placeholder should now show "<State_FIPS>_None"
mis_county_test_3_df = missing_df[missing_df['County'].eq('None')]
mis_county_test_3_df
# (len(mis_county_test_3_df) gives just the count)

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS
14181,MP4569,5/5/1974,Colonna aponte,John,12,,,Puerto Rico,Male,"White / Caucasian, Hispanic / Latino",5/15/2018,72,72_None,
14182,MP54374,12/15/1984,Cruz,Marcos,2,,,Puerto Rico,Male,Black / African American,3/12/2020,72,72_None,
14183,MP4568,5/4/1974,Colonna aponte,Giannina,11,,,Puerto Rico,Female,"White / Caucasian, Hispanic / Latino",4/25/2018,72,72_None,


In [103]:
# Overwrite County_FIPS by looking up the rebuilt State_County keys in county_v2_dict,
# so that rows keyed "<State_FIPS>_None" also receive a (placeholder) code
missing_df['County_FIPS'] = missing_df['State_County'].map(county_v2_dict)
# List the distinct FIPS codes that were assigned, as a quick sanity check
missing_df['County_FIPS'].unique()

array([ 1001,  1003,  1005, ..., 56035, 56037, 56039], dtype=int64)

In [104]:
# Verify that no row is left without a county FIPS code [Note - all gone!]
missing_county_null_v2_df = missing_df[missing_df['County_FIPS'].isna()]
# (inspect missing_county_null_v2_df.shape instead to see only the dimensions)
missing_county_null_v2_df

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS


In [105]:
## Final check: rows with the 'None' county placeholder should now carry a County_FIPS value
mis_county_test_4_df = missing_df[missing_df['County'].eq('None')]
mis_county_test_4_df
# (len(mis_county_test_4_df) gives just the count)

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS
14181,MP4569,5/5/1974,Colonna aponte,John,12,,,Puerto Rico,Male,"White / Caucasian, Hispanic / Latino",5/15/2018,72,72_None,72999
14182,MP54374,12/15/1984,Cruz,Marcos,2,,,Puerto Rico,Male,Black / African American,3/12/2020,72,72_None,72999
14183,MP4568,5/4/1974,Colonna aponte,Giannina,11,,,Puerto Rico,Female,"White / Caucasian, Hispanic / Latino",4/25/2018,72,72_None,72999


#### Add in County FIPS column to unclaimed dataframe

##### To identify null values, add in County FIPS column to unclaimed dataframe using county_centroids_df

In [106]:
# Preview unclaimed_df before the county key and county FIPS columns are added
unclaimed_df.head()

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS
0,UCP68231,4/15/2020,Sanders,Stephen,Male,White / Caucasian,Joliet,Will,Illinois,4/16/2020,17
1,UCP68248,4/13/2020,Brookshire,Renee,Female,White / Caucasian,Joliet,Will,Illinois,4/16/2020,17
2,UCP68242,4/9/2020,Smith,Victoria,Female,White / Caucasian,Manhattan,New York,New York,4/16/2020,36
3,UCP68244,4/9/2020,Bellamy,Eldred,Male,Black / African American,Brooklyn,Kings,New York,4/16/2020,36
4,UCP68228,4/9/2020,Rodriguez,Elido,Male,Hispanic / Latino,Manhattan,New York,New York,4/15/2020,36


In [107]:
# Build the compound lookup key "<State_FIPS>_<County>" for every row.
# A row whose County is NaN gets a NaN key, which makes it easy to spot afterwards.
unclaimed_df['State_County'] = unclaimed_df['State_FIPS'].astype(str).str.cat(unclaimed_df['County'], sep="_")
unclaimed_df.head()

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS,State_County
0,UCP68231,4/15/2020,Sanders,Stephen,Male,White / Caucasian,Joliet,Will,Illinois,4/16/2020,17,17_Will
1,UCP68248,4/13/2020,Brookshire,Renee,Female,White / Caucasian,Joliet,Will,Illinois,4/16/2020,17,17_Will
2,UCP68242,4/9/2020,Smith,Victoria,Female,White / Caucasian,Manhattan,New York,New York,4/16/2020,36,36_New York
3,UCP68244,4/9/2020,Bellamy,Eldred,Male,Black / African American,Brooklyn,Kings,New York,4/16/2020,36,36_Kings
4,UCP68228,4/9/2020,Rodriguez,Elido,Male,Hispanic / Latino,Manhattan,New York,New York,4/15/2020,36,36_New York


In [108]:
# Add a County_FIPS column by looking up each row's State_County key in county_dict.
# Rows whose key has no match are left as NaN and are inspected in the null check that follows.
unclaimed_df['County_FIPS'] = unclaimed_df['State_County'].map(county_dict)
# List the distinct FIPS codes that were assigned, as a quick sanity check
unclaimed_df['County_FIPS'].unique()

array([17197., 36061., 36047., 36005., 36081., 25025.,  6073., 17063.,
       36085., 53053., 15003., 32031., 25021., 25027., 16001., 25009.,
       25017., 25003., 25013., 25023., 32027., 25015.,  6047., 48201.,
       49035., 47093., 53075., 53027., 11001., 25005., 47157., 33005.,
       35031., 53049., 53077., 40143., 42029., 47155., 47001., 39099.,
       29019.,  6075.,  5143.,  6093., 13121.,  1073., 34025., 34023.,
       25001.,  4019.,  4013.,  6023.,  6099., 34001., 12087., 47035.,
        6037., 40121., 53061., 55025.,  6071.,  9003., 48141., 48339.,
        9009., 42101., 12071., 53033., 17043., 34021.,  9011., 48215.,
        8087., 12051., 26163.,  9001., 47009., 17031., 22069.,  9005.,
       32005.,  9007., 47065., 34013.,    nan,  4007., 47013., 18067.,
       42001.,  6059., 19155.,  6083., 26025., 40101., 40041., 42091.,
       55057., 48167., 19153.,  6025., 26121., 40145., 25011.,  9015.,
        1115.,  5035., 53035., 53073., 41051., 16027., 23005., 47179.,
      

In [109]:
# Collect the rows that received no county FIPS code
unclaimed_county_null_df = unclaimed_df[unclaimed_df['County_FIPS'].isna()]
# (inspect unclaimed_county_null_df.shape instead to see only the dimensions)
unclaimed_county_null_df

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS,State_County,County_FIPS
1996,UCP5438,4/30/2018,Flores,Victor,Male,Black / African American,,,New York,5/14/2019,36,,
2011,UCP5440,4/22/2018,Browne,Edward,Male,Hispanic / Latino,,,New York,5/27/2019,36,,
2028,UCP5297,4/15/2018,Davis,Thelma,Female,Black / African American,,,New York,5/20/2019,36,,
2031,UCP5296,4/14/2018,Fasulo,Timothy,Male,White / Caucasian,,,New York,4/25/2019,36,,
2033,UCP5439,4/14/2018,Reyes,Nivia,Female,Other,,,New York,5/27/2019,36,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7286,UCP990,11/6/1997,Pope,Leon,Male,White / Caucasian,,,Washington,4/14/2020,53,,
7339,UCP1999,10/22/1991,Robles,Salvador,Male,Other,,,Tennessee,1/10/2020,47,,
7344,UCP4522,10/18/1990,Way,Kenneth,Male,Black / African American,,,New York,5/11/2019,36,,
7372,UCP1548,6/17/1973,Seah,Tee kee,Male,Asian,,,Nevada,5/28/2019,32,,


In [110]:
# As needed, export nulls to address [NOTE: all have been addressed. 1241 cases have no county]
# unclaimed_county_null_df.to_csv('unclaimed_county_nulls.csv', encoding='Windows-1252')

In [111]:
# Preview unclaimed_df to confirm the State_County and County_FIPS columns are present
unclaimed_df.head()

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS,State_County,County_FIPS
0,UCP68231,4/15/2020,Sanders,Stephen,Male,White / Caucasian,Joliet,Will,Illinois,4/16/2020,17,17_Will,17197.0
1,UCP68248,4/13/2020,Brookshire,Renee,Female,White / Caucasian,Joliet,Will,Illinois,4/16/2020,17,17_Will,17197.0
2,UCP68242,4/9/2020,Smith,Victoria,Female,White / Caucasian,Manhattan,New York,New York,4/16/2020,36,36_New York,36061.0
3,UCP68244,4/9/2020,Bellamy,Eldred,Male,Black / African American,Brooklyn,Kings,New York,4/16/2020,36,36_Kings,36047.0
4,UCP68228,4/9/2020,Rodriguez,Elido,Male,Hispanic / Latino,Manhattan,New York,New York,4/15/2020,36,36_New York,36061.0


##### Replace null state values with 'None' and null county values with 'None', then re-write county_FIPS column using county_centroids_v2_dataframe

In [112]:
## Collect any rows whose 'State' value is null
unc_state_test_df = unclaimed_df[unclaimed_df['State'].isna()]
# (len(unc_state_test_df) gives just the count)
unc_state_test_df

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS,State_County,County_FIPS
3380,UCP5339,12/3/2016,Williams,Barbara,Female,Asian,,,,5/3/2018,99,,
4774,UCP2489,3/26/2015,Lupien,Daryl,Male,Uncertain,,,,8/18/2016,99,,
5023,UCP4001,11/2/2014,Brown,David,Male,White / Caucasian,,,,5/11/2017,99,,
5883,UCP3962,3/22/2010,Gonzalez,David,Male,Hispanic / Latino,,,,5/11/2017,99,,
5950,UCP3937,12/7/2009,Togovnick,Bernice,Female,White / Caucasian,,,,5/11/2017,99,,
6183,UCP3901,1/21/2009,Carney,John,Male,White / Caucasian,,,,5/11/2017,99,,
6341,UCP3689,6/5/2008,Guzman,Ralph,Male,Hispanic / Latino,,,,5/2/2017,99,,
6560,UCP3824,7/2/2007,Vasquez,Rafael,Male,Hispanic / Latino,,,,5/4/2017,99,,
6698,UCP3672,9/22/2006,Barajas,Epifanio,Male,Hispanic / Latino,,,,5/2/2017,99,,
6719,UCP3668,8/5/2006,Huff,Paul,Male,White / Caucasian,,,,5/2/2017,99,,


In [113]:
# Replace NaN State names (cases with no city, county, or state) with the placeholder string 'None'
unclaimed_df['State'] = unclaimed_df['State'].fillna('None')
unclaimed_df.head()

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS,State_County,County_FIPS
0,UCP68231,4/15/2020,Sanders,Stephen,Male,White / Caucasian,Joliet,Will,Illinois,4/16/2020,17,17_Will,17197.0
1,UCP68248,4/13/2020,Brookshire,Renee,Female,White / Caucasian,Joliet,Will,Illinois,4/16/2020,17,17_Will,17197.0
2,UCP68242,4/9/2020,Smith,Victoria,Female,White / Caucasian,Manhattan,New York,New York,4/16/2020,36,36_New York,36061.0
3,UCP68244,4/9/2020,Bellamy,Eldred,Male,Black / African American,Brooklyn,Kings,New York,4/16/2020,36,36_Kings,36047.0
4,UCP68228,4/9/2020,Rodriguez,Elido,Male,Hispanic / Latino,Manhattan,New York,New York,4/15/2020,36,36_New York,36061.0


In [114]:
## Confirm the fill: list the rows whose State is now the 'None' placeholder
unc_state_test_2_df = unclaimed_df[unclaimed_df['State'].eq('None')]
unc_state_test_2_df
# (len(unc_state_test_2_df) gives just the count)

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS,State_County,County_FIPS
3380,UCP5339,12/3/2016,Williams,Barbara,Female,Asian,,,,5/3/2018,99,,
4774,UCP2489,3/26/2015,Lupien,Daryl,Male,Uncertain,,,,8/18/2016,99,,
5023,UCP4001,11/2/2014,Brown,David,Male,White / Caucasian,,,,5/11/2017,99,,
5883,UCP3962,3/22/2010,Gonzalez,David,Male,Hispanic / Latino,,,,5/11/2017,99,,
5950,UCP3937,12/7/2009,Togovnick,Bernice,Female,White / Caucasian,,,,5/11/2017,99,,
6183,UCP3901,1/21/2009,Carney,John,Male,White / Caucasian,,,,5/11/2017,99,,
6341,UCP3689,6/5/2008,Guzman,Ralph,Male,Hispanic / Latino,,,,5/2/2017,99,,
6560,UCP3824,7/2/2007,Vasquez,Rafael,Male,Hispanic / Latino,,,,5/4/2017,99,,
6698,UCP3672,9/22/2006,Barajas,Epifanio,Male,Hispanic / Latino,,,,5/2/2017,99,,
6719,UCP3668,8/5/2006,Huff,Paul,Male,White / Caucasian,,,,5/2/2017,99,,


In [115]:
## Collect any rows whose 'County' value is null (only County is tested here, not City)
unc_county_test_df = unclaimed_df[unclaimed_df['County'].isna()]
# (len(unc_county_test_df) gives just the count)
unc_county_test_df

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS,State_County,County_FIPS
1996,UCP5438,4/30/2018,Flores,Victor,Male,Black / African American,,,New York,5/14/2019,36,,
2011,UCP5440,4/22/2018,Browne,Edward,Male,Hispanic / Latino,,,New York,5/27/2019,36,,
2028,UCP5297,4/15/2018,Davis,Thelma,Female,Black / African American,,,New York,5/20/2019,36,,
2031,UCP5296,4/14/2018,Fasulo,Timothy,Male,White / Caucasian,,,New York,4/25/2019,36,,
2033,UCP5439,4/14/2018,Reyes,Nivia,Female,Other,,,New York,5/27/2019,36,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7286,UCP990,11/6/1997,Pope,Leon,Male,White / Caucasian,,,Washington,4/14/2020,53,,
7339,UCP1999,10/22/1991,Robles,Salvador,Male,Other,,,Tennessee,1/10/2020,47,,
7344,UCP4522,10/18/1990,Way,Kenneth,Male,Black / African American,,,New York,5/11/2019,36,,
7372,UCP1548,6/17/1973,Seah,Tee kee,Male,Asian,,,Nevada,5/28/2019,32,,


In [116]:
# Replace NaN County names (cases with no city or county) with the placeholder string 'None'
# so that a State_County key can be built for every row
unclaimed_df['County'] = unclaimed_df['County'].fillna('None')
unclaimed_df.head()

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS,State_County,County_FIPS
0,UCP68231,4/15/2020,Sanders,Stephen,Male,White / Caucasian,Joliet,Will,Illinois,4/16/2020,17,17_Will,17197.0
1,UCP68248,4/13/2020,Brookshire,Renee,Female,White / Caucasian,Joliet,Will,Illinois,4/16/2020,17,17_Will,17197.0
2,UCP68242,4/9/2020,Smith,Victoria,Female,White / Caucasian,Manhattan,New York,New York,4/16/2020,36,36_New York,36061.0
3,UCP68244,4/9/2020,Bellamy,Eldred,Male,Black / African American,Brooklyn,Kings,New York,4/16/2020,36,36_Kings,36047.0
4,UCP68228,4/9/2020,Rodriguez,Elido,Male,Hispanic / Latino,Manhattan,New York,New York,4/15/2020,36,36_New York,36061.0


In [117]:
## Confirm the fill: list the rows whose County is now the 'None' placeholder
unc_county_test_2_df = unclaimed_df[unclaimed_df['County'].eq('None')]
unc_county_test_2_df
# (len(unc_county_test_2_df) gives just the count)

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS,State_County,County_FIPS
1996,UCP5438,4/30/2018,Flores,Victor,Male,Black / African American,,,New York,5/14/2019,36,,
2011,UCP5440,4/22/2018,Browne,Edward,Male,Hispanic / Latino,,,New York,5/27/2019,36,,
2028,UCP5297,4/15/2018,Davis,Thelma,Female,Black / African American,,,New York,5/20/2019,36,,
2031,UCP5296,4/14/2018,Fasulo,Timothy,Male,White / Caucasian,,,New York,4/25/2019,36,,
2033,UCP5439,4/14/2018,Reyes,Nivia,Female,Other,,,New York,5/27/2019,36,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7286,UCP990,11/6/1997,Pope,Leon,Male,White / Caucasian,,,Washington,4/14/2020,53,,
7339,UCP1999,10/22/1991,Robles,Salvador,Male,Other,,,Tennessee,1/10/2020,47,,
7344,UCP4522,10/18/1990,Way,Kenneth,Male,Black / African American,,,New York,5/11/2019,36,,
7372,UCP1548,6/17/1973,Seah,Tee kee,Male,Asian,,,Nevada,5/28/2019,32,,


In [118]:
# Rebuild the compound "<State_FIPS>_<County>" key now that null counties read 'None',
# so those rows get a "<State_FIPS>_None" key instead of NaN.
unclaimed_df['State_County'] = unclaimed_df['State_FIPS'].astype(str).str.cat(unclaimed_df['County'], sep="_")
unclaimed_df.head()

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS,State_County,County_FIPS
0,UCP68231,4/15/2020,Sanders,Stephen,Male,White / Caucasian,Joliet,Will,Illinois,4/16/2020,17,17_Will,17197.0
1,UCP68248,4/13/2020,Brookshire,Renee,Female,White / Caucasian,Joliet,Will,Illinois,4/16/2020,17,17_Will,17197.0
2,UCP68242,4/9/2020,Smith,Victoria,Female,White / Caucasian,Manhattan,New York,New York,4/16/2020,36,36_New York,36061.0
3,UCP68244,4/9/2020,Bellamy,Eldred,Male,Black / African American,Brooklyn,Kings,New York,4/16/2020,36,36_Kings,36047.0
4,UCP68228,4/9/2020,Rodriguez,Elido,Male,Hispanic / Latino,Manhattan,New York,New York,4/15/2020,36,36_New York,36061.0


In [119]:
## Confirm the rebuilt key: rows with the 'None' county placeholder should now show "<State_FIPS>_None"
unc_county_test_3_df = unclaimed_df[unclaimed_df['County'].eq('None')]
unc_county_test_3_df
# (len(unc_county_test_3_df) gives just the count)

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS,State_County,County_FIPS
1996,UCP5438,4/30/2018,Flores,Victor,Male,Black / African American,,,New York,5/14/2019,36,36_None,
2011,UCP5440,4/22/2018,Browne,Edward,Male,Hispanic / Latino,,,New York,5/27/2019,36,36_None,
2028,UCP5297,4/15/2018,Davis,Thelma,Female,Black / African American,,,New York,5/20/2019,36,36_None,
2031,UCP5296,4/14/2018,Fasulo,Timothy,Male,White / Caucasian,,,New York,4/25/2019,36,36_None,
2033,UCP5439,4/14/2018,Reyes,Nivia,Female,Other,,,New York,5/27/2019,36,36_None,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7286,UCP990,11/6/1997,Pope,Leon,Male,White / Caucasian,,,Washington,4/14/2020,53,53_None,
7339,UCP1999,10/22/1991,Robles,Salvador,Male,Other,,,Tennessee,1/10/2020,47,47_None,
7344,UCP4522,10/18/1990,Way,Kenneth,Male,Black / African American,,,New York,5/11/2019,36,36_None,
7372,UCP1548,6/17/1973,Seah,Tee kee,Male,Asian,,,Nevada,5/28/2019,32,32_None,


In [120]:
# Overwrite County_FIPS by looking up the rebuilt State_County keys in county_v2_dict,
# so that rows keyed "<State_FIPS>_None" also receive a (placeholder) code
unclaimed_df['County_FIPS'] = unclaimed_df['State_County'].map(county_v2_dict)
# List the distinct FIPS codes that were assigned, as a quick sanity check
unclaimed_df['County_FIPS'].unique()

array([17197, 36061, 36047, 36005, 36081, 25025,  6073, 17063, 36085,
       53053, 15003, 32031, 25021, 25027, 16001, 25009, 25017, 25003,
       25013, 25023, 32027, 25015,  6047, 48201, 49035, 47093, 53075,
       53027, 11001, 25005, 47157, 33005, 35031, 53049, 53077, 40143,
       42029, 47155, 47001, 39099, 29019,  6075,  5143,  6093, 13121,
        1073, 34025, 34023, 25001,  4019,  4013,  6023,  6099, 34001,
       12087, 47035,  6037, 40121, 53061, 55025,  6071,  9003, 48141,
       48339,  9009, 42101, 12071, 53033, 17043, 34021,  9011, 48215,
        8087, 12051, 26163,  9001, 47009, 17031, 22069,  9005, 32005,
        9007, 47065, 34013, 36999,  4007, 47013, 18067, 42001,  6059,
       19155,  6083, 26025, 40101, 40041, 42091, 55057, 17999, 48167,
       19153,  6025, 40999, 42999, 26121, 40145, 25011,  9015,  1115,
        5035, 53035, 53073, 41051, 16027, 23005, 47179, 48469, 40135,
       99999, 34029, 40097, 16013, 29051, 48229, 40115, 45007, 12075,
       35059, 53001,

In [121]:
# Verify that no row is left without a county FIPS code [Note - all gone!]
unclaimed_county_null_v2_df = unclaimed_df[unclaimed_df['County_FIPS'].isna()]
# (inspect unclaimed_county_null_v2_df.shape instead to see only the dimensions)
unclaimed_county_null_v2_df

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS,State_County,County_FIPS


In [122]:
## Final check: rows with the 'None' county placeholder should now carry a County_FIPS value
unc_county_test_4_df = unclaimed_df[unclaimed_df['County'].eq('None')]
unc_county_test_4_df
# (len(unc_county_test_4_df) gives just the count)

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS,State_County,County_FIPS
1996,UCP5438,4/30/2018,Flores,Victor,Male,Black / African American,,,New York,5/14/2019,36,36_None,36999
2011,UCP5440,4/22/2018,Browne,Edward,Male,Hispanic / Latino,,,New York,5/27/2019,36,36_None,36999
2028,UCP5297,4/15/2018,Davis,Thelma,Female,Black / African American,,,New York,5/20/2019,36,36_None,36999
2031,UCP5296,4/14/2018,Fasulo,Timothy,Male,White / Caucasian,,,New York,4/25/2019,36,36_None,36999
2033,UCP5439,4/14/2018,Reyes,Nivia,Female,Other,,,New York,5/27/2019,36,36_None,36999
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7286,UCP990,11/6/1997,Pope,Leon,Male,White / Caucasian,,,Washington,4/14/2020,53,53_None,53999
7339,UCP1999,10/22/1991,Robles,Salvador,Male,Other,,,Tennessee,1/10/2020,47,47_None,47999
7344,UCP4522,10/18/1990,Way,Kenneth,Male,Black / African American,,,New York,5/11/2019,36,36_None,36999
7372,UCP1548,6/17/1973,Seah,Tee kee,Male,Asian,,,Nevada,5/28/2019,32,32_None,32999


#### Add in County FIPS column to unidentified dataframe

##### To identify null values, add in County FIPS column to unidentified dataframe using county_centroids_df

In [123]:
# Preview unidentified_df before the county key and county FIPS columns are added
unidentified_df.head()

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS
0,UP51903,3/30/1986,0.0,120.0,Marbury,Autauga,Alabama,Female,White / Caucasian,1/28/2020,1
1,UP5000,9/29/2007,40.0,60.0,Prattville,Autauga,Alabama,Female,Black / African American,8/12/2019,1
2,UP6519,1/6/2006,45.0,59.0,Elberta,Baldwin,Alabama,Male,White / Caucasian,10/24/2018,1
3,UP55409,3/26/2000,0.0,120.0,,Baldwin,Alabama,Male,"White / Caucasian, Asian, American Indian / Al...",2/13/2020,1
4,UP13483,3/27/1961,14.0,17.0,,Bibb,Alabama,Male,White / Caucasian,3/25/2019,1


In [124]:
# Build the compound lookup key "<State_FIPS>_<County>" for every row.
# A row whose County is NaN gets a NaN key, which makes it easy to spot afterwards.
unidentified_df['State_County'] = unidentified_df['State_FIPS'].astype(str).str.cat(unidentified_df['County'], sep="_")
unidentified_df.head()

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County
0,UP51903,3/30/1986,0.0,120.0,Marbury,Autauga,Alabama,Female,White / Caucasian,1/28/2020,1,1_Autauga
1,UP5000,9/29/2007,40.0,60.0,Prattville,Autauga,Alabama,Female,Black / African American,8/12/2019,1,1_Autauga
2,UP6519,1/6/2006,45.0,59.0,Elberta,Baldwin,Alabama,Male,White / Caucasian,10/24/2018,1,1_Baldwin
3,UP55409,3/26/2000,0.0,120.0,,Baldwin,Alabama,Male,"White / Caucasian, Asian, American Indian / Al...",2/13/2020,1,1_Baldwin
4,UP13483,3/27/1961,14.0,17.0,,Bibb,Alabama,Male,White / Caucasian,3/25/2019,1,1_Bibb


In [125]:
# Add a County_FIPS column by looking up each row's State_County key in county_dict.
# Rows whose key has no match are left as NaN and are inspected in the null check that follows.
unidentified_df['County_FIPS'] = unidentified_df['State_County'].map(county_dict)
# List the distinct FIPS codes that were assigned, as a quick sanity check
unidentified_df['County_FIPS'].unique()

array([ 1001.,  1003.,  1007., ..., 56021., 56033., 56037.])

In [126]:
# Collect the rows that received no county FIPS code
unidentified_county_null_df = unidentified_df[unidentified_df['County_FIPS'].isna()]
# (inspect unidentified_county_null_df.shape for the dimensions,
#  or len(unidentified_county_null_df) for just the row count)
unidentified_county_null_df

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS
55,UP14028,12/21/1991,18.0,25.0,,,Alabama,Male,White / Caucasian,3/31/2017,1,,
109,UP13395,9/20/2004,0.0,120.0,,,Alaska,Male,Uncertain,4/14/2020,2,,
110,UP7281,4/30/2001,20.0,30.0,,,Alaska,Male,White / Caucasian,4/14/2020,2,,
111,UP13575,9/11/1997,0.0,120.0,,,Alaska,Male,Uncertain,7/2/2018,2,,
6392,UP4846,4/14/1988,37.0,52.0,,,Indiana,Female,White / Caucasian,6/3/2017,18,,
7676,UP5018,4/19/1996,49.0,59.0,,,Missouri,Female,Black / African American,10/23/2018,29,,
8536,UP17664,1/29/2018,0.0,120.0,,,New Mexico,Male,Uncertain,1/21/2020,35,,
8540,UP16690,5/3/2017,0.0,120.0,,,New Mexico,Unsure,Uncertain,2/8/2020,35,,
8541,UP5578,3/26/2003,50.0,60.0,,,New Mexico,Female,Black / African American,10/24/2018,35,,
10966,UP56112,10/10/2017,0.0,120.0,,,Puerto Rico,Male,Uncertain,2/10/2020,72,,


In [127]:
# As needed, export nulls to address [NOTE - all have been addressed - 28 remain with no city or county]
# unidentified_county_null_df.to_csv('unidentified_county_nulls.csv', encoding='Windows-1252')

In [128]:
# Preview unidentified_df to confirm the State_County and County_FIPS columns are present
unidentified_df.head()

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS
0,UP51903,3/30/1986,0.0,120.0,Marbury,Autauga,Alabama,Female,White / Caucasian,1/28/2020,1,1_Autauga,1001.0
1,UP5000,9/29/2007,40.0,60.0,Prattville,Autauga,Alabama,Female,Black / African American,8/12/2019,1,1_Autauga,1001.0
2,UP6519,1/6/2006,45.0,59.0,Elberta,Baldwin,Alabama,Male,White / Caucasian,10/24/2018,1,1_Baldwin,1003.0
3,UP55409,3/26/2000,0.0,120.0,,Baldwin,Alabama,Male,"White / Caucasian, Asian, American Indian / Al...",2/13/2020,1,1_Baldwin,1003.0
4,UP13483,3/27/1961,14.0,17.0,,Bibb,Alabama,Male,White / Caucasian,3/25/2019,1,1_Bibb,1007.0


##### Replace null state values with 'None' and null county values with 'None', then re-write county_FIPS column using county_centroids_v2_dataframe

In [129]:
## Collect any rows whose 'State' value is null
uni_state_test_df = unidentified_df[unidentified_df['State'].isna()]
# (len(uni_state_test_df) gives just the count)
uni_state_test_df

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS


In [130]:
## Collect any rows whose 'County' value is null (only County is tested here, not City)
uni_county_test_df = unidentified_df[unidentified_df['County'].isna()]
# (len(uni_county_test_df) gives just the count)
uni_county_test_df

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS
55,UP14028,12/21/1991,18.0,25.0,,,Alabama,Male,White / Caucasian,3/31/2017,1,,
109,UP13395,9/20/2004,0.0,120.0,,,Alaska,Male,Uncertain,4/14/2020,2,,
110,UP7281,4/30/2001,20.0,30.0,,,Alaska,Male,White / Caucasian,4/14/2020,2,,
111,UP13575,9/11/1997,0.0,120.0,,,Alaska,Male,Uncertain,7/2/2018,2,,
6392,UP4846,4/14/1988,37.0,52.0,,,Indiana,Female,White / Caucasian,6/3/2017,18,,
7676,UP5018,4/19/1996,49.0,59.0,,,Missouri,Female,Black / African American,10/23/2018,29,,
8536,UP17664,1/29/2018,0.0,120.0,,,New Mexico,Male,Uncertain,1/21/2020,35,,
8540,UP16690,5/3/2017,0.0,120.0,,,New Mexico,Unsure,Uncertain,2/8/2020,35,,
8541,UP5578,3/26/2003,50.0,60.0,,,New Mexico,Female,Black / African American,10/24/2018,35,,
10966,UP56112,10/10/2017,0.0,120.0,,,Puerto Rico,Male,Uncertain,2/10/2020,72,,


In [131]:
# Replace NaN County names (cases with no city or county) with the placeholder string 'None'
# so that a State_County key can be built for every row
unidentified_df['County'] = unidentified_df['County'].fillna('None')
unidentified_df.head()

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS
0,UP51903,3/30/1986,0.0,120.0,Marbury,Autauga,Alabama,Female,White / Caucasian,1/28/2020,1,1_Autauga,1001.0
1,UP5000,9/29/2007,40.0,60.0,Prattville,Autauga,Alabama,Female,Black / African American,8/12/2019,1,1_Autauga,1001.0
2,UP6519,1/6/2006,45.0,59.0,Elberta,Baldwin,Alabama,Male,White / Caucasian,10/24/2018,1,1_Baldwin,1003.0
3,UP55409,3/26/2000,0.0,120.0,,Baldwin,Alabama,Male,"White / Caucasian, Asian, American Indian / Al...",2/13/2020,1,1_Baldwin,1003.0
4,UP13483,3/27/1961,14.0,17.0,,Bibb,Alabama,Male,White / Caucasian,3/25/2019,1,1_Bibb,1007.0


In [132]:
## Confirm the fill: list the rows whose County is now the 'None' placeholder
uni_county_test_2_df = unidentified_df[unidentified_df['County'].eq('None')]
uni_county_test_2_df
# (len(uni_county_test_2_df) gives just the count)

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS
55,UP14028,12/21/1991,18.0,25.0,,,Alabama,Male,White / Caucasian,3/31/2017,1,,
109,UP13395,9/20/2004,0.0,120.0,,,Alaska,Male,Uncertain,4/14/2020,2,,
110,UP7281,4/30/2001,20.0,30.0,,,Alaska,Male,White / Caucasian,4/14/2020,2,,
111,UP13575,9/11/1997,0.0,120.0,,,Alaska,Male,Uncertain,7/2/2018,2,,
6392,UP4846,4/14/1988,37.0,52.0,,,Indiana,Female,White / Caucasian,6/3/2017,18,,
7676,UP5018,4/19/1996,49.0,59.0,,,Missouri,Female,Black / African American,10/23/2018,29,,
8536,UP17664,1/29/2018,0.0,120.0,,,New Mexico,Male,Uncertain,1/21/2020,35,,
8540,UP16690,5/3/2017,0.0,120.0,,,New Mexico,Unsure,Uncertain,2/8/2020,35,,
8541,UP5578,3/26/2003,50.0,60.0,,,New Mexico,Female,Black / African American,10/24/2018,35,,
10966,UP56112,10/10/2017,0.0,120.0,,,Puerto Rico,Male,Uncertain,2/10/2020,72,,


In [133]:
# Rebuild the compound "<State_FIPS>_<County>" key now that null counties read 'None',
# so those rows get a "<State_FIPS>_None" key instead of NaN.
unidentified_df['State_County'] = unidentified_df['State_FIPS'].astype(str).str.cat(unidentified_df['County'], sep="_")
unidentified_df.head()

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS
0,UP51903,3/30/1986,0.0,120.0,Marbury,Autauga,Alabama,Female,White / Caucasian,1/28/2020,1,1_Autauga,1001.0
1,UP5000,9/29/2007,40.0,60.0,Prattville,Autauga,Alabama,Female,Black / African American,8/12/2019,1,1_Autauga,1001.0
2,UP6519,1/6/2006,45.0,59.0,Elberta,Baldwin,Alabama,Male,White / Caucasian,10/24/2018,1,1_Baldwin,1003.0
3,UP55409,3/26/2000,0.0,120.0,,Baldwin,Alabama,Male,"White / Caucasian, Asian, American Indian / Al...",2/13/2020,1,1_Baldwin,1003.0
4,UP13483,3/27/1961,14.0,17.0,,Bibb,Alabama,Male,White / Caucasian,3/25/2019,1,1_Bibb,1007.0


In [134]:
## Confirm the rebuilt key: rows with the 'None' county placeholder should now show "<State_FIPS>_None"
uni_county_test_3_df = unidentified_df[unidentified_df['County'].eq('None')]
uni_county_test_3_df
# (len(uni_county_test_3_df) gives just the count)

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS
55,UP14028,12/21/1991,18.0,25.0,,,Alabama,Male,White / Caucasian,3/31/2017,1,1_None,
109,UP13395,9/20/2004,0.0,120.0,,,Alaska,Male,Uncertain,4/14/2020,2,2_None,
110,UP7281,4/30/2001,20.0,30.0,,,Alaska,Male,White / Caucasian,4/14/2020,2,2_None,
111,UP13575,9/11/1997,0.0,120.0,,,Alaska,Male,Uncertain,7/2/2018,2,2_None,
6392,UP4846,4/14/1988,37.0,52.0,,,Indiana,Female,White / Caucasian,6/3/2017,18,18_None,
7676,UP5018,4/19/1996,49.0,59.0,,,Missouri,Female,Black / African American,10/23/2018,29,29_None,
8536,UP17664,1/29/2018,0.0,120.0,,,New Mexico,Male,Uncertain,1/21/2020,35,35_None,
8540,UP16690,5/3/2017,0.0,120.0,,,New Mexico,Unsure,Uncertain,2/8/2020,35,35_None,
8541,UP5578,3/26/2003,50.0,60.0,,,New Mexico,Female,Black / African American,10/24/2018,35,35_None,
10966,UP56112,10/10/2017,0.0,120.0,,,Puerto Rico,Male,Uncertain,2/10/2020,72,72_None,


In [135]:
# Look up each row's County FIPS code by using its State_County value as the
# key into county_v2_dict, then store the result on the dataframe
county_fips_codes = unidentified_df['State_County'].map(county_v2_dict)
unidentified_df['County_FIPS'] = county_fips_codes
# Inspect the distinct values now held in the County_FIPS field
unidentified_df['County_FIPS'].unique()

array([ 1001,  1003,  1007, ..., 56021, 56033, 56037], dtype=int64)

In [136]:
# Select any rows still lacking a County FIPS code [Note - none are left!]
county_fips_is_null = unidentified_df['County_FIPS'].isnull()
unidentified_county_null_v2_df = unidentified_df[county_fips_is_null]
# unidentified_county_null_v2_df.shape
unidentified_county_null_v2_df

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS


In [137]:
## Double-check the re-assigned values: rows whose County is the string 'None'
county_is_placeholder = unidentified_df['County'] == 'None'
uni_county_test_4_df = unidentified_df[county_is_placeholder]
uni_county_test_4_df
# len(uni_county_test_4_df)

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS
55,UP14028,12/21/1991,18.0,25.0,,,Alabama,Male,White / Caucasian,3/31/2017,1,1_None,1999
109,UP13395,9/20/2004,0.0,120.0,,,Alaska,Male,Uncertain,4/14/2020,2,2_None,2999
110,UP7281,4/30/2001,20.0,30.0,,,Alaska,Male,White / Caucasian,4/14/2020,2,2_None,2999
111,UP13575,9/11/1997,0.0,120.0,,,Alaska,Male,Uncertain,7/2/2018,2,2_None,2999
6392,UP4846,4/14/1988,37.0,52.0,,,Indiana,Female,White / Caucasian,6/3/2017,18,18_None,18999
7676,UP5018,4/19/1996,49.0,59.0,,,Missouri,Female,Black / African American,10/23/2018,29,29_None,29999
8536,UP17664,1/29/2018,0.0,120.0,,,New Mexico,Male,Uncertain,1/21/2020,35,35_None,35999
8540,UP16690,5/3/2017,0.0,120.0,,,New Mexico,Unsure,Uncertain,2/8/2020,35,35_None,35999
8541,UP5578,3/26/2003,50.0,60.0,,,New Mexico,Female,Black / African American,10/24/2018,35,35_None,35999
10966,UP56112,10/10/2017,0.0,120.0,,,Puerto Rico,Male,Uncertain,2/10/2020,72,72_None,72999


# Part 5: Prep data for city-level JSONs for all 3 databases

#### Clean up city dataframe

In [None]:
# Preview the first rows of the city dataframe before cleaning it up
city_df.head()

In [None]:
# Remove the columns that aren't needed
unneeded_city_columns = [
    'FEATURE_CL', 'PRIMARY_LA', 'PRIM_LONG_', 'SOURCE_LAT', 'SOURCE_LON', 'SOURCE_L_1',
    'SOURCE_L_2', 'ELEV_IN_M', 'ELEV_IN_FT', 'MAP_NAME', 'DATE_CREAT', 'DATE_EDITE',
]
city_df = city_df.drop(columns=unneeded_city_columns)

In [None]:
# Preview the city dataframe again to see which columns remain
city_df.head()

In [None]:
# Give the coordinate columns the names Lat_dd and Lon_dd
coordinate_column_names = {'PRIM_LAT_D': 'Lat_dd', 'PRIM_LON_1': 'Lon_dd'}
city_df.rename(columns=coordinate_column_names, inplace=True)

In [None]:
# Preview the city dataframe to see the current column names
city_df.head()

In [None]:
# Build a lookup from state name (NAME) to state FIPS code (STATEFP)
state_names = state_centroids_df['NAME']
state_fips_codes = state_centroids_df['STATEFP']
state_dict = dict(zip(state_names, state_fips_codes))
# state_dict
# state_dict['Alaska']
# Show how many states ended up in the lookup
len(state_dict)

In [None]:
# check dataframes: preview the first rows of the missing dataframe
missing_df.head()

In [None]:
# Preview the first rows of the unclaimed dataframe
unclaimed_df.head()

In [None]:
# Preview the first rows of the unidentified dataframe
unidentified_df.head()

# PART 6

### Construct State-level GeoJSON with correct structure

Goal format:
{
"type": "Feature",
    "name": "Wisconsin",
    "properties": {
        "missing": [ ],
        "unclaimed": [ ],
        "unidentified": [ ],
        "filtered": [ ]
    },
    "geometry": {
          "type": "Point",
          "coordinates": [
            -117.79750667,
            36.03755926
          ]
    }
}
* each array will be a list of dictionaries. Each dictionary = one case. Keys = headers

In [138]:
# Collect the column names of each database as plain lists and print them
missing_header = missing_df.columns.tolist()
print("missing header:", missing_header)
unclaimed_header = unclaimed_df.columns.tolist()
print("unclaimed header:", unclaimed_header)
unidentified_header = unidentified_df.columns.tolist()
print("unidentified header:", unidentified_header)

missing header: ['Case Number', 'DLC', 'Last Name', 'First Name', 'Missing Age', 'City', 'County', 'State', 'Sex', 'Race / Ethnicity', 'Date Modified', 'State_FIPS', 'State_County', 'County_FIPS']
unclaimed header: ['Case Number', 'DBF', 'Last Name', 'First Name', 'Sex', 'Race / Ethnicity', 'City', 'County', 'State', 'Date Modified', 'State_FIPS', 'State_County', 'County_FIPS']
unidentified header: ['Case Number', 'DBF', 'Age From', 'Age To', 'City', 'County', 'State', 'Sex', 'Race / Ethnicity', 'Date Modified', 'State_FIPS', 'State_County', 'County_FIPS']


In [139]:
# check df: preview the first rows of the unclaimed dataframe
unclaimed_df.head()

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS,State_County,County_FIPS
0,UCP68231,4/15/2020,Sanders,Stephen,Male,White / Caucasian,Joliet,Will,Illinois,4/16/2020,17,17_Will,17197
1,UCP68248,4/13/2020,Brookshire,Renee,Female,White / Caucasian,Joliet,Will,Illinois,4/16/2020,17,17_Will,17197
2,UCP68242,4/9/2020,Smith,Victoria,Female,White / Caucasian,Manhattan,New York,New York,4/16/2020,36,36_New York,36061
3,UCP68244,4/9/2020,Bellamy,Eldred,Male,Black / African American,Brooklyn,Kings,New York,4/16/2020,36,36_Kings,36047
4,UCP68228,4/9/2020,Rodriguez,Elido,Male,Hispanic / Latino,Manhattan,New York,New York,4/15/2020,36,36_New York,36061


In [140]:
# Check the type of a single State_FIPS value from the unclaimed dataframe
state_fips_sample = unclaimed_df['State_FIPS'][0]
type(state_fips_sample)

numpy.int32

In [141]:
# check centroids
# NOTE: a notebook cell only displays its last expression, so the head()
# preview below is not shown; the cell output is the number of centroid rows.
state_centroids_v2_df.head()
len(state_centroids_v2_df)

56

In [142]:
# Order the state centroids by state FIPS code (STATEFP).
# The rows keep their original index labels, as the preview shows.
state_centroids_v2_df = state_centroids_v2_df.sort_values('STATEFP')
state_centroids_v2_df.head()

Unnamed: 0,OBJECTID,STATEFP,STATENS,AFFGEOID,GEOID,STUSPS,NAME,LSAD,ALAND,AWATER,ORIG_FID,Lon_dd,Lat_dd
17,18,1,1779775,0400000US01,1,AL,Alabama,0,131000000000.0,4593327000.0,18,-86.828092,32.790364
27,28,2,1785533,0400000US02,2,AK,Alaska,0,1480000000000.0,245000000000.0,28,-152.680813,64.530206
35,36,4,1779777,0400000US04,4,AZ,Arizona,0,294000000000.0,1027338000.0,36,-111.664418,34.29311
51,53,5,68085,0400000US05,5,AR,Arkansas,0,135000000000.0,2962860000.0,53,-92.439268,34.899745
16,17,6,1779778,0400000US06,6,CA,California,0,404000000000.0,20463870000.0,17,-119.270414,36.373627


In [143]:
# Order the missing cases by state FIPS code
missing_df = missing_df.sort_values('State_FIPS')
missing_df.head()

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS
0,MP2341,5/8/1999,Reynolds,James,28,Opp,Autauga,Alabama,Male,Black / African American,2/14/2020,1,1_Autauga,1001
151,MP5734,2/15/2002,Brown,Tamborah,22,Mobile,Mobile,Alabama,Male,Black / African American,5/7/2019,1,1_Mobile,1097
152,MP33530,2/10/1999,Brown,Anthony,21,Mobile,Mobile,Alabama,Male,Black / African American,3/4/2020,1,1_Mobile,1097
153,MP35096,4/4/1996,Powe,Edgar,31,Mobile,Mobile,Alabama,Male,Black / African American,3/4/2020,1,1_Mobile,1097
154,MP64666,1/16/2020,Edwards,Katrina,16,Mobile,Mobile,Alabama,Female,White / Caucasian,3/4/2020,1,1_Mobile,1097


In [144]:
# Order the unclaimed cases by state FIPS code
unclaimed_df = unclaimed_df.sort_values('State_FIPS')
unclaimed_df.head()

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS,State_County,County_FIPS
2516,UCP5127,10/28/2017,Elmore,Jeanne,Female,White / Caucasian,Birmingham,Jefferson,Alabama,1/11/2019,1,1_Jefferson,1073
5382,UCP2769,5/26/2013,Black,Billy,Male,White / Caucasian,Birmingham,Jefferson,Alabama,9/20/2018,1,1_Jefferson,1073
2989,UCP4416,4/22/2017,Harris,Scott,Male,White / Caucasian,Birmingham,Jefferson,Alabama,1/11/2019,1,1_Jefferson,1073
5095,UCP2760,8/7/2014,Allison,Larry,Male,White / Caucasian,Birmingham,Jefferson,Alabama,9/20/2018,1,1_Jefferson,1073
5406,UCP2770,4/11/2013,Williams,Darryl,Male,Black / African American,Birmingham,Jefferson,Alabama,9/20/2018,1,1_Jefferson,1073


In [145]:
# Order the unidentified cases by state FIPS code
unidentified_df = unidentified_df.sort_values('State_FIPS')
unidentified_df.head()

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS
0,UP51903,3/30/1986,0.0,120.0,Marbury,Autauga,Alabama,Female,White / Caucasian,1/28/2020,1,1_Autauga,1001
30,UP15332,9/5/2004,18.0,99.0,Mountain Brook,Jefferson,Alabama,Male,Black / African American,3/18/2020,1,1_Jefferson,1073
31,UP9834,1/28/2012,3.0,7.0,Opelika,Lee,Alabama,Female,Black / African American,4/8/2020,1,1_Lee,1081
32,UP13161,11/3/1982,29.0,32.0,,Lee,Alabama,Female,Black / African American,7/10/2015,1,1_Lee,1081
33,UP2747,5/7/2008,45.0,60.0,Huntsville,Limestone,Alabama,Male,White / Caucasian,5/30/2012,1,1_Limestone,1083


In [146]:
## Create state array
def _group_cases_by_key(cases_df, key_column, header):
    """Return ``{key value: [case dict, ...]}`` for every row of ``cases_df``.

    Each case dict maps every column named in ``header`` to the string form of
    that row's value. Rows are read positionally, so the result does not
    depend on the dataframe's index labels (which ``sort_values`` leaves
    out of order and which may have gaps after rows are deleted).
    """
    keys = cases_df[key_column].tolist()
    columns = [cases_df[column].tolist() for column in header]
    grouped = {}
    for key, values in zip(keys, zip(*columns)):
        case = {column: str(value) for column, value in zip(header, values)}
        grouped.setdefault(key, []).append(case)
    return grouped


def _build_state_features(centroids_df, case_sources):
    """Build one GeoJSON Feature dict per row of ``centroids_df``.

    ``case_sources`` is an iterable of ``(property name, dataframe, header)``
    tuples. Each dataframe is indexed by ``State_FIPS`` once, rather than
    being re-scanned for every state. Features are produced in the centroid
    dataframe's current row order.
    """
    lookups = [
        (label, _group_cases_by_key(cases_df, 'State_FIPS', header))
        for label, cases_df, header in case_sources
    ]
    centroid_rows = zip(
        centroids_df["NAME"].tolist(),
        centroids_df["STUSPS"].tolist(),
        centroids_df["STATEFP"].tolist(),
        centroids_df["Lon_dd"].tolist(),
        centroids_df["Lat_dd"].tolist(),
    )
    features = []
    for name, abbreviation, fips, lon, lat in centroid_rows:
        # one list of case dictionaries per database; empty when no case matches
        properties = {label: list(lookup.get(fips, ())) for label, lookup in lookups}
        properties['filtered'] = []
        # built as a local dict so the state name -> FIPS lookup named
        # state_dict is not overwritten by this cell
        features.append({
            "type": "Feature",
            "name": name,
            "name_abbr": abbreviation,
            "state_FIPS": str(fips),
            "properties": properties,
            "geometry": {"type": "Point", "coordinates": [lon, lat]},
        })
    return features


## for each state in state_centroids, attach the matching cases of each database
state_array = _build_state_features(
    state_centroids_v2_df,
    [
        ('missing', missing_df, missing_header),
        ('unclaimed', unclaimed_df, unclaimed_header),
        ('unidentified', unidentified_df, unidentified_header),
    ],
)
# state_array

In [149]:
# check the array: number of state features that were built
len(state_array)
# (uncomment to inspect the unclaimed cases attached to the first feature)
# state_array[0]['properties']['unclaimed']

[{'Case Number': 'UCP64285',
  'DBF': '2/22/2013',
  'Last Name': 'Dillard',
  'First Name': 'Joseph',
  'Sex': 'Male',
  'Race / Ethnicity': 'Black / African American',
  'City': 'Jackson',
  'County': 'Hinds',
  'State': 'Mississippi',
  'Date Modified': '4/8/2020',
  'State_FIPS': '28',
  'State_County': '28_Hinds',
  'County_FIPS': '28049'},
 {'Case Number': 'UCP5008',
  'DBF': 'nan',
  'Last Name': 'Knight',
  'First Name': 'Rachel',
  'Sex': 'Female',
  'Race / Ethnicity': 'White / Caucasian',
  'City': 'nan',
  'County': 'Scott',
  'State': 'Mississippi',
  'Date Modified': '1/3/2019',
  'State_FIPS': '28',
  'State_County': '28_Scott',
  'County_FIPS': '28123'}]

In [150]:
# Wrap the state features in a FeatureCollection
state_feature_collection = {
    "type": "FeatureCollection",
    "features": state_array,
}
# state_feature_collection

In [151]:
# Serialize the FeatureCollection to a JSON-formatted string
state_geojson = geojson.dumps(state_feature_collection)
# print the result's type to confirm the conversion was successful
state_geojson_type = type(state_geojson)
print(state_geojson_type)

<class 'str'>


In [152]:
# Write the JSON-formatted FeatureCollection to a new UTF-8 encoded JSON file
with open('JSON/state_geojson.json', mode='w', encoding='utf-8') as state_output:
    state_output.write(state_geojson)

### Construct COUNTY-LEVEL GeoJSON with correct structure

Goal format:
{
"type": "Feature",
    "name": "Dane",
    "state_name": "Wisconsin",
    "state_FIPS": "55",
    "county_FIPS": "55025",
    "properties": {
        "missing": [ ],
        "unclaimed": [ ],
        "unidentified": [ ],
        "filtered": [ ]
    },
    "geometry": {
          "type": "Point",
          "coordinates": [
            -117.79750667,
            36.03755926
          ]
    }
}
* each array will be a list of dictionaries. Each dictionary = one case. Keys = headers

In [153]:
# Collect the column names of each database as plain lists and print them
missing_header = list(missing_df.columns)
print("missing header:", missing_header)
unclaimed_header = list(unclaimed_df.columns)
print("unclaimed header:", unclaimed_header)
unidentified_header = list(unidentified_df.columns)
print("unidentified header:", unidentified_header)

missing header: ['Case Number', 'DLC', 'Last Name', 'First Name', 'Missing Age', 'City', 'County', 'State', 'Sex', 'Race / Ethnicity', 'Date Modified', 'State_FIPS', 'State_County', 'County_FIPS']
unclaimed header: ['Case Number', 'DBF', 'Last Name', 'First Name', 'Sex', 'Race / Ethnicity', 'City', 'County', 'State', 'Date Modified', 'State_FIPS', 'State_County', 'County_FIPS']
unidentified header: ['Case Number', 'DBF', 'Age From', 'Age To', 'City', 'County', 'State', 'Sex', 'Race / Ethnicity', 'Date Modified', 'State_FIPS', 'State_County', 'County_FIPS']


In [154]:
# check df: preview the first rows of the unclaimed dataframe
unclaimed_df.head()

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS,State_County,County_FIPS
2516,UCP5127,10/28/2017,Elmore,Jeanne,Female,White / Caucasian,Birmingham,Jefferson,Alabama,1/11/2019,1,1_Jefferson,1073
5382,UCP2769,5/26/2013,Black,Billy,Male,White / Caucasian,Birmingham,Jefferson,Alabama,9/20/2018,1,1_Jefferson,1073
2989,UCP4416,4/22/2017,Harris,Scott,Male,White / Caucasian,Birmingham,Jefferson,Alabama,1/11/2019,1,1_Jefferson,1073
5095,UCP2760,8/7/2014,Allison,Larry,Male,White / Caucasian,Birmingham,Jefferson,Alabama,9/20/2018,1,1_Jefferson,1073
5406,UCP2770,4/11/2013,Williams,Darryl,Male,Black / African American,Birmingham,Jefferson,Alabama,9/20/2018,1,1_Jefferson,1073


In [155]:
# Check the type of a single State_FIPS value from the unclaimed dataframe
state_fips_sample = unclaimed_df['State_FIPS'][0]
type(state_fips_sample)

numpy.int32

In [156]:
# check centroids
# NOTE: a notebook cell only displays its last expression, so the head()
# preview below is not shown; the cell output is the number of centroid rows.
county_centroids_v2_df.head()
len(county_centroids_v2_df)

3283

In [157]:
# Order the county centroids by county FIPS code (GEOID).
# The rows keep their original index labels, as the preview shows.
county_centroids_v2_df = county_centroids_v2_df.sort_values('GEOID')
county_centroids_v2_df.head()

Unnamed: 0,OBJECTID,STATEFP,COUNTYFP,COUNTYNS,AFFGEOID,GEOID,NAME,LSAD,ALAND,AWATER,ORIG_FID,Lon_dd,Lat_dd,STATE_NAME,County_Key
16,17,1,1,161526,0500000US01001,1001,Autauga,6,1539602000.0,25706961,17,-86.642735,32.534929,Alabama,1_Autauga
2754,2755,1,3,161527,0500000US01003,1003,Baldwin,6,4117547000.0,1133055836,2759,-87.722569,30.727484,Alabama,1_Baldwin
17,18,1,5,161528,0500000US01005,1005,Barbour,6,2292145000.0,50538698,18,-85.393214,31.869587,Alabama,1_Barbour
2336,2337,1,7,161529,0500000US01007,1007,Bibb,6,1612167000.0,9602089,2340,-87.12648,32.998634,Alabama,1_Bibb
1020,1021,1,9,161530,0500000US01009,1009,Blount,6,1670104000.0,15015423,1021,-86.567385,33.980875,Alabama,1_Blount


In [158]:
# Order the missing cases by county FIPS code
missing_df = missing_df.sort_values('County_FIPS')
missing_df.head()

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS
0,MP2341,5/8/1999,Reynolds,James,28,Opp,Autauga,Alabama,Male,Black / African American,2/14/2020,1,1_Autauga,1001
1,MP8522,1/15/2010,Walker,Jeremy,32,Prattville,Autauga,Alabama,Male,White / Caucasian,7/27/2015,1,1_Autauga,1001
2,MP50314,5/16/2018,Keszthelyi,Steve,79,Elberta,Baldwin,Alabama,Male,White / Caucasian,11/13/2019,1,1_Baldwin,1003
3,MP43187,2/20/2018,Johnson,Abram,60,fairhope,Baldwin,Alabama,Male,White / Caucasian,2/13/2020,1,1_Baldwin,1003
4,MP23019,11/5/2012,Shroyer,James,40,Fairhope,Baldwin,Alabama,Male,White / Caucasian,2/3/2016,1,1_Baldwin,1003


In [159]:
# Order the unclaimed cases by county FIPS code
unclaimed_df = unclaimed_df.sort_values('County_FIPS')
unclaimed_df.head()

Unnamed: 0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,State,Date Modified,State_FIPS,State_County,County_FIPS
2516,UCP5127,10/28/2017,Elmore,Jeanne,Female,White / Caucasian,Birmingham,Jefferson,Alabama,1/11/2019,1,1_Jefferson,1073
2244,UCP54578,1/31/2018,Lovelady,Altha,Female,White / Caucasian,Bessemer,Jefferson,Alabama,3/18/2020,1,1_Jefferson,1073
1687,UCP54583,9/15/2018,Walker,Patricia,Female,White / Caucasian,Tarrant,Jefferson,Alabama,3/18/2020,1,1_Jefferson,1073
5233,UCP2763,1/25/2014,Brewer,Dale,Male,White / Caucasian,Lipscomb,Jefferson,Alabama,9/20/2018,1,1_Jefferson,1073
2987,UCP4417,4/22/2017,Clarke,Michael,Male,White / Caucasian,Birmingham,Jefferson,Alabama,1/11/2019,1,1_Jefferson,1073


In [160]:
# Order the unidentified cases by county FIPS code
unidentified_df = unidentified_df.sort_values('County_FIPS')
unidentified_df.head()

Unnamed: 0,Case Number,DBF,Age From,Age To,City,County,State,Sex,Race / Ethnicity,Date Modified,State_FIPS,State_County,County_FIPS
0,UP51903,3/30/1986,0.0,120.0,Marbury,Autauga,Alabama,Female,White / Caucasian,1/28/2020,1,1_Autauga,1001
1,UP5000,9/29/2007,40.0,60.0,Prattville,Autauga,Alabama,Female,Black / African American,8/12/2019,1,1_Autauga,1001
2,UP6519,1/6/2006,45.0,59.0,Elberta,Baldwin,Alabama,Male,White / Caucasian,10/24/2018,1,1_Baldwin,1003
3,UP55409,3/26/2000,0.0,120.0,,Baldwin,Alabama,Male,"White / Caucasian, Asian, American Indian / Al...",2/13/2020,1,1_Baldwin,1003
4,UP13483,3/27/1961,14.0,17.0,,Bibb,Alabama,Male,White / Caucasian,3/25/2019,1,1_Bibb,1007


In [161]:
## Create county array
def _group_cases_by_key(cases_df, key_column, header):
    """Return ``{key value: [case dict, ...]}`` for every row of ``cases_df``.

    Each case dict maps every column named in ``header`` to the string form of
    that row's value. Rows are read positionally, so the result does not
    depend on the dataframe's index labels (which ``sort_values`` leaves
    out of order and which may have gaps after rows are deleted).
    """
    keys = cases_df[key_column].tolist()
    columns = [cases_df[column].tolist() for column in header]
    grouped = {}
    for key, values in zip(keys, zip(*columns)):
        case = {column: str(value) for column, value in zip(header, values)}
        grouped.setdefault(key, []).append(case)
    return grouped


def _build_county_features(centroids_df, case_sources):
    """Build one GeoJSON Feature dict per row of ``centroids_df``.

    ``case_sources`` is an iterable of ``(property name, dataframe, header)``
    tuples. Each dataframe is indexed by ``County_FIPS`` once, rather than
    being re-scanned for every county. Features are produced in the centroid
    dataframe's current row order, and a progress line is printed per county.
    """
    lookups = [
        (label, _group_cases_by_key(cases_df, 'County_FIPS', header))
        for label, cases_df, header in case_sources
    ]
    centroid_rows = zip(
        centroids_df["NAME"].tolist(),
        centroids_df["GEOID"].tolist(),
        centroids_df["STATE_NAME"].tolist(),
        centroids_df["STATEFP"].tolist(),
        centroids_df["Lon_dd"].tolist(),
        centroids_df["Lat_dd"].tolist(),
    )
    # total taken from the data instead of a hard-coded row count
    county_total = len(centroids_df)
    features = []
    for position, row in enumerate(centroid_rows, start=1):
        name, county_fips, state_name, state_fips, lon, lat = row
        # cases are matched on county FIPS (GEOID in the centroid table);
        # the list is empty when no case matches
        properties = {
            label: list(lookup.get(county_fips, ())) for label, lookup in lookups
        }
        properties['filtered'] = []
        features.append({
            "type": "Feature",
            "name": str(name),
            "county_FIPS": str(county_fips),
            "state_name": str(state_name),
            "state_FIPS": str(state_fips),
            "properties": properties,
            "geometry": {"type": "Point", "coordinates": [lon, lat]},
        })
        # progress statement
        print("added county", str(position), "of", county_total)
    return features


## for each county in county_centroids_v2, attach the matching cases of each database
county_array = _build_county_features(
    county_centroids_v2_df,
    [
        ('missing', missing_df, missing_header),
        ('unclaimed', unclaimed_df, unclaimed_header),
        ('unidentified', unidentified_df, unidentified_header),
    ],
)
# county_array

added county 1 of 3283
added county 2 of 3283
added county 3 of 3283
added county 4 of 3283
added county 5 of 3283
added county 6 of 3283
added county 7 of 3283
added county 8 of 3283
added county 9 of 3283
added county 10 of 3283
added county 11 of 3283
added county 12 of 3283
added county 13 of 3283
added county 14 of 3283
added county 15 of 3283
added county 16 of 3283
added county 17 of 3283
added county 18 of 3283
added county 19 of 3283
added county 20 of 3283
added county 21 of 3283
added county 22 of 3283
added county 23 of 3283
added county 24 of 3283
added county 25 of 3283
added county 26 of 3283
added county 27 of 3283
added county 28 of 3283
added county 29 of 3283
added county 30 of 3283
added county 31 of 3283
added county 32 of 3283
added county 33 of 3283
added county 34 of 3283
added county 35 of 3283
added county 36 of 3283
added county 37 of 3283
added county 38 of 3283
added county 39 of 3283
added county 40 of 3283
added county 41 of 3283
added county 42 of 3283
a

In [162]:
# check the array: number of county features that were built
len(county_array)
# (uncomment to inspect the unclaimed cases attached to one feature)
# county_array[55]['properties']['unclaimed']

3283

In [163]:
# Wrap the county features in a FeatureCollection
county_feature_collection = {
    "type": "FeatureCollection",
    "features": county_array,
}
# county_feature_collection

In [164]:
# Serialize the FeatureCollection to a JSON-formatted string
county_geojson = geojson.dumps(county_feature_collection)
# print the result's type to confirm the conversion was successful
county_geojson_type = type(county_geojson)
print(county_geojson_type)

<class 'str'>


In [165]:
# Write the JSON-formatted FeatureCollection to a new UTF-8 encoded JSON file
with open('JSON/county_geojson.json', mode='w', encoding='utf-8') as county_output:
    county_output.write(county_geojson)