In [1]:
# import necessary packages
from pathlib import Path

import geopandas as gpd
import numpy as np
import pandas as pd

In [2]:
# Load the input tables.
#
# Manual corrections that had to be made in the input .csv files
# (presumably so the state names match the state centroid table):
#   Missing people:
#     - Virgin Islands -> United States Virgin Islands
#     - Tennesse -> Tennessee
#     - Northern Mariana Islands -> Commonwealth of the Northern Mariana Islands
#   Unclaimed people:
#     - [nothing yet]
#   Unidentified people:
#     - Virgin Islands -> United States Virgin Islands
#

# Case tables, one file per database
missing_df = pd.read_csv('Missing_04182020.csv')
unclaimed_df = pd.read_csv('Unclaimed_04182020.csv')
unidentified_df = pd.read_csv('Unidentified_04182020.csv')

# Reference tables: cities, county centroids and state centroids
city_df = pd.read_csv('cities.csv')
county_centroids_df = pd.read_csv('county_centroids.csv')
state_centroids_df = pd.read_csv('state_centroids.csv')

In [3]:
# Preview the first rows of the missing-persons table
missing_df.head()

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified
0,MP68158,3/19/2020,Mencer,Steven,33,Unalaska,Aleutians West,Alaska,Male,White / Caucasian,4/14/2020
1,MP68103,2/25/2020,Germain,Peter,76,Trapper Creek,Matanuska-Susitna,Alaska,Male,White / Caucasian,4/14/2020
2,MP65871,1/5/2020,Pastel,Kale,36,Sitka,Sitka,Alaska,Male,White / Caucasian,4/14/2020
3,MP64094,12/31/2019,Cobban,David,30,Kodiak,Kodiak Island,Alaska,Male,White / Caucasian,4/14/2020
4,MP64101,12/31/2019,Rainey,Brock,47,Kodiak,Kodiak Island,Alaska,Male,White / Caucasian,4/14/2020


# Part 1: Summary data (counts for all 3 databases, by state)

### 1/4: Add count of missing person cases

In [4]:
# Count, for each state, the non-null values in every column of the missing table.
# The 'Case Number' column of this result is what later becomes the case count.
missing_count = missing_df.groupby('State').count()
missing_count.head()

Unnamed: 0_level_0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,Sex,Race / Ethnicity,Date Modified
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Alabama,238,238,238,238,238,238,237,238,238,238
Alaska,582,582,582,582,582,582,539,582,582,582
Arizona,916,916,916,916,916,915,916,916,916,916
Arkansas,280,280,280,280,280,280,280,280,280,280
California,2512,2512,2512,2512,2511,2511,2511,2512,2512,2512


In [5]:
# Keep only the case number count.
# Selecting the one column that is needed (instead of dropping every other
# column by name) keeps this step working if the input CSV gains or renames
# a column. .copy() gives an independent frame, so the 'State' column added
# afterwards does not trigger a SettingWithCopyWarning.
missing_count = missing_count[['Case Number']].copy()

In [6]:
# State names currently live only in the index; expose them as a regular column as well
missing_count = missing_count.assign(State=missing_count.index)

In [7]:
# Give the case number count a descriptive column name
missing_count = missing_count.rename(columns={'Case Number': 'Missing_CaseCount'})

In [8]:
# Check the result: indexed by state, with the case count and the state name as columns
missing_count.head()

Unnamed: 0_level_0,Missing_CaseCount,State
State,Unnamed: 1_level_1,Unnamed: 2_level_1
Alabama,238,Alabama
Alaska,582,Alaska
Arizona,916,Arizona
Arkansas,280,Arkansas
California,2512,California


In [9]:
# Build a {state name: missing person case count} lookup and show how many states it holds
missing_dict = {
    state: case_count
    for state, case_count in zip(missing_count['State'], missing_count['Missing_CaseCount'])
}
len(missing_dict)

55

### 2/4: Add count of unclaimed persons

In [10]:
# Count, for each state, the non-null values in every column of the unclaimed table.
# The 'Case Number' column of this result is what later becomes the case count.
unclaimed_count = unclaimed_df.groupby('State').count()
# len(unclaimed_count)
unclaimed_count.head()

Unnamed: 0_level_0,Case Number,DBF,Last Name,First Name,Sex,Race / Ethnicity,City,County,Date Modified
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama,63,63,63,63,63,63,63,62,63
Alaska,1,1,1,1,1,1,1,1,1
Arizona,21,20,21,21,21,21,20,21,21
Arkansas,16,16,16,16,16,16,16,16,16
California,1680,1046,1678,1678,769,775,691,1650,1680


In [11]:
# Keep only the case number count.
# Selecting the one column that is needed (instead of dropping every other
# column by name) keeps this step working if the input CSV gains or renames
# a column. .copy() gives an independent frame, so the 'State' column added
# afterwards does not trigger a SettingWithCopyWarning.
unclaimed_count = unclaimed_count[['Case Number']].copy()

In [12]:
# State names currently live only in the index; expose them as a regular column as well
unclaimed_count = unclaimed_count.assign(State=unclaimed_count.index)

In [13]:
# Give the case number count a descriptive column name
unclaimed_count = unclaimed_count.rename(columns={'Case Number': 'Unclaimed_CaseCount'})

In [14]:
# Check the result: indexed by state, with the case count and the state name as columns
unclaimed_count.head()

Unnamed: 0_level_0,Unclaimed_CaseCount,State
State,Unnamed: 1_level_1,Unnamed: 2_level_1
Alabama,63,Alabama
Alaska,1,Alaska
Arizona,21,Arizona
Arkansas,16,Arkansas
California,1680,California


In [15]:
# Build a {state name: unclaimed case count} lookup and show how many states it holds
unclaimed_dict = {
    state: case_count
    for state, case_count in zip(unclaimed_count['State'], unclaimed_count['Unclaimed_CaseCount'])
}
len(unclaimed_dict)

43

### 3/4: Add count of unidentified persons

In [16]:
# Count, for each state, the non-null values in every column of the unidentified table.
# The 'Case Number' column of this result is what later becomes the case count.
unidentified_count = unidentified_df.groupby('State').count()
# len(unidentified_count)
unidentified_count.head()

Unnamed: 0_level_0,Case Number,DBF,Age From,Age To,City,County,Sex,Race / Ethnicity,Date Modified
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama,56,56,46,46,48,55,56,56,56
Alaska,56,56,23,23,50,39,56,56,56
Arizona,1773,1772,1670,1670,1427,1773,1772,1767,1773
Arkansas,100,100,95,95,94,100,100,100,100
California,2741,2740,2426,2426,2218,2741,2741,2737,2741


In [17]:
# Keep only the case number count.
# Selecting the one column that is needed (instead of dropping every other
# column by name) keeps this step working if the input CSV gains or renames
# a column. .copy() gives an independent frame, so the 'State' column added
# afterwards does not trigger a SettingWithCopyWarning.
unidentified_count = unidentified_count[['Case Number']].copy()

In [18]:
# State names currently live only in the index; expose them as a regular column as well
unidentified_count = unidentified_count.assign(State=unidentified_count.index)

In [19]:
# Give the case number count a descriptive column name
unidentified_count = unidentified_count.rename(columns={'Case Number': 'Unidentified_CaseCount'})

In [20]:
# Check the result: indexed by state, with the case count and the state name as columns
unidentified_count.head()

Unnamed: 0_level_0,Unidentified_CaseCount,State
State,Unnamed: 1_level_1,Unnamed: 2_level_1
Alabama,56,Alabama
Alaska,56,Alaska
Arizona,1773,Arizona
Arkansas,100,Arkansas
California,2741,California


In [21]:
# Build a {state name: unidentified case count} lookup and show how many states it holds
unidentified_dict = {
    state: case_count
    for state, case_count in zip(unidentified_count['State'], unidentified_count['Unidentified_CaseCount'])
}
len(unidentified_dict)

54

### 4/4: Make summary dataframe

In [22]:
# Start the summary table from the state centroids, ordered by state name.
# sort_values returns a new frame, so state_centroids_df itself is left as loaded.
summary_df = state_centroids_df.sort_values(by='NAME')
summary_df.head()

Unnamed: 0,OBJECTID,STATEFP,STATENS,AFFGEOID,GEOID,STUSPS,NAME,LSAD,ALAND,AWATER,ORIG_FID,Lon_dd,Lat_dd
17,18,1,1779775,0400000US01,1,AL,Alabama,0,131174000000.0,4593327000.0,18,-86.828092,32.790364
27,28,2,1785533,0400000US02,2,AK,Alaska,0,1478840000000.0,245482000000.0,28,-152.680813,64.530206
37,38,60,1802701,0400000US60,60,AS,American Samoa,0,197759100.0,1307244000.0,38,-170.718268,-14.300454
35,36,4,1779777,0400000US04,4,AZ,Arizona,0,294199000000.0,1027338000.0,36,-111.664418,34.29311
52,53,5,68085,0400000US05,5,AR,Arkansas,0,134769000000.0,2962860000.0,53,-92.439268,34.899745


In [23]:
# Number of rows in the summary table (one per row of the state centroid table)
len(summary_df)

56

#### 4a: Add Missing Person count for each state

In [24]:
# Add the missing case count for each state: every NAME is looked up as a key in missing_dict.
# States that are not in missing_dict get NaN, which also makes the new column float.
summary_df['Missing_Count'] = summary_df['NAME'].map(missing_dict)
summary_df.head()

Unnamed: 0,OBJECTID,STATEFP,STATENS,AFFGEOID,GEOID,STUSPS,NAME,LSAD,ALAND,AWATER,ORIG_FID,Lon_dd,Lat_dd,Missing_Count
17,18,1,1779775,0400000US01,1,AL,Alabama,0,131174000000.0,4593327000.0,18,-86.828092,32.790364,238.0
27,28,2,1785533,0400000US02,2,AK,Alaska,0,1478840000000.0,245482000000.0,28,-152.680813,64.530206,582.0
37,38,60,1802701,0400000US60,60,AS,American Samoa,0,197759100.0,1307244000.0,38,-170.718268,-14.300454,
35,36,4,1779777,0400000US04,4,AZ,Arizona,0,294199000000.0,1027338000.0,36,-111.664418,34.29311,916.0
52,53,5,68085,0400000US05,5,AR,Arkansas,0,134769000000.0,2962860000.0,53,-92.439268,34.899745,280.0


In [25]:
# List the states that received no missing case count [NOTE: seems fine to have nulls]
missing_null_df = summary_df[summary_df['Missing_Count'].isna()]
missing_null_df

Unnamed: 0,OBJECTID,STATEFP,STATENS,AFFGEOID,GEOID,STUSPS,NAME,LSAD,ALAND,AWATER,ORIG_FID,Lon_dd,Lat_dd,Missing_Count
37,38,60,1802701,0400000US60,60,AS,American Samoa,0,197759063.0,1307244000.0,38,-170.718268,-14.300454,


#### 4b: Add Unclaimed Person count for each state

In [26]:
# Add the unclaimed case count for each state: every NAME is looked up as a key in unclaimed_dict.
# States that are not in unclaimed_dict get NaN, which also makes the new column float.
summary_df['Unclaimed_Count'] = summary_df['NAME'].map(unclaimed_dict)
summary_df.head()

Unnamed: 0,OBJECTID,STATEFP,STATENS,AFFGEOID,GEOID,STUSPS,NAME,LSAD,ALAND,AWATER,ORIG_FID,Lon_dd,Lat_dd,Missing_Count,Unclaimed_Count
17,18,1,1779775,0400000US01,1,AL,Alabama,0,131174000000.0,4593327000.0,18,-86.828092,32.790364,238.0,63.0
27,28,2,1785533,0400000US02,2,AK,Alaska,0,1478840000000.0,245482000000.0,28,-152.680813,64.530206,582.0,1.0
37,38,60,1802701,0400000US60,60,AS,American Samoa,0,197759100.0,1307244000.0,38,-170.718268,-14.300454,,
35,36,4,1779777,0400000US04,4,AZ,Arizona,0,294199000000.0,1027338000.0,36,-111.664418,34.29311,916.0,21.0
52,53,5,68085,0400000US05,5,AR,Arkansas,0,134769000000.0,2962860000.0,53,-92.439268,34.899745,280.0,16.0


In [27]:
# List the states that received no unclaimed case count [NOTE: seems fine to have nulls]
unclaimed_null_df = summary_df[summary_df['Unclaimed_Count'].isna()]
unclaimed_null_df

Unnamed: 0,OBJECTID,STATEFP,STATENS,AFFGEOID,GEOID,STUSPS,NAME,LSAD,ALAND,AWATER,ORIG_FID,Lon_dd,Lat_dd,Missing_Count,Unclaimed_Count
37,38,60,1802701,0400000US60,60,AS,American Samoa,0,197759100.0,1307244000.0,38,-170.718268,-14.300454,,
45,46,69,1779809,0400000US69,69,MP,Commonwealth of the Northern Mariana Islands,0,472292500.0,4644252000.0,46,145.753615,15.188891,2.0,
44,45,66,1802705,0400000US66,66,GU,Guam,0,543555800.0,934337500.0,45,144.702724,13.357731,2.0,
26,27,20,481813,0400000US20,20,KS,Kansas,0,211755000000.0,1344141000.0,27,-98.380213,38.484708,93.0,
40,41,24,1714934,0400000US24,24,MD,Maryland,0,25151100000.0,6979967000.0,41,-76.774792,39.039764,193.0,
54,55,27,662849,0400000US27,27,MN,Minnesota,0,206229000000.0,18945220000.0,55,-94.309126,46.316468,181.0,
31,32,30,767982,0400000US30,30,MT,Montana,0,376963000000.0,3869209000.0,32,-109.645144,47.03347,86.0,
10,11,31,1779792,0400000US31,31,NE,Nebraska,0,198957000000.0,1371829000.0,11,-99.810809,41.527149,73.0,
13,14,72,1779808,0400000US72,72,PR,Puerto Rico,0,8868896000.0,4922383000.0,14,-66.478054,18.224102,23.0,
46,47,44,1219835,0400000US44,44,RI,Rhode Island,0,2677780000.0,1323670000.0,47,-71.589051,41.694954,21.0,


#### 4b: Add Unidentified Person count for each state

In [28]:
# Add the unidentified case count for each state: every NAME is looked up as a key in unidentified_dict.
# States that are not in unidentified_dict get NaN, which also makes the new column float.
summary_df['Unidentified_Count'] = summary_df['NAME'].map(unidentified_dict)
summary_df.head()

Unnamed: 0,OBJECTID,STATEFP,STATENS,AFFGEOID,GEOID,STUSPS,NAME,LSAD,ALAND,AWATER,ORIG_FID,Lon_dd,Lat_dd,Missing_Count,Unclaimed_Count,Unidentified_Count
17,18,1,1779775,0400000US01,1,AL,Alabama,0,131174000000.0,4593327000.0,18,-86.828092,32.790364,238.0,63.0,56.0
27,28,2,1785533,0400000US02,2,AK,Alaska,0,1478840000000.0,245482000000.0,28,-152.680813,64.530206,582.0,1.0,56.0
37,38,60,1802701,0400000US60,60,AS,American Samoa,0,197759100.0,1307244000.0,38,-170.718268,-14.300454,,,
35,36,4,1779777,0400000US04,4,AZ,Arizona,0,294199000000.0,1027338000.0,36,-111.664418,34.29311,916.0,21.0,1773.0
52,53,5,68085,0400000US05,5,AR,Arkansas,0,134769000000.0,2962860000.0,53,-92.439268,34.899745,280.0,16.0,100.0


In [29]:
# List the states that received no unidentified case count [NOTE: seems fine to have nulls]
unidentified_null_df = summary_df[summary_df['Unidentified_Count'].isna()]
unidentified_null_df

Unnamed: 0,OBJECTID,STATEFP,STATENS,AFFGEOID,GEOID,STUSPS,NAME,LSAD,ALAND,AWATER,ORIG_FID,Lon_dd,Lat_dd,Missing_Count,Unclaimed_Count,Unidentified_Count
37,38,60,1802701,0400000US60,60,AS,American Samoa,0,197759063.0,1307244000.0,38,-170.718268,-14.300454,,,
45,46,69,1779809,0400000US69,69,MP,Commonwealth of the Northern Mariana Islands,0,472292529.0,4644252000.0,46,145.753615,15.188891,2.0,,


#### 4c: Convert to geodataframe and export as GeoJSON

In [30]:
# Check the final summary_df: state centroid columns plus the three count columns
summary_df.head()

Unnamed: 0,OBJECTID,STATEFP,STATENS,AFFGEOID,GEOID,STUSPS,NAME,LSAD,ALAND,AWATER,ORIG_FID,Lon_dd,Lat_dd,Missing_Count,Unclaimed_Count,Unidentified_Count
17,18,1,1779775,0400000US01,1,AL,Alabama,0,131174000000.0,4593327000.0,18,-86.828092,32.790364,238.0,63.0,56.0
27,28,2,1785533,0400000US02,2,AK,Alaska,0,1478840000000.0,245482000000.0,28,-152.680813,64.530206,582.0,1.0,56.0
37,38,60,1802701,0400000US60,60,AS,American Samoa,0,197759100.0,1307244000.0,38,-170.718268,-14.300454,,,
35,36,4,1779777,0400000US04,4,AZ,Arizona,0,294199000000.0,1027338000.0,36,-111.664418,34.29311,916.0,21.0,1773.0
52,53,5,68085,0400000US05,5,AR,Arkansas,0,134769000000.0,2962860000.0,53,-92.439268,34.899745,280.0,16.0,100.0


In [31]:
# Turn the summary table into a GeoDataFrame with one point per row,
# built from the Lon_dd (x) and Lat_dd (y) columns
summary_gdf = gpd.GeoDataFrame(
    summary_df,
    geometry=gpd.points_from_xy(summary_df['Lon_dd'], summary_df['Lat_dd']),
)
summary_gdf.head()

Unnamed: 0,OBJECTID,STATEFP,STATENS,AFFGEOID,GEOID,STUSPS,NAME,LSAD,ALAND,AWATER,ORIG_FID,Lon_dd,Lat_dd,Missing_Count,Unclaimed_Count,Unidentified_Count,geometry
17,18,1,1779775,0400000US01,1,AL,Alabama,0,131174000000.0,4593327000.0,18,-86.828092,32.790364,238.0,63.0,56.0,POINT (-86.82809 32.79036)
27,28,2,1785533,0400000US02,2,AK,Alaska,0,1478840000000.0,245482000000.0,28,-152.680813,64.530206,582.0,1.0,56.0,POINT (-152.68081 64.53021)
37,38,60,1802701,0400000US60,60,AS,American Samoa,0,197759100.0,1307244000.0,38,-170.718268,-14.300454,,,,POINT (-170.71827 -14.30045)
35,36,4,1779777,0400000US04,4,AZ,Arizona,0,294199000000.0,1027338000.0,36,-111.664418,34.29311,916.0,21.0,1773.0,POINT (-111.66442 34.29311)
52,53,5,68085,0400000US05,5,AR,Arkansas,0,134769000000.0,2962860000.0,53,-92.439268,34.899745,280.0,16.0,100.0,POINT (-92.43927 34.89975)


In [32]:
# write to geoJSON
# Create the output folder first if it is not there yet; the write below fails
# when the "JSON" directory does not exist.
Path("JSON").mkdir(parents=True, exist_ok=True)
summary_gdf.to_file("JSON/summary_counts.json", driver="GeoJSON")

# Part 2: Get state-level JSONs for all 3 databases

#### Add in State FIPS column

In [None]:
# Build a {state name: state FIPS code} lookup from the state centroid table
state_dict = {
    name: fips
    for name, fips in zip(state_centroids_df['NAME'], state_centroids_df['STATEFP'])
}

In [None]:
# Add the state FIPS code for each row: the 'State' name is looked up as a key in state_dict
missing_df['State_FIPS'] = missing_df['State'].map(state_dict)
# Show the distinct codes assigned (a NaN here means some state name was not found in state_dict)
missing_df['State_FIPS'].unique()

In [None]:
## Check to see what is producing null values
# test_df = missing_df.loc[missing_df['State_FIPS'].isnull()]

In [None]:
# Check missing_df, which now carries the State_FIPS column
missing_df.head()

#### Add in County FIPS column

In [None]:
# Preview the county centroid table before building the county lookup key
county_centroids_df.head()

In [None]:
# Build a compound "<STATEFP>_<NAME>" key for every county
# (presumably so that county names repeated in different states stay distinct)
county_centroids_df['County_Key'] = (
    county_centroids_df['STATEFP'].astype(str).str.cat(county_centroids_df['NAME'], sep="_")
)
county_centroids_df.head()

In [None]:
# Count the distinct County_Key values; fewer than the number of rows in
# county_centroids_df would mean some counties share a key
county_key_list = county_centroids_df['County_Key'].unique()
len(county_key_list)

In [None]:
# Build a {County_Key: county FIPS code} lookup (the county FIPS code is the GEOID field).
# If two rows share a County_Key, only the last one survives in the dictionary.
county_dict = dict(zip(county_centroids_df['County_Key'], county_centroids_df['GEOID']))
# Show the dictionary size next to the table size rather than echoing every entry;
# a smaller first number means some County_Key values collided and were overwritten.
len(county_dict), len(county_centroids_df)

In [None]:
# Preview missing_df before adding the compound state/county key
missing_df.head()

In [None]:
# Add column with compound field key ("<state FIPS>_<county name>"), the same
# layout as County_Key in county_centroids_df.
# State_FIPS comes from a .map() lookup, so a single unmatched state turns the
# whole column into floats (1.0, 2.0, ...) and the key would read "1.0_..." and
# never match. Casting through the nullable integer dtype first keeps the
# codes as "1", "2", ... (unmatched rows become "<NA>_...", which simply finds
# no entry in county_dict).
# NOTE(review): assumes STATEFP in county_centroids_df is read as integers - confirm.
missing_df['State_County'] = (
    missing_df['State_FIPS'].astype('Int64').astype(str) + "_" + missing_df['County']
)
missing_df.head()

In [None]:
# Add the county FIPS code for each row: the 'State_County' compound key is looked up in county_dict
missing_df['County_FIPS'] = missing_df['State_County'].map(county_dict)
# Show the distinct codes assigned (a NaN here means some key was not found in county_dict)
missing_df['County_FIPS'].unique()

In [None]:
# Collect the rows that ended up without a County_FIPS and show how many there are
test2_df = missing_df[missing_df['County_FIPS'].isna()]
test2_df.shape

In [None]:
# Write the rows without a County_FIPS to county_nulls.csv
test2_df.to_csv('county_nulls.csv')

In [None]:
# Check missing_df, which now carries the State_FIPS, State_County and County_FIPS columns
missing_df.head()