In [1]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import os
import requests
import gzip
import shutil
import glob
import folium

In [2]:
# Root of the shared project directory (keeps its trailing slash so paths can
# be built by plain string concatenation).
sharedFolder = '/work/group/egodat/reu23_clark/'
# Sub-directory holding the input and output data files.
dataFolder = f'{sharedFolder}data/'

**Data sources to combine:**
* CBSA geometry (from 2019, but used for all years)
* Liminal/commuting data 2019, 2020
* ACS data for 2010, 2015, 2019, and 2021
* Internet speed data 2019, 2021
* Air quality data 2010, 2015, 2019, 2021
* Housing Price Index 2010, 2015, 2019, 2021

* Parklands data
* Population/migration data

**CBSA geometry from 2019**

In [20]:
# Read the 2019 CBSA shapefile, reproject it to EPSG:3857 and keep only the
# id, name and geometry columns. The id column is renamed to 'CBSA'.
CBSAs = (
    gpd.read_file(dataFolder + "/tl_2019_us_cbsa.zip")
    .to_crs(3857)
    [['CBSAFP', 'NAME', 'geometry']]
    .rename(columns={'CBSAFP': 'CBSA'})
)
# Integer ids so that 'CBSA' can serve as the merge key in the cells below.
CBSAs['CBSA'] = CBSAs['CBSA'].astype(int)

# One independent copy of the base frame per year; each is extended with
# year-specific columns further down.
combined10, combined15, combined19, combined20, combined21 = (
    CBSAs.copy() for _ in range(5)
)

**Liminal and Commuting Data for 2019 and 2020**

CBSA: Core Based Statistical Area id \
LSAD: What type of area it is (metropolitan or micropolitan) \
NAME: Name of CBSA \
geometry: Shape of CBSA \
metro: Is it a metro with population > 1M? \
micro: Is it a metro/micro with population < 100k? \
inBuff: Is it within 100Mi of a metro? \
commuteBuffCount: Number of people commuting to a nearby metro (from LODES data) \
workingPop: Working population (from LODES data) \
commutePercent: commuteBuffCount / workingPop * 100 \
liminal: Is it "micro" with at least 5% commuting to a metro?

In [21]:
# Liminal / commuting tables for 2019 and 2020, one row per CBSA.
liminal19 = pd.read_pickle(sharedFolder + 'gdf2019.pkl')
liminal20 = pd.read_pickle(sharedFolder + 'gdf2020.pkl')

# Name, geometry and population come from other files, and the 100 mi buffer
# geometry ('buff') is not needed. Non-inplace drops keep the cell re-runnable.
unused_columns = ['NAME', 'geometry', 'POPESTIMATE2020', 'POPESTIMATE2021', 'POPESTIMATE2022', 'buff']
liminal19 = liminal19.drop(columns=unused_columns)
liminal20 = liminal20.drop(columns=unused_columns)

# The flag look-ups below require exactly one row per CBSA in each table.
assert liminal19['CBSA'].is_unique, "duplicate CBSA ids in gdf2019.pkl"
assert liminal20['CBSA'].is_unique, "duplicate CBSA ids in gdf2020.pkl"

# Key the classification flags by CBSA id. Assigning the raw columns directly
# would align on the pickles' row index, which is not guaranteed to match the
# row index of the CBSA frames and could silently attach flags to the wrong area.
flags19 = liminal19.set_index('CBSA')
flags20 = liminal20.set_index('CBSA')
metro = flags20['metro']
micro = flags19['micro']
# An area counts as liminal if it is flagged liminal in either year.
liminal = flags19['liminal'] | flags20['liminal']

# Year-specific commuting columns only exist for 2019 and 2020.
combined19 = pd.merge(combined19, liminal19, on='CBSA', how='left')
combined20 = pd.merge(combined20, liminal20, on='CBSA', how='left')

# Every year gets the same three flags, looked up by CBSA id. For 2019/2020
# this overwrites the per-year flag columns brought in by the merge above.
# CBSAs missing from the liminal tables get NaN.
for frame in (combined10, combined15, combined19, combined20, combined21):
    frame['liminal'] = frame['CBSA'].map(liminal)
    frame['micro'] = frame['CBSA'].map(micro)
    frame['metro'] = frame['CBSA'].map(metro)

**Internet Speed Data for 2019 and 2021** (originally 2020 Q1; 2020 is not merged yet)

avg_d_kbps: Average download speed \
avg_u_kbps: Average upload speed \
avg_lat_ms: Average latency in ms \
tests: Number of tests performed in CBSA \
devices: number of devices tested in CBSA

In [22]:
# Per-CBSA internet speed tables for 2019 and 2021.
intSpeed19 = pd.read_pickle(dataFolder + 'internet_speed/speedByCbsa2019.pkl')
intSpeed21 = pd.read_pickle(dataFolder + 'internet_speed/speedByCbsa2021.pkl')
# TODO: no 2020 speed table is loaded, so combined20 receives no
# internet-speed columns in this cell.

# Name and geometry come from the CBSA data. Non-inplace drops keep the cell
# re-runnable.
intSpeed19 = intSpeed19.drop(columns=['NAME', 'geometry'])
intSpeed21 = intSpeed21.drop(columns=['NAME', 'geometry'])

# Left join, like every other merge in this notebook: the 2019 CBSA geometry
# defines the set of rows. An outer join would append rows without NAME or
# geometry for ids found only in the speed tables, and would re-sort the frame
# by CBSA so it no longer matches the other years.
combined19 = pd.merge(combined19, intSpeed19, on='CBSA', how='left')
combined21 = pd.merge(combined21, intSpeed21, on='CBSA', how='left')

**ACS Data for 2010, 2015, 2019, and 2021**

Variables are from ACS website

In [23]:
# All ACS years live in one long table with a 'year' column.
acsData = pd.read_pickle(sharedFolder + 'ACS_data.pkl')
print(acsData.columns)


def _acs_rows_for(year):
    """Return the rows of acsData for one year, without the 'year' column."""
    in_year = acsData['year'] == year
    return acsData[in_year].drop(columns='year')


acsData10, acsData15, acsData19, acsData21 = (
    _acs_rows_for(year) for year in (2010, 2015, 2019, 2021)
)

# Attach each year's ACS columns to the matching per-year frame, keyed by CBSA.
combined10 = pd.merge(left=combined10, right=acsData10, how='left', on='CBSA')
combined15 = pd.merge(left=combined15, right=acsData15, how='left', on='CBSA')
combined19 = pd.merge(left=combined19, right=acsData19, how='left', on='CBSA')
combined21 = pd.merge(left=combined21, right=acsData21, how='left', on='CBSA')

Index(['CBSA', 'year', 'Total_Population', 'Total_Male_Pop',
       'Total_Female_Pop', 'Median_Age', 'M_Median_Age', 'F_Median_Age',
       'Race_white', 'Race_black', 'Race_Am_Indian', 'Race_Asian',
       'Race_Pac_Isl', 'Median_Income', 'Less_High', 'High_Equiv',
       'College_Assoc_Equiv', 'Bachelors', 'Grad_Prof', 'Med_House_Val',
       'Ag_For_Fish_Hunt_Mine', 'Construction', 'Manufacturing', 'Wholesale',
       'Retail', 'Trans_Ware_Util', 'Information', 'Fin_Insur_Real',
       'Fin_Insur', 'Real_Rental', 'Prof_Sci_Tech', 'Mngmt_Enter',
       'Admin_WstMngmt', 'Ed_Servc', 'Hlthcr_Social', 'Arts_Entr_Rec',
       'Accom_Food_Srvc', 'Other_Servc', 'Public_admin', 'Num_Total_Worker',
       'Num_Comp_Info_Res', 'Num_Soft_Dev', 'Num_Comp_Sys_Analyst',
       'Num_Info_Sec_Analyst', 'Num_Comp_Programmer', 'Num_Soft_Qual',
       'Num_Web_Dev', 'Sal_Avg_Occ', 'Avg_Comp_EngSci', 'Avg_Mngmt_Bus_Fin',
       'G_Mobil_Bach', 'G_Mobil_Prof', 'Education_Pop', 'Less_High_pct',
       '

**EPA Air Quality Data for 2010, 2015, 2019, 2021**



In [24]:
def _read_aqi(filename):
    """Load one AQI pickle from dataFolder and key it by numeric CBSA code.

    The original 'CBSA' column is dropped (presumably the area name - confirm)
    and 'CBSA Code' is renamed to 'CBSA' so it matches the merge key.
    """
    raw = pd.read_pickle(dataFolder + filename)
    without_label = raw.drop(columns='CBSA')
    return without_label.rename(columns={'CBSA Code': 'CBSA'})


aqi10 = _read_aqi('AQI2010.pkl')
aqi15 = _read_aqi('AQI2015.pkl')
aqi19 = _read_aqi('AQI2019.pkl')
aqi21 = _read_aqi('AQI2021.pkl')

# Attach each year's air-quality columns to the matching per-year frame.
combined10 = pd.merge(left=combined10, right=aqi10, how='left', on='CBSA')
combined15 = pd.merge(left=combined15, right=aqi15, how='left', on='CBSA')
combined19 = pd.merge(left=combined19, right=aqi19, how='left', on='CBSA')
combined21 = pd.merge(left=combined21, right=aqi21, how='left', on='CBSA')

**Housing Price Index**

In [25]:
# Housing Price Index for all years in one long table with a 'Year' column.
hpi = pd.read_pickle(dataFolder + 'HPI/hpi.pkl')


def _hpi_rows_for(year):
    """Return the HPI rows for one year, without the 'Name' and 'Year' columns."""
    in_year = hpi['Year'] == year
    return hpi[in_year].drop(columns=['Name', 'Year'])


hpi10, hpi15, hpi19, hpi21 = (
    _hpi_rows_for(year) for year in (2010, 2015, 2019, 2021)
)

# Attach each year's HPI columns to the matching per-year frame, keyed by CBSA.
combined10 = pd.merge(left=combined10, right=hpi10, how='left', on='CBSA')
combined15 = pd.merge(left=combined15, right=hpi15, how='left', on='CBSA')
combined19 = pd.merge(left=combined19, right=hpi19, how='left', on='CBSA')
combined21 = pd.merge(left=combined21, right=hpi21, how='left', on='CBSA')

**Migration and Population Change**

Note: For 2010, "rate of" variables use 2011 data

In [26]:
# Wide table of population and migration estimates, one column per measure/year.
migration = pd.read_pickle(dataFolder + '2010-2021_net_migration.pkl')


def _migration_renames(year, rate_year=None, npopchg='NPOPCHG_', natural='NATURALINC'):
    """Map the year-specific source column names to year-free names.

    Parameters
    ----------
    year : int
        Year suffix of the count columns (POPESTIMATE, BIRTHS, ...).
    rate_year : int, optional
        Year suffix of the rate columns (RBIRTH, RDEATH, ...). Defaults to
        ``year``; 2010 passes 2011 because its "rate of" variables use 2011 data.
    npopchg : str
        Prefix of the population-change column ('NPOPCHG_' or 'NPOPCHG').
    natural : str
        Stem of the natural-change columns ('NATURALINC' or 'NATURALCHG').

    Returns
    -------
    dict
        Source column name -> output column name, in output column order.
    """
    if rate_year is None:
        rate_year = year
    return {
        f'POPESTIMATE{year}': 'POPESTIMATE',
        f'{npopchg}{year}': 'NPOPCHG',
        f'BIRTHS{year}': 'BIRTHS',
        f'DEATHS{year}': 'DEATHS',
        f'{natural}{year}': 'NATURALINC',
        f'INTERNATIONALMIG{year}': 'INTERNATIONALMIG',
        f'DOMESTICMIG{year}': 'DOMESTICMIG',
        f'NETMIG{year}': 'NETMIG',
        f'RESIDUAL{year}': 'RESIDUAL',
        f'GQESTIMATES{year}': 'GQESTIMATES',
        f'RBIRTH{rate_year}': 'RBIRTH',
        f'RDEATH{rate_year}': 'RDEATH',
        f'R{natural}{rate_year}': 'RNATURALINC',
        f'RINTERNATIONALMIG{rate_year}': 'RINTERNATIONALMIG',
        f'RDOMESTICMIG{rate_year}': 'RDOMESTICMIG',
        f'RNETMIG{rate_year}': 'RNETMIG',
    }


def _migration_for(year, **naming):
    """Extract one year's columns from `migration` under year-free names.

    The index is moved into a column and renamed from 'cbsa' to 'CBSA'
    (presumably the pickle's index is named 'cbsa' - confirm).
    """
    renames = _migration_renames(year, **naming)
    selected = migration[list(renames)]
    return selected.reset_index().rename(columns={'cbsa': 'CBSA', **renames})


# 2010 has no rate columns of its own, so the 2011 rates are used instead.
migration10 = _migration_for(2010, rate_year=2011)
migration15 = _migration_for(2015)
migration19 = _migration_for(2019)
# The 2021 columns are named differently (NPOPCHG2021, NATURALCHG2021,
# RNATURALCHG2021). The net-migration rate is taken from RNETMIG2021; the
# previous code selected RNETMIG2019 here, which put 2019 rates into the 2021
# frame and left it without an 'RNETMIG' column.
migration21 = _migration_for(2021, npopchg='NPOPCHG', natural='NATURALCHG')

# Attach each year's migration columns to the matching per-year frame.
combined10 = pd.merge(combined10, migration10, on='CBSA', how='left')
combined15 = pd.merge(combined15, migration15, on='CBSA', how='left')
combined19 = pd.merge(combined19, migration19, on='CBSA', how='left')
combined21 = pd.merge(combined21, migration21, on='CBSA', how='left')

In [29]:
# Show the column names of the 2015 frame as a plain list.
# Swap in combined19 or combined21 to inspect another year.
combined15.columns.tolist()

['CBSA',
 'NAME',
 'geometry',
 'liminal',
 'micro',
 'metro',
 'Total_Population',
 'Total_Male_Pop',
 'Total_Female_Pop',
 'Median_Age',
 'M_Median_Age',
 'F_Median_Age',
 'Race_white',
 'Race_black',
 'Race_Am_Indian',
 'Race_Asian',
 'Race_Pac_Isl',
 'Median_Income',
 'Less_High',
 'High_Equiv',
 'College_Assoc_Equiv',
 'Bachelors',
 'Grad_Prof',
 'Med_House_Val',
 'Ag_For_Fish_Hunt_Mine',
 'Construction',
 'Manufacturing',
 'Wholesale',
 'Retail',
 'Trans_Ware_Util',
 'Information',
 'Fin_Insur_Real',
 'Fin_Insur',
 'Real_Rental',
 'Prof_Sci_Tech',
 'Mngmt_Enter',
 'Admin_WstMngmt',
 'Ed_Servc',
 'Hlthcr_Social',
 'Arts_Entr_Rec',
 'Accom_Food_Srvc',
 'Other_Servc',
 'Public_admin',
 'Num_Total_Worker',
 'Num_Comp_Info_Res',
 'Num_Soft_Dev',
 'Num_Comp_Sys_Analyst',
 'Num_Info_Sec_Analyst',
 'Num_Comp_Programmer',
 'Num_Soft_Qual',
 'Num_Web_Dev',
 'Sal_Avg_Occ',
 'Avg_Comp_EngSci',
 'Avg_Mngmt_Bus_Fin',
 'G_Mobil_Bach',
 'G_Mobil_Prof',
 'Education_Pop',
 'Less_High_pct',
 'High_

In [28]:
# Write every per-year frame to its own pickle inside dataFolder.
for frame, filename in (
    (combined10, "combined2010.pkl"),
    (combined15, "combined2015.pkl"),
    (combined19, "combined2019.pkl"),
    (combined20, "combined2020.pkl"),
    (combined21, "combined2021.pkl"),
):
    frame.to_pickle(dataFolder + filename)