In [1]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import os
import requests
import gzip
import shutil
import glob
import folium

In [2]:
from config import *

**Data sources to combine:**
* CBSA geometry (from 2019, but used for all years)
* Liminal/commuting data 2019, 2020
* ACS data for 2010, 2015, 2019, and 2021
* Internet speed data 2019, 2020, 2021
* Air quality data 2010, 2015, 2019, 2021
* Housing Price Index 2010, 2015, 2019, 2021

* Parklands data
* Population/migration data

### CBSA geometry from 2019

In [3]:
# Read the 2019 CBSA boundary file, reproject it to EPSG:3857, rename the
# identifier column to 'CBSA', and keep only identifier, name, and geometry.
CBSAs = (
    gpd.read_file(dataFolder + "tl_2019_us_cbsa.zip")
    .to_crs(3857)
    .rename(columns={'CBSAFP': 'CBSA'})
    [['CBSA', 'NAME', 'geometry']]
)
# 'CBSA' is the key every later cell merges on; store it as an integer.
CBSAs['CBSA'] = CBSAs['CBSA'].astype(int)

In [4]:
# Start each per-year table (2010, 2015, 2019, 2020, 2021) from its own
# independent copy of the CBSA base table.
combined10, combined15, combined19, combined20, combined21 = (
    CBSAs.copy() for _ in range(5)
)

### Liminal and Commuting Data for 2019 and 2020

CBSA: Core Based Statistical Area id \
LSAD: What type of area it is (metropolitan or micropolitan) \
NAME: Name of CBSA \
geometry: Shape of CBSA \
metro: Is it a metro with population > 1M? \
micro: Is it a metro/micro with population < 100k? \
inBuff: Is it within 100Mi of a metro? \
commuteBuffCount: Number of people commuting to a nearby metro (from LODES data) \
workingPop: Working population (from LODES data) \
commutePercent: commuteBuffCount / workingPop * 100 \
liminal: Is it "micro" with at least 5% commuting to a metro?

In [5]:
liminal19 = pd.read_pickle(sharedFolder + 'gdf2019.pkl')
liminal20 = pd.read_pickle(sharedFolder + 'gdf2020.pkl')

# Name, geometry and population are supplied by other files, and the
# 100-mile buffer geometry is not needed here.
redundant_cols = ['NAME', 'geometry', 'POPESTIMATE2020', 'POPESTIMATE2021', 'POPESTIMATE2022', 'buff']
for frame in (liminal19, liminal20):
    frame.drop(columns=redundant_cols, inplace=True)

metro = liminal20['metro']
micro = liminal20['micro']

# A CBSA is treated as liminal if it is flagged in either 2019 or 2020.
# NOTE(review): `|` pairs rows by index label, not by the CBSA column --
# confirm both pickles share the same index.
liminal = liminal19['liminal'] | liminal20['liminal']
for frame in (liminal19, liminal20):
    frame['liminal'] = liminal

# Inner joins remove CBSAs that have no liminal data.
# 2010, 2015 and 2019 take the 2019 table; 2020 and 2021 take the 2020 table.
combined10, combined15, combined19 = (
    pd.merge(base, liminal19, on='CBSA', how='inner')
    for base in (combined10, combined15, combined19)
)
combined20, combined21 = (
    pd.merge(base, liminal20, on='CBSA', how='inner')
    for base in (combined20, combined21)
)

# The commuting columns are not data for 2010, 2015 or 2021, so drop them there.
commute_cols = ['commuteBuffCount', 'workingPop', 'commutePercent']
for frame in (combined10, combined15, combined21):
    frame.drop(columns=commute_cols, inplace=True)

### EPA Air Quality Data for 2010, 2015, 2019, 2021



In [6]:
# Load each year's AQI table, discard the 'CBSA' and 'Year' columns, and
# promote the numeric 'CBSA Code' column to the merge key 'CBSA'.
aqi10, aqi15, aqi19, aqi21 = (
    pd.read_pickle(dataFolder + aqi_file)
    .drop(columns=['CBSA', 'Year'])
    .rename(columns={'CBSA Code': 'CBSA'})
    for aqi_file in ('AQI2010.pkl', 'AQI2015.pkl', 'AQI2019.pkl', 'AQI2021.pkl')
)

# Left joins keep every CBSA already in the combined tables; CBSAs with no
# AQI row get missing values. The 2020 table receives no AQI data.
combined10, combined15, combined19, combined21 = (
    pd.merge(base, aqi, on='CBSA', how='left')
    for base, aqi in (
        (combined10, aqi10),
        (combined15, aqi15),
        (combined19, aqi19),
        (combined21, aqi21),
    )
)

In [7]:
# Preview the 2021 AQI table as merged above (keyed by 'CBSA').
aqi21

Unnamed: 0,CBSA,Days with AQI,Good Days,Moderate Days,Unhealthy for Sensitive Groups Days,Unhealthy Days,Very Unhealthy Days,Hazardous Days,Max AQI,90th Percentile AQI,Median AQI,Days CO,Days NO2,Days Ozone,Days PM2.5,Days PM10
0,10100,356,313,40,0,3,0,0,191,55,22,0,0,0,166,190
1,10140,365,359,6,0,0,0,0,80,32,20,0,0,0,365,0
2,10260,7,7,0,0,0,0,0,17,17,10,0,0,0,7,0
3,10300,322,246,76,0,0,0,0,93,61,40,0,0,171,151,0
4,10420,356,264,89,3,0,0,0,108,67,40,0,0,185,171,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,49420,365,232,106,12,15,0,0,192,89,40,0,0,0,359,6
502,49620,357,244,111,2,0,0,0,148,67,43,0,7,165,185,0
503,49660,365,272,90,3,0,0,0,147,65,41,0,0,203,162,0
504,49700,364,165,177,17,5,0,0,169,90,54,0,0,149,214,1


### Migration and Population Change

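Note: For 2010, the computed "rate of" variables (suffix `w`) use 2011 data: the 2011 counts divided by the average of the 2010 and 2011 population estimates, per 1,000 people.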

In [9]:
migration = pd.read_pickle(dataFolder + '2010-2022_net_migration.pkl')

# Rename the 2021 natural-change columns to the NATURALINC naming that the
# rate loop below looks up, then drop the 2010 census/base columns.
migration.rename(columns={'NATURALCHG2021': 'NATURALINC2021', 'RNATURALCHG2021': 'RNATURALINC2021'}, inplace=True)
migration.drop(columns=['CENSUS2010POP', 'ESTIMATESBASE2010', 'GQESTIMATESBASE2010'], inplace=True)

# (label year, year of the event counts, year of the earlier population estimate).
# The 2010 label is computed from 2011 counts.
rate_years = (
    ('2010', '2011', '2010'),
    ('2015', '2015', '2014'),
    ('2019', '2019', '2018'),
    ('2021', '2021', '2020'),
)

# Rate per 1,000 people: counts divided by the mean of two consecutive
# population estimates, stored as R<var>w<label year>.
for var in ['BIRTHS', 'DEATHS', 'NATURALINC', 'INTERNATIONALMIG', 'DOMESTICMIG', 'NETMIG']:
    for label_yr, event_yr, prev_yr in rate_years:
        avg_pop = (migration['POPESTIMATE' + prev_yr] + migration['POPESTIMATE' + event_yr]) / 2
        migration['R' + var + 'w' + label_yr] = migration[var + event_yr] / avg_pop * 1000

# For each target year keep the columns whose name contains that year, strip
# the year from the names, and expose the 'cbsa' index as a 'CBSA' column.
# NOTE(review): stripping the year leaves names such as 'NPOPCHG_' with a
# trailing underscore -- confirm downstream code expects that spelling.
year_slices = []
for yr in ('2010', '2015', '2019', '2021'):
    yr_cols = [c for c in migration.columns if yr in c]
    year_slices.append(
        migration[yr_cols]
        .copy()
        .reset_index()
        .rename(columns={c: c.replace(yr, '') for c in yr_cols})
        .rename(columns={'cbsa': 'CBSA'})
    )
migration10, migration15, migration19, migration21 = year_slices

# Left joins keep every CBSA already present in the combined tables.
combined10, combined15, combined19, combined21 = (
    pd.merge(base, mig, on='CBSA', how='left')
    for base, mig in (
        (combined10, migration10),
        (combined15, migration15),
        (combined19, migration19),
        (combined21, migration21),
    )
)

In [10]:
# Preview the 2015 migration slice (year suffix already stripped from column names).
migration15

Unnamed: 0,CBSA,POPESTIMATE,NPOPCHG_,BIRTHS,DEATHS,NATURALINC,INTERNATIONALMIG,DOMESTICMIG,NETMIG,RESIDUAL,...,RNATURALINC,RINTERNATIONALMIG,RDOMESTICMIG,RNETMIG,RBIRTHSw,RDEATHSw,RNATURALINCw,RINTERNATIONALMIGw,RDOMESTICMIGw,RNETMIGw
0,10100,42432,182,535,437,98,84,4,88,-4,...,2.626722,1.882048,0.839186,2.721234,12.635507,10.320965,2.314541,1.983893,0.094471,2.078364
1,10140,71027,231,789,824,-35,5,267,272,-6,...,-0.493573,0.070510,3.765257,3.835767,11.126545,11.620118,-0.493573,0.070510,3.765257,3.835767
2,10180,169770,1362,2415,1701,714,502,163,665,-17,...,0.838378,1.433012,4.680371,6.113383,14.282419,10.059791,4.222628,2.968851,0.963989,3.932840
3,10220,38343,51,515,439,76,66,-87,-21,-4,...,1.983428,1.722451,-2.270503,-0.548052,13.440334,11.456906,1.983428,1.722451,-2.270503,-0.548052
4,10300,98432,-342,1040,1069,-29,42,-351,-309,-4,...,-0.294109,0.425951,-3.559729,-3.133779,10.547346,10.841455,-0.294109,0.425951,-3.559729,-3.133779
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
917,49660,549114,-4153,5653,7284,-1631,383,-2890,-2507,-15,...,-2.916313,0.642342,-5.523767,-4.881425,10.255982,13.215032,-2.959050,0.694860,-5.243196,-4.548337
918,49700,169263,1015,2503,1392,1111,420,-503,-83,-13,...,6.756230,2.409231,-2.971692,-0.562460,14.832109,8.248620,6.583489,2.488808,-2.980644,-0.491836
919,49740,205187,1123,3026,1422,1604,1518,-2003,-485,4,...,7.838710,7.418430,-9.788614,-2.370184,14.787991,6.949281,7.838710,7.418430,-9.788614,-2.370184
920,49780,86213,295,1060,1020,40,94,173,267,-12,...,0.464762,1.092191,2.010097,3.102288,12.316201,11.851439,0.464762,1.092191,2.010097,3.102288


### Internet Speed Data

avg_d_kbps: Average download speed \
avg_u_kbps: Average upload speed \
avg_lat_ms: Average latency in ms \
tests: Number of tests performed in CBSA \
devices: Number of devices tested in CBSA

In [11]:
# Load the per-CBSA internet speed tables for 2019, 2020 and 2021.
intSpeed19, intSpeed20, intSpeed21 = (
    pd.read_pickle(dataFolder + speed_file)
    for speed_file in (
        'internet_speed/speedByCbsa2019.pkl',
        'internet_speed/speedByCbsa2020.pkl',
        'internet_speed/speedByCbsa2021.pkl',
    )
)

# Name and geometry already come from the CBSA table; 'index_right' is
# dropped with them as it is not needed either.
for speed_table in (intSpeed19, intSpeed20, intSpeed21):
    speed_table.drop(columns=['NAME', 'geometry', 'index_right'], inplace=True)

# Left joins keep every CBSA already present in the combined tables.
combined19, combined20, combined21 = (
    pd.merge(base, speed, on='CBSA', how='left')
    for base, speed in (
        (combined19, intSpeed19),
        (combined20, intSpeed20),
        (combined21, intSpeed21),
    )
)

In [12]:
# Preview the 2021 internet speed table after the name/geometry columns were dropped.
intSpeed21

Unnamed: 0,CBSA,avg_d_kbps,avg_u_kbps,avg_lat_ms,tests,devices
0,12020,151937.701735,41624.537882,189898,35167,12737
1,12060,205488.114906,82655.859473,2687156,1465131,458176
2,12100,213254.557135,21587.345600,87871,51529,15761
3,12120,96660.075505,16978.787753,116628,5013,1682
4,12140,134945.971817,45009.268591,70500,9178,2945
...,...,...,...,...,...,...
933,49060,121050.135994,19912.459719,32289,4817,1353
934,49080,67338.556338,11607.729376,18840,3596,994
935,49100,118909.496792,30872.018237,41091,9030,2961
936,49180,184031.243746,48710.074255,442003,159664,51525


### Housing Price Index

In [13]:
hpi = pd.read_pickle(dataFolder + 'HPI/hpi.pkl')

# Split the multi-year HPI table into one frame per target year, dropping the
# 'Name' and 'Year' columns once the year has been selected.
hpi10, hpi15, hpi19, hpi21 = (
    hpi.loc[hpi['Year'] == hpi_year].drop(columns=['Name', 'Year'])
    for hpi_year in (2010, 2015, 2019, 2021)
)

# Left joins keep every CBSA already present in the combined tables.
combined10, combined15, combined19, combined21 = (
    pd.merge(base, hpi_slice, on='CBSA', how='left')
    for base, hpi_slice in (
        (combined10, hpi10),
        (combined15, hpi15),
        (combined19, hpi19),
        (combined21, hpi21),
    )
)

In [14]:
# Preview the 2010 slice of the Housing Price Index table.
hpi10

Unnamed: 0,CBSA,Annual Change (%),HPI,HPI with 1990 base,HPI with 2000 base
25,1,-4.79,208.19,189.50,130.45
64,2,-1.53,199.90,228.22,149.61
96,4,-14.28,231.74,,145.34
144,5,-1.67,297.59,182.82,130.00
191,6,-13.51,497.65,183.29,138.06
...,...,...,...,...,...
40509,49620,-4.13,430.28,171.17,141.85
40556,49660,-3.87,305.87,152.17,101.74
40604,49700,-6.16,449.93,148.52,127.74
40650,49740,-11.61,305.04,178.67,137.88


### ACS Data for 2010, 2015, 2019, and 2021

Variables are from ACS website

In [15]:
acsData = pd.read_pickle(sharedFolder + 'ACS_data.pkl')
print(acsData.columns)

# Split the multi-year ACS table into one frame per target year; the 'year'
# column is dropped once it has been used as the filter.
acsData10, acsData15, acsData19, acsData21 = (
    acsData.loc[acsData['year'] == acs_year].drop(columns='year')
    for acs_year in (2010, 2015, 2019, 2021)
)

# Left joins keep every CBSA already present in the combined tables.
combined10, combined15, combined19, combined21 = (
    pd.merge(base, acs_slice, on='CBSA', how='left')
    for base, acs_slice in (
        (combined10, acsData10),
        (combined15, acsData15),
        (combined19, acsData19),
        (combined21, acsData21),
    )
)

Index(['CBSA', 'year', 'Total_Population', 'Total_Male_Pop',
       'Total_Female_Pop', 'Median_Age', 'M_Median_Age', 'F_Median_Age',
       'Race_white', 'Race_black', 'Race_Am_Indian', 'Race_Asian',
       'Race_Pac_Isl', 'Median_Income', 'Less_High', 'High_Equiv',
       'College_Assoc_Equiv', 'Bachelors', 'Grad_Prof',
       'Ag_For_Fish_Hunt_Mine', 'Construction', 'Manufacturing', 'Wholesale',
       'Retail', 'Trans_Ware_Util', 'Information', 'Fin_Insur_Real',
       'Fin_Insur', 'Real_Rental', 'Prof_Sci_Tech', 'Mngmt_Enter',
       'Admin_WstMngmt', 'Ed_Servc', 'Hlthcr_Social', 'Arts_Entr_Rec',
       'Accom_Food_Srvc', 'Other_Servc', 'Public_admin', 'Num_Total_Worker',
       'Num_Comp_Info_Res', 'Num_Soft_Dev', 'Num_Comp_Sys_Analyst',
       'Num_Info_Sec_Analyst', 'Num_Comp_Programmer', 'Num_Soft_Qual',
       'Num_Web_Dev', 'Sal_Avg_Occ', 'Avg_Comp_EngSci', 'Avg_Mngmt_Bus_Fin',
       'Num_Workers', 'Num_Worked_Home', 'Worked_Home_White',
       'Worked_Home_Black', 'Worked_

In [16]:
# Preview the 2010 slice of the ACS table.
acsData10

Unnamed: 0,CBSA,Total_Population,Total_Male_Pop,Total_Female_Pop,Median_Age,M_Median_Age,F_Median_Age,Race_white,Race_black,Race_Am_Indian,...,Median_Income_Worked_home,Pop_BS_Above,Pop_HS_Above,Pct_Less_HS,Pct_HS_Grad,Pct_Some_College,Pct_Bachelors,Pct_Grad,Pct_BS_Above,Pct_HS_Above
0,10100,40058,19558,20500,42.900000,41.650000,44.20000,37805,159,1140,...,23405.000000,6343.0,23826.0,0.108442,0.342875,0.311331,0.185189,0.052163,0.237352,0.891558
4,10140,72092,36989,35103,41.600000,40.500000,43.00000,62179,708,2858,...,29244.000000,7239.0,42031.0,0.156495,0.328925,0.369303,0.096289,0.048988,0.145277,0.843505
8,10180,163092,81854,81238,37.400000,36.033333,39.70000,132211,12020,833,...,27540.333333,22038.0,84603.0,0.177374,0.295211,0.313131,0.150012,0.064271,0.214284,0.822626
12,10220,36644,17619,19025,35.700000,34.000000,37.40000,26818,904,3799,...,13726.000000,6150.0,19800.0,0.155758,0.312284,0.269731,0.143734,0.118492,0.262227,0.844242
16,10300,100911,50688,50223,39.400000,37.600000,41.00000,92930,2389,606,...,14688.000000,12968.0,59281.0,0.120721,0.370187,0.316746,0.124622,0.067725,0.192346,0.879279
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3692,49700,164580,82385,82195,32.850000,31.900000,33.70000,110905,3278,2557,...,27393.000000,16255.0,78597.0,0.222697,0.256886,0.359660,0.115571,0.045186,0.160758,0.777303
3696,49740,190526,95522,95004,33.900000,32.600000,35.40000,143632,3774,2604,...,17045.000000,15437.0,83129.0,0.284222,0.270049,0.312809,0.086776,0.046143,0.132919,0.715778
3700,49780,85951,41446,44505,39.200000,38.100000,40.30000,80261,2886,154,...,25181.000000,8111.0,49511.0,0.134801,0.456287,0.267173,0.084176,0.057562,0.141739,0.865199
3704,49820,13609,6778,6831,29.000000,27.400000,31.30000,11755,0,0,...,14438.000000,722.0,4272.0,0.432594,0.291805,0.179705,0.066941,0.028955,0.095896,0.567406


### Building Permits

In [17]:
# Load the per-CBSA building permit tables for each target year.
building10, building15, building19, building21 = (
    pd.read_pickle(dataFolder + permit_file)
    for permit_file in (
        'Building_Permits/dataByCbsa2010.pkl',
        'Building_Permits/dataByCbsa2015.pkl',
        'Building_Permits/dataByCbsa2019.pkl',
        'Building_Permits/dataByCbsa2021.pkl',
    )
)

# Left joins on 'CBSA' keep every CBSA already present in the combined tables.
combined10, combined15, combined19, combined21 = (
    pd.merge(base, permits, on='CBSA', how='left')
    for base, permits in (
        (combined10, building10),
        (combined15, building15),
        (combined19, building19),
        (combined21, building21),
    )
)

In [18]:
# Preview the 2010 building permit table.
building10

Unnamed: 0_level_0,Tot_Bldgs,Tot_Units,Tot_Bldgs_Value
CBSA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10100,107,191,15209110
10140,164,166,24624483
10180,280,390,43582902
10220,40,86,6347555
10300,102,102,12133875
...,...,...,...
49660,295,325,61246666
49700,150,150,25659325
49740,455,455,59099893
49780,6,30,2762000


### Save Files

In [19]:
# Persist each combined per-year table (geometry included) as a pickle in dataFolder.
for combined_table, pickle_name in (
    (combined10, "combined2010.pkl"),
    (combined15, "combined2015.pkl"),
    (combined19, "combined2019.pkl"),
    (combined20, "combined2020.pkl"),
    (combined21, "combined2021.pkl"),
):
    combined_table.to_pickle(dataFolder + pickle_name)

In [20]:
# Export each combined table as CSV without the geometry column or the row index.
# These paths have no dataFolder prefix, so the files are written relative to
# the current working directory.
for combined_table, csv_name in (
    (combined10, "combined2010.csv"),
    (combined15, "combined2015.csv"),
    (combined19, "combined2019.csv"),
    (combined20, "combined2020.csv"),
    (combined21, "combined2021.csv"),
):
    combined_table.drop(columns='geometry').to_csv(csv_name, index=False)

In [21]:
# Preview the final combined 2010 table (all sources merged on 'CBSA').
combined10

Unnamed: 0,CBSA,NAME,geometry,LSAD,metro,micro,inBuff,liminal,Days with AQI,Good Days,...,Pct_Less_HS,Pct_HS_Grad,Pct_Some_College,Pct_Bachelors,Pct_Grad,Pct_BS_Above,Pct_HS_Above,Tot_Bldgs,Tot_Units,Tot_Bldgs_Value
0,12020,"Athens-Clarke County, GA","POLYGON ((-9299339.399 4024225.385, -9298722.4...",Metropolitan Statistical Area,False,False,True,False,365.0,205.0,...,0.176456,0.255152,0.224882,0.180229,0.163282,0.343510,0.823544,226.0,226.0,4.513729e+07
1,12060,"Atlanta-Sandy Springs-Alpharetta, GA","POLYGON ((-9499808.416 3982318.514, -9499830.0...",Metropolitan Statistical Area,True,False,True,False,365.0,110.0,...,0.129961,0.257481,0.268990,0.225436,0.118132,0.343568,0.870039,6506.0,7608.0,1.284808e+09
2,12100,"Atlantic City-Hammonton, NJ","POLYGON ((-8333014.948 4782122.394, -8333009.7...",Metropolitan Statistical Area,False,False,True,False,365.0,265.0,...,0.152881,0.355335,0.256109,0.163281,0.072394,0.235675,0.847119,429.0,512.0,7.940623e+07
3,12120,"Atmore, AL","POLYGON ((-9753303.463 3638074.432, -9753303.2...",Micropolitan Statistical Area,False,True,False,False,,,...,0.270274,0.352607,0.267958,0.073609,0.035550,0.109160,0.729726,7.0,7.0,4.594400e+05
4,12140,"Auburn, IN","POLYGON ((-9483635.192 5068556.085, -9483636.4...",Micropolitan Statistical Area,False,True,True,False,,,...,0.118283,0.437719,0.286763,0.103023,0.054213,0.157236,0.881717,70.0,70.0,1.119984e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
921,49060,"Winfield, KS","POLYGON ((-10814782.844 4481524.994, -10814783...",Micropolitan Statistical Area,False,True,True,False,,,...,0.135448,0.306683,0.362863,0.129762,0.065244,0.195006,0.864552,36.0,37.0,5.657908e+06
922,49080,"Winnemucca, NV","POLYGON ((-13233024.775 5006594.818, -13233215...",Micropolitan Statistical Area,False,True,False,False,,,...,0.190790,0.356784,0.318607,0.102332,0.031487,0.133819,0.809210,15.0,18.0,1.831417e+06
923,49100,"Winona, MN","POLYGON ((-10250133.062 5468589.383, -10250142...",Micropolitan Statistical Area,False,True,True,True,118.0,107.0,...,0.112342,0.306520,0.330572,0.157651,0.092914,0.250566,0.887658,45.0,113.0,1.520123e+07
924,49180,"Winston-Salem, NC","POLYGON ((-8955841.577 4336663.225, -8955841.5...",Metropolitan Statistical Area,False,False,True,False,365.0,157.0,...,0.165161,0.320217,0.274501,0.163958,0.076163,0.240122,0.834839,1202.0,1467.0,1.914697e+08
