In [1]:
import pandas as pd
import geopandas as gpd
from config import *

In [2]:
# Load one combined CSV per survey year, keyed by that year, plus the
# codebook spreadsheet that is read alongside them.
survey_years = (2010, 2015, 2019, 2020, 2021)
csvs = {yr: pd.read_csv(f'combined{yr}.csv') for yr in survey_years}
codebook = pd.read_excel('Codebook.xlsx')

In [3]:
# Build one wide table indexed by CBSA. Each yearly frame contributes its
# columns relabelled as (column name, year) tuples, and the frames are
# outer-joined on CBSA so a CBSA present in only some years is still kept.
data = pd.DataFrame(index=pd.Index([], name='CBSA'))

for year, csv in csvs.items():
    # Derive new frames instead of mutating the ones cached in `csvs`:
    # with in-place set_index/rename, re-running this cell raises
    # KeyError('CBSA') because the column has already become the index.
    yearly = csv.set_index('CBSA')
    yearly = yearly.rename(columns={
        name: (name, year) for name in yearly.columns
    })

    data = pd.merge(data, yearly, on='CBSA', how='outer')

In [4]:
# Relabel ('NPOPCHG_', year) as ('NPOPCHG', year) for 2010, 2015 and 2019.
# TODO (open question from the original author): why do these years arrive
# with a trailing underscore in the first place?
npopchg_fix = {('NPOPCHG_', yr): ('NPOPCHG', yr) for yr in (2010, 2015, 2019)}
data.rename(columns=npopchg_fix, inplace=True)

In [5]:
# Keep only the (column name, year) pairs listed in the codebook, in codebook
# order, and report every column of `data` that gets dropped as a result.
cols = []

for name, years in zip(codebook['column name'], codebook['year(s)']):
    # 'CBSA' is the index of `data`, not a column. 'geometry' is skipped as
    # well -- presumably a spatial column absent from these CSVs (TODO confirm).
    if name in {'CBSA', 'geometry'}:
        continue
    # A blank "year(s)" cell is read by pandas as a missing value; say which
    # codebook row was skipped instead of printing a bare 'nan'.
    if pd.isna(years):
        print(name, "has no year(s) in the codebook; skipped")
        continue

    # "year(s)" holds either a single year or a comma-separated list of years.
    for year in str(years).split(','):
        cols.append((name, int(year)))

# Set membership keeps the report loop linear in the number of columns.
kept = set(cols)
for col in sorted(data.columns):
    if col not in kept:
        print(col, "was removed")

# Raises KeyError if the codebook lists a pair that is not a column of `data`.
data = data[cols]

('LSAD', 2010) was removed
('LSAD', 2015) was removed
('LSAD', 2019) was removed
('LSAD', 2021) was removed
('NAME', 2010) was removed
('NAME', 2015) was removed
('NAME', 2019) was removed
('NAME', 2021) was removed
('Num_Comp_Info_Res', 2010) was removed
('Num_Comp_Info_Res', 2015) was removed
('Num_Comp_Programmer', 2010) was removed
('Num_Comp_Programmer', 2015) was removed
('Num_Comp_Sys_Analyst', 2010) was removed
('Num_Comp_Sys_Analyst', 2015) was removed
('Num_Info_Sec_Analyst', 2010) was removed
('Num_Info_Sec_Analyst', 2015) was removed
('Num_Soft_Dev', 2010) was removed
('Num_Soft_Dev', 2015) was removed
('Num_Soft_Qual', 2010) was removed
('Num_Soft_Qual', 2015) was removed
('Num_Total_Worker', 2010) was removed
('Num_Total_Worker', 2015) was removed
('Num_Web_Dev', 2010) was removed
('Num_Web_Dev', 2015) was removed
('Pct_HS_Above', 2010) was removed
('Pct_HS_Above', 2015) was removed
('Pct_HS_Above', 2019) was removed
('Pct_HS_Above', 2021) was removed
('Pop_BS_Above', 201

In [6]:
# Promote the flat (variable, year) tuple labels to a two-level column index,
# then display the resulting frame as the cell output.
column_index = pd.MultiIndex.from_tuples(data.columns, names=('Variable', 'Year'))
data.columns = column_index
data

Variable,NAME,LSAD,metro,metro,micro,micro,inBuff,inBuff,commuteBuffCount,commuteBuffCount,...,Pct_Bachelors,Pct_Bachelors,Pct_Grad,Pct_Grad,Pct_Grad,Pct_Grad,Pct_BS_Above,Pct_BS_Above,Pct_BS_Above,Pct_BS_Above
Year,2020,2020,2019,2020,2019,2020,2019,2020,2019,2020,...,2019,2021,2010,2015,2019,2021,2010,2015,2019,2021
CBSA,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
12020,"Athens-Clarke County, GA",Metropolitan Statistical Area,False,False,False,False,True,True,21008.0,20168.0,...,0.199069,0.208003,0.163282,0.171961,0.194500,0.208607,0.343510,0.349642,0.393569,0.416610
12060,"Atlanta-Sandy Springs-Alpharetta, GA",Metropolitan Statistical Area,True,True,False,False,True,True,2412279.0,2346403.0,...,0.239876,0.249387,0.118132,0.129166,0.145954,0.154312,0.343568,0.358495,0.385830,0.403700
12100,"Atlantic City-Hammonton, NJ",Metropolitan Statistical Area,False,False,False,False,True,True,27041.0,24765.0,...,0.185758,0.192943,0.072394,0.081323,0.095540,0.101170,0.235675,0.251161,0.281298,0.294113
12120,"Atmore, AL",Micropolitan Statistical Area,False,False,True,True,False,False,0.0,0.0,...,0.081674,0.084772,0.035550,0.041643,0.043002,0.040714,0.109160,0.125389,0.124676,0.125486
12140,"Auburn, IN",Micropolitan Statistical Area,False,False,True,True,True,True,728.0,624.0,...,0.128260,0.137329,0.054213,0.055964,0.052092,0.049539,0.157236,0.170453,0.180352,0.186867
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49060,"Winfield, KS",Micropolitan Statistical Area,False,False,True,True,True,True,0.0,0.0,...,0.143744,0.152669,0.065244,0.064707,0.074014,0.088707,0.195006,0.191989,0.217758,0.241376
49080,"Winnemucca, NV",Micropolitan Statistical Area,False,False,True,True,False,False,0.0,0.0,...,0.106532,0.145505,0.031487,0.048663,0.061184,0.050186,0.133819,0.136980,0.167716,0.195691
49100,"Winona, MN",Micropolitan Statistical Area,False,False,True,True,True,True,1488.0,1382.0,...,0.196674,0.207330,0.092914,0.097327,0.104039,0.111075,0.250566,0.282071,0.300714,0.318405
49180,"Winston-Salem, NC",Metropolitan Statistical Area,False,False,False,False,True,True,41421.0,40087.0,...,0.173623,0.176995,0.076163,0.088186,0.092833,0.097289,0.240122,0.259673,0.266456,0.274283


In [7]:
# Write the final CBSA-indexed table (two-level Variable/Year columns) to
# both a CSV file and an Excel workbook in the working directory.
csv_path, excel_path = 'dataset.csv', 'dataset.xlsx'
data.to_csv(csv_path)
data.to_excel(excel_path)