In [2]:
# initialize with library imports 
import pandas as pd
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.precision', 2)

# census packages
import censusdata
import cenpy
from census import Census
from us import states

# other packages
import sys
import numpy as np
import scipy as sp
import pickle
import os

import matplotlib.pyplot as plt
from math import radians, cos, sin, asin, sqrt
import datetime
from sklearn.linear_model import LinearRegression
import seaborn as sns
sns.set(style="ticks")
%matplotlib inline

In [3]:
# Read CSV of geovariables and convert to dictionary
GeoIDs = pd.read_csv(r"C:\Users\jenki\Documents\School\Thesis\Data\Database Production\GeoIDs.csv", header = 0)
geoIDs_records = GeoIDs.to_dict('records')
GeoIDs.columns.to_list()

['MSA', 'State', 'County', 'StateFP', 'CountyFP', 'StateCounty', 'MSAFP']

In [4]:
# Set API Key
key = Census("1653b8c59dba3b919d678f6cb5b42c8394a2f52b")

# Set variables
indicators = ['B01003_001E', 'B19049_001E', 'B19301_001E', 'B23025_003E', 'B23025_005E', 'B25001_001E',
              'B25002_003E', 'B25058_001E', 'B25064_001E', 'B25071_001E', 'B25077_001E', 'B25083_001E'] 

In [5]:
# Array of years 2013-2020 to download API data in BG
yearstodownload = [2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020]

In [7]:
# Download variables for each year from API and append to respective dataframes
data = {}
for year in yearstodownload:
    datayear = []
    
    for item in geoIDs_records:
        print('Downloading {}: {} - {}'.format(year, item['MSA'], item['County']))
        state = item['StateFP']
        county = "{:03}".format(item['CountyFP'])
        
        df = censusdata.download('acs5', year, 
                                     censusdata.censusgeo([('state', str(state)), ('county', county), ('block group', '*')]),
                                     indicators)
        
        
        datayear.append(df)
        
    datayear = pd.concat(datayear)
    
    data[year] = datayear

Downloading 2013: Delaware Valley - New Castle County
Downloading 2013: Delaware Valley - Cecil County
Downloading 2013: Greater Boston - Essex County
Downloading 2013: Greater Boston - Middlesex County
Downloading 2013: Greater Boston - Norfolk County
Downloading 2013: Greater Boston - Plymouth County
Downloading 2013: Greater Boston - Suffolk County
Downloading 2013: Flint - Genesee County
Downloading 2013: Metro Detroit - Lapeer County
Downloading 2013: Metro Detroit - Livingston County
Downloading 2013: Metro Detroit - Macomb County
Downloading 2013: Metro Detroit - Oakland County
Downloading 2013: Metro Detroit - St. Clair County
Downloading 2013: Metro Detroit - Wayne County
Downloading 2013: Greater Boston - Rockingham County


ConnectionError: HTTPSConnectionPool(host='api.census.gov', port=443): Max retries exceeded with url: /data/2013/acs/acs5?get=NAME,B01003_001E,B19049_001E,B19301_001E,B23025_003E,B23025_005E,B25001_001E,B25002_003E,B25058_001E,B25064_001E,B25071_001E,B25077_001E,B25083_001E&for=block+group:*&in=state:33+county:015 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000001F6FFC373D0>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))

In [None]:
# for year in yearstodownload:
#     data[year].drop(columns=['B25092_001E'],inplace=True)
#     print(year, data[year].shape)

In [None]:
for year in yearstodownload:
    data[year].insert(0, 'Year', year)
    print(year, data[year].shape)

In [None]:
# save dictionary "data" into pickle file
pickle_out = open('censusdatabase.pickle', 'wb')
pickle.dump(data, pickle_out)
pickle_out.close()

In [None]:
# # create new dictionary from pickle file
# pickle_in = open('censusdatabase.pickle', 'rb')
# new_datadict = pickle.load(pickle_in)

In [None]:
# new_datadict[2020].head()

In [None]:
# # print file size
# print('File size of pickle file is', round(os.path.getsize('censusdatabase.pickle') / (1024**2), 1), 'MB')

In [None]:
# newcastlebg = censusdata.download('acs5', 2020,
#                                  censusdata.censusgeo([('state','10'),('county','003'),('block group','*')]),
#                                 ['B01003_001E', 'B19049_001E', 'B19301_001E', 'B23025_003E', 'B23025_005E', 'B25001_001E',
#                                 'B25002_003E', 'B25058_001E', 'B25064_001E', 'B25071_001E', 'B25077_001E', 'B25083_001E'])
# # newcastlebg['percent_unemployed'] = newcastlebg.B23025_005E / newcastlebg.B23025_003E * 100
# #newcastlebg.head()

In [None]:
# get_geoid = lambda censusgeo: '15000US' + ''.join([val for _, val in censusgeo.params()])
# get_state = lambda censusgeo: censusgeo.params()[0][1]
# get_county = lambda censusgeo: censusgeo.params()[1][1]
# get_tract = lambda censusgeo: censusgeo.params()[2][1]
# get_blockgroup = lambda censusgeo: censusgeo.params()[3][1]

In [None]:
# newcastlebg = newcastlebg.reset_index()

In [None]:
# newcastlebg.rename(columns={'index': 'censusgeo'}, inplace=True)
# newcastlebg['geoid'] = newcastlebg['censusgeo'].apply(get_geoid)

In [None]:
# newcastlebg['state'] = newcastlebg['censusgeo'].apply(get_state)
# newcastlebg['county'] = newcastlebg['censusgeo'].apply(get_county)
# newcastlebg['tract'] = newcastlebg['censusgeo'].apply(get_tract)
# newcastlebg['statecountyfp'] = newcastlebg['state'] + newcastlebg['county']

In [None]:
# geoids_aux = GeoIDs[['MSA','StateCounty']]
# geoids_aux = geoids_aux.astype({'StateCounty': str})
# newcastlebg_new = newcastlebg.merge(geoids_aux, left_on='statecountyfp', right_on='StateCounty')
# newcastlebg_new.drop(columns=['StateCounty'], inplace=True)

In [None]:
# assert newcastlebg.shape[0] == newcastlebg_new.shape[0]
# newcastlebg.shape[0]

In [None]:
# df2020 = new_datadict[2020]
# df2020.head()

In [None]:
# df2020 = df2020.reset_index()
# df2020.rename(columns={'index': 'censusgeo'}, inplace=True)
# df2020['geoid'] = df2020['censusgeo'].apply(get_geoid)

In [None]:
# df2020['geoid'].to_csv('geoids2020.csv',index=False)

In [None]:
# real_path = os.path.realpath('geoids2020.csv')
# dir_path = os.path.dirname(real_path)
# print(dir_path)