# Setup

In [112]:
import pandas as pd
import sys
import numpy as np
import scipy as sp
import pickle
import os

import matplotlib.pyplot as plt
from math import radians, cos, sin, asin, sqrt
import datetime
from sklearn.linear_model import LinearRegression
import seaborn as sns
sns.set(style="ticks")
%matplotlib inline

In [113]:
# Load the geography lookup table and keep a list-of-dicts copy of its rows.
# NOTE(review): absolute, machine-specific path; the notebook will not run on another machine as written.
geoids_csv_path = r"C:\Users\jenki\Documents\School\Thesis\Data\Database Production\GeoIDs.csv"
GeoIDs = pd.read_csv(geoids_csv_path, header=0)
geoIDs_records = GeoIDs.to_dict(orient='records')
GeoIDs.columns.to_list()

['MSA', 'State', 'County', 'StateFP', 'CountyFP', 'StateCounty', 'MSAFP']

# Data Compilation

## Loading in downloaded API data

In [115]:
# Load the dictionary of per-year census tables (keyed by year) from the raw pickle.
# The context manager closes the file handle even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code; only load pickle files you created yourself.
with open('downloadedcensusdata.pickle', 'rb') as pickle_in:
    datadict = pickle.load(pickle_in)

In [116]:
# Preview the first row of the raw 2013 table
datadict[2013].head(1)

Unnamed: 0,Year,B01003_001E,B19049_001E,B19301_001E,B23025_003E,B23025_005E,B25001_001E,B25002_003E,B25058_001E,B25064_001E,B25071_001E,B25077_001E,B25083_001E
"Block Group 3, Census Tract 147.03, New Castle County, Delaware: Summary level: 150, state:10> county:003> tract:014703> block group:3",2013,2593,57287,25211.0,1614,170,936,54,921,992,20.5,171800,-666666666


## Extracting georeferencing variables

In [None]:
def _geo_component(censusgeo, position):
    """Return the value half of the (name, value) pair at ``position`` in ``censusgeo.params()``."""
    return censusgeo.params()[position][1]


def get_geoid(censusgeo):
    """Return '15000US' followed by every value in ``censusgeo.params()``, concatenated in order."""
    return '15000US' + ''.join(value for _, value in censusgeo.params())


def get_state(censusgeo):
    """Return the value of the first geography parameter (state)."""
    return _geo_component(censusgeo, 0)


def get_county(censusgeo):
    """Return the value of the second geography parameter (county)."""
    return _geo_component(censusgeo, 1)


def get_tract(censusgeo):
    """Return the value of the third geography parameter (tract)."""
    return _geo_component(censusgeo, 2)


def get_blockgroup(censusgeo):
    """Return the value of the fourth geography parameter (block group)."""
    return _geo_component(censusgeo, 3)

In [None]:
# Survey years 2013 through 2020 (inclusive) used as keys of the data dictionary
years = list(range(2013, 2021))

In [None]:
# Move the index of each yearly table into a regular column named 'censusgeo'.
# NOTE: mutates the tables in place, so re-running this cell adds a second 'censusgeo' column.
for year in years:
    frame = datadict[year]
    frame.reset_index(inplace=True)
    frame.rename(columns={'index': 'censusgeo'}, inplace=True)

In [None]:
# Derive the georeferencing columns from the 'censusgeo' column of every yearly table.
geo_extractors = {
    'geoid': get_geoid,
    'statefp': get_state,
    'countyfp': get_county,
    'tract': get_tract,
    'blockgroup': get_blockgroup,
}
for year in years:
    frame = datadict[year]
    for column, extractor in geo_extractors.items():
        frame[column] = frame['censusgeo'].apply(extractor)
    # Combined state + county code, used later as the join key against the GeoIDs table
    frame['statecountyfp'] = frame['statefp'] + frame['countyfp']

In [None]:
# Subset of the GeoIDs lookup needed to label each block group with its MSA, state and county.
geoids_aux = GeoIDs[['MSA', 'StateCounty', 'State', 'County']].copy()
# If pandas parsed StateCounty as a number, the leading zero of state FIPS codes 01-09 is lost
# ('01001' becomes '1001'). Zero-pad back to 5 characters so the key matches 'statecountyfp',
# which is built by concatenating the state and county code strings.
geoids_aux['StateCounty'] = geoids_aux['StateCounty'].astype(str).str.zfill(5)

In [None]:
# Attach MSA / state / county labels to every yearly table via the state+county code.
# NOTE(review): merge defaults to an inner join, so rows without a matching StateCounty are dropped.
for year in years:
    datadict[year] = (
        datadict[year]
        .merge(geoids_aux, left_on='statecountyfp', right_on='StateCounty')
        .drop(columns=['StateCounty'])
    )

## Removing and renaming variables

In [None]:
# Remove the B25083_001E variable and the raw 'censusgeo' column from every yearly table (in place).
columns_to_drop = ['B25083_001E', 'censusgeo']
for year in years:
    datadict[year].drop(columns=columns_to_drop, inplace=True)

In [None]:
# Map census variable codes and lookup column names to the lowercase names used in the database.
# NOTE: 'mediangrossretnaspercentageofhouseholdincome' is misspelled ("retn") but is kept as-is
# because later cells and saved pickles refer to the column by this exact name.
column_names = {
    'Year': 'year',
    'B01003_001E': 'population',
    'B19049_001E': 'medianhouseholdincome',
    'B19301_001E': 'percapitaincome',
    'B23025_003E': 'totalcivilianlaborforce',
    'B23025_005E': 'unemployedpopulation',
    'B25001_001E': 'totalhousingunits',
    'B25002_003E': 'vacanthousingunits',
    'B25058_001E': 'mediancontractrent',
    'B25064_001E': 'mediangrossrent',
    'B25071_001E': 'mediangrossretnaspercentageofhouseholdincome',
    'B25077_001E': 'medianhomevalue',
    'MSA': 'msa',
    'State': 'state',
    'County': 'county',
}
for year in years:
    datadict[year].rename(columns=column_names, inplace=True)

In [None]:
# Show the column names of the 2013 table
datadict[2013].columns

In [None]:
# Capture the 2013 table's column dtypes as a {column name: dtype} dictionary
dtypes = datadict[2013].dtypes.to_dict()

In [None]:
# Show every column when displaying dataframes (this setting persists for the rest of the session)
pd.set_option('display.max_columns', None)
# Preview the first row of the 2013 table
datadict[2013].head(1)

### Save progress to "clean" pickle! 

In [None]:
# Save the dictionary "datadict" to the "clean" pickle file.
# The context manager closes (and flushes) the file even if pickling raises.
with open('cleanedcensusdatabase.pickle', 'wb') as pickle_out:
    pickle.dump(datadict, pickle_out)

In [None]:
# Report the on-disk size of the cleaned pickle in megabytes (1 MB = 1024**2 bytes)
size_mb = round(os.path.getsize('cleanedcensusdatabase.pickle') / (1024**2), 1)
print('File size of pickle file is', size_mb, 'MB')

Resume point: load the cleaned pickle here to continue working without re-running the cells above.

In [117]:
# Reload the cleaned per-year dictionary saved earlier.
# The context manager closes the file handle even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code; only load pickle files you created yourself.
with open('cleanedcensusdatabase.pickle', 'rb') as pickle_in:
    workingdict = pickle.load(pickle_in)

In [118]:
# Capture the 2013 table's column dtypes as a {column name: dtype} dictionary
dtypes = workingdict[2013].dtypes.to_dict()

In [119]:
# Check for invalid data by looking at the minimum of every column of the 2013 table.
# -666666666 is the placeholder for null/NaN values; being an integer, it can be stored in NumPy int columns.
workingdict[2013].min()

year                                                           2013
population                                                        0
medianhouseholdincome                                    -666666666
percapitaincome                                        -666666666.0
totalcivilianlaborforce                                           0
unemployedpopulation                                              0
totalhousingunits                                                 0
vacanthousingunits                                                0
mediancontractrent                                       -666666666
mediangrossrent                                          -666666666
mediangrossretnaspercentageofhouseholdincome           -666666666.0
medianhomevalue                                          -666666666
geoid                                           15000US100030002001
statefp                                                          10
countyfp                                        

In [120]:
# Display the {column name: dtype} mapping
dtypes

{'year': dtype('int64'),
 'population': dtype('int64'),
 'medianhouseholdincome': dtype('int64'),
 'percapitaincome': dtype('float64'),
 'totalcivilianlaborforce': dtype('int64'),
 'unemployedpopulation': dtype('int64'),
 'totalhousingunits': dtype('int64'),
 'vacanthousingunits': dtype('int64'),
 'mediancontractrent': dtype('int64'),
 'mediangrossrent': dtype('int64'),
 'mediangrossretnaspercentageofhouseholdincome': dtype('float64'),
 'medianhomevalue': dtype('int64'),
 'geoid': dtype('O'),
 'statefp': dtype('O'),
 'countyfp': dtype('O'),
 'tract': dtype('O'),
 'blockgroup': dtype('O'),
 'statecountyfp': dtype('O'),
 'msa': dtype('O'),
 'state': dtype('O'),
 'county': dtype('O')}

## Reading in 1990, 2000, 2010-2012 data

In [None]:
# Folder that holds the pre-2013 database export. It must end with a path separator because
# the file name is appended to it by plain string concatenation.
# Written as a regular (non-raw) string with escaped backslashes: the earlier raw string with
# doubled backslashes put literal '\\' pairs into the path and left one separator single.
# NOTE(review): absolute, machine-specific path.
folder = "C:\\Users\\jenki\\Documents\\School\\Thesis\\Data\\Database Production\\"

In [None]:
# Read the CSV export of the other years' data from the database folder
otherdata_csv_path = folder + "databasecsv.csv"
otherdata = pd.read_csv(otherdata_csv_path)

In [None]:
# Workaround for null values: fill them with the same negative placeholder used in the dictionary,
# then cast every column to the dtype recorded in `dtypes`.
otherdata = otherdata.fillna(-666666666)
otherdata = otherdata.astype(dtypes)

In [None]:
# workingdict[2015].info()

In [None]:
# Years supplied by the CSV export: 1990, 2000 and 2010-2012
otheryears = [1990, 2000, *range(2010, 2013)]

In [None]:
# Split the CSV data by year and store each year's rows as its own table in workingdict
for year in set(otherdata.year):
    rows_for_year = otherdata.year == year
    workingdict[year] = otherdata[rows_for_year].copy()

# Cleaning Database

## Null values
Replace all null values with the placeholder -666666666 so that missing data is encoded the same way across the whole database (both the per-year dictionary and the combined dataframe).

In [None]:
# Snapshot of the years currently present in workingdict
keys = list(workingdict)

In [None]:
# Replace null values in every yearly table with the placeholder -666666666
for year in keys:
    frame = workingdict[year]
    workingdict[year] = frame.fillna(-666666666)

In [None]:
# Create one dataframe by stacking all yearly tables from the dictionary.
# NOTE: each table keeps its own index labels, so labels can repeat in the combined dataframe.
alldata = pd.concat(workingdict.values())

In [None]:
# Check info to confirm the data types and the number of non-null values per column
alldata.info()

## Save progress to new pickles!

In [None]:
# Save the dataframe "alldata" to a pickle file.
# The context manager closes (and flushes) the file even if pickling raises.
with open('alldatadf.pickle', 'wb') as pickle_out:
    pickle.dump(alldata, pickle_out)

In [None]:
# Report the on-disk size of the dataframe pickle in megabytes (1 MB = 1024**2 bytes)
size_mb = round(os.path.getsize('alldatadf.pickle') / (1024**2), 1)
print('File size of pickle file is', size_mb, 'MB')

In [None]:
# Save the dictionary "workingdict" to a new pickle file.
# The context manager closes (and flushes) the file even if pickling raises.
with open('alldatadict.pickle', 'wb') as pickle_out:
    pickle.dump(workingdict, pickle_out)

## Checkpoint
Check how many block group (BG) `geoid` values appear consistently across the available years of data.

In [121]:
# Reload the combined dataframe from its pickle.
# The context manager closes the file handle even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code; only load pickle files you created yourself.
with open('alldatadf.pickle', 'rb') as pickle_in:
    alldata = pickle.load(pickle_in)

In [122]:
# One row per (geoid, year) pair, with the number of non-null 'tract' entries for that pair
g = alldata.groupby(['geoid', 'year'])['tract'].count().reset_index()

In [123]:
# Number of geoid rows in each year
g.groupby('year')[['geoid']].count()

Unnamed: 0_level_0,geoid
year,Unnamed: 1_level_1
1990,17379
2000,15895
2010,15249
2011,15249
2012,15249
2013,15249
2014,15249
2015,15249
2016,15249
2017,15249


In [124]:
# Count the rows (years) recorded for each geoid.
# NOTE: this overwrites `g` with the aggregated table, so re-running this cell on its own
# recounts the already-aggregated rows and gives a different result.
g = g.groupby('geoid')['year'].count().reset_index()

In [125]:
# Number of geoids for each value of the 'year' column of `g`
g.groupby('year')[['geoid']].count()

Unnamed: 0_level_0,geoid
year,Unnamed: 1_level_1
1,13641
2,2202
3,66
10,938
11,3488
12,4962
13,5861


## Remove rows with no data
Moving forward, we will work in the dataframe.
Check for rows where every indicator column (all columns other than year, population and the geographic identifiers) is 0 or -666666666. Where this occurs, drop the row from the database entirely.

In [102]:
# Read in the combined dataframe pickle and check its columns.
# The context manager closes the file handle even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code; only load pickle files you created yourself.
with open('alldatadf.pickle', 'rb') as pickle_in:
    cleaning = pickle.load(pickle_in)
cleaning.columns

Index(['year', 'population', 'medianhouseholdincome', 'percapitaincome',
       'totalcivilianlaborforce', 'unemployedpopulation', 'totalhousingunits',
       'vacanthousingunits', 'mediancontractrent', 'mediangrossrent',
       'mediangrossretnaspercentageofhouseholdincome', 'medianhomevalue',
       'geoid', 'statefp', 'countyfp', 'tract', 'blockgroup', 'statecountyfp',
       'msa', 'state', 'county'],
      dtype='object')

In [104]:
# (rows, columns) of the dataframe at this point
cleaning.shape

(201728, 21)

In [105]:
# Keep rows where at least one of the indicators has a value greater than 0.
indicator_columns = [
    'medianhouseholdincome',
    'percapitaincome',
    'totalcivilianlaborforce',
    'unemployedpopulation',
    'totalhousingunits',
    'vacanthousingunits',
    'mediancontractrent',
    'mediangrossrent',
    'mediangrossretnaspercentageofhouseholdincome',
    'medianhomevalue',
]
# True for rows with any positive indicator; 'population' is not among the tested columns
has_positive_indicator = cleaning[indicator_columns].gt(0).any(axis=1)
cleaning = cleaning[has_positive_indicator]

In [106]:
# (rows, columns) of the dataframe at this point
cleaning.shape

(200512, 21)

Check how many block groups (BGs) remain in each year.

In [107]:
# One row per (geoid, year) pair, with the number of non-null 'tract' entries for that pair
g = cleaning.groupby(['geoid', 'year'])['tract'].count().reset_index()

In [108]:
# Number of geoid rows in each year
g.groupby('year')[['geoid']].count()

Unnamed: 0_level_0,geoid
year,Unnamed: 1_level_1
1990,17195
2000,15809
2010,15192
2011,15197
2012,15197
2013,15198
2014,15200
2015,15200
2016,15200
2017,15201


In [111]:
# Save the dataframe "cleaning" to a pickle file.
# The context manager closes (and flushes) the file even if pickling raises.
with open('cleaneddatadf.pickle', 'wb') as pickle_out:
    pickle.dump(cleaning, pickle_out)