# Data transformation with crosswalks
Crosswalking 1990, 2000, and 2010-2019 data to 2020 geographies using the multipliers that were formed in the previous notebook.

The cleaned data (that is, with all null rows removed) will be transformed using the derived crosswalks.
The approach taken is using the concatenated data (in dataframe) instead of from the dictionary as previously performed.

In [1]:
# Set up all packages that may be used

import pandas as pd
import sys
import numpy as np
import scipy as sp
import pickle
import os

# import matplotlib.pyplot as plt
# from math import radians, cos, sin, asin, sqrt
# import datetime
# from sklearn.linear_model import LinearRegression
# import seaborn as sns
# sns.set(style="ticks")
# %matplotlib inline

pd.set_option('display.max_columns', None)
# import geopandas as gpd
# import contexily as ctx
# import matlabplotlib.pyplot as plt

## Set up data to be used

In [2]:
# Read in multiplier files
newfolder = r"C:\\Users\\jenki\\Documents\\School\\Thesis\\Data\NHGIS Data\\Crosswalks\\GEOID\\CrosswalkMultipliers\\"
x90 = pd.read_csv(newfolder + "multiplierweights90-20.csv")
x00 = pd.read_csv(newfolder + "multiplierweights00-20.csv")
x10 = pd.read_csv(newfolder + "multiplierweights10-20.csv")

In [3]:
# From multiplier files, convert GEOID90BG and GEOID20BG to object type
x90 = x90.astype({'GEOID90BG':str, 'GEOID20BG':str}) 
x00 = x00.astype({'GEOID00BG':str, 'GEOID20BG':str}) 
x10 = x10.astype({'GEOID10BG':str, 'GEOID20BG':str}) 

In [4]:
# Read in database pickle and check
pickle_in = open('cleaneddatadf.pickle', 'rb')
data = pickle.load(pickle_in)
data['year'].unique()

array([2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 1990, 2000, 2010,
       2011, 2012], dtype=int64)

In [5]:
# Establish variables to be used 
indicators = ['population', 'totalcivilianlaborforce','unemployedpopulation','totalhousingunits','vacanthousingunits']
attributes = ['year','tract','state','county','msa','medianhouseholdincome','percapitaincome','mediancontractrent','mediangrossrent','medianhomevalue']
columns = indicators+attributes

## Prepare data for transformation

In [6]:
# Check for na values
data.isna().sum().sum()

0

In [7]:
# add new column 'geoidBG' for reference when performing merge and multiplication. 
data['geoidBG'] = data['geoid'].str[7:]

## Filter and transform 1990 data

In [8]:
data90 = data[data['year']==1990]

In [9]:
# Merge multiplier file to 1990 dataframe
merge90 = data90.merge(x90, left_on='geoidBG', right_on='GEOID90BG', how='inner')

In [10]:
# For all count-based indicators, multiply by weight from crosswalk
for x in indicators:
    merge90[x] = merge90[x]*merge90['WEIGHT']
    merge90[x].apply(np.ceil)
    merge90 = merge90.astype({x:int})

In [11]:
# Group block groups by unique block group values from 2020, using maximum values. Summation only makes sense if the crosswalks are given by block group, or if we are transforming the data from blocks.
group90 = merge90.groupby(['GEOID20BG']).agg({'year':'max','msa':'max','state':'max','county':'max','tract':'max', 'blockgroup':'max',
                                         'population':'max', 'totalcivilianlaborforce':'max', 'unemployedpopulation':'max',
                                         'totalhousingunits':'max', 'vacanthousingunits':'max',
                                         'medianhouseholdincome':'max', 'percapitaincome':'max','mediancontractrent':'max', 'mediangrossrent':'max', 'medianhomevalue':'max', 'mediangrossretnaspercentageofhouseholdincome':'max'
                                         })
group90.reset_index(inplace=True)

## Filter and transform 2000 data

In [12]:
data00 = data[data['year']==2000]

In [13]:
# Merge multiplier file to 2000 dataframe
merge00 = data00.merge(x00, left_on='geoidBG', right_on='GEOID00BG', how='inner')

In [14]:
merge00.GEOID00BG.nunique()

11501

In [15]:
# For all count-based indicators, multiply by weight from crosswalk
for x in indicators:
    merge00[x] = merge00[x]*merge00['WEIGHT']
    merge00[x].apply(np.ceil)
    merge00 = merge00.astype({x:int})

In [16]:
group00 = merge00.groupby(['GEOID20BG']).agg({'year':'max','msa':'max','state':'max','county':'max','tract':'max', 'blockgroup':'max',
                                         'population':'max', 'totalcivilianlaborforce':'max', 'unemployedpopulation':'max',
                                         'totalhousingunits':'max', 'vacanthousingunits':'max',
                                         'medianhouseholdincome':'max', 'percapitaincome':'max','mediancontractrent':'max', 'mediangrossrent':'max', 'medianhomevalue':'max', 'mediangrossretnaspercentageofhouseholdincome':'max'
                                         })
group00.reset_index(inplace=True)

In [17]:
len(group00)

11539

## Filter and transform 2010-2019 data

In [18]:
years = [2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019]

In [19]:
datayears = data.query('year in @years')
len(datayears)

151983

In [20]:
merge = {}
for y in years:
    datayear = data[data['year']==y]
    merge[y] = datayear.merge(x10, left_on='geoidBG', right_on='GEOID10BG', how='inner')

In [21]:
dictgroup = {}

for y in years:
    group = merge[y].groupby(['GEOID20BG']).agg({'year':'max','msa':'max','state':'max','county':'max','tract':'max', 'blockgroup':'max',
                                         'population':'max', 'totalcivilianlaborforce':'max', 'unemployedpopulation':'max',
                                         'totalhousingunits':'max', 'vacanthousingunits':'max',
                                         'medianhouseholdincome':'max', 'percapitaincome':'max','mediancontractrent':'max', 'mediangrossrent':'max', 'medianhomevalue':'max', 'mediangrossretnaspercentageofhouseholdincome':'max'
                                         })
   
    group.reset_index(inplace=True)
    
    dictgroup[y] = group

## Compile all years of transformed data

In [22]:
# Copy dataframes for 1990 and 2000 data (grouped) into the new dictionary of data
dictgroup[1990] = group90.copy()
dictgroup[2000] = group00.copy()

### Clean 2020 data into the same format (columns and all) to insert into the new dictionary of data

In [23]:
dictgroup[2000].columns

Index(['GEOID20BG', 'year', 'msa', 'state', 'county', 'tract', 'blockgroup',
       'population', 'totalcivilianlaborforce', 'unemployedpopulation',
       'totalhousingunits', 'vacanthousingunits', 'medianhouseholdincome',
       'percapitaincome', 'mediancontractrent', 'mediangrossrent',
       'medianhomevalue', 'mediangrossretnaspercentageofhouseholdincome'],
      dtype='object')

In [24]:
# Create new dataframe for 2020 data
data20 = data[data['year']== 2020]

In [25]:
data20.columns

Index(['year', 'population', 'medianhouseholdincome', 'percapitaincome',
       'totalcivilianlaborforce', 'unemployedpopulation', 'totalhousingunits',
       'vacanthousingunits', 'mediancontractrent', 'mediangrossrent',
       'mediangrossretnaspercentageofhouseholdincome', 'medianhomevalue',
       'geoid', 'statefp', 'countyfp', 'tract', 'blockgroup', 'statecountyfp',
       'msa', 'state', 'county', 'geoidBG'],
      dtype='object')

In [26]:
# Rename geoidBG column to GEOID20BG
data20.rename(columns = {'geoidBG':'GEOID20BG'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data20.rename(columns = {'geoidBG':'GEOID20BG'}, inplace = True)


In [27]:
# Add to dictgroup dictionary
dictgroup[2020] = data20.copy()

### Convert all dataframe datatypes into the same types

In [28]:
group90 = group90.astype({'population':"Int64", 
                          'totalcivilianlaborforce':"Int64", 
                          'unemployedpopulation':"Int64", 
                          'totalhousingunits':"Int64", 
                          'vacanthousingunits':"Int64"})

In [29]:
dtypes_new = group90.dtypes.to_dict()

In [30]:
keys = list(dictgroup.keys())
keys

[2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 1990, 2000, 2020]

In [31]:
for key in keys:
    dictgroup[key] = dictgroup[key].astype(dtypes_new)

In [36]:
# Convert all negative values to null
for year in keys:
    dictgroup[year].loc[dictgroup[year]["totalcivilianlaborforce"] < 0, "totalcivilianlaborforce"] = np.nan
    dictgroup[year].loc[dictgroup[year]["unemployedpopulation"] < 0, "unemployedpopulation"] = np.nan
    dictgroup[year].loc[dictgroup[year]["medianhouseholdincome"] < 0, "medianhouseholdincome"] = np.nan
    dictgroup[year].loc[dictgroup[year]["percapitaincome"] < 0, "percapitaincome"] = np.nan
    dictgroup[year].loc[dictgroup[year]["mediancontractrent"] < 0, "mediancontractrent"] = np.nan
    dictgroup[year].loc[dictgroup[year]["mediangrossrent"] < 0, "mediangrossrent"] = np.nan
    dictgroup[year].loc[dictgroup[year]["medianhomevalue"] < 0, "medianhomevalue"] = np.nan
    dictgroup[year].loc[dictgroup[year]["mediangrossretnaspercentageofhouseholdincome"] < 0, "mediangrossretnaspercentageofhouseholdincome"] = np.nan

In [37]:
dictgroup[2010]['totalcivilianlaborforce']

0        <NA>
1        <NA>
2        <NA>
3        <NA>
4        <NA>
         ... 
15937    <NA>
15938    <NA>
15939    <NA>
15940    <NA>
15941    <NA>
Name: totalcivilianlaborforce, Length: 15942, dtype: Int64

In [38]:
for year in keys:
    print(dictgroup[year].isnull().sum())

GEOID20BG                                           0
year                                                0
msa                                                 0
state                                               0
county                                              0
tract                                               0
blockgroup                                          0
population                                          0
totalcivilianlaborforce                         15942
unemployedpopulation                            15942
totalhousingunits                                   0
vacanthousingunits                                  0
medianhouseholdincome                              36
percapitaincome                                    13
mediancontractrent                               2128
mediangrossrent                                  2097
medianhomevalue                                   291
mediangrossretnaspercentageofhouseholdincome     1828
dtype: int64
GEOID20BG      

In [39]:
dataframe = pd.concat(dictgroup.values())

In [40]:
g = dataframe.groupby(['GEOID20BG', 'year']).agg({'tract': 'count'}).reset_index()
g.groupby('year').agg({'GEOID20BG': 'count'})

Unnamed: 0_level_0,GEOID20BG
year,Unnamed: 1_level_1
1990,15977
2000,11539
2010,15942
2011,15949
2012,15954
2013,15954
2014,15957
2015,15957
2016,15957
2017,15944


In [41]:
g = g.groupby(['GEOID20BG']).agg({'year': 'count'}).reset_index()
g.groupby('year').agg({'GEOID20BG': 'count'})

Unnamed: 0_level_0,GEOID20BG
year,Unnamed: 1_level_1
1,14
2,9
3,6
5,4
6,13
7,1
8,3
10,9
11,84
12,4663


# Save all progress to pickles

In [42]:
# save dataframe "dataframe" into pickle file
pickle_out = open('allcrosswalkeddatadf_new.pickle', 'wb')
pickle.dump(dataframe, pickle_out)
pickle_out.close()

In [43]:
# save dictionary "dictgroup" into new pickle file
pickle_out = open('allcrosswalkeddictionary_new.pickle', 'wb')
pickle.dump(dictgroup, pickle_out)
pickle_out.close()