## ACS 2020 Unemployment data for DC

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re

In [3]:
# load the raw ACS 2020 extract into a DataFrame
raw_path = './data/acs-2020.csv'
df = pd.read_csv(raw_path)

### Preview data

In [4]:
# show the first five rows (the default for head)
df.head()

Unnamed: 0,sumlev,geoid,areaname,stab,state,county,tract,BG,CBSA,metdiv,logrecno,B23025i1,B23025i2,B23025i3,B23025i4,B23025i5,B23025i6,B23025i7
0,Geographic summary level,,,State abbreviation,State code,County code,Tract code,Block group code,CBSA code,Metro division of MSA,,Total:,In labor force:,Civilian labor force:,Employed,Unemployed,Armed Forces,Not in labor force
1,150,15000US110010001011,"Block Group 1, Census Tract 1.01, District of ...",DC,11,District of Columbia DC,101,1,.,,215.0,1134,787,787,769,18,0,347
2,150,15000US110010001021,"Block Group 1, Census Tract 1.02, District of ...",DC,11,District of Columbia DC,102,1,.,,216.0,1251,1099,1099,1099,0,0,152
3,150,15000US110010001022,"Block Group 2, Census Tract 1.02, District of ...",DC,11,District of Columbia DC,102,2,.,,217.0,969,590,590,543,47,0,379
4,150,15000US110010001023,"Block Group 3, Census Tract 1.02, District of ...",DC,11,District of Columbia DC,102,3,.,,218.0,769,575,575,562,13,0,194


### Data wrangling

In [5]:
# keep only the identifier columns plus the two B23025 counts used below
# (per the description row: B23025i3 = civilian labor force, B23025i5 = unemployed)
keep_cols = ['geoid', 'stab', 'state', 'tract', 'BG', 'B23025i3', 'B23025i5']
df_sml = df[keep_cols]
df_sml.head()

Unnamed: 0,geoid,stab,state,tract,BG,B23025i3,B23025i5
0,,State abbreviation,State code,Tract code,Block group code,Civilian labor force:,Unemployed
1,15000US110010001011,DC,11,101,1,787,18
2,15000US110010001021,DC,11,102,1,1099,0
3,15000US110010001022,DC,11,102,2,590,47
4,15000US110010001023,DC,11,102,3,575,13


In [6]:
# give the retained columns descriptive names
new_names = {
    'geoid': 'geoid_long',
    'BG': 'block_group',
    'B23025i3': 'labor_force',
    'B23025i5': 'unemployed',
}
df_sml = df_sml.rename(columns=new_names)
df_sml.head()

Unnamed: 0,geoid_long,stab,state,tract,block_group,labor_force,unemployed
0,,State abbreviation,State code,Tract code,Block group code,Civilian labor force:,Unemployed
1,15000US110010001011,DC,11,101,1,787,18
2,15000US110010001021,DC,11,102,1,1099,0
3,15000US110010001022,DC,11,102,2,590,47
4,15000US110010001023,DC,11,102,3,575,13


In [7]:
# drop the first row, which carries column descriptions rather than data
df_clean = df_sml.iloc[1:]
df_clean.head()

Unnamed: 0,geoid_long,stab,state,tract,block_group,labor_force,unemployed
1,15000US110010001011,DC,11,101,1,787,18
2,15000US110010001021,DC,11,102,1,1099,0
3,15000US110010001022,DC,11,102,2,590,47
4,15000US110010001023,DC,11,102,3,575,13
5,15000US110010002011,DC,11,201,1,0,0


In [8]:
# cast the two counts to integers and the code columns to pandas' string dtype,
# printing the dtypes on either side of the conversion
count_cols = ['labor_force', 'unemployed']
code_cols = ['state', 'tract', 'block_group', 'stab']
target_types = {**dict.fromkeys(count_cols, 'int'), **dict.fromkeys(code_cols, 'string')}

print('Before:', df_clean.dtypes, '\n')
df_clean = df_clean.astype(target_types)
print('After:', df_clean.dtypes)

Before: geoid_long     object
stab           object
state          object
tract          object
block_group    object
labor_force    object
unemployed     object
dtype: object 

After: geoid_long     object
stab           string
state          string
tract          string
block_group    string
labor_force     int32
unemployed      int32
dtype: object


- unemployment rate = (unemployed population) / (civilian labor force) × 100
- geoid = state code (2 digits) + county code (3 digits) + tract code (6 digits) + block group code (1 digit)

In [9]:
# feature generation/modification
# - unemployment_rate: unemployed / labor_force * 100, rounded to 2 decimals
#   (block groups with an empty labor force give NaN from 0/0)
# - county: 3-digit county code taken from geoid_long instead of a hard-coded
#   '001', because df_clean still contains non-DC rows at this point
# - tract: left-padded with zeros to the 6-digit tract code
# - geoid: state code + county code + tract code + block group code
df_clean = df_clean.assign(
    unemployment_rate=lambda d: (d['unemployed'] / d['labor_force'] * 100).round(2),
    # geoid_long is '15000US' followed by state(2) + county(3) + tract(6) + block group(1),
    # so the county code sits at character positions 9-11
    county=lambda d: d['geoid_long'].str[9:12].astype('string'),
    tract=lambda d: d['tract'].str.zfill(6),
    # later keywords see the columns assigned above (zero-padded tract, derived county)
    geoid=lambda d: d['state'] + d['county'] + d['tract'] + d['block_group'],
)

In [10]:
# keep only the rows whose state abbreviation is DC, then report the row
# counts before and after the filter
is_dc = df_clean['stab'] == 'DC'
df_fnl = df_clean[is_dc]
print(len(df_clean), len(df_fnl), sep='\n')
df_fnl.head(10)

10613
571


Unnamed: 0,geoid_long,stab,state,tract,block_group,labor_force,unemployed,unemployment_rate,county,geoid
1,15000US110010001011,DC,11,101,1,787,18,2.29,1,110010001011
2,15000US110010001021,DC,11,102,1,1099,0,0.0,1,110010001021
3,15000US110010001022,DC,11,102,2,590,47,7.97,1,110010001022
4,15000US110010001023,DC,11,102,3,575,13,2.26,1,110010001023
5,15000US110010002011,DC,11,201,1,0,0,,1,110010002011
6,15000US110010002012,DC,11,201,2,1481,135,9.12,1,110010002012
7,15000US110010002021,DC,11,202,1,758,40,5.28,1,110010002021
8,15000US110010002022,DC,11,202,2,385,0,0.0,1,110010002022
9,15000US110010002023,DC,11,202,3,806,23,2.85,1,110010002023
10,15000US110010002024,DC,11,202,4,1050,48,4.57,1,110010002024


In [85]:
# persist the DC-only table; the row index is written as the first column
out_path = './data/acs-2020-DC.csv'
df_fnl.to_csv(out_path)