# CENSUS DATA


Assumptions:
- When getting demographic info, assumed the percentages were from the total population
- With employment-related statistics, assumed the percentages were of the total employed (the exception is `Unemployment`, which the code below scales by total population).

# EDA

## Data Exploration

In [1]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)

In [2]:
import os

In [3]:
# Base directory for all input files: the notebook's current working directory.
path = os.getcwd()

### Import data

##### Zip Code Data
- using for city information

In [4]:
# Location of the zip-code reference file, relative to the working directory.
uszip_filename = os.path.join(path, 'workspace/uszips.csv')

In [5]:
# Load the zip-code table (one row per zip; used later for city/state lookup).
uszip = pd.read_csv(uszip_filename)

In [6]:
# Row/column count, then a preview of the first rows.
print(uszip.shape)
uszip.head()

(33121, 18)


Unnamed: 0,zip,lat,lng,city,state_id,state_name,zcta,parent_zcta,population,density,county_fips,county_name,county_weights,county_names_all,county_fips_all,imprecise,military,timezone
0,601,18.18005,-66.75218,Adjuntas,PR,Puerto Rico,True,,17113.0,102.7,72001,Adjuntas,"{""72001"": ""99.43"", ""72141"": ""0.57""}",Adjuntas|Utuado,72001|72141,False,False,America/Puerto_Rico
1,602,18.36074,-67.17519,Aguada,PR,Puerto Rico,True,,37751.0,476.0,72003,Aguada,"{""72003"": ""100""}",Aguada,72003,False,False,America/Puerto_Rico
2,603,18.4544,-67.12201,Aguadilla,PR,Puerto Rico,True,,47081.0,574.9,72005,Aguadilla,"{""72005"": ""100""}",Aguadilla,72005,False,False,America/Puerto_Rico
3,606,18.16721,-66.93828,Maricao,PR,Puerto Rico,True,,6392.0,58.3,72093,Maricao,"{""72093"": ""94.88"", ""72153"": ""3.78"", ""72121"": ""...",Maricao|Yauco|Sabana Grande,72093|72153|72121,False,False,America/Puerto_Rico
4,610,18.29032,-67.12244,Anasco,PR,Puerto Rico,True,,26686.0,286.9,72011,Añasco,"{""72011"": ""99.45"", ""72003"": ""0.55""}",Añasco|Aguada,72011|72003,False,False,America/Puerto_Rico


In [7]:
# Zip codes were parsed as integers, which drops leading zeros; restore the
# 5-character string form (e.g. 601 -> '00601').
uszip = uszip.assign(zip=uszip['zip'].astype(str).str.zfill(5))

In [8]:
# Number of distinct zips; equal to the row count when every zip appears once.
uszip['zip'].nunique()

33121

##### Census 
- Contains population, demographic, poverty, and employment data by tract

In [9]:
# Location of the tract-level census file, relative to the working directory.
census_filename = os.path.join(path, 'workspace/acs2017_census_tract_data.csv')

In [10]:
# Load the tract-level census table, then show its size and first rows.
df = pd.read_csv(census_filename)
print(df.shape)
df.head()

(74001, 37)


Unnamed: 0,TractId,State,County,TotalPop,Men,Women,Hispanic,White,Black,Native,Asian,Pacific,VotingAgeCitizen,Income,IncomeErr,IncomePerCap,IncomePerCapErr,Poverty,ChildPoverty,Professional,Service,Office,Construction,Production,Drive,Carpool,Transit,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
0,1001020100,Alabama,Autauga County,1845,899,946,2.4,86.3,5.2,0.0,1.2,0.0,1407,67826.0,14560.0,33018.0,6294.0,10.7,20.8,38.5,15.6,22.8,10.8,12.4,94.2,3.3,0.0,0.5,0.0,2.1,24.5,881,74.2,21.2,4.5,0.0,4.6
1,1001020200,Alabama,Autauga County,2172,1167,1005,1.1,41.6,54.5,0.0,1.0,0.0,1652,41287.0,3819.0,18996.0,2453.0,22.4,35.8,30.5,24.9,22.9,6.3,15.4,90.5,9.1,0.0,0.0,0.5,0.0,22.2,852,75.9,15.0,9.0,0.0,3.4
2,1001020300,Alabama,Autauga County,3385,1533,1852,8.0,61.4,26.5,0.6,0.7,0.4,2480,46806.0,9496.0,21236.0,2562.0,14.7,21.1,27.9,19.4,33.3,9.9,9.6,88.3,8.4,0.0,1.0,0.8,1.5,23.1,1482,73.3,21.1,4.8,0.7,4.7
3,1001020400,Alabama,Autauga County,4267,2001,2266,9.6,80.3,7.1,0.5,0.2,0.0,3257,55895.0,4369.0,28068.0,3190.0,2.3,1.7,29.0,16.6,25.8,9.1,19.5,82.3,11.2,0.0,1.5,2.9,2.1,25.9,1849,75.8,19.7,4.5,0.0,6.1
4,1001020500,Alabama,Autauga County,9965,5054,4911,0.9,77.5,16.4,0.0,3.1,0.0,7229,68143.0,14424.0,36905.0,10706.0,12.2,17.9,48.8,13.8,20.5,3.5,13.4,86.9,11.2,0.0,0.8,0.3,0.7,21.0,4787,71.4,24.1,4.5,0.0,2.3


In [11]:
# Number of distinct tracts; equal to the row count when TractId is a unique key.
df['TractId'].nunique()

74001

##### Tract to Zip
- maps census tracts to zip codes

In [12]:
# Location of the tract-to-zip crosswalk workbook, relative to the working directory.
tract_zip_filename = os.path.join(path, 'workspace/TRACT_ZIP_122017.xlsx')

In [13]:
# Load the crosswalk: one row per (tract, zip) pair with allocation ratios.
tract_zip = pd.read_excel(tract_zip_filename)

In [14]:
# Row/column count, then a preview of the first rows.
print(tract_zip.shape)
tract_zip.head()

(168662, 6)


Unnamed: 0,tract,zip,res_ratio,bus_ratio,oth_ratio,tot_ratio
0,1001020100,36067,1.0,1.0,1.0,1.0
1,1001020200,36008,0.028243,0.012987,0.0,0.023846
2,1001020200,36067,0.971757,0.987013,1.0,0.976154
3,1001020300,36067,1.0,1.0,1.0,1.0
4,1001020400,36067,0.025543,0.585987,0.25,0.071038


In [15]:
# Only the residential ratio is used downstream; discard the other ratio columns.
tract_zip = tract_zip.loc[:, ['tract', 'zip', 'res_ratio']]

In [16]:
# Use the census table's key name so the two frames can be joined on it.
tract_zip = tract_zip.rename({'tract': 'TractId'}, axis='columns')

In [17]:
# Distinct zip codes covered by the crosswalk.
tract_zip['zip'].nunique()

39301

In [18]:
# Distinct tracts covered by the crosswalk (compare with the census tract count above).
tract_zip['TractId'].nunique()

73584

## Merge Census with Tract_Zip Data

In [19]:
# Inner join: each census tract is repeated once per zip code it maps to, and
# tracts/zips present in only one of the two tables are dropped.
merge = pd.merge(df, tract_zip, how='inner', on='TractId')

In [20]:
# Size and first rows of the joined (tract, zip) table.
print(merge.shape)
merge.head()

(168385, 39)


Unnamed: 0,TractId,State,County,TotalPop,Men,Women,Hispanic,White,Black,Native,Asian,Pacific,VotingAgeCitizen,Income,IncomeErr,IncomePerCap,IncomePerCapErr,Poverty,ChildPoverty,Professional,Service,Office,Construction,Production,Drive,Carpool,Transit,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment,zip,res_ratio
0,1001020100,Alabama,Autauga County,1845,899,946,2.4,86.3,5.2,0.0,1.2,0.0,1407,67826.0,14560.0,33018.0,6294.0,10.7,20.8,38.5,15.6,22.8,10.8,12.4,94.2,3.3,0.0,0.5,0.0,2.1,24.5,881,74.2,21.2,4.5,0.0,4.6,36067,1.0
1,1001020200,Alabama,Autauga County,2172,1167,1005,1.1,41.6,54.5,0.0,1.0,0.0,1652,41287.0,3819.0,18996.0,2453.0,22.4,35.8,30.5,24.9,22.9,6.3,15.4,90.5,9.1,0.0,0.0,0.5,0.0,22.2,852,75.9,15.0,9.0,0.0,3.4,36008,0.028243
2,1001020200,Alabama,Autauga County,2172,1167,1005,1.1,41.6,54.5,0.0,1.0,0.0,1652,41287.0,3819.0,18996.0,2453.0,22.4,35.8,30.5,24.9,22.9,6.3,15.4,90.5,9.1,0.0,0.0,0.5,0.0,22.2,852,75.9,15.0,9.0,0.0,3.4,36067,0.971757
3,1001020300,Alabama,Autauga County,3385,1533,1852,8.0,61.4,26.5,0.6,0.7,0.4,2480,46806.0,9496.0,21236.0,2562.0,14.7,21.1,27.9,19.4,33.3,9.9,9.6,88.3,8.4,0.0,1.0,0.8,1.5,23.1,1482,73.3,21.1,4.8,0.7,4.7,36067,1.0
4,1001020400,Alabama,Autauga County,4267,2001,2266,9.6,80.3,7.1,0.5,0.2,0.0,3257,55895.0,4369.0,28068.0,3190.0,2.3,1.7,29.0,16.6,25.8,9.1,19.5,82.3,11.2,0.0,1.5,2.9,2.1,25.9,1849,75.8,19.7,4.5,0.0,6.1,36067,0.025543


### Seeing what does not transfer over

In [21]:
# Census tracts that found no partner in the crosswalk and so are absent from `merge`.
census_only = ~df['TractId'].isin(merge['TractId'])
df.loc[census_only]

Unnamed: 0,TractId,State,County,TotalPop,Men,Women,Hispanic,White,Black,Native,Asian,Pacific,VotingAgeCitizen,Income,IncomeErr,IncomePerCap,IncomePerCapErr,Poverty,ChildPoverty,Professional,Service,Office,Construction,Production,Drive,Carpool,Transit,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
43,1003990000,Alabama,Baldwin County,0,0,0,,,,,,,0,,,,,,,,,,,,,,,,,,,0,,,,,
868,1097990000,Alabama,Mobile County,0,0,0,,,,,,,0,,,,,,,,,,,,,,,,,,,0,,,,,
1266,2105000200,Alaska,Hoonah-Angoon Census Area,62,35,27,0.0,11.3,0.0,82.3,0.0,0.0,60,42500.0,9004.0,23827.0,3232.0,6.5,50.0,39.3,32.1,7.1,10.7,10.7,51.9,0.0,0.0,25.9,0.0,22.2,13.8,28,32.1,35.7,32.1,0.0,12.5
1324,2185000100,Alaska,North Slope Borough,4444,2366,2078,5.3,13.3,0.3,59.4,10.5,3.3,2769,82964.0,13376.0,31860.0,5094.0,11.2,13.0,35.6,15.0,26.4,7.4,15.6,48.1,26.8,0.5,11.9,9.0,3.8,6.6,1800,44.3,54.4,1.3,0.0,14.2
1353,4001944202,Arizona,Apache County,4094,1933,2161,3.4,0.5,0.4,93.9,0.2,0.0,2561,26296.0,6947.0,10079.0,1702.0,53.9,70.0,43.8,17.6,12.8,13.4,12.3,73.1,13.7,2.1,7.5,0.0,3.5,22.8,851,39.4,56.9,3.8,0.0,28.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73982,72151950700,Puerto Rico,Yabucoa Municipio,4744,2377,2367,100.0,0.0,0.0,0.0,0.0,0.0,4061,16087.0,3373.0,8157.0,1248.0,53.6,42.2,41.0,12.1,22.1,12.9,11.9,85.5,14.5,0.0,0.0,0.0,0.0,33.9,1034,61.4,36.2,2.4,0.0,17.9
73983,72151950800,Puerto Rico,Yabucoa Municipio,3531,1915,1616,100.0,0.0,0.0,0.0,0.0,0.0,2924,15767.0,2603.0,8133.0,1350.0,42.4,52.7,12.7,31.9,21.2,9.2,25.0,81.1,13.7,0.0,2.7,2.5,0.0,25.0,780,71.0,18.2,10.8,0.0,30.4
73984,72151950900,Puerto Rico,Yabucoa Municipio,5822,2892,2930,99.7,0.3,0.0,0.0,0.0,0.0,4380,13841.0,3098.0,7088.0,1162.0,58.7,66.5,29.0,24.1,25.5,9.2,12.2,95.7,2.0,0.0,2.3,0.0,0.0,32.3,1143,66.0,29.4,4.6,0.0,28.7
73989,72151990000,Puerto Rico,Yabucoa Municipio,0,0,0,,,,,,,0,,,,,,,,,,,,,,,,,,,0,,,,,


In [22]:
# Crosswalk rows whose tract is not in the census table and so are absent from `merge`.
crosswalk_only = ~tract_zip['TractId'].isin(merge['TractId'])
tract_zip.loc[crosswalk_only]

Unnamed: 0,TractId,zip,res_ratio
543,1037030200,36080,1.0
2828,1115050101,35146,1.0
3017,1123231500,36866,1.0
3673,2270000100,99563,0.0
3674,2270000100,99620,0.0
...,...,...,...
168657,78030960800,802,1.0
168658,78030960900,802,1.0
168659,78030961000,802,1.0
168660,78030961100,802,1.0


## Data Cleaning

### isnull().sum()

In [23]:
# Missing-value count per column before any cleaning.
merge.isnull().sum()

TractId                0
State                  0
County                 0
TotalPop               0
Men                    0
Women                  0
Hispanic             506
White                506
Black                506
Native               506
Asian                506
Pacific              506
VotingAgeCitizen       0
Income              1291
IncomeErr           1291
IncomePerCap         596
IncomePerCapErr      596
Poverty              710
ChildPoverty        1286
Professional         652
Service              652
Office               652
Construction         652
Production           652
Drive                633
Carpool              633
Transit              633
Walk                 633
OtherTransp          633
WorkAtHome           633
MeanCommute          949
Employed               0
PrivateWork          652
PublicWork           652
SelfEmployed         652
FamilyWork           652
Unemployment         650
zip                    0
res_ratio              0
dtype: int64

### Fill na with 0

In [24]:
# Treat every missing value as 0 so the arithmetic below never propagates NaN.
merge = merge.fillna(0)

In [25]:
# Re-check: every column should now report 0 missing values.
merge.isnull().sum()

TractId             0
State               0
County              0
TotalPop            0
Men                 0
Women               0
Hispanic            0
White               0
Black               0
Native              0
Asian               0
Pacific             0
VotingAgeCitizen    0
Income              0
IncomeErr           0
IncomePerCap        0
IncomePerCapErr     0
Poverty             0
ChildPoverty        0
Professional        0
Service             0
Office              0
Construction        0
Production          0
Drive               0
Carpool             0
Transit             0
Walk                0
OtherTransp         0
WorkAtHome          0
MeanCommute         0
Employed            0
PrivateWork         0
PublicWork          0
SelfEmployed        0
FamilyWork          0
Unemployment        0
zip                 0
res_ratio           0
dtype: int64

In [26]:
# Distinct State values present in the joined table.
merge.State.unique()

array(['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
       'Colorado', 'Connecticut', 'Delaware', 'District of Columbia',
       'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana',
       'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
       'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi',
       'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire',
       'New Jersey', 'New Mexico', 'New York', 'North Carolina',
       'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania',
       'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee',
       'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
       'West Virginia', 'Wisconsin', 'Wyoming', 'Puerto Rico'],
      dtype=object)

### Remove non-states (Puerto Rico)

In [27]:
# Remove areas we don't have in other df
# .copy() makes the filtered result an independent frame: the cells below update
# its columns in place (`/=`, `merge[col] = ...`), which would otherwise trigger
# pandas' SettingWithCopyWarning on a filtered selection.
merge = merge[merge['State'] != 'Puerto Rico'].copy()

In [28]:
# Distinct State values remaining after the filter.
merge['State'].nunique()

51

## Map Tracts to Zip Code
* first multiply % columns by total population
* multiply % columns by employed
* multiply residential ratio across rows

### Make Numbers Compatible

#### Dividing certain columns by 100


##### Making sure I capture the correct columns

In [29]:
# Positions 6..11 (slice end is exclusive, hence the +1): expected to be the six
# race/ethnicity percentage columns.
merge.iloc[:,6:11+1]

Unnamed: 0,Hispanic,White,Black,Native,Asian,Pacific
0,2.4,86.3,5.2,0.0,1.2,0.0
1,1.1,41.6,54.5,0.0,1.0,0.0
2,1.1,41.6,54.5,0.0,1.0,0.0
3,8.0,61.4,26.5,0.6,0.7,0.4
4,9.6,80.3,7.1,0.5,0.2,0.0
...,...,...,...,...,...,...
166947,3.0,89.4,1.0,0.1,3.0,0.0
166948,3.0,89.4,1.0,0.1,3.0,0.0
166949,3.0,89.4,1.0,0.1,3.0,0.0
166950,3.0,89.4,1.0,0.1,3.0,0.0


In [30]:
# Positions 17..30: expected to run from Poverty through MeanCommute.
merge.iloc[:,17:30+1]

Unnamed: 0,Poverty,ChildPoverty,Professional,Service,Office,Construction,Production,Drive,Carpool,Transit,Walk,OtherTransp,WorkAtHome,MeanCommute
0,10.7,20.8,38.5,15.6,22.8,10.8,12.4,94.2,3.3,0.0,0.5,0.0,2.1,24.5
1,22.4,35.8,30.5,24.9,22.9,6.3,15.4,90.5,9.1,0.0,0.0,0.5,0.0,22.2
2,22.4,35.8,30.5,24.9,22.9,6.3,15.4,90.5,9.1,0.0,0.0,0.5,0.0,22.2
3,14.7,21.1,27.9,19.4,33.3,9.9,9.6,88.3,8.4,0.0,1.0,0.8,1.5,23.1
4,2.3,1.7,29.0,16.6,25.8,9.1,19.5,82.3,11.2,0.0,1.5,2.9,2.1,25.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166947,12.5,18.5,32.2,15.9,13.4,20.5,17.9,72.2,8.0,9.5,2.1,3.0,5.2,31.3
166948,12.5,18.5,32.2,15.9,13.4,20.5,17.9,72.2,8.0,9.5,2.1,3.0,5.2,31.3
166949,12.5,18.5,32.2,15.9,13.4,20.5,17.9,72.2,8.0,9.5,2.1,3.0,5.2,31.3
166950,12.5,18.5,32.2,15.9,13.4,20.5,17.9,72.2,8.0,9.5,2.1,3.0,5.2,31.3


In [31]:
# Positions 32..36: expected to run from PrivateWork through Unemployment
# (position 31, Employed, is a count and is deliberately skipped).
merge.iloc[:,32:36+1]

Unnamed: 0,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
0,74.2,21.2,4.5,0.0,4.6
1,75.9,15.0,9.0,0.0,3.4
2,75.9,15.0,9.0,0.0,3.4
3,73.3,21.1,4.8,0.7,4.7
4,75.8,19.7,4.5,0.0,6.1
...,...,...,...,...,...
166947,64.6,22.9,10.9,1.6,0.4
166948,64.6,22.9,10.9,1.6,0.4
166949,64.6,22.9,10.9,1.6,0.4
166950,64.6,22.9,10.9,1.6,0.4


In [32]:
# Convert the race/ethnicity percentages (0-100) to fractions (0-1).
# Columns are addressed by name rather than by position so the cell keeps working
# if the column order ever changes. Run once: re-running divides again.
race_pct_cols = ['Hispanic', 'White', 'Black', 'Native', 'Asian', 'Pacific']
merge[race_pct_cols] = merge[race_pct_cols] / 100

In [33]:
# Divide Poverty .. MeanCommute by 100 (same columns the positional slice 17:31
# selected above), addressed by name so a column-order change cannot silently
# rescale the wrong fields. Run once: re-running divides again.
poverty_to_commute_cols = ['Poverty', 'ChildPoverty',
                           'Professional', 'Service', 'Office', 'Construction', 'Production',
                           'Drive', 'Carpool', 'Transit', 'Walk', 'OtherTransp', 'WorkAtHome',
                           'MeanCommute']
merge[poverty_to_commute_cols] = merge[poverty_to_commute_cols] / 100

In [34]:
# Divide PrivateWork .. Unemployment by 100 (same columns the positional slice
# 32:37 selected above), addressed by name for robustness to column reordering.
# Run once: re-running divides again.
worktype_pct_cols = ['PrivateWork', 'PublicWork', 'SelfEmployed', 'FamilyWork', 'Unemployment']
merge[worktype_pct_cols] = merge[worktype_pct_cols] / 100

In [35]:
# Preview: the percentage columns should now be fractions.
merge.head()

Unnamed: 0,TractId,State,County,TotalPop,Men,Women,Hispanic,White,Black,Native,Asian,Pacific,VotingAgeCitizen,Income,IncomeErr,IncomePerCap,IncomePerCapErr,Poverty,ChildPoverty,Professional,Service,Office,Construction,Production,Drive,Carpool,Transit,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment,zip,res_ratio
0,1001020100,Alabama,Autauga County,1845,899,946,0.024,0.863,0.052,0.0,0.012,0.0,1407,67826.0,14560.0,33018.0,6294.0,0.107,0.208,0.385,0.156,0.228,0.108,0.124,0.942,0.033,0.0,0.005,0.0,0.021,0.245,881,0.742,0.212,0.045,0.0,0.046,36067,1.0
1,1001020200,Alabama,Autauga County,2172,1167,1005,0.011,0.416,0.545,0.0,0.01,0.0,1652,41287.0,3819.0,18996.0,2453.0,0.224,0.358,0.305,0.249,0.229,0.063,0.154,0.905,0.091,0.0,0.0,0.005,0.0,0.222,852,0.759,0.15,0.09,0.0,0.034,36008,0.028243
2,1001020200,Alabama,Autauga County,2172,1167,1005,0.011,0.416,0.545,0.0,0.01,0.0,1652,41287.0,3819.0,18996.0,2453.0,0.224,0.358,0.305,0.249,0.229,0.063,0.154,0.905,0.091,0.0,0.0,0.005,0.0,0.222,852,0.759,0.15,0.09,0.0,0.034,36067,0.971757
3,1001020300,Alabama,Autauga County,3385,1533,1852,0.08,0.614,0.265,0.006,0.007,0.004,2480,46806.0,9496.0,21236.0,2562.0,0.147,0.211,0.279,0.194,0.333,0.099,0.096,0.883,0.084,0.0,0.01,0.008,0.015,0.231,1482,0.733,0.211,0.048,0.007,0.047,36067,1.0
4,1001020400,Alabama,Autauga County,4267,2001,2266,0.096,0.803,0.071,0.005,0.002,0.0,3257,55895.0,4369.0,28068.0,3190.0,0.023,0.017,0.29,0.166,0.258,0.091,0.195,0.823,0.112,0.0,0.015,0.029,0.021,0.259,1849,0.758,0.197,0.045,0.0,0.061,36067,0.025543


### Multiply 

##### Multiply Columns by Total Population

In [37]:
# Share columns whose denominator is taken to be the tract's total population.
col = (
    ['Hispanic', 'White', 'Black', 'Native', 'Asian', 'Pacific']  # diversity
    + ['Poverty', 'ChildPoverty', 'Unemployment']
)

In [38]:
# fraction x TotalPop -> estimated head count, row by row.
merge[col] = merge[col].mul(merge['TotalPop'], axis=0)

In [39]:
# Head counts are whole people: cast to int (truncates toward zero).
merge = merge.astype(dict.fromkeys(col, int))

##### Multiply Columns by Employment

In [40]:
# Columns whose denominator is taken to be the number of employed people.
employment = (
    ['Professional', 'Service', 'Office', 'Construction', 'Production']      # industries
    + ['Drive', 'Carpool', 'Transit', 'Walk', 'OtherTransp', 'WorkAtHome']   # transportation
    + ['MeanCommute']
    + ['PrivateWork', 'PublicWork', 'SelfEmployed', 'FamilyWork']            # employment type
)

In [41]:
# fraction x Employed -> estimated worker count, row by row.
merge[employment] = merge[employment].mul(merge['Employed'], axis=0)

In [42]:
# Worker counts are whole people: cast to int (truncates toward zero).
merge = merge.astype(dict.fromkeys(employment, int))

##### Check values

In [43]:
# Pull one known Wyoming tract from the untouched census table as a reference row.
is_wyoming_sample = (
    df['State'].eq('Wyoming')
    & df['TotalPop'].eq(3329)
    & df['TractId'].eq(56045951100)
)
df.loc[is_wyoming_sample]

Unnamed: 0,TractId,State,County,TotalPop,Men,Women,Hispanic,White,Black,Native,Asian,Pacific,VotingAgeCitizen,Income,IncomeErr,IncomePerCap,IncomePerCapErr,Poverty,ChildPoverty,Professional,Service,Office,Construction,Production,Drive,Carpool,Transit,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
73054,56045951100,Wyoming,Weston County,3329,1946,1383,3.0,89.4,1.0,0.1,3.0,0.0,2700,62435.0,9525.0,32665.0,4047.0,12.5,18.5,32.2,15.9,13.4,20.5,17.9,72.2,8.0,9.5,2.1,3.0,5.2,31.3,1583,64.6,22.9,10.9,1.6,0.4


In [44]:
# One of the merged rows for that same tract, after conversion to counts.
# A label slice (rather than a scalar label) keeps the result a one-row DataFrame.
merge.loc[166948:166948]

Unnamed: 0,TractId,State,County,TotalPop,Men,Women,Hispanic,White,Black,Native,Asian,Pacific,VotingAgeCitizen,Income,IncomeErr,IncomePerCap,IncomePerCapErr,Poverty,ChildPoverty,Professional,Service,Office,Construction,Production,Drive,Carpool,Transit,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment,zip,res_ratio
166948,56045951100,Wyoming,Weston County,3329,1946,1383,99,2976,33,3,99,0,2700,62435.0,9525.0,32665.0,4047.0,416,615,509,251,212,324,283,1142,126,150,33,47,82,495,1583,1022,362,172,25,13,82729,0.002424


In [45]:
# Expected Hispanic count: 3.0% of TotalPop 3329, truncated.
int(3/100 * 3329)

99

In [46]:
# Expected White count: 89.4% of TotalPop 3329, truncated.
int(89.4/100 * 3329)

2976

In [47]:
# Expected PrivateWork count: 64.6% of Employed 1583, truncated.
int(64.6/100  * 1583)

1022

In [48]:
# Expected FamilyWork count: 1.6% of Employed 1583, truncated.
int(1.6/100  * 1583)

25

In [49]:
# Row 166948 must match the hand-computed Hispanic count.
assert merge.at[166948, 'Hispanic'] == 99

In [50]:
# Row 166948 must match the hand-computed White count.
assert merge.at[166948, 'White'] == 2976

In [51]:
# Row 166948 must match the hand-computed PrivateWork count.
assert merge.at[166948, 'PrivateWork'] == 1022

In [52]:
# Row 166948 must match the hand-computed FamilyWork count.
assert merge.at[166948, 'FamilyWork'] == 25

In [53]:
# Pull one known Alabama tract from the untouched census table as a second reference row.
is_alabama_sample = (
    df['State'].eq('Alabama')
    & df['TotalPop'].eq(7131)
    & df['TractId'].eq(1003011601)
)
df.loc[is_alabama_sample]

Unnamed: 0,TractId,State,County,TotalPop,Men,Women,Hispanic,White,Black,Native,Asian,Pacific,VotingAgeCitizen,Income,IncomeErr,IncomePerCap,IncomePerCapErr,Poverty,ChildPoverty,Professional,Service,Office,Construction,Production,Drive,Carpool,Transit,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
41,1003011601,Alabama,Baldwin County,7131,3699,3432,9.7,87.6,1.4,0.7,0.1,0.0,5232,49900.0,9397.0,21858.0,3270.0,22.4,35.1,25.4,23.9,22.6,11.3,16.7,83.4,7.4,0.0,0.5,0.3,8.4,29.3,3260,76.8,9.5,13.5,0.2,3.1


In [54]:
# A merged row for that same Alabama tract, after conversion to counts.
merge.loc[105:105]

Unnamed: 0,TractId,State,County,TotalPop,Men,Women,Hispanic,White,Black,Native,Asian,Pacific,VotingAgeCitizen,Income,IncomeErr,IncomePerCap,IncomePerCapErr,Poverty,ChildPoverty,Professional,Service,Office,Construction,Production,Drive,Carpool,Transit,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment,zip,res_ratio
105,1003011601,Alabama,Baldwin County,7131,3699,3432,691,6246,99,49,7,0,5232,49900.0,9397.0,21858.0,3270.0,1597,2502,828,779,736,368,544,2718,241,0,16,9,273,955,3260,2503,309,440,6,221,36580,0.01071


In [55]:
# Expected Hispanic count: 9.7% of TotalPop 7131, truncated.
int(9.7/100 * 7131)

691

In [56]:
# Expected Professional count: 25.4% of Employed 3260, truncated.
int(25.4/100 * 3260)

828

In [57]:
# Expected Construction count: 11.3% of Employed 3260, truncated.
int(11.3/100 * 3260)

368

In [58]:
# Expected PublicWork count: 9.5% of Employed 3260, truncated.
int(9.5/100 * 3260)

309

In [59]:
# Expected SelfEmployed count: 13.5% of Employed 3260, truncated.
int(13.5/100 * 3260)

440

In [60]:
# Row 105 must match the hand-computed Hispanic count.
assert merge.at[105, 'Hispanic'] == 691

In [61]:
# Row 105 must match the hand-computed Professional count.
assert merge.at[105, 'Professional'] == 828

In [62]:
# Row 105 must match the hand-computed Construction count.
assert merge.at[105, 'Construction'] == 368

In [63]:
# Row 105 must match the hand-computed PublicWork count.
assert merge.at[105, 'PublicWork'] == 309

In [64]:
# Row 105 must match the hand-computed SelfEmployed count.
assert merge.at[105, 'SelfEmployed'] == 440

#### Multiply each row by residential ratio

In [65]:
# Every numeric column that gets apportioned to a zip code by the residential ratio.
newcol = (
    ['TotalPop', 'Men', 'Women']
    + ['Hispanic', 'White', 'Black', 'Native', 'Asian', 'Pacific']
    + ['VotingAgeCitizen']
    + ['Income', 'IncomeErr', 'IncomePerCap', 'IncomePerCapErr']
    + ['Poverty', 'ChildPoverty']
    + ['Professional', 'Service', 'Office', 'Construction', 'Production']
    + ['Drive', 'Carpool', 'Transit', 'Walk', 'OtherTransp', 'WorkAtHome', 'MeanCommute']
    + ['Employed']
    + ['PrivateWork', 'PublicWork', 'SelfEmployed', 'FamilyWork']
    + ['Unemployment']
)

In [66]:
# Allocate each tract's figures to the zip on this row in proportion to res_ratio.
merge[newcol] = merge[newcol].mul(merge['res_ratio'], axis=0)

In [67]:
# Back to whole numbers after the allocation (truncates toward zero).
merge = merge.astype(dict.fromkeys(newcol, int))

In [69]:
# Smallest and largest zip (still numeric here); a minimum below 10000 shows that
# leading zeros are missing.
int(merge['zip'].min()), int(merge['zip'].max())

(501, 99929)

In [70]:
# Restore the 5-character string form so zips match the keys in `uszip`.
merge = merge.assign(zip=merge['zip'].astype(str).str.zfill(5))

In [71]:
# The ratio has been applied; it is no longer needed.
merge = merge.drop('res_ratio', axis='columns')

## Map Zip Codes to City

In [73]:
# Columns available in the zip reference table.
uszip.columns

Index(['zip', 'lat', 'lng', 'city', 'state_id', 'state_name', 'zcta',
       'parent_zcta', 'population', 'density', 'county_fips', 'county_name',
       'county_weights', 'county_names_all', 'county_fips_all', 'imprecise',
       'military', 'timezone'],
      dtype='object')

In [74]:
# Keep only the join key and the place-name fields.
uszip = uszip.loc[:, ['zip', 'city', 'state_name', 'state_id', 'county_name']]

In [75]:
# Attach city/state to every (tract, zip) row; rows whose zip is not in `uszip`
# are dropped by the inner join.
merge = pd.merge(merge, uszip, how='inner', on='zip')

In [76]:
# Match the capitalised naming used by the census columns.
merge = merge.rename({'city': 'City', 'state_id': 'State_Id'}, axis='columns')

##  Group by Cities

In [77]:
# Collapse all (tract, zip) rows that share a city and state into one row by summing.
# numeric_only=True keeps the text columns (State, County, zip, state_name,
# county_name) out of the aggregation. Without it, pandas 2.x concatenates those
# strings instead of dropping them, which changes the column layout that the
# positional cells below rely on.
# NOTE(review): TractId and the Income* columns are summed as well, so after this
# step they are totals rather than identifiers/averages - confirm that downstream
# consumers expect that.
merge = merge.groupby(['City', 'State_Id'], as_index=False).sum(numeric_only=True)

In [78]:
# Preview the city-level totals.
merge.head()

Unnamed: 0,City,State_Id,TractId,TotalPop,Men,Women,Hispanic,White,Black,Native,Asian,Pacific,VotingAgeCitizen,Income,IncomeErr,IncomePerCap,IncomePerCapErr,Poverty,ChildPoverty,Professional,Service,Office,Construction,Production,Drive,Carpool,Transit,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
0,Aaronsburg,PA,42027010800,1056,532,524,11,1029,0,0,7,0,739,10258,1258,4516,324,173,334,138,106,95,78,83,349,62,1,19,18,51,139,502,384,54,62,0,33
1,Abbeville,AL,4268121200,6693,3361,3329,188,3999,2441,2,0,0,5612,111583,24675,60066,8645,1252,2581,584,540,497,181,668,2123,231,0,39,30,48,654,2485,2031,335,112,0,453
2,Abbeville,GA,39947880700,4683,3205,1478,203,2229,2173,5,0,0,3869,42497,10566,16894,4017,1543,2318,209,242,110,114,138,726,55,0,11,5,15,244,824,485,276,57,0,184
3,Abbeville,LA,243253457806,25185,12341,12839,894,16810,6186,51,820,0,17956,286959,55459,135638,17577,5334,6718,2664,1951,2804,1499,1477,8380,1226,12,354,136,301,2898,10437,8553,1340,525,0,1851
4,Abbeville,MS,84215850903,2078,1033,1042,65,1270,688,0,7,0,1557,23656,3944,12435,1903,439,691,281,181,127,78,212,757,89,7,2,0,25,276,887,700,126,58,0,164


### Ratios
- return df back to census form (counts → percentages)

##### Multiply columns by 100

In [79]:
# Scale the summed race/ethnicity counts by 100 so the division by TotalPop below
# yields percentages. Columns are named explicitly: the frame was rebuilt by the
# groupby above, so positional slices are only correct by coincidence of layout.
# Run once: re-running multiplies again.
race_count_cols = ['Hispanic', 'White', 'Black', 'Native', 'Asian', 'Pacific']
merge[race_count_cols] = merge[race_count_cols] * 100

In [80]:
# Scale Poverty .. MeanCommute by 100 ahead of the divisions below (undoing the
# earlier /100). Named columns instead of positions 17:31, because the groupby
# above rebuilt the frame. Run once: re-running multiplies again.
poverty_to_commute_count_cols = ['Poverty', 'ChildPoverty',
                                 'Professional', 'Service', 'Office', 'Construction', 'Production',
                                 'Drive', 'Carpool', 'Transit', 'Walk', 'OtherTransp', 'WorkAtHome',
                                 'MeanCommute']
merge[poverty_to_commute_count_cols] = merge[poverty_to_commute_count_cols] * 100

In [81]:
# Scale PrivateWork .. Unemployment by 100 ahead of the divisions below. Named
# columns instead of positions 32:37, because the groupby above rebuilt the frame.
# Run once: re-running multiplies again.
worktype_count_cols = ['PrivateWork', 'PublicWork', 'SelfEmployed', 'FamilyWork', 'Unemployment']
merge[worktype_count_cols] = merge[worktype_count_cols] * 100

##### Divide columns by Total Population

In [82]:
# Columns to be expressed as a percentage of the city's total population.
col = (
    ['Hispanic', 'White', 'Black', 'Native', 'Asian', 'Pacific']  # diversity
    + ['Poverty', 'ChildPoverty', 'Unemployment']
)

In [83]:
# (count x 100) / TotalPop -> percentage; cities with TotalPop 0 become NaN here.
merge[col] = merge[col].div(merge['TotalPop'], axis=0)

In [84]:
# One decimal place, like the source census percentages.
merge = merge.round(dict.fromkeys(col, 1))

##### Divide columns by Employed

In [85]:
# Columns to be expressed relative to the city's employed population.
employment = (
    ['Professional', 'Service', 'Office', 'Construction', 'Production']      # industries
    + ['Drive', 'Carpool', 'Transit', 'Walk', 'OtherTransp', 'WorkAtHome']   # transportation
    + ['MeanCommute']
    + ['PrivateWork', 'PublicWork', 'SelfEmployed', 'FamilyWork']            # employment type
)

In [86]:
# (count x 100) / Employed -> percentage; cities with Employed 0 become NaN here.
merge[employment] = merge[employment].div(merge['Employed'], axis=0)

In [87]:
# One decimal place, like the source census percentages.
merge = merge.round(dict.fromkeys(employment, 1))

#### Create clean copy

In [88]:
# Current column order of the city-level table.
merge.columns

Index(['City', 'State_Id', 'TractId', 'TotalPop', 'Men', 'Women', 'Hispanic',
       'White', 'Black', 'Native', 'Asian', 'Pacific', 'VotingAgeCitizen',
       'Income', 'IncomeErr', 'IncomePerCap', 'IncomePerCapErr', 'Poverty',
       'ChildPoverty', 'Professional', 'Service', 'Office', 'Construction',
       'Production', 'Drive', 'Carpool', 'Transit', 'Walk', 'OtherTransp',
       'WorkAtHome', 'MeanCommute', 'Employed', 'PrivateWork', 'PublicWork',
       'SelfEmployed', 'FamilyWork', 'Unemployment'],
      dtype='object')

In [89]:
# Final column order: TractId first, then the place keys, then the census fields.
columns = (
    ['TractId', 'City', 'State_Id']
    + ['TotalPop', 'Men', 'Women']
    + ['Hispanic', 'White', 'Black', 'Native', 'Asian', 'Pacific']
    + ['VotingAgeCitizen']
    + ['Income', 'IncomeErr', 'IncomePerCap', 'IncomePerCapErr']
    + ['Poverty', 'ChildPoverty']
    + ['Professional', 'Service', 'Office', 'Construction', 'Production']
    + ['Drive', 'Carpool', 'Transit', 'Walk', 'OtherTransp', 'WorkAtHome', 'MeanCommute']
    + ['Employed']
    + ['PrivateWork', 'PublicWork', 'SelfEmployed', 'FamilyWork']
    + ['Unemployment']
)

In [90]:
# Apply the final column order.
merge = merge.loc[:, columns]

In [91]:
# Missing values introduced by the divisions above (zero denominators).
merge.isnull().sum()

TractId                0
City                   0
State_Id               0
TotalPop               0
Men                    0
Women                  0
Hispanic            2092
White               2092
Black               2092
Native              2092
Asian               2092
Pacific             2092
VotingAgeCitizen       0
Income                 0
IncomeErr              0
IncomePerCap           0
IncomePerCapErr        0
Poverty             2092
ChildPoverty        2092
Professional        2192
Service             2192
Office              2192
Construction        2192
Production          2192
Drive               2192
Carpool             2192
Transit             2192
Walk                2192
OtherTransp         2192
WorkAtHome          2192
MeanCommute         2192
Employed               0
PrivateWork         2192
PublicWork          2192
SelfEmployed        2192
FamilyWork          2192
Unemployment        2092
dtype: int64

In [92]:
# Discard cities for which an entire group of percentages is missing, then zero
# out whatever isolated NaN values remain.
diversity_fields = ['Hispanic', 'White', 'Black', 'Native', 'Asian', 'Pacific']
work_fields = ['Professional', 'Service', 'Office', 'Construction',
               'Production', 'Drive', 'Carpool', 'Transit', 'Walk',
               'OtherTransp', 'WorkAtHome', 'MeanCommute']
for field_group in (diversity_fields, work_fields):
    # how='all': drop a row only when every column in the group is NaN
    merge = merge.dropna(axis=0, subset=field_group, how='all')
merge = merge.replace(np.nan, 0)

In [93]:
# Size and first rows of the cleaned city-level table.
print(merge.shape)
merge.head()

(25236, 37)


Unnamed: 0,TractId,City,State_Id,TotalPop,Men,Women,Hispanic,White,Black,Native,Asian,Pacific,VotingAgeCitizen,Income,IncomeErr,IncomePerCap,IncomePerCapErr,Poverty,ChildPoverty,Professional,Service,Office,Construction,Production,Drive,Carpool,Transit,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
0,42027010800,Aaronsburg,PA,1056,532,524,1.0,97.4,0.0,0.0,0.7,0.0,739,10258,1258,4516,324,16.4,31.6,27.5,21.1,18.9,15.5,16.5,69.5,12.4,0.2,3.8,3.6,10.2,27.7,502,76.5,10.8,12.4,0.0,3.1
1,4268121200,Abbeville,AL,6693,3361,3329,2.8,59.7,36.5,0.0,0.0,0.0,5612,111583,24675,60066,8645,18.7,38.6,23.5,21.7,20.0,7.3,26.9,85.4,9.3,0.0,1.6,1.2,1.9,26.3,2485,81.7,13.5,4.5,0.0,6.8
2,39947880700,Abbeville,GA,4683,3205,1478,4.3,47.6,46.4,0.1,0.0,0.0,3869,42497,10566,16894,4017,32.9,49.5,25.4,29.4,13.3,13.8,16.7,88.1,6.7,0.0,1.3,0.6,1.8,29.6,824,58.9,33.5,6.9,0.0,3.9
3,243253457806,Abbeville,LA,25185,12341,12839,3.5,66.7,24.6,0.2,3.3,0.0,17956,286959,55459,135638,17577,21.2,26.7,25.5,18.7,26.9,14.4,14.2,80.3,11.7,0.1,3.4,1.3,2.9,27.8,10437,81.9,12.8,5.0,0.0,7.3
4,84215850903,Abbeville,MS,2078,1033,1042,3.1,61.1,33.1,0.0,0.3,0.0,1557,23656,3944,12435,1903,21.1,33.3,31.7,20.4,14.3,8.8,23.9,85.3,10.0,0.8,0.2,0.0,2.8,31.1,887,78.9,14.2,6.5,0.0,7.9


In [94]:
# Write the full city-level table. The output folder is created first because
# to_csv does not create missing directories and would fail on a fresh checkout.
os.makedirs('census_csv', exist_ok=True)
merge.to_csv('census_csv/census_by_city.csv', index=False)

## CSVs 

### Diversity

In [95]:
# Population head counts per city, keyed by City/State_Id.
total_pop_df = merge.loc[:, ['TotalPop', 'Men', 'Women', 'City', 'State_Id']]
total_pop_df.head()

Unnamed: 0,TotalPop,Men,Women,City,State_Id
0,1056,532,524,Aaronsburg,PA
1,6693,3361,3329,Abbeville,AL
2,4683,3205,1478,Abbeville,GA
3,25185,12341,12839,Abbeville,LA
4,2078,1033,1042,Abbeville,MS


In [96]:
# Race/ethnicity percentages per city, one decimal place.
diversity_cols = ['Hispanic', 'White', 'Black', 'Native', 'Asian', 'Pacific']
# .copy() detaches the selection from `merge`, so the assignment below writes to
# an independent frame instead of raising SettingWithCopyWarning.
diversity_df = merge[diversity_cols + ['City', 'State_Id']].copy()
diversity_df[diversity_cols] = diversity_df[diversity_cols].round(1)
diversity_df.head()

Unnamed: 0,Hispanic,White,Black,Native,Asian,Pacific,City,State_Id
0,1.0,97.4,0.0,0.0,0.7,0.0,Aaronsburg,PA
1,2.8,59.7,36.5,0.0,0.0,0.0,Abbeville,AL
2,4.3,47.6,46.4,0.1,0.0,0.0,Abbeville,GA
3,3.5,66.7,24.6,0.2,3.3,0.0,Abbeville,LA
4,3.1,61.1,33.1,0.0,0.3,0.0,Abbeville,MS


In [97]:
# Put the head counts next to the percentages (joined on city + state) and export.
diversity_df = pd.merge(total_pop_df, diversity_df, on=['City', 'State_Id'])
diversity_df.to_csv('census_csv/diversity_df.csv', index=False)
diversity_df.head()

Unnamed: 0,TotalPop,Men,Women,City,State_Id,Hispanic,White,Black,Native,Asian,Pacific
0,1056,532,524,Aaronsburg,PA,1.0,97.4,0.0,0.0,0.7,0.0
1,6693,3361,3329,Abbeville,AL,2.8,59.7,36.5,0.0,0.0,0.0
2,4683,3205,1478,Abbeville,GA,4.3,47.6,46.4,0.1,0.0,0.0
3,25185,12341,12839,Abbeville,LA,3.5,66.7,24.6,0.2,3.3,0.0
4,2078,1033,1042,Abbeville,MS,3.1,61.1,33.1,0.0,0.3,0.0


### Income Per Capita

In [98]:
# Income fields per city, exported as their own CSV.
income_cols = ['Income', 'IncomeErr', 'IncomePerCap', 'IncomePerCapErr']
# .copy() detaches the selection from `merge`, so the assignment below writes to
# an independent frame instead of raising SettingWithCopyWarning.
income_per_capita_df = merge[income_cols + ['City', 'State_Id']].copy()
income_per_capita_df[income_cols] = income_per_capita_df[income_cols].round(1)
income_per_capita_df.to_csv("census_csv/income_per_capita_df.csv",index=False)
income_per_capita_df.head()

Unnamed: 0,Income,IncomeErr,IncomePerCap,IncomePerCapErr,City,State_Id
0,10258,1258,4516,324,Aaronsburg,PA
1,111583,24675,60066,8645,Abbeville,AL
2,42497,10566,16894,4017,Abbeville,GA
3,286959,55459,135638,17577,Abbeville,LA
4,23656,3944,12435,1903,Abbeville,MS


### Poverty

In [99]:
# Poverty percentages per city, exported as their own CSV.
poverty_df = merge.loc[:, ['Poverty', 'ChildPoverty', 'City', 'State_Id']]
poverty_df.to_csv("census_csv/poverty_df.csv", index=False)
poverty_df.head()

Unnamed: 0,Poverty,ChildPoverty,City,State_Id
0,16.4,31.6,Aaronsburg,PA
1,18.7,38.6,Abbeville,AL
2,32.9,49.5,Abbeville,GA
3,21.2,26.7,Abbeville,LA
4,21.1,33.3,Abbeville,MS


### Industries

In [100]:
# Industry percentages per city, exported as their own CSV.
industry_fields = ['Professional', 'Service', 'Office', 'Construction', 'Production']
industry_df = merge.loc[:, industry_fields + ['City', 'State_Id']]
industry_df.to_csv("census_csv/industry_df.csv", index=False)
industry_df.head()

Unnamed: 0,Professional,Service,Office,Construction,Production,City,State_Id
0,27.5,21.1,18.9,15.5,16.5,Aaronsburg,PA
1,23.5,21.7,20.0,7.3,26.9,Abbeville,AL
2,25.4,29.4,13.3,13.8,16.7,Abbeville,GA
3,25.5,18.7,26.9,14.4,14.2,Abbeville,LA
4,31.7,20.4,14.3,8.8,23.9,Abbeville,MS


### Transportation

In [101]:
# Commute-mode percentages and MeanCommute per city, exported as their own CSV.
transport_cols = ['Drive', 'Carpool', 'Transit', 'Walk', 'OtherTransp', 'WorkAtHome', 'MeanCommute']
# .copy() detaches the selection from `merge`, so the assignment below writes to
# an independent frame instead of raising SettingWithCopyWarning.
transportation_df = merge[transport_cols + ['City', 'State_Id']].copy()
transportation_df[transport_cols] = transportation_df[transport_cols].round(1)
transportation_df.to_csv("census_csv/transportation_df.csv",index=False)
transportation_df.head()

Unnamed: 0,Drive,Carpool,Transit,Walk,OtherTransp,WorkAtHome,MeanCommute,City,State_Id
0,69.5,12.4,0.2,3.8,3.6,10.2,27.7,Aaronsburg,PA
1,85.4,9.3,0.0,1.6,1.2,1.9,26.3,Abbeville,AL
2,88.1,6.7,0.0,1.3,0.6,1.8,29.6,Abbeville,GA
3,80.3,11.7,0.1,3.4,1.3,2.9,27.8,Abbeville,LA
4,85.3,10.0,0.8,0.2,0.0,2.8,31.1,Abbeville,MS


### Employment

In [102]:
# Employed head count and Unemployment percentage per city, exported as their own CSV.
employment_df = merge.loc[:, ['Employed', 'Unemployment', 'City', 'State_Id']]
employment_df.to_csv('census_csv/employment_df.csv', index=False)
employment_df.head()


Unnamed: 0,Employed,Unemployment,City,State_Id
0,502,3.1,Aaronsburg,PA
1,2485,6.8,Abbeville,AL
2,824,3.9,Abbeville,GA
3,10437,7.3,Abbeville,LA
4,887,7.9,Abbeville,MS


### Employment Type

In [103]:
# Employment-type percentages per city, exported as their own CSV.
worktype_cols = ['PrivateWork', 'PublicWork', 'SelfEmployed', 'FamilyWork']
# .copy() detaches the selection from `merge`, so the assignment below writes to
# an independent frame instead of raising SettingWithCopyWarning.
employment_ratio_df = merge[worktype_cols + ['City', 'State_Id']].copy()
employment_ratio_df[worktype_cols] = employment_ratio_df[worktype_cols].round(1)
employment_ratio_df.to_csv('census_csv/employment_ratio_df.csv', index=False)
employment_ratio_df.head()

Unnamed: 0,PrivateWork,PublicWork,SelfEmployed,FamilyWork,City,State_Id
0,76.5,10.8,12.4,0.0,Aaronsburg,PA
1,81.7,13.5,4.5,0.0,Abbeville,AL
2,58.9,33.5,6.9,0.0,Abbeville,GA
3,81.9,12.8,5.0,0.0,Abbeville,LA
4,78.9,14.2,6.5,0.0,Abbeville,MS
