# CENSUS DATA


Assumptions:
- When getting demographic info, assumed the percentages were from the total population
- With employment related statistics, assumed the percentages were from the total employed.

# EDA

## Data Exploration

In [1]:
import pandas as pd
import numpy as np

In [2]:
import os

In [3]:
path = os.getcwd()

### Import data

##### Zip Code Data
- using for city information

In [4]:
uszip_filename = os.path.join(path, 'uszips.csv')

In [5]:
uszip = pd.read_csv(uszip_filename)

In [6]:
print(uszip.shape)
uszip.head()

(33121, 18)


Unnamed: 0,zip,lat,lng,city,state_id,state_name,zcta,parent_zcta,population,density,county_fips,county_name,county_weights,county_names_all,county_fips_all,imprecise,military,timezone
0,601,18.18005,-66.75218,Adjuntas,PR,Puerto Rico,True,,17113.0,102.7,72001,Adjuntas,"{""72001"": ""99.43"", ""72141"": ""0.57""}",Adjuntas|Utuado,72001|72141,False,False,America/Puerto_Rico
1,602,18.36074,-67.17519,Aguada,PR,Puerto Rico,True,,37751.0,476.0,72003,Aguada,"{""72003"": ""100""}",Aguada,72003,False,False,America/Puerto_Rico
2,603,18.4544,-67.12201,Aguadilla,PR,Puerto Rico,True,,47081.0,574.9,72005,Aguadilla,"{""72005"": ""100""}",Aguadilla,72005,False,False,America/Puerto_Rico
3,606,18.16721,-66.93828,Maricao,PR,Puerto Rico,True,,6392.0,58.3,72093,Maricao,"{""72093"": ""94.88"", ""72153"": ""3.78"", ""72121"": ""...",Maricao|Yauco|Sabana Grande,72093|72153|72121,False,False,America/Puerto_Rico
4,610,18.29032,-67.12244,Anasco,PR,Puerto Rico,True,,26686.0,286.9,72011,Añasco,"{""72011"": ""99.45"", ""72003"": ""0.55""}",Añasco|Aguada,72011|72003,False,False,America/Puerto_Rico


In [7]:
uszip['zip'] = uszip['zip'].astype('str').str.zfill(5)

In [8]:
uszip['zip'].nunique()

33121

##### Census 
- Contains population, demographic, poverty, and employment data by tract

In [9]:
census_filename = os.path.join(path, 'acs2017_census_tract_data.csv')

In [10]:
df = pd.read_csv(census_filename)
print(df.shape)
df.head()

(74001, 37)


Unnamed: 0,TractId,State,County,TotalPop,Men,Women,Hispanic,White,Black,Native,...,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
0,1001020100,Alabama,Autauga County,1845,899,946,2.4,86.3,5.2,0.0,...,0.5,0.0,2.1,24.5,881,74.2,21.2,4.5,0.0,4.6
1,1001020200,Alabama,Autauga County,2172,1167,1005,1.1,41.6,54.5,0.0,...,0.0,0.5,0.0,22.2,852,75.9,15.0,9.0,0.0,3.4
2,1001020300,Alabama,Autauga County,3385,1533,1852,8.0,61.4,26.5,0.6,...,1.0,0.8,1.5,23.1,1482,73.3,21.1,4.8,0.7,4.7
3,1001020400,Alabama,Autauga County,4267,2001,2266,9.6,80.3,7.1,0.5,...,1.5,2.9,2.1,25.9,1849,75.8,19.7,4.5,0.0,6.1
4,1001020500,Alabama,Autauga County,9965,5054,4911,0.9,77.5,16.4,0.0,...,0.8,0.3,0.7,21.0,4787,71.4,24.1,4.5,0.0,2.3


In [11]:
df['TractId'].nunique()

74001

##### Tract to Zip
- maps census tracts to zip codes

In [12]:
tract_zip_filename = os.path.join(path, 'TRACT_ZIP_122017.xlsx')

In [13]:
tract_zip = pd.read_excel(tract_zip_filename)

In [14]:
print(tract_zip.shape)
tract_zip.head()

(168662, 6)


Unnamed: 0,tract,zip,res_ratio,bus_ratio,oth_ratio,tot_ratio
0,1001020100,36067,1.0,1.0,1.0,1.0
1,1001020200,36008,0.028243,0.012987,0.0,0.023846
2,1001020200,36067,0.971757,0.987013,1.0,0.976154
3,1001020300,36067,1.0,1.0,1.0,1.0
4,1001020400,36067,0.025543,0.585987,0.25,0.071038


In [15]:
tract_zip = tract_zip[['tract', 'zip', 'res_ratio']]

In [16]:
tract_zip = tract_zip.rename(columns = {'tract':'TractId'})

In [17]:
tract_zip['zip'].nunique()

39301

In [18]:
tract_zip['TractId'].nunique()

73584

## Merge Census with Tract_Zip Data

In [19]:
merge = df.merge(tract_zip, on=['TractId'])

In [20]:
print(merge.shape)
merge.head()

(168385, 39)


Unnamed: 0,TractId,State,County,TotalPop,Men,Women,Hispanic,White,Black,Native,...,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment,zip,res_ratio
0,1001020100,Alabama,Autauga County,1845,899,946,2.4,86.3,5.2,0.0,...,2.1,24.5,881,74.2,21.2,4.5,0.0,4.6,36067,1.0
1,1001020200,Alabama,Autauga County,2172,1167,1005,1.1,41.6,54.5,0.0,...,0.0,22.2,852,75.9,15.0,9.0,0.0,3.4,36008,0.028243
2,1001020200,Alabama,Autauga County,2172,1167,1005,1.1,41.6,54.5,0.0,...,0.0,22.2,852,75.9,15.0,9.0,0.0,3.4,36067,0.971757
3,1001020300,Alabama,Autauga County,3385,1533,1852,8.0,61.4,26.5,0.6,...,1.5,23.1,1482,73.3,21.1,4.8,0.7,4.7,36067,1.0
4,1001020400,Alabama,Autauga County,4267,2001,2266,9.6,80.3,7.1,0.5,...,2.1,25.9,1849,75.8,19.7,4.5,0.0,6.1,36067,0.025543


### Seeing what does not transfer over

In [21]:
df[(~df.TractId.isin(merge.TractId))]

Unnamed: 0,TractId,State,County,TotalPop,Men,Women,Hispanic,White,Black,Native,...,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
43,1003990000,Alabama,Baldwin County,0,0,0,,,,,...,,,,,0,,,,,
868,1097990000,Alabama,Mobile County,0,0,0,,,,,...,,,,,0,,,,,
1266,2105000200,Alaska,Hoonah-Angoon Census Area,62,35,27,0.0,11.3,0.0,82.3,...,25.9,0.0,22.2,13.8,28,32.1,35.7,32.1,0.0,12.5
1324,2185000100,Alaska,North Slope Borough,4444,2366,2078,5.3,13.3,0.3,59.4,...,11.9,9.0,3.8,6.6,1800,44.3,54.4,1.3,0.0,14.2
1353,4001944202,Arizona,Apache County,4094,1933,2161,3.4,0.5,0.4,93.9,...,7.5,0.0,3.5,22.8,851,39.4,56.9,3.8,0.0,28.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73982,72151950700,Puerto Rico,Yabucoa Municipio,4744,2377,2367,100.0,0.0,0.0,0.0,...,0.0,0.0,0.0,33.9,1034,61.4,36.2,2.4,0.0,17.9
73983,72151950800,Puerto Rico,Yabucoa Municipio,3531,1915,1616,100.0,0.0,0.0,0.0,...,2.7,2.5,0.0,25.0,780,71.0,18.2,10.8,0.0,30.4
73984,72151950900,Puerto Rico,Yabucoa Municipio,5822,2892,2930,99.7,0.3,0.0,0.0,...,2.3,0.0,0.0,32.3,1143,66.0,29.4,4.6,0.0,28.7
73989,72151990000,Puerto Rico,Yabucoa Municipio,0,0,0,,,,,...,,,,,0,,,,,


In [22]:
tract_zip[(~tract_zip.TractId.isin(merge.TractId))]

Unnamed: 0,TractId,zip,res_ratio
543,1037030200,36080,1.0
2828,1115050101,35146,1.0
3017,1123231500,36866,1.0
3673,2270000100,99563,0.0
3674,2270000100,99620,0.0
...,...,...,...
168657,78030960800,802,1.0
168658,78030960900,802,1.0
168659,78030961000,802,1.0
168660,78030961100,802,1.0


## Data Cleaning

### isnull().sum()

In [23]:
merge.isnull().sum()

TractId                0
State                  0
County                 0
TotalPop               0
Men                    0
Women                  0
Hispanic             506
White                506
Black                506
Native               506
Asian                506
Pacific              506
VotingAgeCitizen       0
Income              1291
IncomeErr           1291
IncomePerCap         596
IncomePerCapErr      596
Poverty              710
ChildPoverty        1286
Professional         652
Service              652
Office               652
Construction         652
Production           652
Drive                633
Carpool              633
Transit              633
Walk                 633
OtherTransp          633
WorkAtHome           633
MeanCommute          949
Employed               0
PrivateWork          652
PublicWork           652
SelfEmployed         652
FamilyWork           652
Unemployment         650
zip                    0
res_ratio              0
dtype: int64

### Fill na with 0

In [24]:
merge.fillna(0, inplace=True)

In [25]:
merge.isnull().sum()

TractId             0
State               0
County              0
TotalPop            0
Men                 0
Women               0
Hispanic            0
White               0
Black               0
Native              0
Asian               0
Pacific             0
VotingAgeCitizen    0
Income              0
IncomeErr           0
IncomePerCap        0
IncomePerCapErr     0
Poverty             0
ChildPoverty        0
Professional        0
Service             0
Office              0
Construction        0
Production          0
Drive               0
Carpool             0
Transit             0
Walk                0
OtherTransp         0
WorkAtHome          0
MeanCommute         0
Employed            0
PrivateWork         0
PublicWork          0
SelfEmployed        0
FamilyWork          0
Unemployment        0
zip                 0
res_ratio           0
dtype: int64

In [26]:
merge.State.unique()

array(['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
       'Colorado', 'Connecticut', 'Delaware', 'District of Columbia',
       'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana',
       'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
       'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi',
       'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire',
       'New Jersey', 'New Mexico', 'New York', 'North Carolina',
       'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania',
       'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee',
       'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
       'West Virginia', 'Wisconsin', 'Wyoming', 'Puerto Rico'],
      dtype=object)

### Remove nonstates

In [27]:
# Remove areas we don't have in other df
merge = merge[merge['State'] != 'Puerto Rico']

In [28]:
merge['State'].nunique()

51

## Map Tracts to Zip Code
* first get multiply % columns by total population
* multiply % columns by employed
* multiply residential ratio across rows

### Make Numbers Compatible

#### Dividing certain columns by 100


##### Making sure I capture the correct columns

In [29]:
merge.iloc[:,6:11+1]

Unnamed: 0,Hispanic,White,Black,Native,Asian,Pacific
0,2.4,86.3,5.2,0.0,1.2,0.0
1,1.1,41.6,54.5,0.0,1.0,0.0
2,1.1,41.6,54.5,0.0,1.0,0.0
3,8.0,61.4,26.5,0.6,0.7,0.4
4,9.6,80.3,7.1,0.5,0.2,0.0
...,...,...,...,...,...,...
166947,3.0,89.4,1.0,0.1,3.0,0.0
166948,3.0,89.4,1.0,0.1,3.0,0.0
166949,3.0,89.4,1.0,0.1,3.0,0.0
166950,3.0,89.4,1.0,0.1,3.0,0.0


In [30]:
merge.iloc[:,17:30+1]

Unnamed: 0,Poverty,ChildPoverty,Professional,Service,Office,Construction,Production,Drive,Carpool,Transit,Walk,OtherTransp,WorkAtHome,MeanCommute
0,10.7,20.8,38.5,15.6,22.8,10.8,12.4,94.2,3.3,0.0,0.5,0.0,2.1,24.5
1,22.4,35.8,30.5,24.9,22.9,6.3,15.4,90.5,9.1,0.0,0.0,0.5,0.0,22.2
2,22.4,35.8,30.5,24.9,22.9,6.3,15.4,90.5,9.1,0.0,0.0,0.5,0.0,22.2
3,14.7,21.1,27.9,19.4,33.3,9.9,9.6,88.3,8.4,0.0,1.0,0.8,1.5,23.1
4,2.3,1.7,29.0,16.6,25.8,9.1,19.5,82.3,11.2,0.0,1.5,2.9,2.1,25.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
166947,12.5,18.5,32.2,15.9,13.4,20.5,17.9,72.2,8.0,9.5,2.1,3.0,5.2,31.3
166948,12.5,18.5,32.2,15.9,13.4,20.5,17.9,72.2,8.0,9.5,2.1,3.0,5.2,31.3
166949,12.5,18.5,32.2,15.9,13.4,20.5,17.9,72.2,8.0,9.5,2.1,3.0,5.2,31.3
166950,12.5,18.5,32.2,15.9,13.4,20.5,17.9,72.2,8.0,9.5,2.1,3.0,5.2,31.3


In [31]:
merge.iloc[:,32:36+1]

Unnamed: 0,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
0,74.2,21.2,4.5,0.0,4.6
1,75.9,15.0,9.0,0.0,3.4
2,75.9,15.0,9.0,0.0,3.4
3,73.3,21.1,4.8,0.7,4.7
4,75.8,19.7,4.5,0.0,6.1
...,...,...,...,...,...
166947,64.6,22.9,10.9,1.6,0.4
166948,64.6,22.9,10.9,1.6,0.4
166949,64.6,22.9,10.9,1.6,0.4
166950,64.6,22.9,10.9,1.6,0.4


In [32]:
merge.iloc[:,6:11+1] /= 100

In [33]:
merge.iloc[:,17:30+1] /= 100

In [34]:
merge.iloc[:,32:36+1] /= 100

In [35]:
merge.head()

Unnamed: 0,TractId,State,County,TotalPop,Men,Women,Hispanic,White,Black,Native,...,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment,zip,res_ratio
0,1001020100,Alabama,Autauga County,1845,899,946,0.024,0.863,0.052,0.0,...,0.021,0.245,881,0.742,0.212,0.045,0.0,0.046,36067,1.0
1,1001020200,Alabama,Autauga County,2172,1167,1005,0.011,0.416,0.545,0.0,...,0.0,0.222,852,0.759,0.15,0.09,0.0,0.034,36008,0.028243
2,1001020200,Alabama,Autauga County,2172,1167,1005,0.011,0.416,0.545,0.0,...,0.0,0.222,852,0.759,0.15,0.09,0.0,0.034,36067,0.971757
3,1001020300,Alabama,Autauga County,3385,1533,1852,0.08,0.614,0.265,0.006,...,0.015,0.231,1482,0.733,0.211,0.048,0.007,0.047,36067,1.0
4,1001020400,Alabama,Autauga County,4267,2001,2266,0.096,0.803,0.071,0.005,...,0.021,0.259,1849,0.758,0.197,0.045,0.0,0.061,36067,0.025543


##### Multiply Columns by Total Population

In [36]:
merge.columns

Index(['TractId', 'State', 'County', 'TotalPop', 'Men', 'Women', 'Hispanic',
       'White', 'Black', 'Native', 'Asian', 'Pacific', 'VotingAgeCitizen',
       'Income', 'IncomeErr', 'IncomePerCap', 'IncomePerCapErr', 'Poverty',
       'ChildPoverty', 'Professional', 'Service', 'Office', 'Construction',
       'Production', 'Drive', 'Carpool', 'Transit', 'Walk', 'OtherTransp',
       'WorkAtHome', 'MeanCommute', 'Employed', 'PrivateWork', 'PublicWork',
       'SelfEmployed', 'FamilyWork', 'Unemployment', 'zip', 'res_ratio'],
      dtype='object')

In [37]:
col = ['Hispanic','White', 'Black', 'Native', 'Asian', 'Pacific','Poverty',
       'ChildPoverty','Unemployment']

In [38]:
merge[col] = merge[col].multiply(merge['TotalPop'], axis = 'index')

In [39]:
merge[col] = merge[col].astype(int)

### Multiply 

In [40]:
employment = ['Professional', 'Service', 'Office', 'Construction', 'Production', 
              'Drive', 'Carpool', 'Transit', 'Walk', 'OtherTransp', 'WorkAtHome', 
              'MeanCommute','PrivateWork', 'PublicWork','SelfEmployed', 'FamilyWork']

In [41]:
merge[employment] = merge[employment].multiply(merge['Employed'], axis = 'index')

In [42]:
merge[employment] = merge[employment].astype(int)

##### Check values

In [43]:
df.loc[
       (df.State == 'Wyoming')
       &(df.TotalPop == 3329)
       &(df.TractId == 56045951100)]

Unnamed: 0,TractId,State,County,TotalPop,Men,Women,Hispanic,White,Black,Native,...,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
73054,56045951100,Wyoming,Weston County,3329,1946,1383,3.0,89.4,1.0,0.1,...,2.1,3.0,5.2,31.3,1583,64.6,22.9,10.9,1.6,0.4


In [44]:
merge.loc[166948:166948]

Unnamed: 0,TractId,State,County,TotalPop,Men,Women,Hispanic,White,Black,Native,...,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment,zip,res_ratio
166948,56045951100,Wyoming,Weston County,3329,1946,1383,99,2976,33,3,...,82,495,1583,1022,362,172,25,13,82729,0.002424


In [45]:
int(3/100 * 3329)

99

In [46]:
int(89.4/100 * 3329)

2976

In [47]:
int(64.6/100  * 1583)

1022

In [48]:
int(1.6/100  * 1583)

25

In [49]:
assert merge['Hispanic'][166948] == 99

In [50]:
assert merge['White'][166948] == 2976

In [51]:
assert merge['PrivateWork'][166948] == 1022

In [52]:
assert merge['FamilyWork'][166948] == 25

In [53]:
df.loc[
       (df.State == 'Alabama')
       &(df.TotalPop == 7131)
       &(df.TractId == 1003011601)]

Unnamed: 0,TractId,State,County,TotalPop,Men,Women,Hispanic,White,Black,Native,...,Walk,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment
41,1003011601,Alabama,Baldwin County,7131,3699,3432,9.7,87.6,1.4,0.7,...,0.5,0.3,8.4,29.3,3260,76.8,9.5,13.5,0.2,3.1


In [54]:
merge.loc[105:105]

Unnamed: 0,TractId,State,County,TotalPop,Men,Women,Hispanic,White,Black,Native,...,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment,zip,res_ratio
105,1003011601,Alabama,Baldwin County,7131,3699,3432,691,6246,99,49,...,273,955,3260,2503,309,440,6,221,36580,0.01071


In [55]:
int(9.7/100 * 7131)

691

In [56]:
int(25.4/100 * 3260)

828

In [57]:
int(11.3/100 * 3260)

368

In [58]:
int(9.5/100 * 3260)

309

In [59]:
int(13.5/100 * 3260)

440

In [60]:
assert merge['Hispanic'][105] == 691

In [61]:
assert merge['Professional'][105] == 828

In [62]:
assert merge['Construction'][105] == 368

In [63]:
assert merge['PublicWork'][105] == 309

In [64]:
assert merge['SelfEmployed'][105] == 440

#### Mutiply each row by residential ratio

In [65]:
newcol = ['TotalPop', 'Men', 'Women', 'Hispanic',
       'White', 'Black', 'Native', 'Asian', 'Pacific', 'VotingAgeCitizen',
       'Income', 'IncomeErr', 'IncomePerCap', 'IncomePerCapErr', 'Poverty',
       'ChildPoverty', 'Professional', 'Service', 'Office', 'Construction',
       'Production', 'Drive', 'Carpool', 'Transit', 'Walk', 'OtherTransp',
       'WorkAtHome', 'MeanCommute', 'Employed', 'PrivateWork', 'PublicWork',
       'SelfEmployed', 'FamilyWork', 'Unemployment']

In [66]:
merge[newcol] = merge[newcol].multiply(merge['res_ratio'], axis = 'index')

In [67]:
merge[newcol] = merge[newcol].astype(int)

In [68]:
merge.head(15)

Unnamed: 0,TractId,State,County,TotalPop,Men,Women,Hispanic,White,Black,Native,...,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment,zip,res_ratio
0,1001020100,Alabama,Autauga County,1845,899,946,44,1592,95,0,...,18,215,881,653,186,39,0,84,36067,1.0
1,1001020200,Alabama,Autauga County,61,32,28,0,25,33,0,...,0,5,24,18,3,2,0,2,36008,0.028243
2,1001020200,Alabama,Autauga County,2110,1134,976,22,877,1149,0,...,0,183,827,627,123,73,0,70,36067,0.971757
3,1001020300,Alabama,Autauga County,3385,1533,1852,270,2078,897,20,...,22,342,1482,1086,312,71,10,159,36067,1.0
4,1001020400,Alabama,Autauga County,108,51,57,10,87,7,0,...,0,12,47,35,9,2,0,6,36067,0.025543
5,1001020400,Alabama,Autauga County,4158,1949,2208,398,3338,294,20,...,37,465,1801,1365,354,80,0,253,36066,0.974457
6,1001020500,Alabama,Autauga County,7856,3984,3871,70,6087,1288,0,...,26,792,3774,2693,909,169,0,180,36066,0.78839
7,1001020500,Alabama,Autauga County,2100,1065,1035,18,1627,344,0,...,6,211,1009,720,243,45,0,48,36068,0.210806
8,1001020500,Alabama,Autauga County,8,4,3,0,6,1,0,...,0,0,3,2,0,0,0,0,36067,0.000804
9,1001020600,Alabama,Autauga County,3620,1765,1855,108,2559,908,0,...,109,287,1364,1145,192,25,0,220,36067,1.0


In [69]:
min(merge['zip']), max(merge['zip'])

(501, 99929)

In [70]:
merge['zip'] = merge['zip'].astype('str').str.zfill(5)

In [71]:
merge = merge.drop(columns = ['res_ratio'])

## Ratios
-return df back to census form

##### Multiply columns by 100

In [72]:
merge.iloc[:,6:11+1] *= 100 

In [73]:
merge.iloc[:,17:30+1] *= 100

In [74]:
merge.iloc[:,32:36+1] *= 100

In [75]:
col = ['Hispanic','White', 'Black', 'Native', 'Asian', 'Pacific','Poverty',
       'ChildPoverty','Unemployment']

##### Divide columns by Total Population

In [76]:
merge[col] = merge[col].divide(merge['TotalPop'], axis = 'index')

In [77]:
merge[col] = merge[col].round(1)

##### Divide columns by Employed

In [78]:
employment = ['Professional', 'Service', 'Office', 'Construction', 'Production', 
              'Drive', 'Carpool', 'Transit', 'Walk', 'OtherTransp', 'WorkAtHome', 
              'MeanCommute', 'PrivateWork', 'PublicWork','SelfEmployed', 'FamilyWork']

In [79]:
merge[employment] = merge[employment].divide(merge['Employed'], axis = 'index')

In [80]:
merge[employment] = merge[employment].round(1)

##### Check

In [81]:
merge.head()

Unnamed: 0,TractId,State,County,TotalPop,Men,Women,Hispanic,White,Black,Native,...,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment,zip
0,1001020100,Alabama,Autauga County,1845,899,946,2.4,86.3,5.1,0.0,...,0.0,2.0,24.4,881,74.1,21.1,4.4,0.0,4.6,36067
1,1001020200,Alabama,Autauga County,61,32,28,0.0,41.0,54.1,0.0,...,0.0,0.0,20.8,24,75.0,12.5,8.3,0.0,3.3,36008
2,1001020200,Alabama,Autauga County,2110,1134,976,1.0,41.6,54.5,0.0,...,0.4,0.0,22.1,827,75.8,14.9,8.8,0.0,3.3,36067
3,1001020300,Alabama,Autauga County,3385,1533,1852,8.0,61.4,26.5,0.6,...,0.7,1.5,23.1,1482,73.3,21.1,4.8,0.7,4.7,36067
4,1001020400,Alabama,Autauga County,108,51,57,9.3,80.6,6.5,0.0,...,2.1,0.0,25.5,47,74.5,19.1,4.3,0.0,5.6,36067


## Map Zip Codes to City

In [82]:
uszip.columns

Index(['zip', 'lat', 'lng', 'city', 'state_id', 'state_name', 'zcta',
       'parent_zcta', 'population', 'density', 'county_fips', 'county_name',
       'county_weights', 'county_names_all', 'county_fips_all', 'imprecise',
       'military', 'timezone'],
      dtype='object')

In [83]:
uszip = uszip[['zip', 'city', 'state_name', 'state_id', 'county_name']]

In [84]:
merge = merge.merge(uszip, on = 'zip')

In [85]:
merge.shape

(159991, 42)

In [86]:
merge.head()

Unnamed: 0,TractId,State,County,TotalPop,Men,Women,Hispanic,White,Black,Native,...,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment,zip,city,state_name,state_id,county_name
0,1001020100,Alabama,Autauga County,1845,899,946,2.4,86.3,5.1,0.0,...,74.1,21.1,4.4,0.0,4.6,36067,Prattville,Alabama,AL,Autauga
1,1001020200,Alabama,Autauga County,2110,1134,976,1.0,41.6,54.5,0.0,...,75.8,14.9,8.8,0.0,3.3,36067,Prattville,Alabama,AL,Autauga
2,1001020300,Alabama,Autauga County,3385,1533,1852,8.0,61.4,26.5,0.6,...,73.3,21.1,4.8,0.7,4.7,36067,Prattville,Alabama,AL,Autauga
3,1001020400,Alabama,Autauga County,108,51,57,9.3,80.6,6.5,0.0,...,74.5,19.1,4.3,0.0,5.6,36067,Prattville,Alabama,AL,Autauga
4,1001020500,Alabama,Autauga County,8,4,3,0.0,75.0,12.5,0.0,...,66.7,0.0,0.0,0.0,0.0,36067,Prattville,Alabama,AL,Autauga


In [87]:
merge.columns

Index(['TractId', 'State', 'County', 'TotalPop', 'Men', 'Women', 'Hispanic',
       'White', 'Black', 'Native', 'Asian', 'Pacific', 'VotingAgeCitizen',
       'Income', 'IncomeErr', 'IncomePerCap', 'IncomePerCapErr', 'Poverty',
       'ChildPoverty', 'Professional', 'Service', 'Office', 'Construction',
       'Production', 'Drive', 'Carpool', 'Transit', 'Walk', 'OtherTransp',
       'WorkAtHome', 'MeanCommute', 'Employed', 'PrivateWork', 'PublicWork',
       'SelfEmployed', 'FamilyWork', 'Unemployment', 'zip', 'city',
       'state_name', 'state_id', 'county_name'],
      dtype='object')

In [88]:
columns = ['TractId', 'city', 'state_id', 'State', 'County', 'county_name', 
           'TotalPop', 'Men', 'Women', 'Hispanic','White', 'Black', 'Native', 
           'Asian', 'Pacific', 'VotingAgeCitizen','Income', 'IncomeErr', 
           'IncomePerCap', 'IncomePerCapErr', 'Poverty', 'ChildPoverty', 
           'Professional', 'Service', 'Office', 'Construction', 'Production', 
           'Drive', 'Carpool', 'Transit', 'Walk', 'OtherTransp', 'WorkAtHome', 
           'MeanCommute', 'Employed', 'PrivateWork', 'PublicWork', 
           'SelfEmployed', 'FamilyWork', 'Unemployment', 'zip', ]

In [89]:
merge = merge[columns]

In [90]:
merge.head()

Unnamed: 0,TractId,city,state_id,State,County,county_name,TotalPop,Men,Women,Hispanic,...,OtherTransp,WorkAtHome,MeanCommute,Employed,PrivateWork,PublicWork,SelfEmployed,FamilyWork,Unemployment,zip
0,1001020100,Prattville,AL,Alabama,Autauga County,Autauga,1845,899,946,2.4,...,0.0,2.0,24.4,881,74.1,21.1,4.4,0.0,4.6,36067
1,1001020200,Prattville,AL,Alabama,Autauga County,Autauga,2110,1134,976,1.0,...,0.4,0.0,22.1,827,75.8,14.9,8.8,0.0,3.3,36067
2,1001020300,Prattville,AL,Alabama,Autauga County,Autauga,3385,1533,1852,8.0,...,0.7,1.5,23.1,1482,73.3,21.1,4.8,0.7,4.7,36067
3,1001020400,Prattville,AL,Alabama,Autauga County,Autauga,108,51,57,9.3,...,2.1,0.0,25.5,47,74.5,19.1,4.3,0.0,5.6,36067
4,1001020500,Prattville,AL,Alabama,Autauga County,Autauga,8,4,3,0.0,...,0.0,0.0,0.0,3,66.7,0.0,0.0,0.0,0.0,36067


### Create clean copy

In [91]:
merge.isnull().sum()

TractId                 0
city                    0
state_id                0
State                   0
County                  0
county_name             0
TotalPop                0
Men                     0
Women                   0
Hispanic             8795
White                8795
Black                8795
Native               8795
Asian                8795
Pacific              8795
VotingAgeCitizen        0
Income                  0
IncomeErr               0
IncomePerCap            0
IncomePerCapErr         0
Poverty              8795
ChildPoverty         8795
Professional        10666
Service             10666
Office              10666
Construction        10666
Production          10666
Drive               10666
Carpool             10666
Transit             10666
Walk                10666
OtherTransp         10666
WorkAtHome          10666
MeanCommute         10666
Employed                0
PrivateWork         10666
PublicWork          10666
SelfEmployed        10666
FamilyWork  

In [92]:
merge = merge.dropna(axis=0, subset = ['Hispanic', 'White', 'Black', 'Native', 'Asian', 'Pacific'], how = 'all')
merge = merge.dropna(axis=0, subset = ['Professional', 'Service', 'Office', 'Construction', 
                                       'Production','Drive', 'Carpool', 'Transit', 'Walk', 
                                       'OtherTransp', 'WorkAtHome', 'MeanCommute'], how = 'all')
merge = merge.replace(np.nan, 0)

In [93]:
merge.shape

(149325, 41)

In [94]:
merge.to_csv('merge.csv')

##  CSVs - Group by Cities

In [95]:
merge = merge.rename(columns = {'city': 'City',
                                'state_id':'State_Id'})

### Diversity

In [96]:
total_pop_df = merge[['TotalPop', 'Men', 'Women','City', 'State_Id']].groupby(['City', 'State_Id'],as_index=False).sum()
total_pop_df.head()

Unnamed: 0,City,State_Id,TotalPop,Men,Women
0,Aaronsburg,PA,1056,532,524
1,Abbeville,AL,6693,3361,3329
2,Abbeville,GA,4683,3205,1478
3,Abbeville,LA,25185,12341,12839
4,Abbeville,MS,2078,1033,1042


In [97]:
diversity_df = merge[['Hispanic','White', 'Black', 'Native', 'Asian', 'Pacific', 'City', 'State_Id']].groupby(['City', 'State_Id'],as_index=False).mean()
diversity_df[['Hispanic','White', 'Black', 'Native', 'Asian', 'Pacific']] = diversity_df[['Hispanic','White', 'Black', 'Native', 'Asian', 'Pacific']].round(1)
diversity_df.head()

Unnamed: 0,City,State_Id,Hispanic,White,Black,Native,Asian,Pacific
0,Aaronsburg,PA,1.0,97.4,0.0,0.0,0.7,0.0
1,Abbeville,AL,3.6,64.2,31.4,0.0,0.0,0.0
2,Abbeville,GA,2.7,58.7,37.1,0.4,0.0,0.0
3,Abbeville,LA,3.9,77.0,14.2,0.3,2.1,0.0
4,Abbeville,MS,4.7,60.9,31.3,0.0,0.8,0.0


In [98]:
diversity_df = total_pop_df.merge(diversity_df, on = ['City', 'State_Id'])
diversity_df.head()

Unnamed: 0,City,State_Id,TotalPop,Men,Women,Hispanic,White,Black,Native,Asian,Pacific
0,Aaronsburg,PA,1056,532,524,1.0,97.4,0.0,0.0,0.7,0.0
1,Abbeville,AL,6693,3361,3329,3.6,64.2,31.4,0.0,0.0,0.0
2,Abbeville,GA,4683,3205,1478,2.7,58.7,37.1,0.4,0.0,0.0
3,Abbeville,LA,25185,12341,12839,3.9,77.0,14.2,0.3,2.1,0.0
4,Abbeville,MS,2078,1033,1042,4.7,60.9,31.3,0.0,0.8,0.0


### Income Per Capita

In [99]:
income_per_capita_df = merge[['Income', 'IncomeErr', 'IncomePerCap', 'IncomePerCapErr','City', 'State_Id']].groupby(['City', 'State_Id'],as_index=False).mean()
income_per_capita_df[['Income', 'IncomeErr', 'IncomePerCap', 'IncomePerCapErr']] = income_per_capita_df[['Income', 'IncomeErr', 'IncomePerCap', 'IncomePerCapErr']].round(1)
income_per_capita_df.to_csv("income_per_capita_df.csv",index=False)
income_per_capita_df.head()

Unnamed: 0,City,State_Id,Income,IncomeErr,IncomePerCap,IncomePerCapErr
0,Aaronsburg,PA,10258.0,1258.0,4516.0,324.0
1,Abbeville,AL,27895.8,6168.8,15016.5,2161.2
2,Abbeville,GA,14165.7,3522.0,5631.3,1339.0
3,Abbeville,LA,26087.2,5041.7,12330.7,1597.9
4,Abbeville,MS,7885.3,1314.7,4145.0,634.3


### Poverty

In [100]:
poverty_df = merge[['Poverty', 'ChildPoverty','City', 'State_Id']].groupby(['City', 'State_Id'],as_index=False).sum()
poverty_df.to_csv("poverty_df.csv",index=False)
poverty_df.head()

Unnamed: 0,City,State_Id,Poverty,ChildPoverty
0,Aaronsburg,PA,16.4,31.6
1,Abbeville,AL,71.5,126.0
2,Abbeville,GA,75.2,100.5
3,Abbeville,LA,201.7,251.2
4,Abbeville,MS,58.8,87.8


### Industries

In [101]:
industry_df = merge[['Professional', 'Service', 'Office', 'Construction', 'Production','City', 'State_Id']].groupby(['City', 'State_Id'],as_index=False).sum()
industry_df.to_csv("industry_df.csv",index=False)
industry_df.head()

Unnamed: 0,City,State_Id,Professional,Service,Office,Construction,Production
0,Aaronsburg,PA,27.5,21.1,18.9,15.5,16.5
1,Abbeville,AL,109.5,72.2,76.2,39.3,99.5
2,Abbeville,GA,86.0,75.6,42.3,41.6,48.6
3,Abbeville,LA,277.0,169.7,266.4,188.8,135.9
4,Abbeville,MS,107.1,62.7,44.1,24.6,52.3


### Transportation

In [102]:
transportation_df = merge[['Drive', 'Carpool', 'Transit', 'Walk', 'OtherTransp', 'WorkAtHome', 'MeanCommute','City', 'State_Id']].groupby(['City', 'State_Id'],as_index=False).mean()
transportation_df[['Drive', 'Carpool', 'Transit', 'Walk', 'OtherTransp', 'WorkAtHome', 'MeanCommute',]] = transportation_df[['Drive', 'Carpool', 'Transit', 'Walk', 'OtherTransp', 'WorkAtHome', 'MeanCommute']].round(1)
transportation_df.to_csv("transportation_df.csv",index=False)
transportation_df.head()

Unnamed: 0,City,State_Id,Drive,Carpool,Transit,Walk,OtherTransp,WorkAtHome,MeanCommute
0,Aaronsburg,PA,69.5,12.4,0.2,3.8,3.6,10.2,27.7
1,Abbeville,AL,85.0,9.1,0.0,1.0,1.6,2.4,26.7
2,Abbeville,GA,86.2,6.6,0.0,1.5,0.9,2.9,26.8
3,Abbeville,LA,80.1,9.9,0.1,3.1,1.5,2.8,29.1
4,Abbeville,MS,83.4,11.3,0.4,0.3,0.0,1.5,25.0


### Employment

In [103]:
employment_df = merge[['Employed', 'Unemployment','City', 'State_Id']].groupby(['City', 'State_Id'],as_index=False).sum()
employment_df.to_csv('employment_df.csv', index=False)
employment_df.head()


Unnamed: 0,City,State_Id,Employed,Unemployment
0,Aaronsburg,PA,502,3.1
1,Abbeville,AL,2485,31.0
2,Abbeville,GA,824,18.0
3,Abbeville,LA,10437,76.5
4,Abbeville,MS,887,17.3


### Employment Type

In [104]:
employment_ratio_df = merge[['PrivateWork', 'PublicWork', 'SelfEmployed', 'FamilyWork','City', 'State_Id']].groupby(['City', 'State_Id'],as_index=False).mean()
employment_ratio_df[['PrivateWork', 'PublicWork', 'SelfEmployed', 'FamilyWork']] = employment_ratio_df[['PrivateWork', 'PublicWork', 'SelfEmployed', 'FamilyWork']].round(1)
employment_ratio_df.to_csv('employment_ratio_df.csv', index=False)
employment_ratio_df.head()

Unnamed: 0,City,State_Id,PrivateWork,PublicWork,SelfEmployed,FamilyWork
0,Aaronsburg,PA,76.5,10.8,12.4,0.0
1,Abbeville,AL,79.5,14.7,5.2,0.0
2,Abbeville,GA,59.3,30.3,9.5,0.0
3,Abbeville,LA,81.2,11.5,5.2,0.0
4,Abbeville,MS,71.7,20.5,6.7,0.0


Total_DF

In [105]:
census_df = diversity_df.merge(income_per_capita_df, on=['City', 'State_Id'])
census_df = census_df.merge(poverty_df, on=['City', 'State_Id'])
census_df = census_df.merge(employment_df, on=['City', 'State_Id'])
census_df = census_df.merge(employment_ratio_df, on=['City', 'State_Id'])
census_df = census_df.merge(industry_df, on=['City', 'State_Id'])
census_df = census_df.merge(transportation_df, on=['City', 'State_Id'])


In [106]:
census_df.head()

Unnamed: 0,City,State_Id,TotalPop,Men,Women,Hispanic,White,Black,Native,Asian,...,Office,Construction,Production,Drive,Carpool,Transit,Walk,OtherTransp,WorkAtHome,MeanCommute
0,Aaronsburg,PA,1056,532,524,1.0,97.4,0.0,0.0,0.7,...,18.9,15.5,16.5,69.5,12.4,0.2,3.8,3.6,10.2,27.7
1,Abbeville,AL,6693,3361,3329,3.6,64.2,31.4,0.0,0.0,...,76.2,39.3,99.5,85.0,9.1,0.0,1.0,1.6,2.4,26.7
2,Abbeville,GA,4683,3205,1478,2.7,58.7,37.1,0.4,0.0,...,42.3,41.6,48.6,86.2,6.6,0.0,1.5,0.9,2.9,26.8
3,Abbeville,LA,25185,12341,12839,3.9,77.0,14.2,0.3,2.1,...,266.4,188.8,135.9,80.1,9.9,0.1,3.1,1.5,2.8,29.1
4,Abbeville,MS,2078,1033,1042,4.7,60.9,31.3,0.0,0.8,...,44.1,24.6,52.3,83.4,11.3,0.4,0.3,0.0,1.5,25.0


In [107]:
census_df.to_csv('census_by_city.csv', index=False)