# Census Housing by Demographics and State ETL

In this notebook data is extracted from the Census Bureau's American Community Survey 5-Year Data and loaded into a MongoDB database.

We will be using the Subject Tables, extracting variables from the DEMOGRAPHIC CHARACTERISTICS FOR OCCUPIED HOUSING UNITS group (S2502). The variables in question are estimated counts of Owner Occupied Homes and Renter Occupied Homes, broken down by age and by race, for the United States and for each state, taken from the 2019 ACS 5-Year data.




In [1]:
#dependencies
import pandas as pd
import numpy as np
import pymongo
from census import Census
from config import census_key
api_key = census_key

### Extract and Transform Age Data

In [2]:
#transformation function to apply to every dataset 

def transform_census_age_data(census_data):
    """Build a DataFrame from Census API records and label the age columns.

    Every column whose name ends in one of the age-bracket variable suffixes
    ("011E" through "017E") is renamed to the matching bracket label; all
    other columns keep their names. The last column of the frame is then
    dropped.

    Args:
        census_data: Anything ``pd.DataFrame`` accepts; in this notebook, the
            records returned by ``c.acs5st.get``.

    Returns:
        A new DataFrame with relabelled columns and without its last column.
    """
    # Variable-code suffix -> age bracket label.
    age_labels = {
        "011E": "<35",
        "012E": "35-44",
        "013E": "45-54",
        "014E": "55-64",
        "015E": "65-74",
        "016E": "75-84",
        "017E": ">85",
    }

    frame = pd.DataFrame(census_data)

    # Look each column up by its last four characters; unknown suffixes fall
    # back to the original column name.
    relabelled = frame.rename(columns=lambda col: age_labels.get(col[-4:], col))

    # Positional drop of the trailing column -- presumably the geography
    # identifier ("us"/"state") that the API appends; TODO confirm.
    return relabelled.iloc[:, :-1]

In [3]:
# API calls: age-bracket variables for owner- and renter-occupied homes,
# requested once for the nation and once for every state.

c = Census(api_key, year=2019)

# The C03 variables feed the owner_* results and the C05 variables feed the
# renter_* results; suffixes 011E-017E are the age brackets that
# transform_census_age_data relabels.
_owner_age_fields = ("NAME", "S2502_C03_011E", "S2502_C03_012E", "S2502_C03_013E",
                     "S2502_C03_014E", "S2502_C03_015E", "S2502_C03_016E", "S2502_C03_017E")
_renter_age_fields = ("NAME", "S2502_C05_011E", "S2502_C05_012E", "S2502_C05_013E",
                      "S2502_C05_014E", "S2502_C05_015E", "S2502_C05_016E", "S2502_C05_017E")

owner_us_occupied_data = c.acs5st.get(_owner_age_fields, {'for': 'us:*'})
owner_state_occupied_data = c.acs5st.get(_owner_age_fields, {'for': 'state:*'})
renter_us_occupied_data = c.acs5st.get(_renter_age_fields, {'for': 'us:*'})
renter_state_occupied_data = c.acs5st.get(_renter_age_fields, {'for': 'state:*'})


In [4]:
# Run the age transformation over each of the four raw API results,
# keeping the original order: owner/renter national, then owner/renter state.

(owner_us_occupied_df,
 renter_us_occupied_df,
 owner_state_occupied_df,
 renter_state_occupied_df) = (
    transform_census_age_data(raw)
    for raw in (
        owner_us_occupied_data,
        renter_us_occupied_data,
        owner_state_occupied_data,
        renter_state_occupied_data,
    )
)

In [5]:
# Merge: stack the national row on top of the state rows for each tenure,
# then join owners and renters on the geography name. Columns that share a
# label receive pandas' default "_x" (owner) / "_y" (renter) suffixes.

owner_merged = pd.concat((owner_us_occupied_df, owner_state_occupied_df))
renter_merged = pd.concat((renter_us_occupied_df, renter_state_occupied_df))
merged_df = owner_merged.merge(renter_merged, on="NAME", how="outer")
merged_df.head()

Unnamed: 0,NAME,<35_x,35-44_x,45-54_x,55-64_x,65-74_x,75-84_x,>85_x,<35_y,35-44_y,45-54_y,55-64_y,65-74_y,75-84_y,>85_y
0,United States,7671585.0,12004992.0,15579130.0,17776092.0,14141320.0,7404225.0,2697037.0,15158389.0,8650343.0,7056648.0,5853787.0,3623508.0,1942024.0,1196968.0
1,Alabama,132804.0,187449.0,247103.0,293618.0,244373.0,136417.0,42984.0,209080.0,114400.0,92630.0,80580.0,51227.0,23724.0,11504.0
2,Alaska,20113.0,28951.0,33290.0,40497.0,27129.0,10061.0,2955.0,38675.0,16665.0,14347.0,11692.0,6183.0,1979.0,809.0
3,Arizona,163278.0,241304.0,300476.0,358348.0,341817.0,193260.0,58273.0,330963.0,186697.0,148863.0,115140.0,74038.0,37013.0,21798.0
4,Arkansas,83039.0,114549.0,141431.0,166048.0,146148.0,82092.0,26148.0,152659.0,73945.0,59312.0,54449.0,31717.0,16589.0,9945.0


In [6]:
# Row-wise totals across the age brackets. Column 0 is NAME; positional
# columns 1-7 are the owner brackets and 8-14 the renter brackets.
# The built-in sum() starts from the first bracket column and adds the
# remaining ones left to right.
merged_df["total_owner_occupied"] = sum(
    (merged_df.iloc[:, pos] for pos in range(2, 8)),
    merged_df.iloc[:, 1],
)
merged_df["total_renter_occupied"] = sum(
    (merged_df.iloc[:, pos] for pos in range(9, 15)),
    merged_df.iloc[:, 8],
)
merged_df.head()

Unnamed: 0,NAME,<35_x,35-44_x,45-54_x,55-64_x,65-74_x,75-84_x,>85_x,<35_y,35-44_y,45-54_y,55-64_y,65-74_y,75-84_y,>85_y,total_owner_occupied,total_renter_occupied
0,United States,7671585.0,12004992.0,15579130.0,17776092.0,14141320.0,7404225.0,2697037.0,15158389.0,8650343.0,7056648.0,5853787.0,3623508.0,1942024.0,1196968.0,77274381.0,43481667.0
1,Alabama,132804.0,187449.0,247103.0,293618.0,244373.0,136417.0,42984.0,209080.0,114400.0,92630.0,80580.0,51227.0,23724.0,11504.0,1284748.0,583145.0
2,Alaska,20113.0,28951.0,33290.0,40497.0,27129.0,10061.0,2955.0,38675.0,16665.0,14347.0,11692.0,6183.0,1979.0,809.0,162996.0,90350.0
3,Arizona,163278.0,241304.0,300476.0,358348.0,341817.0,193260.0,58273.0,330963.0,186697.0,148863.0,115140.0,74038.0,37013.0,21798.0,1656756.0,914512.0
4,Arkansas,83039.0,114549.0,141431.0,166048.0,146148.0,82092.0,26148.0,152659.0,73945.0,59312.0,54449.0,31717.0,16589.0,9945.0,759455.0,398616.0


In [7]:
# Convert the merged frame to {region name: [values...]}.
# Each region reads its OWN row (positional row i). Indexing row 0 here would
# store the first row's values under every region name.
# The value order follows the frame's columns 1..end, which lines up with the
# "labels" list below (owner brackets, renter brackets, then the two totals).
data = {
    merged_df.iloc[i, 0]: merged_df.iloc[i, 1:].tolist()
    for i in range(len(merged_df))
}

# Geography list for the output: state names alphabetically, with the
# national entry placed first.
geo = ["United States"] + sorted(owner_state_occupied_df["NAME"])

# Dictionary to hold output
age_output_dictionary = {
    "metadata" : {"title": "Owner Occupied vs Renter Occupied Homes by Age"},
    "geographies" : geo,
    "labels" : ["<35 year old owners", "35-44 year old owners", "45-54 year old owners", "55-64 year old owners", "65-74 year old owners", "75-84 year old owners", ">85 year old owners", 
                "<35 year old renters", "35-44 year old renters", "45-54 year old renters", "55-64 year old renters", "65-74 year old renters", "75-84 year old renters", ">85 year old renters",
               "total owner occupied", "total renter occupied"],
    "data": data
}

age_output_dictionary

{'metadata': {'title': 'Owner Occupied vs Renter Occupied Homes by Age'},
 'geographies': ['United States',
  'Alabama',
  'Alaska',
  'Arizona',
  'Arkansas',
  'California',
  'Colorado',
  'Connecticut',
  'Delaware',
  'District of Columbia',
  'Florida',
  'Georgia',
  'Hawaii',
  'Idaho',
  'Illinois',
  'Indiana',
  'Iowa',
  'Kansas',
  'Kentucky',
  'Louisiana',
  'Maine',
  'Maryland',
  'Massachusetts',
  'Michigan',
  'Minnesota',
  'Mississippi',
  'Missouri',
  'Montana',
  'Nebraska',
  'Nevada',
  'New Hampshire',
  'New Jersey',
  'New Mexico',
  'New York',
  'North Carolina',
  'North Dakota',
  'Ohio',
  'Oklahoma',
  'Oregon',
  'Pennsylvania',
  'Puerto Rico',
  'Rhode Island',
  'South Carolina',
  'South Dakota',
  'Tennessee',
  'Texas',
  'Utah',
  'Vermont',
  'Virginia',
  'Washington',
  'West Virginia',
  'Wisconsin',
  'Wyoming'],
 'labels': ['<35 year old owners',
  '35-44 year old owners',
  '45-54 year old owners',
  '55-64 year old owners',
  '65-74 y

In [8]:
# #convert data to dictionary

# data = {}

# for i in range(len(merged_df)): 
#     region = merged_df.iloc[i,0]
#     record = {
#         "<35" : [merged_df.iloc[i,1],merged_df.iloc[i,8]],
#         "35-44" : [merged_df.iloc[i,2],merged_df.iloc[i,9]],
#         "45-54": [merged_df.iloc[i,3],merged_df.iloc[i,10]],
#         "55-64": [merged_df.iloc[i,4],merged_df.iloc[i,11]],
#         "65-74": [merged_df.iloc[i,5],merged_df.iloc[i,12]],
#         "75-84": [merged_df.iloc[i,6],merged_df.iloc[i,13]],
#         ">85": [merged_df.iloc[i,7],merged_df.iloc[i,14]],
#     }
    
#     data[region] = record

# #dictionary to hold output

# geo = [name for name in owner_state_occupied_df["NAME"]]
# geo.sort()
# geo.insert(0, "United States") 

# output_dictionary = {
#     "geographies" : geo,
#     "labels" : ["Owner Occupied", "Renter Occupied"],
#     "data": data
# }

# output_dictionary

### Extract and Transform Race Data

In [9]:
def transform_census_race_data(census_data):
    """Build a DataFrame from Census API records and label the race columns.

    Every column whose name ends in one of the race/ethnicity variable
    suffixes ("003E" through "010E") is renamed to the matching label; all
    other columns keep their names. The last column of the frame is then
    dropped.

    Args:
        census_data: Anything ``pd.DataFrame`` accepts; in this notebook, the
            records returned by ``c.acs5st.get``.

    Returns:
        A new DataFrame with relabelled columns and without its last column.
    """
    # Variable-code suffix -> race/ethnicity label.
    race_labels = {
        "003E": "african american",
        "004E": "american indian",
        "005E": "asian",
        "006E": "native hawaiian",
        "007E": "single race (other)",
        "008E": "two of more races",
        "009E": "hispanic/latino",
        "010E": "white",
    }

    frame = pd.DataFrame(census_data)

    # Look each column up by its last four characters; unknown suffixes fall
    # back to the original column name.
    relabelled = frame.rename(columns=lambda col: race_labels.get(col[-4:], col))

    # Positional drop of the trailing column -- presumably the geography
    # identifier ("us"/"state") that the API appends; TODO confirm.
    return relabelled.iloc[:, :-1]

In [10]:
# API calls: race/ethnicity variables for owner- and renter-occupied homes,
# requested once for the nation and once for every state.
# Note: this cell rebinds the same *_occupied_data names that the age
# section used.

c = Census(api_key, year=2019)

# The C03 variables feed the owner_* results and the C05 variables feed the
# renter_* results; suffixes 003E-010E are the categories that
# transform_census_race_data relabels.
_owner_race_fields = ("NAME", "S2502_C03_003E", "S2502_C03_004E", "S2502_C03_005E",
                      "S2502_C03_006E", "S2502_C03_007E", "S2502_C03_008E",
                      "S2502_C03_009E", "S2502_C03_010E")
_renter_race_fields = ("NAME", "S2502_C05_003E", "S2502_C05_004E", "S2502_C05_005E",
                       "S2502_C05_006E", "S2502_C05_007E", "S2502_C05_008E",
                       "S2502_C05_009E", "S2502_C05_010E")

owner_us_occupied_data = c.acs5st.get(_owner_race_fields, {'for': 'us:*'})
owner_state_occupied_data = c.acs5st.get(_owner_race_fields, {'for': 'state:*'})
renter_us_occupied_data = c.acs5st.get(_renter_race_fields, {'for': 'us:*'})
renter_state_occupied_data = c.acs5st.get(_renter_race_fields, {'for': 'state:*'})

In [11]:
# Run the race transformation over each of the four raw API results,
# keeping the original order: owner/renter national, then owner/renter state.
(owner_us_occupied_df,
 renter_us_occupied_df,
 owner_state_occupied_df,
 renter_state_occupied_df) = (
    transform_census_race_data(raw)
    for raw in (
        owner_us_occupied_data,
        renter_us_occupied_data,
        owner_state_occupied_data,
        renter_state_occupied_data,
    )
)

In [12]:
# Stack the national row on top of the state rows for each tenure, then join
# owners and renters on the geography name. Columns that share a label
# receive pandas' default "_x" (owner) / "_y" (renter) suffixes.
owner_merged = pd.concat((owner_us_occupied_df, owner_state_occupied_df))
renter_merged = pd.concat((renter_us_occupied_df, renter_state_occupied_df))
merged_df = owner_merged.merge(renter_merged, on="NAME", how="outer")

In [13]:
# Row-wise totals across the race/ethnicity columns. Column 0 is NAME;
# positional columns 1-8 are the owner categories and 9-16 the renter ones.
# The built-in sum() starts from the first category column and adds the
# remaining ones left to right.
# NOTE(review): in this notebook's recorded outputs the United States owner
# total computed here (79,364,876) is larger than the age-based owner total
# (77,274,381) -- the summed categories may overlap; confirm before treating
# these as housing-unit totals.
merged_df["total_owner_occupied"] = sum(
    (merged_df.iloc[:, pos] for pos in range(2, 9)),
    merged_df.iloc[:, 1],
)
merged_df["total_renter_occupied"] = sum(
    (merged_df.iloc[:, pos] for pos in range(10, 17)),
    merged_df.iloc[:, 9],
)
merged_df.head()

Unnamed: 0,NAME,african american_x,american indian_x,asian_x,native hawaiian_x,single race (other)_x,two of more races_x,hispanic/latino_x,white_x,african american_y,american indian_y,asian_y,native hawaiian_y,single race (other)_y,two of more races_y,hispanic/latino_y,white_y,total_owner_occupied,total_renter_occupied
0,United States,6225458.0,481954.0,3408305.0,65857.0,1717234.0,1228602.0,7509839.0,58727627.0,8657739.0,405574.0,2314198.0,94713.0,2587399.0,1279058.0,8382274.0,22914494.0,79364876.0,46635449.0
1,Alabama,248861.0,7115.0,12363.0,263.0,7595.0,15198.0,26390.0,975636.0,241072.0,2733.0,8962.0,305.0,9522.0,9071.0,26691.0,296278.0,1293421.0,594634.0
2,Alaska,3092.0,17630.0,6564.0,545.0,1572.0,7475.0,7088.0,121623.0,5126.0,12096.0,5044.0,1484.0,1915.0,6366.0,7486.0,53869.0,165589.0,93386.0
3,Arizona,37621.0,47678.0,46754.0,1437.0,65036.0,29898.0,321232.0,1188841.0,71115.0,39256.0,30327.0,2268.0,59211.0,31017.0,274884.0,483740.0,1738497.0,991818.0
4,Arkansas,75824.0,4319.0,7664.0,123.0,11396.0,11644.0,30601.0,629907.0,98440.0,2951.0,6401.0,1733.0,10676.0,10180.0,27986.0,251992.0,771478.0,410359.0


In [14]:
# Convert the merged frame to {region name: [values...]}.
# Each region reads its OWN row (positional row i). Indexing row 0 here would
# store the first row's values under every region name.
# The value order follows the frame's columns 1..end, which lines up with the
# "labels" list below (owner categories, renter categories, then the two totals).
data = {
    merged_df.iloc[i, 0]: merged_df.iloc[i, 1:].tolist()
    for i in range(len(merged_df))
}

# Geography list for the output: state names alphabetically, with the
# national entry placed first.
geo = ["United States"] + sorted(owner_state_occupied_df["NAME"])

# Dictionary to hold output
race_output_dictionary = {
    "metadata" : {"title": "Owner Occupied vs Renter Occupied Homes by Race"},
    "geographies" : geo,
    "labels" : ["African American owners", "American Indian owners", "Asian owners", "Native Hawaiian owners", "Other (one race) owners", "Two or More Races owners", "Hispanic or Latino owners", "White not Hispanic owners",
                "African American renters", "American Indian renters", "Asian renters", "Native Hawaiian renters", "Other (one race) renters", "Two or More Races renters", "Hispanic or Latino renters", "White not Hispanic renters",
                "total owner occupied", "total renter occupied"],
    "data": data
}

race_output_dictionary

{'metadata': {'title': 'Owner Occupied vs Renter Occupied Homes by Race'},
 'geographies': ['United States',
  'Alabama',
  'Alaska',
  'Arizona',
  'Arkansas',
  'California',
  'Colorado',
  'Connecticut',
  'Delaware',
  'District of Columbia',
  'Florida',
  'Georgia',
  'Hawaii',
  'Idaho',
  'Illinois',
  'Indiana',
  'Iowa',
  'Kansas',
  'Kentucky',
  'Louisiana',
  'Maine',
  'Maryland',
  'Massachusetts',
  'Michigan',
  'Minnesota',
  'Mississippi',
  'Missouri',
  'Montana',
  'Nebraska',
  'Nevada',
  'New Hampshire',
  'New Jersey',
  'New Mexico',
  'New York',
  'North Carolina',
  'North Dakota',
  'Ohio',
  'Oklahoma',
  'Oregon',
  'Pennsylvania',
  'Puerto Rico',
  'Rhode Island',
  'South Carolina',
  'South Dakota',
  'Tennessee',
  'Texas',
  'Utah',
  'Vermont',
  'Virginia',
  'Washington',
  'West Virginia',
  'Wisconsin',
  'Wyoming'],
 'labels': ['African American owners',
  'American Indian owners',
  'Asian owners',
  'Native Hawaiian owners',
  'Other (on

### Load

In [15]:
# Establish the MongoDB connection
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)
db = client.housing_db

# Drop the target collections first so re-running this cell does not pile up
# duplicate documents.
# NOTE(review): census_housing_age is dropped but never written to in this
# notebook -- presumably a leftover collection name; confirm it is safe to drop.
db.census_housing_age.drop()
db.census_housing_demo.drop()

# Load the age and race output documents into census_housing_demo.
# (A bare find() call whose cursor was discarded used to sit here; it had no
# effect and has been removed.)
db.census_housing_demo.insert_one(age_output_dictionary)
db.census_housing_demo.insert_one(race_output_dictionary)



<pymongo.results.InsertOneResult at 0x10e609848>