# Census Housing by Age and State ETL

In this notebook, data is extracted from the Census Bureau's American Community Survey 5-Year Data and loaded into a MongoDB database.

We will be using the Subject Tables, extracting variables from the DEMOGRAPHIC CHARACTERISTICS FOR OCCUPIED HOUSING UNITS group (S2502). The variables in question are the percentages of Total Occupied Homes, Owner Occupied Homes, and Renter Occupied Homes by age and state over the years 2010-2019.




In [1]:
#dependencies
import pandas as pd
import numpy as np
import pymongo
from census import Census
# The API key is imported from the config module rather than written into the notebook
from config import census_key
# Alias under which the key is handed to the Census client in the extraction cell
api_key = census_key

### Extract and Transform

In [2]:
# Transformation applied to every dataset returned by the API

def transform_census_age_data(census_data, year):
    """Build a DataFrame from one API response and label its age columns.

    Any column whose name ends in one of the age-bracket codes 011E-017E is
    renamed to the matching bracket label; every other column keeps its name.
    A "year" column holding ``year`` on every row is then added.

    Parameters
    ----------
    census_data
        Anything ``pd.DataFrame`` accepts (presumably the list of row dicts
        returned by the Census client -- confirm against the caller).
    year
        Value written to the "year" column.

    Returns
    -------
    pandas.DataFrame
        The labelled frame with the extra "year" column.
    """
    # Last four characters of the variable name -> age-bracket label
    labels_by_code = {
        "011E": "<35",
        "012E": "35-44",
        "013E": "45-54",
        "014E": "55-64",
        "015E": "65-74",
        "016E": "75-84",
        "017E": ">85",
    }

    def label_for(column):
        # Columns without a known suffix (e.g. NAME, state) are left untouched
        return labels_by_code.get(column[-4:], column)

    frame = pd.DataFrame(census_data).rename(columns=label_for)
    frame["year"] = year
    return frame

In [3]:
# API calls and transformation

# Suffixes of the seven age-bracket variables requested from every S2502 column
AGE_ROW_CODES = ("011E", "012E", "013E", "014E", "015E", "016E", "017E")

# S2502 column code that holds each dataset. The variable codes changed in 2017,
# so a different column is needed before and after that year.
S2502_COLUMNS_FROM_2017 = {"totals": "C02", "owner_occupied": "C04", "renter_occupied": "C06"}
S2502_COLUMNS_BEFORE_2017 = {"totals": "C01", "owner_occupied": "C02", "renter_occupied": "C03"}


def s2502_age_fields(column_code):
    """Return the fields to request for one S2502 column: "NAME" plus its seven age variables."""
    return ("NAME",) + tuple(
        "S2502_{}_{}".format(column_code, row_code) for row_code in AGE_ROW_CODES
    )


#dictionary to hold output: one list of per-year dataframes for each dataset
census_extract = {
    "totals" : [],
    "owner_occupied" : [],
    "renter_occupied" : []
}

for input_year in range(2010, 2020):

    c = Census(api_key, year=input_year)

    #accommodate the change in variable codes in 2017
    if input_year >= 2017:
        column_codes = S2502_COLUMNS_FROM_2017
    else:
        column_codes = S2502_COLUMNS_BEFORE_2017

    for dataset_name, column_code in column_codes.items():
        # one request per dataset, covering every state
        state_rows = c.acs5st.get(s2502_age_fields(column_code), {'for': 'state:*'})

        # apply transformation function and append to output dictionary
        census_extract[dataset_name].append(transform_census_age_data(state_rows, input_year))


In [12]:
# Stack the per-year dataframes into one large dataframe for each set
census_combined = {}
for set_name, yearly_frames in census_extract.items():
    census_combined[set_name] = pd.concat(yearly_frames)

census_combined["totals"]

Unnamed: 0,NAME,<35,35-44,45-54,55-64,65-74,75-84,>85,state,year
0,Alabama,20.1,18.4,21.0,18.0,12.3,7.7,2.5,01,2010
1,Alaska,23.9,20.0,25.1,18.6,8.1,3.5,0.8,02,2010
2,Arizona,22.5,18.8,19.7,17.0,12.0,7.5,2.6,04,2010
3,Arkansas,21.6,17.9,19.8,17.6,12.5,7.9,2.7,05,2010
4,California,20.7,21.1,22.2,16.8,9.9,6.6,2.6,06,2010
...,...,...,...,...,...,...,...,...,...,...
47,Washington,21.3,17.6,17.9,19.1,14.4,6.8,2.9,53,2019
48,West Virginia,16.0,14.9,17.5,20.8,17.6,9.4,3.7,54,2019
49,Wisconsin,19.9,16.1,18.2,20.1,14.5,7.7,3.6,55,2019
50,Wyoming,22.0,16.4,16.2,20.2,15.3,7.2,2.8,56,2019


In [6]:
# Convert each combined dataframe to a list of record dictionaries so the data
# can be loaded into MongoDB. The list follows the key order of census_combined.
census_as_dictionaries = [
    combined_frame.reset_index().to_dict("records")
    for combined_frame in census_combined.values()
]

### Load

In [7]:
#establish MongoDB connection
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)
db = client.housing_db

# Target collection for each entry of census_as_dictionaries, in the same order:
# totals, owner-occupied, renter-occupied
collection_names = [
    "total_housing_by_age",
    "owner_occupied_housing_by_age",
    "renter_occupied_housing_by_age",
]

for position, collection_name in enumerate(collection_names):
    records = census_as_dictionaries[position]
    collection = db[collection_name]

    # drop first so that re-running this cell does not duplicate documents
    collection.drop()

    # insert_many rejects an empty list, so only call it when there is data
    if records:
        collection.insert_many(records)