# Census Housing by Age and State ETL

In this notebook, data is extracted from the Census Bureau's American Community Survey 5-Year Data and loaded into a MongoDB database.

We will be using the Subject Tables, extracting variables from the DEMOGRAPHIC CHARACTERISTICS FOR OCCUPIED HOUSING UNITS group (S2502). The variables in question are percentages of Total Occupied Homes, Owner Occupied Homes, and Renter Occupied Homes by age and state over the years 2010-2019.




In [7]:
#dependencies
import pandas as pd
import numpy as np
import pymongo
from census import Census
# The Census API key is imported from a separate `config` module rather than
# being written into the notebook; that module is not shown here.
from config import census_key
# Alias used by the API-call cell when constructing the Census client.
api_key = census_key

### Extract and Transform

In [38]:
#transformation function to apply to every dataset

def transform_census_age_data(census_data, occupied):
    """Build a DataFrame from one API response and give its age columns readable names.

    Parameters
    ----------
    census_data
        Anything ``pd.DataFrame`` accepts; the notebook passes the records
        returned by ``c.acs5st.get``.
    occupied
        Tenure label ("owner" / "renter" at the call sites). Kept so existing
        calls keep working, but the function does not use it.

    Returns
    -------
    pd.DataFrame
        The input as a frame where every column whose name ends in one of the
        codes 011E-017E is renamed to its age bracket, and the last column
        is removed.
    """
    # Last four characters of the variable name -> age bracket label
    age_labels = {
        "011E": "<35",
        "012E": "35-44",
        "013E": "45-54",
        "014E": "55-64",
        "015E": "65-74",
        "016E": "75-84",
        "017E": ">85",
    }

    frame = pd.DataFrame(census_data)

    # Only columns with a recognised suffix are renamed; all others keep their name
    readable = frame.rename(
        columns={
            column: age_labels[column[-4:]]
            for column in frame.columns
            if column[-4:] in age_labels
        }
    )

    # The final column is dropped purely by position.
    # NOTE(review): presumably this is the geography code the API appends after
    # the requested fields - confirm against the raw response.
    return readable.iloc[:, :-1]

In [19]:
# API calls: request the S2502 age-bracket variables from the 2018 ACS 5-Year
# Subject Tables, once for the nation and once for every state, separately for
# the owner (C04) and renter (C06) columns.

c = Census(api_key, year=2018)

# Fields requested for each tenure type; declared once and reused for both geographies
owner_fields = ("NAME", "S2502_C04_011E", "S2502_C04_012E", "S2502_C04_013E",
                "S2502_C04_014E", "S2502_C04_015E", "S2502_C04_016E", "S2502_C04_017E")
renter_fields = ("NAME", "S2502_C06_011E", "S2502_C06_012E", "S2502_C06_013E",
                 "S2502_C06_014E", "S2502_C06_015E", "S2502_C06_016E", "S2502_C06_017E")

# national level
owner_us_occupied_data = c.acs5st.get(owner_fields, {'for': 'us:*'})
renter_us_occupied_data = c.acs5st.get(renter_fields, {'for': 'us:*'})

# state level
owner_state_occupied_data = c.acs5st.get(owner_fields, {'for': 'state:*'})
renter_state_occupied_data = c.acs5st.get(renter_fields, {'for': 'state:*'})


In [40]:
#apply transformation function to each raw response, in the same order as before

(
    owner_us_occupied_df,
    renter_us_occupied_df,
    owner_state_occupied_df,
    renter_state_occupied_df,
) = (
    transform_census_age_data(raw_records, tenure)
    for raw_records, tenure in (
        (owner_us_occupied_data, "owner"),
        (renter_us_occupied_data, "renter"),
        (owner_state_occupied_data, "owner"),
        (renter_state_occupied_data, "renter"),
    )
)

In [64]:
# Stack the national frame ahead of the state frame for each tenure type
owner_frames = [owner_us_occupied_df, owner_state_occupied_df]
renter_frames = [renter_us_occupied_df, renter_state_occupied_df]

owner_merged = pd.concat(owner_frames)
renter_merged = pd.concat(renter_frames)

In [65]:
# Outer-join owner and renter figures on the geography name; the overlapping
# age columns receive pandas' default _x (owner) and _y (renter) suffixes.
merged_df = owner_merged.merge(renter_merged, how="outer", on="NAME")
merged_df

Unnamed: 0,NAME,<35_x,35-44_x,45-54_x,55-64_x,65-74_x,75-84_x,>85_x,<35_y,35-44_y,45-54_y,55-64_y,65-74_y,75-84_y,>85_y
0,United States,9.9,15.6,20.7,23.0,17.9,9.4,3.5,35.0,20.0,16.5,13.3,8.0,4.3,2.7
1,Wisconsin,10.9,15.8,21.0,23.6,16.9,8.7,3.1,38.6,16.7,14.3,12.8,7.9,5.3,4.4
2,Wyoming,13.8,15.5,18.1,23.7,17.8,8.2,2.9,42.4,17.7,13.4,13.0,7.2,3.8,2.4
3,Puerto Rico,5.6,13.2,18.8,22.4,22.3,13.2,4.4,27.8,20.5,17.8,15.3,11.2,5.7,1.7
4,Mississippi,10.0,15.2,19.7,22.8,18.7,10.3,3.4,35.7,21.1,16.8,13.9,7.6,3.4,1.5
5,Missouri,11.6,15.4,19.5,22.8,17.7,9.7,3.4,37.9,17.6,15.2,13.7,8.0,4.5,3.0
6,Montana,10.6,14.7,17.6,24.9,19.6,9.7,3.0,39.8,16.4,13.6,13.4,8.1,5.0,3.5
7,Nebraska,13.4,16.5,19.2,22.4,16.3,8.8,3.4,42.4,17.0,13.2,11.9,6.8,4.7,4.1
8,Nevada,10.8,16.0,19.4,22.2,19.4,9.6,2.6,31.5,21.5,18.3,14.2,9.3,3.7,1.5
9,New Hampshire,8.5,14.5,22.7,25.7,17.8,7.9,2.9,33.2,17.1,15.8,15.2,8.6,6.0,3.9


In [81]:
# Age brackets in the order their columns appear in merged_df:
# column 0 is NAME, columns 1-7 are the owner values, columns 8-14 the renter values.
age_labels = ["<35", "35-44", "45-54", "55-64", "65-74", "75-84", ">85"]
renter_offset = len(age_labels)

data = {}

for row_pos in range(len(merged_df)):
    # One record per geography: age bracket -> [owner value, renter value]
    data[merged_df.iloc[row_pos, 0]] = {
        label: [
            merged_df.iloc[row_pos, col_pos],
            merged_df.iloc[row_pos, col_pos + renter_offset],
        ]
        for col_pos, label in enumerate(age_labels, start=1)
    }

# Single document that the load step inserts into MongoDB
output_dictionary = {
    "geographies" : list(merged_df["NAME"]),
    "labels" : ["Owner Occupied", "Renter Occupied"],
    "data": data
}

output_dictionary

{'geographies': ['United States',
  'Wisconsin',
  'Wyoming',
  'Puerto Rico',
  'Mississippi',
  'Missouri',
  'Montana',
  'Nebraska',
  'Nevada',
  'New Hampshire',
  'New Jersey',
  'New Mexico',
  'New York',
  'North Carolina',
  'North Dakota',
  'Ohio',
  'Oklahoma',
  'Oregon',
  'Pennsylvania',
  'Rhode Island',
  'South Carolina',
  'South Dakota',
  'Tennessee',
  'Texas',
  'Vermont',
  'Utah',
  'Virginia',
  'Washington',
  'West Virginia',
  'Alabama',
  'Alaska',
  'Arizona',
  'Arkansas',
  'California',
  'Colorado',
  'Delaware',
  'District of Columbia',
  'Connecticut',
  'Florida',
  'Georgia',
  'Idaho',
  'Hawaii',
  'Illinois',
  'Indiana',
  'Iowa',
  'Kansas',
  'Kentucky',
  'Louisiana',
  'Maine',
  'Maryland',
  'Massachusetts',
  'Michigan',
  'Minnesota'],
 'labels': ['Owner Occupied', 'Renter Occupied'],
 'data': {'United States': {'<35': [9.9, 35.0],
   '35-44': [15.6, 20.0],
   '45-54': [20.7, 16.5],
   '55-64': [23.0, 13.3],
   '65-74': [17.9, 8.0],

### Load

In [84]:
# Establish the MongoDB connection (local server, default port)
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)
db = client.housing_db

# Replace any previous load: drop the collection first so that re-running this
# cell leaves exactly one document instead of accumulating duplicates.
# The former bare find() call is removed: its cursor was created and discarded
# without being iterated, so it had no effect.
db.census_housing_age.drop()

# Insert the assembled document; insert_one adds an "_id" key to
# output_dictionary in place.
db.census_housing_age.insert_one(output_dictionary)

<pymongo.results.InsertOneResult at 0x7fb424c81c88>