# Final Project - Idaho Policy Institute 5
## Ryan Pacheco, Ashley Gilbert, Ben Whitehead

Our goal is to identify characteristics which make a city sustainable, then classify cities based on whether they are growing sustainably or not. We will be looking at the cities in Idaho, California, and New York (state) with a population over 50,000.

## Initial Setup
We will start bby loading all of our data sources:
- US Census Data
- American Community Survey Data
- Greenhouse Gas Data (procured by the EPA)

In [1]:
%pip install census us

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import seaborn as sns
import matplotlib as plt
import os

from census import Census
from us import states

import plotly.graph_objects as go

In [3]:
states.ID.shapefile_urls('county')

'https://www2.census.gov/geo/tiger/TIGER2010/COUNTY/2010/tl_2010_16_county10.zip'

In [4]:
#load census data using API key
c = Census('fb97753783c42ae57fe1a640e38fe04e921e5d1a')

#American Community Survey Data for California
i = 0
acs_years_ca = []
for x in range(2012, 2018):
    acs_test = c.acs5.state_place(('NAME',
                                   'B01003_001E',
                                   'B00002_001E',
                                   'B09018_007E',
                                   'B01002_001E'), states.CA.fips, '*', year=x)
    acs_years_ca.append(pd.DataFrame.from_records(acs_test))
    print(x)
    acs_years_ca[i] = acs_years_ca[i].rename(columns={
        'NAME' : 'City_Name',
        'place': 'FIPS',
        'B01003_001E': 'Total_Population_{}'.format(x),
        'B00002_001E': 'Total_Housing_{}'.format(x),
        'B09018_007E': 'Presence_of_Non-Relatives_{}'.format(x),
        'B01002_001E': 'Median_Age_{}'.format(x),
    })
    acs_years_ca[i].set_index('FIPS', inplace=True)
    acs_years_ca[i].drop(columns=['City_Name', 'state'], inplace=True)
    acs_years_ca[i] = acs_years_ca[i].nlargest(5, 'Total_Population_{}'.format(x))
    i = i + 1


#American Community Survey Data for New York
i = 0
acs_years_ny = []
for x in range(2012, 2018):
    acs_test = c.acs5.state_place(('NAME',
                                   'B01003_001E',
                                   'B00002_001E',
                                   'B09018_007E',
                                   'B01002_001E'), states.NY.fips, '*', year=x)
    acs_years_ny.append(pd.DataFrame.from_records(acs_test))
    print(x)
    acs_years_ny[i] = acs_years_ny[i].rename(columns={
        'NAME' : 'City_Name',
        'place': 'FIPS',
        'B01003_001E': 'Total_Population_{}'.format(x),
        'B00002_001E': 'Total_Housing_{}'.format(x),
        'B09018_007E': 'Presence_of_Non-Relatives_{}'.format(x),
        'B01002_001E': 'Median_Age_{}'.format(x),
    })
    acs_years_ny[i].set_index('FIPS', inplace=True)
    acs_years_ny[i].drop(columns=['City_Name', 'state'], inplace=True)
    acs_years_ny[i] = acs_years_ny[i].nlargest(5, 'Total_Population_{}'.format(x))
    i = i + 1


#American Community Survey Data for Idaho
i = 0
acs_years_id = []
for x in range(2012, 2018):
    acs_test = c.acs5.state_place(('NAME',
                                   'B01003_001E',
                                   'B00002_001E',
                                   'B09018_007E',
                                   'B01002_001E'), states.ID.fips, '*', year=x)
    acs_years_id.append(pd.DataFrame.from_records(acs_test))
    print(x)
    acs_years_id[i] = acs_years_id[i].rename(columns={
        'NAME' : 'City_Name',
        'place': 'FIPS',
        'B01003_001E': 'Total_Population_{}'.format(x),
        'B00002_001E': 'Total_Housing_{}'.format(x),
        'B09018_007E': 'Presence_of_Non-Relatives_{}'.format(x),
        'B01002_001E': 'Median_Age_{}'.format(x),
    })
    acs_years_id[i].set_index('FIPS', inplace=True)
    acs_years_id[i].drop(columns=['City_Name', 'state'], inplace=True)
    acs_years_id[i] = acs_years_id[i].nlargest(5, 'Total_Population_{}'.format(x))
    i = i + 1


#Greenhouse Gas Data
ghg = pd.DataFrame()

for f in os.listdir('data/2018_data_summary_spreadsheets'):
    temp = pd.read_excel('data/2018_data_summary_spreadsheets/'+f, sheet_name=0)
    temp['Year'] = f.split('.')[0].split('_')[2]    
    ghg = pd.concat([temp, ghg])
    
fips_map = pd.read_excel('data/fips-codes.xls', sheet_name=0)

fips_map = fips_map[fips_map['Entity Description'] == 'city']

def str_func(x):
    return str(x).zfill(5)

fips_map['FIPS'] = fips_map['FIPS Entity Code'].apply(str_func)
fips_map['City'] = fips_map['GU Name']
fips_map['State'] = fips_map['State Abbreviation']

ghg_mapped = pd.merge(ghg, fips_map, on=['State', 'City'])
total_emissions = ghg_mapped.groupby(['FIPS','Year'])['Total reported direct emissions'].agg('sum').to_frame()

total_emissions.reset_index(inplace=True)

pivot_em = total_emissions.pivot(index='FIPS', columns='Year', values='Total reported direct emissions')

2012
2013
2014
2015
2016
2017
2012
2013
2014
2015
2016
2017
2012
2013
2014
2015
2016
2017



Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





Now that all of our data is loaded, we will work on putting it all together for analysis

In [5]:
#common variables used for working with the census data.
keys = ['NAME' ,'P002001','P002002','P002005','H001001','P013001','H003001','P027001','H005001','H005002','H005003','H005004','H005005','H005006','H005007']

renames_2000 = {
        'NAME' : 'City_Name',
        'place': 'FIPS',
        'P002001': 'Total_Population_2000',
        'P002002':'Total_Urban_Population_2000',
        'P002005':'Total_Rural_Population_2000',
        'H001001': 'Total_Housing_2000',
        'P013001': 'Median_Age_2000',
        'H003001': 'Occupancy_Status_For_Housing_Units_2000',
        'P027001': 'Presence_of_Non-Relatives_2000',
        'H005001': 'Vacancy_Status_2000',
        'H005002': 'For_Rent_2000',
        'H005003': 'Rented_Not_Occupied_2000',
        'H005004': 'For_Sale_Only_2000',
        'H005005': 'Sold_Not_Occupied_2000',
        'H005006': 'For_Seasonal_Recreational_Or_Occasional_Use_2000',
        'H005007': 'For_Migrant_Workers_2000'
}

renames_2010 = {
        'NAME' : 'City_Name',
        'place': 'FIPS',
        'P002001': 'Total_Population_2010',
        'P002002':'Total_Urban_Population_2010',
        'P002005':'Total_Rural_Population_2010',
        'H001001': 'Total_Housing_2010',
        'P013001': 'Median_Age_2010',
        'H003001': 'Occupancy_Status_For_Housing_Units_2010',
        'P027001': 'Presence_of_Non-Relatives_2010',
        'H005001': 'Vacancy_Status_2010',
        'H005002': 'For_Rent_2010',
        'H005003': 'Rented_Not_Occupied_2010',
        'H005004': 'For_Sale_Only_2010',
        'H005005': 'Sold_Not_Occupied_2010',
        'H005006': 'For_Seasonal_Recreational_Or_Occasional_Use_2010',
        'H005007': 'For_Migrant_Workers_2010'}


## Merge data and start analysis

### California

In [61]:
city_2010 = c.sf1.state_place(keys, states.CA.fips, '*', year=2010)
c_pop_2010 = pd.DataFrame.from_records(city_2010)
c_pop_2010_50000 = c_pop_2010.rename(columns=renames_2010)

In [62]:
c_pop_2010_50000.head()

Unnamed: 0,Total_Housing_2010,Occupancy_Status_For_Housing_Units_2010,Vacancy_Status_2010,For_Rent_2010,Rented_Not_Occupied_2010,For_Sale_Only_2010,Sold_Not_Occupied_2010,For_Seasonal_Recreational_Or_Occasional_Use_2010,For_Migrant_Workers_2010,City_Name,Total_Population_2010,Total_Urban_Population_2010,Total_Rural_Population_2010,Median_Age_2010,Presence_of_Non-Relatives_2010,FIPS,state
0,457.0,457.0,16.0,4.0,1.0,1.0,2.0,2.0,0.0,"Acalanes Ridge CDP, California",1137.0,1137.0,0.0,46.3,441.0,135,6
1,99.0,99.0,5.0,1.0,0.0,0.0,0.0,1.0,0.0,"Acampo CDP, California",341.0,0.0,341.0,30.6,94.0,156,6
2,2814.0,2814.0,154.0,22.0,1.0,42.0,9.0,34.0,0.0,"Acton CDP, California",7596.0,0.0,7596.0,45.5,2660.0,212,6
3,9086.0,9086.0,1277.0,462.0,11.0,323.0,56.0,32.0,0.0,"Adelanto city, California",31765.0,31381.0,384.0,25.3,7809.0,296,6
4,144.0,144.0,20.0,1.0,0.0,1.0,0.0,9.0,0.0,"Adin CDP, California",272.0,0.0,272.0,47.3,124.0,310,6


In [63]:
city_2000 = c.sf1.state_place(keys, states.CA.fips, '*', year=2000)
c_pop_2000 = pd.DataFrame.from_records(city_2000)
c_pop_2000_50000 = c_pop_2000.rename(columns=renames_2000)

In [64]:
c_pop_2000_50000.drop(columns=['City_Name', 'state'], inplace=True)

In [65]:
c_pop_2000_50000.head()

Unnamed: 0,Total_Housing_2000,Occupancy_Status_For_Housing_Units_2000,Vacancy_Status_2000,For_Rent_2000,Rented_Not_Occupied_2000,For_Sale_Only_2000,Sold_Not_Occupied_2000,For_Seasonal_Recreational_Or_Occasional_Use_2000,For_Migrant_Workers_2000,Total_Population_2000,Total_Urban_Population_2000,Total_Rural_Population_2000,Median_Age_2000,Presence_of_Non-Relatives_2000,FIPS
0,915,915,26,12,4,1,1,0,8,2797,0,0,37.3,2797,2812
1,3145,3145,135,61,29,3,11,0,31,12956,0,0,23.3,12956,2924
2,88262,88262,4821,2187,1018,309,268,3,1036,247057,0,0,30.1,247057,3526
3,2147,2147,561,15,69,18,417,0,42,4232,0,0,43.8,4232,4734
4,1188,1188,348,18,46,6,219,0,59,1823,0,0,49.8,1823,7274


In [66]:
c_pop_2000_50000.set_index('FIPS', inplace=True)
c_pop_2010_50000.set_index('FIPS', inplace=True)

In [67]:
ca_join = c_pop_2000_50000.join(c_pop_2010_50000, on='FIPS')

ca_join = ca_join.join(pivot_em, on='FIPS')

In [68]:
ca_join.head()

Unnamed: 0_level_0,Total_Housing_2000,Occupancy_Status_For_Housing_Units_2000,Vacancy_Status_2000,For_Rent_2000,Rented_Not_Occupied_2000,For_Sale_Only_2000,Sold_Not_Occupied_2000,For_Seasonal_Recreational_Or_Occasional_Use_2000,For_Migrant_Workers_2000,Total_Population_2000,...,state,2010,2011,2012,2013,2014,2015,2016,2017,2018
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2812,915,915,26,12,4,1,1,0,8,2797,...,6,,,,,,,,,
2924,3145,3145,135,61,29,3,11,0,31,12956,...,6,,,,,,,,,
3526,88262,88262,4821,2187,1018,309,268,3,1036,247057,...,6,980629.082,906498.922,1155619.396,1207249.466,1848460.71,2086014.066,94153.496,69323.64,18864.88
4734,2147,2147,561,15,69,18,417,0,42,4232,...,6,,,,,,,,,
7274,1188,1188,348,18,46,6,219,0,59,1823,...,6,,,,,,,,,


In [69]:
ca_join['Total_Population_2000'] = ca_join['Total_Population_2000'].astype('i8')

In [70]:
ca_join = ca_join.nlargest(5, 'Total_Population_2000')

In [73]:
fig = go.Figure(data=[
    go.Bar(name='2000_pop', x=ca_join['City_Name'], y=ca_join['Total_Population_2000']),
    go.Bar(name='2010_pop', x=ca_join['City_Name'], y=ca_join['Total_Population_2010']),
    go.Bar(name='2000_housing', x=ca_join['City_Name'], y=ca_join['Total_Housing_2000']),
    go.Bar(name='2010_housing', x=ca_join['City_Name'], y=ca_join['Total_Housing_2010']),
    go.Bar(name='2000_non-relatives', x=ca_join['City_Name'], y=ca_join['Presence_of_Non-Relatives_2000']),
    go.Bar(name='2010_non-relatives', x=ca_join['City_Name'], y=ca_join['Presence_of_Non-Relatives_2010']),
    go.Bar(name='2010_direct_emissions', x=ca_join['City_Name'], y=ca_join['2010']),
    go.Bar(name='2011_direct_emissions', x=ca_join['City_Name'], y=ca_join['2011']),
    go.Bar(name='2012_direct_emissions', x=ca_join['City_Name'], y=ca_join['2012']),
    go.Bar(name='2013_direct_emissions', x=ca_join['City_Name'], y=ca_join['2013']),
    go.Bar(name='2014_direct_emissions', x=ca_join['City_Name'], y=ca_join['2014']),
    go.Bar(name='2015_direct_emissions', x=ca_join['City_Name'], y=ca_join['2015']),
    go.Bar(name='2016_direct_emissions', x=ca_join['City_Name'], y=ca_join['2016']),
    go.Bar(name='2017_direct_emissions', x=ca_join['City_Name'], y=ca_join['2017']),
    go.Bar(name='2018_direct_emissions', x=ca_join['City_Name'], y=ca_join['2018'])
])
fig.update_layout(barmode='group')
fig.show()

## Get's the 5 largest cities in New York

In [86]:
city_2010 = c.sf1.state_place(keys, states.NY.fips, '*', year=2010)
c_pop_2010 = pd.DataFrame.from_records(city_2010)
c_pop_2010_50000 = c_pop_2010.rename(columns=renames_2010)

In [87]:
c_pop_2010_50000.head()

Unnamed: 0,Total_Housing_2010,Occupancy_Status_For_Housing_Units_2010,Vacancy_Status_2010,For_Rent_2010,Rented_Not_Occupied_2010,For_Sale_Only_2010,Sold_Not_Occupied_2010,For_Seasonal_Recreational_Or_Occasional_Use_2010,For_Migrant_Workers_2010,City_Name,Total_Population_2010,Total_Urban_Population_2010,Total_Rural_Population_2010,Median_Age_2010,Presence_of_Non-Relatives_2010,FIPS,state
0,265.0,265.0,49.0,9.0,0.0,9.0,6.0,13.0,0.0,"Accord CDP, New York",562.0,0.0,562.0,40.3,216.0,155,36
1,793.0,793.0,54.0,17.0,0.0,2.0,3.0,10.0,0.0,"Adams village, New York",1775.0,1669.0,106.0,35.6,739.0,199,36
2,629.0,629.0,38.0,10.0,1.0,5.0,2.0,5.0,0.0,"Adams Center CDP, New York",1568.0,701.0,867.0,38.2,591.0,232,36
3,770.0,770.0,73.0,22.0,4.0,10.0,0.0,12.0,0.0,"Addison village, New York",1763.0,0.0,1763.0,36.7,697.0,276,36
4,430.0,430.0,46.0,8.0,0.0,12.0,0.0,9.0,0.0,"Afton village, New York",822.0,0.0,822.0,42.9,384.0,342,36


In [88]:
city_2000 = c.sf1.state_place(keys, states.NY.fips, '*', year=2000)
c_pop_2000 = pd.DataFrame.from_records(city_2000)
c_pop_2000_50000 = c_pop_2000.rename(columns=renames_2000)

In [89]:
c_pop_2000_50000.drop(columns=['City_Name', 'state'], inplace=True)

In [90]:
c_pop_2000_50000.head()

Unnamed: 0,Total_Housing_2000,Occupancy_Status_For_Housing_Units_2000,Vacancy_Status_2000,For_Rent_2000,Rented_Not_Occupied_2000,For_Sale_Only_2000,Sold_Not_Occupied_2000,For_Seasonal_Recreational_Or_Occasional_Use_2000,For_Migrant_Workers_2000,Total_Population_2000,Total_Urban_Population_2000,Total_Rural_Population_2000,Median_Age_2000,Presence_of_Non-Relatives_2000,FIPS
0,1158,1158,86,41,11,8,8,0,18,2597,0,0,35.9,2597,3001
1,823,823,107,31,14,10,20,0,32,1699,0,0,37.1,1699,20346
2,1018,1018,103,44,18,2,3,0,36,2166,0,0,41.2,2166,20731
3,329,329,31,6,9,3,4,0,9,791,0,0,38.3,791,21523
4,5564,5564,550,287,90,47,29,2,95,13617,0,0,31.8,13617,28640


In [91]:
c_pop_2000_50000.set_index('FIPS', inplace=True)
c_pop_2010_50000.set_index('FIPS', inplace=True)

In [92]:
ny_join = c_pop_2000_50000.join(c_pop_2010_50000, on='FIPS')
ny_join = ny_join.join(pivot_em, on='FIPS')

In [93]:
ny_join.head()

Unnamed: 0_level_0,Total_Housing_2000,Occupancy_Status_For_Housing_Units_2000,Vacancy_Status_2000,For_Rent_2000,Rented_Not_Occupied_2000,For_Sale_Only_2000,Sold_Not_Occupied_2000,For_Seasonal_Recreational_Or_Occasional_Use_2000,For_Migrant_Workers_2000,Total_Population_2000,...,state,2010,2011,2012,2013,2014,2015,2016,2017,2018
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3001,1158,1158,86,41,11,8,8,0,18,2597,...,36,,,,,,,,,
20346,823,823,107,31,14,10,20,0,32,1699,...,36,,,,,,,,,
20731,1018,1018,103,44,18,2,3,0,36,2166,...,36,,,,,,,,,
21523,329,329,31,6,9,3,4,0,9,791,...,36,,,,,,,,,
28640,5564,5564,550,287,90,47,29,2,95,13617,...,36,,,,,,,,,


In [94]:
ny_join['Total_Population_2000'] = ny_join['Total_Population_2000'].astype('i8')

In [95]:
ny_join = ny_join.nlargest(5, 'Total_Population_2000')

In [97]:
fig = go.Figure(data=[
    go.Bar(name='2000_pop', x=ny_join['City_Name'], y=ny_join['Total_Population_2000']),
    go.Bar(name='2010_pop', x=ny_join['City_Name'], y=ny_join['Total_Population_2010']),
    go.Bar(name='2000_housing', x=ny_join['City_Name'], y=ny_join['Total_Housing_2000']),
    go.Bar(name='2010_housing', x=ny_join['City_Name'], y=ny_join['Total_Housing_2010']),
    go.Bar(name='2000_non-relatives', x=ny_join['City_Name'], y=ny_join['Presence_of_Non-Relatives_2000']),
    go.Bar(name='2010_non-relatives', x=ny_join['City_Name'], y=ny_join['Presence_of_Non-Relatives_2010']),
    go.Bar(name='2010_direct_emissions', x=ca_join['City_Name'], y=ca_join['2010']),
    go.Bar(name='2011_direct_emissions', x=ca_join['City_Name'], y=ca_join['2011']),
    go.Bar(name='2012_direct_emissions', x=ca_join['City_Name'], y=ca_join['2012']),
    go.Bar(name='2013_direct_emissions', x=ca_join['City_Name'], y=ca_join['2013']),
    go.Bar(name='2014_direct_emissions', x=ca_join['City_Name'], y=ca_join['2014']),
    go.Bar(name='2015_direct_emissions', x=ca_join['City_Name'], y=ca_join['2015']),
    go.Bar(name='2016_direct_emissions', x=ca_join['City_Name'], y=ca_join['2016']),
    go.Bar(name='2017_direct_emissions', x=ca_join['City_Name'], y=ca_join['2017']),
    go.Bar(name='2018_direct_emissions', x=ca_join['City_Name'], y=ca_join['2018'])

])
fig.update_layout(barmode='group')
fig.show()

## Get's the 5 largest cities in Idaho

In [98]:
city_2010 = c.sf1.state_place(keys, states.ID.fips, '*', year=2010)
c_pop_2010 = pd.DataFrame.from_records(city_2010)
c_pop_2010_50000 = c_pop_2010.rename(columns=renames_2010)

In [99]:
c_pop_2010_50000.head()

Unnamed: 0,Total_Housing_2010,Occupancy_Status_For_Housing_Units_2010,Vacancy_Status_2010,For_Rent_2010,Rented_Not_Occupied_2010,For_Sale_Only_2010,Sold_Not_Occupied_2010,For_Seasonal_Recreational_Or_Occasional_Use_2010,For_Migrant_Workers_2010,City_Name,Total_Population_2010,Total_Urban_Population_2010,Total_Rural_Population_2010,Median_Age_2010,Presence_of_Non-Relatives_2010,FIPS,state
0,667.0,667.0,52.0,4.0,1.0,14.0,2.0,6.0,0.0,"Aberdeen city, Idaho",1994.0,0.0,1994.0,28.1,615.0,100,16
1,45.0,45.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,"Acequia city, Idaho",124.0,0.0,124.0,36.0,43.0,280,16
2,138.0,138.0,25.0,6.0,1.0,4.0,3.0,5.0,1.0,"Albion city, Idaho",267.0,0.0,267.0,42.8,113.0,1000,16
3,1612.0,1612.0,138.0,45.0,4.0,32.0,5.0,6.0,2.0,"American Falls city, Idaho",4457.0,4450.0,7.0,30.2,1474.0,1900,16
4,4747.0,4747.0,271.0,87.0,7.0,74.0,9.0,37.0,1.0,"Ammon city, Idaho",13816.0,13693.0,123.0,29.6,4476.0,1990,16


In [100]:
city_2000 = c.sf1.state_place(keys, states.ID.fips, '*', year=2000)
c_pop_2000 = pd.DataFrame.from_records(city_2000)
c_pop_2000_50000 = c_pop_2000.rename(columns=renames_2000)

In [101]:
c_pop_2000_50000.drop(columns=['City_Name', 'state'], inplace=True)

In [102]:
c_pop_2000_50000.head()

Unnamed: 0,Total_Housing_2000,Occupancy_Status_For_Housing_Units_2000,Vacancy_Status_2000,For_Rent_2000,Rented_Not_Occupied_2000,For_Sale_Only_2000,Sold_Not_Occupied_2000,For_Seasonal_Recreational_Or_Occasional_Use_2000,For_Migrant_Workers_2000,Total_Population_2000,Total_Urban_Population_2000,Total_Rural_Population_2000,Median_Age_2000,Presence_of_Non-Relatives_2000,FIPS
0,1088,1088,119,22,17,2,9,1,68,3193,0,0,27.9,3193,28360
1,20627,20627,1293,556,298,87,84,1,267,51466,0,0,28.8,51466,64090
2,77850,77850,3412,1463,739,302,353,1,554,185787,0,0,32.8,185787,8830
3,9603,9603,640,268,152,40,23,1,156,25967,0,0,28.8,25967,12250
4,4048,4048,184,28,81,18,15,0,42,11085,0,0,35.2,11085,23410


In [103]:
c_pop_2000_50000.set_index('FIPS', inplace=True)
c_pop_2010_50000.set_index('FIPS', inplace=True)

In [104]:
id_join = c_pop_2000_50000.join(c_pop_2010_50000, on='FIPS')
id_join = id_join.join(pivot_em, on='FIPS')

In [105]:
id_join.head()

Unnamed: 0_level_0,Total_Housing_2000,Occupancy_Status_For_Housing_Units_2000,Vacancy_Status_2000,For_Rent_2000,Rented_Not_Occupied_2000,For_Sale_Only_2000,Sold_Not_Occupied_2000,For_Seasonal_Recreational_Or_Occasional_Use_2000,For_Migrant_Workers_2000,Total_Population_2000,...,state,2010,2011,2012,2013,2014,2015,2016,2017,2018
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
28360,1088,1088,119,22,17,2,9,1,68,3193,...,16,,,,,,,,,
64090,20627,20627,1293,556,298,87,84,1,267,51466,...,16,52242.42,100342.886,88202.054,80940.54,85232.964638,75898.220214,66906.213486,64418.852668,71841.469426
8830,77850,77850,3412,1463,739,302,353,1,554,185787,...,16,,,,,,,,,
12250,9603,9603,640,268,152,40,23,1,156,25967,...,16,,,,,,,,,
23410,4048,4048,184,28,81,18,15,0,42,11085,...,16,,,,,,,,,


In [106]:
id_join['Total_Population_2000'] = id_join['Total_Population_2000'].astype('i8')

In [107]:
id_join =  id_join.nlargest(5, 'Total_Population_2000')

In [108]:
id_join.head()

Unnamed: 0_level_0,Total_Housing_2000,Occupancy_Status_For_Housing_Units_2000,Vacancy_Status_2000,For_Rent_2000,Rented_Not_Occupied_2000,For_Sale_Only_2000,Sold_Not_Occupied_2000,For_Seasonal_Recreational_Or_Occasional_Use_2000,For_Migrant_Workers_2000,Total_Population_2000,...,state,2010,2011,2012,2013,2014,2015,2016,2017,2018
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8830,77850,77850,3412,1463,739,302,353,1,554,185787,...,16,,,,,,,,,
56260,19379,19379,1289,448,440,90,57,3,251,51867,...,16,87260.04,87953.05,91498.048,113740.078,120071.3,123762.9,128147.1,138822.7,140565.8
64090,20627,20627,1293,556,298,87,84,1,267,51466,...,16,52242.42,100342.886,88202.054,80940.54,85232.96,75898.22,66906.21,64418.85,71841.47
39700,19771,19771,978,373,200,82,86,0,237,50730,...,16,4570117.17,4307774.128,3973294.462,4215302.27,4027233.0,4884031.0,4468881.0,3941550.0,3574040.0
52120,12293,12293,464,56,276,45,22,0,65,34919,...,16,,,,,,,,,


In [110]:
fig = go.Figure(data=[
    go.Bar(name='2000_pop', x=id_join['City_Name'], y=id_join['Total_Population_2000']),
    go.Bar(name='2010_pop', x=id_join['City_Name'], y=id_join['Total_Population_2010']),
    go.Bar(name='2000_housing', x=id_join['City_Name'], y=id_join['Total_Housing_2000']),
    go.Bar(name='2010_housing', x=id_join['City_Name'], y=id_join['Total_Housing_2010']),
    go.Bar(name='2000_non-relatives', x=id_join['City_Name'], y=id_join['Presence_of_Non-Relatives_2000']),
    go.Bar(name='2010_non-relatives', x=id_join['City_Name'], y=id_join['Presence_of_Non-Relatives_2010']),
    go.Bar(name='2010_direct_emissions', x=id_join['City_Name'], y=id_join['2010']),
    go.Bar(name='2011_direct_emissions', x=id_join['City_Name'], y=id_join['2011']),
    go.Bar(name='2012_direct_emissions', x=id_join['City_Name'], y=id_join['2012']),
    go.Bar(name='2013_direct_emissions', x=id_join['City_Name'], y=id_join['2013']),
    go.Bar(name='2014_direct_emissions', x=id_join['City_Name'], y=id_join['2014']),
    go.Bar(name='2015_direct_emissions', x=id_join['City_Name'], y=id_join['2015']),
    go.Bar(name='2016_direct_emissions', x=id_join['City_Name'], y=id_join['2016']),
    go.Bar(name='2017_direct_emissions', x=id_join['City_Name'], y=id_join['2017']),
    go.Bar(name='2018_direct_emissions', x=id_join['City_Name'], y=id_join['2018'])

])
fig.update_layout(barmode='group')
fig.show()

In [40]:
three_state_df = pd.concat([id_join, ca_join, ny_join])

In [41]:
three_state_df.reset_index(inplace=True)

In [42]:
three_state_df.head()

Unnamed: 0,FIPS,Total_Housing_2000,Occupancy_Status_For_Housing_Units_2000,Vacancy_Status_2000,For_Rent_2000,Rented_Not_Occupied_2000,For_Sale_Only_2000,Sold_Not_Occupied_2000,For_Seasonal_Recreational_Or_Occasional_Use_2000,For_Migrant_Workers_2000,...,Sold_Not_Occupied_2010,For_Seasonal_Recreational_Or_Occasional_Use_2010,For_Migrant_Workers_2010,City_Name,Total_Population_2010,Total_Urban_Population_2010,Total_Rural_Population_2010,Median_Age_2010,Presence_of_Non-Relatives_2010,state
0,8830,77850,77850,3412,1463,739,302,353,1,554,...,234.0,595.0,3.0,"Boise City city, Idaho",205671.0,204776.0,895.0,35.3,85704.0,16
1,56260,19379,19379,1289,448,440,90,57,3,251,...,106.0,105.0,0.0,"Nampa city, Idaho",81557.0,81285.0,272.0,30.1,27729.0,16
2,64090,20627,20627,1293,556,298,87,84,1,267,...,70.0,97.0,3.0,"Pocatello city, Idaho",54255.0,54189.0,66.0,30.2,20832.0,16
3,39700,19771,19771,978,373,200,82,86,0,237,...,63.0,157.0,5.0,"Idaho Falls city, Idaho",56813.0,56558.0,255.0,32.2,21203.0,16
4,52120,12293,12293,464,56,276,45,22,0,65,...,71.0,135.0,0.0,"Meridian city, Idaho",75092.0,74986.0,106.0,32.5,25302.0,16


In [43]:
fig = go.Figure(data=[
    go.Bar(name='2000_pop', x=three_state_df['City_Name'], y=three_state_df['Total_Population_2000']),
    go.Bar(name='2010_pop', x=three_state_df['City_Name'], y=three_state_df['Total_Population_2010']),
    go.Bar(name='2000_housing', x=three_state_df['City_Name'], y=three_state_df['Total_Housing_2000']),
    go.Bar(name='2010_housing', x=three_state_df['City_Name'], y=three_state_df['Total_Housing_2010']),
    go.Bar(name='2000_non-relatives', x=three_state_df['City_Name'], y=three_state_df['Presence_of_Non-Relatives_2000']),
    go.Bar(name='2010_non-relatives', x=three_state_df['City_Name'], y=three_state_df['Presence_of_Non-Relatives_2010']),
])
fig.update_layout(barmode='group')
fig.show()

* This graph is hard to gather any useful data from due to how New York City and Los Angeles are skewing the graph, let's drop those cities from the graph

In [44]:
three_state_df.drop(three_state_df[three_state_df['City_Name'] =='Los Angeles city, California'].index, inplace = True)
three_state_df.drop(three_state_df[three_state_df['City_Name'] =='New York city, New York'].index, inplace = True)

In [45]:
fig = go.Figure(data=[
    go.Bar(name='2000_pop', x=three_state_df['City_Name'], y=three_state_df['Total_Population_2000']),
    go.Bar(name='2010_pop', x=three_state_df['City_Name'], y=three_state_df['Total_Population_2010']),
    go.Bar(name='2000_housing', x=three_state_df['City_Name'], y=three_state_df['Total_Housing_2000']),
    go.Bar(name='2010_housing', x=three_state_df['City_Name'], y=three_state_df['Total_Housing_2010']),
    go.Bar(name='2000_non-relatives', x=three_state_df['City_Name'], y=three_state_df['Presence_of_Non-Relatives_2000']),
    go.Bar(name='2010_non-relatives', x=three_state_df['City_Name'], y=three_state_df['Presence_of_Non-Relatives_2010']),
])
fig.update_layout(barmode='group')
fig.show()

* California is still being an issue, lets drop those cities form our graph

In [46]:
three_state_df.drop(three_state_df[three_state_df['state'] ==states.CA.fips].index, inplace = True)

In [47]:
fig = go.Figure(data=[
    go.Bar(name='2000_pop', x=three_state_df['City_Name'], y=three_state_df['Total_Population_2000']),
    go.Bar(name='2010_pop', x=three_state_df['City_Name'], y=three_state_df['Total_Population_2010']),
    go.Bar(name='2000_housing', x=three_state_df['City_Name'], y=three_state_df['Total_Housing_2000']),
    go.Bar(name='2010_housing', x=three_state_df['City_Name'], y=three_state_df['Total_Housing_2010']),
    go.Bar(name='2000_non-relatives', x=three_state_df['City_Name'], y=three_state_df['Presence_of_Non-Relatives_2000']),
    go.Bar(name='2010_non-relatives', x=three_state_df['City_Name'], y=three_state_df['Presence_of_Non-Relatives_2010']),
])
fig.update_layout(barmode='group')
fig.show()

In [48]:
fig = go.Figure(data=[
    go.Bar(name='2000_age', x=three_state_df['City_Name'], y=three_state_df['Median_Age_2000']),
    go.Bar(name='2010_age', x=three_state_df['City_Name'], y=three_state_df['Median_Age_2010']),
])
fig.update_layout(barmode='group')
fig.show()

## American Community Survey

In [49]:
i = 0
three_state_acs = []
for x in acs_years_ca:
    acs_1 = pd.concat([acs_years_ca[i], acs_years_ny[i], acs_years_id[i]])
    three_state_acs.append(acs_1)
    i = i + 1

In [50]:
three_state_df.set_index('FIPS', inplace=True)

In [51]:
for x in three_state_acs:
    print(x)
    three_state_df = three_state_df.join(x, on="FIPS")

       Total_Housing_2012  Median_Age_2012  Total_Population_2012  \
FIPS                                                                
44000             97760.0             34.1              3804503.0   
66000             34361.0             33.7              1308619.0   
68000             22425.0             35.2               954379.0   
67000             22906.0             38.5               807755.0   
27000             10914.0             29.4               495777.0   
51000            215308.0             35.5              8199221.0   
11000              8891.0             33.3               261955.0   
63000              7192.0             31.1               210967.0   
84000              5296.0             37.6               196459.0   
73000              5342.0             29.4               144703.0   
08830              5659.0             35.8               208332.0   
56260              1694.0             29.8                81667.0   
52120              1444.0         

In [52]:
three_state_df.head()

Unnamed: 0_level_0,Total_Housing_2000,Occupancy_Status_For_Housing_Units_2000,Vacancy_Status_2000,For_Rent_2000,Rented_Not_Occupied_2000,For_Sale_Only_2000,Sold_Not_Occupied_2000,For_Seasonal_Recreational_Or_Occasional_Use_2000,For_Migrant_Workers_2000,Total_Population_2000,...,Total_Population_2015,Presence_of_Non-Relatives_2015,Total_Housing_2016,Median_Age_2016,Total_Population_2016,Presence_of_Non-Relatives_2016,Total_Housing_2017,Median_Age_2017,Total_Population_2017,Presence_of_Non-Relatives_2017
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8830,77850,77850,3412,1463,739,302,353,1,554,185787,...,214196.0,933.0,5859.0,35.8,218677.0,591.0,5814.0,36.0,220859.0,356.0
56260,19379,19379,1289,448,440,90,57,3,251,51867,...,86203.0,401.0,1703.0,31.1,87896.0,389.0,1707.0,31.4,89576.0,320.0
64090,20627,20627,1293,556,298,87,84,1,267,51466,...,54549.0,111.0,1797.0,31.4,54592.0,112.0,1763.0,31.6,54658.0,184.0
39700,19771,19771,978,373,200,82,86,0,237,50730,...,58374.0,130.0,1848.0,33.1,58933.0,161.0,1840.0,33.0,59414.0,297.0
52120,12293,12293,464,56,276,45,22,0,65,34919,...,84018.0,272.0,1176.0,35.4,88247.0,374.0,1213.0,35.7,91917.0,262.0


In [53]:
fig = go.Figure(data=[
    go.Bar(name='2000_pop', x=three_state_df['City_Name'], y=three_state_df['Total_Population_2000']),
    go.Bar(name='2010_pop', x=three_state_df['City_Name'], y=three_state_df['Total_Population_2010']),
    go.Bar(name='2012_pop', x=three_state_df['City_Name'], y=three_state_df['Total_Population_2012']),
    go.Bar(name='2013_pop', x=three_state_df['City_Name'], y=three_state_df['Total_Population_2013']),
    go.Bar(name='2014_pop', x=three_state_df['City_Name'], y=three_state_df['Total_Population_2014']),
    go.Bar(name='2015_pop', x=three_state_df['City_Name'], y=three_state_df['Total_Population_2015']),
    go.Bar(name='2016_pop', x=three_state_df['City_Name'], y=three_state_df['Total_Population_2016']),
    go.Bar(name='2017_pop', x=three_state_df['City_Name'], y=three_state_df['Total_Population_2017']),
    go.Bar(name='2000_housing', x=three_state_df['City_Name'], y=three_state_df['Total_Housing_2000']),
    go.Bar(name='2010_housing', x=three_state_df['City_Name'], y=three_state_df['Total_Housing_2010']),
    go.Bar(name='2012_housing', x=three_state_df['City_Name'], y=three_state_df['Total_Housing_2012']),
    go.Bar(name='2013_housing', x=three_state_df['City_Name'], y=three_state_df['Total_Housing_2013']),
    go.Bar(name='2014_housing', x=three_state_df['City_Name'], y=three_state_df['Total_Housing_2014']),
    go.Bar(name='2015_housing', x=three_state_df['City_Name'], y=three_state_df['Total_Housing_2015']),
    go.Bar(name='2016_housing', x=three_state_df['City_Name'], y=three_state_df['Total_Housing_2016']),
    go.Bar(name='2017_housing', x=three_state_df['City_Name'], y=three_state_df['Total_Housing_2017']),
    go.Bar(name='2000_non-relatives', x=three_state_df['City_Name'], y=three_state_df['Presence_of_Non-Relatives_2000']),
    go.Bar(name='2010_non-relatives', x=three_state_df['City_Name'], y=three_state_df['Presence_of_Non-Relatives_2010']),
    go.Bar(name='2012_non-relatives', x=three_state_df['City_Name'], y=three_state_df['Presence_of_Non-Relatives_2012']),
    go.Bar(name='2013_non-relatives', x=three_state_df['City_Name'], y=three_state_df['Presence_of_Non-Relatives_2013']),
    go.Bar(name='2014_non-relatives', x=three_state_df['City_Name'], y=three_state_df['Presence_of_Non-Relatives_2014']),
    go.Bar(name='2015_non-relatives', x=three_state_df['City_Name'], y=three_state_df['Presence_of_Non-Relatives_2015']),
    go.Bar(name='2016_non-relatives', x=three_state_df['City_Name'], y=three_state_df['Presence_of_Non-Relatives_2016']),
    go.Bar(name='2017_non-relatives', x=three_state_df['City_Name'], y=three_state_df['Presence_of_Non-Relatives_2017']),
])
fig.update_layout(barmode='group')
fig.show()

In [54]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
import sklearn.metrics
import sys
from pandas_ml import ConfusionMatrix

In [55]:
mod = smf.glm('Total_Population_2000 ~ Total_Population_2010 + Total_Population_2012 + Total_Population_2013', three_state_df, family=sm.families.Binomial()).fit()
mod.summary()


divide by zero encountered in true_divide


divide by zero encountered in true_divide


invalid value encountered in add


divide by zero encountered in log



0,1,2,3
Dep. Variable:,Total_Population_2000,No. Observations:,9.0
Model:,GLM,Df Residuals:,5.0
Model Family:,Binomial,Df Model:,3.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,
Date:,"Sun, 01 Dec 2019",Deviance:,118350000.0
Time:,15:40:40,Pearson chi2:,1.07e+27
No. Iterations:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-4.567e+19,7.28e+07,-6.28e+11,0.000,-4.57e+19,-4.57e+19
Total_Population_2010,2.696e+16,3.56e+04,7.57e+11,0.000,2.7e+16,2.7e+16
Total_Population_2012,7.486e+16,5.36e+04,1.4e+12,0.000,7.49e+16,7.49e+16
Total_Population_2013,-9.454e+16,2.62e+04,-3.61e+12,0.000,-9.45e+16,-9.45e+16
