## Note: 

This notebook isn't essential for understanding the recycling analysis. It only imports and cleans the census data for future use, so it has fewer comments than the others.

In [4]:
import pandas as pd
import requests

FIPS code information
https://transition.fcc.gov/oet/info/maps/census/fips/fips.txt
25        MASSACHUSETTS

group information:
https://api.census.gov/data/2019/acs/acs5/profile/groups.html

In [5]:
def group_call(groupID):
    '''
    Fetch one 2019 ACS 5-year data-profile group for every place in state 25.

    Parameters
    ----------
    groupID : str
        ACS profile group identifier (e.g. 'DP02'). Used both to filter the
        local metadata table and as the ``group(...)`` argument of the API call.

    Returns
    -------
    pandas.DataFrame
        One row per place, indexed by the cleaned ``municipality`` name, with
        one column per variable ID listed for ``groupID`` in the metadata
        table, renamed to that variable's ``label``. No dtype conversion is
        done here; values are kept exactly as parsed from the JSON response.

    Raises
    ------
    requests.HTTPError
        If the Census API responds with an error status code.

    Notes
    -----
    The API key is read from the ``CENSUS_API_KEY`` environment variable and
    falls back to the literal placeholder ``'APIKEY'`` when it is not set.
    '''
    import os  # local import: only needed to look up the API key

    # The metadata table lists, for each ACS group, the variable IDs of
    # interest and the label each one should be renamed to.
    meta = pd.read_csv('data/census_data/metadatagroup.csv', index_col='ID')
    # .copy() so the row added below is written to an independent frame
    # rather than to a filtered slice of the full table.
    meta = meta[meta['Group'] == groupID].copy()
    # Extra row so the API's NAME column is kept and relabelled 'municipality'.
    # NOTE(review): assumes the table has exactly three non-index columns with
    # 'label' last -- confirm against metadatagroup.csv.
    meta.loc['NAME'] = ['', '', 'municipality']
    estIDs = meta.index

    # Set base url
    url = 'https://api.census.gov/data/2019/acs/acs5/profile?'

    # Set params
    params = {
        'get': f'group({groupID})',
        'for': 'place:*',
        'in': 'state:25',
        'key': os.environ.get('CENSUS_API_KEY', 'APIKEY'),
    }

    # Fail loudly on HTTP errors instead of trying to parse an error body,
    # and never hang forever on an unresponsive server.
    res = requests.get(url, params=params, timeout=120)
    res.raise_for_status()

    # The JSON payload is a list of rows whose first row is the header.
    # Building the frame this way avoids leaking the header row's label (0)
    # as the name of the columns index.
    rows = res.json()
    df = pd.DataFrame(rows[1:], columns=rows[0])

    # Only keep cols of interest, then relabel variable IDs to readable labels.
    df = df[estIDs].copy()
    df = df.rename(columns=dict(zip(estIDs, meta['label'])))

    # Strip the state and place-type suffixes from the names. Order matters:
    # 'Town city' has to be removed before the bare 'city'.
    names = df['municipality']
    for suffix in (', Massachusetts', 'Town city', 'CDP', 'city'):
        names = names.str.replace(suffix, '', regex=False)
    df['municipality'] = names.str.rstrip(' ')

    return df.set_index('municipality')

In [6]:
# Build one DataFrame per ACS data-profile group.

# Notebook variable name -> ACS profile group ID passed to group_call.
groups = {
    'education_char':'DP02',
    'economic_char':'DP03',
    'housing_char':'DP04',
    'demo_char':'DP05'
}

# group_call performs a network request, so call it exactly once per group
# and keep the result.
census_frames = {name: group_call(group_id) for name, group_id in groups.items()}

# Bind each frame to its own name explicitly rather than through exec(), so
# the variables used by the cells below are visible in the source.
education_char = census_frames['education_char']
economic_char = census_frames['economic_char']
housing_char = census_frames['housing_char']
demo_char = census_frames['demo_char']

In [7]:
# Check which labelled columns came back for the education group.
education_char.columns

Index(['KEY_population_25_and_older', 'edu_high_school_and_higher_%',
       'edu_bachelors_and_higher_%'],
      dtype='object', name=0)

In [8]:
# Inspect dtypes and non-null counts of the education frame.
education_char.info()

<class 'pandas.core.frame.DataFrame'>
Index: 245 entries, Weymouth to Hanscom AFB
Data columns (total 3 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   KEY_population_25_and_older   245 non-null    object
 1   edu_high_school_and_higher_%  245 non-null    object
 2   edu_bachelors_and_higher_%    245 non-null    object
dtypes: object(3)
memory usage: 7.7+ KB


In [11]:
# Convert the three education columns to numeric dtypes, one at a time.
for col_name in (
    'edu_bachelors_and_higher_%',
    'edu_high_school_and_higher_%',
    'KEY_population_25_and_older',
):
    education_char[col_name] = pd.to_numeric(education_char[col_name])

In [12]:
# Summary statistics for the education columns.
education_char.describe()

Unnamed: 0,KEY_population_25_and_older,edu_high_school_and_higher_%,edu_bachelors_and_higher_%
count,245.0,245.0,245.0
mean,13719.473469,93.365306,43.09551
std,34595.002572,5.501153,17.911639
min,34.0,67.0,9.6
25%,1581.0,90.8,28.8
50%,3398.0,94.7,42.4
75%,16935.0,97.4,55.4
max,473775.0,100.0,84.7


In [13]:
# Preview the first rows of the economic frame.
economic_char.head()

Unnamed: 0_level_0,KEY_pop_16_+,unemployment_rate_%,median_household_income,mean_household_income,families_below_poverty_level_%,people_below_poverty_level_%
municipality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Weymouth,47753,5.8,84942,100858,4.3,6.0
Winthrop,15688,3.6,74069,102769,5.4,8.8
Belchertown,2328,4.4,78578,96836,3.7,9.3
Needham,23711,4.0,165547,223894,1.4,2.7
Quincy,82087,5.1,77562,94360,8.0,11.3


In [14]:
# Convert every economic column to a numeric dtype in one column-wise pass.
economic_char = economic_char.apply(pd.to_numeric)

In [15]:
# Inspect dtypes and non-null counts of the economic frame.
economic_char.info()

<class 'pandas.core.frame.DataFrame'>
Index: 245 entries, Weymouth to Hanscom AFB
Data columns (total 6 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   KEY_pop_16_+                    245 non-null    int64  
 1   unemployment_rate_%             245 non-null    float64
 2   median_household_income         245 non-null    int64  
 3   mean_household_income           245 non-null    int64  
 4   families_below_poverty_level_%  245 non-null    float64
 5   people_below_poverty_level_%    245 non-null    float64
dtypes: float64(3), int64(3)
memory usage: 13.4+ KB


In [16]:
# Convert every demographic column to a numeric dtype in one column-wise pass.
demo_char = demo_char.apply(pd.to_numeric)

In [17]:
# Inspect dtypes and non-null counts of the demographic frame.
demo_char.info()

<class 'pandas.core.frame.DataFrame'>
Index: 245 entries, Weymouth to Hanscom AFB
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   KEY_total_population    245 non-null    int64  
 1   males-to-100females     245 non-null    float64
 2   median_age              245 non-null    float64
 3   demo_white_%            245 non-null    float64
 4   demo_black_aa_%         245 non-null    float64
 5   demo_american_indian_%  245 non-null    float64
 6   demo_asian_%            245 non-null    float64
 7   demo_native_islander%   245 non-null    float64
 8   demo_other_Race_%       245 non-null    float64
 9   demo_hispanic_latino_%  245 non-null    float64
dtypes: float64(9), int64(1)
memory usage: 21.1+ KB


In [18]:
# Order places by the 'demo_white_%' column, lowest first (sort_values is ascending by default).
demo_char.sort_values(by='demo_white_%')

Unnamed: 0_level_0,KEY_total_population,males-to-100females,median_age,demo_white_%,demo_black_aa_%,demo_american_indian_%,demo_asian_%,demo_native_islander%,demo_other_Race_%,demo_hispanic_latino_%
municipality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Randolph,34064,92.8,40.7,34.7,47.5,0.7,13.2,0.0,7.1,8.8
Brockton,95594,91.0,35.7,38.1,48.0,0.9,2.4,0.1,14.3,11.1
Lynn,93743,101.5,34.5,54.0,18.2,1.1,8.3,0.3,25.3,42.8
Lawrence,79942,95.8,32.3,55.2,7.6,0.5,1.9,0.3,37.6,80.6
Malden,60984,91.8,34.6,55.7,20.0,0.6,23.8,0.2,3.1,8.5
...,...,...,...,...,...,...,...,...,...,...
Cheshire,539,100.4,51.8,100.0,0.0,1.5,0.0,0.0,0.0,0.0
Warren,510,78.9,42.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0
Chatham,1428,83.5,67.4,100.0,0.0,1.0,0.0,0.0,0.0,0.0
Northfield,1032,84.0,51.6,100.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# Summary statistics for the demographic columns.
demo_char.describe()

Unnamed: 0,KEY_total_population,males-to-100females,median_age,demo_white_%,demo_black_aa_%,demo_american_indian_%,demo_asian_%,demo_native_islander%,demo_other_Race_%,demo_hispanic_latino_%
count,245.0,245.0,245.0,245.0,245.0,245.0,245.0,245.0,245.0,245.0
mean,19751.840816,95.375102,45.42449,90.09102,4.671837,0.674286,4.402041,0.129796,2.768163,6.519184
std,50303.813237,25.984873,9.260783,10.691965,6.709394,1.073389,5.522936,0.397392,4.713189,9.812768
min,34.0,42.2,20.4,34.7,0.0,0.0,0.0,0.0,0.0,0.0
25%,2045.0,86.9,40.2,86.3,0.8,0.0,0.5,0.0,0.0,1.5
50%,4796.0,92.9,43.8,93.5,2.8,0.4,2.4,0.0,1.2,3.5
75%,25132.0,99.4,49.9,97.5,5.9,0.9,6.0,0.1,3.3,7.4
max,684379.0,443.9,81.1,100.0,48.0,9.2,32.6,4.7,37.6,80.6


In [20]:
# Convert every housing column to a numeric dtype in one column-wise pass.
housing_char = housing_char.apply(pd.to_numeric)

In [21]:
# Inspect dtypes and non-null counts of the housing frame.
housing_char.info()

<class 'pandas.core.frame.DataFrame'>
Index: 245 entries, Weymouth to Hanscom AFB
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   KEY_total_housing_units  245 non-null    int64  
 1   occupancy_%              245 non-null    float64
 2   1-unit_detached_%        245 non-null    float64
 3   1-unit_attached_%        245 non-null    float64
 4   2-units_%                245 non-null    float64
 5   3-4_units_%              245 non-null    float64
 6   5-9_units_%              245 non-null    float64
 7   10-19_units_%            245 non-null    float64
 8   20+_units_%              245 non-null    float64
 9   mobile_home_%            245 non-null    float64
 10  boat_RV_van_%            245 non-null    float64
 11  owner-occupied_%         245 non-null    float64
 12  renter-occupied_%        245 non-null    float64
dtypes: float64(12), int64(1)
memory usage: 26.8+ KB


In [22]:
# Show the places whose 'occupancy_%' value is below 10.
housing_char.loc[housing_char['occupancy_%'].lt(10)]

Unnamed: 0_level_0,KEY_total_housing_units,occupancy_%,1-unit_detached_%,1-unit_attached_%,2-units_%,3-4_units_%,5-9_units_%,10-19_units_%,20+_units_%,mobile_home_%,boat_RV_van_%,owner-occupied_%,renter-occupied_%
municipality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Siasconset,1056,3.7,98.6,0.0,1.4,0.0,0.0,0.0,0.0,0.0,0.0,100.0,0.0


In [23]:
# For saving files
# Disabled by default; uncomment the loop below to write each frame named in
# `groups` to data/census_data/<name>.csv.

# for char_df in groups:
#     exec(f"{char_df}.to_csv('data/census_data/{char_df}.csv', index=True)")