# Recycling Effectiveness in MA

### *Part 2: Clustering of Municipalities Based on Census Data*


In [140]:
# imports

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy.cluster.hierarchy as sch

from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans

In [141]:
# Load one DataFrame per census characteristic group.

groups = [
    'education_char',
    'economic_char',
    'housing_char',
    'demo_char'
]

# Read every CSV into a dict keyed by group name instead of exec()-ing generated
# source, so the names that get created are visible to readers and tooling.
census_frames = {
    characteristic: pd.read_csv(f"data/census_data/{characteristic}.csv")
    for characteristic in groups
}

# Bind the module-level names that the following cells use.
education_char = census_frames['education_char']
economic_char = census_frames['economic_char']
housing_char = census_frames['housing_char']
demo_char = census_frames['demo_char']

In [142]:
# Index the education table by municipality name, then preview the first rows.
education_char = education_char.set_index('municipality')
education_char.head()

Unnamed: 0_level_0,population_25_and_older,edu_high_school_and_higher_%,edu_bachelors_and_higher_%
municipality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Weymouth,41855,93.8,39.2
Winthrop,14263,95.5,38.3
Belchertown,2017,93.2,40.5
Needham,20559,97.8,76.0
Quincy,73038,88.2,45.1


In [143]:
# Index the remaining three census tables by municipality as well.
for census_table in (economic_char, housing_char, demo_char):
    census_table.set_index('municipality', inplace=True)

In [144]:
# Join the four characteristic tables side by side on their shared municipality index.
census_tables = [education_char, economic_char, housing_char, demo_char]
comb_char = pd.concat(census_tables, axis='columns')
comb_char.head()

Unnamed: 0_level_0,population_25_and_older,edu_high_school_and_higher_%,edu_bachelors_and_higher_%,unemployment_rate,median_household_income,mean_household_income,families_below_poverty_level_%,people_below_poverty_level_%,Total housing units,occupancy_%,...,renter-occupied_%,males-to-100females,median_age,demo_white_%,demo_black_aa_%,demo_american_indian_%,demo_asian_%,demo_native_islander%,demo_other_Race_%,demo_hispanic_latino_%
municipality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Weymouth,41855,93.8,39.2,5.8,84942,100858,4.3,6.0,24570,96.6,...,33.0,90.4,42.4,85.7,6.3,0.3,7.5,0.2,1.8,3.8
Winthrop,14263,95.5,38.3,3.6,74069,102769,5.4,8.8,8367,93.6,...,44.1,88.3,44.9,95.0,3.7,0.2,1.3,0.0,1.9,9.5
Belchertown,2017,93.2,40.5,4.4,78578,96836,3.7,9.3,1224,92.9,...,31.6,99.0,42.0,92.9,1.5,0.0,9.2,0.0,0.0,5.2
Needham,20559,97.8,76.0,4.0,165547,223894,1.4,2.7,11309,95.5,...,16.1,92.2,43.5,87.4,3.4,0.3,10.6,0.0,0.7,3.2
Quincy,73038,88.2,45.1,5.1,77562,94360,8.0,11.3,43736,94.2,...,53.8,94.5,39.7,62.7,5.8,0.4,31.4,0.3,1.4,3.1


In [145]:
# First-pass k-means over every census feature.
# random_state pins the centroid initialisation so cluster numbering is stable
# across "Restart & Run All"; n_init is spelled out because its default differs
# between scikit-learn releases.
# NOTE(review): the cells below that filter on a specific label number were
# written against an unseeded run -- re-check which number each cluster gets.
kmeans = KMeans(n_clusters=6, n_init=10, random_state=42)
# A later cell adds a 'label' column to comb_char; exclude it if present so that
# re-running this cell does not cluster on the previous run's labels.
kmeans.fit(comb_char.drop(columns='label', errors='ignore'))

KMeans(n_clusters=6)

In [146]:
# Centroid coordinates found by the first fit: one row per cluster, one value per fitted feature.
kmeans.cluster_centers_

array([[ 2.05590000e+04,  9.78000000e+01,  7.60000000e+01,
         4.00000000e+00,  1.65547000e+05,  2.23894000e+05,
         1.40000000e+00,  2.70000000e+00,  1.13090000e+04,
         9.55000000e+01,  7.64000000e+01,  5.40000000e+00,
         3.80000000e+00,  1.60000000e+00,  2.20000000e+00,
         1.60000000e+00,  8.90000000e+00,  0.00000000e+00,
         0.00000000e+00,  8.39000000e+01,  1.61000000e+01,
         9.22000000e+01,  4.35000000e+01,  8.74000000e+01,
         3.40000000e+00,  3.00000000e-01,  1.06000000e+01,
         0.00000000e+00,  7.00000000e-01,  3.20000000e+00],
       [ 3.40000000e+01,  1.00000000e+02,  7.65000000e+01,
         0.00000000e+00, -6.66666666e+08, -9.99999999e+08,
         0.00000000e+00,  0.00000000e+00,  4.70000000e+01,
         3.62000000e+01,  1.00000000e+02,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  1.00000000e+02,  0.00000000e+

In [147]:
# Tally how many municipalities were assigned to each cluster.
first_pass_labels = pd.Series(kmeans.labels_)
first_pass_labels.value_counts()

3    146
5     79
0     15
2      2
1      2
4      1
dtype: int64

In [148]:
# Attach each municipality's first-pass cluster id to the combined table.
# NOTE(review): this adds a 'label' column to comb_char itself, so any frame
# derived from comb_char afterwards carries it too unless it is dropped.
comb_char['label'] = kmeans.labels_

In [149]:
# Summary statistics for the full table, transposed so each feature is a row.
comb_stats = comb_char.describe().transpose()

In [150]:
# Summary statistics (features as rows) for the municipalities in cluster 0.
cluster0_rows = comb_char.loc[comb_char['label'] == 0]
stats0 = cluster0_rows.describe().T

In [151]:
# Summary statistics (features as rows) for clusters 3 and 4.
stats3, stats4 = (
    comb_char.loc[comb_char['label'] == cluster_id].describe().T
    for cluster_id in (3, 4)
)

In [152]:
# Difference between each overall statistic and the cluster-3 statistic,
# expressed as a percentage of the overall value.
overall_minus_cluster3 = comb_stats - stats3
sub = 100 * overall_minus_cluster3 / comb_stats

In [153]:
# Keep only the mean and quartile columns of the comparison table
# (count, std, min and max are discarded).
# Selecting the columns to keep, rather than dropping in place, makes this cell
# safe to re-run: the in-place drop raised KeyError on a second execution
# because the columns were already gone.
sub = sub[['mean', '25%', '50%', '75%']]
sub.head()

Unnamed: 0,mean,25%,50%,75%
population_25_and_older,6.890073,5.234029,4.237787,12.452022
edu_high_school_and_higher_%,2.192263,2.588106,2.375924,2.361396
edu_bachelors_and_higher_%,22.519978,19.53125,25.471698,23.194946
unemployment_rate,-10.521033,-20.0,-14.634146,-5.263158
median_household_income,100.601291,11.238981,16.031052,23.23211


In [154]:
# Shade the comparison table so large relative differences stand out.
# Where an overall statistic is 0 the percentage computation divides by zero and
# produces +/-inf; an infinite value range is presumably what triggers the
# warning raised from the colour normalisation. Mask those cells as NaN (for
# display only -- `sub` itself is left untouched) before building the gradient.
sub.replace([np.inf, -np.inf], np.nan).style.background_gradient(cmap='Blues')

  norm = colors.Normalize(smin - (rng * low), smax + (rng * high))


Unnamed: 0,mean,25%,50%,75%
population_25_and_older,6.890073,5.234029,4.237787,12.452022
edu_high_school_and_higher_%,2.192263,2.588106,2.375924,2.361396
edu_bachelors_and_higher_%,22.519978,19.53125,25.471698,23.194946
unemployment_rate,-10.521033,-20.0,-14.634146,-5.263158
median_household_income,100.601291,11.238981,16.031052,23.23211
mean_household_income,101.029633,9.261241,12.851103,22.608673
families_below_poverty_level_%,-27.2914,-75.0,-40.0,-40.140845
people_below_poverty_level_%,-19.463829,-48.780488,-28.169014,-15.707965
Total housing units,5.045489,-0.741906,-1.73495,1.654795
occupancy_%,1.62722,3.653382,1.943844,0.968586


Checking the smaller groups manually for key features.

In [155]:
# Show every municipality assigned to cluster 1.
comb_char.query('label == 1')

Unnamed: 0_level_0,population_25_and_older,edu_high_school_and_higher_%,edu_bachelors_and_higher_%,unemployment_rate,median_household_income,mean_household_income,families_below_poverty_level_%,people_below_poverty_level_%,Total housing units,occupancy_%,...,males-to-100females,median_age,demo_white_%,demo_black_aa_%,demo_american_indian_%,demo_asian_%,demo_native_islander%,demo_other_Race_%,demo_hispanic_latino_%,label
municipality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Seconsett Island,40,100.0,22.5,0.0,-666666666,-999999999,0.0,0.0,77,24.7,...,90.5,37.9,100.0,0.0,0.0,0.0,0.0,0.0,0.0,1
Popponesset Island,34,100.0,76.5,0.0,-666666666,-999999999,0.0,0.0,47,36.2,...,100.0,81.1,100.0,0.0,0.0,0.0,0.0,0.0,0.0,1


These are small, summer-retreat-type towns. I doubt I even have data on them, so I will probably remove them.

In [156]:
# Show every municipality assigned to cluster 2.
comb_char.query('label == 2')

Unnamed: 0_level_0,population_25_and_older,edu_high_school_and_higher_%,edu_bachelors_and_higher_%,unemployment_rate,median_household_income,mean_household_income,families_below_poverty_level_%,people_below_poverty_level_%,Total housing units,occupancy_%,...,males-to-100females,median_age,demo_white_%,demo_black_aa_%,demo_american_indian_%,demo_asian_%,demo_native_islander%,demo_other_Race_%,demo_hispanic_latino_%,label
municipality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Popponesset,112,100.0,68.8,0.0,-666666666,38569,47.2,44.0,451,15.5,...,154.2,62.3,100.0,0.0,0.0,0.0,0.0,0.0,0.0,2
Deerfield,311,98.4,74.6,8.1,-666666666,90963,0.0,38.9,155,100.0,...,121.0,40.0,76.3,28.8,0.0,9.0,0.0,1.3,9.5,2


Very low populations again, but this time with high poverty. Both also appear to have a missing value for median income.

In [157]:
# Show every municipality assigned to cluster 5.
comb_char.query('label == 5')

Unnamed: 0_level_0,population_25_and_older,edu_high_school_and_higher_%,edu_bachelors_and_higher_%,unemployment_rate,median_household_income,mean_household_income,families_below_poverty_level_%,people_below_poverty_level_%,Total housing units,occupancy_%,...,males-to-100females,median_age,demo_white_%,demo_black_aa_%,demo_american_indian_%,demo_asian_%,demo_native_islander%,demo_other_Race_%,demo_hispanic_latino_%,label
municipality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abington,11793,94.7,39.1,4.3,99381,105390,2.5,4.1,6781,94.3,...,97.7,42.2,93.2,3.0,0.3,4.9,0.0,1.4,2.6,5
Kingston,4261,97.3,39.7,2.5,90754,110433,1.2,3.0,2748,88.8,...,99.4,41.9,99.1,2.4,0.0,0.0,0.0,0.9,5.3,5
Mattapoisett Center,2482,98.4,50.0,2.9,93826,159992,1.9,2.8,1819,76.5,...,88.7,52.1,98.3,0.0,0.5,0.0,0.0,1.7,2.9,5
Cambridge,79240,95.5,79.0,4.0,103154,139991,7.3,12.7,51882,90.3,...,99.4,30.5,70.0,12.2,0.8,19.1,0.2,2.4,9.5,5
Pinehurst,5229,91.8,33.0,4.7,107159,124086,2.6,4.0,2536,96.0,...,105.2,42.3,88.2,4.5,0.2,9.8,0.0,0.5,1.8,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Madaket,255,100.0,53.7,10.8,101563,127276,0.0,21.9,650,17.4,...,82.7,51.3,100.0,0.0,0.0,0.0,0.0,0.0,0.0,5
The Pinehills,1391,98.8,69.8,4.5,101518,146590,0.0,4.9,874,94.4,...,67.1,68.9,94.5,0.0,0.0,5.5,0.0,0.0,0.0,5
Upton,1806,97.7,50.7,4.9,89507,120387,0.0,2.6,1053,91.1,...,137.0,47.9,97.9,2.4,0.0,0.0,0.0,0.7,7.9,5
Watertown,27845,95.0,65.9,3.5,101103,124756,4.9,7.6,16570,94.3,...,87.5,38.2,85.1,2.5,0.4,11.3,0.3,3.4,9.2,5


Ah... I can't remove Boston. But yes, it has the highest population by far.

In [158]:
# Load the municipal waste/recycling collection table, indexed by municipality name.
mudata = pd.read_csv(
    'data/MA_MSW_Collection_Data/musum19.csv',
    index_col='Municipality Name',
)

In [159]:
# Look up Boston's row to confirm the collection data covers it.
mudata.loc['Boston']

tot_households               310800
stream_type                      ss
tons_ss_recyclables           37050
tons_ms_recyclables               0
tons_recyclables_total        37050
hh_served_by_mu_recycle      260000
tons_recyclables/hh          0.1425
hh_served_by_mu_trash        260000
tons_trash_total             188113
tons_trash/hh              0.723512
%recycle/hh                0.164547
total_waste/hh             0.866012
Name: Boston, dtype: object

In [160]:
# Municipality names present in the collection data, as a plain Python list.
muns = mudata.index.tolist()

In [161]:
# Municipalities that landed in tiny clusters in the first pass.
outliers = [
    'Popponesset',
    'Deerfield',
    'Seconsett Island',
    'Popponesset Island',
    'Boston',
]

In [162]:
# Report whether each outlier municipality appears in the collection data.
for place in outliers:
    is_listed = place in muns
    print(f"{place} in mun list: {is_listed}")

Popponesset in mun list: False
Deerfield in mun list: True
Seconsett Island in mun list: False
Popponesset Island in mun list: False
Boston in mun list: True


Okay, I'm going to remove the three municipalities that are not in the list (the `False` values), fix the missing median income value for Deerfield, and then repeat the k-means clustering. I will also remove non-relevant columns: population of 25 years and older, % with HS education, M:F ratio, median age, and % white.

In [163]:
# List the column names to pick the ones to drop (note 'label' is now among them).
comb_char.columns

Index(['population_25_and_older', 'edu_high_school_and_higher_%',
       'edu_bachelors_and_higher_%', 'unemployment_rate',
       'median_household_income', 'mean_household_income',
       'families_below_poverty_level_%', 'people_below_poverty_level_%',
       'Total housing units', 'occupancy_%', '1-unit_detached_%',
       '1-unit_attached_%', '2-units_%', '3-4_units_%', '5-9_units_%',
       '10-19_units_%', '20+_units_%', 'mobile_home_%', 'boat_RV_van_%',
       'owner-occupied_%', 'renter-occupied_%', 'males-to-100females',
       'median_age', 'demo_white_%', 'demo_black_aa_%',
       'demo_american_indian_%', 'demo_asian_%', 'demo_native_islander%',
       'demo_other_Race_%', 'demo_hispanic_latino_%', 'label'],
      dtype='object')

In [164]:
# Build the cleaned table: remove the columns judged non-relevant and the three
# places that have no match in the collection data.
# NOTE(review): 'label' from the first clustering is not among the dropped
# columns, so it stays in comb_char_clean and is passed to the second KMeans fit
# as a feature. Dropping it needs a matching change where the cluster centres
# are labelled with comb_char_clean.columns.
irrelevant_columns = [
    'population_25_and_older',
    'edu_high_school_and_higher_%',
    'males-to-100females',
    'median_age',
    'demo_white_%',
]
unmatched_places = ['Popponesset ', 'Seconsett Island ', 'Popponesset Island ']
comb_char_clean = comb_char.drop(columns=irrelevant_columns, index=unmatched_places)

In [165]:
# Deerfield's median household income holds the -666666666 placeholder seen in
# the cluster listing above, so substitute the town's mean household income.
# Scalar .at access replaces the chained .loc[row][col] lookups, which first
# pulled the whole row out as a single mixed-dtype Series (turning the integer
# into a float on the way) and carried a stray trailing comma in the key.
deerfield = 'Deerfield '  # the index key is written with a trailing space, as in the drop() call
comb_char_clean.at[deerfield, 'median_household_income'] = comb_char_clean.at[deerfield, 'mean_household_income']
comb_char_clean.at[deerfield, 'median_household_income']

90963.0

In [166]:
# Second-pass k-means on the cleaned table.
# random_state pins the centroid initialisation so cluster numbering is stable
# across "Restart & Run All"; n_init is spelled out because its default differs
# between scikit-learn releases.
# NOTE(review): the later cell that inspects a specific label number was written
# against an unseeded run -- re-check which number each cluster gets.
kmeans2 = KMeans(n_clusters=6, n_init=10, random_state=42)
kmeans2.fit(comb_char_clean)

KMeans(n_clusters=6)

In [167]:
# Tally how many municipalities fall in each second-pass cluster.
second_pass_labels = pd.Series(kmeans2.labels_)
second_pass_labels.value_counts()

4    80
0    65
2    52
5    32
1    12
3     1
dtype: int64

In [168]:
# Store the second-pass cluster ids in the 'label' column of the cleaned table.
comb_char_clean['label'] = kmeans2.labels_

In [169]:
# Show the municipalities in second-pass cluster 3.
in_cluster_3 = comb_char_clean['label'] == 3
comb_char_clean.loc[in_cluster_3]

Unnamed: 0_level_0,edu_bachelors_and_higher_%,unemployment_rate,median_household_income,mean_household_income,families_below_poverty_level_%,people_below_poverty_level_%,Total housing units,occupancy_%,1-unit_detached_%,1-unit_attached_%,...,boat_RV_van_%,owner-occupied_%,renter-occupied_%,demo_black_aa_%,demo_american_indian_%,demo_asian_%,demo_native_islander%,demo_other_Race_%,demo_hispanic_latino_%,label
municipality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Boston,49.7,6.6,71115,107608,14.1,18.9,294418,91.5,11.9,6.3,...,0.0,35.0,65.0,28.9,0.9,10.7,0.2,7.4,19.8,3


In [170]:
# Tabulate the second-pass centroids: one row per feature, one column per cluster,
# with rows named after the columns of comb_char_clean.
centers2 = (
    pd.DataFrame(kmeans2.cluster_centers_).T
    .assign(features=comb_char_clean.columns)
    .set_index('features')
)
centers2

Unnamed: 0_level_0,0,1,2,3,4,5
features,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
edu_bachelors_and_higher_%,27.366154,76.658333,49.65,49.7,38.38,62.1875
unemployment_rate,6.069231,4.008333,3.946154,6.6,4.24875,3.859375
median_household_income,52633.523077,174049.833333,96233.5,71115.0,74593.6625,119182.125
mean_household_income,69762.938462,240043.833333,119343.115385,107608.0,93259.25,157997.28125
families_below_poverty_level_%,9.532308,1.466667,3.171154,14.1,4.71375,2.521875
people_below_poverty_level_%,13.523077,2.483333,6.155769,18.9,8.17625,5.25
Total housing units,10428.046154,6888.083333,7063.057692,294418.0,6150.675,4910.15625
occupancy_%,83.392308,95.733333,87.648077,91.5,82.11625,85.3625
1-unit_detached_%,50.966154,83.95,63.209615,11.9,66.245,74.325
1-unit_attached_%,3.450769,4.125,6.207692,6.3,4.7575,6.053125
