### Cross-validation (CV) testing per `hc_label` cluster

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

In [8]:
# Load the clustering output, indexed by municipality name.
cluster = pd.read_csv('data/cluster_Data.csv', index_col='municipality')

# Keep 'km_label' and every column to its right (label-based slices include
# the starting column) -- presumably the cluster-label columns.
cluster_labels = cluster.loc[:, slice('km_label', None)]

In [9]:
# Per-municipality service details, with each municipality's 'hc_label'
# attached by an index-on-index merge (default inner join, so only
# municipalities present in both tables are kept).
df_of_serve_details = (
    pd.read_csv('data/data_for_regression.csv', index_col='Municipality Name')
    .merge(cluster_labels['hc_label'], left_index=True, right_index=True)
)
df_of_serve_details.head()

Unnamed: 0,Solid Waste program funded by property tax?,Solid Waste program funded by transfer station access fee?,What is the annual fee?,What is the transfer station access fee?,What is the per-visit fee?,PAYT/ SMART,Municipal Buildings Trash and Recycling Service_Both,Municipal Buildings Trash and Recycling Service_Recycling,Municipal Buildings Trash and Recycling Service_Trash,School Trash and Recycling Service_Both,...,Recycling Enforced by Muni,Recycling Enforced by Hauler,Dedicated Mandatory Recycling Enforcement Personnel,# Hours Enforcement Personnel on Street,Private Hauler regulations that require recycling,%recycle/hh,Recycling Collection Frequency_Bi-weekly,Recycling Collection Frequency_Weekly,Recycle Bin Size Ranking,hc_label
Abington,1,0,0.0,0.0,0.0,0,1,0,0,1,...,1,1,1,20.0,1,0.269733,0,1,0.5,0
Acton,0,1,0.0,100.0,30.0,1,1,0,0,0,...,1,0,0,0.0,0,0.310437,0,0,0.0,2
Acushnet,1,0,0.0,0.0,0.0,0,1,0,0,1,...,1,1,0,0.0,1,0.215648,1,0,1.0,0
Adams,1,1,0.0,50.0,0.0,1,1,0,0,0,...,1,1,0,0.0,1,0.509377,0,0,0.0,0
Agawam,1,0,0.0,0.0,0.0,0,1,0,0,1,...,0,0,0,0.0,0,0.249912,1,0,0.75,0


In [10]:
# Remove the household-count column from the table.
# errors='ignore' keeps this cell re-runnable: because the result is assigned
# back to the same name, a second execution would otherwise raise KeyError
# (the column is already gone).
df_of_serve_details = df_of_serve_details.drop(
    columns=['Households Served by Municipal Recycling Program'],
    errors='ignore',
)

In [12]:
# One frame per 'hc_label' value (0..3).
# The table is grouped once and the same GroupBy object serves all four
# lookups, instead of regrouping the full table for every label.
# get_group raises KeyError if a label is absent from the data.
l0_serve, l1_serve, l2_serve, l3_serve = map(
    df_of_serve_details.groupby('hc_label').get_group, range(4)
)

In [11]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, KFold
from sklearn.pipeline import make_pipeline
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor, BaggingRegressor, VotingRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.feature_selection import RFE

In [58]:
# Maps a result name to its list of mean CV scores (one entry per splitter in
# `kfs`). Filled by cross_validation_results; re-running this cell clears it.
cv_results = {}

# Eight 3-fold shuffled splitters whose seeds are 0, 1000, ..., 7000, so each
# estimator is scored on eight different random partitions of the data.
# Built with a comprehension so no loop index leaks into the notebook namespace.
kfs = [
    KFold(n_splits=3, shuffle=True, random_state=seed)
    for seed in range(0, 8000, 1000)
]

In [53]:
def cross_validation_results(estimator, serv_df, name, scoring=None):
    """Cross-validate `estimator` once per splitter in `kfs` and record the means.

    The target is the '%recycle/hh' column of `serv_df`; the features are all
    remaining columns except 'hc_label'.

    Parameters
    ----------
    estimator : scikit-learn estimator
        Model passed to `cross_val_score` for every splitter.
    serv_df : pandas.DataFrame
        Must contain the '%recycle/hh' and 'hc_label' columns.
    name : str
        Key under which the scores are stored in the global `cv_results`.
    scoring : str, callable or None, default None
        Forwarded to `cross_val_score`. None uses the estimator's own default
        scorer (R^2 for scikit-learn regressors).

    Returns
    -------
    list of float
        Mean fold score for each splitter in `kfs`, in order. The same list is
        stored in `cv_results[name]`.
    """
    X = serv_df.drop(columns=['%recycle/hh', 'hc_label'])
    y = serv_df['%recycle/hh']

    # Collect the scores locally and publish them only once every repetition
    # has succeeded; a failure part-way through then never leaves a short or
    # empty list in cv_results (which would break building the summary table).
    mean_scores = [
        cross_val_score(estimator, X, y, cv=cv_kf, scoring=scoring).mean()
        for cv_kf in kfs
    ]
    cv_results[name] = mean_scores
    return mean_scores

In [41]:
def cross_validation_procedure(df, tag):
    """Cross-validate the baseline and the three ensemble regressors on `df`.

    Each estimator is handed to `cross_validation_results` under the name
    `tag` + suffix, where the suffix is one of '__dummy', '__rfr',
    '__bagging' or '__boosting'.
    """
    candidates = (
        ("__dummy", DummyRegressor()),
        ("__rfr", RandomForestRegressor(random_state=8)),
        ("__bagging", BaggingRegressor(random_state=8)),
        ("__boosting", GradientBoostingRegressor(min_samples_leaf=5, random_state=8)),
    )
    for suffix, model in candidates:
        cross_validation_results(model, df, tag + suffix)

In [59]:
# Run the full estimator comparison on the label-0, label-1 and label-2 frames.
# NOTE(review): l3_serve is not evaluated here -- confirm this is intentional.
for _tag, _frame in (("L0", l0_serve), ("L1", l1_serve), ("L2", l2_serve)):
    cross_validation_procedure(_frame, _tag)

In [57]:
# Summary table: one row per result name, one column per splitter in `kfs`.
# The column labels (CV1, CV2, ...) are derived from len(kfs) so they stay in
# step if the number of splitters is changed.
pd.DataFrame(
    cv_results,
    index=['CV{}'.format(rep) for rep in range(1, len(kfs) + 1)],
).T

Unnamed: 0,CV1,CV2,CV3,CV4,CV5,CV6,CV7,CV8
L0__dummy,-0.004815,-0.029427,-0.033802,-0.063929,-0.037836,-0.061407,-0.067377,-0.063005
L0__rfr,-0.005876,-0.207804,-0.099049,-0.109347,-0.258244,-0.125331,-0.078966,-0.259149
L0__bagging,-0.07128,-0.185997,-0.142059,-0.247681,-0.349537,-0.227948,-0.1091,-0.405941
L0__boosting,-0.106651,-0.430078,-0.160093,-0.293652,-0.43942,-0.299837,-0.310505,-0.163671
L1__dummy,-0.060468,-0.603713,-0.049963,-0.164345,-0.054505,-0.038927,-0.092147,-0.00906
L1__rfr,-0.520265,-0.809277,-0.091234,-0.143359,-0.224457,-0.276685,-0.1074,-0.112319
L1__bagging,-0.681753,-0.474617,-0.127309,0.063806,-0.132621,-0.835211,-0.147648,-0.206977
L1__boosting,-0.490715,-1.167058,-0.243993,-0.505807,-0.437264,-0.487705,-0.266518,-0.144646
L2__dummy,-0.006035,-0.019695,-0.000809,-0.057763,-0.109205,-0.234084,-0.099909,-0.038117
L2__rfr,0.227025,0.219467,0.310968,0.266985,0.207874,0.082454,0.253125,0.404169
