In [1]:
import csv
import json
from pathlib import Path

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from google.cloud import bigquery as bq

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.options.mode.chained_assignment = None
%matplotlib inline

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, ShuffleSplit, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error

In [None]:
# Load every row of the BigQuery table below and keep a 10% random sample of it.
query="""
    SELECT
      *
    FROM
      `ual3d-277909.nl.zuid_holland_ml_df`
"""

# Fixed seed so that a kernel restart reproduces the same 10% subset (and therefore
# the same train/test rows and cross-validation scores further down).
SAMPLE_SEED = 42

df = bq.Client().query(query).to_dataframe()
df = df.sample(frac=0.1, random_state=SAMPLE_SEED)
df.head()

In [None]:
# Move osm_way_id into the row index; this keeps it out of df.columns, from which
# the feature column lists are derived later.
df = df.set_index('osm_way_id')

In [None]:
# (rows, columns) of the sampled, re-indexed frame.
df.shape

In [None]:
# Column dtypes as delivered by BigQuery, before any casting.
df.dtypes

In [None]:

# Cast the four location columns to pandas' categorical dtype in one call,
# then re-display the dtypes to confirm the change.
location_dtypes = {name: "category" for name in ("city", "state_name", "country", "postcode")}
df = df.astype(location_dtypes)
# Disabled nullable-integer casts, kept for reference:
# df["year"] = df["year"].astype(float).astype("Int64")
# df["count_vertices"] = df["count_vertices"].astype(float).astype("Int64")
# df["count_neighbor"] = df["count_neighbor"].astype(float).astype("Int64")
# df["building_levels"] = df["building_levels"].astype(float).astype("Int64")
df.dtypes

In [None]:
# Target column, numeric feature columns, and (by elimination) categorical columns.
label_col = ["bldg_height"]
num_cols = [
    "area",
    "perimeter",
    "length",
    "building_levels",
    "year",
    "count_vertices",
    "compactness",
    "complexity",
    "count_neighbor",
]
# cat_cols = ["city", "postcode"]
# Everything that is neither numeric nor the label counts as categorical;
# the order of df.columns is preserved.
non_categorical = set(num_cols) | set(label_col)
cat_cols = [name for name in df.columns if name not in non_categorical]


In [None]:
# For city, postcode and year: print how many distinct values occur, then the values.
# A blank line separates each column's report from the previous one.
for position, column in enumerate(('city', 'postcode', 'year')):
    distinct = pd.unique(df[column])
    lead = '\n' if position else ''
    print(f'{lead}count unique {column}:', len(distinct))
    print(f'unique {column}:', distinct)

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:

# Numeric-only view of the data used for the exploratory summaries below.
num_df = df.loc[:, num_cols]

In [None]:
# Summary statistics of the numeric features before count_neighbor is filled.
num_df.describe()

In [None]:
# One box plot per numeric feature, to eyeball spread and outliers.
num_df.plot(kind='box', subplots=True, figsize=(15, 5))
# Render the figure; the previous bare plt.plot() drew nothing and echoed an empty list.
plt.show()

In [None]:
# Replace missing neighbour counts with 0 in the exploratory numeric frame.
# The result is assigned back explicitly: calling fillna(..., inplace=True) on
# num_df['count_neighbor'] acts on an intermediate Series and, with pandas
# copy-on-write, leaves num_df itself unchanged.
num_df = num_df.assign(count_neighbor=num_df['count_neighbor'].fillna(0))

In [None]:
# Summary statistics again, after filling count_neighbor, for comparison.
num_df.describe()

In [None]:
# Same per-feature box plots, redrawn after count_neighbor was filled.
num_df.plot(kind='box', subplots=True, figsize=(15, 5))
# Render the figure; the previous bare plt.plot() drew nothing and echoed an empty list.
plt.show()

We assume that a missing (null) `count_neighbor` means a count of 0, so nulls in that column are replaced with 0.

In [None]:
# Replace missing neighbour counts with 0 in the modelling frame.
# Assigned back explicitly: fillna(..., inplace=True) on df['count_neighbor'] acts on an
# intermediate Series and, with pandas copy-on-write, would leave df unchanged.
df['count_neighbor'] = df['count_neighbor'].fillna(0)
# Re-check the remaining missing values per column.
df.isna().sum()

In [None]:
# Columns currently present, before the unused ones are dropped.
df.columns

In [None]:
# country and state_name are considered irrelevant for the model, so remove them.
df = df.drop(columns=['country', 'state_name'])
print(df.columns)

In [None]:
# # drop length and perimeter as they are highly correlated 
# df = df.drop(['length', 'perimeter'], axis=1)
# print(df.columns)

In [None]:
# Numeric feature columns routed to the numeric preprocessing branch.
num_cols = [
    'area',
    'length',
    'perimeter',
    'building_levels',
    'year',
    'count_vertices',
    'compactness',
    'complexity',
    'count_neighbor',
]
print(num_cols)

In [None]:
# Recompute the categorical columns now that country/state_name are gone:
# every remaining column that is neither numeric nor the label, in df.columns order.
excluded = set(num_cols) | set(label_col)
cat_cols = [name for name in df.columns if name not in excluded]
print(cat_cols)


In [None]:
# Separate the target (building height) from the features, then hold out 20% of the
# rows as a test set using a fixed seed.
y = df['bldg_height']
X = df.drop(columns='bldg_height')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=69,
)

In [None]:
# Display the feature frame and the target series together as a quick visual check.
X, y

In [None]:
# Numeric branch: mean-impute (adding missing-value indicator columns), standardise,
# then SelectKBest with k='all', which keeps every feature.
num_transformer = Pipeline([
    ('imputer', SimpleImputer(add_indicator=True, strategy='mean')),
    ('scaler', StandardScaler()),
    ('kbest', SelectKBest(score_func=f_regression, k='all')),
])


In [None]:
# Categorical branch: fill missing values with the literal "NA" (adding indicator
# columns), one-hot encode, then SelectKBest with k='all', which keeps every feature.
# NOTE(review): drop='first' is combined with handle_unknown='ignore'. On scikit-learn
# versions that accept the pair, a category unseen during fit presumably encodes as all
# zeros, i.e. the same as the dropped first category - confirm this is intended.
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(fill_value="NA", add_indicator=True, strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop='first')),
    ('kbest', SelectKBest(score_func=f_regression, k='all')),
])

In [None]:
# Route numeric and categorical columns through their respective branches.
# The step name 'cat_transfomer' (sic) is left untouched because parameter paths
# are derived from it.
preprocessor = ColumnTransformer([
    ('num_transformer', num_transformer, num_cols),
    ('cat_transfomer', cat_transformer, cat_cols),
])

In [None]:
# ('imputer', KNNImputer(n_neighbors=50, weights='uniform'))
# ,('kbest', SelectKBest(f_regression, k=5))
# ,('kbest', SelectKBest(f_regression, k=50))

In [None]:
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression

In [None]:
# print(sorted(sklearn.metrics.SCORERS.keys()) )

In [None]:
# Candidate models for the cross-validated comparison, as (short label, estimator) pairs.
# Every estimator with a random component is given the same fixed seed: the MAE, MSE
# and R^2 comparisons each refit the models, so without a seed the three metrics would
# describe different fitted models and a re-run would not reproduce the scores.
MODEL_SEED = 42

regressors = [
    ('LR', LinearRegression()),
    ('DT', DecisionTreeRegressor(random_state=MODEL_SEED)),
    ('KNN', KNeighborsRegressor()),
    ('RF', RandomForestRegressor(random_state=MODEL_SEED)),
    ('AB', AdaBoostRegressor(random_state=MODEL_SEED)),
    ('B', BaggingRegressor(random_state=MODEL_SEED)),
    ('GB', GradientBoostingRegressor(random_state=MODEL_SEED)),
]
# Alternative configurations kept for reference (disabled):
#   ('DT', DecisionTreeRegressor(max_depth=30, random_state=42))
#   ('KNN', KNeighborsRegressor(n_neighbors=30, weights="distance"))
#   ('RF', RandomForestRegressor(n_estimators=100, max_depth=10))


In [None]:
# Per-model arrays of cross-validation scores (negated MAE), keyed by model label.
mae_reg_scores = dict()

In [None]:
# Cross-validate each candidate on the training split, scored by negated mean absolute error.
# The seeded splitter is created once; it yields the same five 70/30 shuffles for every model.
cv = ShuffleSplit(n_splits=5, test_size=.3, random_state=42)
for name, regressor in regressors:
    mdl_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', regressor)])
    scores = cross_val_score(
        mdl_pipeline,
        X_train,
        y_train,
        cv=cv,
        scoring='neg_mean_absolute_error',
        error_score='raise',  # fail loudly instead of recording NaN for a broken fit
        n_jobs=-1,
    )
    mae_reg_scores[name] = scores
    summary = (name, scores.mean(), scores.std(), scores.max(), scores.min())
    msg = "%s: avg: %f (std: %f, max: %f, min: %f)" % summary
    print(msg)

In [None]:
# Box plot of the cross-validated MAE per model, written to model_selection_cv/.
out_path = Path('model_selection_cv/South-Holland-Sample_MAE.png')
out_path.parent.mkdir(parents=True, exist_ok=True)  # savefig does not create missing folders

fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison')

# The scores were produced by the 'neg_mean_absolute_error' scorer and are therefore
# negated; flip the sign so the 'MAE Scores' axis shows actual (non-negative) errors.
ax.boxplot([-fold_scores for fold_scores in mae_reg_scores.values()])
ax.set_xticklabels(list(mae_reg_scores))

ax.set_xlabel('Models')
ax.set_ylabel('MAE Scores')
fig.savefig(out_path, dpi=600)
plt.show()


In [None]:
# Remove the AdaBoost ('AB') entry if it is there, then display what is left.
mae_reg_scores.pop('AB', None)
mae_reg_scores

In [None]:
# Box plot of the cross-validated MAE for the models still in mae_reg_scores,
# written to model_selection_cv/.
out_path = Path('model_selection_cv/South-Holland-Sample_MAE_Except-AB.png')
out_path.parent.mkdir(parents=True, exist_ok=True)  # savefig does not create missing folders

fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison')

# The scores were produced by the 'neg_mean_absolute_error' scorer and are therefore
# negated; flip the sign so the 'MAE Scores' axis shows actual (non-negative) errors.
ax.boxplot([-fold_scores for fold_scores in mae_reg_scores.values()])
ax.set_xticklabels(list(mae_reg_scores))

ax.set_xlabel('Models')
ax.set_ylabel('MAE Scores')
fig.savefig(out_path, dpi=600)
plt.show()



In [None]:
# Per-model arrays of cross-validation scores (negated MSE), keyed by model label.
mse_reg_scores = dict()

In [None]:
# Cross-validate each candidate on the training split, scored by negated mean squared error.
# The seeded splitter is created once; it yields the same five 70/30 shuffles for every model.
cv = ShuffleSplit(n_splits=5, test_size=.3, random_state=42)
for name, regressor in regressors:
    mdl_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', regressor)])
    scores = cross_val_score(
        mdl_pipeline,
        X_train,
        y_train,
        cv=cv,
        scoring='neg_mean_squared_error',
        error_score='raise',  # fail loudly instead of recording NaN for a broken fit
        n_jobs=-1,
    )
    mse_reg_scores[name] = scores
    summary = (name, scores.mean(), scores.std(), scores.max(), scores.min())
    msg = "%s: avg: %f (std: %f, max: %f, min: %f)" % summary
    print(msg)

In [None]:
# Box plot of the cross-validated MSE per model, written to model_selection_cv/.
out_path = Path('model_selection_cv/South-Holland-Sample_MSE.png')
out_path.parent.mkdir(parents=True, exist_ok=True)  # savefig does not create missing folders

fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison')

# The scores were produced by the 'neg_mean_squared_error' scorer and are therefore
# negated; flip the sign so the 'MSE Scores' axis shows actual (non-negative) errors.
ax.boxplot([-fold_scores for fold_scores in mse_reg_scores.values()])
ax.set_xticklabels(list(mse_reg_scores))

ax.set_xlabel('Models')
ax.set_ylabel('MSE Scores')
fig.savefig(out_path, dpi=600)
plt.show()

In [None]:
# Remove the AdaBoost ('AB') entry if it is there, then display what is left.
mse_reg_scores.pop('AB', None)
mse_reg_scores

In [None]:
# Box plot of the cross-validated MSE for the models still in mse_reg_scores,
# written to model_selection_cv/.
out_path = Path('model_selection_cv/South-Holland-Sample_MSE_Except-AB.png')
out_path.parent.mkdir(parents=True, exist_ok=True)  # savefig does not create missing folders

fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison')

# The scores were produced by the 'neg_mean_squared_error' scorer and are therefore
# negated; flip the sign so the 'MSE Scores' axis shows actual (non-negative) errors.
ax.boxplot([-fold_scores for fold_scores in mse_reg_scores.values()])
ax.set_xticklabels(list(mse_reg_scores))

ax.set_xlabel('Models')
ax.set_ylabel('MSE Scores')
fig.savefig(out_path, dpi=600)
plt.show()

In [None]:
# Per-model arrays of cross-validation R^2 scores, keyed by model label.
r2_reg_scores = dict()

In [None]:
# Cross-validate each candidate on the training split, scored by R^2.
# The seeded splitter is created once; it yields the same five 70/30 shuffles for every model.
cv = ShuffleSplit(n_splits=5, test_size=.3, random_state=42)
for name, regressor in regressors:
    mdl_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', regressor)])
    scores = cross_val_score(
        mdl_pipeline,
        X_train,
        y_train,
        cv=cv,
        scoring='r2',
        error_score='raise',  # fail loudly instead of recording NaN for a broken fit
        n_jobs=-1,
    )
    r2_reg_scores[name] = scores
    summary = (name, scores.mean(), scores.std(), scores.max(), scores.min())
    msg = "%s: avg: %f (std: %f, max: %f, min: %f)" % summary
    print(msg)

In [None]:
# Box plot of the cross-validated R^2 per model, written to model_selection_cv/.
out_path = Path('model_selection_cv/South-Holland-Sample_R2.png')
out_path.parent.mkdir(parents=True, exist_ok=True)  # savefig does not create missing folders

fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison')

# One box per model, in dictionary order; tick labels use the same order.
ax.boxplot(list(r2_reg_scores.values()))
ax.set_xticklabels(list(r2_reg_scores))

ax.set_xlabel('Models')
ax.set_ylabel('R^2 Scores')
fig.savefig(out_path, dpi=600)
plt.show()

In [None]:
# Remove the AdaBoost ('AB') entry if it is there, then display what is left.
r2_reg_scores.pop('AB', None)
r2_reg_scores

In [None]:
# Box plot of the cross-validated R^2 for the models still in r2_reg_scores,
# written to model_selection_cv/.
out_path = Path('model_selection_cv/South-Holland-Sample_R2_Except-AB.png')
out_path.parent.mkdir(parents=True, exist_ok=True)  # savefig does not create missing folders

fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison')

# One box per model, in dictionary order; tick labels use the same order.
ax.boxplot(list(r2_reg_scores.values()))
ax.set_xticklabels(list(r2_reg_scores))

ax.set_xlabel('Models')
ax.set_ylabel('R^2 Scores')
fig.savefig(out_path, dpi=600)
plt.show()