In [1]:
import pandas as pd
import tensorflow as tf
import collections
import numpy as np
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score, r2_score
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import SVR, NuSVR, LinearSVR
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import normalize
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold

  from ._conv import register_converters as _register_converters


In [2]:
# Schema of the housing CSV: column name -> dtype used when parsing, listed in
# the order the columns are assigned to the file. The trailing notes are the
# author's observations about each feature's effect on model accuracy.
COLUMN_TYPES = collections.OrderedDict(
    crime=float,    # important
    zn=float,
    indus=float,
    chas=int,
    nox=float,
    rm=float,       # important: accuracy drops without it
    age=float,
    dis=float,      # important
    rad=int,
    tax=int,
    ptratio=float,  # important
    b=float,        # does not contribute much
    lstat=float,    # important
    medv=float,     # regression target (see y_name)
)

In [3]:
# Load the semicolon-delimited housing data. header=0 together with `names`
# replaces the file's own header row with the COLUMN_TYPES names, and every
# column is parsed with its declared dtype.
df = pd.read_csv(
    'housingdata.csv',
    sep=';',
    header=0,
    names=list(COLUMN_TYPES),
    dtype=COLUMN_TYPES,
)

In [4]:
# Deterministic seed derived from a fixed phrase (sum of its character codes).
seed = sum(ord(ch) for ch in "LIGHT FROM LIGHT")
np.random.seed(seed)

# Name of the target column.
y_name = 'medv'

# Mean of the raw target; kept so predictions can be shifted back to the
# original scale later.
mean = np.mean(df[y_name])

# Center the target BEFORE shuffling so that df_sampled (used for the
# train/valid/test splits) and df (used for the final full-data fit) carry the
# same, centered target. Previously df_sampled was drawn first and therefore
# kept the raw target while df was centered.
# NOTE: this rewrites df[y_name] in place; re-running this cell without
# reloading df would recompute `mean` on already-centered data (~0).
df[y_name] = df[y_name] - mean

# Shuffle all rows reproducibly.
df_sampled = df.sample(frac=1, random_state=seed)

In [5]:
# Features
# Positional split of the shuffled frame: the last 10% of rows form the
# validation set, then the last 10% of the remainder form the test set.
# iloc slices replace np.split, which relies on the deprecated
# DataFrame.swapaxes; .copy() makes each split independent of df_sampled so the
# pop() calls below do not operate on a view of the parent frame.
valid_start = int(0.9 * len(df_sampled))
x_valid = df_sampled.iloc[valid_start:].copy()
x_train = df_sampled.iloc[:valid_start]

test_start = int(0.9 * len(x_train))
x_test = x_train.iloc[test_start:].copy()
x_train = x_train.iloc[:test_start].copy()

# Targets
# pop() removes the target column from each feature frame and returns it.
y_train = x_train.pop(y_name)
y_valid = x_valid.pop(y_name)
y_test = x_test.pop(y_name)

In [6]:
# Display the mean of the 'medv' column as computed before the column was centered in df.
mean

22.532806324110698

In [7]:
#model = GradientBoostingRegressor(random_state=seed)
#param_grid = { "loss"              : ['ls', 'huber', 'lad'],
#               "n_estimators"      : [1,25,100,300,500],
#               "max_depth"         : [1,3,6,9,12,15,18],
#               "min_samples_split" : [2,4,6,8,10],
#               "learning_rate": [0.01, 0.05, 0.1]}
#grid_search = GridSearchCV(model, param_grid, n_jobs=-1, cv=2, scoring='r2', verbose=1)
#grid_search.fit(x_train, y_train)
#print(grid_search.best_params_)

In [8]:
#model = GradientBoostingRegressor(loss='huber', 
#                                 max_depth=3, 
#                                 min_samples_split=4, 
#                                 n_estimators=300,
#                                 learning_rate=0.1,
#                                 random_state=seed)
#
#scores = cross_val_score(model, x_train, y_train, cv=5)
#scores # array([0.88990719, 0.89271209, 0.88811618, 0.76994707, 0.87563016])
#sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
#sel.fit_transform(x_train)

In [9]:
# Fit the gradient-boosting regressor on the training split and score it on the
# validation split. The hyperparameters match the commented-out configuration
# in the cell above (presumably taken from the grid search -- confirm).
model = Pipeline([
    # ('feature_selection', SelectFromModel(SVR())),  # disabled experiment
    ('regression', GradientBoostingRegressor(
        loss='huber',
        max_depth=3,
        min_samples_split=4,
        n_estimators=300,
        learning_rate=0.1,
        random_state=seed,
    )),
])
model.fit(x_train, y_train)

# Reuse the validation index so predictions stay aligned with y_valid.
y_pred = pd.Series(model.predict(x_valid), index=y_valid.index)

print("Mean squared error: %.2f" % mean_squared_error(y_valid, y_pred))  # lower is better
print("Mean absolute error: %.2f" % mean_absolute_error(y_valid, y_pred))  # lower is better
print("Explained Variance Score: %.2f" % explained_variance_score(y_valid, y_pred))  # 1 is perfect
print('R2 score: %.2f' % r2_score(y_valid, y_pred))  # 1 is perfect

Mean squared error: 10.10
Mean absolute error: 2.01
Explained Variance Score: 0.89
R2 score: 0.87


In [10]:
# Score the fitted model on the test split (not used when fitting `model`).
# Predictions reuse the test index so they stay aligned with y_test.
y_test_pred = pd.Series(model.predict(x_test), index=y_test.index)

# (format string, metric function) pairs, reported in this order.
# MSE and MAE: lower is better. Explained variance and R2: 1 is perfect.
for report_format, metric_fn in (
    ("Mean squared error: %.2f", mean_squared_error),
    ("Mean absolute error: %.2f", mean_absolute_error),
    ("Explained Variance Score: %.2f", explained_variance_score),
    ('R2 score: %.2f', r2_score),
):
    print(report_format % metric_fn(y_test, y_test_pred))

Mean squared error: 13.15
Mean absolute error: 2.30
Explained Variance Score: 0.87
R2 score: 0.87


In [11]:
# Good enough! Refit the same configuration on the full dataset, to be saved.
# NOTE: df[y_name] was mean-centered in an earlier cell, so this model predicts
# deviations from `mean`; add `mean` back to get values on the original scale.
model = Pipeline([
    # ('feature_selection', SelectFromModel(SVR())),  # disabled experiment
    ('regression', GradientBoostingRegressor(
        loss='huber',
        max_depth=3,
        min_samples_split=4,
        n_estimators=300,
        learning_rate=0.1,
        random_state=seed,
    )),
])

# Build features and target without mutating df. The previous
# `X = df; y = df.pop(y_name)` removed the target column from df in place, so
# re-running this cell raised KeyError.
X = df.drop(columns=[y_name])
y = df[y_name]
model.fit(X, y)
# TODO: persist the fitted model; saving is not implemented in this cell.

Pipeline(memory=None,
     steps=[('regression', GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='huber', max_depth=3,
             max_features=None, max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=4,
             min_weight_fraction_leaf=0.0, n_estimators=300,
             presort='auto', random_state=1124, subsample=1.0, verbose=0,
             warm_start=False))])