In [None]:
import json

import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.ensemble import GradientBoostingRegressor

import kaggle_utils
from data_augmentation.combined_attributes_adder import CombinedAttributesAdder
from data_augmentation.coords import NearestCluster

In [None]:
# Load the training data through the project's kaggle_utils helper.
train = kaggle_utils.read_train_data()

In [None]:
# Quick look at the first rows to sanity-check the loaded columns.
train.head()

In [None]:
class Experimentor:
    '''Run config-driven preprocessing and cross-validation experiments.

    Makes trying different combinations of input preprocessing easier: the
    user edits a JSON config file to choose how the input is processed, and
    every call to ``run`` re-reads that file, so settings can be changed
    between runs without re-creating the object.

    Config keys read by this class:
        random_state
        preprocess.do_preproc
        preprocess.append_sklearn_dataset
        preprocess.add_attributes, preprocess.combined_attrs
        preprocess.kmeans.enable, preprocess.kmeans.n_clusters
        preprocess.do_scale
        cross_validation.do_cross_val
        cross_validation.num_splits
        cross_validation.scoring
    '''

    # Name of the target column that is split off the input frame.
    TARGET_COLUMN = 'MedHouseVal'
    # Key under which cross_validate reports the per-fold negated RMSE.
    _RMSE_KEY = 'test_neg_root_mean_squared_error'

    def __init__(self, model):
        '''Store the estimator to evaluate; data and config arrive in ``run``.'''
        self.input_data = None
        self.model = model
        self.config = None
        self.cv_results = None
        self.prev_cv_results = None

    def _split_features_target(self):
        '''Split ``self.input_data`` into (features, target) via kaggle_utils.'''
        return kaggle_utils.split_X_y(self.input_data, self.TARGET_COLUMN)

    def _preprocess_input(self):
        '''Build the preprocessing pipeline selected in the config and apply it.

        Returns:
            (X_proc, y): the transformed features and the target column. When
            no pipeline stage is enabled the features are returned untouched.
        '''
        preprocess_cfg = self.config['preprocess']

        if preprocess_cfg['append_sklearn_dataset']:
            self._append_additional_data()

        X, y = self._split_features_target()

        numeric_pipeline_stages = []

        # this reduces performance and increases training time
        if preprocess_cfg['add_attributes']:
            numeric_pipeline_stages.append(
                ('attr_adder',
                 CombinedAttributesAdder(preprocess_cfg['combined_attrs']))
            )

        # this makes a negligible improvement (about 0.1%)
        if preprocess_cfg['kmeans']['enable']:
            numeric_pipeline_stages.append(
                ('nearest_cluster',
                 NearestCluster(preprocess_cfg['kmeans']['n_clusters']))
            )

        if preprocess_cfg['do_scale']:
            numeric_pipeline_stages.append(('scaler', StandardScaler()))

        # A Pipeline with no steps cannot be fitted, so when every optional
        # stage is disabled there is nothing to do.
        if not numeric_pipeline_stages:
            return X, y

        # NOTE(review): the pipeline is fitted on the full dataset before the
        # cross-validation split, so fitted stages see rows that later land in
        # validation folds. Consider cross-validating Pipeline(stages + model).
        pipeline = Pipeline(numeric_pipeline_stages)
        X_proc = pipeline.fit_transform(X)

        return X_proc, y

    def _append_additional_data(self):
        '''Append the scikit-learn California housing data to ``self.input_data``.

        The original row labels are kept, so the combined frame may contain
        duplicate index labels.
        '''
        housing = fetch_california_housing()
        data = np.concatenate([housing.data, housing.target.reshape([-1, 1])], axis=1)
        columns = housing.feature_names + housing.target_names
        more_data = pd.DataFrame(data, columns=columns)
        self.input_data = pd.concat([self.input_data, more_data], axis=0)

    def _set_model_random_state(self):
        '''Seed the model from the config if it exposes ``random_state``.'''
        if 'random_state' in self.model.get_params():
            self.model.set_params(random_state=self.config['random_state'])

    def _evaluate_cv(self, X, y):
        '''Cross-validate the model on (X, y), keeping the previous results.

        The outcome is stored in ``self.cv_results``; whatever was there
        before moves to ``self.prev_cv_results`` so two runs can be compared.
        '''
        cv_cfg = self.config['cross_validation']

        # cross_validate reports a single string metric under 'test_score' but
        # a list of metrics under 'test_<name>'. Wrapping a string in a list
        # keeps the result keys predictable for _print_results.
        scoring = cv_cfg['scoring']
        if isinstance(scoring, str):
            scoring = [scoring]

        self.prev_cv_results = self.cv_results
        k_fold = KFold(
            n_splits=cv_cfg['num_splits'],
            shuffle=True,
            random_state=self.config['random_state']
        )
        self.cv_results = cross_validate(
            self.model,
            X,
            y,
            cv=k_fold,
            scoring=scoring,
            n_jobs=-1
        )

    def _print_summary(self, title, results):
        '''Print the mean and std of RMSE and fit time for one result set.

        Raises:
            KeyError: if ``results`` has no negated-RMSE entry.
        '''
        if self._RMSE_KEY not in results:
            raise KeyError(
                f"{self._RMSE_KEY!r} not found in CV results; add "
                "'neg_root_mean_squared_error' to cross_validation.scoring "
                "in the config"
            )
        # The scorer is negated (higher is better), so flip the sign back.
        rmse = -results[self._RMSE_KEY]
        fit_time = results['fit_time']
        print(title)
        print(f'rmse: {rmse.mean():.5f} +/- {rmse.std():.5f}')
        print(f'training time: {fit_time.mean():.2f}s +/- {fit_time.std():.2f}s')

    def _print_results(self):
        '''Print the latest CV results and, if present, the previous ones.'''
        self._print_summary('CV Results', self.cv_results)
        if self.prev_cv_results is not None:
            print('-----')
            self._print_summary('Previous CV Results', self.prev_cv_results)

    def run(self, input_data, config_path='config.json'):
        '''Load the config, then preprocess and cross-validate as it directs.

        Args:
            input_data: frame containing the features and the target column.
            config_path: path of the JSON config file, re-read on every call.
        '''
        with open(config_path, 'r') as f:
            self.config = json.load(f)

        self.input_data = input_data
        X = y = None
        if self.config['preprocess']['do_preproc']:
            X, y = self._preprocess_input()

        self._set_model_random_state()

        if self.config['cross_validation']['do_cross_val']:
            # With preprocessing switched off there is no X/y yet, so fall
            # back to the raw features instead of failing on unbound names.
            if X is None:
                X, y = self._split_features_target()
            self._evaluate_cv(X, y)
            self._print_results()


In [None]:
# Estimator under evaluation; Experimentor sets its random_state from the
# config at run time, so no seed is passed here.
model = GradientBoostingRegressor()
experimentor = Experimentor(model)

In [None]:
# Reads config.json from the working directory on every call, so re-running
# this cell after editing the config picks up the new settings.
experimentor.run(input_data=train)