In [1]:
import datetime
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd
from scipy import stats, special
from sklearn import model_selection
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.linear_model import Ridge, Lasso, SGDRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.svm import SVR

%matplotlib inline

## Loading and Splitting Transformed Data

In [2]:
# Reproducibility and evaluation settings shared by the cells below.
seed = 0
scoring = 'neg_mean_squared_log_error'

# Input location (directory + file name) and the directory that receives saved models.
data_dir = 'input_data'
data_file = 'train_pca.csv'
model_dir = 'models'

In [3]:
# Load the transformed training data, using the 'ID' column as the row index.
processed_df = pd.read_csv(os.path.join(data_dir, data_file), index_col='ID')

In [6]:
# Split the frame into the regression target and the feature matrix.
# The target is removed by name rather than by position so that it can never
# end up among the features if the column order of the CSV changes.
# NOTE(review): the original used iloc[:, 1:], i.e. it assumed 'target' is the
# first column after the 'ID' index — confirm no other leading column was
# being dropped on purpose.
y = processed_df['target']
X = processed_df.drop(columns='target')

In [None]:
# Hold out a validation split; test_size is left at scikit-learn's default and
# the split is seeded so it is the same on every run.
X_train, X_validate, y_train, y_validate = model_selection.train_test_split(X, y, random_state=seed)

## Model Selection

In [None]:
# Candidate regressors, each paired with the short label that is printed next
# to its cross-validation score.
# Estimators that accept random_state are seeded so repeated runs are reproducible.
models = [
    ('RCV', Ridge()),
    ('SVM', SVR()),
    ('RFR', RandomForestRegressor(random_state=seed)),
    # Previously labelled 'GBM', which misnamed AdaBoost and was easy to
    # confuse with the gradient boosting entry below.
    ('ADA', AdaBoostRegressor(random_state=seed)),
    ('GBR', GradientBoostingRegressor(random_state=seed)),
    ('SGD', SGDRegressor(random_state=seed)),
    ('LSO', Lasso()),
]

In [None]:
# The splitter's settings do not depend on the model, so it is built once and
# reused for every candidate.
kfold = model_selection.KFold(n_splits=10, random_state=seed, shuffle=True)

results, names = [], []
for name, model in models:
    # Per-fold scores for this candidate under the configured scorer.
    cv_results = model_selection.cross_val_score(model, X, y, scoring=scoring, cv=kfold)
    names.append(name)
    results.append(cv_results)
    # Label, mean score, and standard deviation across folds.
    print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))

## Hyperparameter tuning

It looks like a baseline gradient boosted regression performed the best on the training data. I will focus on tuning the hyperparameters of this algorithm further using GridSearchCV.

In [4]:
# Base estimator for the search; seeded so the tuned (and later pickled) model
# is reproducible from run to run.
GBRegressor = GradientBoostingRegressor(random_state=seed)

# Hyperparameter grid: every combination of these values is evaluated.
grid_values_grdb = {
    'n_estimators': [10, 100],
    'max_depth': [1, 5],
    'learning_rate': [.001, .01],
}

# Exhaustive grid search scored with the same metric used for model selection.
GBRegressor_CV = model_selection.GridSearchCV(GBRegressor, param_grid=grid_values_grdb, scoring=scoring)

In [None]:
# Run the grid search on the training split only; the validation split is kept
# aside for the evaluation cells that follow.
GBRegressor_CV.fit(X_train, y_train)

In [None]:
# Predict on the held-out validation split with the fitted grid-search object.
# These values are on the same scale as y_train; a later cell inverts the Box-Cox transform.
GRB_predicted = GBRegressor_CV.predict(X_validate)

In [None]:
# Map the predictions back to the original target scale by inverting Box-Cox.
# The lambda is re-estimated here from the raw 'target' column of train.csv.
# NOTE(review): assumes this reproduces the lambda used when the transformed
# file was created — confirm against the preprocessing step.
# NOTE: this cell overwrites GRB_predicted with a function of itself, so
# running it twice applies the inverse transform twice.
raw_target = pd.read_csv(
    os.path.join(data_dir, 'train.csv'),
    usecols=['target', 'ID'],
    index_col='ID',
)['target']
_, boxcox_lambda = stats.boxcox(raw_target)
GRB_predicted = special.inv_boxcox(GRB_predicted.reshape(-1, 1), boxcox_lambda)

In [None]:
# Take the first element of each row, turning the column-shaped predictions
# into a plain list of scalars.
GRB_predicted = [row[0] for row in GRB_predicted]

Reversing Box-Cox transformation on target data

In [None]:
# Bring the validation targets back to the original scale by inverting Box-Cox.
# The lambda is re-estimated from the raw 'target' column of train.csv.
# NOTE(review): assumes this matches the lambda used to build the transformed
# data — confirm against the preprocessing step.
raw_target = pd.read_csv(
    os.path.join(data_dir, 'train.csv'),
    usecols=['target', 'ID'],
    index_col='ID',
)['target']
_, boxcox_lambda = stats.boxcox(raw_target)
y_validate_inv = special.inv_boxcox(y_validate, boxcox_lambda)

Evaluating RMSLE on validation data:

In [None]:
# RMSLE is the square root of the mean squared log error between the
# inverse-transformed targets and predictions.
validation_msle = mean_squared_log_error(y_validate_inv, GRB_predicted)
np.sqrt(validation_msle)

In [None]:
# Scatter of true vs. predicted validation values, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.scatter(y_validate_inv, GRB_predicted)
ax.set_title('Predicted Versus Actual Plot')
ax.set_xlabel('y True')
ax.set_ylabel('y Predicted')



These results indicate a need for further model refinement.

In [7]:
# Re-run the grid search on the full dataset (training and validation rows together).
# This refitted object is the one that gets pickled in the save cell.
GBRegressor_CV.fit(X, y)

GridSearchCV(cv=None, error_score=nan,
             estimator=GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0,
                                                 criterion='friedman_mse',
                                                 init=None, learning_rate=0.1,
                                                 loss='ls', max_depth=3,
                                                 max_features=None,
                                                 max_leaf_nodes=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=100,
                                                 n_iter_no_change=None,
              

## Saving Model for Deployment

In [8]:
# Date suffix for the file name in mm_dd_yy form. Explicit directives are used
# instead of "%D", which is a platform-specific strftime extension; the
# resulting string is the same as "%D" with '/' replaced by '_'.
date_tag = datetime.datetime.now().strftime("%m_%d_%y")

# Create the model directory if it is missing; no error if it already exists.
os.makedirs(model_dir, exist_ok=True)

# Serialize the fitted grid-search object. The context manager guarantees the
# file is flushed and closed even if pickling fails.
filename = 'GBR_model_{}'.format(date_tag)
with open(os.path.join(model_dir, filename), 'wb') as model_file:
    pickle.dump(GBRegressor_CV, model_file)