In [None]:
import datetime
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats, special
from sklearn import model_selection
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.linear_model import Ridge, Lasso, SGDRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.svm import SVR

%matplotlib inline

## Loading and Splitting Transformed Data

In [None]:
# Reproducibility and scoring settings shared by the model-selection calls below.
seed = 0
scoring = 'neg_mean_squared_error'

# Input/output locations (relative paths).
data_dir = 'input_data'
data_file = 'train_pca.csv'
model_dir = 'models'

In [None]:
# Read the training table, using its 'ID' column as the row index.
train_pca_path = os.path.join(data_dir, data_file)
processed_df = pd.read_csv(train_pca_path, index_col='ID')

In [None]:
# Select the label by name and use every remaining column as a feature.
# Dropping 'target' by name (instead of slicing off column 0) keeps the label out
# of X, and keeps every feature in X, even if the CSV's column order changes.
y = processed_df['target']
X = processed_df.drop(columns='target')

In [None]:
# Hold out a validation subset (library-default split size); `seed` makes the
# partition repeatable across runs.
split_parts = model_selection.train_test_split(X, y, random_state=seed)
X_train, X_validate, y_train, y_validate = split_parts

## Model Selection

In [None]:
# Candidate regressors with default hyperparameters, as (label, estimator) pairs.
# The label is what the cross-validation report prints, so each one names the
# estimator it is paired with (Ridge -> 'RDG', AdaBoost -> 'ABR').
# Estimators that draw random numbers receive `random_state=seed` so re-running
# the notebook reproduces the same scores.
models = [
    ('RDG', Ridge()),
    ('SVM', SVR()),
    ('RFR', RandomForestRegressor(random_state=seed)),
    ('ABR', AdaBoostRegressor(random_state=seed)),
    ('GBR', GradientBoostingRegressor(random_state=seed)),
    ('SGD', SGDRegressor(random_state=seed)),
    ('LSO', Lasso()),
]

In [None]:
# Cross-validate every candidate on the same 10 shuffled folds.
# The splitter depends only on `seed`, so it is built once rather than once per model.
kfold = model_selection.KFold(n_splits=10, random_state=seed, shuffle=True)

results = []  # per-model arrays of fold scores, in the order of `names`
names = []
for name, model in models:
    cv_results = model_selection.cross_val_score(model, X, y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # Report "<label>: <mean of fold scores> (<std of fold scores>)" for the
    # scorer named by `scoring`.
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

## Hyperparameter tuning

It looks like baseline gradient boosting regression and random forest regression performed the best on the training data. I will focus on tuning the hyperparameters of these algorithms further using GridSearchCV.

### Gradient Boosting Regressor

In [None]:
# Instantiating the model and parameters.
# The estimator is seeded so the grid search gives the same result on every run.
GBRegressor = GradientBoostingRegressor(random_state=seed)
grid_values_grdb = {
    'n_estimators': [1, 10, 100],
    'max_depth': [1, 3, 5],
    'learning_rate': [.001, .01, .1],
}

# Fitting model with parameter search: every combination in the grid is scored
# with `scoring` on the training split only, leaving the validation rows unseen.
GBRegressor_CV = model_selection.GridSearchCV(GBRegressor, param_grid=grid_values_grdb, scoring=scoring)
GBRegressor_CV.fit(X_train, y_train)

In [None]:
# Making predictions for the validation rows (on the scale the model was trained on).
gbr_pred_transformed = GBRegressor_CV.predict(X_validate)

# Fit a Box-Cox lambda on the 'target' column of train.csv and use it to invert the
# transformation on the predictions.
# NOTE(review): presumably this reproduces the lambda used when the training target
# was transformed — confirm against the preprocessing step.
gbr_raw_target = pd.read_csv(os.path.join(data_dir, 'train.csv'), usecols=['target', 'ID'], index_col='ID')['target']
gbr_lambda = stats.boxcox(gbr_raw_target)[1]
GRB_predicted = list(special.inv_boxcox(gbr_pred_transformed, gbr_lambda))

In [None]:
# Reversing Box-Cox transformation on target data: the lambda is fitted on the
# 'target' column of train.csv and then applied to the validation labels.
validate_raw_target = pd.read_csv(os.path.join(data_dir, 'train.csv'), usecols=['target', 'ID'], index_col='ID')['target']
validate_lambda = stats.boxcox(validate_raw_target)[1]
y_validate_inv = special.inv_boxcox(y_validate, validate_lambda)

In [None]:
# Evaluating RMSLE of predictions: square root of the mean squared log error
# between the back-transformed labels and the back-transformed predictions.
gbr_msle = mean_squared_log_error(y_validate_inv, GRB_predicted)
np.sqrt(gbr_msle)

In [None]:
# Plotting predictions versus true values (left) and the sorted predictions (right)
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 5))
ax1, ax2 = axes

# Left panel: one point per validation row, true value on x and prediction on y.
ax1.set_title('Predicted Versus Actual Plot', fontsize=16)
ax1.set_xlabel('y True', fontsize=12)
ax1.set_ylabel('y Predicted', fontsize=12)
ax1.scatter(y_validate_inv, GRB_predicted)

# Right panel: predictions in ascending order plotted against their position.
gbr_sorted = np.sort(GRB_predicted)
gbr_positions = range(len(GRB_predicted))
ax2.scatter(gbr_positions, gbr_sorted)
ax2.set_title('Prediction Distribution', fontsize=16)
ax2.set_xlabel('Index Number', fontsize=12)
ax2.set_ylabel('Predicted Value', fontsize=12)

### Random Forest Regressor

In [None]:
# Instantiating regressor and parameter search.
# The forest is seeded so the grid search gives the same result on every run.
RFRegressor = RandomForestRegressor(random_state=seed)
grid_values_rfr = {
    'n_estimators': [1, 10, 100],
    'max_depth': [1, 3, 5],
    'min_samples_split': [2, 3, 5],
}

# Every combination in the grid is scored with `scoring` on the training split only.
RFRegressor_CV = model_selection.GridSearchCV(RFRegressor, param_grid=grid_values_rfr, scoring=scoring)
RFRegressor_CV.fit(X_train, y_train)

In [None]:
# Making and evaluating predictions
rfr_pred_transformed = RFRegressor_CV.predict(X_validate)

# Fit a Box-Cox lambda on the 'target' column of train.csv and use it to invert the
# transformation on the predictions.
# NOTE(review): presumably this reproduces the lambda used when the training target
# was transformed — confirm against the preprocessing step.
rfr_raw_target = pd.read_csv(os.path.join(data_dir, 'train.csv'), usecols=['target', 'ID'], index_col='ID')['target']
rfr_lambda = stats.boxcox(rfr_raw_target)[1]
RFR_predicted = list(special.inv_boxcox(rfr_pred_transformed, rfr_lambda))

# RMSLE of the back-transformed predictions against the back-transformed labels.
rfr_msle = mean_squared_log_error(y_validate_inv, RFR_predicted)
np.sqrt(rfr_msle)

In [None]:
# Random forest diagnostics: predicted-vs-actual scatter (left) and the sorted
# predictions (right).
fig, rfr_axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 5))
ax1 = rfr_axes[0]
ax2 = rfr_axes[1]

# Left panel: true value on x, prediction on y.
ax1.scatter(y_validate_inv, RFR_predicted)
ax1.set_xlabel('y True', fontsize=12)
ax1.set_ylabel('y Predicted', fontsize=12)
ax1.set_title('Predicted Versus Actual Plot', fontsize=16)

# Right panel: predictions in ascending order plotted against their position.
rfr_sorted = np.sort(RFR_predicted)
ax2.scatter(range(len(RFR_predicted)), rfr_sorted)
ax2.set_xlabel('Index Number', fontsize=12)
ax2.set_title('Prediction Distribution', fontsize=16)
ax2.set_ylabel('Predicted Value', fontsize=12)

As there is no significant difference in performance between the two models, these results indicate a need for further data modeling.

In [None]:
# Re-run the gradient boosting parameter search on the full data set (training and
# validation rows together); this refitted search object is what gets saved below.
GBRegressor_CV.fit(X, y=y)

## Saving Model for Test Predictions

In [None]:
# Date stamp for the file name in month_day_year form (two-digit fields).
# The fields are spelled out explicitly because "%D" is a platform-specific
# strftime directive and is not available on every operating system.
time = datetime.datetime.now().strftime("%m_%d_%y")

#Creating model directory (no error if it already exists)
os.makedirs(model_dir, exist_ok=True)

# Saving serialized model data to directory. The context manager closes the file
# handle even if pickling raises.
filename = 'GBR_model_{}'.format(time)
with open(os.path.join(model_dir, filename), 'wb') as model_file:
    pickle.dump(GBRegressor_CV, model_file)