In [1]:
import os
import numpy as np 
import pandas as pd
import pickle
import datetime
from scipy import stats, special
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.linear_model import Ridge, Lasso, SGDRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR

%matplotlib inline

In [2]:
# Notebook-wide configuration: everything a reader might want to tune.
data_dir = 'input_data'             # directory containing the input CSV files
transform = 'pca'                   # key into get_train_file's file table: 'sparse', 'pca', 'reduction' or None
model_dir = 'models'                # directory where fitted models are written
seed = 0                            # random seed shared by the train/validation split and CV folds
scoring = 'neg_mean_squared_error'  # scikit-learn scoring string used for model selection

# Date stamp (mm-dd-yy) appended to saved model file names.
# Spelled out with explicit directives instead of "%D" + replace('/', '-'):
# "%D" is not among the portable strftime directives and is rejected on some
# platforms (e.g. Windows), while the resulting string is the same.
time = datetime.datetime.now().strftime("%m-%d-%y")

# exist_ok=True makes the cell idempotent without a separate (racy) existence check.
os.makedirs(model_dir, exist_ok=True)

## Loading Data

In [3]:
def get_train_file(transform=transform):
    """Load a training CSV from ``data_dir`` and split it into train/validation arrays.

    Parameters
    ----------
    transform : {'sparse', 'pca', 'reduction', None}
        Selects which CSV file is read. The default is the value of the
        module-level ``transform`` at the time this function was defined.

    Returns
    -------
    tuple
        ``(X_train, X_validate, y_train, y_validate)`` as produced by
        ``train_test_split`` with ``random_state=seed``. The first CSV column
        is used as the target; all remaining columns are the features.

    Raises
    ------
    KeyError
        If ``transform`` is not one of the supported keys.
    """
    csv_by_transform = {
        'sparse': 'train_sparse_pca.csv',
        'pca': 'train_pca.csv',
        'reduction': 'train_reduced.csv',
        None: 'train.csv',
    }
    csv_path = os.path.join(data_dir, csv_by_transform[transform])
    frame = pd.read_csv(csv_path)

    # Column 0 holds the target, every other column is a feature.
    target = frame.iloc[:, 0].values
    features = frame.iloc[:, 1:].values

    feat_train, feat_validate, target_train, target_validate = model_selection.train_test_split(
        features, target, random_state=seed
    )
    return feat_train, feat_validate, target_train, target_validate

In [4]:
# Load the training file for the default transform and split it into
# train / validation feature arrays (X_*) and target arrays (y_*).
X_train, X_validate, y_train, y_validate = get_train_file()

## Initial Model Evaluation 

In [None]:
# Candidate regressors as (label, estimator) pairs, all with library-default
# hyperparameters. The ensemble and SGD estimators are stochastic, so they are
# seeded with the notebook-wide `seed` to make the comparison reproducible.
models = [
    ('RCV', Ridge()),  # NOTE(review): plain Ridge, not RidgeCV, despite the label
    ('SVM', SVR()),
    ('RFR', RandomForestRegressor(random_state=seed)),
    ('ADA', AdaBoostRegressor(random_state=seed)),  # was labeled 'GBM', which reads as gradient boosting
    ('GBR', GradientBoostingRegressor(random_state=seed)),
    ('SGD', SGDRegressor(random_state=seed)),
    ('LSO', Lasso()),
]

In [None]:
# Score every candidate with 10-fold cross-validation on the training split and
# print "<label>: <mean score> (<std of scores>)" for each one.
results, names = [], []

# One shuffled, seeded splitter is enough: it yields the same folds on every use.
kfold = model_selection.KFold(n_splits=10, random_state=seed, shuffle=True)

for label, estimator in models:
    fold_scores = model_selection.cross_val_score(
        estimator, X_train, y_train, cv=kfold, scoring=scoring
    )
    names.append(label)
    results.append(fold_scores)
    print("%s: %f (%f)" % (label, fold_scores.mean(), fold_scores.std()))

Random forest regression appears to perform the best (scores are negative mean squared errors, so values closer to zero are better).

## Selected Model Hyperparameter Tuning

It looks like baseline gradient boosted regression and random forest regression performed the best on the training data. I will focus on tuning the hyperparameters of these algorithms further using GridSearchCV. Lastly, I will explore the use of an XGBoost algorithm using SageMaker.

### Random Forest Regressor

In [None]:
# Instantiating regressor and parameter search.
# The forest is seeded so that the grid search picks the same winner on every run.
RFRegressor = RandomForestRegressor(random_state=seed)
grid_values_rfr = {
    'n_estimators': [1, 10, 50, 100],
    'max_depth': [1, 3, 5, 10],
    'min_samples_split': [2, 3, 5],
}
# Exhaustive search over the grid, ranked by the notebook-wide `scoring` metric.
RFRegressor_CV = model_selection.GridSearchCV(RFRegressor, param_grid=grid_values_rfr, scoring=scoring)
RFRegressor_CV.fit(X_train, y_train)

In [None]:
# Making and evaluating predictions.
# Predictions and validation targets are mapped back through the inverse Box-Cox
# transform before computing the root mean squared log error.
# NOTE(review): this assumes the modelling target was produced by applying
# Box-Cox to the raw 'target' column of train.csv -- confirm against the
# preprocessing step.

# Estimate the Box-Cox lambda once; it is the same for predictions and targets,
# so there is no need to re-read the CSV and re-fit the transform twice.
raw_target = pd.read_csv(os.path.join(data_dir, 'train.csv'), usecols=['target', 'ID'], index_col='ID')['target']
boxcox_lambda = stats.boxcox(raw_target)[1]

# inv_boxcox is element-wise, so the 1-D prediction array can be passed directly.
RFR_predicted = list(special.inv_boxcox(RFRegressor_CV.predict(X_validate), boxcox_lambda))
y_validate_inv = special.inv_boxcox(y_validate, boxcox_lambda)

# RMSLE on the back-transformed scale (displayed as the cell output).
np.sqrt(mean_squared_log_error(y_validate_inv, RFR_predicted))

In [None]:
# Plotting results: predicted-vs-actual scatter (left) and the sorted
# predictions plotted against their rank (right).
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20, 5))

ax1.scatter(y_validate_inv, RFR_predicted)
ax2.scatter(range(len(RFR_predicted)), np.sort(RFR_predicted))

# (axis, title, x label, y label) for each panel.
panel_text = (
    (ax1, 'Predicted Versus Actual Plot', 'y True', 'y Predicted'),
    (ax2, 'Prediction Distribution', 'Index Number', 'Predicted Value'),
)
for axis, panel_title, x_label, y_label in panel_text:
    axis.set_title(panel_title, fontsize=16)
    axis.set_xlabel(x_label, fontsize=12)
    axis.set_ylabel(y_label, fontsize=12)

fig.suptitle('Random Forest Model Results with {} Transform'.format(transform), fontsize=20, y=1.02)

In [None]:
# Saving random forest model data to model directory.
# The file name encodes the transform and the run date, e.g. 'RFR-model-pca-<date>'.
filename = 'RFR-model-{}-{}'.format(transform, time)
# The context manager guarantees the file is flushed and closed even if
# pickling fails; the bare open() call previously leaked the handle.
with open(os.path.join(model_dir, filename), 'wb') as model_file:
    pickle.dump(RFRegressor_CV, model_file)

Next, I will construct an XGBoost regressor using the SageMaker API.

### XGBoost 

In [5]:
import sagemaker
from sagemaker import get_execution_role

# S3 key prefixes used by this project inside the session's default bucket.
model_prefix = 'santander_project/XGBoost'
data_prefix = 'santander_project/data'

# SageMaker session, its default S3 bucket, and the execution role that
# training jobs will run under.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
role = get_execution_role()

In [8]:
# Write each split as a header-less, index-less CSV with the target in the
# first column and the features after it, then upload both files to S3.
validation_csv = os.path.join(data_dir, 'model_validation.csv')
train_csv = os.path.join(data_dir, 'model_train.csv')

for split_targets, split_features, csv_path in (
    (y_validate, X_validate, validation_csv),
    (y_train, X_train, train_csv),
):
    pd.concat([pd.DataFrame(split_targets), pd.DataFrame(split_features)], axis=1).to_csv(
        csv_path, header=False, index=False
    )

# Data files go under `data_prefix`, which was defined for this purpose but was
# never used; they were previously uploaded next to the model output under
# `model_prefix`.
val_location = sagemaker_session.upload_data(validation_csv, bucket=bucket, key_prefix=data_prefix)
train_location = sagemaker_session.upload_data(train_csv, bucket=bucket, key_prefix=data_prefix)

# Training-channel descriptors pointing at the uploaded CSVs.
# NOTE: the SDK warns that `s3_input` is renamed to `TrainingInput` in SageMaker
# Python SDK v2; it is kept here because this notebook runs against SDK v1.
s3_input_train = sagemaker.s3_input(s3_data=train_location, content_type='csv')
s3_input_validation = sagemaker.s3_input(s3_data=val_location, content_type='csv')

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


In [9]:
from sagemaker.amazon.amazon_estimator import get_image_uri

# Look up the built-in XGBoost training image for the session's AWS region.
aws_region = sagemaker_session.boto_region_name
container = get_image_uri(aws_region, 'xgboost')

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
There is a more up to date SageMaker XGBoost image. To use the newer image, please set 'repo_version'='1.0-1'. For example:
	get_image_uri(region, 'xgboost', '1.0-1').


In [10]:
# Base XGBoost estimator: a single ml.m4.xlarge training instance whose model
# artifacts are written under <bucket>/<model_prefix>/output.
xgb_output_path = 's3://{}/{}/output'.format(bucket, model_prefix)

xgb_base = sagemaker.estimator.Estimator(
    container,
    role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    output_path=xgb_output_path,
    sagemaker_session=sagemaker_session,
)

Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.


In [11]:
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner

# Search space explored by the tuner, one entry per XGBoost hyperparameter.
xgb_hyperparameter_ranges = {
    'max_depth': IntegerParameter(3, 12),
    'eta': ContinuousParameter(0.05, 0.5),
    'min_child_weight': IntegerParameter(2, 8),
    'subsample': ContinuousParameter(0.5, 0.9),
    'gamma': ContinuousParameter(0, 10),
    'num_round': IntegerParameter(1, 500),
}

# Tuner that minimizes validation RMSE over at most 15 training jobs,
# running up to 3 of them in parallel.
xgb_hyperparameter_tuner = HyperparameterTuner(
    estimator=xgb_base,
    objective_metric_name='validation:rmse',
    objective_type='Minimize',
    max_jobs=15,
    max_parallel_jobs=3,
    hyperparameter_ranges=xgb_hyperparameter_ranges,
)

In [12]:
# Start the hyperparameter tuning job on the 'train' and 'validation' channels,
# then wait for it to finish before moving on to the cells that read its results.
xgb_hyperparameter_tuner.fit({'train': s3_input_train, 'validation': s3_input_validation})
xgb_hyperparameter_tuner.wait()

..................................................................................................................................................................................................................!


In [13]:
# Build an Estimator attached to the best training job reported by the tuner.
xgb_estimator = sagemaker.estimator.Estimator.attach(xgb_hyperparameter_tuner.best_training_job())

Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.


2020-07-27 22:53:30 Starting - Preparing the instances for training
2020-07-27 22:53:30 Downloading - Downloading input data
2020-07-27 22:53:30 Training - Training image download completed. Training in progress.
2020-07-27 22:53:30 Uploading - Uploading generated training model
2020-07-27 22:53:30 Completed - Training job completed[34mArguments: train[0m
[34m[2020-07-27:22:53:18:INFO] Running standalone xgboost training.[0m
[34m[2020-07-27:22:53:18:INFO] Setting up HPO optimized metric to be : rmse[0m
[34m[2020-07-27:22:53:18:INFO] File size need to be processed in the node: 0.91mb. Available memory size in the node: 8491.84mb[0m
[34m[2020-07-27:22:53:18:INFO] Determined delimiter of CSV input is ','[0m
[34m[22:53:18] S3DistributionType set as FullyReplicated[0m
[34m[22:53:18] 3343x10 matrix with 33430 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2020-07-27:22:53:18:INFO] Determined delimiter of CSV input is ','[0m
[34m[22

In [14]:
# Name of the best training job found by the tuner; used below to build the
# S3 path of its model artifact.
best_job_name = xgb_hyperparameter_tuner.best_training_job()

In [15]:
# S3 URI of the best job's model artifact:
# <estimator output path>/<job name>/output/model.tar.gz
model_data = "s3://{}/{}/output/{}/output/model.tar.gz".format(bucket, model_prefix, best_job_name)

In [16]:
# Saving XGBoost model data to model directory.
# IPython expands $model_data and $model_dir from the Python variables of the
# same name before running the shell command; the artifact is downloaded as
# <model_dir>/model.tar.gz.
!aws s3 cp $model_data $model_dir

Completed 25.5 KiB/25.5 KiB (391.8 KiB/s) with 1 file(s) remainingdownload: s3://sagemaker-us-east-2-278383315865/santander_project/XGBoost/output/xgboost-200727-2246-004-85c4a7e8/output/model.tar.gz to models/model.tar.gz


In [17]:
# Give the downloaded artifact a descriptive name that encodes the transform
# and the run date, mirroring the naming of the saved random forest model.
downloaded_path = os.path.join(model_dir, 'model.tar.gz')
renamed_path = os.path.join(model_dir, 'XGB-model-{}-{}'.format(transform, time))
os.rename(downloaded_path, renamed_path)