In [1]:
import os
import numpy as np 
import pandas as pd
import pickle
import datetime
from scipy import stats, special
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.linear_model import Ridge, Lasso, SGDRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR

%matplotlib inline

In [2]:
# Notebook-wide configuration shared by the cells below.
data_dir = 'input_data'             # directory holding the input CSV files
transform = 'pca'                   # key used by get_train_file to choose the training CSV
model_dir = 'models'                # directory where fitted models are saved
seed = 0                            # random seed passed to the scikit-learn utilities
scoring = 'neg_mean_squared_error'  # scorer name used for cross-validation and grid search
# Date stamp (MM-DD-YY) appended to saved model file names. The format is
# spelled out explicitly because the "%D" shortcut is a platform-specific
# extension that is not available on every system.
time = datetime.datetime.now().strftime("%m-%d-%y")

# Create the model directory; exist_ok avoids the check-then-create race and
# makes the cell safe to re-run.
os.makedirs(model_dir, exist_ok=True)

In [3]:
def get_train_file(transform=transform):
    """Load one of the prepared CSV files and split it into train/validation arrays.

    Parameters
    ----------
    transform : str or None
        Key selecting the file read from ``data_dir``: 'sparse', 'pca',
        'imputation' or None. Defaults to the value the notebook-level
        ``transform`` had when this function was defined.

    Returns
    -------
    tuple
        ``(X_train, X_validate, y_train, y_validate)`` as produced by
        ``model_selection.train_test_split`` with ``random_state=seed``.

    Raises
    ------
    KeyError
        If ``transform`` is not one of the known keys.
    """
    # NOTE(review): the None key points at 'test.csv' although this loader
    # feeds model training -- confirm that this mapping is intended.
    csv_by_transform = {
        'sparse': 'train_sparse_pca.csv',
        'pca': 'train_pca.csv',
        'imputation': 'train_imputed.csv',
        None: 'test.csv',
    }
    csv_path = os.path.join(data_dir, csv_by_transform[transform])
    frame = pd.read_csv(csv_path)
    # The first column is used as the target, every remaining column as a feature.
    target = frame.iloc[:, 0].values
    features = frame.iloc[:, 1:].values
    return tuple(model_selection.train_test_split(features, target, random_state=seed))

In [4]:
# Load the CSV selected by the default `transform` and split it into training
# and validation arrays (features and target kept separately).
X_train, X_validate, y_train, y_validate = get_train_file()

## Quick Model Evaluation 

In [None]:
# Baseline candidates, all compared with their default hyperparameters.
# Estimators that rely on randomness receive the shared seed so the comparison
# gives the same numbers after a kernel restart.
models = [
    ('RCV', Ridge()),
    ('SVM', SVR()),
    ('RFR', RandomForestRegressor(random_state=seed)),
    # AdaBoost used to be labelled 'GBM', which made it look like a gradient
    # boosting model in the printed results.
    ('ABR', AdaBoostRegressor(random_state=seed)),
    ('GBR', GradientBoostingRegressor(random_state=seed)),
    ('SGD', SGDRegressor(random_state=seed)),
    ('LSO', Lasso()),
]

In [None]:
# 10-fold cross-validation of every baseline model on the training split.
# The splitter does not depend on the model, so it is built once and shared.
kfold = model_selection.KFold(n_splits=10, random_state=seed, shuffle=True)
results = []
names = []
for name, model in models:
    # The original call referenced X and y, which only exist as locals inside
    # get_train_file and raise NameError on a fresh kernel; the training split
    # returned by get_train_file is used instead.
    cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # Report mean and standard deviation of the fold scores for this model.
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

## Model Hyperparameter Tuning

It looks like the baseline gradient boosted regression and random forest regression performed the best on the training data. I will focus on tuning the hyperparameters of these algorithms further using GridSearchCV. Lastly, I will explore the use of an XGBoost algorithm using SageMaker.

### Gradient Boosting Regressor

In [5]:
# Gradient boosting regressor and the parameter grid to search.
# The shared seed is passed so repeated runs fit the same trees.
GBRegressor = GradientBoostingRegressor(random_state=seed)
grid_values_grdb = {'n_estimators' : [1, 100], 'max_depth' : [1, 5], 'learning_rate' : [.001, .01]}

# Exhaustive search over the grid on the training split, ranked by `scoring`
# (GridSearchCV's default cross-validation is used since no cv is given).
GBRegressor_CV = model_selection.GridSearchCV(GBRegressor, param_grid = grid_values_grdb, scoring=scoring)
GBRegressor_CV.fit(X_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0,
                                                 criterion='friedman_mse',
                                                 init=None, learning_rate=0.1,
                                                 loss='ls', max_depth=3,
                                                 max_features=None,
                                                 max_leaf_nodes=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=100,
                                                 n_iter_no_change=None,
              

In [6]:
# Box-Cox lambda estimated from the raw training target. It is computed once
# and reused for both inverse transforms below, instead of re-reading train.csv
# and re-fitting the transform for each of them.
# NOTE(review): assumes this is the lambda that was used when the modelling
# target was transformed -- confirm against the preprocessing step.
raw_target = pd.read_csv(os.path.join(data_dir, 'train.csv'), usecols=['target', 'ID'], index_col='ID')['target']
boxcox_lambda = stats.boxcox(raw_target)[1]

# Predict on the validation split and map the predictions back to the
# original target scale (kept as a plain list, as before).
GRB_predicted = list(special.inv_boxcox(GBRegressor_CV.predict(X_validate), boxcox_lambda))

# Reversing Box-Cox transformation on target data
y_validate_inv = special.inv_boxcox(y_validate, boxcox_lambda)

# Evaluating RMSLE of predictions:
np.sqrt(mean_squared_log_error(y_validate_inv, GRB_predicted))

1.5891610367138331

In [None]:
# Diagnostic figure for the gradient boosting predictions:
# left panel  -- predicted versus true values,
# right panel -- predictions sorted in ascending order.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 5))

# Draw the data first ...
ax1.scatter(y_validate_inv, GRB_predicted)
ax2.scatter(np.arange(len(GRB_predicted)), np.sort(GRB_predicted))

# ... then annotate both panels.
ax1.set_title('Predicted Versus Actual Plot', fontsize=16)
ax2.set_title('Prediction Distribution', fontsize=16)
ax1.set_xlabel('y True', fontsize=12)
ax2.set_xlabel('Index Number', fontsize=12)
ax1.set_ylabel('y Predicted', fontsize=12)
ax2.set_ylabel('Predicted Value', fontsize=12)

In [None]:
# Saving gradient boosting regression model data to model directory.
# The file is opened in a context manager so the handle is flushed and closed
# even if pickling fails (the bare open() call previously leaked it).
filename = 'GBR-model-{}-{}'.format(transform, time)
with open(os.path.join(model_dir, filename), 'wb') as model_file:
    pickle.dump(GBRegressor_CV, model_file)

### Random Forest Regressor

In [None]:
# Random forest regressor and the parameter grid to search.
# The shared seed is passed so the grid search result is reproducible between runs.
RFRegressor = RandomForestRegressor(random_state=seed)
grid_values_rfr = {'n_estimators' : [1, 10, 100], 'max_depth' : [1, 3, 5], 'min_samples_split' : [2, 3, 5]}

# Exhaustive search over the grid on the training split, ranked by `scoring`.
RFRegressor_CV = model_selection.GridSearchCV(RFRegressor, param_grid = grid_values_rfr, scoring=scoring)
RFRegressor_CV.fit(X_train, y_train)

In [None]:
# Box-Cox lambda estimated from the raw training target. It is computed once
# and reused for both inverse transforms below, instead of re-reading train.csv
# and re-fitting the transform for each of them.
# NOTE(review): assumes this is the lambda that was used when the modelling
# target was transformed -- confirm against the preprocessing step.
raw_target = pd.read_csv(os.path.join(data_dir, 'train.csv'), usecols=['target', 'ID'], index_col='ID')['target']
boxcox_lambda = stats.boxcox(raw_target)[1]

# Predict on the validation split and map the predictions back to the
# original target scale (kept as a plain list, as before).
RFR_predicted = list(special.inv_boxcox(RFRegressor_CV.predict(X_validate), boxcox_lambda))

# Undo the Box-Cox transformation on the validation target as well.
y_validate_inv = special.inv_boxcox(y_validate, boxcox_lambda)

# RMSLE of the random forest predictions on the original scale.
np.sqrt(mean_squared_log_error(y_validate_inv, RFR_predicted))

In [None]:
# Saving random forest model data to model directory.
# The file is opened in a context manager so the handle is flushed and closed
# even if pickling fails (the bare open() call previously leaked it).
filename = 'RFR-model-{}-{}'.format(transform, time)
with open(os.path.join(model_dir, filename), 'wb') as model_file:
    pickle.dump(RFRegressor_CV, model_file)

In [None]:
# Diagnostic figure for the random forest predictions:
# left panel  -- predicted versus true values,
# right panel -- predictions sorted in ascending order.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 5))

# Draw the data first ...
ax1.scatter(y_validate_inv, RFR_predicted)
ax2.scatter(np.arange(len(RFR_predicted)), np.sort(RFR_predicted))

# ... then annotate both panels.
ax1.set_title('Predicted Versus Actual Plot', fontsize=16)
ax2.set_title('Prediction Distribution', fontsize=16)
ax1.set_xlabel('y True', fontsize=12)
ax2.set_xlabel('Index Number', fontsize=12)
ax1.set_ylabel('y Predicted', fontsize=12)
ax2.set_ylabel('Predicted Value', fontsize=12)

As there is no significant difference in performance between the two models, these results indicate a need for further data modeling.

### XGBoost 

In [9]:
import sagemaker
from sagemaker import get_execution_role

# SageMaker session, its default S3 bucket and the execution role used for training.
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()
role = get_execution_role()

# S3 key prefixes for this project.
model_prefix = 'santander_project/XGBoost'
data_prefix = 'santander_project/data'

In [10]:
# Write the validation and training splits as header-less, index-less CSV
# files with the target in the first column followed by the feature columns.
for split_csv, split_target, split_features in (
    ('model_validation.csv', y_validate, X_validate),
    ('model_train.csv', y_train, X_train),
):
    split_frame = pd.concat([pd.DataFrame(split_target), pd.DataFrame(split_features)], axis=1)
    split_frame.to_csv(os.path.join(data_dir, split_csv), header=False, index=False)

In [11]:
# Upload both CSV files to the session bucket and keep the returned locations.
# NOTE(review): the files are uploaded under model_prefix while data_prefix is
# defined but not used in the visible notebook -- confirm which prefix is intended.
validation_csv_path = os.path.join(data_dir, 'model_validation.csv')
train_csv_path = os.path.join(data_dir, 'model_train.csv')
val_location = sagemaker_session.upload_data(validation_csv_path, bucket=bucket, key_prefix=model_prefix)
train_location = sagemaker_session.upload_data(train_csv_path, bucket=bucket, key_prefix=model_prefix)

In [12]:
# Describe both uploaded files as CSV inputs for the training job.
s3_input_train, s3_input_validation = (
    sagemaker.s3_input(s3_data=location, content_type='csv')
    for location in (train_location, val_location)
)

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


In [13]:
from sagemaker.amazon.amazon_estimator import get_image_uri

# Look up the XGBoost container image for the session's region.
# NOTE(review): no image version is passed here; the SDK message printed
# below this cell mentions an explicit version ('1.0-1') -- confirm which
# version is wanted.
container = get_image_uri(sagemaker_session.boto_region_name, 'xgboost')

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
	get_image_uri(region, 'xgboost', '1.0-1').


In [14]:
# Base estimator for the XGBoost container: a single ml.m4.xlarge training
# instance, with output written under the model prefix of the session bucket.
xgb_output_path = 's3://{}/{}/output'.format(bucket, model_prefix)
xgb_base = sagemaker.estimator.Estimator(
    container,
    role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    output_path=xgb_output_path,
    sagemaker_session=sagemaker_session,
)



In [15]:
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner

# Search space explored by the tuner.
xgb_hyperparameter_ranges = {
    'max_depth': IntegerParameter(3, 12),
    'eta': ContinuousParameter(0.05, 0.5),
    'min_child_weight': IntegerParameter(2, 8),
    'subsample': ContinuousParameter(0.5, 0.9),
    'gamma': ContinuousParameter(0, 10),
    'num_round': IntegerParameter(1, 500),
}

# Tuning job on top of the base estimator: at most 15 training jobs, 3 running
# in parallel, minimising the 'validation:rmse' metric.
xgb_hyperparameter_tuner = HyperparameterTuner(
    estimator=xgb_base,
    objective_metric_name='validation:rmse',
    objective_type='Minimize',
    max_jobs=15,
    max_parallel_jobs=3,
    hyperparameter_ranges=xgb_hyperparameter_ranges,
)

In [17]:
# Start the tuning job with the uploaded CSVs as the 'train' and 'validation'
# channels, then wait for it to finish before moving on.
xgb_hyperparameter_tuner.fit({'train': s3_input_train, 'validation': s3_input_validation})
xgb_hyperparameter_tuner.wait()

............................................................................................................................................................................................................!


In [18]:
# Attach an estimator object to the best training job reported by the tuner.
xgb_estimator = sagemaker.estimator.Estimator.attach(xgb_hyperparameter_tuner.best_training_job())



2020-07-17 00:43:41 Starting - Preparing the instances for training
2020-07-17 00:43:41 Downloading - Downloading input data
2020-07-17 00:43:41 Training - Training image download completed. Training in progress.
2020-07-17 00:43:41 Uploading - Uploading generated training model
2020-07-17 00:43:41 Completed - Training job completed[34mArguments: train[0m
[34m[2020-07-17:00:43:19:INFO] Running standalone xgboost training.[0m
[34m[2020-07-17:00:43:19:INFO] Setting up HPO optimized metric to be : rmse[0m
[34m[2020-07-17:00:43:19:INFO] File size need to be processed in the node: 11.43mb. Available memory size in the node: 8516.35mb[0m
[34m[2020-07-17:00:43:19:INFO] Determined delimiter of CSV input is ','[0m
[34m[00:43:19] S3DistributionType set as FullyReplicated[0m
[34m[00:43:19] 3343x139 matrix with 464677 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2020-07-17:00:43:19:INFO] Determined delimiter of CSV input is ','[0m
[34m

In [19]:
# Name of the best training job found by the tuner; used below to build the
# S3 path of its model artifact.
best_job_name = xgb_hyperparameter_tuner.best_training_job()

In [20]:
# S3 URI of the best job's model archive: the estimator's output path
# (bucket/model_prefix/output) followed by <job name>/output/model.tar.gz.
model_data = "s3://{}/{}/output/{}/output/model.tar.gz".format(bucket, model_prefix, best_job_name)

In [21]:
# Saving XGBoost model data to model directory.
# IPython shell escape: $model_data and $model_dir are expanded from the
# notebook variables before the AWS CLI copies the archive locally.
!aws s3 cp $model_data $model_dir

download: s3://sagemaker-us-east-2-278383315865/santander_project/XGBoost/output/xgboost-200717-0037-006-cd6291b3/output/model.tar.gz to models/model.tar.gz


In [22]:
# Rename the downloaded archive so it follows the same
# '<model>-model-<transform>-<date>' naming used for the other saved models.
downloaded_archive = os.path.join(model_dir, 'model.tar.gz')
renamed_archive = os.path.join(model_dir, 'XGB-model-{}-{}'.format(transform, time))
os.rename(downloaded_archive, renamed_archive)