In [1]:
# This notebook takes output from Train_Common_FeatureSelectionWithLasso
# Purpose: Use gradient boosting regression algorithm to train 

# https://www.analyticsvidhya.com/blog/2016/02/complete-guide-parameter-tuning-gradient-boosting-gbm-python/
#Import libraries:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor  #GBM algorithm
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   #Perforing grid search



In [2]:
# Import training data set
Training_dataset = pd.read_csv('Training_95_Features.csv')

In [3]:
# Inspect data set
Training_dataset.shape

(4209, 95)

In [4]:
Training_dataset.head()

Unnamed: 0,X181,X119,X47,X156,X151,X324,X321,X12,X84,X218,...,X5_w,X5_r,X5_s,X5_l,X0_aj,X0_d,X0_h,X0_o,X0_t,X8_x
0,0,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Import target data 
Target = pd.read_csv('Target_Variable.csv')

In [6]:
# Convert data set into array
Training_array = Training_dataset.values
# get the features
Predictors = Training_array[:]
# Convert target into array
Target_array = Target.values
# Get the Target 
Targets = Target_array[:,0]

In [7]:
Predictors

array([[0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       ..., 
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]])

In [8]:
Targets

array([ 130.81,   88.53,   76.26, ...,  109.22,   87.48,  110.85])

In [18]:
def modelfit(alg, target, predictors, performCV=True, printFeatureImportance=True, cv_folds=cv):
    
    #Fit the algorithm on the training data
    alg.fit(predictors,target)
        
    #Use the model fitted to predict training set:
    dtrain_predictions = alg.predict(predictors)
    
    # Evaluate the predictions made by this fit
    # The quality control I chose for my fit include mean squared error and R_squared
    MSE=metrics.mean_squared_error(target,dtrain_predictions)
    R_squared = metrics.r2_score(target,dtrain_predictions)
    
    
    # The above code is without any cross validation where all the data is used to train the model.
    # Well we know that leaves us open to overfitting. So to mitigate that, lets do some cross validation
    # Also we will use this CV to tune some of our parameters 
    
    #Perform cross-validation:
    if performCV:
        cv_scoreMSE = cross_validation.cross_val_score(alg, predictors,target, cv=cv_folds, scoring='neg_mean_squared_error')
        cv_scoreR_squared = cross_validation.cross_val_score(alg, predictors,target, cv=cv_folds, scoring='r2')
    
    #Print model report:
    print "\nModel Report"
    print "MSE : %.4g" % MSE
    print "R_squared : %.6g" % R_squared
    
    
    if performCV:
        print "CV Score : Mean - %.7g | Std - %.7g | Min - %.7g | Max - %.7g" % (np.mean(cv_scoreMSE),np.std(cv_scoreMSE),np.min(cv_scoreMSE),np.max(cv_scoreMSE))
        print "CV Score : Mean - %.7g | Std - %.7g | Min - %.7g | Max - %.7g" % (np.mean(cv_scoreR_squared),np.std(cv_scoreR_squared),np.min(cv_scoreR_squared),np.max(cv_scoreR_squared))

In [19]:
# Now we have this function that will allow us to fit a gradient boosting regression on a data set
# get a model and then use the model to make predictions, evaluate the quality of these predictions
# by way of mean squared error means and R squared.

# First thing, lets fit a model with mostly default parameter settings and set a benchmark for 
# our model generation

In [20]:
# Now time to fit the first model called gbm0
cv = 5
gbm0 = GradientBoostingRegressor(random_state=10)
# Now fit it on the training data set with default settings and a CV of 5
modelfit(gbm0,Targets, Predictors)


Model Report
MSE : 59.66
R_squared : 0.628842
CV Score : Mean - -71.43274 | Std - 33.75996 | Min - -162.8597 | Max - -42.34306
CV Score : Mean - 0.5710286 | Std - 0.1050077 | Min - 0.309568 | Max - 0.6801093


In [32]:
# That was cv of 5, lets do cv of 10
cv = 10
modelfit(gbm0,Targets, Predictors)


Model Report
MSE : 59.66
R_squared : 0.628842
CV Score : Mean - -71.43274 | Std - 33.75996 | Min - -162.8597 | Max - -42.34306
CV Score : Mean - 0.5710286 | Std - 0.1050077 | Min - 0.309568 | Max - 0.6801093


In [33]:
# Base model all default settings gave an R squared of 0.62 with the poorest at 0.3

# I wonder how this will do in the kaggle board. I have already embarrassed myself with a R score of 0.1
# So I figured with that, having shame should no longer be a concern of mine hahaha
# So just predict and post the predictions of the base model

In [35]:
# Saving this as base model to be used to fit on the Test model
BaseModel = gbm0.fit(Predictors, Targets)

In [37]:
# Load 
Test_95_Features = pd.read_csv('Test_95_Features.csv')
Test_array =Test_95_Features.values
Prediction = BaseModel.predict(Test_array)
Prediction

array([  78.33632892,   94.40492763,   78.93267331, ...,   92.41118851,
        109.87762172,   92.39030036])

In [38]:
# Based on this model, these are my predictions.

# Save that in an csv file as Predictions_95FeaturesBaseModel
np.savetxt('Predictions_95FeaturesBaseModel.csv',Prediction, delimiter=",")

In [39]:
# Whoaaaa...my score went from an embarrasingly 0.1 to 0.5378. Note I didnt do anything ground breaking here
# I just made sure that the features I was using for the train was the same as test. I definitely didnt do that last time
# which lead to just garbage model as I was basically using a model from a train data on a test data that was different

# The 0.5378 still left me at number 2535 position. So again, I just did the default thingy right
# But still I tweeted this LOL...can you believe this? Shameless plug at its finest! Hahahahah

# But now the question is...can we improve this model?

In [40]:
# One of the major parameters to tune is the number of trees that are used
# There is no default optimum trees, so we have to tune the number of trees using the parameter n_estimators

# Lets test the number of trees from a range of 20 to 80 in steps of 10
param_test1 ={'n_estimators':range(20,81,10)}

# Lets set some parameters
# We choose a learning rate of 0.1
gsearch1 = GridSearchCV(estimator = GradientBoostingRegressor(learning_rate=0.1, min_samples_split=500,min_samples_leaf=50,max_depth=8,max_features='sqrt',subsample=0.8,random_state=10),param_grid = param_test1, scoring='r2',n_jobs=4,iid=False, cv=5) 

In [41]:
gsearch1.fit(Predictors,Targets)

GridSearchCV(cv=5, error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=8,
             max_features='sqrt', max_leaf_nodes=None,
             min_impurity_split=1e-07, min_samples_leaf=50,
             min_samples_split=500, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=10,
             subsample=0.8, verbose=0, warm_start=False),
       fit_params={}, iid=False, n_jobs=4,
       param_grid={'n_estimators': [20, 30, 40, 50, 60, 70, 80]},
       pre_dispatch='2*n_jobs', refit=True, scoring='r2', verbose=0)

In [42]:
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

([mean: 0.54590, std: 0.06708, params: {'n_estimators': 20},
  mean: 0.56224, std: 0.07017, params: {'n_estimators': 30},
  mean: 0.56638, std: 0.07131, params: {'n_estimators': 40},
  mean: 0.56741, std: 0.07201, params: {'n_estimators': 50},
  mean: 0.56799, std: 0.07261, params: {'n_estimators': 60},
  mean: 0.56774, std: 0.07243, params: {'n_estimators': 70},
  mean: 0.56731, std: 0.07231, params: {'n_estimators': 80}],
 {'n_estimators': 60},
 0.5679893790171912)

In [43]:
# So as we can see here, the optimal number of trees built here is 60
# So we have the number of trees as 60. Now lets tune the tree parameters and the parameters we want to tune
# are :
# 1) Tune max_depth and num_samples_split
# 2) Tune min_samples_leaf
# 3) Tune max_features

# max_depth and min_samples_split have a significant impact and we’re tuning those first.

In [48]:
# To start with, I’ll test max_depth values of 5 to 15 in steps of 2 and min_samples_split 
# from 200 to 1000 in steps of 200.

param_test2 = {'max_depth':range(5,16,2), 'min_samples_split':range(200,1001,200)}

gsearch2 = GridSearchCV(estimator = GradientBoostingRegressor(learning_rate=0.1, n_estimators=60, max_features='sqrt', subsample=0.8, random_state=10), 
param_grid = param_test2, scoring='r2',n_jobs=4,iid=False, cv=5)

In [49]:
gsearch2.fit(Predictors,Targets)

GridSearchCV(cv=5, error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3,
             max_features='sqrt', max_leaf_nodes=None,
             min_impurity_split=1e-07, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=60, presort='auto', random_state=10,
             subsample=0.8, verbose=0, warm_start=False),
       fit_params={}, iid=False, n_jobs=4,
       param_grid={'min_samples_split': [200, 400, 600, 800, 1000], 'max_depth': [5, 7, 9, 11, 13, 15]},
       pre_dispatch='2*n_jobs', refit=True, scoring='r2', verbose=0)

In [50]:
gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_

([mean: 0.57183, std: 0.07347, params: {'min_samples_split': 200, 'max_depth': 5},
  mean: 0.57226, std: 0.07402, params: {'min_samples_split': 400, 'max_depth': 5},
  mean: 0.57332, std: 0.07370, params: {'min_samples_split': 600, 'max_depth': 5},
  mean: 0.57262, std: 0.07280, params: {'min_samples_split': 800, 'max_depth': 5},
  mean: 0.57413, std: 0.07150, params: {'min_samples_split': 1000, 'max_depth': 5},
  mean: 0.57178, std: 0.07738, params: {'min_samples_split': 200, 'max_depth': 7},
  mean: 0.57220, std: 0.07552, params: {'min_samples_split': 400, 'max_depth': 7},
  mean: 0.57486, std: 0.07530, params: {'min_samples_split': 600, 'max_depth': 7},
  mean: 0.57514, std: 0.07441, params: {'min_samples_split': 800, 'max_depth': 7},
  mean: 0.57378, std: 0.07260, params: {'min_samples_split': 1000, 'max_depth': 7},
  mean: 0.56630, std: 0.07398, params: {'min_samples_split': 200, 'max_depth': 9},
  mean: 0.57312, std: 0.07516, params: {'min_samples_split': 400, 'max_depth': 9},
  

In [60]:
# The ideal max depth is 7 and 800 min_samples_split. Lets keep the max depth as 7
# But lets tinker around with the min_samples_split and also tune min_samples_leaf

# Here samples_split ranging from 800 to 2000 in steps of 200
# While min_samples_leaf is between 30 to 71 in steps of 10
param_test3 = {'min_samples_split':range(800,2000,200), 'min_samples_leaf':range(30,71,10)}

gsearch3 = GridSearchCV(estimator = GradientBoostingRegressor(learning_rate=0.1, n_estimators=60,max_depth=7, max_features='sqrt', subsample=0.8, random_state=10), 
param_grid = param_test3, scoring='r2',n_jobs=4,iid=False, cv=5)


In [61]:
gsearch3.fit(Predictors,Targets)

GridSearchCV(cv=5, error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=7,
             max_features='sqrt', max_leaf_nodes=None,
             min_impurity_split=1e-07, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=60, presort='auto', random_state=10,
             subsample=0.8, verbose=0, warm_start=False),
       fit_params={}, iid=False, n_jobs=4,
       param_grid={'min_samples_split': [800, 1000, 1200, 1400, 1600, 1800], 'min_samples_leaf': [30, 40, 50, 60, 70]},
       pre_dispatch='2*n_jobs', refit=True, scoring='r2', verbose=0)

In [62]:
gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_

([mean: 0.57280, std: 0.07389, params: {'min_samples_split': 800, 'min_samples_leaf': 30},
  mean: 0.57317, std: 0.07245, params: {'min_samples_split': 1000, 'min_samples_leaf': 30},
  mean: 0.57169, std: 0.07153, params: {'min_samples_split': 1200, 'min_samples_leaf': 30},
  mean: 0.57043, std: 0.07158, params: {'min_samples_split': 1400, 'min_samples_leaf': 30},
  mean: 0.56788, std: 0.07265, params: {'min_samples_split': 1600, 'min_samples_leaf': 30},
  mean: 0.56283, std: 0.07103, params: {'min_samples_split': 1800, 'min_samples_leaf': 30},
  mean: 0.56825, std: 0.07363, params: {'min_samples_split': 800, 'min_samples_leaf': 40},
  mean: 0.56830, std: 0.07247, params: {'min_samples_split': 1000, 'min_samples_leaf': 40},
  mean: 0.56804, std: 0.07183, params: {'min_samples_split': 1200, 'min_samples_leaf': 40},
  mean: 0.56731, std: 0.07195, params: {'min_samples_split': 1400, 'min_samples_leaf': 40},
  mean: 0.56421, std: 0.07209, params: {'min_samples_split': 1600, 'min_samples_le

In [66]:
# Hmmmm..after min_samples_leaf 30, the R2 score fell
# So lets set between 1 to 30 min_samples_leaf
param_test4 = {'min_samples_split':range(800,1200,200), 'min_samples_leaf':range(1,30,10)}

gsearch4 = GridSearchCV(estimator = GradientBoostingRegressor(learning_rate=0.1, n_estimators=60,max_depth=7, max_features='sqrt', subsample=0.8, random_state=10), 
param_grid = param_test4, scoring='r2',n_jobs=4,iid=False, cv=5)

In [67]:
gsearch4.fit(Predictors,Targets)

GridSearchCV(cv=5, error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=7,
             max_features='sqrt', max_leaf_nodes=None,
             min_impurity_split=1e-07, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=60, presort='auto', random_state=10,
             subsample=0.8, verbose=0, warm_start=False),
       fit_params={}, iid=False, n_jobs=4,
       param_grid={'min_samples_split': [800, 1000], 'min_samples_leaf': [1, 11, 21]},
       pre_dispatch='2*n_jobs', refit=True, scoring='r2', verbose=0)

In [68]:
gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_

([mean: 0.57514, std: 0.07441, params: {'min_samples_split': 800, 'min_samples_leaf': 1},
  mean: 0.57378, std: 0.07260, params: {'min_samples_split': 1000, 'min_samples_leaf': 1},
  mean: 0.57551, std: 0.07405, params: {'min_samples_split': 800, 'min_samples_leaf': 11},
  mean: 0.57408, std: 0.07275, params: {'min_samples_split': 1000, 'min_samples_leaf': 11},
  mean: 0.57394, std: 0.07299, params: {'min_samples_split': 800, 'min_samples_leaf': 21},
  mean: 0.57332, std: 0.07258, params: {'min_samples_split': 1000, 'min_samples_leaf': 21}],
 {'min_samples_leaf': 11, 'min_samples_split': 800},
 0.5755101095563181)

In [69]:
# So the optimum samples leaf is 11 or at least falls between 11 and 21
# Lets test that range and see if we get any extra benefit. 11 and 21 with steps 1 every time
param_test5 = {'min_samples_leaf':range(11,21,1)}

gsearch5 = GridSearchCV(estimator = GradientBoostingRegressor(learning_rate=0.1, n_estimators=60,min_samples_split=800, max_features='sqrt',max_depth=7, subsample=0.8, random_state=10), 
param_grid = param_test5,scoring='r2',n_jobs=4,iid=False, cv=5)

In [70]:
gsearch5.fit(Predictors,Targets)

GridSearchCV(cv=5, error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=7,
             max_features='sqrt', max_leaf_nodes=None,
             min_impurity_split=1e-07, min_samples_leaf=1,
             min_samples_split=800, min_weight_fraction_leaf=0.0,
             n_estimators=60, presort='auto', random_state=10,
             subsample=0.8, verbose=0, warm_start=False),
       fit_params={}, iid=False, n_jobs=4,
       param_grid={'min_samples_leaf': [11, 12, 13, 14, 15, 16, 17, 18, 19, 20]},
       pre_dispatch='2*n_jobs', refit=True, scoring='r2', verbose=0)

In [71]:
gsearch5.grid_scores_, gsearch5.best_params_, gsearch5.best_score_

([mean: 0.57551, std: 0.07405, params: {'min_samples_leaf': 11},
  mean: 0.57522, std: 0.07400, params: {'min_samples_leaf': 12},
  mean: 0.57485, std: 0.07374, params: {'min_samples_leaf': 13},
  mean: 0.57433, std: 0.07362, params: {'min_samples_leaf': 14},
  mean: 0.57494, std: 0.07291, params: {'min_samples_leaf': 15},
  mean: 0.57450, std: 0.07294, params: {'min_samples_leaf': 16},
  mean: 0.57457, std: 0.07317, params: {'min_samples_leaf': 17},
  mean: 0.57459, std: 0.07298, params: {'min_samples_leaf': 18},
  mean: 0.57411, std: 0.07323, params: {'min_samples_leaf': 19},
  mean: 0.57352, std: 0.07285, params: {'min_samples_leaf': 20}],
 {'min_samples_leaf': 11},
 0.5755101095563181)

In [74]:
# The min_samples_leaf returned 11. So lets test 1 to 11


param_test6 = {'min_samples_leaf':range(1,11,1)}

gsearch6 = GridSearchCV(estimator = GradientBoostingRegressor(learning_rate=0.1, n_estimators=60,min_samples_split=800, max_features='sqrt',max_depth=7, subsample=0.8, random_state=10), 
param_grid = param_test6,scoring='r2',n_jobs=4,iid=False, cv=5)

In [76]:
gsearch6.fit(Predictors,Targets)

GridSearchCV(cv=5, error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=7,
             max_features='sqrt', max_leaf_nodes=None,
             min_impurity_split=1e-07, min_samples_leaf=1,
             min_samples_split=800, min_weight_fraction_leaf=0.0,
             n_estimators=60, presort='auto', random_state=10,
             subsample=0.8, verbose=0, warm_start=False),
       fit_params={}, iid=False, n_jobs=4,
       param_grid={'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
       pre_dispatch='2*n_jobs', refit=True, scoring='r2', verbose=0)

In [78]:
gsearch6.grid_scores_, gsearch6.best_params_, gsearch6.best_score_

([mean: 0.57514, std: 0.07441, params: {'min_samples_leaf': 1},
  mean: 0.57503, std: 0.07411, params: {'min_samples_leaf': 2},
  mean: 0.57535, std: 0.07420, params: {'min_samples_leaf': 3},
  mean: 0.57587, std: 0.07454, params: {'min_samples_leaf': 4},
  mean: 0.57567, std: 0.07458, params: {'min_samples_leaf': 5},
  mean: 0.57621, std: 0.07471, params: {'min_samples_leaf': 6},
  mean: 0.57640, std: 0.07469, params: {'min_samples_leaf': 7},
  mean: 0.57605, std: 0.07425, params: {'min_samples_leaf': 8},
  mean: 0.57538, std: 0.07383, params: {'min_samples_leaf': 9},
  mean: 0.57496, std: 0.07338, params: {'min_samples_leaf': 10}],
 {'min_samples_leaf': 7},
 0.5763992576286825)

In [79]:
# Ok..so lets take this for what it is now...
# Tuning yielded optimum values for :
# n_estimators = 60
# max_depth': 7, 
# 'min_samples_split': 800
# 'min_samples_leaf': 7


# How about the max features. We have been using max features set as square root of features.
# That sets it as max features as 10.
# But lets tune that and see if we can get better leverage.

# Here we set up a range of 10 to 20 and in steps of 2
param_test7 = {'max_features':range(10,20,2)}

gsearch7 = GridSearchCV(estimator = GradientBoostingRegressor(learning_rate=0.1,max_depth=7,min_samples_leaf=7, n_estimators=60,min_samples_split=800, subsample=0.8, random_state=10), 
param_grid = param_test7,scoring='r2',n_jobs=4,iid=False, cv=5)

In [80]:
gsearch7.fit(Predictors,Targets)

GridSearchCV(cv=5, error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=7, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=7, min_samples_split=800,
             min_weight_fraction_leaf=0.0, n_estimators=60, presort='auto',
             random_state=10, subsample=0.8, verbose=0, warm_start=False),
       fit_params={}, iid=False, n_jobs=4,
       param_grid={'max_features': [10, 12, 14, 16, 18]},
       pre_dispatch='2*n_jobs', refit=True, scoring='r2', verbose=0)

In [81]:
gsearch7.grid_scores_, gsearch7.best_params_, gsearch7.best_score_

([mean: 0.57399, std: 0.07408, params: {'max_features': 10},
  mean: 0.57569, std: 0.07468, params: {'max_features': 12},
  mean: 0.57524, std: 0.07355, params: {'max_features': 14},
  mean: 0.57679, std: 0.07398, params: {'max_features': 16},
  mean: 0.57650, std: 0.07326, params: {'max_features': 18}],
 {'max_features': 16},
 0.576791407855946)

In [82]:
# So now our max feature is 16..so lets summarize again:
# n_estimators = 60
# max_depth': 7, 
# 'min_samples_split': 800
# 'min_samples_leaf': 7
# 'max_features' : 16

In [83]:
# Lets continue tuning...
# First subsample whose default is 0.8. Lets tune that
param_test8 = {'subsample':[0.6,0.7,0.75,0.8,0.85,0.9]}

gsearch8 = GridSearchCV(estimator = GradientBoostingRegressor(learning_rate=0.1,max_depth=7,min_samples_leaf=7, n_estimators=60,min_samples_split=800,max_features=16, random_state=10), 
param_grid = param_test8,scoring='r2',n_jobs=4,iid=False, cv=5)

In [84]:
gsearch8.fit(Predictors,Targets)

GridSearchCV(cv=5, error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=7, max_features=16,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=7, min_samples_split=800,
             min_weight_fraction_leaf=0.0, n_estimators=60, presort='auto',
             random_state=10, subsample=1.0, verbose=0, warm_start=False),
       fit_params={}, iid=False, n_jobs=4,
       param_grid={'subsample': [0.6, 0.7, 0.75, 0.8, 0.85, 0.9]},
       pre_dispatch='2*n_jobs', refit=True, scoring='r2', verbose=0)

In [85]:
gsearch8.grid_scores_, gsearch8.best_params_, gsearch8.best_score_

([mean: 0.57719, std: 0.07311, params: {'subsample': 0.6},
  mean: 0.57675, std: 0.07307, params: {'subsample': 0.7},
  mean: 0.57675, std: 0.07484, params: {'subsample': 0.75},
  mean: 0.57679, std: 0.07398, params: {'subsample': 0.8},
  mean: 0.57605, std: 0.07424, params: {'subsample': 0.85},
  mean: 0.57641, std: 0.07336, params: {'subsample': 0.9}],
 {'subsample': 0.6},
 0.5771948407410082)

In [86]:
# Hmm subsample started dropping at 0.6. So lets set 0.1 to 0.6

# Lets continue tuning...
# First subsample whose default is 0.8. Lets tune that
param_test9 = {'subsample':[0.1,0.2,0.3,0.4,0.5,0.6]}

gsearch9 = GridSearchCV(estimator = GradientBoostingRegressor(learning_rate=0.1,max_depth=7,min_samples_leaf=7, n_estimators=60,min_samples_split=800,max_features=16, random_state=10), 
param_grid = param_test9,scoring='r2',n_jobs=4,iid=False, cv=5)

In [87]:
gsearch9.fit(Predictors,Targets)

GridSearchCV(cv=5, error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=7, max_features=16,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=7, min_samples_split=800,
             min_weight_fraction_leaf=0.0, n_estimators=60, presort='auto',
             random_state=10, subsample=1.0, verbose=0, warm_start=False),
       fit_params={}, iid=False, n_jobs=4,
       param_grid={'subsample': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]},
       pre_dispatch='2*n_jobs', refit=True, scoring='r2', verbose=0)

In [88]:
gsearch9.grid_scores_, gsearch9.best_params_, gsearch9.best_score_

([mean: -0.00845, std: 0.01150, params: {'subsample': 0.1},
  mean: -0.00790, std: 0.01007, params: {'subsample': 0.2},
  mean: 0.56403, std: 0.07114, params: {'subsample': 0.3},
  mean: 0.57010, std: 0.07077, params: {'subsample': 0.4},
  mean: 0.57381, std: 0.07170, params: {'subsample': 0.5},
  mean: 0.57719, std: 0.07311, params: {'subsample': 0.6}],
 {'subsample': 0.6},
 0.5771948407410082)

In [89]:
# So the sub sample is 0.6. Again summarize:
# n_estimators = 60
# max_depth': 7, 
# 'min_samples_split': 800
# 'min_samples_leaf': 7
# 'max_features' : 16
# 'sub sample' : 0.6

In [90]:
# Seems we have all our parameters tuned except the learning rate which we had set very high as 0.1.
# Lets reduce that by half and see if it helps and when we do that, we can increase the number of estimates

# So reduce the learning parameter by half and double the number of trees 

# So learning rate is 0.05 while n_estimators set as 120

In [91]:
gbm_tuned_1 = GradientBoostingRegressor(learning_rate=0.05, n_estimators=120,max_depth=7, min_samples_split=800,min_samples_leaf=7, subsample=0.6, random_state=10, max_features=16)

In [93]:
modelfit(gbm_tuned_1,Targets, Predictors)


Model Report
MSE : 65.49
R_squared : 0.59253
CV Score : Mean - -69.13744 | Std - 30.18157 | Min - -149.5394 | Max - -41.73169
CV Score : Mean - 0.5829006 | Std - 0.09051485 | Min - 0.3660387 | Max - 0.6847281


In [94]:
# How about 0.01 and 700
gbm_tuned_2 = GradientBoostingRegressor(learning_rate=0.01, n_estimators=700,max_depth=7, min_samples_split=800,min_samples_leaf=7, subsample=0.6, random_state=10, max_features=16)

In [95]:
modelfit(gbm_tuned_2,Targets, Predictors)


Model Report
MSE : 64.99
R_squared : 0.595638
CV Score : Mean - -69.11224 | Std - 30.26539 | Min - -149.8495 | Max - -41.39238
CV Score : Mean - 0.583141 | Std - 0.09083058 | Min - 0.364724 | Max - 0.6872915


In [96]:
# I am not getting any extra leverage. Lets keep the estimate at 60 and then try at 0.1 and 0.05

# 0.1
gbm_tuned_3 = GradientBoostingRegressor(learning_rate=0.1, n_estimators=60,max_depth=7, min_samples_split=800,min_samples_leaf=7, subsample=0.6, random_state=10, max_features=16)

In [97]:
modelfit(gbm_tuned_3,Targets, Predictors)


Model Report
MSE : 65.49
R_squared : 0.592535
CV Score : Mean - -69.34603 | Std - 30.32398 | Min - -150.3267 | Max - -41.68576
CV Score : Mean - 0.5816968 | Std - 0.09068235 | Min - 0.362701 | Max - 0.6850751


In [98]:
# 0.05
gbm_tuned_4 = GradientBoostingRegressor(learning_rate=0.05, n_estimators=60,max_depth=7, min_samples_split=800,min_samples_leaf=7, subsample=0.6, random_state=10, max_features=16)

In [99]:
modelfit(gbm_tuned_4,Targets, Predictors)


Model Report
MSE : 67.84
R_squared : 0.577927
CV Score : Mean - -70.17451 | Std - 29.98049 | Min - -150.0549 | Max - -42.58078
CV Score : Mean - 0.5761008 | Std - 0.08835689 | Min - 0.3638534 | Max - 0.6783134


In [101]:
# Well, 0.05 is not cool. Lets increase 0.1 and see if it helps. Try 0.2
gbm_tuned_5 = GradientBoostingRegressor(learning_rate=0.2, n_estimators=60,max_depth=7, min_samples_split=800,min_samples_leaf=7, subsample=0.6, random_state=10, max_features=16)

In [102]:
modelfit(gbm_tuned_5,Targets, Predictors)


Model Report
MSE : 63.53
R_squared : 0.60475
CV Score : Mean - -69.78469 | Std - 30.21935 | Min - -150.0892 | Max - -42.13692
CV Score : Mean - 0.5787368 | Std - 0.09047795 | Min - 0.3637078 | Max - 0.6816666


In [103]:
# Seems to be higher for 0.2.

# Now lets try 0.5
gbm_tuned_6 = GradientBoostingRegressor(learning_rate=0.5, n_estimators=60,max_depth=7, min_samples_split=800,min_samples_leaf=7, subsample=0.6, random_state=10, max_features=16)

In [104]:
modelfit(gbm_tuned_6,Targets, Predictors)


Model Report
MSE : 61.1
R_squared : 0.619839
CV Score : Mean - -73.03037 | Std - 31.03759 | Min - -157.3093 | Max - -44.9372
CV Score : Mean - 0.558413 | Std - 0.08905005 | Min - 0.3330986 | Max - 0.6605113


In [105]:
# Now lets try learning rate as 1
gbm_tuned_7 = GradientBoostingRegressor(learning_rate=1, n_estimators=60,max_depth=7, min_samples_split=800,min_samples_leaf=7, subsample=0.6, random_state=10, max_features=16)

In [106]:
modelfit(gbm_tuned_7,Targets, Predictors)


Model Report
MSE : 62.84
R_squared : 0.609034
CV Score : Mean - -83.01067 | Std - 33.06961 | Min - -170.5176 | Max - -50.98866
CV Score : Mean - 0.4963725 | Std - 0.09303524 | Min - 0.2771028 | Max - 0.6110393


In [107]:
# Lets use gbm_tuned_6 model for prediction
# Saving this as base model to be used to fit on the Test model
Model_tuned_6 = gbm_tuned_6.fit(Predictors, Targets)
# Make predictions on our test data
Prediction = Model_tuned_6.predict(Test_array)
Prediction

array([  78.38385619,   93.31373963,   78.2496379 , ...,   91.28503451,
        108.49637444,   86.45349914])

In [108]:
# Based on this model, these are my predictions.

# Save that in an csv file as Predictions_95FeaturesModel_tuned_6
np.savetxt('Predictions_95FeaturesModel_tuned_6.csv',Prediction, delimiter=",")

In [109]:
# Ok that sucked! After all that fancy tweaking all I did was reduce the quality of my predictions that I made
# from the base model. My R2 squared on Kaggle went from 0.53728 to 0.53059. So basically my base model was better

# I am going back to the base model....lets just tweak the learning rate in my base model

In [110]:
# Now time to fit the first model called gbm0
cv = 5
gbm1 = GradientBoostingRegressor(random_state=10, learning_rate=0.1)
# Now fit it on the training data set with default settings and a CV of 5
modelfit(gbm1,Targets, Predictors)


Model Report
MSE : 59.66
R_squared : 0.628842
CV Score : Mean - -71.43274 | Std - 33.75996 | Min - -162.8597 | Max - -42.34306
CV Score : Mean - 0.5710286 | Std - 0.1050077 | Min - 0.309568 | Max - 0.6801093


In [111]:
# Returns back the defult model. Lets do 0.5
# Now time to fit the first model called gbm0
cv = 5
gbm2 = GradientBoostingRegressor(random_state=10, learning_rate=0.5)
# Now fit it on the training data set with default settings and a CV of 5
modelfit(gbm2,Targets, Predictors)


Model Report
MSE : 42.73
R_squared : 0.734142
CV Score : Mean - -80.14048 | Std - 37.5326 | Min - -182.6821 | Max - -48.58629
CV Score : Mean - 0.5180906 | Std - 0.1160001 | Min - 0.2255326 | Max - 0.6329434


In [112]:
# Wow that jumped really high. R square increased and MSE increased. Lets submit this to kaggle right away and see the effect
# Lets use that for prediction

# Saving this as base model to be used to fit on the Test model
Model_gbm2 = gbm2.fit(Predictors, Targets)

Prediction =Model_gbm2.predict(Test_array)
Prediction

array([  77.538025  ,   95.04864951,   79.14371743, ...,   91.24428454,
        110.26498932,   89.82739432])

In [113]:
# Based on this model, these are my predictions.

# Save that in an csv file as Predictions_95FeaturesModel_gbm2
np.savetxt('Predictions_95FeaturesModel_gbm2.csv',Prediction, delimiter=",")

In [114]:
# Nope. Actually, it went from 0.5 to 0.4 . Damn!
# Returns back the defult model. Lets do 0.5
# Now time to fit the first model called gbm0
cv = 5
gbm3 = GradientBoostingRegressor(random_state=10, learning_rate=1)
# Now fit it on the training data set with default settings and a CV of 5
modelfit(gbm3,Targets, Predictors)



Model Report
MSE : 36.86
R_squared : 0.77067
CV Score : Mean - -92.74285 | Std - 41.70329 | Min - -206.2691 | Max - -55.68886
CV Score : Mean - 0.4411005 | Std - 0.1257591 | Min - 0.1255373 | Max - 0.5751843


In [116]:
gbm_tuned_2 = GradientBoostingRegressor(learning_rate=0.01, n_estimators=700,max_depth=7, min_samples_split=800,min_samples_leaf=7, subsample=0.6, random_state=10, max_features=16)

In [117]:
Model_gbm_tuned_2 = gbm_tuned_2.fit(Predictors, Targets)
Prediction =Model_gbm_tuned_2.predict(Test_array)
Prediction

array([  78.70074712,   93.88604202,   78.96082665, ...,   91.42385726,
        110.33666902,   91.28708748])

In [118]:
# Based on this model, these are my predictions.

# Save that in an csv file as Predictions_95FeaturesModel_gbm_tuned_2
np.savetxt('Predictions_95FeaturesModel_gbm_tuned_2.csv',Prediction, delimiter=",")

In [1]:
# This approach actually helped. My score on the public leaderboard went from 0.53728 to 0.55226.

# However, I was stuck and languishing at the 2423th position out of 3835 with the leading model having
# a score of 0.67. But you know me, shameless...I tweeted this out! Hahahahah.

In [2]:
# I have a few rough ideas to help make my model better

# 1) The 95 features that I have selected contains both categorical and continous data. One thing I can do
# is to use PCA to summarize the continous data into three components and use those components instead of
# of using the entire feature sets. Some might be correlated and thus adding to the noise

# 2) I used the gradient boost regressor algorithm to generate this model. If I am really serious, I should use
# the Xgboost algorithm. Some of the models that I built here clearly showed signs of over fitting since they 
# performed worse when I submitted my predictions on the test. Xgboost will give me the ability to apply some
# form of regularisation(L1 or L2) on my models as I tweak my parameters. This might help.

# 3) Also, I have not even explored the idea of a stacked model approach where I will build several models
# and combine the models for prediction. I should plan on doing that.

# 4) If everything fails, I will go back to using the entire features in the data set, build an Xgboost on all of them
# and see if by just using regularisation if I can get a better model.

# Bottom line: There are so many things that I am eager to try but experiments in the lab calling, abstracts to get
# ready, presentations and posters to be made. So I will come back to this exciting data set when I have time

In [None]:
# Update:

# Well I never was able to squeeze out time and get back to this. The competition ended. But a little bit of the
# cup not fully empty perspective: At the end of the compettition when 81 percent of the data was used to evaluate the
# accuracy , my position improved from 2423 to 1590 th position with a score of 0.54796. The smart alec that won this 
# this had a score of 0.55551. 

# I will take time out and check out the strategies of some good models of this competition and see if some of the
# above ideas were implemented to improve the models. I am also excited to see other creative and outside the box
# thinking that these smart guys have implemented to get good models.

# Exciting times ahead!