In [1]:
import numpy as np
from sklearn.base import BaseEstimator,TransformerMixin, ClassifierMixin
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.linear_model import ElasticNetCV, LassoLarsCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline, make_union
from sklearn.utils import check_array
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection
from sklearn.decomposition import PCA, FastICA
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import r2_score



class StackingEstimator(BaseEstimator, TransformerMixin):
    
    def __init__(self, estimator):
        self.estimator = estimator

    def fit(self, X, y=None, **fit_params):
        self.estimator.fit(X, y, **fit_params)
        return self
    def transform(self, X):
        X = check_array(X)
        X_transformed = np.copy(X)
        # add class probabilities as a synthetic feature
        if issubclass(self.estimator.__class__, ClassifierMixin) and hasattr(self.estimator, 'predict_proba'):
            X_transformed = np.hstack((self.estimator.predict_proba(X), X))

        # add class prodiction as a synthetic feature
        X_transformed = np.hstack((np.reshape(self.estimator.predict(X), (-1, 1)), X_transformed))

        return X_transformed


train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

for c in train.columns:
    if train[c].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(train[c].values) + list(test[c].values))
        train[c] = lbl.transform(list(train[c].values))
        test[c] = lbl.transform(list(test[c].values))



n_comp = 12

# tSVD
tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
tsvd_results_train = tsvd.fit_transform(train.drop(["y"], axis=1))
tsvd_results_test = tsvd.transform(test)

# PCA
pca = PCA(n_components=n_comp, random_state=420)
pca2_results_train = pca.fit_transform(train.drop(["y"], axis=1))
pca2_results_test = pca.transform(test)

# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1))
ica2_results_test = ica.transform(test)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_train = grp.fit_transform(train.drop(["y"], axis=1))
grp_results_test = grp.transform(test)

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
srp_results_train = srp.fit_transform(train.drop(["y"], axis=1))
srp_results_test = srp.transform(test)

#save columns list before adding the decomposition components

usable_columns = list(set(train.columns) - set(['y']))

# Append decomposition components to datasets
for i in range(1, n_comp + 1):
    train['pca_' + str(i)] = pca2_results_train[:, i - 1]
    test['pca_' + str(i)] = pca2_results_test[:, i - 1]

    train['ica_' + str(i)] = ica2_results_train[:, i - 1]
    test['ica_' + str(i)] = ica2_results_test[:, i - 1]

    train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
    test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]

    train['grp_' + str(i)] = grp_results_train[:, i - 1]
    test['grp_' + str(i)] = grp_results_test[:, i - 1]

    train['srp_' + str(i)] = srp_results_train[:, i - 1]
    test['srp_' + str(i)] = srp_results_test[:, i - 1]

#usable_columns = list(set(train.columns) - set(['y']))

y_train = train['y'].values
y_mean = np.mean(y_train)
id_test = test['ID'].values
#finaltrainset and finaltestset are data to be used only the stacked model (does not contain PCA, SVD... arrays) 
finaltrainset = train[usable_columns].values
finaltestset = test[usable_columns].values

'''Train the stacked models then predict the test data'''

stacked_pipeline = make_pipeline(
    StackingEstimator(estimator=LassoLarsCV(normalize=True)),
    StackingEstimator(estimator=GradientBoostingRegressor(learning_rate=0.001, loss="huber", max_depth=3, max_features=0.55, min_samples_leaf=18, min_samples_split=14, subsample=0.7)),
    LassoLarsCV()

)


stacked_pipeline.fit(finaltrainset, y_train)
results = stacked_pipeline.predict(finaltestset)



In [2]:
from sklearn.cross_validation import train_test_split
train=train.drop('y', axis=1)
X_train, X_test, Y_train, Y_test = train_test_split(train,y_train, test_size = 0.2)

1，Benchmark

In [20]:
'''Train the xgb model then predict the test data'''
import time
start = time.time()

xgb_params = {
    'n_trees': 520, 
    'eta': 0.0045,
    'max_depth': 4,
    'subsample': 0.93,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'base_score': y_mean, # base prediction = mean(target)
    'silent': 1
}
# NOTE: Make sure that the class is labeled 'class' in the data file

dtrain = xgb.DMatrix(X_train, Y_train)
dtest = xgb.DMatrix(X_test)

num_boost_rounds = 1250
# train model
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)
y_pred = model.predict(dtest)
print(r2_score( Y_test,y_pred))

elapsed = time.time() - start
print(elapsed)   

0.602913449733
12.593720436096191


In [19]:
'''Train the xgb model then predict the test data'''
import time
start = time.time()

xgb_params = { 
    'eta': 0.0045,
    'max_depth': 4,
    'subsample': 0.93,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'base_score': y_mean, # base prediction = mean(target)
    'silent': 1
}
# NOTE: Make sure that the class is labeled 'class' in the data file

dtrain = xgb.DMatrix(X_train, Y_train)
dtest = xgb.DMatrix(X_test)


# train model
model = xgb.train(dict(xgb_params, silent=0), dtrain,num_boost_round=1250)
y_pred = model.predict(dtest)
print(r2_score( Y_test,y_pred))

elapsed = time.time() - start
print(elapsed)   

0.602913449733
12.762730121612549


In [36]:
from xgboost.sklearn import XGBRegressor
import time
start = time.time()


est = XGBRegressor( learning_rate =0.0045, n_estimators=1250, max_depth=4,
 subsample=0.93, objective= 'reg:linear', nthread=4, seed=27,silent=1,base_score=y_mean)
est.fit(X_train, Y_train)
y_pred = est.predict(X_test)
print(r2_score(Y_test,y_pred))

print(time.time() - start) 

0.600859184893
11.67066764831543


In [21]:
'''Average the preditionon test data  of both models then save it on a csv file'''
dtrain = xgb.DMatrix(train, y_train)
dtest = xgb.DMatrix(test)

num_boost_rounds = 1250
# train model
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)
y_pred = model.predict(dtest)

sub = pd.DataFrame()
sub['ID'] = id_test
sub['y'] = y_pred*0.75 + results*0.25
sub.to_csv('stacked-models.csv', index=False)

## scored 0.56841

2,Tune XGboost

2.1Fix learning rate and number of estimators for tuning tree-based parameters

In [3]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV



In [4]:
def modelfit(alg,X_train, Y_train,useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(X_train, Y_train)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='rmse', early_stopping_rounds=early_stopping_rounds)
        #alg.set_params(n_estimators=cvresult.shape[0])
    
    #Fit the algorithm on the data
    #alg.fit(X_train, Y_train,eval_metric='rmse')
    print(cvresult.shape[0])

In [5]:
xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'reg:linear',
 nthread=4,
 scale_pos_weight=1,
 seed=27)

In [6]:
modelfit(xgb1,X_train, Y_train)

51


2.2 Tune max_depth and min_child_weight

In [3]:
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.cross_validation import ShuffleSplit
import time
start = time.time()

param_test1 = {
 'max_depth':range(3,10,2),
 'min_child_weight':range(1,6,2)
}
grid = GridSearchCV(estimator =XGBRegressor( learning_rate =0.1, n_estimators=54, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'reg:linear', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, scoring='r2',n_jobs=4,iid=False, cv=5)

grid.fit(X_train, Y_train)

print(grid.grid_scores_)
print(grid.best_params_)
print(grid.best_score_)

#grid = grid.fit(X_train, Y_train)
model=grid.best_estimator_

y_pred = model.predict(X_test)
print(r2_score( Y_test,y_pred))

print(time.time() - start) 

[mean: 0.54859, std: 0.05794, params: {'max_depth': 3, 'min_child_weight': 1}, mean: 0.54710, std: 0.05835, params: {'min_child_weight': 3, 'max_depth': 3}, mean: 0.54687, std: 0.05719, params: {'min_child_weight': 5, 'max_depth': 3}, mean: 0.53884, std: 0.05633, params: {'min_child_weight': 1, 'max_depth': 5}, mean: 0.53845, std: 0.05594, params: {'min_child_weight': 3, 'max_depth': 5}, mean: 0.53792, std: 0.05848, params: {'min_child_weight': 5, 'max_depth': 5}, mean: 0.52185, std: 0.05972, params: {'min_child_weight': 1, 'max_depth': 7}, mean: 0.52920, std: 0.05833, params: {'min_child_weight': 3, 'max_depth': 7}, mean: 0.53292, std: 0.06005, params: {'min_child_weight': 5, 'max_depth': 7}, mean: 0.50849, std: 0.06067, params: {'min_child_weight': 1, 'max_depth': 9}, mean: 0.51546, std: 0.05607, params: {'min_child_weight': 3, 'max_depth': 9}, mean: 0.52280, std: 0.06027, params: {'min_child_weight': 5, 'max_depth': 9}]
{'max_depth': 3, 'min_child_weight': 1}
0.548586521039
0.638502



In [4]:
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.cross_validation import ShuffleSplit
import time
start = time.time()

param_test1 = {
 'max_depth':[1,2,3],
 'min_child_weight':[0.3,0.5,0.7,1.0]
}
grid = GridSearchCV(estimator =XGBRegressor( learning_rate =0.1, n_estimators=54, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'reg:linear', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, scoring='r2',n_jobs=4,iid=False, cv=5)

grid.fit(X_train, Y_train)

print(grid.grid_scores_)
print(grid.best_params_)
print(grid.best_score_)

#grid = grid.fit(X_train, Y_train)
model=grid.best_estimator_

y_pred = model.predict(X_test)
print(r2_score( Y_test,y_pred))

print(time.time() - start) 

[mean: 0.50991, std: 0.05923, params: {'min_child_weight': 0.3, 'max_depth': 1}, mean: 0.50991, std: 0.05923, params: {'max_depth': 1, 'min_child_weight': 0.5}, mean: 0.50991, std: 0.05923, params: {'min_child_weight': 0.7, 'max_depth': 1}, mean: 0.50991, std: 0.05923, params: {'min_child_weight': 1.0, 'max_depth': 1}, mean: 0.54967, std: 0.05788, params: {'min_child_weight': 0.3, 'max_depth': 2}, mean: 0.54967, std: 0.05788, params: {'min_child_weight': 0.5, 'max_depth': 2}, mean: 0.54967, std: 0.05788, params: {'min_child_weight': 0.7, 'max_depth': 2}, mean: 0.54967, std: 0.05788, params: {'max_depth': 2, 'min_child_weight': 1.0}, mean: 0.54859, std: 0.05794, params: {'max_depth': 3, 'min_child_weight': 0.3}, mean: 0.54859, std: 0.05794, params: {'max_depth': 3, 'min_child_weight': 0.5}, mean: 0.54859, std: 0.05794, params: {'max_depth': 3, 'min_child_weight': 0.7}, mean: 0.54859, std: 0.05794, params: {'max_depth': 3, 'min_child_weight': 1.0}]
{'min_child_weight': 0.3, 'max_depth': 



In [5]:
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.cross_validation import ShuffleSplit
import time
start = time.time()

param_test1 = {
 'max_depth':[2],
 'min_child_weight':[0.1,0.2,0.3]
}
grid = GridSearchCV(estimator =XGBRegressor( learning_rate =0.1, n_estimators=54, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'reg:linear', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, scoring='r2',n_jobs=4,iid=False, cv=5)

grid.fit(X_train, Y_train)

print(grid.grid_scores_)
print(grid.best_params_)
print(grid.best_score_)

#grid = grid.fit(X_train, Y_train)
model=grid.best_estimator_

y_pred = model.predict(X_test)
print(r2_score( Y_test,y_pred))

print(time.time() - start) 

[mean: 0.54967, std: 0.05788, params: {'min_child_weight': 0.1, 'max_depth': 2}, mean: 0.54967, std: 0.05788, params: {'max_depth': 2, 'min_child_weight': 0.2}, mean: 0.54967, std: 0.05788, params: {'min_child_weight': 0.3, 'max_depth': 2}]
{'min_child_weight': 0.1, 'max_depth': 2}
0.549668358206
0.643034924665
6.009000062942505




In [6]:
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.cross_validation import ShuffleSplit
import time
start = time.time()

param_test1 = {
 'max_depth':[2],
 'min_child_weight':[0.03,0.05,0.07,0.1]
}
grid = GridSearchCV(estimator =XGBRegressor( learning_rate =0.1, n_estimators=54, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'reg:linear', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, scoring='r2',n_jobs=4,iid=False, cv=5)

grid.fit(X_train, Y_train)

print(grid.grid_scores_)
print(grid.best_params_)
print(grid.best_score_)

#grid = grid.fit(X_train, Y_train)
model=grid.best_estimator_

y_pred = model.predict(X_test)
print(r2_score( Y_test,y_pred))

print(time.time() - start) 

[mean: 0.54967, std: 0.05788, params: {'max_depth': 2, 'min_child_weight': 0.03}, mean: 0.54967, std: 0.05788, params: {'max_depth': 2, 'min_child_weight': 0.05}, mean: 0.54967, std: 0.05788, params: {'max_depth': 2, 'min_child_weight': 0.07}, mean: 0.54967, std: 0.05788, params: {'max_depth': 2, 'min_child_weight': 0.1}]
{'max_depth': 2, 'min_child_weight': 0.03}
0.549668358206
0.643034924665
7.2790000438690186




In [7]:
'''Average the preditionon test data  of both models then save it on a csv file'''
estimator =XGBRegressor( learning_rate =0.1, n_estimators=54, max_depth=2,
 min_child_weight=0.3, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'reg:linear', nthread=4, scale_pos_weight=1, seed=27)


estimator.fit(train, y_train)
y_pred = estimator.predict(test)


sub = pd.DataFrame()
sub['ID'] = id_test
sub['y'] = y_pred*0.75 + results*0.25
sub.to_csv('stacked-models.csv', index=False)

## scored 0.55681

In [None]:
2.3 Tune gamma

In [8]:
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.cross_validation import ShuffleSplit
import time
start = time.time()

param_test1 = {
 'gamma':[i/10.0 for i in range(0,5)]
}
grid = GridSearchCV(estimator =XGBRegressor( learning_rate =0.1, n_estimators=54, max_depth=2,
 min_child_weight=0.3, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'reg:linear', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, scoring='r2',n_jobs=4,iid=False, cv=5)

grid.fit(X_train, Y_train)

print(grid.grid_scores_)
print(grid.best_params_)
print(grid.best_score_)

#grid = grid.fit(X_train, Y_train)
model=grid.best_estimator_

y_pred = model.predict(X_test)
print(r2_score( Y_test,y_pred))

print(time.time() - start) 

[mean: 0.54967, std: 0.05788, params: {'gamma': 0.0}, mean: 0.54967, std: 0.05788, params: {'gamma': 0.1}, mean: 0.54967, std: 0.05788, params: {'gamma': 0.2}, mean: 0.54967, std: 0.05788, params: {'gamma': 0.3}, mean: 0.54967, std: 0.05788, params: {'gamma': 0.4}]
{'gamma': 0.0}
0.549668358206
0.643034924665
8.653000116348267




re-calibrate the number of boosting rounds for the updated parameters

In [13]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV

def modelfit(alg,X_train, Y_train,useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(X_train, Y_train)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='rmse', early_stopping_rounds=early_stopping_rounds)
        #alg.set_params(n_estimators=cvresult.shape[0])
    
    #Fit the algorithm on the data
    #alg.fit(X_train, Y_train,eval_metric='rmse')
    print(cvresult.shape[0])
    

xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=2,
 min_child_weight=0.3,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'reg:linear',
 nthread=4,
 scale_pos_weight=1,
 seed=27)


modelfit(xgb1,X_train, Y_train)

52


2.4 Tune subsample and colsample_bytree

In [14]:
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.cross_validation import ShuffleSplit
import time
start = time.time()

param_test1 = {
 'subsample':[i/10.0 for i in range(6,10)],
 'colsample_bytree':[i/10.0 for i in range(6,10)]
}
grid = GridSearchCV(estimator =XGBRegressor( learning_rate =0.1, n_estimators=52, max_depth=2,
 min_child_weight=0.3, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'reg:linear', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, scoring='r2',n_jobs=4,iid=False, cv=5)

grid.fit(X_train, Y_train)

print(grid.grid_scores_)
print(grid.best_params_)
print(grid.best_score_)

#grid = grid.fit(X_train, Y_train)
model=grid.best_estimator_

y_pred = model.predict(X_test)
print(r2_score( Y_test,y_pred))

print(time.time() - start) 

[mean: 0.54635, std: 0.05743, params: {'subsample': 0.6, 'colsample_bytree': 0.6}, mean: 0.54788, std: 0.05869, params: {'subsample': 0.7, 'colsample_bytree': 0.6}, mean: 0.55002, std: 0.05900, params: {'subsample': 0.8, 'colsample_bytree': 0.6}, mean: 0.55105, std: 0.06062, params: {'subsample': 0.9, 'colsample_bytree': 0.6}, mean: 0.54851, std: 0.05816, params: {'subsample': 0.6, 'colsample_bytree': 0.7}, mean: 0.54719, std: 0.05707, params: {'subsample': 0.7, 'colsample_bytree': 0.7}, mean: 0.54960, std: 0.05857, params: {'subsample': 0.8, 'colsample_bytree': 0.7}, mean: 0.55089, std: 0.06010, params: {'subsample': 0.9, 'colsample_bytree': 0.7}, mean: 0.54592, std: 0.05815, params: {'subsample': 0.6, 'colsample_bytree': 0.8}, mean: 0.54698, std: 0.05885, params: {'subsample': 0.7, 'colsample_bytree': 0.8}, mean: 0.54917, std: 0.05797, params: {'subsample': 0.8, 'colsample_bytree': 0.8}, mean: 0.54935, std: 0.06073, params: {'subsample': 0.9, 'colsample_bytree': 0.8}, mean: 0.54453, 



In [15]:
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.cross_validation import ShuffleSplit
import time
start = time.time()

param_test1 = {
 'subsample':[i/100.0 for i in range(75,100,5)],
 'colsample_bytree':[i/100.0 for i in range(50,80,5)]
}
grid = GridSearchCV(estimator =XGBRegressor( learning_rate =0.1, n_estimators=52, max_depth=2,
 min_child_weight=0.3, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'reg:linear', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, scoring='r2',n_jobs=4,iid=False, cv=5)

grid.fit(X_train, Y_train)

print(grid.grid_scores_)
print(grid.best_params_)
print(grid.best_score_)

#grid = grid.fit(X_train, Y_train)
model=grid.best_estimator_

y_pred = model.predict(X_test)
print(r2_score( Y_test,y_pred))

print(time.time() - start) 

[mean: 0.54735, std: 0.05982, params: {'subsample': 0.75, 'colsample_bytree': 0.5}, mean: 0.54881, std: 0.05954, params: {'subsample': 0.8, 'colsample_bytree': 0.5}, mean: 0.55025, std: 0.06048, params: {'subsample': 0.85, 'colsample_bytree': 0.5}, mean: 0.55189, std: 0.06088, params: {'subsample': 0.9, 'colsample_bytree': 0.5}, mean: 0.55111, std: 0.06056, params: {'subsample': 0.95, 'colsample_bytree': 0.5}, mean: 0.54826, std: 0.05787, params: {'subsample': 0.75, 'colsample_bytree': 0.55}, mean: 0.54924, std: 0.05841, params: {'subsample': 0.8, 'colsample_bytree': 0.55}, mean: 0.54863, std: 0.05920, params: {'subsample': 0.85, 'colsample_bytree': 0.55}, mean: 0.55045, std: 0.05951, params: {'subsample': 0.9, 'colsample_bytree': 0.55}, mean: 0.55154, std: 0.05942, params: {'subsample': 0.95, 'colsample_bytree': 0.55}, mean: 0.54895, std: 0.05860, params: {'subsample': 0.75, 'colsample_bytree': 0.6}, mean: 0.55002, std: 0.05900, params: {'subsample': 0.8, 'colsample_bytree': 0.6}, mea



2.5 Tune Tuning Regularization Parameters

In [16]:
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.cross_validation import ShuffleSplit
import time
start = time.time()

param_test1 = {
 'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
}
grid = GridSearchCV(estimator =XGBRegressor( learning_rate =0.1, n_estimators=52, max_depth=2,
 min_child_weight=0.3, gamma=0, subsample=0.95, colsample_bytree=0.6,
 objective= 'reg:linear', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, scoring='r2',n_jobs=4,iid=False, cv=5)

grid.fit(X_train, Y_train)

print(grid.grid_scores_)
print(grid.best_params_)
print(grid.best_score_)

#grid = grid.fit(X_train, Y_train)
model=grid.best_estimator_

y_pred = model.predict(X_test)
print(r2_score( Y_test,y_pred))

print(time.time() - start) 

[mean: 0.55247, std: 0.05901, params: {'reg_alpha': 1e-05}, mean: 0.55243, std: 0.05901, params: {'reg_alpha': 0.01}, mean: 0.55238, std: 0.05943, params: {'reg_alpha': 0.1}, mean: 0.55117, std: 0.05885, params: {'reg_alpha': 1}, mean: 0.54737, std: 0.06257, params: {'reg_alpha': 100}]
{'reg_alpha': 1e-05}
0.552471499624
0.646306543341
6.9019999504089355




In [17]:
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.cross_validation import ShuffleSplit
import time
start = time.time()

param_test1 = {
'reg_alpha':[0, 0.001, 0.005, 0.01, 0.05]}

grid = GridSearchCV(estimator =XGBRegressor( learning_rate =0.1, n_estimators=52, max_depth=2,
 min_child_weight=0.3, gamma=0, subsample=0.95, colsample_bytree=0.6,
 objective= 'reg:linear', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, scoring='r2',n_jobs=4,iid=False, cv=5)

grid.fit(X_train, Y_train)

print(grid.grid_scores_)
print(grid.best_params_)
print(grid.best_score_)

#grid = grid.fit(X_train, Y_train)
model=grid.best_estimator_

y_pred = model.predict(X_test)
print(r2_score( Y_test,y_pred))

print(time.time() - start) 

[mean: 0.55247, std: 0.05901, params: {'reg_alpha': 0}, mean: 0.55247, std: 0.05901, params: {'reg_alpha': 0.001}, mean: 0.55247, std: 0.05901, params: {'reg_alpha': 0.005}, mean: 0.55243, std: 0.05901, params: {'reg_alpha': 0.01}, mean: 0.55243, std: 0.05934, params: {'reg_alpha': 0.05}]
{'reg_alpha': 0.005}
0.552471540294
0.646306851348
7.101999998092651




In [18]:
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.cross_validation import ShuffleSplit
import time
start = time.time()

param_test1 = {
'reg_alpha':[0.006,0.007,0.008,0.009, 0.005, 0.001, 0.002,0.003,0.004]}

grid = GridSearchCV(estimator =XGBRegressor( learning_rate =0.1, n_estimators=52, max_depth=2,
 min_child_weight=0.3, gamma=0, subsample=0.95, colsample_bytree=0.6,
 objective= 'reg:linear', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, scoring='r2',n_jobs=4,iid=False, cv=5)

grid.fit(X_train, Y_train)

print(grid.grid_scores_)
print(grid.best_params_)
print(grid.best_score_)

#grid = grid.fit(X_train, Y_train)
model=grid.best_estimator_

y_pred = model.predict(X_test)
print(r2_score( Y_test,y_pred))

print(time.time() - start) 

[mean: 0.55247, std: 0.05901, params: {'reg_alpha': 0.006}, mean: 0.55247, std: 0.05901, params: {'reg_alpha': 0.007}, mean: 0.55243, std: 0.05901, params: {'reg_alpha': 0.008}, mean: 0.55243, std: 0.05901, params: {'reg_alpha': 0.009}, mean: 0.55247, std: 0.05901, params: {'reg_alpha': 0.005}, mean: 0.55247, std: 0.05901, params: {'reg_alpha': 0.001}, mean: 0.55247, std: 0.05901, params: {'reg_alpha': 0.002}, mean: 0.55247, std: 0.05901, params: {'reg_alpha': 0.003}, mean: 0.55247, std: 0.05901, params: {'reg_alpha': 0.004}]
{'reg_alpha': 0.005}
0.552471540294
0.646306851348
10.83400011062622




In [None]:
2.6 Reducing Learning Rate re-calibrate the number of boosting rounds 

In [24]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV

def modelfit(alg,X_train, Y_train,useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(X_train, Y_train)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='rmse', early_stopping_rounds=early_stopping_rounds)
        #alg.set_params(n_estimators=cvresult.shape[0])
    
    #Fit the algorithm on the data
    #alg.fit(X_train, Y_train,eval_metric='rmse')
    print(cvresult.shape[0])
    

xgb1 = XGBClassifier(learning_rate =0.0045, n_estimators=5000, max_depth=2,
 min_child_weight=0.3, gamma=0, subsample=0.95, colsample_bytree=0.6,reg_alpha=0.005,
 objective= 'reg:linear', nthread=4, scale_pos_weight=1, seed=27)


modelfit(xgb1,X_train, Y_train)

1372


In [40]:
'''Average the preditionon test data  of both models then save it on a csv file'''
estimator =XGBRegressor( learning_rate =0.0045, n_estimators=1372, max_depth=2,
 min_child_weight=0.3, gamma=0, subsample=0.95, colsample_bytree=0.8,
 objective= 'reg:linear', nthread=4, scale_pos_weight=1, seed=27)


estimator.fit(train, y_train)
y_pred = estimator.predict(test)

y_pred1 = model.predict(X_test)
print(r2_score( Y_test,y_pred1))

sub = pd.DataFrame()
sub['ID'] = id_test
sub['y'] = y_pred*0.75 + results*0.25
sub.to_csv('stacked-models.csv', index=False)

## scored 0.55591

0.646219263052


In [27]:
'''Average the preditionon test data  of both models then save it on a csv file'''
estimator =XGBRegressor( learning_rate =0.0045, n_estimators=1250, max_depth=4,
 subsample=0.93, objective= 'reg:linear', nthread=4, seed=27,silent=1,base_score=y_mean)


estimator.fit(train, y_train)
y_pred = estimator.predict(test)


sub = pd.DataFrame()
sub['ID'] = id_test
sub['y'] = y_pred*0.75 + results*0.25
sub.to_csv('stacked-models.csv', index=False)

## scored  0.56794

In [28]:
'''Average the preditionon test data  of both models then save it on a csv file'''
estimator =XGBRegressor( learning_rate =0.0045, n_estimators=1250, max_depth=4,
 subsample=0.93, objective= 'reg:linear', nthread=4, seed=27,silent=1)


estimator.fit(train, y_train)
y_pred = estimator.predict(test)


sub = pd.DataFrame()
sub['ID'] = id_test
sub['y'] = y_pred*0.75 + results*0.25
sub.to_csv('stacked-models.csv', index=False)

## scored  0.56481

In [29]:
'''Average the preditionon test data  of both models then save it on a csv file'''
estimator =XGBRegressor( learning_rate =0.0045, n_estimators=1250, max_depth=4,
 subsample=0.93, objective= 'reg:linear', nthread=4, seed=27,silent=1,base_score=y_mean)


estimator.fit(train, y_train)
y_pred = estimator.predict(test)


sub = pd.DataFrame()
sub['ID'] = id_test
sub['y'] = y_pred
sub.to_csv('stacked-models.csv', index=False)

## scored  0.56686

In [34]:
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.cross_validation import ShuffleSplit
import time
start = time.time()

param_test1 = {
'max_depth':[2,3,4,5],
#'subsample':[0.91,0.92,0.93,0.94,0.95,0.96,0.97]
'subsample':[i/100.0 for i in [91,92,93,94,95,96,97]]
}

grid = GridSearchCV(estimator =XGBRegressor( learning_rate =0.0045, n_estimators=1250, max_depth=2,
 subsample=0.95,
 objective= 'reg:linear', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, scoring='r2',n_jobs=4,iid=False, cv=5)

grid.fit(X_train, Y_train)

print(grid.grid_scores_)
print(grid.best_params_)
print(grid.best_score_)

#grid = grid.fit(X_train, Y_train)
model=grid.best_estimator_

y_pred = model.predict(X_test)
print(r2_score( Y_test,y_pred))

print(time.time() - start) 

[mean: 0.55179, std: 0.05914, params: {'subsample': 0.91, 'max_depth': 2}, mean: 0.55150, std: 0.05928, params: {'subsample': 0.92, 'max_depth': 2}, mean: 0.55161, std: 0.05928, params: {'subsample': 0.93, 'max_depth': 2}, mean: 0.55159, std: 0.05919, params: {'subsample': 0.94, 'max_depth': 2}, mean: 0.55172, std: 0.05943, params: {'subsample': 0.95, 'max_depth': 2}, mean: 0.55191, std: 0.05943, params: {'subsample': 0.96, 'max_depth': 2}, mean: 0.55205, std: 0.05945, params: {'subsample': 0.97, 'max_depth': 2}, mean: 0.54774, std: 0.05946, params: {'subsample': 0.91, 'max_depth': 3}, mean: 0.54732, std: 0.05999, params: {'subsample': 0.92, 'max_depth': 3}, mean: 0.54807, std: 0.06006, params: {'subsample': 0.93, 'max_depth': 3}, mean: 0.54832, std: 0.05998, params: {'subsample': 0.94, 'max_depth': 3}, mean: 0.54867, std: 0.06010, params: {'subsample': 0.95, 'max_depth': 3}, mean: 0.54887, std: 0.06021, params: {'subsample': 0.96, 'max_depth': 3}, mean: 0.54912, std: 0.06003, params: 



In [35]:
'''Average the preditionon test data  of both models then save it on a csv file'''
estimator =XGBRegressor( learning_rate =0.0045, n_estimators=1250, max_depth=2,
 subsample=0.97, objective= 'reg:linear', nthread=4, seed=27,silent=1)


estimator.fit(train, y_train)
y_pred = estimator.predict(test)


sub = pd.DataFrame()
sub['ID'] = id_test
sub['y'] = y_pred*0.75 + results*0.25
sub.to_csv('stacked-models.csv', index=False)

## scored  0.55470

In [37]:
'''Average the preditionon test data  of both models then save it on a csv file'''
estimator =XGBRegressor( learning_rate =0.0045, n_estimators=1250, max_depth=4,
 subsample=0.93, objective= 'reg:linear', nthread=4, seed=27,silent=1)


estimator.fit(train, y_train)
y_pred = estimator.predict(test)


sub = pd.DataFrame()
sub['ID'] = id_test
sub['y'] = y_pred*0.75 + results*0.25
sub.to_csv('stacked-models.csv', index=False)

## scored  0.56481

In [39]:

'''Average the preditionon test data  of both models then save it on a csv file'''
estimator =XGBRegressor( learning_rate =0.0045, n_estimators=1250, max_depth=4,
 subsample=0.93, objective= 'reg:linear', nthread=4,base_score= y_mean, seed=27,silent=1)


estimator.fit(train, y_train)
y_pred = estimator.predict(test)


sub = pd.DataFrame()
sub['ID'] = id_test
sub['y'] = y_pred*0.75 + results*0.25
sub.to_csv('stacked-models.csv', index=False)

## scored  0.56481