In [1]:
import pandas as pd
import numpy as np

import xgboost as xgb

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import validation_curve
from sklearn.model_selection import learning_curve
from sklearn import metrics
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.grid_search import GridSearchCV

import matplotlib
import matplotlib.pyplot as plt

from sklearn.metrics import r2_score

import sys
sys.path.append('../lib')

import XgbAdjustmentHelper as xgbhelper
import SklearnModelCurve as curve_helper



In [9]:
# Load the train/test feature tables and the training target from CSV.
# The first CSV column becomes the row index of each frame.
munged_train_df = pd.read_csv('../data/offline/train.csv', index_col=0)
munged_test_df = pd.read_csv('../data/offline/test.csv', index_col=0)
y_train_df = pd.read_csv('../data/offline/y_train.csv', index_col=0)

# Plain numpy arrays for the full training set and for the test set to be scored.
X_all_train = munged_train_df.values
y_all_train = y_train_df['y'].values
X_all_test = munged_test_df.values

# 50/50 hold-out split of the training data with a fixed seed.
# The target is passed as the 1-D 'y' column rather than the 2-D frame values,
# so y_train / y_test follow the same shape convention as y_all_train.
X_train, X_test, y_train, y_test = train_test_split(
    X_all_train, y_all_train, test_size=0.5, random_state=1729)

In [10]:
# Boosting starts from the mean of the training target (used as base_score).
y_mean = y_train_df['y'].mean()

# Booster configuration shared by the cross-validation run and every fit below.
xgb_params = dict(
    eta=0.005,
    max_depth=4,
    subsample=0.95,
    objective='reg:linear',
    eval_metric='rmse',
    base_score=y_mean,
    silent=1,
)

# Full training set wrapped in xgboost's own data container.
d_all_train = xgb.DMatrix(X_all_train, label=y_all_train)

# Cross-validate for at most 700 rounds, logging every 50 rounds and stopping
# early once 50 rounds pass without improvement.
cv_result = xgb.cv(
    params=xgb_params,
    dtrain=d_all_train,
    num_boost_round=700,
    early_stopping_rounds=50,
    verbose_eval=50,
    show_stdv=False,
)

# One row per retained boosting round; that count is reused for all later fits.
num_boost_rounds = cv_result.shape[0]
print(num_boost_rounds)

# Fit on the whole training set with logging enabled (silent=0).
model = xgb.train(
    params=dict(xgb_params, silent=0),
    dtrain=d_all_train,
    num_boost_round=num_boost_rounds,
)

[0]	train-rmse:12.6405	test-rmse:12.6384
[50]	train-rmse:11.1002	test-rmse:11.16
[100]	train-rmse:10.0349	test-rmse:10.1639
[150]	train-rmse:9.3121	test-rmse:9.51319
[200]	train-rmse:8.82785	test-rmse:9.09971
[250]	train-rmse:8.5054	test-rmse:8.84066
[300]	train-rmse:8.28615	test-rmse:8.68156
[350]	train-rmse:8.12651	test-rmse:8.58936
[400]	train-rmse:7.98718	test-rmse:8.53655
[450]	train-rmse:7.84175	test-rmse:8.50538
[500]	train-rmse:7.71801	test-rmse:8.48775
[550]	train-rmse:7.60769	test-rmse:8.48236
[600]	train-rmse:7.50638	test-rmse:8.48217
581


In [11]:
# Repeated hold-out evaluation: one split per seed (19% held out), a fresh
# booster per split, scored with R^2 on the held-out rows.
split_seeds = range(10, 20)
avg_score = 0  # running sum of per-split scores; divided by the split count below

for seed in split_seeds:
    # Pass the 1-D 'y' column so labels have the same shape as y_all_train.
    X_train, X_test, y_train, y_test = train_test_split(
        munged_train_df.values, y_train_df['y'].values, test_size=0.19, random_state=seed)
    dtrain = xgb.DMatrix(X_train, y_train)
    dtest = xgb.DMatrix(X_test)
    xgb_model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)
    y_pred = xgb_model.predict(dtest)

    score = r2_score(y_test, y_pred)
    avg_score += score
    print(score)

# Divide by the actual number of splits rather than a hard-coded 10, so the
# average stays correct if the seed range is changed.
print(avg_score / len(split_seeds))

0.565268592999
0.569993076681
0.466781693386
0.563923226094
0.440493545425
0.610018672501
0.541920037679
0.588080948083
0.57489070066
0.572391670021
0.549376216353


In [12]:
# Mean R^2 over the ten hold-out splits, as printed by the previous cell.
0.549376216353

0.549376216353

In [13]:
# Wrap the test features for xgboost; no labels are attached.
d_all_test = xgb.DMatrix(data=X_all_test)

# Fit on the complete training set using the round count chosen by the
# cross-validation cell, then predict the test rows.
xgb_model = xgb.train(
    params=dict(xgb_params, silent=0),
    dtrain=d_all_train,
    num_boost_round=num_boost_rounds,
)
# NOTE: from here on y_test holds test-set *predictions*, not hold-out labels.
y_test = xgb_model.predict(d_all_test)

In [14]:
# Two-column table: the test frame's index as 'id', the predictions as 'y'.
output = pd.DataFrame(
    {'y': y_test, 'id': munged_test_df.index},
    columns=['id', 'y'],
)
# Write without the positional row index so the file has exactly 'id' and 'y'.
output.to_csv('../data/online/xgboost-ref1-pca-ica-6-6-1.csv', index=False)