In [1]:
import pandas as pd
import numpy as np

import xgboost as xgb

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import validation_curve
from sklearn.model_selection import learning_curve
from sklearn import metrics
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.grid_search import GridSearchCV

import matplotlib
import matplotlib.pyplot as plt



In [2]:
import sys

# Put ../lib on the module search path so that project-local helper modules can
# be imported by later cells.
# NOTE(review): the path is relative to the kernel's working directory — confirm
# the notebook is always started from its own folder.
if '../lib' not in sys.path:  # guard: re-running this cell must not stack duplicates
    sys.path.append('../lib')

In [3]:
import XgbAdjustmentHelper as xgbhelper

In [4]:
# Load the pre-processed feature matrices and the training target from disk.
# Each CSV's first column is used as the frame's index.
munged_train_df, munged_test_df, y_train_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../data/offline/train.csv',
        '../data/offline/test.csv',
        '../data/offline/y_train.csv'
    )
)

In [5]:
# Split the labelled rows 50/50 into a fitting half and a hold-out half.
# The fixed random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    munged_train_df.values,
    y_train_df.values,
    test_size=0.5,
    random_state=1729
)

In [17]:
# Baseline prediction for the booster: the mean of the target column.
# NOTE(review): the mean is taken over ALL labelled rows, including the half held
# out as X_test/y_test, so a small amount of hold-out target information reaches
# the model — confirm this is acceptable for the hold-out score.
y_mean = np.mean(y_train_df['y'])

# Booster parameters for the native xgboost API.
# The number of trees is deliberately NOT listed here: the native API has no
# 'n_trees' parameter (such an entry is ignored); the tree count is controlled
# only by num_boost_round in the xgb.cv / xgb.train calls below.
# NOTE(review): 'reg:linear' and 'silent' are the spellings used by older xgboost
# releases; newer releases rename them — confirm against the installed version.
xgb_params = {
    'eta': 0.005,
    'max_depth': 4,
    'subsample': 0.95,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'base_score': y_mean,  # base prediction = mean(target)
    'silent': 1
}

# form DMatrices for Xgboost training
dtrain = xgb.DMatrix(X_train, y_train)
dtest = xgb.DMatrix(X_test)

# Cross-validate on the training half to choose the number of boosting rounds.
# Training stops early if the CV metric has not improved for 50 rounds; progress
# is printed every 50 rounds.
# NOTE(review): in the recorded run test-rmse is still falling at round 499, so
# the 500-round cap (not early stopping) decided the result — a higher cap
# (~700) may score better.
cv_result = xgb.cv(xgb_params,
                   dtrain,
                   num_boost_round=500,
                   early_stopping_rounds=50,
                   verbose_eval=50,
                   show_stdv=False
                  )

# Use the length of the CV history as the round count for the real fit.
num_boost_rounds = len(cv_result)
print(num_boost_rounds)

# Fit on the training half with the chosen round count (silent=0 re-enables
# xgboost's own log messages for this fit).
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)

[0]	train-rmse:0.118129	test-rmse:0.118133
[50]	train-rmse:0.101402	test-rmse:0.102155
[100]	train-rmse:0.0895263	test-rmse:0.09112
[150]	train-rmse:0.0812667	test-rmse:0.0837153
[200]	train-rmse:0.0755903	test-rmse:0.0788193
[250]	train-rmse:0.0717223	test-rmse:0.0756877
[300]	train-rmse:0.069071	test-rmse:0.073758
[350]	train-rmse:0.067122	test-rmse:0.0725947
[400]	train-rmse:0.0655913	test-rmse:0.0719343
[450]	train-rmse:0.0642523	test-rmse:0.071558
[499]	train-rmse:0.063031	test-rmse:0.0713823
500


In [18]:
# In-sample R^2 (coefficient of determination) of the fitted model.
# To push the score higher, raise num_boost_round in the previous cell.
from sklearn.metrics import r2_score

# Compare the labels stored in the training DMatrix with the model's own
# predictions for that same matrix.
print(r2_score(y_true=dtrain.get_label(), y_pred=model.predict(dtrain)))

0.698095026722


In [19]:
# R^2 of the model's predictions for dtest against y_test.
# Predict once and reuse the result rather than running the model a second time.
y_pred = model.predict(dtest)
print(r2_score(y_test, y_pred))

0.583051317788


In [26]:
# Refit on every labelled row, keeping the parameters and the boosting-round
# count already held in xgb_params / num_boost_rounds.
# NOTE: this rebinds dtrain, dtest and model; from here on dtest holds the
# unlabelled rows of munged_test_df, so cells that score against y_test must not
# be re-run after this one.
dtrain = xgb.DMatrix(munged_train_df.values, y_train_df.values)
dtest = xgb.DMatrix(munged_test_df.values)  # built once; a second identical build was redundant

model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)

In [27]:
# Predict for dtest and map the predictions through expm1 before saving.
# NOTE(review): expm1 presumably undoes a log1p transform applied to the target
# upstream — confirm against the data-preparation step.
y_pred = np.expm1(model.predict(dtest))

# One row per test-frame index value; the tree depth is embedded in the file name.
output = pd.DataFrame({'id': munged_test_df.index, 'y': y_pred})
output.to_csv(
    'xgboost-depth{}-pca-ica.csv'.format(xgb_params['max_depth']),
    index=False
)