In [5]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import xgboost as xgb
import pprint as pp
import math
#from ml_metrics import rmsle

# Read the train and test CSVs into DataFrames (paths are relative to the notebook).
df_train, df_test = (
	pd.read_csv(csv_path)
	for csv_path in (
		'../data/macroCleanEngineeredTrain.csv',
		'../data/macroCleanEngineeredTest.csv',
	)
)

In [6]:
def rmsle(preds, dtrain):
	"""Root mean squared logarithmic error, shaped as an xgboost custom eval metric.

	Parameters
	----------
	preds : array-like of float
		Model predictions. Negative predictions are clipped to 0 before the
		logarithm is taken.
	dtrain : object exposing ``get_label()``
		Source of the true target values. Labels are expected to be > -1 so
		that ``log(label + 1)`` is defined.

	Returns
	-------
	tuple of (str, float)
		``('rmsle', value)`` -- the metric name followed by its value.

	Raises
	------
	AssertionError
		If ``preds`` and the labels differ in length.
	ZeroDivisionError
		If there are no rows to score.
	"""
	# Widen to float64 so the arithmetic does not run in the labels' own dtype.
	labels = np.asarray(dtrain.get_label(), dtype=np.float64)
	preds = np.asarray(preds, dtype=np.float64)
	assert len(preds) == len(labels)

	# log1p(x) == log(x + 1); clipping at 0 keeps the log defined for negative predictions.
	log_diff = np.log1p(labels) - np.log1p(np.maximum(preds, 0.0))

	# Python-float division so an empty input still raises ZeroDivisionError.
	return 'rmsle', (float(np.sum(log_diff ** 2)) / len(preds)) ** 0.5

In [7]:
# We take all float/int columns except for ID, timestamp, and the target value.
# The list is built by filtering the frame's own column index rather than by a
# set difference: iterating a set of strings has no guaranteed order, so the
# feature order (and therefore the column layout of x_train / x_test) could
# change from one kernel session to the next.
excluded_columns = {'id', 'timestamp', 'price_doc'}
train_columns = [
	column
	for column in df_train.select_dtypes(include=['float64', 'int64']).columns
	if column not in excluded_columns
]

# Target vector and feature matrices as plain numpy arrays; the same column
# list is applied to train and test so both share one feature layout.
y_train = df_train['price_doc'].values
x_train = df_train[train_columns].values
x_test = df_test[train_columns].values

In [4]:
# Train/Valid split
# The first `split` rows are used for fitting and the remainder for validation.
# The slices are taken from df_train directly instead of from x_train / y_train:
# overwriting those arrays with slices of themselves made the cell
# non-idempotent, because a second run would slice the already-truncated
# arrays and leave the validation set empty.
split = 25000
x_all = df_train[train_columns].values
y_all = df_train['price_doc'].values
x_train, x_valid = x_all[:split], x_all[split:]
y_train, y_valid = y_all[:split], y_all[split:]

d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)

params = {}
params['objective'] = 'reg:linear'
params['eta'] = 0.02
params['silent'] = 1

# Up to 800 boosting rounds, scored with the custom rmsle metric on both sets;
# training stops early once the validation score has not improved for 100 rounds.
watchlist = [(d_train, 'train'), (d_valid, 'valid')]
clf = xgb.train(params, d_train, 800, watchlist, feval=rmsle, early_stopping_rounds=100)

# NOTE(review): depending on the installed xgboost version, predict() may use
# every trained tree rather than the best early-stopped iteration -- confirm
# and restrict the prediction to the best iteration if needed.
p_test = clf.predict(xgb.DMatrix(x_test))

# Write the submission file: one row per test id with its predicted price.
sub = pd.DataFrame()
sub['id'] = df_test['id'].values
sub['price_doc'] = p_test
sub.to_csv('xgb.csv', index=False)

[0]	train-rmse:8.74548e+06	valid-rmse:9.57765e+06	train-rmsle:3.85735	valid-rmsle:3.92598
Multiple eval metrics have been passed: 'valid-rmsle' will be used for early stopping.

Will train until valid-rmsle hasn't improved in 100 rounds.
[1]	train-rmse:8.588e+06	valid-rmse:9.42433e+06	train-rmsle:3.18099	valid-rmsle:3.24736
[2]	train-rmse:8.43402e+06	valid-rmse:9.27601e+06	train-rmsle:2.79055	valid-rmsle:2.85618
[3]	train-rmse:8.2833e+06	valid-rmse:9.13001e+06	train-rmsle:2.51729	valid-rmsle:2.58289
[4]	train-rmse:8.13585e+06	valid-rmse:8.98851e+06	train-rmsle:2.30839	valid-rmsle:2.37445
[5]	train-rmse:7.99176e+06	valid-rmse:8.85165e+06	train-rmsle:2.13929	valid-rmsle:2.20399
[6]	train-rmse:7.85044e+06	valid-rmse:8.71566e+06	train-rmsle:1.99898	valid-rmsle:2.06397
[7]	train-rmse:7.71223e+06	valid-rmse:8.58296e+06	train-rmsle:1.87953	valid-rmsle:1.94398
[8]	train-rmse:7.57733e+06	valid-rmse:8.45634e+06	train-rmsle:1.77518	valid-rmsle:1.84017
[9]	train-rmse:7.44488e+06	valid-rmse:8.32395