In [28]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import xgboost as xgb
import pprint as pp
import math
#from ml_metrics import rmsle

# Load the cleaned, feature-engineered train and test sets (paths are relative to the notebook).
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/cleanEngineeredTrain.csv', '../data/cleanEngineeredTest.csv')
)

In [29]:
def rmsle(preds, dtrain):
    """Root mean squared logarithmic error, in the shape XGBoost expects from `feval`.

    Computes sqrt(mean((log(label + 1) - log(max(0, pred) + 1)) ** 2)) with
    vectorized numpy operations instead of a per-element Python loop.

    Args:
        preds: Array-like of predictions, one per label. Values that are not
            strictly positive (negative, zero, or NaN) are treated as 0 before
            taking the logarithm, i.e. the same result as ``max(0, pred)``.
        dtrain: Object exposing ``get_label()`` that returns the true targets
            (this is what ``xgb.train`` passes to a custom eval function).

    Returns:
        tuple: ``('rmsle', value)`` where ``value`` is a Python float.

    Raises:
        AssertionError: If ``preds`` and the labels differ in length.
        ValueError: If any label is <= -1, for which log(label + 1) is undefined.
        ZeroDivisionError: If the inputs are empty.
    """
    labels = np.asarray(dtrain.get_label(), dtype=np.float64)
    preds = np.asarray(preds, dtype=np.float64)
    assert len(preds) == len(labels)

    # Fail loudly rather than letting numpy propagate NaN/-inf into the metric.
    if np.any(labels <= -1):
        raise ValueError('rmsle is undefined for labels <= -1')

    # `preds > 0` is False for NaN, so NaN maps to 0 exactly like max(0, nan) does.
    clipped_preds = np.where(preds > 0, preds, 0.0)
    log_diff = np.log1p(labels) - np.log1p(clipped_preds)

    # Plain Python float arithmetic keeps the ZeroDivisionError on empty input.
    sum_of_squares = float(np.sum(log_diff ** 2))
    return 'rmsle', (sum_of_squares / len(preds)) ** 0.5

In [31]:
# Use every float64/int64 column as a feature, except the ID, the timestamp and the target.
# The DataFrame's own column index is filtered in order (rather than subtracting Python
# sets) so the feature order is the same on every run: iteration order of a set of strings
# depends on hash randomization and would otherwise reshuffle the columns of
# x_train / x_test between kernel restarts.
excluded_columns = {'id', 'timestamp', 'price_doc'}
train_columns = [
    column
    for column in df_train.select_dtypes(include=['float64', 'int64']).columns
    if column not in excluded_columns
]

y_train = df_train['price_doc'].values
x_train = df_train[train_columns].values
x_test = df_test[train_columns].values

In [32]:
# Train/Valid split: the first `split` rows train the model, the remainder validate it.
# The full arrays are re-derived from df_train here so that re-running this cell always
# produces the same split. Slicing x_train / y_train into themselves made the cell
# non-idempotent: a second run left x_valid / y_valid empty.
split = 25000
x_full = df_train[train_columns].values
y_full = df_train['price_doc'].values
x_train, y_train = x_full[:split], y_full[:split]
x_valid, y_valid = x_full[split:], y_full[split:]
assert len(x_valid) > 0, 'validation set is empty: df_train has no rows beyond `split`'

d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)

# NOTE(review): newer XGBoost releases rename 'reg:linear' to 'reg:squarederror' and
# replace 'silent' with 'verbosity' - confirm against the installed version before changing.
params = {}
params['objective'] = 'reg:linear'
params['eta'] = 0.02
params['silent'] = 1

# The last watchlist entry ('valid') is the one early stopping monitors, using rmsle.
watchlist = [(d_train, 'train'), (d_valid, 'valid')]
clf = xgb.train(params, d_train, 800, watchlist, feval=rmsle, early_stopping_rounds=100)

# xgb.train returns the booster from the LAST round, not the best one, so restrict
# prediction to the best iteration found by early stopping. Boosters without a
# `best_ntree_limit` attribute fall back to the plain predict call.
d_test = xgb.DMatrix(x_test)
best_ntree_limit = getattr(clf, 'best_ntree_limit', None)
if best_ntree_limit:
    p_test = clf.predict(d_test, ntree_limit=best_ntree_limit)
else:
    p_test = clf.predict(d_test)

# Write the submission file: one row per test ID with its predicted price.
sub = pd.DataFrame()
sub['id'] = df_test['id'].values
sub['price_doc'] = p_test
sub.to_csv('xgb.csv', index=False)

[0]	train-rmse:8.74551e+06	valid-rmse:9.57797e+06	train-rmsle:3.8573	valid-rmsle:3.92727
Multiple eval metrics have been passed: 'valid-rmsle' will be used for early stopping.

Will train until valid-rmsle hasn't improved in 100 rounds.
[1]	train-rmse:8.58804e+06	valid-rmse:9.42497e+06	train-rmsle:3.18092	valid-rmsle:3.24849
[2]	train-rmse:8.43408e+06	valid-rmse:9.27674e+06	train-rmsle:2.79048	valid-rmsle:2.85727
[3]	train-rmse:8.28336e+06	valid-rmse:9.13062e+06	train-rmsle:2.51721	valid-rmsle:2.58368
[4]	train-rmse:8.13592e+06	valid-rmse:8.98888e+06	train-rmsle:2.30829	valid-rmsle:2.37507
[5]	train-rmse:7.99206e+06	valid-rmse:8.85219e+06	train-rmsle:2.13913	valid-rmsle:2.20452
[6]	train-rmse:7.85088e+06	valid-rmse:8.71701e+06	train-rmsle:1.99891	valid-rmsle:2.06436
[7]	train-rmse:7.71314e+06	valid-rmse:8.5853e+06	train-rmsle:1.87936	valid-rmsle:1.9443
[8]	train-rmse:7.57799e+06	valid-rmse:8.45757e+06	train-rmsle:1.77516	valid-rmsle:1.8409
[9]	train-rmse:7.44553e+06	valid-rmse:8.3264e+