In [4]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import xgboost as xgb
import pprint as pp
import math
#from ml_metrics import rmsle

# Read the training table and the test table from the data directory that sits
# one level above the notebook.
df_train = pd.read_csv(filepath_or_buffer='../data/macroCleanEngineeredTrain.csv')
df_test = pd.read_csv(filepath_or_buffer='../data/macroCleanEngineeredTest.csv')

In [5]:
def rmsle(preds, dtrain):
    """Root mean squared logarithmic error in the ``(name, value)`` form used for ``feval``.

    Computes ``sqrt(mean((log(1 + label) - log(1 + max(pred, 0))) ** 2))``.
    Negative predictions are clipped to zero before taking the logarithm.

    Args:
        preds: Array-like of predicted values, one per row of ``dtrain``.
        dtrain: Object exposing ``get_label()`` that returns the true values
            (this notebook passes an ``xgb.DMatrix``).

    Returns:
        The tuple ``('rmsle', score)`` where ``score`` is a Python float.

    Raises:
        AssertionError: If ``preds`` and the labels differ in length.
        ValueError: If any label is <= -1 (``log(1 + label)`` is undefined).
        ZeroDivisionError: If ``preds`` is empty.
    """
    # Work in float64 so the result does not depend on the (possibly float32)
    # dtype of the incoming arrays.
    labels = np.asarray(dtrain.get_label(), dtype=np.float64)
    preds = np.asarray(preds, dtype=np.float64)
    assert len(preds) == len(labels)

    if np.any(labels <= -1):
        raise ValueError('rmsle is undefined for labels <= -1')

    # Vectorised: this runs once per evaluation set on every boosting round,
    # so a Python-level loop over every row is needlessly slow.
    log_diff = np.log1p(labels) - np.log1p(np.maximum(preds, 0))
    squared_sum = float(np.sum(log_diff ** 2))
    return 'rmsle', (squared_sum * (1.0 / len(preds))) ** 0.5

In [6]:
# We take all float/int columns except for ID, timestamp, and the target value.
# The columns are filtered in df_train's own order rather than through a set
# difference: iteration order of a set of strings changes between Python
# processes (string hashing is randomised), which would make the feature order
# differ from one kernel restart to the next.
excluded_columns = {'id', 'timestamp', 'price_doc'}
train_columns = [
    col
    for col in df_train.select_dtypes(include=['float64', 'int64']).columns
    if col not in excluded_columns
]

# The same column list is applied to both frames so train and test features line up.
y_train = df_train['price_doc'].values
x_train = df_train[train_columns].values
x_test = df_test[train_columns].values

In [7]:
# Train/Valid split
# The first `split` rows are used for fitting and the remainder for validation.
# The slices get their own names instead of overwriting x_train / y_train:
# with the originals overwritten, re-running this cell would slice the already
# truncated arrays and leave the validation set empty.
split = 25000
x_fit, y_fit = x_train[:split], y_train[:split]
x_valid, y_valid = x_train[split:], y_train[split:]

d_train = xgb.DMatrix(x_fit, label=y_fit)
d_valid = xgb.DMatrix(x_valid, label=y_valid)

params = {
    'objective': 'reg:linear',
    'eta': 0.02,
    'silent': 1,
}

# Early stopping watches the custom rmsle metric on the 'valid' set (see the
# training log below) and halts after 100 rounds without improvement.
watchlist = [(d_train, 'train'), (d_valid, 'valid')]
clf = xgb.train(params, d_train, 800, watchlist, feval=rmsle, early_stopping_rounds=100)

# xgb.train returns the booster from the last round, not the best one, so the
# prediction is limited to the best iteration found by early stopping.
# Older xgboost exposes this as best_ntree_limit / ntree_limit; newer releases
# use best_iteration / iteration_range instead.
d_test = xgb.DMatrix(x_test)
if hasattr(clf, 'best_ntree_limit'):
    p_test = clf.predict(d_test, ntree_limit=clf.best_ntree_limit)
else:
    p_test = clf.predict(d_test, iteration_range=(0, clf.best_iteration + 1))

# One row per test record: its id and the predicted price.
sub = pd.DataFrame({'id': df_test['id'].values, 'price_doc': p_test})
sub.to_csv('xgb.csv', index=False)

[0]	train-rmse:8.18888e+06	valid-rmse:9.54965e+06	train-rmsle:3.86061	valid-rmsle:3.96336
Multiple eval metrics have been passed: 'valid-rmsle' will be used for early stopping.

Will train until valid-rmsle hasn't improved in 100 rounds.
[1]	train-rmse:8.04446e+06	valid-rmse:9.39628e+06	train-rmsle:3.18295	valid-rmsle:3.28499
[2]	train-rmse:7.90351e+06	valid-rmse:9.25112e+06	train-rmsle:2.7913	valid-rmsle:2.89387
[3]	train-rmse:7.76536e+06	valid-rmse:9.10637e+06	train-rmsle:2.51846	valid-rmsle:2.61945
[4]	train-rmse:7.63005e+06	valid-rmse:8.96607e+06	train-rmsle:2.30927	valid-rmsle:2.40856
[5]	train-rmse:7.49778e+06	valid-rmse:8.82634e+06	train-rmsle:2.14105	valid-rmsle:2.23849
[6]	train-rmse:7.36863e+06	valid-rmse:8.69239e+06	train-rmsle:2.00027	valid-rmsle:2.09699
[7]	train-rmse:7.24201e+06	valid-rmse:8.55736e+06	train-rmsle:1.8804	valid-rmsle:1.97544
[8]	train-rmse:7.1185e+06	valid-rmse:8.42977e+06	train-rmsle:1.77582	valid-rmsle:1.86942
[9]	train-rmse:6.99779e+06	valid-rmse:8.30536

In [9]:
# Display the (rows, columns) shape of the submission frame.
sub.shape

(7610, 2)

In [10]:
# Display the (rows, columns) shape of the test frame; the submission is built
# from df_test['id'], so the row counts should match.
df_test.shape

(7610, 430)