In [2]:
import numpy as np
import pandas as pd
import xgboost as xgb
import gc

print('Loading data ...')

train = pd.read_csv('./Zillow-Data/train_2016_v2.csv')
prop = pd.read_csv('./Zillow-Data/properties_2016.csv')
sample = pd.read_csv('./Zillow-Data/sample_submission.csv')

print('Binding to float32')

for c, dtype in zip(prop.columns, prop.dtypes):
    if dtype == np.float64:
        prop[c] = prop[c].astype(np.float32)

print('Creating training set ...')

df_train = train.merge(prop, how='left', on='parcelid')

x_train = df_train.drop(['parcelid', 'logerror', 'transactiondate', 'propertyzoningdesc', 'propertycountylandusecode'], axis=1)
y_train = df_train['logerror'].values
print(x_train.shape, y_train.shape)

train_columns = x_train.columns

for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)

del df_train; gc.collect()

split = 80000
x_train, y_train, x_valid, y_valid = x_train[:split], y_train[:split], x_train[split:], y_train[split:]

print('Building DMatrix...')

d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)

del x_train, x_valid; gc.collect()

print('Training ...')

params = {}
params['eta'] = 0.02
params['objective'] = 'reg:linear'
params['eval_metric'] = 'mae'
params['max_depth'] = 4
params['silent'] = 1

watchlist = [(d_train, 'train'), (d_valid, 'valid')]
clf = xgb.train(params, d_train, 10000, watchlist, early_stopping_rounds=100, verbose_eval=10)

del d_train, d_valid

print('Building test set ...')

sample['parcelid'] = sample['ParcelId']
df_test = sample.merge(prop, on='parcelid', how='left')

del prop; gc.collect()

x_test = df_test[train_columns]
for c in x_test.dtypes[x_test.dtypes == object].index.values:
    x_test[c] = (x_test[c] == True)

del df_test, sample; gc.collect()

d_test = xgb.DMatrix(x_test)

del x_test; gc.collect()

print('Predicting on test ...')

p_test = clf.predict(d_test)

del d_test; gc.collect()

sub = pd.read_csv('./Zillow-Data/sample_submission.csv')
for c in sub.columns[sub.columns != 'ParcelId']:
    sub[c] = p_test

print('Writing csv ...')
sub.to_csv('xgb_starter.csv', index=False, float_format='%.4f') # Thanks to @inversion

Loading data ...


  prop = pd.read_csv('./Zillow-Data/properties_2016.csv')


Binding to float32
Creating training set ...
(90275, 55) (90275,)
Building DMatrix...
Training ...
[0]	train-mae:0.06861	valid-mae:0.06653
[10]	train-mae:0.06840	valid-mae:0.06642
[20]	train-mae:0.06825	valid-mae:0.06634
[30]	train-mae:0.06815	valid-mae:0.06632


Parameters: { "silent" } are not used.



[40]	train-mae:0.06807	valid-mae:0.06632
[50]	train-mae:0.06800	valid-mae:0.06634
[60]	train-mae:0.06795	valid-mae:0.06636
[70]	train-mae:0.06792	valid-mae:0.06638
[80]	train-mae:0.06789	valid-mae:0.06641
[90]	train-mae:0.06786	valid-mae:0.06643
[100]	train-mae:0.06784	valid-mae:0.06646
[110]	train-mae:0.06783	valid-mae:0.06649
[120]	train-mae:0.06781	valid-mae:0.06650
[130]	train-mae:0.06780	valid-mae:0.06651
[139]	train-mae:0.06779	valid-mae:0.06652
Building test set ...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test[c] = (x_test[c] == True)


Predicting on test ...
Writing csv ...
