In [9]:
from __future__ import print_function, division
#virtualenv -p python3 venv

In [10]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import cPickle as pk
import pandas as pd
import numpy as np
from IPython.display import clear_output
import xgboost as xgb

In [11]:
import os
import sys
base_module_path = os.path.abspath(os.path.join('..'))
if base_module_path not in sys.path:
    sys.path.append(base_module_path)
import zill as z

In [12]:
__base_path = '../data/'

def make_dtrain(outliers=False):
    prop = pd.read_csv(__base_path + 'properties_2016.csv')
    logerrors = pd.read_csv(__base_path + 'train_2016_v2.csv')

    prop = z.data._clean_encode_props(prop)
            
    merged = logerrors.merge(prop, how='left', on='parcelid')
    if outliers:
        print('Removing outliers')
        merged = merged[ merged.logerror > outliers[0] ]
        merged = merged[ merged.logerror < outliers[1] ]
    else: print('Keeping outliers')

    x_train = merged.drop(['parcelid', 'logerror', 'transactiondate'], axis=1)
    y_train = merged['logerror']

    # return y_mean also
    y_mean = np.mean(y_train)

    #del merged, prop, logerrors, test; gc.collect()

    return xgb.DMatrix(x_train, label=y_train), y_mean


In [13]:
dtrain, y_mean = make_dtrain((-0.39, 0.41)) 

Removing outliers


In [15]:
dtrain.get_label().shape

(88458,)

### Summary of changes

#### Parameter tuning
1. Biggest change: from good initialisation, aka setting base_score to the mean
2. Next biggest: using a lower learning rate to hit about 200-250 boost rounds
    * note: it seems to take longer to do each boost round at lower eta also
3. After: max_depth at certain times helped, as did the sampling, as did alpha

For future: best thing probably is to do the first 2 by hand, then use sklearn GridSearchCV. 

#### Other stuff
Removed outliers (self defined as mean±2.5*std)

#### Versions, changes, results
1. eval_metric=mean absolute error 
    * as per competition evaluation
2. lower eta 0.3->0.03 
    * model was reaching min at 19 steps. 
    * results: step224, 0.068668
3. subsample 1->0.5
    * equivalent to dropout? should help prevent overfit
    * results: step215, 0.068683
4. gamma 0->0.2
    * lowers model complexity by only splitting node when loss reduction is >gamma (aka regularization)
    * again should help overfitting
    * results: step211, 0.068696
5. max_depth 6->5
    * other overfitting stuff doesn't seem to be changing much. lower complexity
    * results: step228, test0.068570 train0.067371
6. alpha 0->1
    * cranking it see if we can do something about this overfitting
    * results: step227, test0.068465 train0.067324
7. USE PARAMS 2 everything-(except eta)>default, base_score->y_mean
    * should just cut out first ~50 iterations
    * results: step23, test0.068146, train0.067734
        * NOTE: not only did it make the train MUCH faster, but also improved test score beyond previous best!
        * initialisation fucking matters u prick
8. reverse step7 apart from base_score->y_mean
    * adding back regularization etc
    * is it worth redoing this?
    * results: step38 test0.068045, train0.067750
9. colsample_bytree 1->0.8
    * again adding more randomness (aka dropout)
    * results: step38 test0.068031 train0.067757
10. subsample 0.5->0.8
    * yet more dropout (OOPS THIS WAS WRONG its less! oops)
    * results: step45 test0.068035 train0.067637
11. min_child_weight 1->4
    * supposed to be one that makes a big diff (that and max depth) should have probably done earlier
    * results: step46 test0.068021 train0.067650
12. min_child_weight 4->6
    * did alright last time let's crank it
    * results: step39 test0.068021 train0.067715
13. max_depth 5->4
    * again supposed to make a big diff, so let's see
    * results: step46 test0.068011 train0.067805
14. subsample 0.8->0.5
    * return to more dropout (im an idiot)
    * results: step55 test0.068022 train0.067809
15. colsample_bytree 0.8->0.5
    * more dropout
    * results: step59 test0.068013 train0.067808
16. gamma 0.2->0
    * let's see
    * results: step59 test0.068011 train0.067791
17. eta 0.03->0.01
    * we're reaching plateau pretty fast, so let's lower it
    * results: step205 test0.067984 train0.067746
18. max_depth 4->5
    * not underfitting but try increasing complexity
    * results: step200 test0.067959 train0.067584
19. min_child_weight 6->5
    * not underfitting but try increasing complexity
    * results: step198 test0.067964 train0.067586
20. min_child_weight 5->3
    * not underfitting but try increasing complexity
    * results: step200 test0.067968 train0.067572
21. max_depth 5->6
    * not underfitting but try increasing complexity
    * results: step175 test0.067958 train0.067416
22. reverse to v18
    * it was better
    * results: step200 test0.067959 train0.067584	
23. eta 0.01->0.005 (num_boost_rounds->600)
    * lower eta seems to make a big difference, so lets see
    * increasing num_boost_rounds as its likely to take double the time
    * results: step304 test0.067967 train0.067674
24. revert to v22
    * it was better
    * results: step200 test0.067959 train0.067584
25. remove outliers (more than 2.5*std from the mean)
    * results: step304, effective train0.067168

In [16]:
params = {
    'eval_metric':'mae', #mean absolute error
    'eta':0.01, #learning rate, default 0.3
    'subsample':0.5,
    'colsample_bytree':0.5,
    'gamma':0,
    'max_depth':5,
    'min_child_weight':6,
    'alpha':1,
    'base_score':y_mean,
}

In [17]:
cv_result = xgb.cv(params, dtrain, 
                   nfold=5, 
                   num_boost_round=305, 
                   early_stopping_rounds=50, 
                   verbose_eval=10, 
                   show_stdv=False)

[0]	train-mae:0.0530246	test-mae:0.0530262
[10]	train-mae:0.0529434	test-mae:0.0529586
[20]	train-mae:0.052868	test-mae:0.0528968
[30]	train-mae:0.052801	test-mae:0.0528436
[40]	train-mae:0.0527398	test-mae:0.0527964
[50]	train-mae:0.0526846	test-mae:0.0527532
[60]	train-mae:0.052635	test-mae:0.0527176
[70]	train-mae:0.0525892	test-mae:0.0526844
[80]	train-mae:0.0525476	test-mae:0.0526536
[90]	train-mae:0.052509	test-mae:0.0526286
[100]	train-mae:0.0524716	test-mae:0.0526032
[110]	train-mae:0.0524392	test-mae:0.052583
[120]	train-mae:0.0524068	test-mae:0.052562
[130]	train-mae:0.0523784	test-mae:0.052544
[140]	train-mae:0.0523514	test-mae:0.0525276
[150]	train-mae:0.0523264	test-mae:0.0525134
[160]	train-mae:0.0523008	test-mae:0.052498
[170]	train-mae:0.0522782	test-mae:0.0524876
[180]	train-mae:0.052254	test-mae:0.052475
[190]	train-mae:0.0522326	test-mae:0.0524644
[200]	train-mae:0.0522106	test-mae:0.0524536
[210]	train-mae:0.0521906	test-mae:0.0524456
[220]	train-mae:0.0521714	test-

In [18]:
cv_result.tail()

Unnamed: 0,test-mae-mean,test-mae-std,train-mae-mean,train-mae-std
300,0.052384,0.000394,0.052039,0.000107
301,0.052384,0.000394,0.052038,0.000107
302,0.052384,0.000394,0.052036,0.000107
303,0.052383,0.000394,0.052035,0.000107
304,0.052383,0.000394,0.052033,0.000107


The following to try to assess the outlier-less model - since it's obviously gonna have better CV scores, let's try to predict on all the data and see what we get (and compare it to the train CV score for the other models)

In [19]:
model = xgb.train(dict(params), dtrain, num_boost_round=len(cv_result), 
                  evals=[(dtrain, 'eval')])

[0]	eval-mae:0.053025
[1]	eval-mae:0.053016
[2]	eval-mae:0.053007
[3]	eval-mae:0.052998
[4]	eval-mae:0.052988
[5]	eval-mae:0.052979
[6]	eval-mae:0.052968
[7]	eval-mae:0.052961
[8]	eval-mae:0.052952
[9]	eval-mae:0.052944
[10]	eval-mae:0.052937
[11]	eval-mae:0.05293
[12]	eval-mae:0.052923
[13]	eval-mae:0.052917
[14]	eval-mae:0.052911
[15]	eval-mae:0.052902
[16]	eval-mae:0.052895
[17]	eval-mae:0.052889
[18]	eval-mae:0.052883
[19]	eval-mae:0.052875
[20]	eval-mae:0.052869
[21]	eval-mae:0.052863
[22]	eval-mae:0.052858
[23]	eval-mae:0.05285
[24]	eval-mae:0.052843
[25]	eval-mae:0.052834
[26]	eval-mae:0.052827
[27]	eval-mae:0.052818
[28]	eval-mae:0.052813
[29]	eval-mae:0.052806
[30]	eval-mae:0.052797
[31]	eval-mae:0.052792
[32]	eval-mae:0.052784
[33]	eval-mae:0.052777
[34]	eval-mae:0.052771
[35]	eval-mae:0.052765
[36]	eval-mae:0.052759
[37]	eval-mae:0.052754
[38]	eval-mae:0.052747
[39]	eval-mae:0.052741
[40]	eval-mae:0.052736
[41]	eval-mae:0.052731
[42]	eval-mae:0.052726
[43]	eval-mae:0.05272
[

In [57]:
import sklearn 
def assess_outlier_model(model):
    print('Getting all data')

    dtest, y_mean = z.data.make_dtrain()
    y_test = dtest.get_label()
    print(y_test.shape)
    dtest.set_label([])
    print('Predicting...')
    pred = model.predict(dtest)
    y_pred = []
    for i, predict in enumerate(pred):
        y_pred.append(round(predict, 4))
    y_pred = np.array(y_pred)

    return sklearn.metrics.mean_absolute_error(y_test, y_pred)

In [58]:
assess_outlier_model(model)

Getting all data
(90275,)
Predicting...


0.067168124069345858

In [60]:
z.data.make_submission_with_model(model)

  if self.run_code(code, result):



Writing results to disk ...

Written results to disk.
