In [5]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import make_scorer 

In [6]:
# Read the four input CSVs; the targets on the left are filled in the same
# order as the paths listed below.
properties_2016, properties_2017, train_2016, train_2017 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../datasets/properties_2016.csv',
        '../datasets/properties_2017.csv',
        '../datasets/new_train_2016.csv',
        '../datasets/new_train_2017.csv',
    )
)

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
def label_impute(properties):
    """Fill missing values with -1 and integer-encode object columns, in place.

    Every column of ``properties`` has its missing entries replaced by -1.
    A column whose dtype is ``object`` after filling is then overwritten with
    the integer codes of a ``LabelEncoder`` fitted on that column's own values.

    The frame passed in is modified directly and the same object is returned.
    """
    for col_name in properties.columns:
        filled = properties[col_name].fillna(-1)
        properties[col_name] = filled
        if filled.dtype == 'object':
            # A fresh encoder per column: codes are not shared across columns.
            raw_values = list(filled.values)
            encoder = LabelEncoder()
            encoder.fit(raw_values)
            properties[col_name] = encoder.transform(raw_values)
    return properties
# Impute/encode each year's property table, then attach the property columns
# to that year's training rows by parcelid (left join keeps every training row).
# NOTE: train_2016/train_2017 are reassigned here, so re-running this cell
# without re-running the load cell merges the property columns a second time.
properties_2016 = label_impute(properties_2016)
properties_2017 = label_impute(properties_2017)
train_2016 = train_2016.merge(properties_2016, how='left', on='parcelid')
train_2017 = train_2017.merge(properties_2017, how='left', on='parcelid')
# Stack both years. pd.concat replaces DataFrame.append, which was deprecated
# in pandas 1.4 and removed in 2.0. Row labels from both frames are kept as-is
# (no ignore_index), exactly as append did.
train_df = pd.concat([train_2016, train_2017])

In [10]:



# Keep only the rows whose logerror lies strictly between -0.4 and 0.42.
logerror_in_range = (train_df.logerror > -0.4) & (train_df.logerror < .42)
x_train = train_df[logerror_in_range]

# Target vector and its mean; the mean is passed to XGBoost as base_score below.
y_train = x_train['logerror']
y_mean = np.mean(y_train)

# Remove the identifier, the target and the transaction date from the features.
x_train = x_train.drop(['parcelid', 'logerror','transactiondate'], axis=1)

# Booster settings shared by the cross-validation runs in this notebook.
xgb_params = {
    'eta': 0.033,
    'max_depth': 6,
    'subsample': 0.80,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'base_score': y_mean,
    'silent': 1
}

In [11]:
scorer = make_scorer(mean_absolute_error)

# Cross-validation settings used for every run in this cell.
cv_options = {
    'nfold': 5,
    'num_boost_round': 500,
    'early_stopping_rounds': 5,
    'verbose_eval': 10,
    'show_stdv': False,
}

# Leave-one-feature-out: for each column, cross-validate on the feature matrix
# without that column and record the test MAE from the last row of the CV frame.
# results[i] therefore corresponds to x_train.columns[i].
results = []
for column in x_train.columns:
    dtrain = xgb.DMatrix(x_train.drop([column], axis=1), y_train)
    cv_result = xgb.cv(xgb_params, dtrain, **cv_options)
    results.append(cv_result['test-mae-mean'].iloc[-1])

[0]	train-mae:0.053034	test-mae:0.0530414
[10]	train-mae:0.0527406	test-mae:0.0528268
[20]	train-mae:0.0525482	test-mae:0.0527048
[30]	train-mae:0.0524076	test-mae:0.0526342
[40]	train-mae:0.0522914	test-mae:0.0525826
[50]	train-mae:0.0521946	test-mae:0.052547
[60]	train-mae:0.0521136	test-mae:0.0525246
[70]	train-mae:0.0520348	test-mae:0.0525032
[80]	train-mae:0.051972	test-mae:0.0524924
[90]	train-mae:0.051908	test-mae:0.0524802
[100]	train-mae:0.051849	test-mae:0.0524728
[110]	train-mae:0.0517942	test-mae:0.052464
[120]	train-mae:0.0517436	test-mae:0.0524548
[130]	train-mae:0.0516948	test-mae:0.0524492
[140]	train-mae:0.051647	test-mae:0.0524434
[150]	train-mae:0.0516034	test-mae:0.0524396
[160]	train-mae:0.0515558	test-mae:0.052433
[0]	train-mae:0.053034	test-mae:0.0530418
[10]	train-mae:0.0527452	test-mae:0.0528298
[20]	train-mae:0.0525546	test-mae:0.0527118
[30]	train-mae:0.0524128	test-mae:0.052639
[40]	train-mae:0.0522942	test-mae:0.052584
[50]	train-mae:0.0521928	test-mae:0.05

[110]	train-mae:0.0517916	test-mae:0.0524594
[120]	train-mae:0.0517412	test-mae:0.0524516
[130]	train-mae:0.0516928	test-mae:0.052446
[140]	train-mae:0.0516464	test-mae:0.0524428
[150]	train-mae:0.051603	test-mae:0.0524346
[160]	train-mae:0.0515534	test-mae:0.0524286
[0]	train-mae:0.053034	test-mae:0.0530416
[10]	train-mae:0.0527406	test-mae:0.0528264
[20]	train-mae:0.0525482	test-mae:0.0527054
[30]	train-mae:0.0524078	test-mae:0.0526354
[40]	train-mae:0.0522912	test-mae:0.052585
[50]	train-mae:0.052194	test-mae:0.0525488
[60]	train-mae:0.052112	test-mae:0.0525254
[70]	train-mae:0.0520314	test-mae:0.0525016
[80]	train-mae:0.051968	test-mae:0.0524888
[90]	train-mae:0.0519024	test-mae:0.0524726
[100]	train-mae:0.0518442	test-mae:0.0524666
[110]	train-mae:0.05179	test-mae:0.0524596
[120]	train-mae:0.0517406	test-mae:0.0524526
[130]	train-mae:0.0516904	test-mae:0.0524454
[140]	train-mae:0.051644	test-mae:0.052441
[150]	train-mae:0.0515956	test-mae:0.0524328
[160]	train-mae:0.0515458	test-m

[50]	train-mae:0.0521934	test-mae:0.0525504
[60]	train-mae:0.0521118	test-mae:0.0525268
[70]	train-mae:0.052035	test-mae:0.052506
[80]	train-mae:0.051969	test-mae:0.052493
[90]	train-mae:0.0519058	test-mae:0.0524784
[100]	train-mae:0.051848	test-mae:0.0524716
[110]	train-mae:0.0517942	test-mae:0.0524662
[120]	train-mae:0.0517438	test-mae:0.0524584
[130]	train-mae:0.051695	test-mae:0.052452
[140]	train-mae:0.0516476	test-mae:0.0524478
[150]	train-mae:0.0516036	test-mae:0.0524432
[160]	train-mae:0.0515538	test-mae:0.0524368
[0]	train-mae:0.0530338	test-mae:0.0530414
[10]	train-mae:0.052741	test-mae:0.0528262
[20]	train-mae:0.0525478	test-mae:0.0527062
[30]	train-mae:0.0524072	test-mae:0.0526348
[40]	train-mae:0.0522924	test-mae:0.052584
[50]	train-mae:0.052195	test-mae:0.052547
[60]	train-mae:0.0521128	test-mae:0.0525214
[70]	train-mae:0.052033	test-mae:0.0524988
[80]	train-mae:0.051969	test-mae:0.0524866
[90]	train-mae:0.0519042	test-mae:0.0524716
[100]	train-mae:0.0518472	test-mae:0.05

[10]	train-mae:0.0527406	test-mae:0.0528242
[20]	train-mae:0.0525502	test-mae:0.0527056
[30]	train-mae:0.0524092	test-mae:0.0526358
[40]	train-mae:0.0522938	test-mae:0.0525852
[50]	train-mae:0.0521994	test-mae:0.0525518
[60]	train-mae:0.052118	test-mae:0.0525266
[70]	train-mae:0.052042	test-mae:0.0525058
[80]	train-mae:0.0519778	test-mae:0.0524926
[90]	train-mae:0.0519168	test-mae:0.0524804
[100]	train-mae:0.0518552	test-mae:0.0524732
[110]	train-mae:0.051799	test-mae:0.052466
[120]	train-mae:0.0517482	test-mae:0.0524586
[130]	train-mae:0.051698	test-mae:0.0524524
[140]	train-mae:0.0516494	test-mae:0.0524472
[150]	train-mae:0.051606	test-mae:0.052442
[160]	train-mae:0.0515578	test-mae:0.0524354
[170]	train-mae:0.0515112	test-mae:0.0524342
[0]	train-mae:0.0530338	test-mae:0.0530416
[10]	train-mae:0.0527436	test-mae:0.0528276
[20]	train-mae:0.0525538	test-mae:0.0527064
[30]	train-mae:0.0524162	test-mae:0.0526358
[40]	train-mae:0.0523012	test-mae:0.0525852
[50]	train-mae:0.052204	test-mae

[30]	train-mae:0.0524104	test-mae:0.0526366
[40]	train-mae:0.0522952	test-mae:0.0525854
[50]	train-mae:0.0521976	test-mae:0.0525488
[60]	train-mae:0.0521156	test-mae:0.0525244
[70]	train-mae:0.0520348	test-mae:0.0524996
[80]	train-mae:0.0519702	test-mae:0.052488
[90]	train-mae:0.051905	test-mae:0.0524712
[100]	train-mae:0.0518448	test-mae:0.052463
[110]	train-mae:0.0517902	test-mae:0.0524538
[120]	train-mae:0.0517384	test-mae:0.0524432
[130]	train-mae:0.05169	test-mae:0.0524372
[140]	train-mae:0.0516412	test-mae:0.0524304
[150]	train-mae:0.0515988	test-mae:0.052426
[160]	train-mae:0.0515492	test-mae:0.0524186
[0]	train-mae:0.053034	test-mae:0.0530416
[10]	train-mae:0.0527406	test-mae:0.0528274
[20]	train-mae:0.0525478	test-mae:0.052706
[30]	train-mae:0.052409	test-mae:0.0526358
[40]	train-mae:0.0522944	test-mae:0.0525856
[50]	train-mae:0.0521968	test-mae:0.05255
[60]	train-mae:0.0521172	test-mae:0.0525262
[70]	train-mae:0.0520378	test-mae:0.0525034
[80]	train-mae:0.0519746	test-mae:0.0

[170]	train-mae:0.0515062	test-mae:0.0524284
[0]	train-mae:0.053034	test-mae:0.0530416
[10]	train-mae:0.0527408	test-mae:0.0528264
[20]	train-mae:0.052548	test-mae:0.0527048
[30]	train-mae:0.0524078	test-mae:0.0526342
[40]	train-mae:0.0522896	test-mae:0.052579
[50]	train-mae:0.052194	test-mae:0.0525442
[60]	train-mae:0.0521122	test-mae:0.0525204
[70]	train-mae:0.052032	test-mae:0.0524984
[80]	train-mae:0.0519704	test-mae:0.05249
[90]	train-mae:0.0519054	test-mae:0.0524734
[100]	train-mae:0.0518458	test-mae:0.0524666
[110]	train-mae:0.0517924	test-mae:0.0524598
[120]	train-mae:0.0517436	test-mae:0.0524524
[130]	train-mae:0.0516956	test-mae:0.0524472
[140]	train-mae:0.0516502	test-mae:0.0524426
[150]	train-mae:0.0516048	test-mae:0.052437
[160]	train-mae:0.0515552	test-mae:0.0524294
[170]	train-mae:0.0515072	test-mae:0.052427
[180]	train-mae:0.0514612	test-mae:0.0524212
[190]	train-mae:0.0514142	test-mae:0.052417
[200]	train-mae:0.0513668	test-mae:0.0524116
[210]	train-mae:0.0513246	test-

[140]	train-mae:0.0517148	test-mae:0.052484
[150]	train-mae:0.051671	test-mae:0.05248
[160]	train-mae:0.051624	test-mae:0.0524744
[170]	train-mae:0.0515794	test-mae:0.0524724
[180]	train-mae:0.0515356	test-mae:0.0524668
[190]	train-mae:0.051489	test-mae:0.0524634
[0]	train-mae:0.053034	test-mae:0.0530416
[10]	train-mae:0.0527406	test-mae:0.0528264
[20]	train-mae:0.0525482	test-mae:0.0527054
[30]	train-mae:0.0524078	test-mae:0.0526354
[40]	train-mae:0.0522914	test-mae:0.0525848
[50]	train-mae:0.0521944	test-mae:0.0525486
[60]	train-mae:0.0521126	test-mae:0.0525266
[70]	train-mae:0.0520324	test-mae:0.0525024
[80]	train-mae:0.0519682	test-mae:0.05249
[90]	train-mae:0.0519032	test-mae:0.0524742
[100]	train-mae:0.0518428	test-mae:0.0524676
[110]	train-mae:0.0517888	test-mae:0.05246
[120]	train-mae:0.0517402	test-mae:0.0524522
[130]	train-mae:0.0516908	test-mae:0.052445
[140]	train-mae:0.0516444	test-mae:0.0524406
[150]	train-mae:0.0515974	test-mae:0.0524318
[160]	train-mae:0.0515498	test-ma

In [12]:
# One row per feature: the feature name and the CV test MAE obtained without it.
leave_one_out_mae = {'features': x_train.columns, 'mae_wo': results}
scores = pd.DataFrame(leave_one_out_mae)

In [13]:
# Baseline: cross-validate with every feature kept and store the test MAE
# from the last row of the CV frame in `base`.
dtrain = xgb.DMatrix(x_train, y_train)
cv_result = xgb.cv(
    xgb_params,
    dtrain,
    num_boost_round=500,
    nfold=5,
    early_stopping_rounds=5,
    show_stdv=False,
    verbose_eval=10,
)
base = cv_result['test-mae-mean'].iloc[-1]

[0]	train-mae:0.053034	test-mae:0.0530416
[10]	train-mae:0.0527428	test-mae:0.0528278
[20]	train-mae:0.0525476	test-mae:0.0527036
[30]	train-mae:0.0524018	test-mae:0.0526246
[40]	train-mae:0.0522866	test-mae:0.0525792
[50]	train-mae:0.052192	test-mae:0.0525458
[60]	train-mae:0.0521078	test-mae:0.0525194
[70]	train-mae:0.0520336	test-mae:0.0525024
[80]	train-mae:0.0519702	test-mae:0.0524928
[90]	train-mae:0.0519056	test-mae:0.0524804
[100]	train-mae:0.0518472	test-mae:0.052468
[110]	train-mae:0.051793	test-mae:0.0524622
[120]	train-mae:0.0517384	test-mae:0.0524562
[130]	train-mae:0.0516902	test-mae:0.0524486
[140]	train-mae:0.0516402	test-mae:0.0524448
[150]	train-mae:0.0515948	test-mae:0.0524406
[160]	train-mae:0.0515478	test-mae:0.0524376
[170]	train-mae:0.0515048	test-mae:0.0524344


In [25]:
# Features that make the model perform worse: the CV test MAE measured
# without them ('mae_wo') is below 0.05241.
scores.loc[scores['mae_wo'] < 0.05241, 'features']

2                   cosmonth
4                num_missing
7               basementsqft
10       buildingclasstypeid
13                decktypeid
17      finishedsquarefeet13
20       finishedsquarefeet6
21                      fips
23               fullbathcnt
31                   poolcnt
41            regionidcounty
45               storytypeid
47    typeconstructiontypeid
49        yardbuildingsqft17
53             fireplaceflag
55         taxvaluedollarcnt
59        taxdelinquencyflag
Name: features, dtype: object

In [17]:
# Show the baseline value: the last 'test-mae-mean' entry of the CV run on the
# full feature set, as assigned to `base` in the baseline cell.
base

0.052431600000000002

In [27]:
# Cross-validate again after dropping every feature whose leave-one-out
# MAE ('mae_wo') was below 0.05241, and display the last test MAE.
features_to_drop = scores.loc[scores['mae_wo'] < 0.05241, 'features']
dtrain = xgb.DMatrix(x_train.drop(features_to_drop, axis=1), y_train)
cv_result = xgb.cv(
    xgb_params,
    dtrain,
    num_boost_round=500,
    nfold=5,
    early_stopping_rounds=5,
    show_stdv=False,
    verbose_eval=10,
)
cv_result['test-mae-mean'].iloc[-1]

[0]	train-mae:0.0530352	test-mae:0.0530438
[10]	train-mae:0.0527544	test-mae:0.0528392
[20]	train-mae:0.0525618	test-mae:0.0527164
[30]	train-mae:0.0524182	test-mae:0.0526374
[40]	train-mae:0.0523062	test-mae:0.0525892
[50]	train-mae:0.0522126	test-mae:0.0525546
[60]	train-mae:0.052124	test-mae:0.0525234
[70]	train-mae:0.05205	test-mae:0.052501
[80]	train-mae:0.0519806	test-mae:0.0524846
[90]	train-mae:0.0519132	test-mae:0.0524682
[100]	train-mae:0.0518518	test-mae:0.052454
[110]	train-mae:0.0517926	test-mae:0.052442
[120]	train-mae:0.0517376	test-mae:0.0524346
[130]	train-mae:0.051688	test-mae:0.0524292
[140]	train-mae:0.0516394	test-mae:0.052424
[150]	train-mae:0.0515908	test-mae:0.0524186
[160]	train-mae:0.0515396	test-mae:0.0524142
[170]	train-mae:0.0514908	test-mae:0.052409
[180]	train-mae:0.051444	test-mae:0.0524052
[190]	train-mae:0.0514008	test-mae:0.0523996
[200]	train-mae:0.051357	test-mae:0.0523968
[210]	train-mae:0.051313	test-mae:0.052395


0.052395000000000004