In [18]:
%matplotlib inline
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
## Make Cost Function
from sklearn.metrics import make_scorer
import math

In [19]:
def rmsle(preds, dtrain):
    """Root Mean Squared Logarithmic Error in xgboost's custom-metric form.

    Parameters
    ----------
    preds : array-like of float
        Predicted values. Negative predictions are clipped to 0 before the
        logarithm is taken.
    dtrain : object exposing ``get_label()``
        Supplies the true labels, one per prediction.

    Returns
    -------
    tuple of (str, float)
        The metric name ``'rmsle'`` and the metric value.

    Raises
    ------
    AssertionError
        If the number of predictions differs from the number of labels.
    """
    # Work in float64 so the result matches a plain Python-float computation.
    labels = np.asarray(dtrain.get_label(), dtype=np.float64)
    preds = np.asarray(preds, dtype=np.float64)
    assert len(preds) == len(labels)
    # log1p(x) == log(x + 1); clipping keeps the log defined for negative predictions.
    log_diff = np.log1p(labels) - np.log1p(np.maximum(preds, 0.0))
    return 'rmsle', float(np.sqrt(np.mean(log_diff ** 2)))

In [50]:
# Read the train and test tables from the working directory.
train = pd.read_csv('train_c.csv')
test = pd.read_csv('test_c.csv')

# Feature set: every column of the test table after its first two.
col = test.columns[2:].tolist()
# Regression target.
label = train['price_doc']

# Wrap both tables in DMatrix objects; only the training one carries labels.
dtrain = xgb.DMatrix(data=train[col], label=label)
dtest = xgb.DMatrix(data=test[col])

In [51]:
# Booster settings passed to both xgb.cv and xgb.train below.
# Options left disabled from earlier tuning runs:
#   'eval_metric': rmsle
#   'lambda': 5
#   'min_child_weight': 5
params = dict(
    eta=0.05,              # tuning note: try 0.01,3,5
    max_depth=7,           # tuning note: try 4,5,6
    subsample=0.7,
    colsample_bytree=0.7,
    objective='reg:linear',
    silent=1
)

In [52]:
# Cross-validate with early stopping, scoring each round with the custom rmsle metric.
xgb_cvalid = xgb.cv(
    params,
    dtrain,
    num_boost_round=1000,
    early_stopping_rounds=20,
    feval=rmsle,
    seed=42,
    verbose_eval=50,
    show_stdv=False
)
# len(xgb_cvalid) is reported as the round count at which CV stopped improving.
print('Performance does not improve from ' + str(len(xgb_cvalid)) + ' rounds')

[0]	train-rmsle:2.96461	test-rmsle:2.96853
[50]	train-rmsle:0.438207	test-rmsle:0.468996
Performance does not improve from 53 rounds


In [None]:
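# Manual tuning log: xgb.cv output recorded for different settings
# ("depth" = max_depth, "mcw" = min_child_weight).
#depth at 10, lambda 5, mcw 5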
#[0]	train-rmsle:2.97956	test-rmsle:2.98573
#[50]	train-rmsle:0.397519	test-rmsle:0.465332
#Performance does not improve from 57 rounds
#depth at 8
#[0]	train-rmsle:2.97237	test-rmsle:2.97984
#[50]	train-rmsle:0.419887	test-rmsle:0.46697
#Performance does not improve from 53 rounds
# depth at 10, mcw at 3,lambda at 5
#[0]	train-rmsle:2.98961	test-rmsle:2.99765
#[50]	train-rmsle:0.392596	test-rmsle:0.466047
#Performance does not improve from 55 rounds
# depth at 10
#[0]	train-rmsle:2.98903	test-rmsle:3.0103
#[50]	train-rmsle:0.371999	test-rmsle:0.464972
#Performance does not improve from 53 rounds
# depth at 7
#[0]	train-rmsle:2.96461	test-rmsle:2.96853
#[50]	train-rmsle:0.438205	test-rmsle:0.468994
#Performance does not improve from 53 rounds
# depth at 6 (no result recorded)

#depth at 4
#[0]	train-rmsle:2.94682	test-rmsle:2.9468
#[50]	train-rmsle:0.473494	test-rmsle:0.479218
#depth at 4, eta 0.04
#[0]	train-rmsle:3.16696	test-rmsle:3.16691
#[50]	train-rmsle:0.476906	test-rmsle:0.482038
#Performance does not improve from 66 rounds
#depth at 4, eta 0.02
#[0]	train-rmsle:3.85298	test-rmsle:3.85286
#[50]	train-rmsle:0.590174	test-rmsle:0.593271
#[100]	train-rmsle:0.47717	test-rmsle:0.482508
#Performance does not improve from 129 rounds
#depth at 3
#[0]	train-rmsle:2.94186	test-rmsle:2.94259
#[50]	train-rmsle:0.481606	test-rmsle:0.484897
#Performance does not improve from 50 rounds
#depth at 4
#[0]	train-rmsle:2.94682	test-rmsle:2.9468
#[50]	train-rmsle:0.473494	test-rmsle:0.479218
#depth at 5
#[0]	train-rmsle:2.95171	test-rmsle:2.95184
#[50]	train-rmsle:0.464171	test-rmsle:0.474858
#Performance does not improve from 51 rounds

In [53]:
# Train the final booster on the full training set.
# The round count is taken from the cross-validation result (the same
# len(xgb_cvalid) that the CV cell prints) rather than a hand-copied number,
# so it stays in sync whenever `params` or the data change.
model = xgb.train(params, dtrain, num_boost_round=len(xgb_cvalid), feval=rmsle)

In [54]:
def get_feature_importance(model):
    """Tabulate a model's feature scores, highest score first.

    Reads ``model.get_fscore()`` (a mapping of feature name to score) and
    returns a DataFrame with the columns ``Feature`` and ``Score``, sorted by
    ``Score`` in descending order. Row labels keep each feature's position in
    the mapping, so they are not consecutive after sorting.
    """
    fscore = model.get_fscore()
    names = list(fscore.keys())
    scores = list(fscore.values())
    table = pd.DataFrame({'Feature': names, 'Score': scores})
    return table.sort_values(by='Score', ascending=False)


In [55]:
# Display the trained model's feature scores, sorted from highest to lowest.
get_feature_importance(model)

Unnamed: 0,Feature,Score
189,full_sq,710
126,life_sq,177
93,floor,171
200,max_floor,164
72,build_year,160
171,kitch_sq,111
76,state,88
133,num_room,83
256,swim_pool_km,67
71,metro_min_avto,65


In [56]:
# Predict the target for every row of the test DMatrix.
pred = model.predict(dtest)

In [57]:
## Inspect the predictions (eyeball their scale before building the submission).
pred

array([ 4947929.,  7851186.,  5323283., ...,  4587681.,  5244095.,
        7605561.], dtype=float32)

In [58]:
# Build the submission table: the test set's id column next to the predicted price_doc.
sub = pd.DataFrame({'id':test['id'],'price_doc':pred})

In [59]:
# Write the submission without the index column.
# NOTE: despite the file name, RMSLE is only the evaluation metric (feval) in
# this notebook; the training objective set in `params` is 'reg:linear'.
sub.to_csv('sub_RMSLEasLossFunction.csv',index=False)