## Outline
* xgboost fit of data
* clean data, run xgboost
* tune hyperparameters by stepping through them. Thens runs xgboost again

In [2]:
# Parameters
prediction_stderr = 0.0073  #  assumed standard error of predictions
                          #  (smaller values make output closer to input)
train_test_logmean_diff = 0.1  # assumed shift used to adjust frequencies for time trend
probthresh = 90  # minimum probability*frequency to use new price instead of just rounding
rounder = 2  # number of places left of decimal point to zero

import numpy as np
import pandas as pd
from sklearn import model_selection, preprocessing
import xgboost as xgb

import datetime
from scipy.stats import norm
    
#load files
train = pd.read_csv('train/train.csv', parse_dates=['timestamp'])
test = pd.read_csv('test/test.csv', parse_dates=['timestamp'])
id_test = test.id

#clean data
print('Data Clean...')
bad_index = train[train.life_sq > train.full_sq].index
train.loc[bad_index, "life_sq"] = np.NaN
equal_index = [601,1896,2791]
test.loc[equal_index, "life_sq"] = test.loc[equal_index, "full_sq"]
bad_index = test[test.life_sq > test.full_sq].index
test.loc[bad_index, "life_sq"] = np.NaN
bad_index = train[train.life_sq < 5].index
train.loc[bad_index, "life_sq"] = np.NaN
bad_index = test[test.life_sq < 5].index
test.loc[bad_index, "life_sq"] = np.NaN
bad_index = train[train.full_sq < 5].index
train.loc[bad_index, "full_sq"] = np.NaN
bad_index = test[test.full_sq < 5].index
test.loc[bad_index, "full_sq"] = np.NaN
kitch_is_build_year = [13117]
train.loc[kitch_is_build_year, "build_year"] = train.loc[kitch_is_build_year, "kitch_sq"]
bad_index = train[train.kitch_sq >= train.life_sq].index
train.loc[bad_index, "kitch_sq"] = np.NaN
bad_index = test[test.kitch_sq >= test.life_sq].index
test.loc[bad_index, "kitch_sq"] = np.NaN
bad_index = train[(train.kitch_sq == 0).values + (train.kitch_sq == 1).values].index
train.loc[bad_index, "kitch_sq"] = np.NaN
bad_index = test[(test.kitch_sq == 0).values + (test.kitch_sq == 1).values].index
test.loc[bad_index, "kitch_sq"] = np.NaN
bad_index = train[(train.full_sq > 210) & (train.life_sq / train.full_sq < 0.3)].index
train.loc[bad_index, "full_sq"] = np.NaN
bad_index = test[(test.full_sq > 150) & (test.life_sq / test.full_sq < 0.3)].index
test.loc[bad_index, "full_sq"] = np.NaN
bad_index = train[train.life_sq > 300].index
train.loc[bad_index, ["life_sq", "full_sq"]] = np.NaN
bad_index = test[test.life_sq > 200].index
test.loc[bad_index, ["life_sq", "full_sq"]] = np.NaN
train.product_type.value_counts(normalize= True)
test.product_type.value_counts(normalize= True)
bad_index = train[train.build_year < 1500].index
train.loc[bad_index, "build_year"] = np.NaN
bad_index = test[test.build_year < 1500].index
test.loc[bad_index, "build_year"] = np.NaN
bad_index = train[train.num_room == 0].index
train.loc[bad_index, "num_room"] = np.NaN
bad_index = test[test.num_room == 0].index
test.loc[bad_index, "num_room"] = np.NaN
bad_index = [10076, 11621, 17764, 19390, 24007, 26713, 29172]
train.loc[bad_index, "num_room"] = np.NaN
bad_index = [3174, 7313]
test.loc[bad_index, "num_room"] = np.NaN
bad_index = train[(train.floor == 0).values * (train.max_floor == 0).values].index
train.loc[bad_index, ["max_floor", "floor"]] = np.NaN
bad_index = train[train.floor == 0].index
train.loc[bad_index, "floor"] = np.NaN
bad_index = train[train.max_floor == 0].index
train.loc[bad_index, "max_floor"] = np.NaN
bad_index = test[test.max_floor == 0].index
test.loc[bad_index, "max_floor"] = np.NaN
bad_index = train[train.floor > train.max_floor].index
train.loc[bad_index, "max_floor"] = np.NaN
bad_index = test[test.floor > test.max_floor].index
test.loc[bad_index, "max_floor"] = np.NaN
train.floor.describe(percentiles= [0.9999])
bad_index = [23584]
train.loc[bad_index, "floor"] = np.NaN
train.material.value_counts()
test.material.value_counts()
train.state.value_counts()
bad_index = train[train.state == 33].index
train.loc[bad_index, "state"] = np.NaN
test.state.value_counts()

# brings error down a lot by removing extreme price per sqm
train.loc[train.full_sq == 0, 'full_sq'] = 50
train = train[train.price_doc/train.full_sq <= 600000]
train = train[train.price_doc/train.full_sq >= 10000]

print('Feature Engineering...')
# Add month-year
month_year = (train.timestamp.dt.month*30 + train.timestamp.dt.year * 365)
month_year_cnt_map = month_year.value_counts().to_dict()
train['month_year_cnt'] = month_year.map(month_year_cnt_map)

month_year = (test.timestamp.dt.month*30 + test.timestamp.dt.year * 365)
month_year_cnt_map = month_year.value_counts().to_dict()
test['month_year_cnt'] = month_year.map(month_year_cnt_map)

# Add week-year count
week_year = (train.timestamp.dt.weekofyear*7 + train.timestamp.dt.year * 365)
week_year_cnt_map = week_year.value_counts().to_dict()
train['week_year_cnt'] = week_year.map(week_year_cnt_map)

week_year = (test.timestamp.dt.weekofyear*7 + test.timestamp.dt.year * 365)
week_year_cnt_map = week_year.value_counts().to_dict()
test['week_year_cnt'] = week_year.map(week_year_cnt_map)

# Add month and day-of-week
train['month'] = train.timestamp.dt.month
train['dow'] = train.timestamp.dt.dayofweek

test['month'] = test.timestamp.dt.month
test['dow'] = test.timestamp.dt.dayofweek

# Other feature engineering
train['rel_floor'] = 0.05+train['floor'] / train['max_floor'].astype(float)
train['rel_kitch_sq'] = 0.05+train['kitch_sq'] / train['full_sq'].astype(float)

test['rel_floor'] = 0.05+test['floor'] / test['max_floor'].astype(float)
test['rel_kitch_sq'] = 0.05+test['kitch_sq'] / test['full_sq'].astype(float)

train.apartment_name=train.sub_area + train['metro_km_avto'].astype(str)
test.apartment_name=test.sub_area + train['metro_km_avto'].astype(str)

train['room_size'] = train['life_sq'] / train['num_room'].astype(float)
test['room_size'] = test['life_sq'] / test['num_room'].astype(float)

train['area_per_room'] = train['life_sq'] / train['num_room'].astype(float) #rough area per room
train['livArea_ratio'] = train['life_sq'] / train['full_sq'].astype(float) #rough living area
train['yrs_old'] = 2017 - train['build_year'].astype(float) #years old from 2017
train['avgfloor_sq'] = train['life_sq']/train['max_floor'].astype(float) #living area per floor
train['pts_floor_ratio'] = train['public_transport_station_km']/train['max_floor'].astype(float)
# looking for significance of apartment buildings near public t 
train['room_size'] = train['life_sq'] / train['num_room'].astype(float)
# doubled a var by accident
# when removing one score did not improve...
train['gender_ratio'] = train['male_f']/train['female_f'].astype(float)
train['kg_park_ratio'] = train['kindergarten_km']/train['park_km'].astype(float) #significance of children?
train['high_ed_extent'] = train['school_km'] / train['kindergarten_km'] #schooling
train['pts_x_state'] = train['public_transport_station_km'] * train['state'].astype(float) #public trans * state of listing
train['lifesq_x_state'] = train['life_sq'] * train['state'].astype(float) #life_sq times the state of the place
train['floor_x_state'] = train['floor'] * train['state'].astype(float) #relative floor * the state of the place

test['area_per_room'] = test['life_sq'] / test['num_room'].astype(float)
test['livArea_ratio'] = test['life_sq'] / test['full_sq'].astype(float)
test['yrs_old'] = 2017 - test['build_year'].astype(float)
test['avgfloor_sq'] = test['life_sq']/test['max_floor'].astype(float) #living area per floor
test['pts_floor_ratio'] = test['public_transport_station_km']/test['max_floor'].astype(float) #apartments near public t?
test['room_size'] = test['life_sq'] / test['num_room'].astype(float)
test['gender_ratio'] = test['male_f']/test['female_f'].astype(float)
test['kg_park_ratio'] = test['kindergarten_km']/test['park_km'].astype(float)
test['high_ed_extent'] = test['school_km'] / test['kindergarten_km']
test['pts_x_state'] = test['public_transport_station_km'] * test['state'].astype(float) #public trans * state of listing
test['lifesq_x_state'] = test['life_sq'] * test['state'].astype(float)
test['floor_x_state'] = test['floor'] * test['state'].astype(float)

#########################################################################
print('Rate Mults...')
# Aggreagte house price data derived from 
# http://www.globalpropertyguide.com/real-estate-house-prices/R#russia
# by luckyzhou
# See https://www.kaggle.com/luckyzhou/lzhou-test/comments

rate_2015_q2 = 1
rate_2015_q1 = rate_2015_q2 / 0.9932
rate_2014_q4 = rate_2015_q1 / 1.0112
rate_2014_q3 = rate_2014_q4 / 1.0169
rate_2014_q2 = rate_2014_q3 / 1.0086
rate_2014_q1 = rate_2014_q2 / 1.0126
rate_2013_q4 = rate_2014_q1 / 0.9902
rate_2013_q3 = rate_2013_q4 / 1.0041
rate_2013_q2 = rate_2013_q3 / 1.0044
rate_2013_q1 = rate_2013_q2 / 1.0104  # This is 1.002 (relative to mult), close to 1:
rate_2012_q4 = rate_2013_q1 / 0.9832  #     maybe use 2013q1 as a base quarter and get rid of mult?
rate_2012_q3 = rate_2012_q4 / 1.0277
rate_2012_q2 = rate_2012_q3 / 1.0279
rate_2012_q1 = rate_2012_q2 / 1.0279
rate_2011_q4 = rate_2012_q1 / 1.076
rate_2011_q3 = rate_2011_q4 / 1.0236
rate_2011_q2 = rate_2011_q3 / 1
rate_2011_q1 = rate_2011_q2 / 1.011


# train 2015
train['average_q_price'] = 1

train_2015_q2_index = train.loc[train['timestamp'].dt.year == 2015].loc[train['timestamp'].dt.month >= 4].loc[train['timestamp'].dt.month < 7].index
train.loc[train_2015_q2_index, 'average_q_price'] = rate_2015_q2

train_2015_q1_index = train.loc[train['timestamp'].dt.year == 2015].loc[train['timestamp'].dt.month >= 1].loc[train['timestamp'].dt.month < 4].index
train.loc[train_2015_q1_index, 'average_q_price'] = rate_2015_q1


# train 2014
train_2014_q4_index = train.loc[train['timestamp'].dt.year == 2014].loc[train['timestamp'].dt.month >= 10].loc[train['timestamp'].dt.month <= 12].index
train.loc[train_2014_q4_index, 'average_q_price'] = rate_2014_q4

train_2014_q3_index = train.loc[train['timestamp'].dt.year == 2014].loc[train['timestamp'].dt.month >= 7].loc[train['timestamp'].dt.month < 10].index
train.loc[train_2014_q3_index, 'average_q_price'] = rate_2014_q3

train_2014_q2_index = train.loc[train['timestamp'].dt.year == 2014].loc[train['timestamp'].dt.month >= 4].loc[train['timestamp'].dt.month < 7].index
train.loc[train_2014_q2_index, 'average_q_price'] = rate_2014_q2

train_2014_q1_index = train.loc[train['timestamp'].dt.year == 2014].loc[train['timestamp'].dt.month >= 1].loc[train['timestamp'].dt.month < 4].index
train.loc[train_2014_q1_index, 'average_q_price'] = rate_2014_q1


# train 2013
train_2013_q4_index = train.loc[train['timestamp'].dt.year == 2013].loc[train['timestamp'].dt.month >= 10].loc[train['timestamp'].dt.month <= 12].index
train.loc[train_2013_q4_index, 'average_q_price'] = rate_2013_q4

train_2013_q3_index = train.loc[train['timestamp'].dt.year == 2013].loc[train['timestamp'].dt.month >= 7].loc[train['timestamp'].dt.month < 10].index
train.loc[train_2013_q3_index, 'average_q_price'] = rate_2013_q3

train_2013_q2_index = train.loc[train['timestamp'].dt.year == 2013].loc[train['timestamp'].dt.month >= 4].loc[train['timestamp'].dt.month < 7].index
train.loc[train_2013_q2_index, 'average_q_price'] = rate_2013_q2

train_2013_q1_index = train.loc[train['timestamp'].dt.year == 2013].loc[train['timestamp'].dt.month >= 1].loc[train['timestamp'].dt.month < 4].index
train.loc[train_2013_q1_index, 'average_q_price'] = rate_2013_q1


# train 2012
train_2012_q4_index = train.loc[train['timestamp'].dt.year == 2012].loc[train['timestamp'].dt.month >= 10].loc[train['timestamp'].dt.month <= 12].index
train.loc[train_2012_q4_index, 'average_q_price'] = rate_2012_q4

train_2012_q3_index = train.loc[train['timestamp'].dt.year == 2012].loc[train['timestamp'].dt.month >= 7].loc[train['timestamp'].dt.month < 10].index
train.loc[train_2012_q3_index, 'average_q_price'] = rate_2012_q3

train_2012_q2_index = train.loc[train['timestamp'].dt.year == 2012].loc[train['timestamp'].dt.month >= 4].loc[train['timestamp'].dt.month < 7].index
train.loc[train_2012_q2_index, 'average_q_price'] = rate_2012_q2

train_2012_q1_index = train.loc[train['timestamp'].dt.year == 2012].loc[train['timestamp'].dt.month >= 1].loc[train['timestamp'].dt.month < 4].index
train.loc[train_2012_q1_index, 'average_q_price'] = rate_2012_q1


# train 2011
train_2011_q4_index = train.loc[train['timestamp'].dt.year == 2011].loc[train['timestamp'].dt.month >= 10].loc[train['timestamp'].dt.month <= 12].index
train.loc[train_2011_q4_index, 'average_q_price'] = rate_2011_q4

train_2011_q3_index = train.loc[train['timestamp'].dt.year == 2011].loc[train['timestamp'].dt.month >= 7].loc[train['timestamp'].dt.month < 10].index
train.loc[train_2011_q3_index, 'average_q_price'] = rate_2011_q3

train_2011_q2_index = train.loc[train['timestamp'].dt.year == 2011].loc[train['timestamp'].dt.month >= 4].loc[train['timestamp'].dt.month < 7].index
train.loc[train_2011_q2_index, 'average_q_price'] = rate_2011_q2

train_2011_q1_index = train.loc[train['timestamp'].dt.year == 2011].loc[train['timestamp'].dt.month >= 1].loc[train['timestamp'].dt.month < 4].index
train.loc[train_2011_q1_index, 'average_q_price'] = rate_2011_q1

train['price_doc'] = train['price_doc'] * train['average_q_price']


#########################################################################################################

mult = 1.054880504
train['price_doc'] = train['price_doc'] * mult
y_train = train["price_doc"]





Data Clean...
Feature Engineering...
Rate Mults...


In [3]:
#########################################################################################################
print('Running Model 1...')
x_train = train.drop(["id", "timestamp", "price_doc", "average_q_price"], axis=1)
#x_test = test.drop(["id", "timestamp", "average_q_price"], axis=1)
x_test = test.drop(["id", "timestamp"], axis=1)

num_train = len(x_train)
x_all = pd.concat([x_train, x_test])

for c in x_all.columns:
    if x_all[c].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(x_all[c].values))
        x_all[c] = lbl.transform(list(x_all[c].values))

x_train = x_all[:num_train]
x_test = x_all[num_train:]


xgb_params = {
    'eta': 0.05,
    'max_depth': 6,
    'subsample': 0.6,
    'colsample_bytree': 1,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1
}

dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test)


num_boost_rounds = 422
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)


y_predict = model.predict(dtest)
gunja_output = pd.DataFrame({'id': id_test, 'price_doc': y_predict})

Running Model 1...


In [5]:
#gunja_output    
def output_predictions(name,id_col,predictions):
    df_out = pd.DataFrame({"id":id_col,"price_doc":predictions}) 
    print(df_out.head())
    df_out.to_csv(name+"_sberbank_submission.csv",index=False)

In [7]:
output_predictions("xgb_test",gunja_output['id'],gunja_output['price_doc'])

      id  price_doc
0  30474  5736830.0
1  30475  8332985.0
2  30476  5280246.0
3  30477  6184072.5
4  30478  5146293.5


In [10]:
#xgb CV wise
xgb_params = {
    'eta': 0.05,
    'max_depth': 6,
    'subsample': 0.6,
    'colsample_bytree': 1,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1,
    'seed':31
}

cv_model = xgb.cv(xgb_params,dtrain,num_boost_round=10,nfold=3,early_stopping_rounds=5,
                  metrics='rmse',verbose_eval=True,seed=0)

[0]	train-rmse:8.36865e+06+27692.7	test-rmse:8.3738e+06+59243.4
[1]	train-rmse:8.0048e+06+27514.1	test-rmse:8.01859e+06+59038
[2]	train-rmse:7.66105e+06+26129.3	test-rmse:7.68296e+06+61726.1
[3]	train-rmse:7.33669e+06+24636	test-rmse:7.36705e+06+60727.3
[4]	train-rmse:7.02794e+06+23095.5	test-rmse:7.06482e+06+61744.6
[5]	train-rmse:6.73603e+06+23907.9	test-rmse:6.77828e+06+61729.1
[6]	train-rmse:6.45823e+06+22695	test-rmse:6.50789e+06+65029.1
[7]	train-rmse:6.19772e+06+19930.9	test-rmse:6.25695e+06+66301.5
[8]	train-rmse:5.94832e+06+17804.1	test-rmse:6.01581e+06+66871.9
[9]	train-rmse:5.71484e+06+14583.9	test-rmse:5.79365e+06+68387.8


In [9]:
cv_model

Unnamed: 0,test-rmse-mean,test-rmse-std,train-rmse-mean,train-rmse-std
0,8375226.0,57825.711765,8368704.0,26603.592446
1,8019868.0,57768.651312,8003090.0,25955.592
2,7682980.0,54570.353826,7656902.0,24408.849059
3,7366354.0,55516.097154,7330665.0,21762.852283
4,7067743.0,56335.852322,7023327.0,20771.963903
5,6786340.0,55442.662175,6731077.0,20911.6244
6,6516556.0,55200.816524,6455029.0,19762.047238
7,6261720.0,57276.965956,6193815.0,18649.531349
8,6024179.0,55794.65729,5947300.0,19297.697247
9,5800648.0,54773.309843,5715952.0,18365.622972


In [60]:
#trying to tune the various parameters
# xgboost.XGBRegressor(max_depth=3, learning_rate=0.1, n_estimators=100, silent=True, objective='reg:linear', 
#                      booster='gbtree', n_jobs=1, nthread=None, gamma=0, min_child_weight=1, 
#                      max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1, 
#                      reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5, 
#                      random_state=0, seed=None, missing=None)

# xgb_params = {
#     'eta': 0.05,
#     'max_depth': 6,
#     'subsample': 0.6,
#     'colsample_bytree': 1,
#     'objective': 'reg:linear',
#     'eval_metric': 'rmse',
#     'silent': 1,
#     'seed':31
# }

xgb1 = xgb.XGBRegressor(max_depth=6, learning_rate=0.05, n_estimators=50, silent=False, objective='reg:linear', 
                     subsample=0.6, colsample_bytree=1,seed=31)



In [42]:
#custom scorer: root_mean_squared_error
from sklearn.metrics import mean_squared_error

def rmsle_scorer(estimator,X,y):
    y_pred = estimator.predict(X)
    return np.sqrt(np.mean(np.square(np.log(y_pred+0.1) - np.log(y+0.1))))
    #return mean_squared_error(y,y_pred)**0.5


In [61]:
from sklearn.model_selection import GridSearchCV

gs_parameters = {   
    'max_depth':range(3,10,2),
    'min_child_weight':range(1,6,2)
}
xgb_gs1 = GridSearchCV(xgb1, param_grid=gs_parameters,scoring=rmsle_scorer,cv=3)
#xgb_gs1 = GridSearchCV(xgb1, param_grid=gs_parameters,scoring='neg_mean_squared_error',cv=2)

In [62]:
xgb_gs1.fit(x_train,y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.05, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=None, n_estimators=50, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=31, silent=False, subsample=0.6),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_depth': range(3, 10, 2), 'min_child_weight': range(1, 6, 2)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=<function rmsle_scorer at 0x000000000C191730>, verbose=0)

In [57]:
def describe_grid_model(grid_model):
    #best_model = grid_model.best_estimator_
    df_grid = pd.DataFrame(grid_model.cv_results_)
    print(df_grid['params'])
    print(df_grid['mean_test_score'])

In [63]:
describe_grid_model(xgb_gs1)

0     {'max_depth': 3, 'min_child_weight': 1}
1     {'max_depth': 3, 'min_child_weight': 3}
2     {'max_depth': 3, 'min_child_weight': 5}
3     {'max_depth': 5, 'min_child_weight': 1}
4     {'max_depth': 5, 'min_child_weight': 3}
5     {'max_depth': 5, 'min_child_weight': 5}
6     {'max_depth': 7, 'min_child_weight': 1}
7     {'max_depth': 7, 'min_child_weight': 3}
8     {'max_depth': 7, 'min_child_weight': 5}
9     {'max_depth': 9, 'min_child_weight': 1}
10    {'max_depth': 9, 'min_child_weight': 3}
11    {'max_depth': 9, 'min_child_weight': 5}
Name: params, dtype: object
0     0.492214
1     0.492418
2     0.492429
3     0.485810
4     0.485080
5     0.485643
6     0.483506
7     0.482554
8     0.482376
9     0.481677
10    0.481226
11    0.481385
Name: mean_test_score, dtype: float64


In [64]:
xgb2 = xgb.XGBRegressor(max_depth=9,min_child_weight=3, learning_rate=0.05, n_estimators=50, silent=False, objective='reg:linear', 
                     subsample=0.6, colsample_bytree=1,seed=31)

gs_parameters = {   
    'gamma':[i/10.0 for i in range(0,5)]
}
xgb_gs2 = GridSearchCV(xgb2, param_grid=gs_parameters,scoring=rmsle_scorer,cv=3)

In [65]:
xgb_gs2.fit(x_train,y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.05, max_delta_step=0, max_depth=9,
       min_child_weight=3, missing=None, n_estimators=50, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=31, silent=False, subsample=0.6),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'gamma': [0.0, 0.1, 0.2, 0.3, 0.4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=<function rmsle_scorer at 0x000000000C191730>, verbose=0)

In [66]:
describe_grid_model(xgb_gs2)

0    {'gamma': 0.0}
1    {'gamma': 0.1}
2    {'gamma': 0.2}
3    {'gamma': 0.3}
4    {'gamma': 0.4}
Name: params, dtype: object
0    0.481226
1    0.481226
2    0.481226
3    0.481226
4    0.481226
Name: mean_test_score, dtype: float64


In [67]:
xgb3 = xgb.XGBRegressor(gamma=0.0,max_depth=9,min_child_weight=3, learning_rate=0.05, n_estimators=50, silent=False, objective='reg:linear', 
                     subsample=0.6, colsample_bytree=1,seed=31)

gs_parameters = {
 'subsample':[i/10.0 for i in range(6,10)],
 'colsample_bytree':[i/10.0 for i in range(6,10)]
}
xgb_gs3 = GridSearchCV(xgb3, param_grid=gs_parameters,scoring=rmsle_scorer,cv=3)

xgb_gs3.fit(x_train,y_train)
describe_grid_model(xgb_gs3)

0     {'colsample_bytree': 0.6, 'subsample': 0.6}
1     {'colsample_bytree': 0.6, 'subsample': 0.7}
2     {'colsample_bytree': 0.6, 'subsample': 0.8}
3     {'colsample_bytree': 0.6, 'subsample': 0.9}
4     {'colsample_bytree': 0.7, 'subsample': 0.6}
5     {'colsample_bytree': 0.7, 'subsample': 0.7}
6     {'colsample_bytree': 0.7, 'subsample': 0.8}
7     {'colsample_bytree': 0.7, 'subsample': 0.9}
8     {'colsample_bytree': 0.8, 'subsample': 0.6}
9     {'colsample_bytree': 0.8, 'subsample': 0.7}
10    {'colsample_bytree': 0.8, 'subsample': 0.8}
11    {'colsample_bytree': 0.8, 'subsample': 0.9}
12    {'colsample_bytree': 0.9, 'subsample': 0.6}
13    {'colsample_bytree': 0.9, 'subsample': 0.7}
14    {'colsample_bytree': 0.9, 'subsample': 0.8}
15    {'colsample_bytree': 0.9, 'subsample': 0.9}
Name: params, dtype: object
0     0.487711
1     0.482977
2     0.484385
3     0.485728
4     0.494017
5     0.486590
6     0.490458
7     0.488007
8     0.483777
9     0.481597
10    0.484650
11    0

In [68]:
xgb4 = xgb.XGBRegressor(subsample=0.6, colsample_bytree=0.9,gamma=0.0,max_depth=9,min_child_weight=3,
                        learning_rate=0.05, n_estimators=50, silent=False, objective='reg:linear', 
                     seed=31)

gs_parameters = {
    'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
}
xgb_gs4 = GridSearchCV(xgb4, param_grid=gs_parameters,scoring=rmsle_scorer,cv=3)

xgb_gs4.fit(x_train,y_train)
describe_grid_model(xgb_gs4)

0    {'reg_alpha': 1e-05}
1     {'reg_alpha': 0.01}
2      {'reg_alpha': 0.1}
3        {'reg_alpha': 1}
4      {'reg_alpha': 100}
Name: params, dtype: object
0    0.480889
1    0.480889
2    0.480889
3    0.480889
4    0.480889
Name: mean_test_score, dtype: float64


In [69]:
xgb_params = {
    'eta': 0.01,
    'max_depth': 9,
    'subsample': 0.6,
    'min_child_weight': 3,
    #'gamma': 0.0,
    'colsample_bytree': 0.9,
    #'alpha': ,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1,
    'seed':31
}

cv_model = xgb.cv(xgb_params,dtrain,num_boost_round=1000,nfold=3,early_stopping_rounds=5,
                  metrics='rmse',verbose_eval=True,seed=0)


[0]	train-rmse:8.67465e+06+28126	test-rmse:8.67671e+06+57234.2
[1]	train-rmse:8.59738e+06+28010.5	test-rmse:8.60156e+06+57435.7
[2]	train-rmse:8.52152e+06+27455	test-rmse:8.52757e+06+57755.4
[3]	train-rmse:8.44648e+06+27309.4	test-rmse:8.45441e+06+57646.7
[4]	train-rmse:8.37173e+06+26832.9	test-rmse:8.38132e+06+58307.3
[5]	train-rmse:8.29835e+06+26111.6	test-rmse:8.31113e+06+58998.4
[6]	train-rmse:8.22471e+06+26025.9	test-rmse:8.23976e+06+59525.5
[7]	train-rmse:8.15257e+06+24906.1	test-rmse:8.16975e+06+60382.3
[8]	train-rmse:8.08042e+06+24616.5	test-rmse:8.10026e+06+60413.2
[9]	train-rmse:8.00904e+06+23956.7	test-rmse:8.03144e+06+60858.9
[10]	train-rmse:7.93887e+06+23667.3	test-rmse:7.96362e+06+60769.2
[11]	train-rmse:7.86998e+06+24134.4	test-rmse:7.89703e+06+60330.5
[12]	train-rmse:7.80193e+06+24799.7	test-rmse:7.83117e+06+58497
[13]	train-rmse:7.73376e+06+24400.4	test-rmse:7.76522e+06+58549.2
[14]	train-rmse:7.66659e+06+24415.7	test-rmse:7.69994e+06+57920
[15]	train-rmse:7.60001e+06+

In [70]:
xgb_params = {
    'eta': 0.01,
    'max_depth': 9,
    'subsample': 0.6,
    'min_child_weight': 3,
    #'gamma': 0.0,
    'colsample_bytree': 0.9,
    #'alpha': ,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1,
    'seed':31
}

num_boost_rounds = 840
model2 = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)


In [71]:
y_predict = model2.predict(dtest)
sberbank_output = pd.DataFrame({'id': id_test, 'price_doc': y_predict})

output_predictions("xgb_attempt_2",sberbank_output['id'],sberbank_output['price_doc'])

      id  price_doc
0  30474  5746386.0
1  30475  8466926.0
2  30476  5669385.5
3  30477  6149756.0
4  30478  5242248.0
