In [1]:
#
#     The data used in this notebook was cleaned by the script by MadScientist:
#     https://www.kaggle.com/keremt/very-extensive-cleaning-by-sberbank-discussions
#
#

import pandas as pd
import numpy as np
import xgboost as xgb
def preprocess(data):
    #add relative floor
    data['rel_floor'] = data['floor']/data['max_floor']
    #add relative kitch_sq
    data['relative_kitch_sq'] = data['kitch_sq']/data['full_sq']
    #add room size
    data['room_size'] = data['life_sq']/data['num_room']
    # add month and day of week
    data['month'] = data.timestamp.dt.month.astype(int)
    data['day_of_week'] = data.timestamp.dt.dayofweek.astype(int)
    
    data['bought_minus_built'] = data.timestamp.dt.year.astype(int) - data['build_year']
    
    data.loc[data['full_sq']==data['life_sq'],'life_sq'] = np.nan
    return data
def fact_binary(data):
    data = data.applymap(lambda x: x if x!='yes' else 1)
    data = data.applymap(lambda x: x if x!='no' else 0)
    data = data.applymap(lambda x: x if x!='OwnerOccupier' else 0)
    data = data.applymap(lambda x: x if x!='Investment' else 1)
    return data
class sep_estimator:
    def __init__(self,owner_params,investment_params):
        self.est1 = xgb.XGBRegressor(**owner_params)
        self.est2 = xgb.XGBRegressor(**investment_params)
    def preprocess_owner(self,data,mode):
        assert (data['product_type'].values==0).all()
        if mode=='predict':
            data.loc[data['full_sq'].isnull(),'full_sq'] = 50
        data = data.drop('timestamp',axis=1)
        return data
    def preprocess_investment(self,data,mode):
        if mode=='predict':
            assert (data['product_type'].values==1).all()
            data.loc[data['full_sq'].isnull(),'full_sq'] = 50
        data = data.drop('timestamp',axis=1)
        return data
    def fit(self,X,y):
        X1 = X[X['product_type']==0]
        X2 = X
        X1 = self.preprocess_owner(X1,'train')
        y1 = y.loc[X1.index.values]/X1['full_sq']
        X2 = self.preprocess_investment(X2,'train')
        y2 = y
        y2 = y2/X2['full_sq']
        if len(X1)>0:
            self.est1.fit(X1,y1)
        if len(X2)>0:
            self.est2.fit(X2,y2)
    def predict(self,X):
        X1 = X[X['product_type']==0]
        X2 = X[X['product_type']==1]
        owner_index = X1.index.values
        investment_index = X.index.drop(owner_index).values
        X1 = self.preprocess_owner(X1,'predict')
        X2 = self.preprocess_investment(X2,'predict')
        res = pd.DataFrame(index=X.index)
        if len(X1)>0:
            pred1 = self.est1.predict(X1)
            res.loc[owner_index,0] = pred1*X1['full_sq']
        if len(X2)>0:
            pred2 = self.est2.predict(X2)
            res.loc[investment_index,0] = pred2*X2['full_sq']     
        return res[0].values.flatten()



In [2]:
data = pd.read_csv('train_corr_clean.csv',index_col='id',parse_dates=['timestamp'])
data = fact_binary(data)
data = preprocess(data)
data = data.drop(['sub_area','ecology'],axis=1)
data = data.loc[~data['full_sq'].isnull(),:]
train = data

In [3]:
#
#    Price level multipliers here. I've copied it from Andy Harless script:
#    https://www.kaggle.com/aharless/exercising-the-exorcism
#
#

rate_2015_q2 = 1
rate_2015_q1 = rate_2015_q2 / 0.9932
rate_2014_q4 = rate_2015_q1 / 1.0112
rate_2014_q3 = rate_2014_q4 / 1.0169
rate_2014_q2 = rate_2014_q3 / 1.0086
rate_2014_q1 = rate_2014_q2 / 1.0126
rate_2013_q4 = rate_2014_q1 / 0.9902
rate_2013_q3 = rate_2013_q4 / 1.0041
rate_2013_q2 = rate_2013_q3 / 1.0044
rate_2013_q1 = rate_2013_q2 / 1.0104  # This is 1.002 (relative to mult), close to 1:
rate_2012_q4 = rate_2013_q1 / 0.9832  #     maybe use 2013q1 as a base quarter and get rid of mult?
rate_2012_q3 = rate_2012_q4 / 1.0277
rate_2012_q2 = rate_2012_q3 / 1.0279
rate_2012_q1 = rate_2012_q2 / 1.0279
rate_2011_q4 = rate_2012_q1 / 1.076
rate_2011_q3 = rate_2011_q4 / 1.0236
rate_2011_q2 = rate_2011_q3 / 1
rate_2011_q1 = rate_2011_q2 / 1.011


# train 2015
train['average_q_price'] = 1

train_2015_q2_index = train.loc[train['timestamp'].dt.year == 2015].loc[train['timestamp'].dt.month >= 4].loc[train['timestamp'].dt.month < 7].index
train.loc[train_2015_q2_index, 'average_q_price'] = rate_2015_q2

train_2015_q1_index = train.loc[train['timestamp'].dt.year == 2015].loc[train['timestamp'].dt.month >= 1].loc[train['timestamp'].dt.month < 4].index
train.loc[train_2015_q1_index, 'average_q_price'] = rate_2015_q1


# train 2014
train_2014_q4_index = train.loc[train['timestamp'].dt.year == 2014].loc[train['timestamp'].dt.month >= 10].loc[train['timestamp'].dt.month <= 12].index
train.loc[train_2014_q4_index, 'average_q_price'] = rate_2014_q4

train_2014_q3_index = train.loc[train['timestamp'].dt.year == 2014].loc[train['timestamp'].dt.month >= 7].loc[train['timestamp'].dt.month < 10].index
train.loc[train_2014_q3_index, 'average_q_price'] = rate_2014_q3

train_2014_q2_index = train.loc[train['timestamp'].dt.year == 2014].loc[train['timestamp'].dt.month >= 4].loc[train['timestamp'].dt.month < 7].index
train.loc[train_2014_q2_index, 'average_q_price'] = rate_2014_q2

train_2014_q1_index = train.loc[train['timestamp'].dt.year == 2014].loc[train['timestamp'].dt.month >= 1].loc[train['timestamp'].dt.month < 4].index
train.loc[train_2014_q1_index, 'average_q_price'] = rate_2014_q1


# train 2013
train_2013_q4_index = train.loc[train['timestamp'].dt.year == 2013].loc[train['timestamp'].dt.month >= 10].loc[train['timestamp'].dt.month <= 12].index
train.loc[train_2013_q4_index, 'average_q_price'] = rate_2013_q4

train_2013_q3_index = train.loc[train['timestamp'].dt.year == 2013].loc[train['timestamp'].dt.month >= 7].loc[train['timestamp'].dt.month < 10].index
train.loc[train_2013_q3_index, 'average_q_price'] = rate_2013_q3

train_2013_q2_index = train.loc[train['timestamp'].dt.year == 2013].loc[train['timestamp'].dt.month >= 4].loc[train['timestamp'].dt.month < 7].index
train.loc[train_2013_q2_index, 'average_q_price'] = rate_2013_q2

train_2013_q1_index = train.loc[train['timestamp'].dt.year == 2013].loc[train['timestamp'].dt.month >= 1].loc[train['timestamp'].dt.month < 4].index
train.loc[train_2013_q1_index, 'average_q_price'] = rate_2013_q1


# train 2012
train_2012_q4_index = train.loc[train['timestamp'].dt.year == 2012].loc[train['timestamp'].dt.month >= 10].loc[train['timestamp'].dt.month <= 12].index
train.loc[train_2012_q4_index, 'average_q_price'] = rate_2012_q4

train_2012_q3_index = train.loc[train['timestamp'].dt.year == 2012].loc[train['timestamp'].dt.month >= 7].loc[train['timestamp'].dt.month < 10].index
train.loc[train_2012_q3_index, 'average_q_price'] = rate_2012_q3

train_2012_q2_index = train.loc[train['timestamp'].dt.year == 2012].loc[train['timestamp'].dt.month >= 4].loc[train['timestamp'].dt.month < 7].index
train.loc[train_2012_q2_index, 'average_q_price'] = rate_2012_q2

train_2012_q1_index = train.loc[train['timestamp'].dt.year == 2012].loc[train['timestamp'].dt.month >= 1].loc[train['timestamp'].dt.month < 4].index
train.loc[train_2012_q1_index, 'average_q_price'] = rate_2012_q1


# train 2011
train_2011_q4_index = train.loc[train['timestamp'].dt.year == 2011].loc[train['timestamp'].dt.month >= 10].loc[train['timestamp'].dt.month <= 12].index
train.loc[train_2011_q4_index, 'average_q_price'] = rate_2011_q4

train_2011_q3_index = train.loc[train['timestamp'].dt.year == 2011].loc[train['timestamp'].dt.month >= 7].loc[train['timestamp'].dt.month < 10].index
train.loc[train_2011_q3_index, 'average_q_price'] = rate_2011_q3

train_2011_q2_index = train.loc[train['timestamp'].dt.year == 2011].loc[train['timestamp'].dt.month >= 4].loc[train['timestamp'].dt.month < 7].index
train.loc[train_2011_q2_index, 'average_q_price'] = rate_2011_q2

train_2011_q1_index = train.loc[train['timestamp'].dt.year == 2011].loc[train['timestamp'].dt.month >= 1].loc[train['timestamp'].dt.month < 4].index
train.loc[train_2011_q1_index, 'average_q_price'] = rate_2011_q1

train['price_doc'] = train['price_doc'] * train['average_q_price']


#########################################################################################################


X = train.drop(["price_doc", "average_q_price"],axis=1)
y = train['price_doc']

In [4]:
kaggle_test = pd.read_csv('test_corr_clean.csv', parse_dates=['timestamp'],index_col='id')
test1 = kaggle_test.copy()
test1 = preprocess(test1)
test1 = fact_binary(test1)
#test1 = test1.merge(macro,on='timestamp')
test1 = test1.drop(['ecology','sub_area'],axis=1)
test1.loc[test1['full_sq'].isnull(),'full_sq'] = 50
test1.loc[test1['product_type'].isnull(),'product_type'] = 1

In [5]:
estimators = []
for i in range(400):
    owner_params = {
            'n_estimators':100,
            'learning_rate':0.1,
            'max_depth':7,
            'min_child_weight':1,
            'subsample':0.8,
            'colsample_bytree':0.9,
            'colsample_bylevel':1,
            'reg_alpha':0,
            'reg_lambda':1,
            'seed':i,
            'objective':'reg:linear',
            'nthread':8
    }
    investment_params = {
            'n_estimators':100,
            'learning_rate':0.1,
            'max_depth':7,
            'min_child_weight':1,
            'subsample':0.8,
            'colsample_bytree':0.9,
            'colsample_bylevel':1,
            'reg_alpha':0,
            'reg_lambda':1,
            'seed':i,
            'objective':'reg:linear',
            'nthread':8
    }
    est = sep_estimator(owner_params=owner_params,investment_params=investment_params)
    est.fit(X,y)
    estimators.append(est)
    if i%10==0:
        print i


0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390


In [6]:
pred_test = np.zeros(shape=test1.values.shape[0])
cnt = 0
for e1 in estimators:
    pr = e1.predict(test1)
    pred_test += pr
    if cnt%10 == 0:
        print cnt
    cnt += 1
pred_test /= len(estimators)
pred_test *= 0.9915
subm = pd.read_csv('sample_submission.csv')
subm['price_doc'] = pred_test
subm.to_csv('model_1_output.csv',index=False)
subm.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390


Unnamed: 0,id,price_doc
0,30474,5503016.0
1,30475,8358840.0
2,30476,5344753.0
3,30477,6311780.0
4,30478,5117794.0
