In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn import model_selection, preprocessing, ensemble
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,GradientBoostingRegressor,ExtraTreesRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression,LogisticRegression
from numpy import nan
from com_util import *
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) # ingore warnings

In [2]:
cols = ['brent', 'eurrub', 'usdrub', 'micex_cbi_tr', 'micex_rgbi_tr','micex',  'brent', 'rts', 'oil_urals',
        'balance_trade', 'ppi', 'cpi', 'gdp_quart', 'net_capital_export', 'micex_cbi_tr', 'deposits_rate',
       'gdp_quart_growth', 'mortgage_rate', 'average_provision_of_build_contract_moscow']

In [3]:
df_train = pd.read_csv("../data/train.csv", parse_dates=['timestamp'])
df_test = pd.read_csv("../data/test.csv", parse_dates=['timestamp'])
df_macro = pd.read_csv("../data/macro.csv", parse_dates=['timestamp'], usecols=['timestamp']+cols)
df_train.head()

Unnamed: 0,id,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,...,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000,price_doc
0,1,2011-08-20,43,27.0,4.0,,,,,,...,9,4,0,13,22,1,0,52,4,5850000
1,2,2011-08-23,34,19.0,3.0,,,,,,...,15,3,0,15,29,1,10,66,14,6000000
2,3,2011-08-27,43,29.0,2.0,,,,,,...,10,3,0,11,27,0,4,67,10,5700000
3,4,2011-09-01,89,50.0,9.0,,,,,,...,11,2,1,4,4,0,0,26,3,13100000
4,5,2011-09-05,77,77.0,4.0,,,,,,...,319,108,17,135,236,2,91,195,14,16331452


In [4]:
df_train_loc = pd.read_csv("../data/train_lat_lon.csv")
df_test_loc = pd.read_csv("../data/test_lat_lon.csv")
df_train = df_train.merge(df_train_loc, on='id')
df_train.drop(['key', 'tolerance_m'], axis=1, inplace=True)
df_test = df_test.merge(df_test_loc, on='id')
df_test.drop(['key', 'tolerance_m'], axis=1, inplace=True)

In [5]:
y_train = df_train['price_doc'].values
id_test = df_test['id']

df_train.drop(['id', 'price_doc'], axis=1, inplace=True)
df_test.drop(['id'], axis=1, inplace=True)

# Build df_all = (df_train+df_test).join(df_macro)
num_train = len(df_train)
df_all = pd.concat([df_train, df_test])
df_all = df_all.merge(df_macro, on='timestamp')

# Add month-year count
month_year = (df_all.timestamp.dt.month + df_all.timestamp.dt.year * 100)
month_year_cnt_map = month_year.value_counts().to_dict()
df_all['month_year_cnt'] = month_year.map(month_year_cnt_map)

# Add week-year count
week_year = (df_all.timestamp.dt.weekofyear + df_all.timestamp.dt.year * 100)
week_year_cnt_map = week_year.value_counts().to_dict()
df_all['week_year_cnt'] = week_year.map(week_year_cnt_map)

# Add month and day-of-week
df_all['month'] = df_all.timestamp.dt.month
df_all['dow'] = df_all.timestamp.dt.dayofweek

print(df_all.shape)

(38133, 313)


In [6]:
# doing some cleaning of square
df_all['full_sq'].ix[df_all.full_sq > 1000] = nan
df_all['life_sq'].ix[df_all.life_sq > 1000] = nan

df_all['life_sq'].ix[df_all.full_sq < df_all.life_sq] = nan
df_all['life_sq'].ix[df_all.life_sq < 5] = nan
df_all['full_sq'].ix[df_all.full_sq < 5] = nan
df_all['kitch_sq'].ix[df_all.kitch_sq > df_all.life_sq] = nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [7]:
# cleaning build_year
df_all['build_year'].ix[df_all.build_year == 20052009] = 2007
df_all['build_year'].ix[df_all.build_year < 1500] = nan
df_all['build_year'].ix[df_all.build_year > 2500] = nan

df_all['state'].ix[df_all.state == 33] = 3

# cleaning floor
df_all['max_floor'].ix[df_all.max_floor == 0] = nan
df_all['floor'].ix[df_all.floor == 0] = nan

In [8]:
# Add null value counts
df_all['null_count'] = df_all.isnull().sum(axis=1)
# Add apartment name
# df_all['apartment_name'] = df_all['sub_area'] + df_all['metro_km_avto'].astype(str)

In [9]:
#每个经纬度分类一下（1平方千米一个类）
df_all["jwd_class"]=map(lambda x,y:(int(x*100)%100)*100+(int(-y*100)%100),df_all["lat"].fillna(0),df_all["lon"].fillna(0))
# GroupBy 经纬度
df_all = merge_median(df_all, ["jwd_class"], "full_sq", "fullsq_median_jwd")
df_all = merge_median(df_all, ["jwd_class"], "life_sq", "lifesq_median_jwd")
df_all = merge_median(df_all, ["jwd_class"], "floor", "floor_median_jwd")

In [10]:
df_all['full_sq_separate'] = df_all['full_sq'].copy()

for i in range(30):
    df_all['full_sq_separate'].ix[(df_all.full_sq > (i+1)*5) & (df_all.full_sq <= ((i+1)*5+5))] = \
    len(df_all['full_sq'].ix[(df_all.full_sq > (i+1)*5) & (df_all.full_sq <= ((i+1)*5+5))])
    
df_all['full_sq_separate'].ix[df_all.full_sq <= 5] = len(df_all['full_sq'].ix[df_all.full_sq <= 5])
df_all['full_sq_separate'].ix[df_all.full_sq <= 5] = len(df_all['full_sq'].ix[df_all.full_sq > 155])

In [11]:
df_all['life_sq_separate'] = df_all['life_sq'].copy()

for i in range(30):
    df_all['life_sq_separate'].ix[(df_all.life_sq > (i+1)*5) & (df_all.life_sq <= ((i+1)*5+5))] = \
    len(df_all['life_sq'].ix[(df_all.life_sq > (i+1)*5) & (df_all.life_sq <= ((i+1)*5+5))])
    
df_all['life_sq_separate'].ix[df_all.life_sq <= 5] = len(df_all['life_sq'].ix[df_all.life_sq <= 5])
df_all['life_sq_separate'].ix[df_all.life_sq <= 5] = len(df_all['life_sq'].ix[df_all.life_sq > 155])

In [12]:
# Remove timestamp column (may overfit the model in train)
df_all.drop(['timestamp'], axis=1, inplace=True)
# df_all.drop(['timestamp_macro'], axis=1, inplace=True)

In [13]:
# Deal with categorical values
df_numeric = df_all.select_dtypes(exclude=['object'])
df_obj = df_all.select_dtypes(include=['object']).copy()

for c in df_obj:
    df_obj[c] = pd.factorize(df_obj[c])[0]

df_values = pd.concat([df_numeric, df_obj], axis=1)

### Dealing with missed variables

In [12]:
# from sklearn.base import TransformerMixin
# class DataFrameImputer(TransformerMixin):
#     def fit(self, X, y=None):
#         self.fill = pd.Series([X[c].value_counts().index[0]
#         if X[c].dtype == np.dtype('O') else X[c].median() for c in X],
#         index=X.columns)
#         return self
#     def transform(self, X, y=None):
#         return X.fillna(self.fill)
# df_values = DataFrameImputer().fit_transform(df_values)

In [14]:
# Convert to numpy values
X_all = df_values.values
print(X_all.shape)

X_train = X_all[:num_train]
X_test = X_all[num_train:]

df_columns = df_values.columns

(38133, 319)


## Stacking

In [27]:
n_folds = 5
stacker = xgb.XGBRegressor(max_depth=6, nthread=4, subsample=0.8, objective='reg:linear', learning_rate=0.05,
                               colsample_bytree=0.8, n_estimators=400, seed=0)
base_models = {xgb.XGBRegressor(max_depth=5, nthread=4, subsample=0.7, objective='reg:linear', learning_rate=0.05,
                               colsample_bytree=0.7, n_estimators=500, seed=0)}

In [28]:
for i in range(10):
    base_models.add(xgb.XGBRegressor(max_depth=5, nthread=4, subsample=0.7, objective='reg:linear', learning_rate=0.05,
                               colsample_bytree=0.7, n_estimators=500, seed=i+1))

In [29]:
for i in range(10):
    base_models.add(xgb.XGBRegressor(max_depth=6, nthread=4, subsample=0.8, objective='reg:linear', learning_rate=0.05,
                               colsample_bytree=0.8, n_estimators=400, seed=i+50))

In [30]:
for i in range(10):
    base_models.add(xgb.XGBRegressor(max_depth=5, nthread=4, subsample=0.8, objective='reg:linear', learning_rate=0.05,
                               colsample_bytree=0.8, n_estimators=500, seed=i+100))

In [38]:
X = X_train
y = y_train
T = X_test
kf = model_selection.KFold(n_splits=n_folds, shuffle=True, random_state=2017)
S_train = np.zeros((X.shape[0], len(base_models)))
S_test = np.zeros((T.shape[0], len(base_models)))

In [39]:
for i, clf in enumerate(base_models):
    print clf
    S_test_i = np.zeros((T.shape[0], n_folds))
    for j, (train_idx, test_idx) in enumerate(kf.split(X, y)):
        X_train = X[train_idx]
        y_train = y[train_idx]
        X_holdout = X[test_idx]
        # y_holdout = y[test_idx]
        %time clf.fit(X_train, y_train)
        y_pred = clf.predict(X_holdout)[:]
        S_train[test_idx, i] = y_pred
        S_test_i[:, j] = clf.predict(T)[:]
    S_test[:, i] = S_test_i.mean(1)
print stacker
%time stacker.fit(S_train, y)
y_pred = stacker.predict(S_test)[:]

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=500, nthread=4,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=101, silent=True, subsample=0.8)
CPU times: user 2min, sys: 1.1 s, total: 2min 1s
Wall time: 31.9 s
CPU times: user 2min 4s, sys: 951 ms, total: 2min 5s
Wall time: 32.6 s
CPU times: user 2min 9s, sys: 895 ms, total: 2min 10s
Wall time: 33.9 s
CPU times: user 2min 13s, sys: 952 ms, total: 2min 14s
Wall time: 34.9 s
CPU times: user 2min 20s, sys: 1 s, total: 2min 21s
Wall time: 36.7 s
XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=500, nthread=4,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=102, silent=True

CPU times: user 2min 20s, sys: 1.12 s, total: 2min 21s
Wall time: 36.9 s
XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=None, n_estimators=400, nthread=4,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=57, silent=True, subsample=0.8)
CPU times: user 2min 28s, sys: 1.28 s, total: 2min 29s
Wall time: 39 s
CPU times: user 2min 27s, sys: 1.25 s, total: 2min 28s
Wall time: 38.7 s
CPU times: user 2min 28s, sys: 1.26 s, total: 2min 29s
Wall time: 38.8 s
CPU times: user 2min 26s, sys: 1.29 s, total: 2min 28s
Wall time: 38.6 s
CPU times: user 2min 25s, sys: 1.22 s, total: 2min 26s
Wall time: 38.1 s
XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.7,
       gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=500, nthread=4,
       objective='reg:li

CPU times: user 2min 22s, sys: 1.03 s, total: 2min 23s
Wall time: 37.2 s
CPU times: user 2min 21s, sys: 1 s, total: 2min 22s
Wall time: 37 s
XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=None, n_estimators=400, nthread=4,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=50, silent=True, subsample=0.8)
CPU times: user 2min 26s, sys: 1.28 s, total: 2min 27s
Wall time: 38.7 s
CPU times: user 2min 17s, sys: 1.15 s, total: 2min 18s
Wall time: 35.9 s
CPU times: user 2min 18s, sys: 1.16 s, total: 2min 19s
Wall time: 36.2 s
CPU times: user 2min 18s, sys: 1.19 s, total: 2min 19s
Wall time: 36.3 s
CPU times: user 2min 17s, sys: 1.16 s, total: 2min 18s
Wall time: 36.1 s
XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=6,
       min_child_weight=

In [40]:
# y_pred = model.predict(dtest)
df_sub = pd.DataFrame({'id': id_test, 'price_doc': y_pred})

In [41]:
# df_naiveXGB['price_doc'] = df_naiveXGB['price_doc'] * 0.9691 + 10.08
df_sub.to_csv('EnsembleSub.csv', index=False)

In [42]:
df_sub['price_doc'].mean()

7684382.5

In [43]:
df_sub.head(5)

Unnamed: 0,id,price_doc
0,30474,5592616.5
1,30475,8541218.0
2,30476,5642737.5
3,30477,6183659.0
4,30478,5405800.0
