# Group Details


**Group Members (student ID):** August Asheim Birkelan (506753), Ørjan Carlsen (507694), Alexey Gusev (477979)

**Kaggle Competition:** Moscow Housing

**Kaggle Team:** Group 1

# PIP INSTALLS

In [1]:
pip install xgboost #Should be version 1.5.0!!



In [2]:
pip install lightgbm #Should be version 3.3.1!!



In [3]:
pip install geopy



In [4]:
pip install catboost #Should be version 1.0.3!!



In [5]:
pip install category_encoders -q

# IMPORTS

In [6]:
import json
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler, KBinsDiscretizer
from sklearn.model_selection import train_test_split, KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.neighbors import KNeighborsRegressor
from category_encoders.target_encoder import TargetEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import RidgeCV, LinearRegression
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.impute import SimpleImputer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from geopy.distance import geodesic

from copy import deepcopy

%matplotlib inline


plt.style.use('ggplot')
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

  import pandas.util.testing as tm


# EDA

### Collect data

In [7]:
# TRAIN SET
# Every apartment row is joined to its building row through building_id.
buildings = pd.read_csv('./sample_data/buildings_train.csv')
apartments = pd.read_csv('./sample_data/apartments_train.csv')
print(f'All apartments have an associated building: {apartments.building_id.isin(buildings.id).all()}')
data_train = pd.merge(apartments, buildings.set_index('id'), how='left', left_on='building_id', right_index=True)

# Dropping duplicates in train (rows that agree on every column except their id)
data_train = data_train.drop_duplicates(subset=data_train.columns.difference(['id']))

# TEST SET
apartments_test = pd.read_csv('./sample_data/apartments_test.csv')
buildings_test = pd.read_csv('./sample_data/buildings_test.csv')
print(f'All test apartments have an associated building: {apartments_test.building_id.isin(buildings_test.id).all()}')
data_test = pd.merge(apartments_test, buildings_test.set_index('id'), how='left', left_on='building_id', right_index=True)

# ALL DATA
# Label each frame with its origin *before* stacking them. Deriving the label
# afterwards from "id <= largest train id" only works while every test id is
# larger than every train id, and mislabels rows silently otherwise.
# .assign returns copies, so data_train / data_test themselves stay unlabelled.
data_all = pd.concat([data_train.assign(Split='Train'), data_test.assign(Split='Test')])
data_all = data_all.drop(['id'], axis=1)
pd.set_option("display.max_rows", 2000)

All apartments have an associated building: True
All test apartments have an associated building: True


 ### Remove outliers

**Remove training rows with too large a total area**

In [8]:
# Keep rows whose total area is at most 1175, plus every Test row regardless of area.
_is_test_row = data_all['Split'] == 'Test'
_area_within_limit = data_all['area_total'] <= 1175
data_all = data_all.loc[_area_within_limit | _is_test_row]

**Remove rows with too high a price**

In [9]:
# Keep rows priced below one billion, plus rows that have no price at all.
_price = data_all['price']
_price_within_limit = _price < 1_000_000_000
data_all = data_all.loc[_price_within_limit | _price.isna()]

### Bathrooms

Fill missing bathroom counts with the median of the same building.

In [10]:
# Fill missing bathroom counts with the median of the same building.
# The group-wise transform is restricted to the column being filled: running the
# lambda over the whole frame (twice) also pushed every other column, including
# the text columns for which a median is undefined, through fillna/median only
# to throw the result away. Buildings with no known value keep their NaN.
for _bath_col in ("bathrooms_shared", "bathrooms_private"):
    data_all[_bath_col] = (
        data_all.groupby("building_id")[_bath_col]
        .transform(lambda counts: counts.fillna(counts.median()))
    )

### District

In [11]:
pd.set_option("display.max_rows", 2000)

# Hand-assigned districts for rows whose district is missing. Rules are applied
# in the listed order and each one only touches rows that are still missing a
# district at that point.
_district_by_coordinates = [
    ((55.595160, 37.741109), 5),
    ((17.141734, -61.790500), 11),
    ((55.583537, 37.478025), 11),
    ((55.583551, 37.711356), 5),
]
for (_lat, _lon), _district in _district_by_coordinates:
    _rows = (
        (data_all['latitude'] == _lat)
        & (data_all['longitude'] == _lon)
        & (data_all['district'].isnull())
    )
    data_all.loc[_rows, 'district'] = _district

_district_by_text = [
    ('street', 'В мкр', 2),
    ('street', 'улица 1-я Линия', 3),
    ('street', 'улица Центральная', 11),
    ('address', 'Москва А101 ЖК', 11),
    ('street', 'Бунинские Луга ЖК', 11),
]
for _text_col, _text, _district in _district_by_text:
    _rows = (data_all[_text_col] == _text) & (data_all['district'].isnull())
    data_all.loc[_rows, 'district'] = _district

### Longitude and Latitude

In [12]:

_COORD_COLS = ['latitude', 'longitude']

# Two 'Бунинские Луга ЖК' addresses that carry a negative longitude get hand-picked coordinates.
_bunin_fixes = [
    ('к2/2/1', [55.544046, 37.478055]),
    ('к2/2/2', [55.544886, 37.478459]),
]
for _address, _coords in _bunin_fixes:
    _rows = (
        (data_all['address'] == _address)
        & (data_all['street'] == 'Бунинские Луга ЖК')
        & (data_all['longitude'] < 0)
    )
    data_all.loc[_rows, _COORD_COLS] = _coords

# Every row with this address is overwritten, whatever coordinates it had.
data_all.loc[data_all['address'] == 'Москва А101 ЖК', _COORD_COLS] = [55.560891, 37.473761]

# Houses 48 and 75 on 'улица Центральная' share one location.
# (The original cell kept [55.809245, 37.350090] as a commented-out alternative,
# together with disabled fixes for 'улица 1-я Линия' and for Test row 4719.)
for _address in ('48', '75'):
    _rows = (data_all['street'] == 'улица Центральная') & (data_all['address'] == _address)
    data_all.loc[_rows, _COORD_COLS] = [55.853511, 37.384711]

### Ceiling

In [13]:
# Ceiling values above 12 or below 1 are treated as missing.
_ceiling = data_all['ceiling']
_implausible = (_ceiling > 12) | (_ceiling < 1)
data_all.loc[_implausible, 'ceiling'] = np.nan

### Distance to Universities/Colleges

In [14]:
# Reference points as (latitude, longitude) tuples; the next cell turns each one
# into a "distance in km" feature column of the same name.
# Trailing comments give the rough compass position, presumably relative to CENTER.
MSU    = ( 55.704279331013915 , 37.527720613854    ) # center/sw
MSUCE  = ( 55.859955674414444 , 37.707267495936996 ) # ne (higher latitude and higher longitude than CENTER)
BMSTU  = ( 55.76615846588919  , 37.68505253621469  ) # center/east
IUFS   = ( 55.628344081320236 , 37.593311336163644 ) # south
LIS    = ( 55.892843325039514 , 37.57455519923625  ) # north
NW     = ( 55.81084987524267  , 37.51065533037661  ) # north-west of CENTER
MUG    = ( 55.72997580854584  , 37.816477554416245 ) # east
KKAR   = ( 55.65828905630157  , 37.770577681885065 ) # se
IESR   = ( 55.79239687372863  , 37.82063842368287  ) # east
RCTU   = ( 55.85867123962497  , 37.4158124386546   ) # nw
MEI    = ( 55.77480692814678  , 37.52265213121898  ) # center/west
MPU    = ( 55.723255925832    , 37.674135958782806 ) # center/se
RMANPO = ( 55.867347229014214 , 37.4761356952223   ) # nw
IIEP   = ( 55.594679800406006 , 37.6686510189342   ) # south
MPU2   = ( 55.820408245566306 , 37.664296132279986 ) # center/north
MSIEA  = ( 55.80611763965845  , 37.41016712643405  ) # nw
SW     = ( 55.6514038020098   , 37.49940394537357  ) # south-west of CENTER
IETVS  = ( 55.7262213003841   , 37.399531618530936 ) # west (lower longitude than CENTER)
CENTER = ( 55.75377154250644  , 37.6197263162158   ) # central reference point

In [15]:
import geopy.distance

# Landmark name -> (latitude, longitude), listed in the order the feature
# columns are appended to data_all.
_landmarks = {
    'MSU': MSU,
    'MSUCE': MSUCE,
    'BMSTU': BMSTU,
    'IUFS': IUFS,
    'LIS': LIS,
    'NW': NW,
    'MUG': MUG,
    'KKAR': KKAR,
    'IESR': IESR,
    'RCTU': RCTU,
    'MEI': MEI,
    'MPU': MPU,
    'RMANPO': RMANPO,
    'IIEP': IIEP,
    'MPU2': MPU2,
    'MSIEA': MSIEA,
    'SW': SW,
    'IETVS': IETVS,
    'CENTER': CENTER,
}


def _km_to(point):
    """Return, for every row of data_all, the geodesic distance in km to `point`."""
    return data_all.apply(
        lambda row: geopy.distance.geodesic((row['latitude'], row['longitude']), point).km,
        axis=1,
    )


# One distance column per landmark, named after the landmark.
for _landmark_name, _landmark_point in _landmarks.items():
    data_all[_landmark_name] = _km_to(_landmark_point)

In [16]:
# Fold seller code 2 into code 0 and seller code 3 into code 1.
for _old_code, _new_code in ((2, 0), (3, 1)):
    data_all.loc[data_all['seller'] == _old_code, 'seller'] = _new_code

In [17]:
# Replace each street / address string by its 1-based rank among the sorted unique values.
unique_street = data_all['street'].unique()
street_map = {street: code for code, street in enumerate(sorted(unique_street), start=1)}
data_all['street'] = data_all['street'].map(lambda street: street_map[street])

unique_address = data_all['address'].unique()
address_map = {address: code for code, address in enumerate(sorted(unique_address), start=1)}
data_all['address'] = data_all['address'].map(lambda address: address_map[address])

### Store data for CatBoost

In [18]:
# Independent deep copy of the frame in its current state, kept for CatBoost.
data_all_cat = data_all.copy(deep=True)
data_all_cat

Unnamed: 0,seller,price,area_total,area_kitchen,area_living,floor,rooms,layout,ceiling,bathrooms_shared,bathrooms_private,windows_court,windows_street,balconies,loggias,condition,phones,building_id,new,latitude,longitude,district,street,address,constructed,material,stories,elevator_without,elevator_passenger,elevator_service,parking,garbage_chute,heating,Split,MSU,MSUCE,BMSTU,IUFS,LIS,NW,MUG,KKAR,IESR,RCTU,MEI,MPU,RMANPO,IIEP,MPU2,MSIEA,SW,IETVS,CENTER
0,1.0,7139520.0,59.2,12.5,31.0,2.0,2.0,,2.65,0.0,2.0,0.0,1.0,,,,1.0,4076,1.0,55.544046,37.478055,11.0,304,2187,2021.0,3.0,9.0,0.0,1.0,1.0,1.0,,,Train,18.111865,38.010013,27.951770,11.870307,39.304628,29.775580,29.710859,22.400143,35.060803,35.247152,25.844792,23.464999,35.995419,13.279610,32.923875,29.488939,12.028155,20.876708,24.995381
1,,10500000.0,88.0,14.2,48.0,18.0,3.0,1.0,,2.0,0.0,1.0,1.0,1.0,0.0,3.0,1.0,1893,0.0,55.861282,37.666647,2.0,1729,364,2010.0,3.0,25.0,0.0,1.0,1.0,1.0,,0.0,Train,19.533059,2.547822,10.653579,26.340436,6.750784,11.272004,17.379333,23.523682,12.327268,15.709482,13.197512,15.374732,11.947236,29.682923,4.553223,17.204777,25.618077,22.513163,12.326286
2,1.0,9019650.0,78.5,22.5,40.8,12.0,3.0,,2.65,0.0,2.0,1.0,1.0,,,,1.0,5176,1.0,55.663299,37.515335,6.0,871,1146,2021.0,3.0,15.0,0.0,1.0,1.0,1.0,,,Train,4.628625,24.991497,15.650362,6.264921,25.825705,16.430508,20.339160,16.074171,23.969873,22.631718,12.423439,12.011991,22.851090,12.313719,19.837003,17.218845,1.661160,10.104800,12.021881
3,,10500000.0,88.0,14.0,48.0,18.0,3.0,,,0.0,2.0,1.0,1.0,0.0,1.0,2.0,1.0,1893,0.0,55.861282,37.666647,2.0,1729,364,2010.0,3.0,25.0,0.0,1.0,1.0,1.0,,0.0,Train,19.533059,2.547822,10.653579,26.340436,6.750784,11.272004,17.379333,23.523682,12.327268,15.709482,13.197512,15.374732,11.947236,29.682923,4.553223,17.204777,25.618077,22.513163,12.326286
4,,13900000.0,78.0,17.0,35.0,7.0,2.0,1.0,2.90,1.0,0.0,1.0,1.0,0.0,1.0,3.0,1.0,6604,0.0,55.590785,37.451438,11.0,1758,606,2017.0,2.0,15.0,0.0,1.0,1.0,1.0,0.0,0.0,Train,13.517872,34.007596,24.438289,9.870446,34.507853,24.782501,27.712964,21.463420,32.294731,29.909488,20.972263,20.347375,30.830653,13.701552,28.855793,24.114381,7.394659,15.428755,21.009646
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9932,1.0,,106.0,19.9,56.7,16.0,3.0,,3.30,1.0,0.0,,,0.0,1.0,,1.0,4252,1.0,55.789750,37.456468,8.0,1695,71,2020.0,2.0,20.0,0.0,1.0,1.0,0.0,,0.0,Test,10.515605,17.555014,14.582190,19.923091,13.655803,4.131266,23.560505,24.572269,22.845900,8.085602,4.473483,15.542601,8.727052,25.490019,13.471771,3.428343,15.637553,7.925068,11.001288
9933,,,82.0,,,3.0,3.0,,,2.0,0.0,1.0,0.0,,,1.0,2.0,7380,1.0,55.751639,37.516260,7.0,430,2015,2021.0,2.0,20.0,0.0,1.0,1.0,1.0,,,Test,5.321818,16.996794,10.719495,14.557400,16.140509,6.601784,19.010185,19.069432,19.634236,13.478993,2.610483,10.408307,13.126107,19.932763,12.036557,9.006031,11.210033,7.859205,6.501047
9934,,,49.3,,,15.0,1.0,,,1.0,1.0,,,,,,1.0,1480,1.0,55.770659,37.375234,7.0,854,1667,2016.0,2.0,24.0,0.0,1.0,1.0,0.0,1.0,1.0,Test,12.098199,23.067652,19.452311,20.954045,18.468046,9.601405,28.075381,27.819939,28.051136,10.123967,9.263245,19.498814,12.485694,26.917108,18.957701,4.515351,15.401594,5.177510,15.462811
9935,,,38.8,10.5,15.1,14.0,1.0,,3.30,1.0,0.0,0.0,1.0,0.0,1.0,0.0,2.0,2154,0.0,55.699943,37.637183,5.0,1387,1,2019.0,,14.0,1.0,1.0,1.0,0.0,1.0,,Test,6.899013,18.350186,7.961998,8.436113,21.833410,14.682821,11.754497,9.587889,15.449759,22.478018,11.010539,3.483124,21.201096,11.885806,13.519869,18.518273,10.214877,15.221214,6.092644


**Make features categorical for CatBoost**

### OneHotEncoding

In [19]:
# One-hot encode every column with fewer than 9 distinct values, except the
# Train/Test marker. Encoded columns are removed and their dummy columns are
# appended after the remaining columns, in the original column order.
_low_cardinality_cols = [
    col for col in data_all.columns[data_all.nunique() < 9] if col != 'Split'
]
stored_data_all = pd.get_dummies(deepcopy(data_all), columns=_low_cardinality_cols)


# stored_data_all_cat = deepcopy(data_all_cat)
# for col in list(data_all_cat.columns[data_all_cat.nunique()<9]):
#   if col == 'Split':
#     continue
#   stored_data_all_cat = pd.concat([stored_data_all_cat, pd.get_dummies(data_all_cat[col], prefix=col)], axis=1)
#   stored_data_all_cat = stored_data_all_cat.drop(col, axis=1)
# data_all_cat = deepcopy(stored_data_all_cat)

In [20]:
# Copy the data
split = stored_data_all.Split
stored_data_all = stored_data_all.drop('Split', axis=1)
# Init
ii_imp = IterativeImputer(estimator=ExtraTreesRegressor(n_jobs=-1, random_state=42), max_iter=4, random_state=42, verbose=2)

# Tranform
stored_data_all.loc[:, :] = ii_imp.fit_transform(stored_data_all)

[IterativeImputer] Completing matrix with shape (32753, 98)
[IterativeImputer] Ending imputation round 1/4, elapsed time 162.53
[IterativeImputer] Change: 543400810.875101, scaled tolerance: 936014.184 
[IterativeImputer] Ending imputation round 2/4, elapsed time 326.75
[IterativeImputer] Change: 56539738.63200003, scaled tolerance: 936014.184 
[IterativeImputer] Ending imputation round 3/4, elapsed time 491.66
[IterativeImputer] Change: 74411330.39500006, scaled tolerance: 936014.184 
[IterativeImputer] Ending imputation round 4/4, elapsed time 662.49
[IterativeImputer] Change: 82428731.15370001, scaled tolerance: 936014.184 


In [21]:
# Re-attach the Train/Test labels that were set aside before imputation.
stored_data_all = stored_data_all.assign(Split=split)

In [22]:
# From here on data_all is an independent deep copy of the encoded and imputed frame.
data_all = stored_data_all.copy(deep=True)
data_all

Unnamed: 0,price,area_total,area_kitchen,area_living,floor,ceiling,bathrooms_shared,building_id,latitude,longitude,district,street,address,constructed,stories,MSU,MSUCE,BMSTU,IUFS,LIS,NW,MUG,KKAR,IESR,RCTU,MEI,MPU,RMANPO,IIEP,MPU2,MSIEA,SW,IETVS,CENTER,seller_0.0,seller_1.0,rooms_1.0,rooms_2.0,rooms_3.0,rooms_4.0,...,balconies_2.0,balconies_3.0,balconies_4.0,loggias_0.0,loggias_1.0,loggias_2.0,loggias_3.0,loggias_4.0,condition_0.0,condition_1.0,condition_2.0,condition_3.0,phones_0.0,phones_1.0,phones_2.0,new_0.0,new_1.0,material_0.0,material_1.0,material_2.0,material_3.0,material_4.0,material_5.0,material_6.0,elevator_without_0.0,elevator_without_1.0,elevator_passenger_0.0,elevator_passenger_1.0,elevator_service_0.0,elevator_service_1.0,parking_0.0,parking_1.0,parking_2.0,garbage_chute_0.0,garbage_chute_1.0,heating_0.0,heating_1.0,heating_2.0,heating_3.0,Split
0,7139520.00,59.2,12.500,31.0000,2.0,2.6500,0.00,4076.0,55.544046,37.478055,11.0,304.0,2187.0,2021.00,9.0,18.111865,38.010013,27.951770,11.870307,39.304628,29.775580,29.710859,22.400143,35.060803,35.247152,25.844792,23.464999,35.995419,13.279610,32.923875,29.488939,12.028155,20.876708,24.995381,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Train
1,10500000.00,88.0,14.200,48.0000,18.0,2.7139,2.00,1893.0,55.861282,37.666647,2.0,1729.0,364.0,2010.00,25.0,19.533059,2.547822,10.653579,26.340436,6.750784,11.272004,17.379333,23.523682,12.327268,15.709482,13.197512,15.374732,11.947236,29.682923,4.553223,17.204777,25.618077,22.513163,12.326286,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,Train
2,9019650.00,78.5,22.500,40.8000,12.0,2.6500,0.00,5176.0,55.663299,37.515335,6.0,871.0,1146.0,2021.00,15.0,4.628625,24.991497,15.650362,6.264921,25.825705,16.430508,20.339160,16.074171,23.969873,22.631718,12.423439,12.011991,22.851090,12.313719,19.837003,17.218845,1.661160,10.104800,12.021881,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Train
3,10500000.00,88.0,14.000,48.0000,18.0,2.7475,0.00,1893.0,55.861282,37.666647,2.0,1729.0,364.0,2010.00,25.0,19.533059,2.547822,10.653579,26.340436,6.750784,11.272004,17.379333,23.523682,12.327268,15.709482,13.197512,15.374732,11.947236,29.682923,4.553223,17.204777,25.618077,22.513163,12.326286,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,Train
4,13900000.00,78.0,17.000,35.0000,7.0,2.9000,1.00,6604.0,55.590785,37.451438,11.0,1758.0,606.0,2017.00,15.0,13.517872,34.007596,24.438289,9.870446,34.507853,24.782501,27.712964,21.463420,32.294731,29.909488,20.972263,20.347375,30.830653,13.701552,28.855793,24.114381,7.394659,15.428755,21.009646,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,Train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9932,35698942.88,106.0,19.900,56.7000,16.0,3.3000,1.00,4252.0,55.789750,37.456468,8.0,1695.0,71.0,2020.00,20.0,10.515605,17.555014,14.582190,19.923091,13.655803,4.131266,23.560505,24.572269,22.845900,8.085602,4.473483,15.542601,8.727052,25.490019,13.471771,3.428343,15.637553,7.925068,11.001288,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,Test
9933,21712025.68,82.0,16.262,47.2879,3.0,2.9857,2.00,7380.0,55.751639,37.516260,7.0,430.0,2015.0,2021.00,20.0,5.321818,16.996794,10.719495,14.557400,16.140509,6.601784,19.010185,19.069432,19.634236,13.478993,2.610483,10.408307,13.126107,19.932763,12.036557,9.006031,11.210033,7.859205,6.501047,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Test
9934,10707863.02,49.3,14.679,19.9850,15.0,2.8873,1.00,1480.0,55.770659,37.375234,7.0,854.0,1667.0,2016.00,24.0,12.098199,23.067652,19.452311,20.954045,18.468046,9.601405,28.075381,27.819939,28.051136,10.123967,9.263245,19.498814,12.485694,26.917108,18.957701,4.515351,15.401594,5.177510,15.462811,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,Test
9935,12590901.55,38.8,10.500,15.1000,14.0,3.3000,1.00,2154.0,55.699943,37.637183,5.0,1387.0,1.0,2019.00,14.0,6.899013,18.350186,7.961998,8.436113,21.833410,14.682821,11.754497,9.587889,15.449759,22.478018,11.010539,3.483124,21.201096,11.885806,13.519869,18.518273,10.214877,15.221214,6.092644,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,Test


In [23]:
# Independent deep copy of data_all as it stands at this point.
stored_data = data_all.copy(deep=True)
stored_data

Unnamed: 0,price,area_total,area_kitchen,area_living,floor,ceiling,bathrooms_shared,building_id,latitude,longitude,district,street,address,constructed,stories,MSU,MSUCE,BMSTU,IUFS,LIS,NW,MUG,KKAR,IESR,RCTU,MEI,MPU,RMANPO,IIEP,MPU2,MSIEA,SW,IETVS,CENTER,seller_0.0,seller_1.0,rooms_1.0,rooms_2.0,rooms_3.0,rooms_4.0,...,balconies_2.0,balconies_3.0,balconies_4.0,loggias_0.0,loggias_1.0,loggias_2.0,loggias_3.0,loggias_4.0,condition_0.0,condition_1.0,condition_2.0,condition_3.0,phones_0.0,phones_1.0,phones_2.0,new_0.0,new_1.0,material_0.0,material_1.0,material_2.0,material_3.0,material_4.0,material_5.0,material_6.0,elevator_without_0.0,elevator_without_1.0,elevator_passenger_0.0,elevator_passenger_1.0,elevator_service_0.0,elevator_service_1.0,parking_0.0,parking_1.0,parking_2.0,garbage_chute_0.0,garbage_chute_1.0,heating_0.0,heating_1.0,heating_2.0,heating_3.0,Split
0,7139520.00,59.2,12.500,31.0000,2.0,2.6500,0.00,4076.0,55.544046,37.478055,11.0,304.0,2187.0,2021.00,9.0,18.111865,38.010013,27.951770,11.870307,39.304628,29.775580,29.710859,22.400143,35.060803,35.247152,25.844792,23.464999,35.995419,13.279610,32.923875,29.488939,12.028155,20.876708,24.995381,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Train
1,10500000.00,88.0,14.200,48.0000,18.0,2.7139,2.00,1893.0,55.861282,37.666647,2.0,1729.0,364.0,2010.00,25.0,19.533059,2.547822,10.653579,26.340436,6.750784,11.272004,17.379333,23.523682,12.327268,15.709482,13.197512,15.374732,11.947236,29.682923,4.553223,17.204777,25.618077,22.513163,12.326286,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,Train
2,9019650.00,78.5,22.500,40.8000,12.0,2.6500,0.00,5176.0,55.663299,37.515335,6.0,871.0,1146.0,2021.00,15.0,4.628625,24.991497,15.650362,6.264921,25.825705,16.430508,20.339160,16.074171,23.969873,22.631718,12.423439,12.011991,22.851090,12.313719,19.837003,17.218845,1.661160,10.104800,12.021881,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Train
3,10500000.00,88.0,14.000,48.0000,18.0,2.7475,0.00,1893.0,55.861282,37.666647,2.0,1729.0,364.0,2010.00,25.0,19.533059,2.547822,10.653579,26.340436,6.750784,11.272004,17.379333,23.523682,12.327268,15.709482,13.197512,15.374732,11.947236,29.682923,4.553223,17.204777,25.618077,22.513163,12.326286,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,Train
4,13900000.00,78.0,17.000,35.0000,7.0,2.9000,1.00,6604.0,55.590785,37.451438,11.0,1758.0,606.0,2017.00,15.0,13.517872,34.007596,24.438289,9.870446,34.507853,24.782501,27.712964,21.463420,32.294731,29.909488,20.972263,20.347375,30.830653,13.701552,28.855793,24.114381,7.394659,15.428755,21.009646,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,Train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9932,35698942.88,106.0,19.900,56.7000,16.0,3.3000,1.00,4252.0,55.789750,37.456468,8.0,1695.0,71.0,2020.00,20.0,10.515605,17.555014,14.582190,19.923091,13.655803,4.131266,23.560505,24.572269,22.845900,8.085602,4.473483,15.542601,8.727052,25.490019,13.471771,3.428343,15.637553,7.925068,11.001288,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,Test
9933,21712025.68,82.0,16.262,47.2879,3.0,2.9857,2.00,7380.0,55.751639,37.516260,7.0,430.0,2015.0,2021.00,20.0,5.321818,16.996794,10.719495,14.557400,16.140509,6.601784,19.010185,19.069432,19.634236,13.478993,2.610483,10.408307,13.126107,19.932763,12.036557,9.006031,11.210033,7.859205,6.501047,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Test
9934,10707863.02,49.3,14.679,19.9850,15.0,2.8873,1.00,1480.0,55.770659,37.375234,7.0,854.0,1667.0,2016.00,24.0,12.098199,23.067652,19.452311,20.954045,18.468046,9.601405,28.075381,27.819939,28.051136,10.123967,9.263245,19.498814,12.485694,26.917108,18.957701,4.515351,15.401594,5.177510,15.462811,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,Test
9935,12590901.55,38.8,10.500,15.1000,14.0,3.3000,1.00,2154.0,55.699943,37.637183,5.0,1387.0,1.0,2019.00,14.0,6.899013,18.350186,7.961998,8.436113,21.833410,14.682821,11.754497,9.587889,15.449759,22.478018,11.010539,3.483124,21.201096,11.885806,13.519869,18.518273,10.214877,15.221214,6.092644,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,Test


In [24]:
# Independent deep copy of the CatBoost frame as it stands at this point.
stored_data_cat = data_all_cat.copy(deep=True)
stored_data_cat

Unnamed: 0,seller,price,area_total,area_kitchen,area_living,floor,rooms,layout,ceiling,bathrooms_shared,bathrooms_private,windows_court,windows_street,balconies,loggias,condition,phones,building_id,new,latitude,longitude,district,street,address,constructed,material,stories,elevator_without,elevator_passenger,elevator_service,parking,garbage_chute,heating,Split,MSU,MSUCE,BMSTU,IUFS,LIS,NW,MUG,KKAR,IESR,RCTU,MEI,MPU,RMANPO,IIEP,MPU2,MSIEA,SW,IETVS,CENTER
0,1.0,7139520.0,59.2,12.5,31.0,2.0,2.0,,2.65,0.0,2.0,0.0,1.0,,,,1.0,4076,1.0,55.544046,37.478055,11.0,304,2187,2021.0,3.0,9.0,0.0,1.0,1.0,1.0,,,Train,18.111865,38.010013,27.951770,11.870307,39.304628,29.775580,29.710859,22.400143,35.060803,35.247152,25.844792,23.464999,35.995419,13.279610,32.923875,29.488939,12.028155,20.876708,24.995381
1,,10500000.0,88.0,14.2,48.0,18.0,3.0,1.0,,2.0,0.0,1.0,1.0,1.0,0.0,3.0,1.0,1893,0.0,55.861282,37.666647,2.0,1729,364,2010.0,3.0,25.0,0.0,1.0,1.0,1.0,,0.0,Train,19.533059,2.547822,10.653579,26.340436,6.750784,11.272004,17.379333,23.523682,12.327268,15.709482,13.197512,15.374732,11.947236,29.682923,4.553223,17.204777,25.618077,22.513163,12.326286
2,1.0,9019650.0,78.5,22.5,40.8,12.0,3.0,,2.65,0.0,2.0,1.0,1.0,,,,1.0,5176,1.0,55.663299,37.515335,6.0,871,1146,2021.0,3.0,15.0,0.0,1.0,1.0,1.0,,,Train,4.628625,24.991497,15.650362,6.264921,25.825705,16.430508,20.339160,16.074171,23.969873,22.631718,12.423439,12.011991,22.851090,12.313719,19.837003,17.218845,1.661160,10.104800,12.021881
3,,10500000.0,88.0,14.0,48.0,18.0,3.0,,,0.0,2.0,1.0,1.0,0.0,1.0,2.0,1.0,1893,0.0,55.861282,37.666647,2.0,1729,364,2010.0,3.0,25.0,0.0,1.0,1.0,1.0,,0.0,Train,19.533059,2.547822,10.653579,26.340436,6.750784,11.272004,17.379333,23.523682,12.327268,15.709482,13.197512,15.374732,11.947236,29.682923,4.553223,17.204777,25.618077,22.513163,12.326286
4,,13900000.0,78.0,17.0,35.0,7.0,2.0,1.0,2.90,1.0,0.0,1.0,1.0,0.0,1.0,3.0,1.0,6604,0.0,55.590785,37.451438,11.0,1758,606,2017.0,2.0,15.0,0.0,1.0,1.0,1.0,0.0,0.0,Train,13.517872,34.007596,24.438289,9.870446,34.507853,24.782501,27.712964,21.463420,32.294731,29.909488,20.972263,20.347375,30.830653,13.701552,28.855793,24.114381,7.394659,15.428755,21.009646
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9932,1.0,,106.0,19.9,56.7,16.0,3.0,,3.30,1.0,0.0,,,0.0,1.0,,1.0,4252,1.0,55.789750,37.456468,8.0,1695,71,2020.0,2.0,20.0,0.0,1.0,1.0,0.0,,0.0,Test,10.515605,17.555014,14.582190,19.923091,13.655803,4.131266,23.560505,24.572269,22.845900,8.085602,4.473483,15.542601,8.727052,25.490019,13.471771,3.428343,15.637553,7.925068,11.001288
9933,,,82.0,,,3.0,3.0,,,2.0,0.0,1.0,0.0,,,1.0,2.0,7380,1.0,55.751639,37.516260,7.0,430,2015,2021.0,2.0,20.0,0.0,1.0,1.0,1.0,,,Test,5.321818,16.996794,10.719495,14.557400,16.140509,6.601784,19.010185,19.069432,19.634236,13.478993,2.610483,10.408307,13.126107,19.932763,12.036557,9.006031,11.210033,7.859205,6.501047
9934,,,49.3,,,15.0,1.0,,,1.0,1.0,,,,,,1.0,1480,1.0,55.770659,37.375234,7.0,854,1667,2016.0,2.0,24.0,0.0,1.0,1.0,0.0,1.0,1.0,Test,12.098199,23.067652,19.452311,20.954045,18.468046,9.601405,28.075381,27.819939,28.051136,10.123967,9.263245,19.498814,12.485694,26.917108,18.957701,4.515351,15.401594,5.177510,15.462811
9935,,,38.8,10.5,15.1,14.0,1.0,,3.30,1.0,0.0,0.0,1.0,0.0,1.0,0.0,2.0,2154,0.0,55.699943,37.637183,5.0,1387,1,2019.0,,14.0,1.0,1.0,1.0,0.0,1.0,,Test,6.899013,18.350186,7.961998,8.436113,21.833410,14.682821,11.754497,9.587889,15.449759,22.478018,11.010539,3.483124,21.201096,11.885806,13.519869,18.518273,10.214877,15.221214,6.092644


# Feature engineering 

**Clustering**

In [25]:
def lat_long_clustering(df, df_test, n_clusters=400, max_iter=10000, random_state=42):
    """
    Assign every row to a geographic K-Means cluster.

    K-Means is fitted on the (latitude, longitude) pairs of ``df`` only, with
    each training row weighted by log2(price). The fitted model is then used
    to label the rows of both frames.

    Keyword arguments:
    df -- training frame; must contain 'latitude', 'longitude' and 'price'
    df_test -- test frame; must contain 'latitude' and 'longitude'
    n_clusters -- number of clusters (default 400)
    max_iter -- iteration cap for a single K-Means run (default 10000)
    random_state -- seed for the k-means++ initialisation (default 42)

    Returns copies of ``df`` and ``df_test`` with a 'cluster' column added
    (an existing 'cluster' column is overwritten). The frames passed in are
    left untouched.
    """
    from sklearn.cluster import KMeans

    coord_cols = ['latitude', 'longitude']

    # Work on copies: callers typically pass slices of a larger frame, and
    # writing a new column into such a slice would mutate the caller's data.
    df = df.copy()
    df_test = df_test.copy()

    k_means = KMeans(
        n_clusters=n_clusters,
        max_iter=max_iter,
        init='k-means++',
        random_state=random_state,
    )
    # Weighting by log2(price) lets more expensive flats pull the centroids harder.
    k_means.fit(df[coord_cols], sample_weight=np.log2(df['price']))

    df['cluster'] = k_means.predict(df[coord_cols])
    df_test['cluster'] = k_means.predict(df_test[coord_cols])

    return df, df_test

# Create an empty 'cluster' column, then overwrite the Train and Test rows with the
# frames returned by lat_long_clustering, which carry the cluster labels.
# Deep copies of the two slices are passed so the call never writes into data_all itself.
# NOTE(review): assigning whole frames through a boolean row mask relies on pandas'
# row-wise __setitem__; `data_all.loc[mask, 'cluster'] = ...` would state the intent
# more directly -- confirm it is equivalent (dtypes included) before changing.
data_all['cluster'] = np.nan
data_all[data_all['Split']=='Train'], data_all[data_all['Split']=='Test'] = lat_long_clustering(deepcopy(data_all[data_all['Split']=='Train']), deepcopy(data_all[data_all['Split']=='Test']))

# Same procedure for data_all_cat, the frame that keeps the raw (not one-hot) categorical columns.
data_all_cat['cluster'] = np.nan
data_all_cat[data_all_cat['Split']=='Train'], data_all_cat[data_all_cat['Split']=='Test'] = lat_long_clustering(deepcopy(data_all_cat[data_all_cat['Split']=='Train']), deepcopy(data_all_cat[data_all_cat['Split']=='Test']))

In [26]:
# 'penthouse' = floor * (floor / stories - 0.5) ** 3.
# The cubed term is negative below the middle of the building and positive above it;
# multiplying by the floor number amplifies it for flats that are high up.
for _frame in (data_all, data_all_cat):
    _height_offset = _frame['floor'] / _frame['stories'] - 0.5
    _frame['penthouse'] = (_frame['floor'] * _height_offset ** 3).astype(float)

In [27]:
# Eyeball the new feature next to the two columns it is computed from.
data_all[['penthouse', 'floor', 'stories']]

Unnamed: 0,penthouse,floor,stories
0,-0.042867,2.0,9.0
1,0.191664,18.0,25.0
2,0.324000,12.0,15.0
3,0.191664,18.0,25.0
4,-0.000259,7.0,15.0
...,...,...,...
9932,0.432000,16.0,20.0
9933,-0.128625,3.0,20.0
9934,0.029297,15.0,24.0
9935,1.750000,14.0,14.0


In [28]:
# Numeric columns of the training rows only -- the input for the correlation table.
data_corr = (
    data_all
    .loc[data_all['Split'].eq('Train')]
    .select_dtypes(include='number')
)

In [29]:
# Pairwise correlations of the numeric training columns, with the rows ordered by
# their correlation with the target (strongest positive first).
corr = data_corr.corr().sort_values('price', ascending=False)
corr['price']

price                     1.000000
area_total                0.788478
area_living               0.760249
ceiling                   0.456730
area_kitchen              0.433733
rooms_6.0                 0.369085
bathrooms_shared          0.328290
rooms_5.0                 0.312307
parking_0.0               0.265516
rooms_4.0                 0.259054
condition_3.0             0.255601
bathrooms_private_4.0     0.232936
elevator_without_1.0      0.192696
bathrooms_private_3.0     0.163144
material_2.0              0.146694
material_5.0              0.136744
new_0.0                   0.105045
heating_3.0               0.104359
condition_0.0             0.101214
floor                     0.099645
windows_street_1.0        0.091655
balconies_4.0             0.081686
garbage_chute_1.0         0.073092
stories                   0.065851
latitude                  0.064700
parking_2.0               0.064606
layout_2.0                0.062309
building_id               0.059104
bathrooms_private_2.

# NaN values after EDA and feature engineering



In [30]:
# Inspect the frame with raw categorical columns; NaN cells are values still missing at this stage.
data_all_cat

Unnamed: 0,seller,price,area_total,area_kitchen,area_living,floor,rooms,layout,ceiling,bathrooms_shared,bathrooms_private,windows_court,windows_street,balconies,loggias,condition,phones,building_id,new,latitude,longitude,district,street,address,constructed,material,stories,elevator_without,elevator_passenger,elevator_service,parking,garbage_chute,heating,Split,MSU,MSUCE,BMSTU,IUFS,LIS,NW,MUG,KKAR,IESR,RCTU,MEI,MPU,RMANPO,IIEP,MPU2,MSIEA,SW,IETVS,CENTER,cluster,penthouse
0,1.0,7139520.0,59.2,12.5,31.0,2.0,2.0,,2.65,0.0,2.0,0.0,1.0,,,,1.0,4076,1.0,55.544046,37.478055,11.0,304,2187,2021.0,3.0,9.0,0.0,1.0,1.0,1.0,,,Train,18.111865,38.010013,27.951770,11.870307,39.304628,29.775580,29.710859,22.400143,35.060803,35.247152,25.844792,23.464999,35.995419,13.279610,32.923875,29.488939,12.028155,20.876708,24.995381,155.0,-0.042867
1,,10500000.0,88.0,14.2,48.0,18.0,3.0,1.0,,2.0,0.0,1.0,1.0,1.0,0.0,3.0,1.0,1893,0.0,55.861282,37.666647,2.0,1729,364,2010.0,3.0,25.0,0.0,1.0,1.0,1.0,,0.0,Train,19.533059,2.547822,10.653579,26.340436,6.750784,11.272004,17.379333,23.523682,12.327268,15.709482,13.197512,15.374732,11.947236,29.682923,4.553223,17.204777,25.618077,22.513163,12.326286,248.0,0.191664
2,1.0,9019650.0,78.5,22.5,40.8,12.0,3.0,,2.65,0.0,2.0,1.0,1.0,,,,1.0,5176,1.0,55.663299,37.515335,6.0,871,1146,2021.0,3.0,15.0,0.0,1.0,1.0,1.0,,,Train,4.628625,24.991497,15.650362,6.264921,25.825705,16.430508,20.339160,16.074171,23.969873,22.631718,12.423439,12.011991,22.851090,12.313719,19.837003,17.218845,1.661160,10.104800,12.021881,247.0,0.324000
3,,10500000.0,88.0,14.0,48.0,18.0,3.0,,,0.0,2.0,1.0,1.0,0.0,1.0,2.0,1.0,1893,0.0,55.861282,37.666647,2.0,1729,364,2010.0,3.0,25.0,0.0,1.0,1.0,1.0,,0.0,Train,19.533059,2.547822,10.653579,26.340436,6.750784,11.272004,17.379333,23.523682,12.327268,15.709482,13.197512,15.374732,11.947236,29.682923,4.553223,17.204777,25.618077,22.513163,12.326286,248.0,0.191664
4,,13900000.0,78.0,17.0,35.0,7.0,2.0,1.0,2.90,1.0,0.0,1.0,1.0,0.0,1.0,3.0,1.0,6604,0.0,55.590785,37.451438,11.0,1758,606,2017.0,2.0,15.0,0.0,1.0,1.0,1.0,0.0,0.0,Train,13.517872,34.007596,24.438289,9.870446,34.507853,24.782501,27.712964,21.463420,32.294731,29.909488,20.972263,20.347375,30.830653,13.701552,28.855793,24.114381,7.394659,15.428755,21.009646,107.0,-0.000259
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9932,1.0,,106.0,19.9,56.7,16.0,3.0,,3.30,1.0,0.0,,,0.0,1.0,,1.0,4252,1.0,55.789750,37.456468,8.0,1695,71,2020.0,2.0,20.0,0.0,1.0,1.0,0.0,,0.0,Test,10.515605,17.555014,14.582190,19.923091,13.655803,4.131266,23.560505,24.572269,22.845900,8.085602,4.473483,15.542601,8.727052,25.490019,13.471771,3.428343,15.637553,7.925068,11.001288,280.0,0.432000
9933,,,82.0,,,3.0,3.0,,,2.0,0.0,1.0,0.0,,,1.0,2.0,7380,1.0,55.751639,37.516260,7.0,430,2015,2021.0,2.0,20.0,0.0,1.0,1.0,1.0,,,Test,5.321818,16.996794,10.719495,14.557400,16.140509,6.601784,19.010185,19.069432,19.634236,13.478993,2.610483,10.408307,13.126107,19.932763,12.036557,9.006031,11.210033,7.859205,6.501047,259.0,-0.128625
9934,,,49.3,,,15.0,1.0,,,1.0,1.0,,,,,,1.0,1480,1.0,55.770659,37.375234,7.0,854,1667,2016.0,2.0,24.0,0.0,1.0,1.0,0.0,1.0,1.0,Test,12.098199,23.067652,19.452311,20.954045,18.468046,9.601405,28.075381,27.819939,28.051136,10.123967,9.263245,19.498814,12.485694,26.917108,18.957701,4.515351,15.401594,5.177510,15.462811,203.0,0.029297
9935,,,38.8,10.5,15.1,14.0,1.0,,3.30,1.0,0.0,0.0,1.0,0.0,1.0,0.0,2.0,2154,0.0,55.699943,37.637183,5.0,1387,1,2019.0,,14.0,1.0,1.0,1.0,0.0,1.0,,Test,6.899013,18.350186,7.961998,8.436113,21.833410,14.682821,11.754497,9.587889,15.449759,22.478018,11.010539,3.483124,21.201096,11.885806,13.519869,18.518273,10.214877,15.221214,6.092644,240.0,1.750000


In [31]:
# Positional indices of the columns LightGBM should treat as categorical.
# The positions refer to X_train_cat, i.e. data_all_cat without 'price' and 'Split'.
# The names in the comments were derived from the data_all_cat column order displayed
# above -- re-derive them whenever columns are added, removed or reordered.
lgbm_categorical = (
    0,   # seller
    6,   # layout
    10,  # windows_court
    11,  # windows_street
    14,  # condition
    16,  # building_id
    17,  # new
    20,  # district
    21,  # street
    22,  # address
    24,  # material
    25,  # stories -- NOTE(review): a count; confirm it is meant to be categorical
    26,  # elevator_without
    27,  # elevator_passenger
    28,  # elevator_service
    29,  # parking
    30,  # garbage_chute
    31,  # heating
    51,  # cluster
)


# Stacking

In [32]:
# Seed shared by the base learners defined in this cell.
SEED = 42

# Base learner 1: random forest (later fitted on the one-hot matrix X_train).
model1 = RandomForestRegressor(n_estimators=1500, random_state=SEED, n_jobs=-1, verbose=1)

# Base learner 2: scikit-learn gradient boosting (later fitted on X_train).
model2 = GradientBoostingRegressor(
    learning_rate=0.06, n_estimators=600,
    max_depth=9, min_samples_leaf=4,
    random_state=SEED, verbose=1,
)
# Base learner 3: LightGBM (later fitted on X_train_cat); the columns listed in
# lgbm_categorical are declared categorical by position.
model3 = lgb.LGBMRegressor(
    n_estimators=6000, learning_rate=0.08, num_leaves=10,
    random_state=SEED, seed=SEED,
    n_jobs=-1,
    categorical_feature=lgbm_categorical,
)
# model4 = xgb.XGBRegressor(
#     n_estimators=5000,
#     learning_rate=0.14,
#     n_jobs=-1, 
#     random_state=SEED,
#     max_depth = 4,
#     seed=SEED,
#     verbosity=1,
# )
# model5 = CatBoostRegressor(
#     n_estimators=2500,
#     learning_rate=0.1,
#     thread_count=-1,
#     depth=9,
#     random_seed=SEED,
#     silent=True,
# #     cat_features = ["layout", "condition", "new", "material", "seller", "parking", "heating", "district"],
# )
# Base learner 4: XGBoost (later fitted on the one-hot matrix X_train).
model4 = xgb.XGBRegressor(
    n_estimators=5000, learning_rate=0.01,
    max_depth=8, subsample=0.8, gamma=0.0,
    random_state=SEED, seed=SEED,
    n_jobs=-1, verbosity=1,
)

# Base learner 5: CatBoost with leaf-wise ('Lossguide') tree growth (later fitted on X_train_cat).
model5 = CatBoostRegressor(
    n_estimators=5000, learning_rate=0.06322764426255192,
    grow_policy='Lossguide', depth=6, num_leaves=24, min_child_samples=16,
    random_seed=SEED, thread_count=-1, silent=True,
#     cat_features = ["layout", "condition", "new", "material", "seller", "parking", "heating", "district"],
)

In [33]:
# Persist both engineered frames to CSV in the working directory.
for _frame, _csv_path in ((data_all, 'data_allNew.csv'), (data_all_cat, 'data_all_catNew.csv')):
    _frame.to_csv(_csv_path)

In [34]:
# Inspect the one-hot encoded frame that was just written to disk.
data_all

Unnamed: 0,price,area_total,area_kitchen,area_living,floor,ceiling,bathrooms_shared,building_id,latitude,longitude,district,street,address,constructed,stories,MSU,MSUCE,BMSTU,IUFS,LIS,NW,MUG,KKAR,IESR,RCTU,MEI,MPU,RMANPO,IIEP,MPU2,MSIEA,SW,IETVS,CENTER,seller_0.0,seller_1.0,rooms_1.0,rooms_2.0,rooms_3.0,rooms_4.0,...,balconies_4.0,loggias_0.0,loggias_1.0,loggias_2.0,loggias_3.0,loggias_4.0,condition_0.0,condition_1.0,condition_2.0,condition_3.0,phones_0.0,phones_1.0,phones_2.0,new_0.0,new_1.0,material_0.0,material_1.0,material_2.0,material_3.0,material_4.0,material_5.0,material_6.0,elevator_without_0.0,elevator_without_1.0,elevator_passenger_0.0,elevator_passenger_1.0,elevator_service_0.0,elevator_service_1.0,parking_0.0,parking_1.0,parking_2.0,garbage_chute_0.0,garbage_chute_1.0,heating_0.0,heating_1.0,heating_2.0,heating_3.0,Split,cluster,penthouse
0,7139520.00,59.2,12.500,31.0000,2.0,2.6500,0.00,4076.0,55.544046,37.478055,11.0,304.0,2187.0,2021.00,9.0,18.111865,38.010013,27.951770,11.870307,39.304628,29.775580,29.710859,22.400143,35.060803,35.247152,25.844792,23.464999,35.995419,13.279610,32.923875,29.488939,12.028155,20.876708,24.995381,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Train,155.0,-0.042867
1,10500000.00,88.0,14.200,48.0000,18.0,2.7139,2.00,1893.0,55.861282,37.666647,2.0,1729.0,364.0,2010.00,25.0,19.533059,2.547822,10.653579,26.340436,6.750784,11.272004,17.379333,23.523682,12.327268,15.709482,13.197512,15.374732,11.947236,29.682923,4.553223,17.204777,25.618077,22.513163,12.326286,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,Train,248.0,0.191664
2,9019650.00,78.5,22.500,40.8000,12.0,2.6500,0.00,5176.0,55.663299,37.515335,6.0,871.0,1146.0,2021.00,15.0,4.628625,24.991497,15.650362,6.264921,25.825705,16.430508,20.339160,16.074171,23.969873,22.631718,12.423439,12.011991,22.851090,12.313719,19.837003,17.218845,1.661160,10.104800,12.021881,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Train,247.0,0.324000
3,10500000.00,88.0,14.000,48.0000,18.0,2.7475,0.00,1893.0,55.861282,37.666647,2.0,1729.0,364.0,2010.00,25.0,19.533059,2.547822,10.653579,26.340436,6.750784,11.272004,17.379333,23.523682,12.327268,15.709482,13.197512,15.374732,11.947236,29.682923,4.553223,17.204777,25.618077,22.513163,12.326286,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,Train,248.0,0.191664
4,13900000.00,78.0,17.000,35.0000,7.0,2.9000,1.00,6604.0,55.590785,37.451438,11.0,1758.0,606.0,2017.00,15.0,13.517872,34.007596,24.438289,9.870446,34.507853,24.782501,27.712964,21.463420,32.294731,29.909488,20.972263,20.347375,30.830653,13.701552,28.855793,24.114381,7.394659,15.428755,21.009646,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,Train,107.0,-0.000259
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9932,35698942.88,106.0,19.900,56.7000,16.0,3.3000,1.00,4252.0,55.789750,37.456468,8.0,1695.0,71.0,2020.00,20.0,10.515605,17.555014,14.582190,19.923091,13.655803,4.131266,23.560505,24.572269,22.845900,8.085602,4.473483,15.542601,8.727052,25.490019,13.471771,3.428343,15.637553,7.925068,11.001288,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,Test,280.0,0.432000
9933,21712025.68,82.0,16.262,47.2879,3.0,2.9857,2.00,7380.0,55.751639,37.516260,7.0,430.0,2015.0,2021.00,20.0,5.321818,16.996794,10.719495,14.557400,16.140509,6.601784,19.010185,19.069432,19.634236,13.478993,2.610483,10.408307,13.126107,19.932763,12.036557,9.006031,11.210033,7.859205,6.501047,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Test,259.0,-0.128625
9934,10707863.02,49.3,14.679,19.9850,15.0,2.8873,1.00,1480.0,55.770659,37.375234,7.0,854.0,1667.0,2016.00,24.0,12.098199,23.067652,19.452311,20.954045,18.468046,9.601405,28.075381,27.819939,28.051136,10.123967,9.263245,19.498814,12.485694,26.917108,18.957701,4.515351,15.401594,5.177510,15.462811,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,Test,203.0,0.029297
9935,12590901.55,38.8,10.500,15.1000,14.0,3.3000,1.00,2154.0,55.699943,37.637183,5.0,1387.0,1.0,2019.00,14.0,6.899013,18.350186,7.961998,8.436113,21.833410,14.682821,11.754497,9.587889,15.449759,22.478018,11.010539,3.483124,21.201096,11.885806,13.519869,18.518273,10.214877,15.221214,6.092644,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,Test,240.0,1.750000


In [35]:
# Separate the combined frames back into their Train and Test parts; the helper
# column 'Split' is dropped once it has served its purpose.
data_train_all = data_all.loc[data_all['Split'] == 'Train'].drop(columns='Split')
data_test_all = data_all.loc[data_all['Split'] == 'Test'].drop(columns='Split')

data_train_cat = data_all_cat.loc[data_all_cat['Split'] == 'Train'].drop(columns='Split')
data_test_cat = data_all_cat.loc[data_all_cat['Split'] == 'Test'].drop(columns='Split')

# Feature matrices: every column except the target.
X_train = data_train_all.drop(columns='price')
X_test = data_test_all.drop(columns='price')

X_train_cat = data_train_cat.drop(columns='price')
X_test_cat = data_test_cat.drop(columns='price')

# The models are trained on log2(price); predictions are mapped back with 2 ** y later on.
y_train = np.log2(data_train_all.loc[X_train.index, 'price'])

In [36]:
# Inspect the one-hot training features (target and 'Split' already removed).
X_train

Unnamed: 0,area_total,area_kitchen,area_living,floor,ceiling,bathrooms_shared,building_id,latitude,longitude,district,street,address,constructed,stories,MSU,MSUCE,BMSTU,IUFS,LIS,NW,MUG,KKAR,IESR,RCTU,MEI,MPU,RMANPO,IIEP,MPU2,MSIEA,SW,IETVS,CENTER,seller_0.0,seller_1.0,rooms_1.0,rooms_2.0,rooms_3.0,rooms_4.0,rooms_5.0,...,balconies_3.0,balconies_4.0,loggias_0.0,loggias_1.0,loggias_2.0,loggias_3.0,loggias_4.0,condition_0.0,condition_1.0,condition_2.0,condition_3.0,phones_0.0,phones_1.0,phones_2.0,new_0.0,new_1.0,material_0.0,material_1.0,material_2.0,material_3.0,material_4.0,material_5.0,material_6.0,elevator_without_0.0,elevator_without_1.0,elevator_passenger_0.0,elevator_passenger_1.0,elevator_service_0.0,elevator_service_1.0,parking_0.0,parking_1.0,parking_2.0,garbage_chute_0.0,garbage_chute_1.0,heating_0.0,heating_1.0,heating_2.0,heating_3.0,cluster,penthouse
0,59.2,12.5,31.0000,2.0,2.6500,0.00,4076.0,55.544046,37.478055,11.0,304.0,2187.0,2021.0,9.0,18.111865,38.010013,27.951770,11.870307,39.304628,29.775580,29.710859,22.400143,35.060803,35.247152,25.844792,23.464999,35.995419,13.279610,32.923875,29.488939,12.028155,20.876708,24.995381,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,155.0,-0.042867
1,88.0,14.2,48.0000,18.0,2.7139,2.00,1893.0,55.861282,37.666647,2.0,1729.0,364.0,2010.0,25.0,19.533059,2.547822,10.653579,26.340436,6.750784,11.272004,17.379333,23.523682,12.327268,15.709482,13.197512,15.374732,11.947236,29.682923,4.553223,17.204777,25.618077,22.513163,12.326286,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,248.0,0.191664
2,78.5,22.5,40.8000,12.0,2.6500,0.00,5176.0,55.663299,37.515335,6.0,871.0,1146.0,2021.0,15.0,4.628625,24.991497,15.650362,6.264921,25.825705,16.430508,20.339160,16.074171,23.969873,22.631718,12.423439,12.011991,22.851090,12.313719,19.837003,17.218845,1.661160,10.104800,12.021881,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,247.0,0.324000
3,88.0,14.0,48.0000,18.0,2.7475,0.00,1893.0,55.861282,37.666647,2.0,1729.0,364.0,2010.0,25.0,19.533059,2.547822,10.653579,26.340436,6.750784,11.272004,17.379333,23.523682,12.327268,15.709482,13.197512,15.374732,11.947236,29.682923,4.553223,17.204777,25.618077,22.513163,12.326286,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,248.0,0.191664
4,78.0,17.0,35.0000,7.0,2.9000,1.00,6604.0,55.590785,37.451438,11.0,1758.0,606.0,2017.0,15.0,13.517872,34.007596,24.438289,9.870446,34.507853,24.782501,27.712964,21.463420,32.294731,29.909488,20.972263,20.347375,30.830653,13.701552,28.855793,24.114381,7.394659,15.428755,21.009646,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,107.0,-0.000259
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23280,65.0,16.0,32.0000,3.0,3.0000,0.00,3139.0,55.770775,37.376626,7.0,854.0,1668.0,2017.0,24.0,12.037019,22.983329,19.365280,20.906658,18.399611,9.518163,27.991173,27.747618,27.962967,10.089865,9.175341,19.418162,12.430542,26.866549,18.870407,4.462226,15.368601,5.164828,15.377642,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,203.0,-0.158203
23281,56.9,9.6,36.5000,6.0,3.2241,0.69,7624.0,55.785777,37.585790,1.0,303.0,636.0,2020.0,20.0,9.779186,11.233093,6.600914,17.534405,11.941410,5.477051,15.759378,18.338786,14.750961,13.393136,4.145669,8.900796,11.388984,21.904932,6.253308,11.245654,15.915249,13.443340,4.151457,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,-0.048000
23282,73.4,23.0,29.3000,16.0,2.9468,2.00,8021.0,55.753686,37.513143,7.0,262.0,1222.0,2018.0,16.0,5.576513,16.975225,10.881278,14.837920,15.964554,6.366417,19.233854,19.357770,19.773195,13.186257,2.426128,10.664969,12.865810,20.226880,12.046015,8.707912,11.420446,7.763608,6.692260,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,259.0,2.000000
23283,100.8,20.0,58.2610,4.0,2.9300,0.00,4591.0,55.611302,37.696741,5.0,1087.0,1173.0,1997.0,10.0,14.843783,27.692254,17.256789,6.787076,32.271312,25.107959,15.209754,6.999371,21.614893,32.710273,21.242422,12.545383,31.696573,2.561174,23.370321,28.195313,13.206908,22.659793,16.585287,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,374.0,-0.004000


In [37]:
# Cross-validation set-up for the out-of-fold predictions.
SEED = 42    # for reproducibility
NFOLDS = 5   # number of folds for out-of-fold prediction
ntrain, ntest = X_train.shape[0], X_test.shape[0]
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)  # K-Folds cross-validator

def get_oof(clf, x_train, y_train, x_test, cv=None):
    """
    Build out-of-fold (OOF) predictions for one base model of the stack.

    For every fold the model is refitted on the training part of the fold and
    predicts the held-out part, so each training row ends up with a prediction
    from a model that never saw it. These predictions form one column of
    meta-features for the meta-model. The test set is predicted by every fold
    model and the fold predictions are averaged.

    Buffer sizes are taken from the arguments themselves, so the function
    works for inputs of any length.

    Keyword arguments:
    clf -- estimator with fit(X, y) and predict(X); it is refitted once per fold
    x_train -- complete training feature matrix; must support indexing with
               integer arrays on the first axis (e.g. a NumPy array)
    y_train -- training targets, indexable the same way as x_train
    x_test -- complete test feature matrix
    cv -- splitter providing split(X) and get_n_splits(X); defaults to the
          notebook-level ``kf``

    Returns (oof_train, oof_test), column vectors of shape (n_train, 1) and
    (n_test, 1).
    """
    splitter = kf if cv is None else cv
    n_train = x_train.shape[0]
    n_test = x_test.shape[0]
    n_folds = splitter.get_n_splits(x_train)

    oof_train = np.zeros((n_train,))
    # One row of test predictions per fold model; averaged after the loop.
    oof_test_folds = np.empty((n_folds, n_test))

    for fold, (fit_idx, holdout_idx) in enumerate(splitter.split(x_train)):
        clf.fit(x_train[fit_idx], y_train[fit_idx])
        oof_train[holdout_idx] = clf.predict(x_train[holdout_idx])
        oof_test_folds[fold, :] = clf.predict(x_test)

    oof_test = oof_test_folds.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [38]:
# Convert the feature frames and the target to plain NumPy arrays, because get_oof
# indexes them positionally with the fold index arrays.
# np.asarray is used instead of `.values` / `Series.ravel()` so the cell can be re-run:
# on an input that is already an array it is a no-op rather than an AttributeError.
X_train = np.asarray(X_train)
X_test = np.asarray(X_test)
X_train_cat = np.asarray(X_train_cat)
X_test_cat = np.asarray(X_test_cat)
y_train = np.asarray(y_train).ravel()

In [39]:
# Out-of-fold predictions for the first four base models.
# model1 (random forest), model2 (gradient boosting) and model4 (XGBoost) are fed the
# one-hot matrix X_train; model3 (LightGBM) is fed X_train_cat.
rf_oof_train, rf_oof_test = get_oof(model1, X_train, y_train, X_test)
gb_oof_train, gb_oof_test = get_oof(model2, X_train, y_train, X_test)
lgb_oof_train, lgb_oof_test = get_oof(model3, X_train_cat, y_train, X_test_cat)
xgb_oof_train, xgb_oof_test = get_oof(model4, X_train, y_train, X_test) # TODO: set objective='reg:squarederror' explicitly?


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done 402 tasks      | elapsed:   19.4s
[Parallel(n_jobs=-1)]: Done 752 tasks      | elapsed:   32.9s
[Parallel(n_jobs=-1)]: Done 1202 tasks      | elapsed:   49.6s
[Parallel(n_jobs=-1)]: Done 1500 out of 1500 | elapsed:  1.0min finished
[Parallel(n_jobs=24)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=24)]: Done 152 tasks      | elapsed:    0.1s
[Parallel(n_jobs=24)]: Done 402 tasks      | elapsed:    0.2s
[Parallel(n_jobs=24)]: Done 752 tasks      | elapsed:    0.3s
[Parallel(n_jobs=24)]: Done 1202 tasks      | elapsed:    0.4s
[Parallel(n_jobs=24)]: Done 1500 out of 1500 | elapsed:    0.5s finished
[Parallel(n_jobs=24)]: Using backend ThreadingBackend with 24 concur

      Iter       Train Loss   Remaining Time 
         1           1.3649            4.90m
         2           1.2142            4.88m
         3           1.0804            4.86m
         4           0.9622            4.87m
         5           0.8574            4.85m
         6           0.7646            4.84m
         7           0.6823            4.83m
         8           0.6093            4.83m
         9           0.5443            4.82m
        10           0.4867            4.82m
        20           0.1696            4.72m
        30           0.0693            4.62m
        40           0.0350            4.54m
        50           0.0219            4.46m
        60           0.0162            4.40m
        70           0.0132            4.33m
        80           0.0112            4.26m
        90           0.0097            4.18m
       100           0.0087            4.10m
       200           0.0057            3.43m
       300           0.0041            2.62m
       40

In [40]:
# Out-of-fold predictions for model5 (CatBoost), fed X_train_cat; kept in its own cell.
cat_oof_train, cat_oof_test = get_oof(model5, X_train_cat, y_train, X_test_cat)

In [46]:
# Meta-feature matrices for the second stage: one column per base model, in the order
# random forest, gradient boosting, LightGBM, XGBoost, CatBoost.
x_train = np.hstack([rf_oof_train, gb_oof_train, lgb_oof_train, xgb_oof_train, cat_oof_train])

# Same column order for the fold-averaged test predictions.
x_test = np.hstack([rf_oof_test, gb_oof_test, lgb_oof_test, xgb_oof_test, cat_oof_test])

In [47]:
# META_MODEL = lgb.LGBMRegressor(
#     num_leaves=5,
#     max_depth=7, 
#     random_state=SEED, 
#     silent=True, 
#     metric='mse',
#     n_jobs=4, 
#     n_estimators=200,
#     colsample_bytree=1,
#     subsample=0.9,
#     learning_rate=0.05
# )
# META_MODEL = LinearRegression(
#     n_jobs=-1,
# )
# Second stage: ridge regression with built-in 5-fold CV, fitted on the base models'
# out-of-fold predictions. The target is log2(price), hence the 2 ** back-transform.
META_MODEL = RidgeCV(cv=5).fit(x_train, y_train)
final_predictions = 2 ** META_MODEL.predict(x_test)

# final_predictions = np.average(
#     [
#      rf_oof_test,
#      gb_oof_test,
#      lgb_oof_test,
#      xgb_oof_test,
#      cat_oof_test
#     ],
#     weights = 1 / acc['RMSLE']**9,
#     axis=0
# )
# final_predictions = np.power(2, final_predictions)
# Display the predictions after they have been mapped back from log2 to the price scale.
final_predictions

array([29268363.3803488 ,  9624909.61620726,  6073326.73167329, ...,
        9268469.12955655,  9987809.24606033,  7071743.48549537])

In [48]:
# Smallest predicted price -- a quick sanity check on the lower end of the predictions.
# ndarray.min() is vectorised and propagates NaN, so a broken prediction shows up here
# instead of possibly being masked by the order-dependent built-in min().
min_pred = final_predictions.min()
print(min_pred)

2268179.216626703


# To CSV

In [49]:
# Submission frame: test-set ids next to the predicted prices.
# NOTE(review): assumes the rows of data_test are in the same order as X_test -- confirm.
submission = pd.DataFrame({
    'id': data_test.id,
    'price_prediction': final_predictions,
})
submission

# Construct submission dataframe
# submission = pd.DataFrame()
# submission['id'] = data_test.id
# submission.loc[~X_test_nan, 'price_prediction'] = prediction # Predict on non-nan entries
# submission['price_prediction'].fillna(y_train.mean(), inplace=True) # Fill missing entries with mean predictor
# print(f'Generated {len(submission)} predictions')

# submission.loc[~X_test_nan, 'price_prediction'] = prediction # Predict on non-nan entries
# submission['price_prediction'].fillna(y_train.mean(), inplace=True) # Fill missing entries with mean predictor
# print(f'Generated {len(submission)} predictions')

Unnamed: 0,id,price_prediction
0,23285,2.926836e+07
1,23286,9.624910e+06
2,23287,6.073327e+06
3,23288,7.953853e+06
4,23289,5.113096e+06
...,...,...
9932,33217,3.636619e+07
9933,33218,1.780716e+07
9934,33219,9.268469e+06
9935,33220,9.987809e+06


In [50]:
# Write the submission file without the DataFrame index.
# NOTE(review): the file name contains non-ASCII characters -- confirm that is acceptable wherever the file is used.
submission.to_csv('STACKEDXGBCATløøøø.csv', index=False)