# Задача с хакатона Райффайзенбанка Raifhack DS 2021
## Оценка стоимости коммерческой недвижимости: На основании данных о продаже коммерческой недвижимости построить модель, прогнозирующую изменение цен.
## Pipeline состоит из предобработки и чистки данных и ансамбля из трех моделей (LightGBM, XGBoost, CatBoost)

In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from metrics import deviation_metric

import preprocess

import lightgbm as lgbm 
import xgboost as xgb
import catboost

# Read the training and test tables; in both, the `id` column becomes the
# DataFrame index.
df, X_test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in ('train.csv', 'test.csv')
)

# Notebook preview of the first training rows.
df.head()

Unnamed: 0_level_0,city,floor,lat,lng,osm_amenity_points_in_0.001,osm_amenity_points_in_0.005,osm_amenity_points_in_0.0075,osm_amenity_points_in_0.01,osm_building_points_in_0.001,osm_building_points_in_0.005,...,reform_mean_floor_count_1000,reform_mean_floor_count_500,reform_mean_year_building_1000,reform_mean_year_building_500,region,total_square,street,date,realty_type,price_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
COL_0,Пермь,,57.998207,56.292797,4,19,35,52,0,0,...,5.762963,5.530612,1964.118519,1960.959184,Пермский край,32.0,S27289,2020-01-05,10,0
COL_1,Шатура,,55.574284,39.543835,3,24,37,59,0,0,...,2.894366,3.527778,1952.321678,1957.222222,Московская область,280.0,S17052,2020-01-05,10,0
COL_2,Ярославль,,57.61914,39.850525,1,30,67,128,0,0,...,6.141414,7.222222,1968.15,1973.37037,Ярославская область,297.4,S16913,2020-01-05,110,0
COL_3,Новокузнецк,,53.897083,87.108604,0,0,5,21,0,0,...,8.581081,9.0,1992.716216,2014.0,Кемеровская область,190.0,S10148,2020-01-05,110,0
COL_4,Москва,,55.80259,37.48711,1,23,64,153,0,1,...,7.263889,5.684211,1963.229167,1960.5,Москва,60.2,S1338,2020-01-05,10,0


# Preprocessing

In [2]:
# Build the project's preprocessing helper. It is handed a default
# LGBMRegressor; what the helper does with that model is defined in
# preprocess.py and is not visible from this notebook.
processor = preprocess.Processing(model=lgbm.LGBMRegressor())
# Process the training frame: returns the feature matrix and the target taken
# from the 'per_square_meter_price' column.
# NOTE(review): later cells apply np.expm1 to both targets and predictions, so
# the target is presumably log1p-transformed inside _process -- confirm.
X_feat, y = processor._process(data=df, target_col='per_square_meter_price')

# Process the test frame with the same processor object in 'test' mode. There
# is no target column, so the second return value is discarded.
X_test, _ = processor._process(X_test, target_col=None, price_type=1, mode='test')

# LGBM

In [3]:
# # Hyperparameter search with Optuna (kept for reference; commented out, not executed)
# import optuna
# from optuna.samplers import TPESampler
# sampler = TPESampler(seed=13)

# def create_model(trial):
#     num_leaves = trial.suggest_int("num_leaves", 2, 500)
#     n_estimators = trial.suggest_int("n_estimators", 50, 500)
#     max_depth = trial.suggest_int('max_depth', 2, 50)
#     learning_rate = trial.suggest_uniform('learning_rate', 0.00001, 0.99)
#     min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 2, 300)
    
#     model = lgbm.LGBMRegressor(
#         num_leaves=num_leaves,
#         n_estimators=n_estimators, 
#         max_depth=max_depth, 
#         learning_rate=learning_rate,
#         min_data_in_leaf=min_data_in_leaf,
#         boosting_type='rf',
#         bagging_fraction=0.8,
#         bagging_freq=5
# )
#     return model

# def objective(trial):
#     model = create_model(trial)
#     X_train, X_test, y_train, y_test = train_test_split(X_feat, y, 
#                                         random_state=np.random.randint(1, 10000))
#     model.fit(X_train, y_train, 
#                 eval_set=[(X_train, y_train), (X_test, y_test)], early_stopping_rounds=20, verbose=False)

#     result = model.predict(X_test)
#     #---
#     model.fit(X_train, y_train)
#     result1 = model.predict(X_train)
#     #---
#     score = abs(deviation_metric(np.expm1(y_test.values), np.expm1(result)) - \
#             deviation_metric(np.expm1(y_train.values), np.expm1(result1)))
#     return score

# study = optuna.create_study(direction="minimize", sampler=sampler)
# study.optimize(objective, n_trials=200)
# model_params = study.best_params

In [4]:
# Hyper-parameters of the LightGBM regressor used in the cross-validation
# cell below; the dict is unpacked straight into the estimator constructor.
model_params = dict(
    n_estimators=75,
    learning_rate=0.01,
    reg_alpha=1,
    num_leaves=40,
    min_child_samples=5,
    importance_type="gain",
    n_jobs=1,
    random_state=563,
)
model = lgbm.LGBMRegressor(**model_params)

In [5]:
# Repeated K-fold cross-validation that doubles as test-time bagging.
# For every KFold seed the data is split into `n_splits` folds; the model is
# refit on each training part, scored on the held-out part with
# `deviation_metric`, and then used to predict the test set. The test
# predictions of all n_splits * len(kf_random_state) fits are averaged into
# `y_predict_lgbm`.
kf_random_state = [21, 42, 228, 777, 69]
n_splits = 5
n_fits = n_splits * len(kf_random_state)
# The metric is asymmetric (a predicted price above the true one is penalised
# harder), so the averaged prediction is scaled down by this factor.
shrink_factor = 0.94
# KFold yields integer positions. Indexing a label-indexed pandas Series with
# integers relies on a deprecated positional fallback, so take a plain
# positional array of the target once, up front.
y_values = np.asarray(y)
y_predict_lgbm = np.zeros(X_test.shape[0])
for rand_state in kf_random_state:
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=rand_state)
    scores = []  # fold losses of the current repetition only
    for train_index, val_index in kf.split(X_feat):
        X_train, X_val = X_feat.iloc[train_index], X_feat.iloc[val_index]
        y_train, y_val = y_values[train_index], y_values[val_index]

        # Score on the held-out fold; target and prediction are both passed
        # through expm1 before the metric is computed.
        model.fit(X_train, y_train)
        y_predict = model.predict(X_val)
        scores.append(deviation_metric(np.expm1(y_val), np.expm1(y_predict)))
        print('On validation set: %f' % scores[-1])

        # Add this fit's share of the averaged, scaled test prediction.
        y_predict = model.predict(X_test)
        y_predict_lgbm += np.expm1(y_predict) / n_fits * shrink_factor

    print('Mean value of the loss: %.2f\n'
          '    Relative standard deviation: %.2f %%'
          % (np.mean(scores), np.std(scores) / np.mean(scores) * 100))

On validation set: 1.375080
On validation set: 1.435074
On validation set: 1.461219
On validation set: 1.655304
On validation set: 1.651168
Mean value of the loss: 1.52
    Relative standard deviation: 7.64 %
On validation set: 1.747400
On validation set: 1.332736
On validation set: 1.535095
On validation set: 1.641826
On validation set: 1.337103
Mean value of the loss: 1.52
    Relative standard deviation: 10.83 %
On validation set: 1.440366
On validation set: 1.390961
On validation set: 1.531765
On validation set: 1.644977
On validation set: 1.582983
Mean value of the loss: 1.52
    Relative standard deviation: 6.09 %
On validation set: 1.438748
On validation set: 1.448612
On validation set: 1.630836
On validation set: 1.663020
On validation set: 1.413603
Mean value of the loss: 1.52
    Relative standard deviation: 6.95 %
On validation set: 1.536509
On validation set: 1.481256
On validation set: 1.640282
On validation set: 1.505803
On validation set: 1.430532
Mean value of the loss: 1.52
    Relative standard deviation: 4.60 %

# XGBoost

In [6]:
# Hyper-parameters of the XGBoost regressor used in the cross-validation cell
# below; the dict is unpacked straight into the estimator constructor.
xgb_params = dict(
    n_estimators=75,
    eta=0.046,
    max_depth=3,
    subsample=0.7,
    colsample_bytree=0.7,
    objective='reg:squarederror',  # alternative noted by the author: 'reg:linear'
    eval_metric='rmse',
)

model = xgb.XGBRegressor(**xgb_params)

In [7]:
# Repeated K-fold cross-validation that doubles as test-time bagging.
# For every KFold seed the data is split into `n_splits` folds; the model is
# refit on each training part, scored on the held-out part with
# `deviation_metric`, and then used to predict the test set. The test
# predictions of all n_splits * len(kf_random_state) fits are averaged into
# `y_predict_xgb`.
kf_random_state = [21, 42, 228, 777, 69]
n_splits = 5
n_fits = n_splits * len(kf_random_state)
# Same down-scaling factor as for the other models of the ensemble: the
# prediction is multiplied by 0.94 before it enters the blend.
shrink_factor = 0.94
# KFold yields integer positions. Indexing a label-indexed pandas Series with
# integers relies on a deprecated positional fallback, so take a plain
# positional array of the target once, up front.
y_values = np.asarray(y)
y_predict_xgb = np.zeros(X_test.shape[0])
for rand_state in kf_random_state:
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=rand_state)
    scores = []  # fold losses of the current repetition only
    for train_index, val_index in kf.split(X_feat):
        X_train, X_val = X_feat.iloc[train_index], X_feat.iloc[val_index]
        y_train, y_val = y_values[train_index], y_values[val_index]

        # Score on the held-out fold; target and prediction are both passed
        # through expm1 before the metric is computed.
        model.fit(X_train, y_train)
        y_predict = model.predict(X_val)
        scores.append(deviation_metric(np.expm1(y_val), np.expm1(y_predict)))
        print('On validation set: %f' % scores[-1])

        # Add this fit's share of the averaged, scaled test prediction.
        y_predict = model.predict(X_test)
        y_predict_xgb += np.expm1(y_predict) / n_fits * shrink_factor

    print('Mean value of the loss: %.2f\n'
          '    Relative standard deviation: %.2f %%'
          % (np.mean(scores), np.std(scores) / np.mean(scores) * 100))


On validation set: 1.363247
On validation set: 1.280497
On validation set: 1.303244
On validation set: 1.256151
On validation set: 1.456547
Mean value of the loss: 1.33
    Relative standard deviation: 5.38 %
On validation set: 1.189484
On validation set: 1.423753
On validation set: 1.294716
On validation set: 1.251612
On validation set: 1.513163
Mean value of the loss: 1.33
    Relative standard deviation: 8.83 %
On validation set: 1.290194
On validation set: 1.300718
On validation set: 1.369279
On validation set: 1.315412
On validation set: 1.374332
Mean value of the loss: 1.33
    Relative standard deviation: 2.64 %
On validation set: 1.277197
On validation set: 1.285158
On validation set: 1.416420
On validation set: 1.298852
On validation set: 1.380491
Mean value of the loss: 1.33
    Relative standard deviation: 4.22 %
On validation set: 1.301141
On validation set: 1.355764
On validation set: 1.312165
On validation set: 1.403527
On validation set: 1.266622
Mean value of the loss: 1.33
    Relative standard deviation: 3.57 %

# CatBoost

In [8]:
# Hyper-parameters of the CatBoost regressor used in the cross-validation cell
# below; the dict is unpacked straight into the estimator constructor.
cat_params = dict(
    n_estimators=75,
    max_depth=3,
    learning_rate=0.1,
)
model = catboost.CatBoostRegressor(**cat_params)

In [9]:
# Repeated K-fold cross-validation that doubles as test-time bagging.
# For every KFold seed the data is split into `n_splits` folds; the model is
# refit on each training part, scored on the held-out part with
# `deviation_metric`, and then used to predict the test set. The test
# predictions of all n_splits * len(kf_random_state) fits are averaged into
# `y_predict_cat`.
kf_random_state = [21, 42, 228, 777, 69]
n_splits = 5
n_fits = n_splits * len(kf_random_state)
# Same down-scaling factor as for the other models of the ensemble: the
# prediction is multiplied by 0.94 before it enters the blend.
shrink_factor = 0.94
# KFold yields integer positions. Indexing a label-indexed pandas Series with
# integers relies on a deprecated positional fallback, so take a plain
# positional array of the target once, up front.
y_values = np.asarray(y)
y_predict_cat = np.zeros(X_test.shape[0])

# Normalisation the author found necessary for CatBoost to work correctly:
# column labels are cast to str and both frames get their columns sorted by
# label. Both frames are modified in place.
X_feat.columns = X_feat.columns.map(str)
X_test.columns = X_test.columns.map(str)
X_feat.sort_index(axis=1, inplace=True)
X_test.sort_index(axis=1, inplace=True)

for rand_state in kf_random_state:
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=rand_state)
    scores = []  # fold losses of the current repetition only
    for train_index, val_index in kf.split(X_feat):
        X_train, X_val = X_feat.iloc[train_index], X_feat.iloc[val_index]
        y_train, y_val = y_values[train_index], y_values[val_index]

        # Score on the held-out fold; target and prediction are both passed
        # through expm1 before the metric is computed.
        model.fit(X_train, y_train, verbose=False)
        y_predict = model.predict(X_val)
        scores.append(deviation_metric(np.expm1(y_val), np.expm1(y_predict)))
        print('On validation set: %f' % scores[-1])

        # Add this fit's share of the averaged, scaled test prediction.
        y_predict = model.predict(X_test)
        y_predict_cat += np.expm1(y_predict) / n_fits * shrink_factor

    print('Mean value of the loss: %.2f\n'
          '    Relative standard deviation: %.2f %%'
          % (np.mean(scores), np.std(scores) / np.mean(scores) * 100))

On validation set: 0.459245
On validation set: 0.479835
On validation set: 0.424854
On validation set: 0.539960
On validation set: 0.468935
Mean value of the loss: 0.47
    Relative standard deviation: 7.91 %
On validation set: 0.542303
On validation set: 0.390656
On validation set: 0.516589
On validation set: 0.536542
On validation set: 0.394670
Mean value of the loss: 0.48
    Relative standard deviation: 14.43 %
On validation set: 0.379527
On validation set: 0.490641
On validation set: 0.504583
On validation set: 0.505778
On validation set: 0.491376
Mean value of the loss: 0.47
    Relative standard deviation: 10.09 %
On validation set: 0.421485
On validation set: 0.442747
On validation set: 0.544215
On validation set: 0.492108
On validation set: 0.491472
Mean value of the loss: 0.48
    Relative standard deviation: 8.97 %
On validation set: 0.492621
On validation set: 0.461586
On validation set: 0.507054
On validation set: 0.466970
On validation set: 0.476004
Mean value of the loss: 0.48
    Relative standard deviation: 3.50 %

# Финальный результат

In [10]:
# Weighted blend of the three per-model test predictions; the weights
# (0.70 / 0.15 / 0.15) sum to one. Terms are added left to right in the
# LightGBM, XGBoost, CatBoost order.
lgbm_share = y_predict_lgbm * 0.70
xgb_share = y_predict_xgb * 0.15
cat_share = y_predict_cat * 0.15
y_predict = lgbm_share + xgb_share + cat_share

# Сохранение данных

In [11]:
# Label the blended prediction with the test-set index and write it out as a
# single-column CSV named after the target.
results_final = pd.Series(
    y_predict,
    name='per_square_meter_price',
    index=X_test.index,
)
results_final.to_csv('results_final.csv')