In [1]:
IS_GPU = False
# Импорт нужных библиотек
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings("ignore")
import tensorflow as tf
import tensorflow.keras as keras
from sklearn.preprocessing import MinMaxScaler, QuantileTransformer
from sklearn.model_selection import KFold
import lightgbm as lgb
import xgboost as xgb
import time
from scipy.optimize import minimize
from neighbors import Neighborhoods

from indices import MainDataset
from dnn_utils import preprocess_floor
from metric import metrics_stat, deviation_metric
from catboost import CatBoostRegressor
from catboost import Pool

def reset_tensorflow_session():
    tf.keras.backend.clear_session()
    tf.random.set_seed(41)
    np.random.seed(41)


THRESHOLD = 0.15
NEGATIVE_WEIGHT = 1.1

In [2]:

# Категориальные данные
CATEGORICAL_FEATURES_COLUMNS = ['region', 'city', 'realty_type', 'floor', 'osm_city_nearest_name', 'street']
# Численные данные
NUM_FEATURES_COLUMNS = ['lat', 'lng', 'osm_amenity_points_in_0.001',
                        'osm_amenity_points_in_0.005', 'osm_amenity_points_in_0.0075',
                        'osm_amenity_points_in_0.01', 'osm_building_points_in_0.001',
                        'osm_building_points_in_0.005', 'osm_building_points_in_0.0075',
                        'osm_building_points_in_0.01', 'osm_catering_points_in_0.001',
                        'osm_catering_points_in_0.005', 'osm_catering_points_in_0.0075',
                        'osm_catering_points_in_0.01', 'osm_city_closest_dist',
                        'osm_city_nearest_population',
                        'osm_crossing_closest_dist', 'osm_crossing_points_in_0.001',
                        'osm_crossing_points_in_0.005', 'osm_crossing_points_in_0.0075',
                        'osm_crossing_points_in_0.01', 'osm_culture_points_in_0.001',
                        'osm_culture_points_in_0.005', 'osm_culture_points_in_0.0075',
                        'osm_culture_points_in_0.01', 'osm_finance_points_in_0.001',
                        'osm_finance_points_in_0.005', 'osm_finance_points_in_0.0075',
                        'osm_finance_points_in_0.01', 'osm_healthcare_points_in_0.005',
                        'osm_healthcare_points_in_0.0075', 'osm_healthcare_points_in_0.01',
                        'osm_historic_points_in_0.005', 'osm_historic_points_in_0.0075',
                        'osm_historic_points_in_0.01', 'osm_hotels_points_in_0.005',
                        'osm_hotels_points_in_0.0075', 'osm_hotels_points_in_0.01',
                        'osm_leisure_points_in_0.005', 'osm_leisure_points_in_0.0075',
                        'osm_leisure_points_in_0.01', 'osm_offices_points_in_0.001',
                        'osm_offices_points_in_0.005', 'osm_offices_points_in_0.0075',
                        'osm_offices_points_in_0.01', 'osm_shops_points_in_0.001',
                        'osm_shops_points_in_0.005', 'osm_shops_points_in_0.0075',
                        'osm_shops_points_in_0.01', 'osm_subway_closest_dist',
                        'osm_train_stop_closest_dist', 'osm_train_stop_points_in_0.005',
                        'osm_train_stop_points_in_0.0075', 'osm_train_stop_points_in_0.01',
                        'osm_transport_stop_closest_dist', 'osm_transport_stop_points_in_0.005',
                        'osm_transport_stop_points_in_0.0075',
                        'osm_transport_stop_points_in_0.01',
                        'reform_count_of_houses_1000', 'reform_count_of_houses_500',
                        'reform_house_population_1000', 'reform_house_population_500',
                        'reform_mean_floor_count_1000', 'reform_mean_floor_count_500',
                        'reform_mean_year_building_1000', 'reform_mean_year_building_500', 'total_square',
                        "neighbor_dist", "neighbor_total_price", "neighbor_square_price", "neighbor10_dist",
                        "has_basement", "floor_count"

                        ]
# Таргет
TARGET_COLUMNS = ['per_square_meter_price']

In [3]:
train = pd.read_csv('dataset/train.csv')
test = pd.read_csv('dataset/test.csv')
train = train[train.price_type == 1].reset_index(drop=True)
train['is_train'] = 1
test['is_train'] = 0
dataset = pd.concat([train, test]).reset_index(drop=True)


In [4]:
train_dataset_index = MainDataset("dataset/train.csv")
test_dataset_index = MainDataset("dataset/test.csv", need_index=False)
neighborhoods = Neighborhoods(train_dataset_index.index)


In [5]:
dataset["neighbor_dist"] = -999
dataset["neighbor_total_price"] = -999
dataset["neighbor_square_price"] = -999
dataset["neighbor10_dist"] = -999
for d in [test_dataset_index, train_dataset_index]:
    for i, o in enumerate(d.all_objects):
        if o.row["price_type"] != 1:
            continue
        neighbor = neighborhoods.get_haversine_closest(o, 12)
        neighbor1 = neighborhoods.get_haversine_closest(o, 2)
        n = neighbor[0]
        dataset.loc[dataset["id"] == o.row["id"], "neighbor_dist"] = n[1]
        dataset.loc[dataset["id"] == o.row["id"], "neighbor_total_price"] = n[0].row["per_square_meter_price"] * \
                                                                            n[0].row["total_square"]
        dataset.loc[dataset["id"] == o.row["id"], "neighbor_square_price"] = n[0].row["per_square_meter_price"]
        dataset.loc[dataset["id"] == o.row["id"], "neighbor10_dist"] = neighbor[10][1]


In [6]:

dataset=preprocess_floor.preprocess(dataset)



In [7]:
def encode_categorical_features(df, categorical_columns):
    for column in categorical_columns:
        dict_encoding = {key: val for val, key in enumerate(df[column].unique())}
        df[column] = df[column].map(dict_encoding)
    return df

In [8]:

# Квантильное преобразование данных
def get_quantile_transform(_df, columns_for_quantilization, random_state=41, n_quantiles=100,
                           output_distribution='normal'):
    df = _df.copy()
    for col in columns_for_quantilization:
        qt = QuantileTransformer(random_state=random_state, n_quantiles=n_quantiles,
                                 output_distribution=output_distribution)
        df[col] = qt.fit_transform(df[[col]])
    return df

In [9]:

# МинМакс преобразование данных
def get_minmax_transform(_df, columns_for_quantilization, min_value=-1, max_value=1):
    df = _df.copy()
    for col in columns_for_quantilization:
        scaler = MinMaxScaler(feature_range=(min_value, max_value))
        df[col] = scaler.fit_transform(df[[col]])
    return df

In [10]:
# Hotencoding для категориальных фичей
data = encode_categorical_features(dataset, CATEGORICAL_FEATURES_COLUMNS)
# Нормализация численных данных
data = get_quantile_transform(data, NUM_FEATURES_COLUMNS)
data = get_minmax_transform(data, NUM_FEATURES_COLUMNS)
# Заполняем NaN значения
data = data.fillna(data.mean())
train = data[data.is_train == 1].reset_index(drop=True)
test = data[data.is_train == 0].reset_index(drop=True)
train = train.drop(columns=['is_train'])
test = test.drop(columns=['is_train'])

In [11]:
def get_standart_split(data, n_splits=5, seed=41):
    kf = KFold(n_splits=n_splits, random_state=seed, shuffle=True)
    split_list = []
    for train_index, test_index in kf.split(data):
        split_list += [(train_index, test_index)]
    return split_list

In [17]:
def get_columns_order(columns):
    columns_order = sorted([x for x in columns if not x in (CATEGORICAL_FEATURES_COLUMNS + TARGET_COLUMNS)])
    return columns_order + CATEGORICAL_FEATURES_COLUMNS + TARGET_COLUMNS

features_columns_order = get_columns_order(train.columns.values.tolist())

split_list = get_standart_split(train, n_splits=3)


In [18]:

# Кастомная метрика для xgboost
def xbg_error(preds, dtrain):
    labels = dtrain.get_label()
    err = deviation_metric(np.exp(labels), np.exp(preds)/1.1)
    return 'deviation_error', err


def train_xgb(train, valid, num_features, categorical_features, target_train, target_valid, EPOCHS, params):
    dtest = xgb.DMatrix(test[num_features + categorical_features])
    y_valid = np.zeros(len(valid))

    dtrain = xgb.DMatrix(train[num_features + categorical_features], np.log(target_train), 
                        )
    dvalid = xgb.DMatrix(valid[num_features + categorical_features], np.log(target_valid), 
                        )
    model = xgb.train(
        params,
        dtrain,
        EPOCHS,
        [(dvalid, "valid")],
        verbose_eval=250,
        early_stopping_rounds=500,
        feval=xbg_error,
    )
    y_valid = model.predict(dvalid)

    return model, y_valid


start_train_model_time = time.time()

xgboost_seed = 41
xgboost_params = {
    "subsample": 0.70,
    "colsample_bytree": 0.50,
    "max_depth": 7,
    "learning_rate": 0.012,
    "objective": "reg:squarederror",
    'disable_default_eval_metric': 1,
    "nthread": -1,
    "max_bin": 128,
    'min_child_weight': 0.0,
    'reg_lambda': 0.0,
    'reg_alpha': 0.0,
    'seed': xgboost_seed,
}

# Количество эпох обучения
EPOCHS = 10000
scores = []
xgb_predicts = np.zeros(len(train))

xgb_models = []
for fold_num, (train_indexes, valid_indexes) in enumerate(split_list):
    start_time = time.time()
    print(f"Фолд: {fold_num}")

    train_sub_df = train[features_columns_order].loc[train_indexes].reset_index(drop=True)
    valid_sub_df = train[features_columns_order].loc[valid_indexes].reset_index(drop=True)

    print(f"Размер трейна = {train_sub_df.shape} Размер валидации = {valid_sub_df.shape}")
    # Обучаем Xgboost и делаем предикт на валидационной выборке
    model, predict_validation = train_xgb(
        train_sub_df,
        valid_sub_df,
        NUM_FEATURES_COLUMNS,
        CATEGORICAL_FEATURES_COLUMNS,
        train_sub_df[TARGET_COLUMNS[0]].values,
        valid_sub_df[TARGET_COLUMNS[0]].values,
        EPOCHS,
        xgboost_params)

    xgb_models += [model]
    predict_on_validation = model.predict(
        xgb.DMatrix(valid_sub_df[NUM_FEATURES_COLUMNS + CATEGORICAL_FEATURES_COLUMNS]))
    xgb_predicts[valid_indexes] = np.exp(predict_on_validation)
    targets_for_validation = valid_sub_df[TARGET_COLUMNS].values[:, 0]
    current_score = deviation_metric(targets_for_validation, predict_on_validation)
    scores += [current_score]
    print(
        f"Скор для фолда({fold_num}) : {np.round(current_score, 4)} средний скор на префиксе = {np.round(np.mean(scores), 4)} это заняло = {int(time.time() - start_time)} сек.")
print(f"Процесс обучения модели занял = {int(time.time() - start_train_model_time)} секунд")






Фолд: 0
Размер трейна = (2995, 83) Размер валидации = (1498, 83)
[0]	valid-deviation_error:9.00000
[250]	valid-deviation_error:4.31510
[500]	valid-deviation_error:1.13611
[750]	valid-deviation_error:1.12503
[1000]	valid-deviation_error:1.11958
[1250]	valid-deviation_error:1.12260
[1500]	valid-deviation_error:1.12366
[1579]	valid-deviation_error:1.12409
Скор для фолда(0) : 9.0 средний скор на префиксе = 9.0 это заняло = 21 сек.
Фолд: 1
Размер трейна = (2995, 83) Размер валидации = (1498, 83)
[0]	valid-deviation_error:9.00000
[250]	valid-deviation_error:4.12835
[500]	valid-deviation_error:1.05043
[750]	valid-deviation_error:1.03665
[1000]	valid-deviation_error:1.03147
[1250]	valid-deviation_error:1.03268
[1500]	valid-deviation_error:1.03312
[1558]	valid-deviation_error:1.03375
Скор для фолда(1) : 9.0 средний скор на префиксе = 9.0 это заняло = 21 сек.
Фолд: 2
Размер трейна = (2996, 83) Размер валидации = (1497, 83)
[0]	valid-deviation_error:9.00000
[250]	valid-deviation_error:4.49565
[50

In [19]:
# Предикт xgb на test
def get_xgb_predict(models, test):
    result = np.zeros(len(test))
    for model in models:
        predict = model.predict(xgb.DMatrix(test[NUM_FEATURES_COLUMNS + CATEGORICAL_FEATURES_COLUMNS]))
        result += predict / len(models)
    return result

test_xgb_predict = get_xgb_predict(xgb_models, test)

test_xgb_predict=np.exp(test_xgb_predict)

test_xgb_predict.min(), test_xgb_predict.max(), test_xgb_predict.mean()



(17877.038478399743, 1141842.7613402253, 61048.72920941093)

In [20]:
class CatBoostEvalMetricPearson(object):
    def get_final_error(self, error, weight):
        return error

    def is_max_optimal(self):
        return False

    def evaluate(self, approxes, target, weight):
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])
        preds = np.array(approxes[0])
        target = np.array(target)
        err = deviation_metric(np.exp(target), np.exp(preds)/1.1)
        return err, 0


def train_cat(train, valid, num_features, categorical_features, target_train, target_valid, EPOCHS):

    test_data = Pool(data=test[num_features + categorical_features],
                  cat_features=categorical_features)


    train_data = Pool(data=train[num_features + categorical_features],
                      cat_features=categorical_features,
                      label=np.log(target_train))

    val_data = Pool(data=valid[num_features + categorical_features],
                cat_features=categorical_features,
                  label=np.log(target_valid))

    cat_model = CatBoostRegressor(
        learning_rate=0.012,
        iterations=15000,
        metric_period=50,
        eval_metric=CatBoostEvalMetricPearson(),
    )
    cat_model.fit(train_data, eval_set=val_data, use_best_model=True, early_stopping_rounds=300)
  
    y_valid = cat_model.predict(test_data)

    return cat_model, y_valid


start_train_model_time = time.time()

scores = []
cat_predicts = np.zeros(len(train))

cat_models = []
for fold_num, (train_indexes, valid_indexes) in enumerate(split_list):
    start_time = time.time()
    print(f"Фолд: {fold_num}")

    train_sub_df = train[features_columns_order].loc[train_indexes].reset_index(drop=True)
    valid_sub_df = train[features_columns_order].loc[valid_indexes].reset_index(drop=True)

    print(f"Размер трейна = {train_sub_df.shape} Размер валидации = {valid_sub_df.shape}")
    model, predict_validation = train_cat(
        train_sub_df,
        valid_sub_df,
        NUM_FEATURES_COLUMNS,
        CATEGORICAL_FEATURES_COLUMNS,
        train_sub_df[TARGET_COLUMNS[0]].values,
        valid_sub_df[TARGET_COLUMNS[0]].values,
        EPOCHS
        )

    cat_models += [model]
    predict_on_validation = model.predict(valid_sub_df[NUM_FEATURES_COLUMNS + CATEGORICAL_FEATURES_COLUMNS])
    cat_predicts[valid_indexes] = np.exp(predict_on_validation)
    targets_for_validation = valid_sub_df[TARGET_COLUMNS].values[:, 0]
    current_score = deviation_metric(targets_for_validation, predict_on_validation)
    scores += [current_score]
    print(
        f"Скор для фолда({fold_num}) : {np.round(current_score, 4)} средний скор на префиксе = {np.round(np.mean(scores), 4)} это заняло = {int(time.time() - start_time)} сек.")
print(f"Процесс обучения модели занял = {int(time.time() - start_train_model_time)} секунд")



Фолд: 0
Размер трейна = (2995, 83) Размер валидации = (1498, 83)
0:	learn: 3.1788728	test: 3.2299436	best: 3.2299436 (0)	total: 80.7ms	remaining: 20m 11s




50:	learn: 2.4586453	test: 2.5373713	best: 2.5373713 (50)	total: 765ms	remaining: 3m 44s
100:	learn: 1.9858759	test: 2.0904839	best: 2.0904839 (100)	total: 1.43s	remaining: 3m 30s
150:	learn: 1.7353200	test: 1.8677523	best: 1.8677523 (150)	total: 2.1s	remaining: 3m 26s
200:	learn: 1.5726791	test: 1.7175637	best: 1.7175637 (200)	total: 2.78s	remaining: 3m 24s
250:	learn: 1.4558507	test: 1.6287695	best: 1.6287695 (250)	total: 3.42s	remaining: 3m 21s
300:	learn: 1.3686715	test: 1.5693457	best: 1.5693457 (300)	total: 4.07s	remaining: 3m 18s
350:	learn: 1.3028162	test: 1.5201147	best: 1.5201147 (350)	total: 4.71s	remaining: 3m 16s
400:	learn: 1.2556862	test: 1.4826212	best: 1.4826212 (400)	total: 5.37s	remaining: 3m 15s
450:	learn: 1.2118792	test: 1.4530336	best: 1.4530336 (450)	total: 6s	remaining: 3m 13s
500:	learn: 1.1751995	test: 1.4297742	best: 1.4297742 (500)	total: 6.68s	remaining: 3m 13s
550:	learn: 1.1418332	test: 1.4085939	best: 1.4085939 (550)	total: 7.34s	remaining: 3m 12s
600:	

4500:	learn: 0.2175249	test: 1.1635898	best: 1.1633934 (4440)	total: 58.7s	remaining: 2m 16s
4550:	learn: 0.2137547	test: 1.1626887	best: 1.1626887 (4550)	total: 59.4s	remaining: 2m 16s
4600:	learn: 0.2102575	test: 1.1625418	best: 1.1622822 (4588)	total: 1m	remaining: 2m 15s
4650:	learn: 0.2055827	test: 1.1614897	best: 1.1614678 (4649)	total: 1m	remaining: 2m 14s
4700:	learn: 0.2018025	test: 1.1611392	best: 1.1611077 (4699)	total: 1m 1s	remaining: 2m 14s
4750:	learn: 0.1977519	test: 1.1606988	best: 1.1604020 (4733)	total: 1m 1s	remaining: 2m 13s
4800:	learn: 0.1941167	test: 1.1602063	best: 1.1601697 (4794)	total: 1m 2s	remaining: 2m 12s
4850:	learn: 0.1906001	test: 1.1593090	best: 1.1593090 (4850)	total: 1m 3s	remaining: 2m 12s
4900:	learn: 0.1874413	test: 1.1584158	best: 1.1582788 (4898)	total: 1m 3s	remaining: 2m 11s
4950:	learn: 0.1834351	test: 1.1580445	best: 1.1578636 (4946)	total: 1m 4s	remaining: 2m 11s
5000:	learn: 0.1796278	test: 1.1573997	best: 1.1573997 (5000)	total: 1m 5s	r



50:	learn: 2.4373885	test: 2.5466981	best: 2.5466981 (50)	total: 682ms	remaining: 3m 19s
100:	learn: 1.9953267	test: 2.0745179	best: 2.0745179 (100)	total: 1.32s	remaining: 3m 14s
150:	learn: 1.7623639	test: 1.8208148	best: 1.8208148 (150)	total: 1.96s	remaining: 3m 12s
200:	learn: 1.6106865	test: 1.6601155	best: 1.6601155 (200)	total: 2.59s	remaining: 3m 10s
250:	learn: 1.4919654	test: 1.5502867	best: 1.5502867 (250)	total: 3.23s	remaining: 3m 9s
300:	learn: 1.4049933	test: 1.4729815	best: 1.4729815 (300)	total: 3.87s	remaining: 3m 8s
350:	learn: 1.3368211	test: 1.4244973	best: 1.4244973 (350)	total: 4.5s	remaining: 3m 8s
400:	learn: 1.2842676	test: 1.3876845	best: 1.3876845 (400)	total: 5.15s	remaining: 3m 7s
450:	learn: 1.2433444	test: 1.3639538	best: 1.3639538 (450)	total: 5.79s	remaining: 3m 6s
500:	learn: 1.2066292	test: 1.3438028	best: 1.3438028 (500)	total: 6.42s	remaining: 3m 5s
550:	learn: 1.1748859	test: 1.3264371	best: 1.3264371 (550)	total: 7.06s	remaining: 3m 5s
600:	lear

4500:	learn: 0.2203027	test: 1.1163860	best: 1.1160604 (4409)	total: 59s	remaining: 2m 17s
4550:	learn: 0.2160005	test: 1.1165278	best: 1.1160604 (4409)	total: 59.7s	remaining: 2m 17s
4600:	learn: 0.2113114	test: 1.1163429	best: 1.1160604 (4409)	total: 1m	remaining: 2m 16s
4650:	learn: 0.2066991	test: 1.1149513	best: 1.1149260 (4649)	total: 1m	remaining: 2m 15s
4700:	learn: 0.2025820	test: 1.1130580	best: 1.1130580 (4700)	total: 1m 1s	remaining: 2m 15s
4750:	learn: 0.1986770	test: 1.1121773	best: 1.1121080 (4748)	total: 1m 2s	remaining: 2m 14s
4800:	learn: 0.1946025	test: 1.1114815	best: 1.1114667 (4798)	total: 1m 3s	remaining: 2m 13s
4850:	learn: 0.1908542	test: 1.1112582	best: 1.1112240 (4849)	total: 1m 3s	remaining: 2m 13s
4900:	learn: 0.1864800	test: 1.1109879	best: 1.1107159 (4898)	total: 1m 4s	remaining: 2m 12s
4950:	learn: 0.1828110	test: 1.1109066	best: 1.1107016 (4926)	total: 1m 5s	remaining: 2m 12s
5000:	learn: 0.1792435	test: 1.1101865	best: 1.1101497 (4995)	total: 1m 5s	rem



50:	learn: 2.4982284	test: 2.3208120	best: 2.3208120 (50)	total: 689ms	remaining: 3m 21s
100:	learn: 2.0282373	test: 1.9404435	best: 1.9404435 (100)	total: 1.35s	remaining: 3m 19s
150:	learn: 1.7622193	test: 1.7482683	best: 1.7482683 (150)	total: 2.02s	remaining: 3m 18s
200:	learn: 1.5841600	test: 1.6456248	best: 1.6456248 (200)	total: 2.68s	remaining: 3m 17s
250:	learn: 1.4625266	test: 1.5783445	best: 1.5783445 (250)	total: 3.35s	remaining: 3m 16s
300:	learn: 1.3770252	test: 1.5318035	best: 1.5318035 (300)	total: 4.02s	remaining: 3m 16s
350:	learn: 1.3129928	test: 1.4979953	best: 1.4979953 (350)	total: 4.68s	remaining: 3m 15s
400:	learn: 1.2660452	test: 1.4748742	best: 1.4748742 (400)	total: 5.33s	remaining: 3m 14s
450:	learn: 1.2295537	test: 1.4538283	best: 1.4538283 (450)	total: 5.99s	remaining: 3m 13s
500:	learn: 1.1963699	test: 1.4366288	best: 1.4366288 (500)	total: 6.63s	remaining: 3m 11s
550:	learn: 1.1722182	test: 1.4241552	best: 1.4241552 (550)	total: 7.28s	remaining: 3m 10s
6

4500:	learn: 0.2477482	test: 1.1827267	best: 1.1827267 (4500)	total: 59.4s	remaining: 2m 18s
4550:	learn: 0.2432561	test: 1.1821587	best: 1.1819057 (4543)	total: 1m	remaining: 2m 17s
4600:	learn: 0.2380471	test: 1.1812658	best: 1.1812658 (4600)	total: 1m	remaining: 2m 17s
4650:	learn: 0.2336450	test: 1.1802623	best: 1.1801701 (4619)	total: 1m 1s	remaining: 2m 16s
4700:	learn: 0.2286050	test: 1.1790669	best: 1.1790669 (4700)	total: 1m 2s	remaining: 2m 16s
4750:	learn: 0.2243632	test: 1.1776094	best: 1.1774876 (4741)	total: 1m 2s	remaining: 2m 15s
4800:	learn: 0.2195458	test: 1.1769279	best: 1.1765289 (4794)	total: 1m 3s	remaining: 2m 14s
4850:	learn: 0.2150924	test: 1.1764850	best: 1.1763860 (4838)	total: 1m 4s	remaining: 2m 13s
4900:	learn: 0.2109321	test: 1.1755024	best: 1.1755024 (4900)	total: 1m 4s	remaining: 2m 13s
4950:	learn: 0.2068641	test: 1.1749373	best: 1.1749373 (4950)	total: 1m 5s	remaining: 2m 12s
5000:	learn: 0.2027476	test: 1.1750635	best: 1.1744842 (4990)	total: 1m 5s	r

(17827.272672948777, 1145496.2816781513, 61357.92834358371)

In [24]:
def train_cat(train, valid, num_features, categorical_features, target_train, target_valid, EPOCHS):

    test_data = Pool(data=test[num_features + categorical_features],
                  cat_features=categorical_features)


    train_data = Pool(data=train[num_features + categorical_features],
                      cat_features=categorical_features,
                      label=np.log(target_train))

    val_data = Pool(data=valid[num_features + categorical_features],
                cat_features=categorical_features,
                  label=np.log(target_valid))

    cat_model = CatBoostRegressor(
        l2_leaf_reg=6,
        bagging_temperature=1.3,
        random_strength=1.2,
        learning_rate=0.02,
        iterations=15000,
        metric_period=50,
#         task_type="GPU",
        eval_metric=CatBoostEvalMetricPearson(),
    )
    cat_model.fit(train_data, eval_set=val_data, use_best_model=True, early_stopping_rounds=300)
  
    y_valid = cat_model.predict(test_data)

    return cat_model, y_valid


start_train_model_time = time.time()

scores = []
cat_predicts2 = np.zeros(len(train))

cat_models2 = []
for fold_num, (train_indexes, valid_indexes) in enumerate(split_list):
    start_time = time.time()
    print(f"Фолд: {fold_num}")

    train_sub_df = train[features_columns_order].loc[train_indexes].reset_index(drop=True)
    valid_sub_df = train[features_columns_order].loc[valid_indexes].reset_index(drop=True)

    print(f"Размер трейна = {train_sub_df.shape} Размер валидации = {valid_sub_df.shape}")
    model, predict_validation = train_cat(
        train_sub_df,
        valid_sub_df,
        NUM_FEATURES_COLUMNS,
        CATEGORICAL_FEATURES_COLUMNS,
        train_sub_df[TARGET_COLUMNS[0]].values,
        valid_sub_df[TARGET_COLUMNS[0]].values,
        EPOCHS
        )

    cat_models2 += [model]
    predict_on_validation = model.predict(valid_sub_df[NUM_FEATURES_COLUMNS + CATEGORICAL_FEATURES_COLUMNS])
    cat_predicts2[valid_indexes] = np.exp(predict_on_validation)
    targets_for_validation = valid_sub_df[TARGET_COLUMNS].values[:, 0]
    current_score = deviation_metric(targets_for_validation, predict_on_validation)
    scores += [current_score]
    print(
        f"Скор для фолда({fold_num}) : {np.round(current_score, 4)} средний скор на префиксе = {np.round(np.mean(scores), 4)} это заняло = {int(time.time() - start_time)} сек.")
print(f"Процесс обучения модели занял = {int(time.time() - start_train_model_time)} секунд")


# Предикт xgb на test
def get_cat_predict(models, test):
    result = np.zeros(len(test))
    for model in cat_models:
        predict = model.predict(test[NUM_FEATURES_COLUMNS + CATEGORICAL_FEATURES_COLUMNS])
        result += np.exp(predict) / len(models)
    return result


test_cat_predict2 = get_cat_predict(xgb_models, test)

test_cat_predict2.min(), test_cat_predict2.max(), test_cat_predict2.mean()

Фолд: 0
Размер трейна = (2995, 83) Размер валидации = (1498, 83)
0:	learn: 3.1699973	test: 3.2224005	best: 3.2224005 (0)	total: 33.8ms	remaining: 8m 26s




50:	learn: 2.1493047	test: 2.2197046	best: 2.2197046 (50)	total: 703ms	remaining: 3m 25s
100:	learn: 1.7222655	test: 1.8397412	best: 1.8397412 (100)	total: 1.4s	remaining: 3m 26s
150:	learn: 1.5070406	test: 1.6559591	best: 1.6559591 (150)	total: 2.08s	remaining: 3m 24s
200:	learn: 1.3760509	test: 1.5486875	best: 1.5486875 (200)	total: 2.79s	remaining: 3m 25s
250:	learn: 1.2911390	test: 1.4861908	best: 1.4861908 (250)	total: 3.48s	remaining: 3m 24s
300:	learn: 1.2269009	test: 1.4470546	best: 1.4470546 (300)	total: 4.2s	remaining: 3m 25s
350:	learn: 1.1741302	test: 1.4138724	best: 1.4138724 (350)	total: 4.89s	remaining: 3m 24s
400:	learn: 1.1299573	test: 1.3943368	best: 1.3943368 (400)	total: 5.59s	remaining: 3m 23s
450:	learn: 1.0892769	test: 1.3751348	best: 1.3751348 (450)	total: 6.29s	remaining: 3m 22s
500:	learn: 1.0476730	test: 1.3535539	best: 1.3535539 (500)	total: 6.96s	remaining: 3m 21s
550:	learn: 1.0113122	test: 1.3386054	best: 1.3386054 (550)	total: 7.63s	remaining: 3m 20s
600

4500:	learn: 0.1343165	test: 1.1335575	best: 1.1325471 (4428)	total: 58.9s	remaining: 2m 17s
4550:	learn: 0.1307765	test: 1.1324430	best: 1.1323949 (4548)	total: 59.6s	remaining: 2m 16s
4600:	learn: 0.1274110	test: 1.1314402	best: 1.1313513 (4597)	total: 1m	remaining: 2m 16s
4650:	learn: 0.1243476	test: 1.1306149	best: 1.1305985 (4638)	total: 1m	remaining: 2m 15s
4700:	learn: 0.1213155	test: 1.1302923	best: 1.1299895 (4671)	total: 1m 1s	remaining: 2m 14s
4750:	learn: 0.1192130	test: 1.1299790	best: 1.1299237 (4748)	total: 1m 2s	remaining: 2m 14s
4800:	learn: 0.1164331	test: 1.1297720	best: 1.1294476 (4782)	total: 1m 2s	remaining: 2m 13s
4850:	learn: 0.1141530	test: 1.1296691	best: 1.1294476 (4782)	total: 1m 3s	remaining: 2m 12s
4900:	learn: 0.1115753	test: 1.1295888	best: 1.1293763 (4860)	total: 1m 4s	remaining: 2m 12s
4950:	learn: 0.1094976	test: 1.1293783	best: 1.1293637 (4947)	total: 1m 4s	remaining: 2m 11s
5000:	learn: 0.1070434	test: 1.1294287	best: 1.1288543 (4956)	total: 1m 5s	r



50:	learn: 2.1658576	test: 2.2492601	best: 2.2492601 (50)	total: 659ms	remaining: 3m 13s
100:	learn: 1.7733005	test: 1.8061561	best: 1.8061561 (100)	total: 1.3s	remaining: 3m 11s
150:	learn: 1.5655805	test: 1.5933626	best: 1.5933626 (150)	total: 1.94s	remaining: 3m 10s
200:	learn: 1.4261815	test: 1.4726516	best: 1.4726516 (200)	total: 2.58s	remaining: 3m 9s
250:	learn: 1.3343742	test: 1.4038153	best: 1.4038153 (250)	total: 3.22s	remaining: 3m 9s
300:	learn: 1.2702160	test: 1.3683569	best: 1.3683569 (300)	total: 3.85s	remaining: 3m 8s
350:	learn: 1.2162285	test: 1.3413645	best: 1.3413645 (350)	total: 4.5s	remaining: 3m 7s
400:	learn: 1.1729544	test: 1.3189627	best: 1.3189627 (400)	total: 5.13s	remaining: 3m 6s
450:	learn: 1.1407348	test: 1.3066023	best: 1.3063865 (445)	total: 5.77s	remaining: 3m 6s
500:	learn: 1.1065721	test: 1.2895331	best: 1.2895331 (500)	total: 6.4s	remaining: 3m 5s
550:	learn: 1.0797977	test: 1.2790951	best: 1.2789850 (548)	total: 7.04s	remaining: 3m 4s
600:	learn: 



50:	learn: 2.1885329	test: 2.0566066	best: 2.0566066 (50)	total: 740ms	remaining: 3m 36s
100:	learn: 1.7506617	test: 1.7398082	best: 1.7398082 (100)	total: 1.38s	remaining: 3m 24s
150:	learn: 1.5145579	test: 1.6053755	best: 1.6053755 (150)	total: 2.03s	remaining: 3m 19s
200:	learn: 1.3843400	test: 1.5278529	best: 1.5278529 (200)	total: 2.68s	remaining: 3m 17s
250:	learn: 1.2943436	test: 1.4753033	best: 1.4752120 (249)	total: 3.33s	remaining: 3m 15s
300:	learn: 1.2380168	test: 1.4446155	best: 1.4446155 (300)	total: 3.97s	remaining: 3m 13s
350:	learn: 1.1899685	test: 1.4160192	best: 1.4160192 (350)	total: 4.62s	remaining: 3m 12s
400:	learn: 1.1459875	test: 1.3895075	best: 1.3895075 (400)	total: 5.26s	remaining: 3m 11s
450:	learn: 1.1089827	test: 1.3708167	best: 1.3706543 (449)	total: 5.89s	remaining: 3m 10s
500:	learn: 1.0809051	test: 1.3587639	best: 1.3587639 (500)	total: 6.53s	remaining: 3m 8s
550:	learn: 1.0567955	test: 1.3483956	best: 1.3483956 (550)	total: 7.18s	remaining: 3m 8s
600

4500:	learn: 0.1553300	test: 1.1499237	best: 1.1483975 (4265)	total: 58.4s	remaining: 2m 16s
4550:	learn: 0.1506053	test: 1.1506968	best: 1.1483975 (4265)	total: 59.1s	remaining: 2m 15s
Stopped by overfitting detector  (300 iterations wait)

bestTest = 1.148397504
bestIteration = 4265

Shrink model to first 4266 iterations.
Скор для фолда(2) : 9.0 средний скор на префиксе = 9.0 это заняло = 59 сек.
Процесс обучения модели занял = 184 секунд


(17827.272672948777, 1145496.2816781513, 61357.92834358371)

In [25]:
def minimize_arit(W):
    ypred =  W[0] * xgb_predicts + W[1] * cat_predicts
    return deviation_metric(train_targets, ypred)


W = minimize(minimize_arit, [1.0 / 2] * 2, options={'gtol': 1e-6, 'disp': True}).x
# 1.006250
# array([0.55692824, 0.34630855])


NameError: name 'train_targets' is not defined

In [86]:

test_submission = pd.read_csv('dataset/test_submission.csv')
test_submission['per_square_meter_price'] = test_xgb_predict * W[0] + test_cat_predict * W[1]
test_submission['per_square_meter_price'] = test_submission['per_square_meter_price'].apply(lambda x: max(1000.0, x))
test_submission.to_csv('submission.csv', index = False)

In [82]:
for city_id in [0,4,2,22]:
    xgb_predicts_spb = xgb_predicts[train.city == city_id]
    cat_predicts_spb = cat_predicts[train.city == city_id]
    nn_predicts_spb = nn_predicts[train.city == city_id]
    lgb_predicts_spb = lgb_predicts[train.city == city_id]
    train_targets_spb = train_targets[train.city == city_id]

    test_nn_predict_spb = test_nn_predict[test.city == city_id]
    test_lgb_predict_spb = test_lgb_predict[test.city == city_id]
    test_xgb_predict_spb = test_xgb_predict[test.city == city_id]
    test_cat_predict_spb = test_cat_predict[test.city == city_id]

    def minimize_arit_spb(W):
        ypred = W[0] * nn_predicts_spb + W[1] * lgb_predicts_spb + W[2] * xgb_predicts_spb + W[3] * cat_predicts_spb
        return deviation_metric(train_targets_spb, ypred)


    W_spb = minimize(minimize_arit_spb, [1.0 / 4] * 4, options={'gtol': 1e-6, 'disp': True}).x
    print(W_spb)
    test_submission.loc[test.city == city_id, 'per_square_meter_price'] = test_nn_predict_spb * W_spb[0] + test_lgb_predict_spb * W_spb[1] + test_xgb_predict_spb * W_spb[2] + test_cat_predict_spb * W_spb[3]


Optimization terminated successfully.
         Current function value: 1.037561
         Iterations: 16
         Function evaluations: 100
         Gradient evaluations: 20
[-0.01351148 -0.26195604  1.05303676  0.13243172]
Optimization terminated successfully.
         Current function value: 1.601880
         Iterations: 17
         Function evaluations: 115
         Gradient evaluations: 23
[-0.12379493  0.15593163  0.94058522 -0.10292575]
Optimization terminated successfully.
         Current function value: 1.465565
         Iterations: 12
         Function evaluations: 80
         Gradient evaluations: 16
[0.29737273 0.27367064 0.31115634 0.02938499]
Optimization terminated successfully.
         Current function value: 0.801832
         Iterations: 13
         Function evaluations: 80
         Gradient evaluations: 16
[0.05656501 0.15657642 0.6925764  0.02681123]


In [83]:
test_submission.to_csv('submission_cities.csv', index = False)