In [1]:
IS_GPU = True
# Импорт нужных библиотек
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings("ignore")
import tensorflow as tf
import tensorflow.keras as keras
from sklearn.preprocessing import MinMaxScaler, QuantileTransformer
from sklearn.model_selection import KFold
import lightgbm as lgb
import xgboost as xgb
import time
from scipy.optimize import minimize
from neighbors import Neighborhoods

from indices import MainDataset
from dnn_utils import preprocess_floor
from metric import metrics_stat, deviation_metric

def reset_tensorflow_session():
    tf.keras.backend.clear_session()
    tf.random.set_seed(41)
    np.random.seed(41)


THRESHOLD = 0.15
NEGATIVE_WEIGHT = 1.1

In [2]:

# Категориальные данные
CATEGORICAL_FEATURES_COLUMNS = ['region', 'city', 'realty_type', 'floor', 'osm_city_nearest_name', 'street']
# Численные данные
NUM_FEATURES_COLUMNS = ['lat', 'lng', 'osm_amenity_points_in_0.001',
                        'osm_amenity_points_in_0.005', 'osm_amenity_points_in_0.0075',
                        'osm_amenity_points_in_0.01', 'osm_building_points_in_0.001',
                        'osm_building_points_in_0.005', 'osm_building_points_in_0.0075',
                        'osm_building_points_in_0.01', 'osm_catering_points_in_0.001',
                        'osm_catering_points_in_0.005', 'osm_catering_points_in_0.0075',
                        'osm_catering_points_in_0.01', 'osm_city_closest_dist',
                        'osm_city_nearest_population',
                        'osm_crossing_closest_dist', 'osm_crossing_points_in_0.001',
                        'osm_crossing_points_in_0.005', 'osm_crossing_points_in_0.0075',
                        'osm_crossing_points_in_0.01', 'osm_culture_points_in_0.001',
                        'osm_culture_points_in_0.005', 'osm_culture_points_in_0.0075',
                        'osm_culture_points_in_0.01', 'osm_finance_points_in_0.001',
                        'osm_finance_points_in_0.005', 'osm_finance_points_in_0.0075',
                        'osm_finance_points_in_0.01', 'osm_healthcare_points_in_0.005',
                        'osm_healthcare_points_in_0.0075', 'osm_healthcare_points_in_0.01',
                        'osm_historic_points_in_0.005', 'osm_historic_points_in_0.0075',
                        'osm_historic_points_in_0.01', 'osm_hotels_points_in_0.005',
                        'osm_hotels_points_in_0.0075', 'osm_hotels_points_in_0.01',
                        'osm_leisure_points_in_0.005', 'osm_leisure_points_in_0.0075',
                        'osm_leisure_points_in_0.01', 'osm_offices_points_in_0.001',
                        'osm_offices_points_in_0.005', 'osm_offices_points_in_0.0075',
                        'osm_offices_points_in_0.01', 'osm_shops_points_in_0.001',
                        'osm_shops_points_in_0.005', 'osm_shops_points_in_0.0075',
                        'osm_shops_points_in_0.01', 'osm_subway_closest_dist',
                        'osm_train_stop_closest_dist', 'osm_train_stop_points_in_0.005',
                        'osm_train_stop_points_in_0.0075', 'osm_train_stop_points_in_0.01',
                        'osm_transport_stop_closest_dist', 'osm_transport_stop_points_in_0.005',
                        'osm_transport_stop_points_in_0.0075',
                        'osm_transport_stop_points_in_0.01',
                        'reform_count_of_houses_1000', 'reform_count_of_houses_500',
                        'reform_house_population_1000', 'reform_house_population_500',
                        'reform_mean_floor_count_1000', 'reform_mean_floor_count_500',
                        'reform_mean_year_building_1000', 'reform_mean_year_building_500', 'total_square',
                        "neighbor_dist", "neighbor_total_price", "neighbor_square_price", "neighbor10_dist",
                        "has_basement", "floor_count"

                        ]
# Таргет
TARGET_COLUMNS = ['per_square_meter_price']

In [3]:
train = pd.read_csv('dataset/train.csv')
test = pd.read_csv('dataset/test.csv')
train = train[train.price_type == 1].reset_index(drop=True)
train['is_train'] = 1
test['is_train'] = 0
dataset = pd.concat([train, test]).reset_index(drop=True)


In [4]:
train_dataset_index = MainDataset("dataset/train.csv")
test_dataset_index = MainDataset("dataset/test.csv", need_index=False)
neighborhoods = Neighborhoods(train_dataset_index.index)


In [5]:
dataset["neighbor_dist"] = -999
dataset["neighbor_total_price"] = -999
dataset["neighbor_square_price"] = -999
dataset["neighbor10_dist"] = -999

for d in [test_dataset_index, train_dataset_index]:
    for i, o in enumerate(d.all_objects):
        if o.row["price_type"] != 1:
            continue
        neighbor = neighborhoods.get_haversine_closest(o, 12)
        neighbor1 = neighborhoods.get_haversine_closest(o, 2)
        n = neighbor[0]
        dataset.loc[dataset["id"] == o.row["id"], "neighbor_dist"] = n[1]
        dataset.loc[dataset["id"] == o.row["id"], "neighbor_total_price"] = n[0].row["per_square_meter_price"] * \
                                                                            n[0].row["total_square"]
        dataset.loc[dataset["id"] == o.row["id"], "neighbor_square_price"] = n[0].row["per_square_meter_price"]
        dataset.loc[dataset["id"] == o.row["id"], "neighbor10_dist"] = neighbor[10][1]


In [6]:

dataset=preprocess_floor.preprocess(dataset)



In [7]:
dataset_copy = dataset.copy()

In [8]:
dataset.head()

Unnamed: 0,city,floor,id,lat,lng,osm_amenity_points_in_0.001,osm_amenity_points_in_0.005,osm_amenity_points_in_0.0075,osm_amenity_points_in_0.01,osm_building_points_in_0.001,...,date,realty_type,price_type,is_train,neighbor_dist,neighbor_total_price,neighbor_square_price,neighbor10_dist,has_basement,floor_count
0,Красноярск,-999,COL_62,56.063615,92.958428,0,7,14,26,0,...,2020-01-05,110,1,1,0.334024,995000.0,41458.333333,0.451369,-999,-999
1,Саратов,-999,COL_71,51.534581,46.020549,13,198,345,462,0,...,2020-01-05,10,1,1,0.086136,2985000.0,33166.666667,0.190652,-999,-999
2,Красноярск,-999,COL_140,56.026884,92.818323,3,15,23,33,0,...,2020-01-05,10,1,1,0.027117,18308000.0,61026.666667,0.291762,-999,-999
3,Иркутск,-999,COL_202,52.275528,104.251444,0,10,26,40,0,...,2020-01-05,10,1,1,0.220089,5870000.0,58700.0,0.435699,-999,-999
4,Белгород,-999,COL_207,50.576545,36.584197,4,48,73,92,0,...,2020-01-05,10,1,1,0.046677,4179000.0,59700.0,0.147191,-999,-999


In [9]:
def encode_categorical_features(df, categorical_columns):
    for column in categorical_columns:
        dict_encoding = {key: val for val, key in enumerate(df[column].unique())}
        df[column] = df[column].map(dict_encoding)
    return df

In [10]:

# Квантильное преобразование данных
def get_quantile_transform(_df, columns_for_quantilization, random_state=41, n_quantiles=100,
                           output_distribution='normal'):
    df = _df.copy()
    for col in columns_for_quantilization:
        qt = QuantileTransformer(random_state=random_state, n_quantiles=n_quantiles,
                                 output_distribution=output_distribution)
        df[col] = qt.fit_transform(df[[col]])
    return df

In [11]:

# МинМакс преобразование данных
def get_minmax_transform(_df, columns_for_quantilization, min_value=-1, max_value=1):
    df = _df.copy()
    for col in columns_for_quantilization:
        scaler = MinMaxScaler(feature_range=(min_value, max_value))
        df[col] = scaler.fit_transform(df[[col]])
    return df

In [12]:
# Hotencoding для категориальных фичей
data = encode_categorical_features(dataset, CATEGORICAL_FEATURES_COLUMNS)
# Нормализация численных данных
data = get_quantile_transform(data, NUM_FEATURES_COLUMNS)
data = get_minmax_transform(data, NUM_FEATURES_COLUMNS)
# Заполняем NaN значения
data = data.fillna(data.mean())
train = data[data.is_train == 1].reset_index(drop=True)
test = data[data.is_train == 0].reset_index(drop=True)
train = train.drop(columns=['is_train'])
test = test.drop(columns=['is_train'])

In [13]:
data.head()

Unnamed: 0,city,floor,id,lat,lng,osm_amenity_points_in_0.001,osm_amenity_points_in_0.005,osm_amenity_points_in_0.0075,osm_amenity_points_in_0.01,osm_building_points_in_0.001,...,date,realty_type,price_type,is_train,neighbor_dist,neighbor_total_price,neighbor_square_price,neighbor10_dist,has_basement,floor_count
0,0,0,COL_62,0.060226,0.223088,-1.0,-0.234768,-0.256798,-0.245381,-1.0,...,2020-01-05,0,1,1,0.29513,-0.335815,-0.100252,0.11742,-1.0,-1.0
1,1,0,COL_71,-0.284322,-0.042332,0.298058,0.285412,0.284259,0.272468,-1.0,...,2020-01-05,1,1,1,-0.125504,-0.153689,-0.153689,-0.153382,-1.0,-1.0
2,0,0,COL_140,0.03388,0.146323,0.067077,-0.131259,-0.182216,-0.206562,-1.0,...,2020-01-05,1,1,1,-0.370964,0.158787,0.014434,-0.012065,-1.0,-1.0
3,2,0,COL_202,-0.206973,0.27153,-1.0,-0.194,-0.167492,-0.178437,-1.0,...,2020-01-05,1,1,1,0.17513,-0.023753,0.001876,0.108638,-1.0,-1.0
4,3,0,COL_207,-0.335843,-0.118494,0.110487,0.077531,0.017066,-0.039228,-1.0,...,2020-01-05,1,1,1,-0.260458,-0.092197,0.007306,-0.236639,-1.0,-1.0


In [14]:
def get_standart_split(data, n_splits=5, seed=41):
    kf = KFold(n_splits=n_splits, random_state=seed, shuffle=True)
    split_list = []
    for train_index, test_index in kf.split(data):
        split_list += [(train_index, test_index)]
    return split_list

In [15]:
def get_dataset(arr_features, arr_target, arr_region, arr_city, arr_realty, batch_size):
    return tf.data.Dataset.from_tensor_slices(
        (
            {
                "model_features_input": arr_features,
                "model_region_input": arr_region,
                "model_city_input": arr_city,
                "model_realty_input": arr_realty,
            },
            {
                "model_output": arr_target,
            },
        )
    ).batch(batch_size)

In [16]:
def get_columns_order(columns):
    columns_order = sorted([x for x in columns if not x in (CATEGORICAL_FEATURES_COLUMNS + TARGET_COLUMNS)])
    return columns_order + CATEGORICAL_FEATURES_COLUMNS + TARGET_COLUMNS

In [17]:

# Коллбэк, для отслеживания целевой метрики
class CustomCallback(keras.callbacks.Callback):
    def __init__(self, val_dataset, val_targets):
        super(CustomCallback, self).__init__()
        self.val_targets = val_targets
        self.val_dataset = val_dataset

    def on_epoch_end(self, epoch, logs=None):
        predicts = self.model.predict(self.val_dataset)[:, 0]
        targets = self.val_targets[:, 0]
        if epoch % 50 == 0:
            print(f"Текущий реальный скор(валидационная часть): {np.round(deviation_metric(targets, predicts), 4)}")

In [18]:

def Dropout(x):
    return keras.layers.Dropout(x)


def Flatten():
    return keras.layers.Flatten()


def Concatenate():
    return keras.layers.Concatenate()


# Функция обучения модели
def fit(model, epochs, train_dataset, val_dataset, val_targets, verbose=1):
    if IS_GPU:
        print(f"Начинаю обучение модели (GPU) количество эпох = {epochs}")
        with tf.device('/device:GPU:0'):
            # Коллбэк для остановки, если модель перестала обучаться
            early_stopping_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=2.5e-6,
                                                                       patience=100, restore_best_weights=True,
                                                                       mode='min')
            # Коллбэк для уменьшения скорости обучения
            lr_callback = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=1e-9,
                                                               mode='min')
            # Кастомный коллбэк для отображения скора по целевой метрике
            metric_callback = CustomCallback(val_dataset, val_targets)
            history = model.fit(train_dataset, epochs=epochs, validation_data=val_dataset, verbose=verbose,
                                shuffle=True, callbacks=[early_stopping_callback, lr_callback, metric_callback],
                                workers=-1)
            return history
    else:
        print(f"Начинаю обучение модели (СPU) количество эпох = {epochs}")
        # Коллбэк для остановки, если модель перестала обучаться
        early_stopping_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=2.5e-6, patience=100,
                                                                   restore_best_weights=True, mode='min')
        # Коллбэк для уменьшения скорости обучения
        lr_callback = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=1e-9,
                                                           mode='min')
        # Кастомный коллбэк для отображения скора по целевой метрике
        metric_callback = CustomCallback(val_dataset, val_targets)
        history = model.fit(train_dataset, epochs=epochs, validation_data=val_dataset, verbose=verbose, shuffle=True,
                            callbacks=[early_stopping_callback, lr_callback, metric_callback], workers=-1)
        return history

In [19]:

# Реализация кастомной функции потерь для обучения
def tf_custom_loss(y_true, y_pred):
    threshold = 0.6
    error = tf.abs(y_true - y_pred) / y_true
    is_small_error = error <= threshold
    small_error_loss = tf.square(error / 0.15 - 1)
    big_error_loss = 9.0 * tf.ones_like(small_error_loss) + tf.abs(error)
    # big_error_loss = (3.0 * tf.ones_like(small_error_loss) + tf.abs(error)) ** 2
    return tf.where(is_small_error, small_error_loss, big_error_loss)

In [20]:

# Компиляция текущей модели
def compile_model(train_dataset, val_dataset, num_features, max_realty, max_region, max_city, lr=5e-4):
    reset_tensorflow_session()
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

    model_input_layer = tf.keras.Input(shape=(num_features), name="model_features_input")
    model_input_realty = tf.keras.Input(shape=(1), name="model_realty_input")
    model_input_region = tf.keras.Input(shape=(1), name="model_region_input")
    model_input_city = tf.keras.Input(shape=(1), name="model_city_input")

    model_embedding_layer_realty = keras.layers.Embedding(max_realty + 1, 4, input_length=1, dtype=tf.float64)(
        model_input_realty)
    model_embedding_layer_region = keras.layers.Embedding(max_region + 1, 32, input_length=1, dtype=tf.float64)(
        model_input_region)
    model_embedding_layer_city = keras.layers.Embedding(max_city + 1, 32, input_length=1, dtype=tf.float64)(
        model_input_city)

    concatenated_input_layer = Concatenate()(
        [Flatten()(model_embedding_layer_realty), Flatten()(model_embedding_layer_region),
         Flatten()(model_embedding_layer_city), Flatten()(model_input_layer)])

    layer_0 = keras.layers.Dense(128, activation="relu")(concatenated_input_layer)
    layer_1 = keras.layers.Dense(64, activation="relu")(layer_0)
    layer_2 = keras.layers.Dense(32, activation="relu")(layer_1)
    model_output_layer = keras.layers.Dense(1, activation="relu", name="model_output")(layer_2)

    cur_model = keras.Model(
        inputs=[
            model_input_layer,
            model_input_realty,
            model_input_region,
            model_input_city,
        ],
        outputs=[
            model_output_layer,
        ])

    print(f"Модель: input_shape = {cur_model.input_shape} output_shape = {cur_model.output_shape}")
#     cur_model.compile(loss=tf_custom_loss, optimizer=optimizer)  # , run_eagerly=True)
    cur_model.compile(loss=tf_custom_loss, optimizer=optimizer)  # , run_eagerly=True)

    #
    return cur_model

In [21]:
features_columns_order = get_columns_order(train.columns.values.tolist())
split_list = get_standart_split(train, n_splits=20)

start_train_model_time = time.time()
# Размер батча для Dataset
BATCH_SIZE = int(2 ** 5)
# Количество эпох обучения
EPOCHS = 1000
# Количество численных входных переменных модели
NUM_FEATURES = len(NUM_FEATURES_COLUMNS)
# Макс. значения категориалных фичей
MAX_REALTY = max(train['realty_type'].max(), test['realty_type'].max())
MAX_REGION = max(train['region'].max(), test['region'].max())
MAX_CITY = max(train['city'].max(), test['city'].max())
# Коэффициент домножения таргета, с целью быстрейшего сходимости модельки и лучшего обучения
MUL_TARGET = 5e-5

scores = []
nn_predicts = np.zeros(len(train))
models_nn = []

for fold_num, (train_indexes, valid_indexes) in enumerate(split_list):
    start_time = time.time()
    print(f"Фолд: {fold_num}")

    train_sub_df = train[features_columns_order].loc[train_indexes].reset_index(drop=True)
    valid_sub_df = train[features_columns_order].loc[valid_indexes].reset_index(drop=True)

    print(f"Размер трейна = {train_sub_df.shape} Размер валидации = {valid_sub_df.shape}")

    # Строим датасеты
    train_ds = get_dataset(
        train_sub_df[NUM_FEATURES_COLUMNS].values,
        train_sub_df[TARGET_COLUMNS].values * MUL_TARGET,
        train_sub_df[['region']].values,
        train_sub_df[['city']].values,
        train_sub_df[['realty_type']].values,
        BATCH_SIZE)
    valid_ds = get_dataset(
        valid_sub_df[NUM_FEATURES_COLUMNS].values,
        valid_sub_df[TARGET_COLUMNS].values * MUL_TARGET,
        valid_sub_df[['region']].values,
        valid_sub_df[['city']].values,
        valid_sub_df[['realty_type']].values,
        len(valid_sub_df))

    # Компилируем модель
    model = compile_model(train_ds, valid_ds, NUM_FEATURES, MAX_REALTY, MAX_REGION, MAX_CITY)
    # Обучаем модель
    fit(model, EPOCHS, train_ds, valid_ds, valid_sub_df[TARGET_COLUMNS].values * MUL_TARGET)

    predict_on_validation = model.predict(valid_ds)[:, 0] / MUL_TARGET
    nn_predicts[valid_indexes] = predict_on_validation
    targets_for_validation = valid_sub_df[TARGET_COLUMNS].values[:, 0]
    current_score = deviation_metric(targets_for_validation, predict_on_validation)
    scores += [current_score]
    models_nn += [model]
    print(
        f"Скор для фолда({fold_num}) : {np.round(current_score, 4)} средний скор на префиксе = {np.round(np.mean(scores), 4)} это заняло = {int(time.time() - start_time)} сек.")
print(f"Процесс обучения модели занял = {int(time.time() - start_train_model_time)} секунд")

Фолд: 0
Размер трейна = (4268, 83) Размер валидации = (225, 83)
Модель: input_shape = [(None, 73), (None, 1), (None, 1), (None, 1)] output_shape = (None, 1)
Начинаю обучение модели (GPU) количество эпох = 1000
Epoch 1/1000
Текущий реальный скор(валидационная часть): 1.8038
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Текущий реальный скор(валидационна

In [22]:
# Предикт нейронной сетью на test
def get_nn_predict(models, test):
    result = np.zeros(len(test))
    test_ds = get_dataset(
        test[NUM_FEATURES_COLUMNS].values,
        np.zeros(len(test)),
        test[['region']].values,
        test[['city']].values,
        test[['realty_type']].values,
        len(test))
    for model in models:
        predict = model.predict(test_ds)[:, 0]
        result += (predict / MUL_TARGET) / len(models)
    return result


test_nn_predict = get_nn_predict(models_nn, test)

test_submission = pd.read_csv('dataset/test_submission.csv')

test_submission['per_square_meter_price'] = test_nn_predict
test_submission.to_csv('nn2.csv', index=False)



In [23]:

# LightGBM кастомная метрика
def feval_deviation(y_pred, lgb_train):
    y_true = lgb_train.get_label()
    return 'deviation_error', deviation_metric(np.exp(y_true), np.exp(y_pred)), False


# Функция для обучения модели LightGBM
def train_lgb(train, valid, num_features, categorical_features, target_train, target_valid, EPOCHS, params):
    # feature_importances = np.zeros(len(features))
    train_dataset = lgb.Dataset(train[num_features + categorical_features], np.log(target_train), 
                                categorical_feature=categorical_features)
    valid_dataset = lgb.Dataset(valid[num_features + categorical_features], np.log(target_valid), 
                                categorical_feature=categorical_features)
    model = lgb.train(
        params=params,
        num_boost_round=EPOCHS,
        train_set=train_dataset,
        valid_sets=[train_dataset, valid_dataset],
        verbose_eval=100,
        early_stopping_rounds=int(5 / params['learning_rate']),
        feval=feval_deviation)

    y_valid = model.predict(valid[num_features + categorical_features])
    # feature_importances = model.feature_importance(importance_type='gain') / 5.0
    # lgb.plot_importance(model,max_num_features = 41)

    return model, np.exp(y_valid)


start_train_model_time = time.time()

boosting_seed = 41
boosting_params = {
    'bagging_fraction': 0.9,
    'bagging_freq': 1,
    'boost': 'gbdt',
    'feature_fraction': 0.9,
    'max_depth': 3,
    'learning_rate': 0.02,
    'metric': 'custom',
    'objective': 'regression_l1',
    'verbose': -1,
    'n_jobs': -1,
    'seed': boosting_seed,
    'feature_fraction_seed': boosting_seed,
    'bagging_seed': boosting_seed,
    'drop_seed': boosting_seed,
    'data_random_seed': boosting_seed,
}

# Количество эпох обучения
EPOCHS = 10000
scores = []
lgb_predicts = np.zeros(len(train))

lgb_models = []
for fold_num, (train_indexes, valid_indexes) in enumerate(split_list):
    start_time = time.time()
    print(f"Фолд: {fold_num}")

    train_sub_df = train[features_columns_order].loc[train_indexes].reset_index(drop=True)
    valid_sub_df = train[features_columns_order].loc[valid_indexes].reset_index(drop=True)

    print(f"Размер трейна = {train_sub_df.shape} Размер валидации = {valid_sub_df.shape}")
    # Обучаем LightGBM и делаем предикт на валидационной выборке
    model, predict_validation = train_lgb(
        train_sub_df,
        valid_sub_df,
        NUM_FEATURES_COLUMNS,
        CATEGORICAL_FEATURES_COLUMNS,
        train_sub_df[TARGET_COLUMNS[0]].values,
        valid_sub_df[TARGET_COLUMNS[0]].values,
        EPOCHS,
        boosting_params)

    lgb_models += [model]
    predict_on_validation = model.predict(valid_sub_df[NUM_FEATURES_COLUMNS + CATEGORICAL_FEATURES_COLUMNS])
    lgb_predicts[valid_indexes] = np.exp(predict_on_validation)
    targets_for_validation = valid_sub_df[TARGET_COLUMNS].values[:, 0]
    current_score = deviation_metric(targets_for_validation, predict_on_validation)
    scores += [current_score]
    print(
        f"Скор для фолда({fold_num}) : {np.round(current_score, 4)} средний скор на префиксе = {np.round(np.mean(scores), 4)} это заняло = {int(time.time() - start_time)} сек.")
print(f"Процесс обучения модели занял = {int(time.time() - start_train_model_time)} секунд")

Фолд: 0
Размер трейна = (4268, 83) Размер валидации = (225, 83)
Training until validation scores don't improve for 250 rounds
[100]	training's deviation_error: 1.83197	valid_1's deviation_error: 1.96792
[200]	training's deviation_error: 1.51318	valid_1's deviation_error: 1.8648
[300]	training's deviation_error: 1.37444	valid_1's deviation_error: 1.79724
[400]	training's deviation_error: 1.29135	valid_1's deviation_error: 1.75005
[500]	training's deviation_error: 1.23679	valid_1's deviation_error: 1.71761
[600]	training's deviation_error: 1.19414	valid_1's deviation_error: 1.69941
[700]	training's deviation_error: 1.16096	valid_1's deviation_error: 1.68737
[800]	training's deviation_error: 1.13221	valid_1's deviation_error: 1.68779
[900]	training's deviation_error: 1.11092	valid_1's deviation_error: 1.67793
[1000]	training's deviation_error: 1.0903	valid_1's deviation_error: 1.67273
[1100]	training's deviation_error: 1.07043	valid_1's deviation_error: 1.666
[1200]	training's deviation_e

In [24]:

# Предикт lgb на test
def get_lgb_predict(models, test):
    result = np.zeros(len(test))
    for model in models:
        predict = model.predict(test[NUM_FEATURES_COLUMNS + CATEGORICAL_FEATURES_COLUMNS])
        result += np.exp(predict) / len(models)
    return result


test_lgb_predict = get_lgb_predict(lgb_models, test)

test_lgb_predict.min(), test_lgb_predict.max(), test_lgb_predict.mean()

(16618.962328515547, 468614.19895911, 60258.90236155056)

In [25]:

# Кастомная метрика для xgboost
def xbg_error(preds, dtrain):
    labels = dtrain.get_label()
    err = deviation_metric(np.exp(labels), np.exp(preds))
    return 'deviation_error', err


def train_xgb(train, valid, num_features, categorical_features, target_train, target_valid, EPOCHS, params):
    dtest = xgb.DMatrix(test[num_features + categorical_features])
    y_valid = np.zeros(len(valid))

    dtrain = xgb.DMatrix(train[num_features + categorical_features], np.log(target_train), 
                        )
    dvalid = xgb.DMatrix(valid[num_features + categorical_features], np.log(target_valid), 
                        )
    model = xgb.train(
        params,
        dtrain,
        EPOCHS,
        [(dvalid, "valid")],
        verbose_eval=250,
        early_stopping_rounds=500,
        feval=xbg_error,
    )
    y_valid = model.predict(dvalid)

    return model, y_valid


start_train_model_time = time.time()

xgboost_seed = 41
xgboost_params = {
    "subsample": 0.60,
    "colsample_bytree": 0.40,
    "max_depth": 7,
    "learning_rate": 0.01,
    "objective": "reg:squarederror",
    'disable_default_eval_metric': 1,
    "nthread": -1,
    "max_bin": 64,
    'min_child_weight': 0.0,
    'reg_lambda': 0.0,
    'reg_alpha': 0.0,
    'seed': xgboost_seed,
}

# Количество эпох обучения
EPOCHS = 10000
scores = []
xgb_predicts = np.zeros(len(train))

xgb_models = []
for fold_num, (train_indexes, valid_indexes) in enumerate(split_list):
    start_time = time.time()
    print(f"Фолд: {fold_num}")

    train_sub_df = train[features_columns_order].loc[train_indexes].reset_index(drop=True)
    valid_sub_df = train[features_columns_order].loc[valid_indexes].reset_index(drop=True)

    print(f"Размер трейна = {train_sub_df.shape} Размер валидации = {valid_sub_df.shape}")
    # Обучаем Xgboost и делаем предикт на валидационной выборке
    model, predict_validation = train_xgb(
        train_sub_df,
        valid_sub_df,
        NUM_FEATURES_COLUMNS,
        CATEGORICAL_FEATURES_COLUMNS,
        train_sub_df[TARGET_COLUMNS[0]].values,
        valid_sub_df[TARGET_COLUMNS[0]].values,
        EPOCHS,
        xgboost_params)

    xgb_models += [model]
    predict_on_validation = model.predict(
        xgb.DMatrix(valid_sub_df[NUM_FEATURES_COLUMNS + CATEGORICAL_FEATURES_COLUMNS]))
    xgb_predicts[valid_indexes] = np.exp(predict_on_validation)
    targets_for_validation = valid_sub_df[TARGET_COLUMNS].values[:, 0]
    current_score = deviation_metric(targets_for_validation, predict_on_validation)
    scores += [current_score]
    print(
        f"Скор для фолда({fold_num}) : {np.round(current_score, 4)} средний скор на префиксе = {np.round(np.mean(scores), 4)} это заняло = {int(time.time() - start_time)} сек.")
print(f"Процесс обучения модели занял = {int(time.time() - start_train_model_time)} секунд")


# Предикт xgb на test
def get_xgb_predict(models, test):
    result = np.zeros(len(test))
    for model in models:
        predict = model.predict(xgb.DMatrix(test[NUM_FEATURES_COLUMNS + CATEGORICAL_FEATURES_COLUMNS]))
        result += predict / len(models)
    return result


test_xgb_predict = get_xgb_predict(xgb_models, test)

test_xgb_predict.min(), test_xgb_predict.max(), test_xgb_predict.mean()

train_targets = train[TARGET_COLUMNS[0]].values


def minimize_arit(W):
    ypred = W[0] * nn_predicts + W[1] * lgb_predicts + W[2] * xgb_predicts
    return deviation_metric(train_targets, ypred)

Фолд: 0
Размер трейна = (4268, 83) Размер валидации = (225, 83)
[0]	valid-deviation_error:9.00000
[250]	valid-deviation_error:6.41834
[500]	valid-deviation_error:1.21503
[750]	valid-deviation_error:1.33561
[932]	valid-deviation_error:1.31440
Скор для фолда(0) : 9.0 средний скор на префиксе = 9.0 это заняло = 3 сек.
Фолд: 1
Размер трейна = (4268, 83) Размер валидации = (225, 83)
[0]	valid-deviation_error:9.00000
[250]	valid-deviation_error:6.67476
[500]	valid-deviation_error:1.14004
[750]	valid-deviation_error:1.20935
[971]	valid-deviation_error:1.19046
Скор для фолда(1) : 9.0 средний скор на префиксе = 9.0 это заняло = 3 сек.
Фолд: 2
Размер трейна = (4268, 83) Размер валидации = (225, 83)
[0]	valid-deviation_error:9.00000
[250]	valid-deviation_error:6.69256
[500]	valid-deviation_error:0.96898
[750]	valid-deviation_error:1.15695
[952]	valid-deviation_error:1.16055
Скор для фолда(2) : 9.0 средний скор на префиксе = 9.0 это заняло = 3 сек.
Фолд: 3
Размер трейна = (4268, 83) Размер валидац

In [26]:
from catboost import CatBoostRegressor
from catboost import Pool

In [27]:
class CatBoostEvalMetricPearson(object):
    def get_final_error(self, error, weight):
        return error

    def is_max_optimal(self):
        return False

    def evaluate(self, approxes, target, weight):
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])
        preds = np.array(approxes[0])
        target = np.array(target)
        err = deviation_metric(np.exp(target), np.exp(preds))
        return err, 0


def train_cat(train, valid, num_features, categorical_features, target_train, target_valid, EPOCHS):





    test_data = Pool(data=test[num_features + categorical_features],
                  cat_features=categorical_features)


    train_data = Pool(data=train[num_features + categorical_features],
                      cat_features=categorical_features,
                      label=np.log(target_train))

    val_data = Pool(data=valid[num_features + categorical_features],
                cat_features=categorical_features,
                  label=np.log(target_valid))
    

    cat_model = CatBoostRegressor(
        learning_rate=0.012,
        iterations=15000,
        metric_period=100,
        eval_metric=CatBoostEvalMetricPearson(),
    )
    cat_model.fit(train_data, eval_set=val_data, use_best_model=True, early_stopping_rounds=300)
  
    y_valid = cat_model.predict(test_data)

    return cat_model, y_valid


start_train_model_time = time.time()

scores = []
cat_predicts = np.zeros(len(train))

cat_models = []
for fold_num, (train_indexes, valid_indexes) in enumerate(split_list):
    start_time = time.time()
    print(f"Фолд: {fold_num}")

    train_sub_df = train[features_columns_order].loc[train_indexes].reset_index(drop=True)
    valid_sub_df = train[features_columns_order].loc[valid_indexes].reset_index(drop=True)

    print(f"Размер трейна = {train_sub_df.shape} Размер валидации = {valid_sub_df.shape}")
    model, predict_validation = train_cat(
        train_sub_df,
        valid_sub_df,
        NUM_FEATURES_COLUMNS,
        CATEGORICAL_FEATURES_COLUMNS,
        train_sub_df[TARGET_COLUMNS[0]].values,
        valid_sub_df[TARGET_COLUMNS[0]].values,
        EPOCHS
        )

    cat_models += [model]
    predict_on_validation = model.predict(valid_sub_df[NUM_FEATURES_COLUMNS + CATEGORICAL_FEATURES_COLUMNS])
    cat_predicts[valid_indexes] = np.exp(predict_on_validation)
    targets_for_validation = valid_sub_df[TARGET_COLUMNS].values[:, 0]
    current_score = deviation_metric(targets_for_validation, predict_on_validation)
    scores += [current_score]
    print(
        f"Скор для фолда({fold_num}) : {np.round(current_score, 4)} средний скор на префиксе = {np.round(np.mean(scores), 4)} это заняло = {int(time.time() - start_time)} сек.")
print(f"Процесс обучения модели занял = {int(time.time() - start_train_model_time)} секунд")



Фолд: 0
Размер трейна = (4268, 83) Размер валидации = (225, 83)




0:	learn: 3.6306521	test: 3.3506940	best: 3.3506940 (0)	total: 153ms	remaining: 38m 15s
100:	learn: 2.2816934	test: 2.2319623	best: 2.2319623 (100)	total: 540ms	remaining: 1m 19s
200:	learn: 1.8608139	test: 2.0075182	best: 2.0075182 (200)	total: 930ms	remaining: 1m 8s
300:	learn: 1.6571693	test: 1.9604907	best: 1.9599099 (299)	total: 1.31s	remaining: 1m 4s
400:	learn: 1.5344864	test: 1.8883493	best: 1.8883493 (400)	total: 1.69s	remaining: 1m 1s
500:	learn: 1.4512465	test: 1.8371421	best: 1.8371421 (500)	total: 2.07s	remaining: 60s
600:	learn: 1.3858743	test: 1.7849629	best: 1.7845136 (599)	total: 2.45s	remaining: 58.8s
700:	learn: 1.3362125	test: 1.7603217	best: 1.7568215 (689)	total: 2.83s	remaining: 57.7s
800:	learn: 1.2939424	test: 1.7489645	best: 1.7489645 (800)	total: 3.21s	remaining: 56.9s
900:	learn: 1.2519733	test: 1.7312443	best: 1.7312443 (900)	total: 3.59s	remaining: 56.2s
1000:	learn: 1.2161282	test: 1.7209376	best: 1.7208907 (995)	total: 3.99s	remaining: 55.8s
1100:	learn:



0:	learn: 3.5956751	test: 3.4685541	best: 3.4685541 (0)	total: 52.9ms	remaining: 13m 12s
100:	learn: 2.2665210	test: 2.1492417	best: 2.1492417 (100)	total: 437ms	remaining: 1m 4s
200:	learn: 1.8511002	test: 1.8099356	best: 1.8099356 (200)	total: 878ms	remaining: 1m 4s
300:	learn: 1.6580147	test: 1.6913554	best: 1.6913554 (300)	total: 1.32s	remaining: 1m 4s
400:	learn: 1.5479568	test: 1.6449970	best: 1.6449970 (400)	total: 1.75s	remaining: 1m 3s
500:	learn: 1.4654375	test: 1.5953271	best: 1.5953271 (500)	total: 2.19s	remaining: 1m 3s
600:	learn: 1.4052732	test: 1.5660224	best: 1.5660224 (600)	total: 2.61s	remaining: 1m 2s
700:	learn: 1.3594636	test: 1.5475225	best: 1.5475225 (700)	total: 3.01s	remaining: 1m 1s
800:	learn: 1.3191755	test: 1.5258390	best: 1.5256909 (799)	total: 3.41s	remaining: 1m
900:	learn: 1.2875241	test: 1.5129597	best: 1.5129173 (899)	total: 3.81s	remaining: 59.6s
1000:	learn: 1.2556710	test: 1.5042075	best: 1.5042075 (1000)	total: 4.2s	remaining: 58.7s
1100:	learn: 



100:	learn: 2.2713789	test: 2.1993631	best: 2.1993631 (100)	total: 432ms	remaining: 1m 3s
200:	learn: 1.8602873	test: 1.7019493	best: 1.7019493 (200)	total: 837ms	remaining: 1m 1s
300:	learn: 1.6724229	test: 1.5583451	best: 1.5583451 (300)	total: 1.25s	remaining: 1m 1s
400:	learn: 1.5507042	test: 1.4558377	best: 1.4558377 (400)	total: 1.66s	remaining: 1m
500:	learn: 1.4692820	test: 1.3907886	best: 1.3907886 (500)	total: 2.07s	remaining: 59.9s
600:	learn: 1.4128081	test: 1.3524901	best: 1.3520312 (597)	total: 2.48s	remaining: 59.3s
700:	learn: 1.3659360	test: 1.3239218	best: 1.3226357 (698)	total: 2.88s	remaining: 58.8s
800:	learn: 1.3276786	test: 1.3109933	best: 1.3109933 (800)	total: 3.29s	remaining: 58.3s
900:	learn: 1.2911525	test: 1.2909525	best: 1.2904603 (898)	total: 3.69s	remaining: 57.8s
1000:	learn: 1.2578328	test: 1.2709168	best: 1.2704744 (998)	total: 4.1s	remaining: 57.3s
1100:	learn: 1.2228342	test: 1.2474744	best: 1.2474495 (1097)	total: 4.5s	remaining: 56.8s
1200:	learn:



100:	learn: 2.2820651	test: 2.4190933	best: 2.4190933 (100)	total: 436ms	remaining: 1m 4s
200:	learn: 1.8682024	test: 2.0010787	best: 2.0010787 (200)	total: 847ms	remaining: 1m 2s
300:	learn: 1.6795690	test: 1.8104984	best: 1.8104984 (300)	total: 1.26s	remaining: 1m 1s
400:	learn: 1.5644668	test: 1.6962294	best: 1.6962294 (400)	total: 1.67s	remaining: 1m
500:	learn: 1.4845197	test: 1.6160277	best: 1.6159948 (499)	total: 2.08s	remaining: 1m
600:	learn: 1.4229979	test: 1.5714169	best: 1.5714169 (600)	total: 2.48s	remaining: 59.5s
700:	learn: 1.3737605	test: 1.5423659	best: 1.5423659 (700)	total: 2.9s	remaining: 59.1s
800:	learn: 1.3307976	test: 1.5254365	best: 1.5248348 (794)	total: 3.3s	remaining: 58.5s
900:	learn: 1.2901456	test: 1.5068471	best: 1.5065648 (898)	total: 3.71s	remaining: 58s
1000:	learn: 1.2527881	test: 1.5056993	best: 1.5042068 (947)	total: 4.11s	remaining: 57.5s
1100:	learn: 1.2131286	test: 1.4913624	best: 1.4913624 (1100)	total: 4.51s	remaining: 56.9s
1200:	learn: 1.17



0:	learn: 3.6063420	test: 3.7375832	best: 3.7375832 (0)	total: 53.4ms	remaining: 13m 20s
100:	learn: 2.2591604	test: 2.4757102	best: 2.4757102 (100)	total: 435ms	remaining: 1m 4s
200:	learn: 1.8355695	test: 2.1116617	best: 2.1116617 (200)	total: 848ms	remaining: 1m 2s
300:	learn: 1.6484432	test: 1.9217747	best: 1.9217747 (300)	total: 1.27s	remaining: 1m 1s
400:	learn: 1.5376102	test: 1.7864071	best: 1.7864071 (400)	total: 1.68s	remaining: 1m 1s
500:	learn: 1.4616987	test: 1.6990474	best: 1.6990025 (499)	total: 2.09s	remaining: 1m
600:	learn: 1.4070561	test: 1.6343958	best: 1.6343958 (600)	total: 2.5s	remaining: 59.8s
700:	learn: 1.3546584	test: 1.6006275	best: 1.6006275 (700)	total: 2.9s	remaining: 59.2s
800:	learn: 1.3109032	test: 1.5792414	best: 1.5792414 (800)	total: 3.3s	remaining: 58.6s
900:	learn: 1.2707011	test: 1.5447523	best: 1.5439265 (898)	total: 3.72s	remaining: 58.2s
1000:	learn: 1.2346458	test: 1.5163497	best: 1.5160291 (993)	total: 4.12s	remaining: 57.6s
1100:	learn: 1.1



0:	learn: 3.5858702	test: 3.5956380	best: 3.5956380 (0)	total: 53.9ms	remaining: 13m 28s
100:	learn: 2.2505649	test: 2.5385997	best: 2.5385997 (100)	total: 436ms	remaining: 1m 4s
200:	learn: 1.8263268	test: 2.2001189	best: 2.2001189 (200)	total: 846ms	remaining: 1m 2s
300:	learn: 1.6426821	test: 2.0674813	best: 2.0674813 (300)	total: 1.29s	remaining: 1m 2s
400:	learn: 1.5266160	test: 1.9990530	best: 1.9990530 (400)	total: 1.73s	remaining: 1m 3s
500:	learn: 1.4455735	test: 1.9527475	best: 1.9527475 (500)	total: 2.16s	remaining: 1m 2s
600:	learn: 1.3820168	test: 1.9193397	best: 1.9191524 (599)	total: 2.61s	remaining: 1m 2s
700:	learn: 1.3306568	test: 1.8953612	best: 1.8950210 (699)	total: 3.04s	remaining: 1m 2s
800:	learn: 1.2890930	test: 1.8769644	best: 1.8769644 (800)	total: 3.45s	remaining: 1m 1s
900:	learn: 1.2458338	test: 1.8538230	best: 1.8538230 (900)	total: 3.85s	remaining: 1m
1000:	learn: 1.2085066	test: 1.8421369	best: 1.8421369 (1000)	total: 4.25s	remaining: 59.5s
1100:	learn:



100:	learn: 2.2892446	test: 2.3234340	best: 2.3234340 (100)	total: 435ms	remaining: 1m 4s
200:	learn: 1.8725100	test: 1.7365216	best: 1.7365216 (200)	total: 846ms	remaining: 1m 2s
300:	learn: 1.6784571	test: 1.5327579	best: 1.5327579 (300)	total: 1.26s	remaining: 1m 1s
400:	learn: 1.5501856	test: 1.4323333	best: 1.4323333 (400)	total: 1.67s	remaining: 1m
500:	learn: 1.4671639	test: 1.3897365	best: 1.3897365 (500)	total: 2.08s	remaining: 1m
600:	learn: 1.4070228	test: 1.3629995	best: 1.3629995 (600)	total: 2.49s	remaining: 59.7s
700:	learn: 1.3570050	test: 1.3277727	best: 1.3277727 (700)	total: 2.9s	remaining: 59.2s
800:	learn: 1.3140365	test: 1.3142976	best: 1.3142976 (800)	total: 3.31s	remaining: 58.6s
900:	learn: 1.2718682	test: 1.2995603	best: 1.2977248 (889)	total: 3.72s	remaining: 58.2s
1000:	learn: 1.2354415	test: 1.2861292	best: 1.2860331 (997)	total: 4.12s	remaining: 57.7s
1100:	learn: 1.2030544	test: 1.2766925	best: 1.2766925 (1100)	total: 4.53s	remaining: 57.2s
1200:	learn: 1



0:	learn: 3.5959039	test: 4.1806534	best: 4.1806534 (0)	total: 53.6ms	remaining: 13m 23s
100:	learn: 2.2765378	test: 2.6903588	best: 2.6903588 (100)	total: 443ms	remaining: 1m 5s
200:	learn: 1.8576017	test: 2.2172943	best: 2.2172943 (200)	total: 853ms	remaining: 1m 2s
300:	learn: 1.6887901	test: 2.0311366	best: 2.0311366 (300)	total: 1.26s	remaining: 1m 1s
400:	learn: 1.5930300	test: 1.9366652	best: 1.9366652 (400)	total: 1.65s	remaining: 1m
500:	learn: 1.5260922	test: 1.8846165	best: 1.8846165 (500)	total: 2.04s	remaining: 59.1s
600:	learn: 1.4680369	test: 1.8180481	best: 1.8180194 (599)	total: 2.43s	remaining: 58.2s
700:	learn: 1.4215352	test: 1.7624831	best: 1.7624831 (700)	total: 2.82s	remaining: 57.5s
800:	learn: 1.3846493	test: 1.7270565	best: 1.7270565 (800)	total: 3.21s	remaining: 56.9s
900:	learn: 1.3505153	test: 1.7024461	best: 1.7023271 (897)	total: 3.6s	remaining: 56.3s
1000:	learn: 1.3162042	test: 1.6868175	best: 1.6867814 (999)	total: 3.99s	remaining: 55.8s
1100:	learn: 1



0:	learn: 3.6208332	test: 3.4494664	best: 3.4494664 (0)	total: 52.5ms	remaining: 13m 6s
100:	learn: 2.2894691	test: 1.9872266	best: 1.9872266 (100)	total: 432ms	remaining: 1m 3s
200:	learn: 1.8809759	test: 1.5788248	best: 1.5788248 (200)	total: 841ms	remaining: 1m 1s
300:	learn: 1.6967370	test: 1.4478439	best: 1.4478439 (300)	total: 1.26s	remaining: 1m 1s
400:	learn: 1.5826403	test: 1.3950269	best: 1.3950269 (400)	total: 1.67s	remaining: 1m
500:	learn: 1.5025258	test: 1.3403741	best: 1.3403741 (500)	total: 2.07s	remaining: 1m
600:	learn: 1.4411726	test: 1.3068349	best: 1.3068349 (600)	total: 2.49s	remaining: 59.6s
700:	learn: 1.3914870	test: 1.2852815	best: 1.2848278 (693)	total: 2.89s	remaining: 59s
800:	learn: 1.3461759	test: 1.2644898	best: 1.2640648 (799)	total: 3.3s	remaining: 58.6s
900:	learn: 1.3072232	test: 1.2497622	best: 1.2497622 (900)	total: 3.71s	remaining: 58s
1000:	learn: 1.2672945	test: 1.2479215	best: 1.2474457 (996)	total: 4.12s	remaining: 57.6s
1100:	learn: 1.2316001



100:	learn: 2.2578959	test: 2.2293408	best: 2.2293408 (100)	total: 437ms	remaining: 1m 4s
200:	learn: 1.8449108	test: 1.7343715	best: 1.7343715 (200)	total: 846ms	remaining: 1m 2s
300:	learn: 1.6638176	test: 1.5809180	best: 1.5809180 (300)	total: 1.26s	remaining: 1m 1s
400:	learn: 1.5564237	test: 1.5248136	best: 1.5248136 (400)	total: 1.67s	remaining: 1m
500:	learn: 1.4856655	test: 1.4782255	best: 1.4782255 (500)	total: 2.08s	remaining: 1m
600:	learn: 1.4258691	test: 1.4146555	best: 1.4146555 (600)	total: 2.47s	remaining: 59.3s
700:	learn: 1.3718551	test: 1.3743947	best: 1.3743947 (700)	total: 2.88s	remaining: 58.8s
800:	learn: 1.3283817	test: 1.3440469	best: 1.3440469 (800)	total: 3.28s	remaining: 58.2s
900:	learn: 1.2911296	test: 1.3253418	best: 1.3253418 (900)	total: 3.69s	remaining: 57.7s
1000:	learn: 1.2531519	test: 1.2998406	best: 1.2998406 (1000)	total: 4.08s	remaining: 57.1s
1100:	learn: 1.2161406	test: 1.2791872	best: 1.2785207 (1097)	total: 4.49s	remaining: 56.6s
1200:	learn:



100:	learn: 2.2781699	test: 2.4472949	best: 2.4472949 (100)	total: 436ms	remaining: 1m 4s
200:	learn: 1.8580891	test: 1.9917153	best: 1.9917153 (200)	total: 843ms	remaining: 1m 2s
300:	learn: 1.6660933	test: 1.8101806	best: 1.8101806 (300)	total: 1.26s	remaining: 1m 1s
400:	learn: 1.5434389	test: 1.7076125	best: 1.7076125 (400)	total: 1.67s	remaining: 1m
500:	learn: 1.4608359	test: 1.6491472	best: 1.6491472 (500)	total: 2.08s	remaining: 1m
600:	learn: 1.3989525	test: 1.6258771	best: 1.6245994 (599)	total: 2.49s	remaining: 59.7s
700:	learn: 1.3472341	test: 1.6012206	best: 1.6008355 (696)	total: 2.9s	remaining: 59.1s
800:	learn: 1.3048386	test: 1.5944643	best: 1.5937002 (799)	total: 3.3s	remaining: 58.6s
900:	learn: 1.2600271	test: 1.5912742	best: 1.5906614 (892)	total: 3.71s	remaining: 58.1s
1000:	learn: 1.2192132	test: 1.5798077	best: 1.5778476 (969)	total: 4.11s	remaining: 57.5s
1100:	learn: 1.1802876	test: 1.5642288	best: 1.5633630 (1095)	total: 4.52s	remaining: 57.1s
1200:	learn: 1.



100:	learn: 2.2781289	test: 2.5806910	best: 2.5806910 (100)	total: 439ms	remaining: 1m 4s
200:	learn: 1.8441340	test: 2.3441520	best: 2.3441520 (200)	total: 847ms	remaining: 1m 2s
300:	learn: 1.6613869	test: 2.1756669	best: 2.1756669 (300)	total: 1.26s	remaining: 1m 1s
400:	learn: 1.5411778	test: 2.0349199	best: 2.0349199 (400)	total: 1.67s	remaining: 1m
500:	learn: 1.4596866	test: 1.9425825	best: 1.9425825 (500)	total: 2.08s	remaining: 1m
600:	learn: 1.4004927	test: 1.8870225	best: 1.8870225 (600)	total: 2.49s	remaining: 59.7s
700:	learn: 1.3492258	test: 1.8438788	best: 1.8435335 (699)	total: 2.9s	remaining: 59.1s
800:	learn: 1.3038881	test: 1.7990129	best: 1.7990129 (800)	total: 3.3s	remaining: 58.5s
900:	learn: 1.2645651	test: 1.7676735	best: 1.7676735 (900)	total: 3.7s	remaining: 58s
1000:	learn: 1.2235257	test: 1.7470723	best: 1.7470723 (1000)	total: 4.11s	remaining: 57.4s
1100:	learn: 1.1865449	test: 1.7256325	best: 1.7255431 (1090)	total: 4.51s	remaining: 57s
1200:	learn: 1.1553



100:	learn: 2.2741111	test: 2.1081658	best: 2.1081658 (100)	total: 437ms	remaining: 1m 4s
200:	learn: 1.8686395	test: 1.8941318	best: 1.8941318 (200)	total: 850ms	remaining: 1m 2s
300:	learn: 1.6772161	test: 1.8706775	best: 1.8691407 (298)	total: 1.27s	remaining: 1m 1s
400:	learn: 1.5577352	test: 1.8342161	best: 1.8322597 (396)	total: 1.68s	remaining: 1m 1s
500:	learn: 1.4742572	test: 1.8145560	best: 1.8141148 (498)	total: 2.09s	remaining: 1m
600:	learn: 1.4181788	test: 1.7912077	best: 1.7911524 (594)	total: 2.5s	remaining: 59.9s
700:	learn: 1.3714399	test: 1.7692425	best: 1.7692425 (700)	total: 2.9s	remaining: 59.2s
800:	learn: 1.3346837	test: 1.7578514	best: 1.7552849 (789)	total: 3.31s	remaining: 58.7s
900:	learn: 1.2991144	test: 1.7437171	best: 1.7437171 (900)	total: 3.71s	remaining: 58.1s
1000:	learn: 1.2658163	test: 1.7270271	best: 1.7270271 (1000)	total: 4.11s	remaining: 57.5s
1100:	learn: 1.2319374	test: 1.7130885	best: 1.7130885 (1100)	total: 4.51s	remaining: 57s
1200:	learn: 



100:	learn: 2.2451768	test: 2.7101555	best: 2.7101555 (100)	total: 429ms	remaining: 1m 3s
200:	learn: 1.8278955	test: 2.2774447	best: 2.2774447 (200)	total: 833ms	remaining: 1m 1s
300:	learn: 1.6435356	test: 2.1070696	best: 2.1070696 (300)	total: 1.25s	remaining: 1m 1s
400:	learn: 1.5277810	test: 1.9934603	best: 1.9934603 (400)	total: 1.67s	remaining: 1m
500:	learn: 1.4434127	test: 1.9021353	best: 1.9016791 (499)	total: 2.07s	remaining: 1m
600:	learn: 1.3839069	test: 1.8483446	best: 1.8473074 (597)	total: 2.49s	remaining: 59.6s
700:	learn: 1.3330710	test: 1.8096389	best: 1.8084086 (693)	total: 2.9s	remaining: 59.1s
800:	learn: 1.2940991	test: 1.7747221	best: 1.7743251 (796)	total: 3.3s	remaining: 58.6s
900:	learn: 1.2536001	test: 1.7418086	best: 1.7418086 (900)	total: 3.71s	remaining: 58.1s
1000:	learn: 1.2181969	test: 1.7216079	best: 1.7216079 (1000)	total: 4.12s	remaining: 57.6s
1100:	learn: 1.1860602	test: 1.7005914	best: 1.7005914 (1100)	total: 4.53s	remaining: 57.2s
1200:	learn: 1



100:	learn: 2.2909616	test: 1.9735892	best: 1.9735892 (100)	total: 431ms	remaining: 1m 3s
200:	learn: 1.8833660	test: 1.6089466	best: 1.6089466 (200)	total: 839ms	remaining: 1m 1s
300:	learn: 1.6926518	test: 1.4698206	best: 1.4698206 (300)	total: 1.26s	remaining: 1m 1s
400:	learn: 1.5732936	test: 1.4194986	best: 1.4194986 (400)	total: 1.67s	remaining: 1m
500:	learn: 1.4918474	test: 1.3985310	best: 1.3978291 (495)	total: 2.08s	remaining: 1m
600:	learn: 1.4308692	test: 1.3994373	best: 1.3938539 (532)	total: 2.5s	remaining: 59.8s
700:	learn: 1.3789152	test: 1.3806574	best: 1.3806574 (700)	total: 2.91s	remaining: 59.4s
800:	learn: 1.3331137	test: 1.3721648	best: 1.3710821 (754)	total: 3.32s	remaining: 58.9s
900:	learn: 1.2912423	test: 1.3638892	best: 1.3629718 (888)	total: 3.73s	remaining: 58.4s
1000:	learn: 1.2510510	test: 1.3597785	best: 1.3597785 (1000)	total: 4.13s	remaining: 57.8s
1100:	learn: 1.2159348	test: 1.3557855	best: 1.3552232 (1099)	total: 4.54s	remaining: 57.3s
1200:	learn: 



100:	learn: 2.2823710	test: 2.2270637	best: 2.2270637 (100)	total: 432ms	remaining: 1m 3s
200:	learn: 1.8579320	test: 1.9256308	best: 1.9256308 (200)	total: 840ms	remaining: 1m 1s
300:	learn: 1.6710584	test: 1.8054847	best: 1.8054847 (300)	total: 1.25s	remaining: 1m 1s
400:	learn: 1.5499173	test: 1.7349975	best: 1.7349975 (400)	total: 1.67s	remaining: 1m
500:	learn: 1.4697319	test: 1.6931968	best: 1.6921800 (499)	total: 2.07s	remaining: 1m
600:	learn: 1.4068767	test: 1.6596299	best: 1.6594033 (599)	total: 2.48s	remaining: 59.5s
700:	learn: 1.3554709	test: 1.6412504	best: 1.6412504 (700)	total: 2.9s	remaining: 59.1s
800:	learn: 1.3084631	test: 1.6370500	best: 1.6323973 (781)	total: 3.3s	remaining: 58.6s
900:	learn: 1.2661027	test: 1.6402623	best: 1.6323973 (781)	total: 3.71s	remaining: 58.1s
1000:	learn: 1.2273907	test: 1.6296934	best: 1.6296934 (1000)	total: 4.12s	remaining: 57.6s
1100:	learn: 1.1851306	test: 1.6193054	best: 1.6187720 (1096)	total: 4.53s	remaining: 57.1s
1200:	learn: 1



0:	learn: 3.5631075	test: 3.9645199	best: 3.9645199 (0)	total: 53.7ms	remaining: 13m 25s
100:	learn: 2.2375504	test: 2.3550841	best: 2.3550841 (100)	total: 437ms	remaining: 1m 4s
200:	learn: 1.8342268	test: 2.0434953	best: 2.0434953 (200)	total: 842ms	remaining: 1m 2s
300:	learn: 1.6557779	test: 1.8800266	best: 1.8800266 (300)	total: 1.26s	remaining: 1m 1s
400:	learn: 1.5389589	test: 1.7855804	best: 1.7855804 (400)	total: 1.68s	remaining: 1m 1s
500:	learn: 1.4582512	test: 1.7111873	best: 1.7111873 (500)	total: 2.09s	remaining: 1m
600:	learn: 1.3927412	test: 1.6575787	best: 1.6575787 (600)	total: 2.5s	remaining: 1m
700:	learn: 1.3419549	test: 1.6191689	best: 1.6191689 (700)	total: 2.92s	remaining: 59.5s
800:	learn: 1.2966493	test: 1.5984371	best: 1.5984371 (800)	total: 3.33s	remaining: 59s
900:	learn: 1.2560023	test: 1.5902312	best: 1.5902312 (900)	total: 3.73s	remaining: 58.4s
1000:	learn: 1.2165368	test: 1.5708201	best: 1.5702368 (998)	total: 4.14s	remaining: 57.9s
1100:	learn: 1.1807



0:	learn: 3.5805699	test: 3.5348576	best: 3.5348576 (0)	total: 53.4ms	remaining: 13m 20s
100:	learn: 2.2749076	test: 2.0091391	best: 2.0091391 (100)	total: 437ms	remaining: 1m 4s
200:	learn: 1.8551599	test: 1.5998641	best: 1.5998641 (200)	total: 849ms	remaining: 1m 2s
300:	learn: 1.6638910	test: 1.4446922	best: 1.4446922 (300)	total: 1.27s	remaining: 1m 1s
400:	learn: 1.5407124	test: 1.3611662	best: 1.3609150 (399)	total: 1.68s	remaining: 1m 1s
500:	learn: 1.4563038	test: 1.3218093	best: 1.3218093 (500)	total: 2.09s	remaining: 1m
600:	learn: 1.3950900	test: 1.2978193	best: 1.2977158 (599)	total: 2.5s	remaining: 60s
700:	learn: 1.3431564	test: 1.2784039	best: 1.2784039 (700)	total: 2.91s	remaining: 59.3s
800:	learn: 1.2978466	test: 1.2650589	best: 1.2650589 (800)	total: 3.32s	remaining: 58.8s
900:	learn: 1.2556805	test: 1.2602835	best: 1.2587932 (882)	total: 3.73s	remaining: 58.3s
1000:	learn: 1.2223171	test: 1.2515799	best: 1.2493306 (992)	total: 4.13s	remaining: 57.8s
1100:	learn: 1.1



100:	learn: 2.2874461	test: 1.9611819	best: 1.9611819 (100)	total: 443ms	remaining: 1m 5s
200:	learn: 1.8582581	test: 1.7463254	best: 1.7460603 (199)	total: 856ms	remaining: 1m 3s
300:	learn: 1.6670895	test: 1.6234341	best: 1.6234341 (300)	total: 1.27s	remaining: 1m 2s
400:	learn: 1.5500209	test: 1.5635847	best: 1.5635847 (400)	total: 1.69s	remaining: 1m 1s
500:	learn: 1.4694045	test: 1.5095406	best: 1.5095406 (500)	total: 2.1s	remaining: 1m
600:	learn: 1.4096235	test: 1.4751204	best: 1.4751204 (600)	total: 2.51s	remaining: 1m
700:	learn: 1.3612508	test: 1.4491031	best: 1.4489617 (699)	total: 2.92s	remaining: 59.6s
800:	learn: 1.3198053	test: 1.4285682	best: 1.4285682 (800)	total: 3.33s	remaining: 59s
900:	learn: 1.2789784	test: 1.4095265	best: 1.4090160 (898)	total: 3.73s	remaining: 58.4s
1000:	learn: 1.2397467	test: 1.3908417	best: 1.3908417 (1000)	total: 4.14s	remaining: 57.9s
1100:	learn: 1.2016456	test: 1.3700755	best: 1.3698920 (1099)	total: 4.54s	remaining: 57.4s
1200:	learn: 1.



0:	learn: 3.6010025	test: 3.5506341	best: 3.5506341 (0)	total: 53.1ms	remaining: 13m 16s
100:	learn: 2.2829011	test: 2.1106872	best: 2.1106872 (100)	total: 437ms	remaining: 1m 4s
200:	learn: 1.8702343	test: 1.7458089	best: 1.7458089 (200)	total: 857ms	remaining: 1m 3s
300:	learn: 1.6756832	test: 1.5492084	best: 1.5492084 (300)	total: 1.27s	remaining: 1m 2s
400:	learn: 1.5573759	test: 1.4786759	best: 1.4786759 (400)	total: 1.69s	remaining: 1m 1s
500:	learn: 1.4795572	test: 1.4247589	best: 1.4247589 (500)	total: 2.1s	remaining: 1m
600:	learn: 1.4196923	test: 1.3884986	best: 1.3881891 (593)	total: 2.51s	remaining: 1m
700:	learn: 1.3689058	test: 1.3667969	best: 1.3667495 (693)	total: 2.93s	remaining: 59.7s
800:	learn: 1.3229831	test: 1.3514663	best: 1.3513452 (799)	total: 3.33s	remaining: 59.1s
900:	learn: 1.2842701	test: 1.3301991	best: 1.3293276 (898)	total: 3.74s	remaining: 58.5s
1000:	learn: 1.2497323	test: 1.3201893	best: 1.3197656 (997)	total: 4.15s	remaining: 58s
1100:	learn: 1.2147

In [28]:

# Предикт xgb на test
def get_cat_predict(models, test):
    result = np.zeros(len(test))
    for model in cat_models:
        predict = model.predict(test[NUM_FEATURES_COLUMNS + CATEGORICAL_FEATURES_COLUMNS])
        result += np.exp(predict) / len(models)
    return result


test_cat_predict = get_cat_predict(xgb_models, test)

test_cat_predict.min(), test_cat_predict.max(), test_cat_predict.mean()



(16538.684613884474, 1271158.006455984, 61562.51109312501)

In [33]:
np.save('nn_predicts.npy', nn_predicts)
np.save('lgb_predicts.npy', lgb_predicts)
np.save('xgb_predicts.npy', xgb_predicts)
np.save('cat_predicts.npy', cat_predicts)


In [87]:
def minimize_arit(W):
    
    vect = train.city.apply(lambda x: W[4] if x == 0 else W[5] if x == 4 else 1.0).values
    
    ypred = (W[0] * nn_predicts + W[1] * lgb_predicts + W[2] * xgb_predicts + W[3] * cat_predicts) * vect
    return deviation_metric(train_targets, ypred)

W = minimize(minimize_arit, [1.0 / 4] * 4 + [1.0] * 2, options={'gtol': 1e-6, 'disp': True}).x

Optimization terminated successfully.
         Current function value: 1.017171
         Iterations: 17
         Function evaluations: 176
         Gradient evaluations: 22


In [88]:
W

array([ 0.11291272, -0.03197168,  0.41792838,  0.41104496,  1.00082998,
        0.99003515])

In [89]:
vect = test.city.apply(lambda x: W[4] if x == 0 else W[5] if x == 4 else 1.0).values

test_submission = pd.read_csv('dataset/test_submission.csv')
test_submission['per_square_meter_price'] = (test_nn_predict * W[0] + test_lgb_predict * W[1] + np.exp(test_xgb_predict) * W[2] + test_cat_predict * W[3]) * vect
test_submission['per_square_meter_price'] = test_submission['per_square_meter_price'].apply(lambda x: max(0.0, x))
test_submission.to_csv('submission_city.csv', index = False)

In [None]:
test_cat_predict

In [None]:
test_xgb_predict

In [92]:
sub118 = pd.read_csv('submission_1.18.csv')
sub = pd.read_csv('submission.csv')

In [96]:
sub118['pred_city'] = sub.per_square_meter_price
sub118

Unnamed: 0,id,per_square_meter_price,pred_city
0,COL_289284,39867.435371,40259.468314
1,COL_289305,39655.637291,40036.357894
2,COL_289318,38896.851638,39364.412040
3,COL_289354,92529.584903,89562.368291
4,COL_289399,45660.643114,46518.927106
...,...,...,...
2969,COL_455089,24861.348343,24744.822612
2970,COL_455212,40821.884639,40702.485920
2971,COL_455261,40610.014172,41730.787348
2972,COL_455381,42825.089013,42580.386226


In [90]:
train_ = pd.read_csv('dataset/train.csv')
train_ = train_[train_.price_type == 1].reset_index(drop=True)

In [91]:
list(zip(train_[:20].city, train[:20].city))

[('Красноярск', 0),
 ('Саратов', 1),
 ('Красноярск', 0),
 ('Иркутск', 2),
 ('Белгород', 3),
 ('Санкт-Петербург', 4),
 ('Калуга', 5),
 ('Сургут', 6),
 ('Иркутск', 2),
 ('Иркутск', 2),
 ('Кемерово', 7),
 ('Иркутск', 2),
 ('Новокузнецк', 8),
 ('Новокузнецк', 8),
 ('Калуга', 5),
 ('Белгород', 3),
 ('Пермь', 9),
 ('Пермь', 9),
 ('Владивосток', 10),
 ('Петрозаводск', 11)]