In [32]:
# Device switch read by fit(): when True, training runs inside a tf.device('/device:GPU:0') scope.
IS_GPU = False
# Import the required libraries
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings("ignore")
import tensorflow as tf
import tensorflow.keras as keras
from sklearn.preprocessing import MinMaxScaler, QuantileTransformer
from sklearn.model_selection import KFold
import lightgbm as lgb
import xgboost as xgb
import time
from scipy.optimize import minimize
from neighbors import Neighborhoods

from indices import MainDataset
from dnn_utils import preprocess_floor
from metric import metrics_stat, deviation_metric

def reset_tensorflow_session(seed=41):
    """Clear the Keras global state and re-seed TensorFlow and NumPy.

    Called before each model is built so that every fold starts from the same
    random state.

    Args:
        seed: Value passed to both ``tf.random.set_seed`` and ``np.random.seed``.
            Defaults to 41, the value that used to be hard-coded here.
    """
    tf.keras.backend.clear_session()
    tf.random.set_seed(seed)
    np.random.seed(seed)


# NOTE(review): neither constant is referenced in the visible part of this notebook;
# presumably they mirror parameters of the competition metric in metric.py — confirm or remove.
THRESHOLD = 0.15
NEGATIVE_WEIGHT = 1.1

In [20]:

# Categorical feature columns (label-encoded to integer ids before modelling)
CATEGORICAL_FEATURES_COLUMNS = ['region', 'city', 'realty_type', 'floor', 'osm_city_nearest_name', 'street']
# Numeric feature columns (quantile- and min-max-transformed before modelling)
NUM_FEATURES_COLUMNS = ['lat', 'lng', 'osm_amenity_points_in_0.001',
                        'osm_amenity_points_in_0.005', 'osm_amenity_points_in_0.0075',
                        'osm_amenity_points_in_0.01', 'osm_building_points_in_0.001',
                        'osm_building_points_in_0.005', 'osm_building_points_in_0.0075',
                        'osm_building_points_in_0.01', 'osm_catering_points_in_0.001',
                        'osm_catering_points_in_0.005', 'osm_catering_points_in_0.0075',
                        'osm_catering_points_in_0.01', 'osm_city_closest_dist',
                        'osm_city_nearest_population',
                        'osm_crossing_closest_dist', 'osm_crossing_points_in_0.001',
                        'osm_crossing_points_in_0.005', 'osm_crossing_points_in_0.0075',
                        'osm_crossing_points_in_0.01', 'osm_culture_points_in_0.001',
                        'osm_culture_points_in_0.005', 'osm_culture_points_in_0.0075',
                        'osm_culture_points_in_0.01', 'osm_finance_points_in_0.001',
                        'osm_finance_points_in_0.005', 'osm_finance_points_in_0.0075',
                        'osm_finance_points_in_0.01', 'osm_healthcare_points_in_0.005',
                        'osm_healthcare_points_in_0.0075', 'osm_healthcare_points_in_0.01',
                        'osm_historic_points_in_0.005', 'osm_historic_points_in_0.0075',
                        'osm_historic_points_in_0.01', 'osm_hotels_points_in_0.005',
                        'osm_hotels_points_in_0.0075', 'osm_hotels_points_in_0.01',
                        'osm_leisure_points_in_0.005', 'osm_leisure_points_in_0.0075',
                        'osm_leisure_points_in_0.01', 'osm_offices_points_in_0.001',
                        'osm_offices_points_in_0.005', 'osm_offices_points_in_0.0075',
                        'osm_offices_points_in_0.01', 'osm_shops_points_in_0.001',
                        'osm_shops_points_in_0.005', 'osm_shops_points_in_0.0075',
                        'osm_shops_points_in_0.01', 'osm_subway_closest_dist',
                        'osm_train_stop_closest_dist', 'osm_train_stop_points_in_0.005',
                        'osm_train_stop_points_in_0.0075', 'osm_train_stop_points_in_0.01',
                        'osm_transport_stop_closest_dist', 'osm_transport_stop_points_in_0.005',
                        'osm_transport_stop_points_in_0.0075',
                        'osm_transport_stop_points_in_0.01',
                        'reform_count_of_houses_1000', 'reform_count_of_houses_500',
                        'reform_house_population_1000', 'reform_house_population_500',
                        'reform_mean_floor_count_1000', 'reform_mean_floor_count_500',
                        'reform_mean_year_building_1000', 'reform_mean_year_building_500', 'total_square',
                        "neighbor_dist", "neighbor_total_price", "neighbor_square_price", "neighbor10_dist",
                        "has_basement", "floor_count"

                        ]
# Target column (kept as a one-element list so it can be used for 2-D column selection)
TARGET_COLUMNS = ['per_square_meter_price']

In [6]:
# Read the raw train and test tables.
train = pd.read_csv('dataset/train.csv')
test = pd.read_csv('dataset/test.csv')
# Only training rows with price_type == 1 are kept.
train = train.loc[train["price_type"] == 1].reset_index(drop=True)
# Tag each row with its origin so the combined frame can be split back apart later.
train['is_train'] = 1
test['is_train'] = 0
# Stack train on top of test under a fresh 0..n-1 index.
dataset = pd.concat([train, test], ignore_index=True)


In [7]:
# Wrap both CSVs in the project's MainDataset helper.
# NOTE(review): presumably need_index=False skips building a spatial index for the test file — confirm in indices.py.
train_dataset_index = MainDataset("dataset/train.csv")
test_dataset_index = MainDataset("dataset/test.csv", need_index=False)
# Neighbour lookups are built from the train dataset's index only.
neighborhoods = Neighborhoods(train_dataset_index.index)


In [8]:
# Nearest-neighbour features. -999.0 marks "no neighbour computed"; the sentinel is a float so the
# columns keep one dtype when real distances and prices are written into them.
for sentinel_column in ("neighbor_dist", "neighbor_total_price", "neighbor_square_price", "neighbor10_dist"):
    dataset[sentinel_column] = -999.0

for d in [test_dataset_index, train_dataset_index]:
    for o in d.all_objects:
        # Only objects with price_type == 1 receive neighbour features.
        if o.row["price_type"] != 1:
            continue
        # One query for the 12 closest objects serves both the closest neighbour (position 0)
        # and the entry at position 10. Each entry is indexed as (object, distance).
        neighbor = neighborhoods.get_haversine_closest(o, 12)
        n = neighbor[0]
        closest_square_price = n[0].row["per_square_meter_price"]
        # Build the row selector once per object instead of once per written column.
        row_mask = dataset["id"] == o.row["id"]
        dataset.loc[row_mask, "neighbor_dist"] = n[1]
        dataset.loc[row_mask, "neighbor_total_price"] = closest_square_price * n[0].row["total_square"]
        dataset.loc[row_mask, "neighbor_square_price"] = closest_square_price
        dataset.loc[row_mask, "neighbor10_dist"] = neighbor[10][1]


In [11]:

# Run the project's floor preprocessing over the combined train+test frame.
# NOTE(review): presumably this is where `has_basement` and `floor_count` are derived — confirm in dnn_utils.
dataset=preprocess_floor.preprocess(dataset)



In [13]:
# Snapshot of the combined frame taken before categorical encoding and scaling.
# NOTE(review): not referenced again in the visible cells — confirm it is still needed.
dataset_copy = dataset.copy()

In [12]:
# Preview the first rows of the combined frame, including the neighbour columns.
dataset.head()

Unnamed: 0,city,floor,id,lat,lng,osm_amenity_points_in_0.001,osm_amenity_points_in_0.005,osm_amenity_points_in_0.0075,osm_amenity_points_in_0.01,osm_building_points_in_0.001,...,date,realty_type,price_type,is_train,neighbor_dist,neighbor_total_price,neighbor_square_price,neighbor10_dist,has_basement,floor_count
0,Красноярск,-999,COL_62,56.063615,92.958428,0,7,14,26,0,...,2020-01-05,110,1,1,0.334024,995000.0,41458.333333,0.451369,-999,-999
1,Саратов,-999,COL_71,51.534581,46.020549,13,198,345,462,0,...,2020-01-05,10,1,1,0.086136,2985000.0,33166.666667,0.190652,-999,-999
2,Красноярск,-999,COL_140,56.026884,92.818323,3,15,23,33,0,...,2020-01-05,10,1,1,0.027117,18308000.0,61026.666667,0.291762,-999,-999
3,Иркутск,-999,COL_202,52.275528,104.251444,0,10,26,40,0,...,2020-01-05,10,1,1,0.220089,5870000.0,58700.0,0.435699,-999,-999
4,Белгород,-999,COL_207,50.576545,36.584197,4,48,73,92,0,...,2020-01-05,10,1,1,0.046677,4179000.0,59700.0,0.147191,-999,-999


In [14]:
def encode_categorical_features(df, categorical_columns):
    """Label-encode the given columns and return the result as a new frame.

    Each distinct value of a column is replaced by an integer id assigned in
    order of first appearance (0, 1, 2, ...). The input frame is left
    untouched, matching the copy-then-transform behaviour of the other
    preprocessing helpers in this notebook.

    Args:
        df: Source DataFrame.
        categorical_columns: Iterable of column names to encode.

    Returns:
        A copy of ``df`` in which every listed column holds integer ids.
    """
    encoded = df.copy()
    for column in categorical_columns:
        # value -> id, numbered by order of first occurrence in the column
        dict_encoding = {key: val for val, key in enumerate(encoded[column].unique())}
        encoded[column] = encoded[column].map(dict_encoding)
    return encoded

In [15]:

# Quantile transformation of the data
def get_quantile_transform(_df, columns_for_quantilization, random_state=41, n_quantiles=100,
                           output_distribution='normal'):
    """Return a copy of ``_df`` with each listed column quantile-transformed.

    A separate ``QuantileTransformer`` is fitted on, and applied to, every
    column in ``columns_for_quantilization``; all other columns are copied
    through unchanged.

    Args:
        _df: Source DataFrame (not modified).
        columns_for_quantilization: Names of the columns to transform.
        random_state: Forwarded to ``QuantileTransformer``.
        n_quantiles: Forwarded to ``QuantileTransformer``.
        output_distribution: Forwarded to ``QuantileTransformer``.

    Returns:
        The transformed copy.
    """
    transformed = _df.copy()
    transformer_kwargs = dict(random_state=random_state, n_quantiles=n_quantiles,
                              output_distribution=output_distribution)
    for column in columns_for_quantilization:
        transformer = QuantileTransformer(**transformer_kwargs)
        transformed[column] = transformer.fit_transform(transformed[[column]])
    return transformed

In [16]:

# Min-max scaling of the data
def get_minmax_transform(_df, columns_for_quantilization, min_value=-1, max_value=1):
    """Return a copy of ``_df`` with each listed column scaled to ``[min_value, max_value]``.

    A separate ``MinMaxScaler`` is fitted on, and applied to, every column in
    ``columns_for_quantilization``; all other columns are copied through
    unchanged.

    Args:
        _df: Source DataFrame (not modified).
        columns_for_quantilization: Names of the columns to scale.
        min_value: Lower bound of the target range.
        max_value: Upper bound of the target range.

    Returns:
        The scaled copy.
    """
    scaled = _df.copy()
    target_range = (min_value, max_value)
    for column in columns_for_quantilization:
        column_scaler = MinMaxScaler(feature_range=target_range)
        scaled[column] = column_scaler.fit_transform(scaled[[column]])
    return scaled

In [21]:
# Label-encode the categorical features (each distinct value becomes an integer id)
data = encode_categorical_features(dataset, CATEGORICAL_FEATURES_COLUMNS)
# Normalise the numeric features: quantile transform first, then min-max scaling
data = get_quantile_transform(data, NUM_FEATURES_COLUMNS)
data = get_minmax_transform(data, NUM_FEATURES_COLUMNS)
# Fill remaining NaN values with the column means. numeric_only=True is required because the
# frame still holds string columns (e.g. `id`, `date`), and pandas >= 2.0 raises on them otherwise.
data = data.fillna(data.mean(numeric_only=True))
# Split the combined frame back into train and test, dropping the helper flag
train = data[data.is_train == 1].reset_index(drop=True)
test = data[data.is_train == 0].reset_index(drop=True)
train = train.drop(columns=['is_train'])
test = test.drop(columns=['is_train'])

In [22]:
# Preview the first rows after encoding and scaling.
data.head()

Unnamed: 0,city,floor,id,lat,lng,osm_amenity_points_in_0.001,osm_amenity_points_in_0.005,osm_amenity_points_in_0.0075,osm_amenity_points_in_0.01,osm_building_points_in_0.001,...,date,realty_type,price_type,is_train,neighbor_dist,neighbor_total_price,neighbor_square_price,neighbor10_dist,has_basement,floor_count
0,0,0,COL_62,0.060226,0.223088,-1.0,-0.234768,-0.256798,-0.245381,-1.0,...,2020-01-05,0,1,1,0.29513,-0.335815,-0.100331,0.117364,-1.0,-1.0
1,1,0,COL_71,-0.284322,-0.042332,0.298058,0.285412,0.284259,0.272468,-1.0,...,2020-01-05,1,1,1,-0.125504,-0.153689,-0.153689,-0.153382,-1.0,-1.0
2,0,0,COL_140,0.03388,0.146323,0.067077,-0.131259,-0.182216,-0.206562,-1.0,...,2020-01-05,1,1,1,-0.370964,0.156685,0.014726,-0.012172,-1.0,-1.0
3,2,0,COL_202,-0.206973,0.27153,-1.0,-0.194,-0.167492,-0.178437,-1.0,...,2020-01-05,1,1,1,0.17513,-0.024577,0.00198,0.108546,-1.0,-1.0
4,3,0,COL_207,-0.335843,-0.118494,0.110487,0.077531,0.017066,-0.039228,-1.0,...,2020-01-05,1,1,1,-0.260458,-0.093665,0.007306,-0.236691,-1.0,-1.0


In [23]:
def get_standart_split(data, n_splits=5, seed=41):
    """Produce shuffled K-fold index pairs for ``data``.

    Args:
        data: Anything ``KFold.split`` accepts; only its length matters here.
        n_splits: Number of folds.
        seed: Random state used for shuffling.

    Returns:
        A list of ``(train_index, test_index)`` tuples, one per fold.
    """
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    return [(train_index, test_index) for train_index, test_index in folds.split(data)]

In [24]:
def get_dataset(arr_features, arr_target, arr_region, arr_city, arr_realty, batch_size):
    """Pack the model inputs and target into a batched ``tf.data.Dataset``.

    The dictionary keys match the names of the Keras input layers and of the
    output layer created in ``compile_model``.

    Args:
        arr_features: Numeric feature array.
        arr_target: Target array.
        arr_region: Encoded region ids.
        arr_city: Encoded city ids.
        arr_realty: Encoded realty-type ids.
        batch_size: Number of samples per batch.

    Returns:
        A dataset yielding ``(inputs_dict, outputs_dict)`` batches, in input order.
    """
    inputs = {
        "model_features_input": arr_features,
        "model_region_input": arr_region,
        "model_city_input": arr_city,
        "model_realty_input": arr_realty,
    }
    outputs = {"model_output": arr_target}
    slices = tf.data.Dataset.from_tensor_slices((inputs, outputs))
    return slices.batch(batch_size)

In [25]:
def get_columns_order(columns):
    """Return the canonical column order used when slicing the training frame.

    Every column that is neither categorical nor the target comes first, sorted
    alphabetically. The categorical columns and then the target columns follow
    in their declared order, whether or not they appear in ``columns``.

    Args:
        columns: Iterable of column names.

    Returns:
        List of column names in canonical order.
    """
    reserved = set(CATEGORICAL_FEATURES_COLUMNS) | set(TARGET_COLUMNS)
    remaining = [name for name in columns if name not in reserved]
    remaining.sort()
    return remaining + CATEGORICAL_FEATURES_COLUMNS + TARGET_COLUMNS

In [62]:

# Callback that tracks the target metric
class CustomCallback(keras.callbacks.Callback):
    """Print the deviation metric on the validation data after every epoch."""

    def __init__(self, val_dataset, val_targets):
        """Store the validation dataset and its 2-D target array."""
        super().__init__()
        self.val_dataset = val_dataset
        self.val_targets = val_targets

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation dataset and print the rounded metric value."""
        raw_predictions = self.model.predict(self.val_dataset)
        # Both arrays are 2-D; the metric is computed on their first column.
        predicts = raw_predictions[:, 0]
        targets = self.val_targets[:, 0]
        print(f"Текущий реальный скор(валидационная часть): {np.round(deviation_metric(targets, predicts), 4)}")

In [63]:

def Dropout(x):
    """Shorthand that returns a new ``keras.layers.Dropout`` layer built with argument ``x``."""
    return keras.layers.Dropout(x)


def Flatten():
    """Shorthand that returns a new ``keras.layers.Flatten`` layer."""
    return keras.layers.Flatten()


def Concatenate():
    """Shorthand that returns a new ``keras.layers.Concatenate`` layer."""
    return keras.layers.Concatenate()


# Model training function
def fit(model, epochs, train_dataset, val_dataset, val_targets, verbose=True):
    """Train ``model`` with early stopping, LR reduction and per-epoch metric reporting.

    The same callbacks and ``model.fit`` arguments are used on both devices;
    the module-level ``IS_GPU`` flag only decides whether training runs inside
    a ``tf.device('/device:GPU:0')`` scope.

    The ``workers`` argument is deliberately not passed to ``model.fit``: Keras
    documents it as applying only to generator / ``keras.utils.Sequence``
    input, and newer Keras releases no longer accept it.

    Args:
        model: Compiled Keras model.
        epochs: Maximum number of epochs.
        train_dataset: Training data passed to ``model.fit``.
        val_dataset: Validation data, used for ``val_loss`` and by the metric callback.
        val_targets: 2-D array of validation targets for ``CustomCallback``.
        verbose: Forwarded to ``model.fit``.

    Returns:
        The ``History`` object returned by ``model.fit``.
    """
    # The non-GPU label is reproduced exactly as in the original log message.
    device_label = "GPU" if IS_GPU else "СPU"
    print(f"Начинаю обучение модели ({device_label}) количество эпох = {epochs}")

    callbacks = [
        # Stop once val_loss has not improved for 100 epochs and restore the best weights
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=2.5e-6, patience=100,
                                         restore_best_weights=True, mode='min'),
        # Halve the learning rate after 10 epochs without val_loss improvement
        tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=1e-9,
                                             mode='min'),
        # Print the target metric on the validation data after every epoch
        CustomCallback(val_dataset, val_targets),
    ]

    def _run_fit():
        # shuffle=True is kept from the original call; Keras ignores it for tf.data.Dataset input.
        return model.fit(train_dataset, epochs=epochs, validation_data=val_dataset, verbose=verbose,
                         shuffle=True, callbacks=callbacks)

    if IS_GPU:
        with tf.device('/device:GPU:0'):
            return _run_fit()
    return _run_fit()

In [64]:

# Custom training loss
def tf_custom_loss(y_true, y_pred):
    """Piecewise loss on the relative error ``|y_true - y_pred| / y_true``.

    * relative error <= 0.6: ``(error / 0.15 - 1) ** 2``, which is zero at an
      error of 0.15 and reaches 9 at an error of 0.6;
    * otherwise: ``9 + error``, so the loss keeps growing linearly from there.

    Args:
        y_true: Target tensor; it is used as the divisor, so it must be non-zero.
        y_pred: Prediction tensor.

    Returns:
        Element-wise loss tensor.
    """
    relative_error = tf.abs(y_true - y_pred) / y_true
    quadratic_branch = tf.square(relative_error / 0.15 - 1)
    linear_branch = 9.0 * tf.ones_like(quadratic_branch) + tf.abs(relative_error)
    return tf.where(relative_error <= 0.6, quadratic_branch, linear_branch)

In [65]:

# Build and compile the model
def compile_model(train_dataset, val_dataset, num_features, max_realty, max_region, max_city, lr=5e-4):
    """Build and compile the embedding + dense regression network.

    The three categorical inputs go through separate embedding layers; the
    embeddings are flattened and concatenated with the numeric features, then
    passed through dense layers of 128, 64 and 32 units to a single-unit ReLU
    output.

    Args:
        train_dataset: Unused; kept so existing callers keep working.
        val_dataset: Unused; kept so existing callers keep working.
        num_features: Width of the numeric feature input.
        max_realty: Largest encoded realty-type id (embedding size is ``max + 1``).
        max_region: Largest encoded region id.
        max_city: Largest encoded city id.
        lr: Learning rate for the Adam optimizer.

    Returns:
        A compiled ``keras.Model`` using ``tf_custom_loss``.
    """
    reset_tensorflow_session()
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr)

    # Shapes are real tuples: `(n)` is just the int n, which Keras documents as unsupported.
    model_input_layer = tf.keras.Input(shape=(num_features,), name="model_features_input")
    model_input_realty = tf.keras.Input(shape=(1,), name="model_realty_input")
    model_input_region = tf.keras.Input(shape=(1,), name="model_region_input")
    model_input_city = tf.keras.Input(shape=(1,), name="model_city_input")

    # The embeddings are float64, unlike the later layers that use the Keras default dtype.
    model_embedding_layer_realty = keras.layers.Embedding(max_realty + 1, 4, input_length=1, dtype=tf.float64)(
        model_input_realty)
    model_embedding_layer_region = keras.layers.Embedding(max_region + 1, 32, input_length=1, dtype=tf.float64)(
        model_input_region)
    model_embedding_layer_city = keras.layers.Embedding(max_city + 1, 32, input_length=1, dtype=tf.float64)(
        model_input_city)

    concatenated_input_layer = Concatenate()(
        [Flatten()(model_embedding_layer_realty), Flatten()(model_embedding_layer_region),
         Flatten()(model_embedding_layer_city), Flatten()(model_input_layer)])

    layer_0 = keras.layers.Dense(128, activation="relu")(concatenated_input_layer)
    layer_1 = keras.layers.Dense(64, activation="relu")(layer_0)
    layer_2 = keras.layers.Dense(32, activation="relu")(layer_1)
    model_output_layer = keras.layers.Dense(1, activation="relu", name="model_output")(layer_2)

    cur_model = keras.Model(
        inputs=[
            model_input_layer,
            model_input_realty,
            model_input_region,
            model_input_city,
        ],
        outputs=[
            model_output_layer,
        ])

    print(f"Модель: input_shape = {cur_model.input_shape} output_shape = {cur_model.output_shape}")
    cur_model.compile(loss=tf_custom_loss, optimizer=optimizer)

    return cur_model

In [66]:
# Cross-validated training of the neural network: one model per fold, out-of-fold predictions in nn_predicts.
features_columns_order = get_columns_order(train.columns.values.tolist())
split_list = get_standart_split(train)

start_train_model_time = time.time()
# Batch size for the training tf.data Dataset
BATCH_SIZE = int(2 ** 5)
# Maximum number of training epochs
EPOCHS = 500
# Number of numeric input features of the model
NUM_FEATURES = len(NUM_FEATURES_COLUMNS)
# Largest encoded ids of the categorical features (used to size the embedding layers)
MAX_REALTY = max(train['realty_type'].max(), test['realty_type'].max())
MAX_REGION = max(train['region'].max(), test['region'].max())
MAX_CITY = max(train['city'].max(), test['city'].max())
# Target multiplier, intended to speed up convergence; predictions are divided by it again afterwards
MUL_TARGET = 5e-5

scores = []
nn_predicts = np.zeros(len(train))
models_nn = []

for fold_num, (train_indexes, valid_indexes) in enumerate(split_list):
    start_time = time.time()
    print(f"Фолд: {fold_num}")

    train_sub_df = train[features_columns_order].loc[train_indexes].reset_index(drop=True)
    valid_sub_df = train[features_columns_order].loc[valid_indexes].reset_index(drop=True)

    print(f"Размер трейна = {train_sub_df.shape} Размер валидации = {valid_sub_df.shape}")

    # Build the datasets (the validation set is served as one single batch)
    train_ds = get_dataset(
        train_sub_df[NUM_FEATURES_COLUMNS].values,
        train_sub_df[TARGET_COLUMNS].values * MUL_TARGET,
        train_sub_df[['region']].values,
        train_sub_df[['city']].values,
        train_sub_df[['realty_type']].values,
        BATCH_SIZE)
    valid_ds = get_dataset(
        valid_sub_df[NUM_FEATURES_COLUMNS].values,
        valid_sub_df[TARGET_COLUMNS].values * MUL_TARGET,
        valid_sub_df[['region']].values,
        valid_sub_df[['city']].values,
        valid_sub_df[['realty_type']].values,
        len(valid_sub_df))

    # Build and compile the model
    model = compile_model(train_ds, valid_ds, NUM_FEATURES, MAX_REALTY, MAX_REGION, MAX_CITY)
    # Train the model
    fit(model, EPOCHS, train_ds, valid_ds, valid_sub_df[TARGET_COLUMNS].values * MUL_TARGET)

    # Undo the target scaling before storing and scoring the out-of-fold predictions
    predict_on_validation = model.predict(valid_ds)[:, 0] / MUL_TARGET
    nn_predicts[valid_indexes] = predict_on_validation
    targets_for_validation = valid_sub_df[TARGET_COLUMNS].values[:, 0]
    current_score = deviation_metric(targets_for_validation, predict_on_validation)
    scores += [current_score]
    models_nn += [model]
    print(
        f"Скор для фолда({fold_num}) : {np.round(current_score, 4)} средний скор на префиксе = {np.round(np.mean(scores), 4)} это заняло = {int(time.time() - start_time)} сек.")
print(f"Процесс обучения модели занял = {int(time.time() - start_train_model_time)} секунд")

Фолд: 0
Размер трейна = (3594, 83) Размер валидации = (899, 83)


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Модель: input_shape = [(None, 73)

Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500


Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78/500
Epoch 79/500
Epoch 80/500
Epoch 81/500
Epoch 82/500
Epoch 83/500
Epoch 84/500
Epoch 85/500
Epoch 86/500
Epoch 87/500
Epoch 88/500
Epoch 89/500
Epoch 90/500
Epoch 91/500
Epoch 92/500
Epoch 93/500
Epoch 94/500
Epoch 95/500


Epoch 96/500
Epoch 97/500
Epoch 98/500
Epoch 99/500
Epoch 100/500
Epoch 101/500
Epoch 102/500
Epoch 103/500
Epoch 104/500
Epoch 105/500
Epoch 106/500
Epoch 107/500
Epoch 108/500
Epoch 109/500
Epoch 110/500
Epoch 111/500
Epoch 112/500
Epoch 113/500
Epoch 114/500
Epoch 115/500
Epoch 116/500
Epoch 117/500
Epoch 118/500
Epoch 119/500
Epoch 120/500
Epoch 121/500
Epoch 122/500
Epoch 123/500
Epoch 124/500
Epoch 125/500
Epoch 126/500
Epoch 127/500
Epoch 128/500
Epoch 129/500
Epoch 130/500


Epoch 131/500
Epoch 132/500
Epoch 133/500
Epoch 134/500
Epoch 135/500
Epoch 136/500
Epoch 137/500
Epoch 138/500
Epoch 139/500
Epoch 140/500
Epoch 141/500
Epoch 142/500
Epoch 143/500
Epoch 144/500
Скор для фолда(0) : 1.3888 средний скор на префиксе = 1.3888 это заняло = 36 сек.
Фолд: 1
Размер трейна = (3594, 83) Размер валидации = (899, 83)


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default,

Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500


Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78/500
Epoch 79/500
Epoch 80/500
Epoch 81/500


Epoch 82/500
Epoch 83/500
Epoch 84/500
Epoch 85/500
Epoch 86/500
Epoch 87/500
Epoch 88/500
Epoch 89/500
Epoch 90/500
Epoch 91/500
Epoch 92/500
Epoch 93/500
Epoch 94/500
Epoch 95/500
Epoch 96/500
Epoch 97/500
Epoch 98/500
Epoch 99/500
Epoch 100/500
Epoch 101/500
Epoch 102/500
Epoch 103/500
Epoch 104/500
Epoch 105/500
Epoch 106/500
Epoch 107/500
Epoch 108/500
Epoch 109/500
Epoch 110/500
Epoch 111/500
Epoch 112/500
Epoch 113/500
Epoch 114/500
Epoch 115/500
Epoch 116/500


Скор для фолда(1) : 1.4594 средний скор на префиксе = 1.4241 это заняло = 30 сек.
Фолд: 2
Размер трейна = (3594, 83) Размер валидации = (899, 83)


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passin

Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500


Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78/500
Epoch 79/500
Epoch 80/500
Epoch 81/500
Epoch 82/500
Epoch 83/500
Epoch 84/500
Epoch 85/500
Epoch 86/500
Epoch 87/500
Epoch 88/500
Epoch 89/500
Epoch 90/500
Epoch 91/500
Epoch 92/500
Epoch 93/500
Epoch 94/500
Epoch 95/500


Epoch 96/500
Epoch 97/500
Epoch 98/500
Epoch 99/500
Epoch 100/500
Epoch 101/500
Epoch 102/500
Epoch 103/500
Epoch 104/500
Epoch 105/500
Epoch 106/500
Epoch 107/500
Epoch 108/500
Epoch 109/500
Epoch 110/500
Epoch 111/500
Epoch 112/500
Скор для фолда(2) : 1.4523 средний скор на префиксе = 1.4335 это заняло = 28 сек.
Фолд: 3
Размер трейна = (3595, 83) Размер валидации = (898, 83)


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all la

Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500


Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78/500


Epoch 79/500
Epoch 80/500
Epoch 81/500
Epoch 82/500
Epoch 83/500
Epoch 84/500
Epoch 85/500
Epoch 86/500
Epoch 87/500
Epoch 88/500
Epoch 89/500
Epoch 90/500
Epoch 91/500
Epoch 92/500
Epoch 93/500
Epoch 94/500
Epoch 95/500
Epoch 96/500
Epoch 97/500
Epoch 98/500
Epoch 99/500
Epoch 100/500
Epoch 101/500
Epoch 102/500
Epoch 103/500
Epoch 104/500
Epoch 105/500
Epoch 106/500
Epoch 107/500
Epoch 108/500
Epoch 109/500
Epoch 110/500
Epoch 111/500
Epoch 112/500
Epoch 113/500


Epoch 114/500
Epoch 115/500
Epoch 116/500
Epoch 117/500
Epoch 118/500
Epoch 119/500
Epoch 120/500
Epoch 121/500
Скор для фолда(3) : 1.4186 средний скор на префиксе = 1.4298 это заняло = 28 сек.
Фолд: 4
Размер трейна = (3595, 83) Размер валидации = (898, 83)


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtyp

Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500


Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78/500
Epoch 79/500
Epoch 80/500
Epoch 81/500
Epoch 82/500
Epoch 83/500
Epoch 84/500
Epoch 85/500
Epoch 86/500
Epoch 87/500


Epoch 88/500
Epoch 89/500
Epoch 90/500
Epoch 91/500
Epoch 92/500
Epoch 93/500
Epoch 94/500
Epoch 95/500
Epoch 96/500
Epoch 97/500
Epoch 98/500
Epoch 99/500
Epoch 100/500
Epoch 101/500
Epoch 102/500
Epoch 103/500
Epoch 104/500
Epoch 105/500
Epoch 106/500
Epoch 107/500
Epoch 108/500
Epoch 109/500
Epoch 110/500
Epoch 111/500
Epoch 112/500
Epoch 113/500
Epoch 114/500
Epoch 115/500
Epoch 116/500
Epoch 117/500
Epoch 118/500
Скор для фолда(4) : 1.377 средний скор на префиксе = 1.4192 это заняло = 25 сек.
Процесс обучения модели занял = 148 секунд


In [67]:
# Neural-network prediction on the test set
def get_nn_predict(models, test):
    """Average the predictions of several trained networks on ``test``.

    Each model's output is divided by the module-level ``MUL_TARGET`` to undo
    the target scaling used during training, then weighted by ``1 / len(models)``.

    Args:
        models: Sequence of trained Keras models.
        test: DataFrame holding the numeric feature columns plus ``region``,
            ``city`` and ``realty_type``.

    Returns:
        1-D NumPy array with one averaged prediction per row of ``test``.
    """
    n_rows = len(test)
    # The target slot is filled with zeros; it is never read during prediction.
    test_ds = get_dataset(
        arr_features=test[NUM_FEATURES_COLUMNS].values,
        arr_target=np.zeros(n_rows),
        arr_region=test[['region']].values,
        arr_city=test[['city']].values,
        arr_realty=test[['realty_type']].values,
        batch_size=n_rows)
    averaged = np.zeros(n_rows)
    n_models = len(models)
    for model in models:
        fold_prediction = model.predict(test_ds)[:, 0]
        averaged += (fold_prediction / MUL_TARGET) / n_models
    return averaged


# Average the fold models' predictions on the test set
test_nn_predict = get_nn_predict(models_nn, test)

# Submission template. NOTE(review): assumes its row order matches `test` — confirm.
test_submission = pd.read_csv('dataset/test_submission.csv')

# Overwrite the price column with the predictions and save the submission file
test_submission['per_square_meter_price'] = test_nn_predict
test_submission.to_csv('nn2.csv', index=False)

In [None]:

# Custom LightGBM evaluation metric
def feval_deviation(y_pred, lgb_train):
    """Evaluate the deviation metric for LightGBM on the exponentiated values.

    Labels and predictions are both exponentiated before scoring, and the
    exponentiated predictions are additionally divided by 1.15.

    Args:
        y_pred: Raw model predictions.
        lgb_train: ``lgb.Dataset`` whose labels are the reference values.

    Returns:
        ``(metric_name, value, is_higher_better)`` with ``is_higher_better=False``.
    """
    log_targets = lgb_train.get_label()
    prices_true = np.exp(log_targets)
    prices_pred = np.exp(y_pred) / 1.15
    return 'deviation_error', deviation_metric(prices_true, prices_pred), False


# Training function for the LightGBM model
def train_lgb(train, valid, num_features, categorical_features, target_train, target_valid, EPOCHS, params):
    """Fit one LightGBM model on log-targets and predict the validation frame.

    Args:
        train: Training DataFrame.
        valid: Validation DataFrame.
        num_features: List of numeric feature column names.
        categorical_features: List of categorical feature column names.
        target_train: Training targets; the model is fitted on ``np.log`` of them.
        target_valid: Validation targets; likewise log-transformed.
        EPOCHS: Maximum number of boosting rounds.
        params: LightGBM parameter dict; must contain ``'learning_rate'``,
            from which the early-stopping patience ``int(5 / learning_rate)`` is derived.

    Returns:
        ``(model, y_valid)`` where ``y_valid`` is the model's raw prediction on
        ``valid``, i.e. in the same log space as the training labels.
    """
    feature_columns = num_features + categorical_features

    train_dataset = lgb.Dataset(train[feature_columns], label=np.log(target_train),
                                categorical_feature=categorical_features)
    valid_dataset = lgb.Dataset(valid[feature_columns], label=np.log(target_valid),
                                categorical_feature=categorical_features)

    patience = int(5 / params['learning_rate'])
    model = lgb.train(
        params=params,
        train_set=train_dataset,
        num_boost_round=EPOCHS,
        valid_sets=[train_dataset, valid_dataset],
        feval=feval_deviation,
        early_stopping_rounds=patience,
        verbose_eval=100)

    return model, model.predict(valid[feature_columns])


# Cross-validated training of LightGBM: one model per fold, out-of-fold predictions in lgb_predicts.
start_train_model_time = time.time()

boosting_seed = 41
boosting_params = {
    'bagging_fraction': 0.9,
    'bagging_freq': 1,
    'boost': 'gbdt',
    'feature_fraction': 0.9,
    'max_depth': 3,
    'learning_rate': 0.05,
    'metric': 'custom',
    'objective': 'regression_l1',
    'verbose': -1,
    'n_jobs': -1,
    'seed': boosting_seed,
    'feature_fraction_seed': boosting_seed,
    'bagging_seed': boosting_seed,
    'drop_seed': boosting_seed,
    'data_random_seed': boosting_seed,
}

# Maximum number of boosting rounds (train_lgb applies early stopping)
EPOCHS = 10000
scores = []
lgb_predicts = np.zeros(len(train))

lgb_models = []
for fold_num, (train_indexes, valid_indexes) in enumerate(split_list):
    start_time = time.time()
    print(f"Фолд: {fold_num}")

    train_sub_df = train[features_columns_order].loc[train_indexes].reset_index(drop=True)
    valid_sub_df = train[features_columns_order].loc[valid_indexes].reset_index(drop=True)

    print(f"Размер трейна = {train_sub_df.shape} Размер валидации = {valid_sub_df.shape}")
    # Train LightGBM; train_lgb already returns the validation prediction, so it is not
    # computed a second time here.
    model, predict_on_validation = train_lgb(
        train_sub_df,
        valid_sub_df,
        NUM_FEATURES_COLUMNS,
        CATEGORICAL_FEATURES_COLUMNS,
        train_sub_df[TARGET_COLUMNS[0]].values,
        valid_sub_df[TARGET_COLUMNS[0]].values,
        EPOCHS,
        boosting_params)

    lgb_models += [model]
    # The stored out-of-fold values stay in log space, exactly as before.
    # NOTE(review): confirm that later consumers of lgb_predicts exponentiate them.
    lgb_predicts[valid_indexes] = predict_on_validation
    targets_for_validation = valid_sub_df[TARGET_COLUMNS].values[:, 0]
    # The model is fitted on log(target), so predictions are mapped back to the price scale
    # before scoring against raw targets, using the same transform as feval_deviation.
    current_score = deviation_metric(targets_for_validation, np.exp(predict_on_validation) / 1.15)
    scores += [current_score]
    print(
        f"Скор для фолда({fold_num}) : {np.round(current_score, 4)} средний скор на префиксе = {np.round(np.mean(scores), 4)} это заняло = {int(time.time() - start_time)} сек.")
print(f"Процесс обучения модели занял = {int(time.time() - start_train_model_time)} секунд")

Фолд: 0
Размер трейна = (3594, 83) Размер валидации = (899, 83)
Training until validation scores don't improve for 100 rounds
[100]	training's deviation_error: 1.20754	valid_1's deviation_error: 1.37905
[200]	training's deviation_error: 1.03693	valid_1's deviation_error: 1.2685
[300]	training's deviation_error: 0.958647	valid_1's deviation_error: 1.2364
[400]	training's deviation_error: 0.904766	valid_1's deviation_error: 1.21915
[500]	training's deviation_error: 0.869869	valid_1's deviation_error: 1.20899
[600]	training's deviation_error: 0.842595	valid_1's deviation_error: 1.2015
[700]	training's deviation_error: 0.818877	valid_1's deviation_error: 1.19738
[800]	training's deviation_error: 0.799916	valid_1's deviation_error: 1.19172
[900]	training's deviation_error: 0.783745	valid_1's deviation_error: 1.18761
[1000]	training's deviation_error: 0.768392	valid_1's deviation_error: 1.18108
[1100]	training's deviation_error: 0.755527	valid_1's deviation_error: 1.17597
[1200]	training's d