In [1]:
IS_GPU = False
# Импорт нужных библиотек
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings("ignore")
from sklearn.preprocessing import MinMaxScaler, QuantileTransformer
from sklearn.model_selection import KFold
import lightgbm as lgb
import xgboost as xgb
import time
from scipy.optimize import minimize
from neighbors import Neighborhoods

from indices import MainDataset
from dnn_utils import preprocess_floor
from metric import metrics_stat, deviation_metric
from catboost import CatBoostRegressor
from catboost import Pool

def reset_tensorflow_session():
    tf.keras.backend.clear_session()
    tf.random.set_seed(41)
    np.random.seed(41)


THRESHOLD = 0.15
NEGATIVE_WEIGHT = 1.1

In [2]:

# Категориальные данные
CATEGORICAL_FEATURES_COLUMNS = ['region', 'city', 'realty_type', 'floor', 'osm_city_nearest_name', 'street']
# Численные данные
NUM_FEATURES_COLUMNS = ['lat', 'lng', 'osm_amenity_points_in_0.001',
                        'osm_amenity_points_in_0.005', 'osm_amenity_points_in_0.0075',
                        'osm_amenity_points_in_0.01', 'osm_building_points_in_0.001',
                        'osm_building_points_in_0.005', 'osm_building_points_in_0.0075',
                        'osm_building_points_in_0.01', 'osm_catering_points_in_0.001',
                        'osm_catering_points_in_0.005', 'osm_catering_points_in_0.0075',
                        'osm_catering_points_in_0.01', 'osm_city_closest_dist',
                        'osm_city_nearest_population',
                        'osm_crossing_closest_dist', 'osm_crossing_points_in_0.001',
                        'osm_crossing_points_in_0.005', 'osm_crossing_points_in_0.0075',
                        'osm_crossing_points_in_0.01', 'osm_culture_points_in_0.001',
                        'osm_culture_points_in_0.005', 'osm_culture_points_in_0.0075',
                        'osm_culture_points_in_0.01', 'osm_finance_points_in_0.001',
                        'osm_finance_points_in_0.005', 'osm_finance_points_in_0.0075',
                        'osm_finance_points_in_0.01', 'osm_healthcare_points_in_0.005',
                        'osm_healthcare_points_in_0.0075', 'osm_healthcare_points_in_0.01',
                        'osm_historic_points_in_0.005', 'osm_historic_points_in_0.0075',
                        'osm_historic_points_in_0.01', 'osm_hotels_points_in_0.005',
                        'osm_hotels_points_in_0.0075', 'osm_hotels_points_in_0.01',
                        'osm_leisure_points_in_0.005', 'osm_leisure_points_in_0.0075',
                        'osm_leisure_points_in_0.01', 'osm_offices_points_in_0.001',
                        'osm_offices_points_in_0.005', 'osm_offices_points_in_0.0075',
                        'osm_offices_points_in_0.01', 'osm_shops_points_in_0.001',
                        'osm_shops_points_in_0.005', 'osm_shops_points_in_0.0075',
                        'osm_shops_points_in_0.01', 'osm_subway_closest_dist',
                        'osm_train_stop_closest_dist', 'osm_train_stop_points_in_0.005',
                        'osm_train_stop_points_in_0.0075', 'osm_train_stop_points_in_0.01',
                        'osm_transport_stop_closest_dist', 'osm_transport_stop_points_in_0.005',
                        'osm_transport_stop_points_in_0.0075',
                        'osm_transport_stop_points_in_0.01',
                        'reform_count_of_houses_1000', 'reform_count_of_houses_500',
                        'reform_house_population_1000', 'reform_house_population_500',
                        'reform_mean_floor_count_1000', 'reform_mean_floor_count_500',
                        'reform_mean_year_building_1000', 'reform_mean_year_building_500', 'total_square',
                        "neighbor_dist", "neighbor_total_price", "neighbor_square_price", "neighbor10_dist",
                        "has_basement", "floor_count"

                        ]
# Таргет
TARGET_COLUMNS = ['per_square_meter_price']

In [3]:
train = pd.read_csv('dataset/train.csv')
test = pd.read_csv('dataset/test.csv')
train = train[train.price_type == 1].reset_index(drop=True)
train['is_train'] = 1
test['is_train'] = 0
dataset = pd.concat([train, test]).reset_index(drop=True)


In [4]:
train_dataset_index = MainDataset("dataset/train.csv")
test_dataset_index = MainDataset("dataset/test.csv", need_index=False)
neighborhoods = Neighborhoods(train_dataset_index.index)


In [5]:
dataset["neighbor_dist"] = -999
dataset["neighbor_total_price"] = -999
dataset["neighbor_square_price"] = -999
dataset["neighbor10_dist"] = -999
for d in [test_dataset_index, train_dataset_index]:
    for i, o in enumerate(d.all_objects):
        if o.row["price_type"] != 1:
            continue
        neighbor = neighborhoods.get_haversine_closest(o, 12)
        neighbor1 = neighborhoods.get_haversine_closest(o, 2)
        n = neighbor[0]
        dataset.loc[dataset["id"] == o.row["id"], "neighbor_dist"] = n[1]
        dataset.loc[dataset["id"] == o.row["id"], "neighbor_total_price"] = n[0].row["per_square_meter_price"] * \
                                                                            n[0].row["total_square"]
        dataset.loc[dataset["id"] == o.row["id"], "neighbor_square_price"] = n[0].row["per_square_meter_price"]
        dataset.loc[dataset["id"] == o.row["id"], "neighbor10_dist"] = neighbor[10][1]


In [6]:

dataset=preprocess_floor.preprocess(dataset)



In [7]:
def encode_categorical_features(df, categorical_columns):
    for column in categorical_columns:
        dict_encoding = {key: val for val, key in enumerate(df[column].unique())}
        df[column] = df[column].map(dict_encoding)
    return df

In [8]:
data = encode_categorical_features(dataset, CATEGORICAL_FEATURES_COLUMNS)
data = data.fillna(data.mean())
train = data[data.is_train == 1].reset_index(drop=True)
test = data[data.is_train == 0].reset_index(drop=True)
train = train.drop(columns=['is_train'])
test = test.drop(columns=['is_train'])

In [9]:
def get_standart_split(data, n_splits=5, seed=41):
    kf = KFold(n_splits=n_splits, random_state=seed, shuffle=True)
    split_list = []
    for train_index, test_index in kf.split(data):
        split_list += [(train_index, test_index)]
    return split_list

In [12]:
def get_columns_order(columns):
    columns_order = sorted([x for x in columns if not x in (CATEGORICAL_FEATURES_COLUMNS + TARGET_COLUMNS)])
    return columns_order + CATEGORICAL_FEATURES_COLUMNS + TARGET_COLUMNS

features_columns_order = get_columns_order(train.columns.values.tolist())

split_list = get_standart_split(train, n_splits=12)


In [13]:
def xbg_error(preds, dtrain):
    labels = dtrain.get_label()
    err = deviation_metric(np.exp(labels), np.exp(preds)/1.1)
    return 'deviation_error', err


def train_xgb(train, valid, num_features, categorical_features, target_train, target_valid, EPOCHS, params):
    dtest = xgb.DMatrix(test[num_features + categorical_features])
    y_valid = np.zeros(len(valid))

    dtrain = xgb.DMatrix(train[num_features + categorical_features], np.log(target_train), 
                        )
    dvalid = xgb.DMatrix(valid[num_features + categorical_features], np.log(target_valid), 
                        )
    model = xgb.train(
        params,
        dtrain,
        EPOCHS,
        [(dvalid, "valid")],
        verbose_eval=250,
        early_stopping_rounds=500,
        feval=xbg_error,
    )
    y_valid = model.predict(dvalid)

    return model, y_valid


start_train_model_time = time.time()

xgboost_seed = 41
xgboost_params = {
    "subsample": 0.70,
    "colsample_bytree": 0.50,
    "max_depth": 7,
    "learning_rate": 0.012,
    "objective": "reg:squarederror",
    'disable_default_eval_metric': 1,
    "nthread": -1,
    "max_bin": 128,
    'min_child_weight': 0.0,
    'reg_lambda': 0.0,
    'reg_alpha': 0.0,
    'seed': xgboost_seed,
}


EPOCHS = 10000
scores = []
xgb_predicts = np.zeros(len(train))

xgb_models = []
for fold_num, (train_indexes, valid_indexes) in enumerate(split_list):
    start_time = time.time()
    print(f"Фолд: {fold_num}")

    train_sub_df = train[features_columns_order].loc[train_indexes].reset_index(drop=True)
    valid_sub_df = train[features_columns_order].loc[valid_indexes].reset_index(drop=True)

    print(f"Размер трейна = {train_sub_df.shape} Размер валидации = {valid_sub_df.shape}")
    # Обучаем Xgboost и делаем предикт на валидационной выборке
    model, predict_validation = train_xgb(
        train_sub_df,
        valid_sub_df,
        NUM_FEATURES_COLUMNS,
        CATEGORICAL_FEATURES_COLUMNS,
        train_sub_df[TARGET_COLUMNS[0]].values,
        valid_sub_df[TARGET_COLUMNS[0]].values,
        EPOCHS,
        xgboost_params)

    xgb_models += [model]
    predict_on_validation = model.predict(
        xgb.DMatrix(valid_sub_df[NUM_FEATURES_COLUMNS + CATEGORICAL_FEATURES_COLUMNS]))
    xgb_predicts[valid_indexes] = np.exp(predict_on_validation)
    targets_for_validation = valid_sub_df[TARGET_COLUMNS].values[:, 0]
    current_score = deviation_metric(targets_for_validation, predict_on_validation)
    scores += [current_score]
    print(
        f"Скор для фолда({fold_num}) : {np.round(current_score, 4)} средний скор на префиксе = {np.round(np.mean(scores), 4)} это заняло = {int(time.time() - start_time)} сек.")
print(f"Процесс обучения модели занял = {int(time.time() - start_train_model_time)} секунд")






Фолд: 0
Размер трейна = (4118, 83) Размер валидации = (375, 83)
[0]	valid-deviation_error:9.00000
[250]	valid-deviation_error:4.14471
[500]	valid-deviation_error:1.11063
[750]	valid-deviation_error:1.09036
[1000]	valid-deviation_error:1.05467
[1250]	valid-deviation_error:1.04542
[1500]	valid-deviation_error:1.04273
[1750]	valid-deviation_error:1.03548
[2000]	valid-deviation_error:1.02966
[2250]	valid-deviation_error:1.02741
[2500]	valid-deviation_error:1.02660
[2750]	valid-deviation_error:1.02156
[3000]	valid-deviation_error:1.02469
[3250]	valid-deviation_error:1.02430
[3266]	valid-deviation_error:1.02435
Скор для фолда(0) : 9.0 средний скор на префиксе = 9.0 это заняло = 26 сек.
Фолд: 1
Размер трейна = (4118, 83) Размер валидации = (375, 83)
[0]	valid-deviation_error:9.00000
[250]	valid-deviation_error:4.61144
[500]	valid-deviation_error:0.97635
[750]	valid-deviation_error:0.95947
[1000]	valid-deviation_error:0.95970
[1250]	valid-deviation_error:0.96812
[1448]	valid-deviation_error:0.

In [14]:
# Предикт xgb на test
def get_xgb_predict(models, test):
    result = np.zeros(len(test))
    for model in models:
        predict = model.predict(xgb.DMatrix(test[NUM_FEATURES_COLUMNS + CATEGORICAL_FEATURES_COLUMNS]))
        result += predict / len(models)
    return result

test_xgb_predict = get_xgb_predict(xgb_models, test)

test_xgb_predict=np.exp(test_xgb_predict)

test_xgb_predict.min(), test_xgb_predict.max(), test_xgb_predict.mean()



(15061.37296136694, 1282121.9939197013, 61540.77902514505)

In [16]:
class CatBoostEvalMetricPearson(object):
    def get_final_error(self, error, weight):
        return error

    def is_max_optimal(self):
        return False

    def evaluate(self, approxes, target, weight):
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])
        preds = np.array(approxes[0])
        target = np.array(target)
        err = deviation_metric(np.exp(target), np.exp(preds)/1.1)
        return err, 0


def train_cat(train, valid, num_features, categorical_features, target_train, target_valid, EPOCHS):

    test_data = Pool(data=test[num_features + categorical_features],
                  cat_features=categorical_features)


    train_data = Pool(data=train[num_features + categorical_features],
                      cat_features=categorical_features,
                      label=np.log(target_train))

    val_data = Pool(data=valid[num_features + categorical_features],
                cat_features=categorical_features,
                  label=np.log(target_valid))

    cat_model = CatBoostRegressor(
        l2_leaf_reg=5,
        bagging_temperature=1.2,
        random_strength=1.1,
        learning_rate=0.02,
        iterations=10000,
        metric_period=50,
        eval_metric=CatBoostEvalMetricPearson(),
    )
    cat_model.fit(train_data, eval_set=val_data, use_best_model=True, early_stopping_rounds=300)
  
    y_valid = cat_model.predict(test_data)

    return cat_model, y_valid


start_train_model_time = time.time()

scores = []
cat_predicts = np.zeros(len(train))

cat_models = []
for fold_num, (train_indexes, valid_indexes) in enumerate(split_list):
    start_time = time.time()
    print(f"Фолд: {fold_num}")

    train_sub_df = train[features_columns_order].loc[train_indexes].reset_index(drop=True)
    valid_sub_df = train[features_columns_order].loc[valid_indexes].reset_index(drop=True)

    print(f"Размер трейна = {train_sub_df.shape} Размер валидации = {valid_sub_df.shape}")
    model, predict_validation = train_cat(
        train_sub_df,
        valid_sub_df,
        NUM_FEATURES_COLUMNS,
        CATEGORICAL_FEATURES_COLUMNS,
        train_sub_df[TARGET_COLUMNS[0]].values,
        valid_sub_df[TARGET_COLUMNS[0]].values,
        EPOCHS
        )

    cat_models += [model]
    predict_on_validation = model.predict(valid_sub_df[NUM_FEATURES_COLUMNS + CATEGORICAL_FEATURES_COLUMNS])
    cat_predicts[valid_indexes] = np.exp(predict_on_validation)
    targets_for_validation = valid_sub_df[TARGET_COLUMNS].values[:, 0]
    current_score = deviation_metric(targets_for_validation, predict_on_validation)
    scores += [current_score]
    print(
        f"Скор для фолда({fold_num}) : {np.round(current_score, 4)} средний скор на префиксе = {np.round(np.mean(scores), 4)} это заняло = {int(time.time() - start_time)} сек.")
print(f"Процесс обучения модели занял = {int(time.time() - start_train_model_time)} секунд")



Фолд: 0
Размер трейна = (4118, 83) Размер валидации = (375, 83)
0:	learn: 3.1896582	test: 3.0861598	best: 3.0861598 (0)	total: 31.2ms	remaining: 5m 12s




50:	learn: 2.1418478	test: 2.1108779	best: 2.1108779 (50)	total: 423ms	remaining: 1m 22s
100:	learn: 1.6918265	test: 1.8522402	best: 1.8522402 (100)	total: 799ms	remaining: 1m 18s
150:	learn: 1.4813633	test: 1.7358007	best: 1.7358007 (150)	total: 1.2s	remaining: 1m 18s
200:	learn: 1.3644125	test: 1.6375849	best: 1.6375849 (200)	total: 1.64s	remaining: 1m 20s
250:	learn: 1.2802979	test: 1.5721733	best: 1.5721733 (250)	total: 2.05s	remaining: 1m 19s
300:	learn: 1.2248879	test: 1.5341984	best: 1.5341984 (300)	total: 2.46s	remaining: 1m 19s
350:	learn: 1.1827280	test: 1.5091411	best: 1.5091411 (350)	total: 2.84s	remaining: 1m 18s
400:	learn: 1.1448268	test: 1.4852076	best: 1.4852076 (400)	total: 3.22s	remaining: 1m 17s
450:	learn: 1.1162809	test: 1.4629401	best: 1.4629401 (450)	total: 3.63s	remaining: 1m 16s
500:	learn: 1.0891955	test: 1.4510873	best: 1.4507240 (499)	total: 4s	remaining: 1m 15s
550:	learn: 1.0606290	test: 1.4403695	best: 1.4403695 (550)	total: 4.4s	remaining: 1m 15s
600:	l

4550:	learn: 0.1991403	test: 1.1284678	best: 1.1278484 (4547)	total: 36.6s	remaining: 43.8s
4600:	learn: 0.1956082	test: 1.1260138	best: 1.1259460 (4599)	total: 37s	remaining: 43.4s
4650:	learn: 0.1914159	test: 1.1244880	best: 1.1244880 (4650)	total: 37.4s	remaining: 43.1s
4700:	learn: 0.1874501	test: 1.1247014	best: 1.1238469 (4662)	total: 37.9s	remaining: 42.7s
4750:	learn: 0.1832141	test: 1.1228506	best: 1.1228506 (4750)	total: 38.3s	remaining: 42.3s
4800:	learn: 0.1790162	test: 1.1205057	best: 1.1205057 (4800)	total: 38.7s	remaining: 41.9s
4850:	learn: 0.1757739	test: 1.1182789	best: 1.1182744 (4849)	total: 39.1s	remaining: 41.5s
4900:	learn: 0.1719876	test: 1.1167491	best: 1.1166160 (4893)	total: 39.5s	remaining: 41.1s
4950:	learn: 0.1685828	test: 1.1177694	best: 1.1166160 (4893)	total: 39.9s	remaining: 40.7s
5000:	learn: 0.1655049	test: 1.1184430	best: 1.1166160 (4893)	total: 40.3s	remaining: 40.3s
5050:	learn: 0.1622470	test: 1.1181471	best: 1.1166160 (4893)	total: 40.7s	remaini



50:	learn: 2.1454508	test: 1.8923501	best: 1.8923501 (50)	total: 407ms	remaining: 1m 19s
100:	learn: 1.7261476	test: 1.4822977	best: 1.4822977 (100)	total: 798ms	remaining: 1m 18s
150:	learn: 1.5124014	test: 1.2733597	best: 1.2733597 (150)	total: 1.35s	remaining: 1m 28s
200:	learn: 1.3947502	test: 1.1865796	best: 1.1858588 (199)	total: 1.91s	remaining: 1m 33s
250:	learn: 1.3157120	test: 1.1390623	best: 1.1390623 (250)	total: 2.32s	remaining: 1m 30s
300:	learn: 1.2524838	test: 1.1045345	best: 1.1045345 (300)	total: 2.84s	remaining: 1m 31s
350:	learn: 1.2071922	test: 1.0846644	best: 1.0846644 (350)	total: 3.25s	remaining: 1m 29s
400:	learn: 1.1680568	test: 1.0599506	best: 1.0599506 (400)	total: 3.64s	remaining: 1m 27s
450:	learn: 1.1374790	test: 1.0583123	best: 1.0580553 (439)	total: 4.05s	remaining: 1m 25s
500:	learn: 1.1069729	test: 1.0489356	best: 1.0489356 (500)	total: 4.49s	remaining: 1m 25s
550:	learn: 1.0845318	test: 1.0435803	best: 1.0434457 (549)	total: 4.9s	remaining: 1m 24s
60



50:	learn: 2.1050667	test: 2.2839419	best: 2.2839419 (50)	total: 428ms	remaining: 1m 23s
100:	learn: 1.7061528	test: 1.9164742	best: 1.9164742 (100)	total: 808ms	remaining: 1m 19s
150:	learn: 1.5040949	test: 1.6875017	best: 1.6875017 (150)	total: 1.19s	remaining: 1m 17s
200:	learn: 1.3879832	test: 1.5345835	best: 1.5345835 (200)	total: 1.6s	remaining: 1m 17s
250:	learn: 1.3161902	test: 1.4367633	best: 1.4367633 (250)	total: 1.97s	remaining: 1m 16s
300:	learn: 1.2568802	test: 1.3656942	best: 1.3656942 (300)	total: 2.37s	remaining: 1m 16s
350:	learn: 1.2166926	test: 1.3339214	best: 1.3339214 (350)	total: 2.77s	remaining: 1m 16s
400:	learn: 1.1780169	test: 1.2968312	best: 1.2968312 (400)	total: 3.16s	remaining: 1m 15s
450:	learn: 1.1465729	test: 1.2773556	best: 1.2767048 (449)	total: 3.54s	remaining: 1m 14s
500:	learn: 1.1216995	test: 1.2813172	best: 1.2734774 (459)	total: 3.94s	remaining: 1m 14s
550:	learn: 1.0988037	test: 1.2725001	best: 1.2725001 (550)	total: 4.33s	remaining: 1m 14s
60



50:	learn: 2.0934859	test: 2.3527154	best: 2.3527154 (50)	total: 434ms	remaining: 1m 24s
100:	learn: 1.6869103	test: 1.8611961	best: 1.8611961 (100)	total: 812ms	remaining: 1m 19s
150:	learn: 1.4862785	test: 1.6637572	best: 1.6637572 (150)	total: 1.19s	remaining: 1m 17s
200:	learn: 1.3625772	test: 1.5881743	best: 1.5881743 (200)	total: 1.58s	remaining: 1m 17s
250:	learn: 1.2853747	test: 1.5526745	best: 1.5526745 (250)	total: 1.97s	remaining: 1m 16s
300:	learn: 1.2311671	test: 1.5413154	best: 1.5408398 (299)	total: 2.35s	remaining: 1m 15s
350:	learn: 1.1865909	test: 1.5169634	best: 1.5169634 (350)	total: 2.71s	remaining: 1m 14s
400:	learn: 1.1423208	test: 1.4943630	best: 1.4943630 (400)	total: 3.1s	remaining: 1m 14s
450:	learn: 1.1102951	test: 1.4835494	best: 1.4831225 (449)	total: 3.47s	remaining: 1m 13s
500:	learn: 1.0797280	test: 1.4764053	best: 1.4762202 (498)	total: 3.85s	remaining: 1m 12s
550:	learn: 1.0476120	test: 1.4615516	best: 1.4611768 (544)	total: 4.22s	remaining: 1m 12s
60



50:	learn: 2.1361769	test: 2.1708202	best: 2.1708202 (50)	total: 395ms	remaining: 1m 17s
100:	learn: 1.7226689	test: 1.6247888	best: 1.6247888 (100)	total: 776ms	remaining: 1m 16s
150:	learn: 1.5138936	test: 1.3765553	best: 1.3765553 (150)	total: 1.17s	remaining: 1m 16s
200:	learn: 1.3974029	test: 1.2630100	best: 1.2630100 (200)	total: 1.55s	remaining: 1m 15s
250:	learn: 1.3151568	test: 1.1919514	best: 1.1919514 (250)	total: 1.95s	remaining: 1m 15s
300:	learn: 1.2577926	test: 1.1564821	best: 1.1564821 (300)	total: 2.33s	remaining: 1m 15s
350:	learn: 1.2142057	test: 1.1292924	best: 1.1292924 (350)	total: 2.74s	remaining: 1m 15s
400:	learn: 1.1752484	test: 1.1164105	best: 1.1164105 (400)	total: 3.12s	remaining: 1m 14s
450:	learn: 1.1445202	test: 1.1134222	best: 1.1087370 (441)	total: 3.5s	remaining: 1m 14s
500:	learn: 1.1157422	test: 1.1074117	best: 1.1066447 (499)	total: 3.88s	remaining: 1m 13s
550:	learn: 1.0864099	test: 1.1039201	best: 1.1039201 (550)	total: 4.26s	remaining: 1m 13s
60

4550:	learn: 0.1860320	test: 0.9222638	best: 0.9208462 (4481)	total: 35.1s	remaining: 42s
4600:	learn: 0.1823227	test: 0.9208958	best: 0.9206461 (4581)	total: 35.5s	remaining: 41.6s
4650:	learn: 0.1785471	test: 0.9194582	best: 0.9194582 (4650)	total: 35.8s	remaining: 41.2s
4700:	learn: 0.1751985	test: 0.9190463	best: 0.9182092 (4678)	total: 36.2s	remaining: 40.8s
4750:	learn: 0.1710711	test: 0.9183825	best: 0.9182092 (4678)	total: 36.6s	remaining: 40.5s
4800:	learn: 0.1676691	test: 0.9169916	best: 0.9169229 (4799)	total: 37s	remaining: 40.1s
4850:	learn: 0.1642676	test: 0.9175315	best: 0.9161848 (4816)	total: 37.4s	remaining: 39.7s
4900:	learn: 0.1607009	test: 0.9177642	best: 0.9161848 (4816)	total: 37.8s	remaining: 39.3s
4950:	learn: 0.1574694	test: 0.9184592	best: 0.9161848 (4816)	total: 38.1s	remaining: 38.9s
5000:	learn: 0.1538780	test: 0.9172632	best: 0.9161848 (4816)	total: 38.5s	remaining: 38.5s
5050:	learn: 0.1503732	test: 0.9178150	best: 0.9161848 (4816)	total: 38.9s	remaining



50:	learn: 2.1480415	test: 2.0615877	best: 2.0615877 (50)	total: 414ms	remaining: 1m 20s
100:	learn: 1.7250223	test: 1.6386174	best: 1.6386174 (100)	total: 792ms	remaining: 1m 17s
150:	learn: 1.5090413	test: 1.4513281	best: 1.4513281 (150)	total: 1.17s	remaining: 1m 16s
200:	learn: 1.3868274	test: 1.3638777	best: 1.3638777 (200)	total: 1.54s	remaining: 1m 15s
250:	learn: 1.3088657	test: 1.3190792	best: 1.3190792 (250)	total: 1.92s	remaining: 1m 14s
300:	learn: 1.2589638	test: 1.2876158	best: 1.2876158 (300)	total: 2.3s	remaining: 1m 14s
350:	learn: 1.2140880	test: 1.2600401	best: 1.2594976 (348)	total: 2.67s	remaining: 1m 13s
400:	learn: 1.1796083	test: 1.2289082	best: 1.2289082 (400)	total: 3.04s	remaining: 1m 12s
450:	learn: 1.1554938	test: 1.2174609	best: 1.2171420 (449)	total: 3.4s	remaining: 1m 11s
500:	learn: 1.1374819	test: 1.2082278	best: 1.2081015 (498)	total: 3.77s	remaining: 1m 11s
550:	learn: 1.1150810	test: 1.1930179	best: 1.1930179 (550)	total: 4.13s	remaining: 1m 10s
600



50:	learn: 2.1311605	test: 2.1746809	best: 2.1746809 (50)	total: 443ms	remaining: 1m 26s
100:	learn: 1.6980523	test: 1.7767068	best: 1.7767068 (100)	total: 833ms	remaining: 1m 21s
150:	learn: 1.4989941	test: 1.6085429	best: 1.6085429 (150)	total: 1.22s	remaining: 1m 19s
200:	learn: 1.3813799	test: 1.5013932	best: 1.5013932 (200)	total: 1.61s	remaining: 1m 18s
250:	learn: 1.3008597	test: 1.4423207	best: 1.4423207 (250)	total: 2s	remaining: 1m 17s
300:	learn: 1.2428157	test: 1.4052259	best: 1.4052259 (300)	total: 2.4s	remaining: 1m 17s
350:	learn: 1.1979319	test: 1.3708690	best: 1.3708690 (350)	total: 2.78s	remaining: 1m 16s
400:	learn: 1.1581719	test: 1.3455097	best: 1.3455097 (400)	total: 3.18s	remaining: 1m 16s
450:	learn: 1.1225976	test: 1.3241356	best: 1.3241356 (450)	total: 3.57s	remaining: 1m 15s
500:	learn: 1.0927221	test: 1.3134141	best: 1.3134119 (494)	total: 3.95s	remaining: 1m 14s
550:	learn: 1.0666126	test: 1.3005782	best: 1.3000771 (548)	total: 4.34s	remaining: 1m 14s
600:	



50:	learn: 2.1067615	test: 2.2828005	best: 2.2828005 (50)	total: 396ms	remaining: 1m 17s
100:	learn: 1.6871366	test: 1.8461625	best: 1.8461625 (100)	total: 774ms	remaining: 1m 15s
150:	learn: 1.4803402	test: 1.6587181	best: 1.6587181 (150)	total: 1.17s	remaining: 1m 16s
200:	learn: 1.3579018	test: 1.5581715	best: 1.5581715 (200)	total: 1.56s	remaining: 1m 16s
250:	learn: 1.2789197	test: 1.4940806	best: 1.4940806 (250)	total: 2s	remaining: 1m 17s
300:	learn: 1.2224024	test: 1.4597242	best: 1.4592997 (299)	total: 2.37s	remaining: 1m 16s
350:	learn: 1.1752378	test: 1.4380619	best: 1.4380619 (350)	total: 2.75s	remaining: 1m 15s
400:	learn: 1.1366209	test: 1.4214252	best: 1.4210670 (398)	total: 3.12s	remaining: 1m 14s
450:	learn: 1.0996151	test: 1.4099796	best: 1.4095190 (449)	total: 3.51s	remaining: 1m 14s
500:	learn: 1.0667147	test: 1.4023167	best: 1.4023167 (500)	total: 3.89s	remaining: 1m 13s
550:	learn: 1.0391624	test: 1.4009645	best: 1.4008065 (549)	total: 4.27s	remaining: 1m 13s
600:

4550:	learn: 0.1690115	test: 1.1986505	best: 1.1986505 (4550)	total: 34.9s	remaining: 41.8s
4600:	learn: 0.1659863	test: 1.1974407	best: 1.1972585 (4598)	total: 35.3s	remaining: 41.4s
4650:	learn: 0.1634643	test: 1.1956742	best: 1.1956742 (4650)	total: 35.7s	remaining: 41s
4700:	learn: 0.1605142	test: 1.1938258	best: 1.1934277 (4699)	total: 36s	remaining: 40.6s
4750:	learn: 0.1576289	test: 1.1950953	best: 1.1934277 (4699)	total: 36.4s	remaining: 40.3s
4800:	learn: 0.1544135	test: 1.1954647	best: 1.1934277 (4699)	total: 36.8s	remaining: 39.9s
4850:	learn: 0.1517268	test: 1.1950654	best: 1.1934277 (4699)	total: 37.2s	remaining: 39.5s
4900:	learn: 0.1485095	test: 1.1944382	best: 1.1934277 (4699)	total: 37.6s	remaining: 39.1s
4950:	learn: 0.1458082	test: 1.1927239	best: 1.1927220 (4949)	total: 38s	remaining: 38.7s
5000:	learn: 0.1429887	test: 1.1932094	best: 1.1927220 (4949)	total: 38.3s	remaining: 38.3s
5050:	learn: 0.1400980	test: 1.1924224	best: 1.1921707 (5046)	total: 38.7s	remaining: 



0:	learn: 3.1630399	test: 3.2203769	best: 3.2203769 (0)	total: 28.9ms	remaining: 4m 49s
50:	learn: 2.1312060	test: 2.2122665	best: 2.2122665 (50)	total: 417ms	remaining: 1m 21s
100:	learn: 1.7150768	test: 1.7810161	best: 1.7810161 (100)	total: 809ms	remaining: 1m 19s
150:	learn: 1.5088407	test: 1.5917916	best: 1.5917916 (150)	total: 1.19s	remaining: 1m 17s
200:	learn: 1.3834269	test: 1.4756428	best: 1.4756428 (200)	total: 1.58s	remaining: 1m 17s
250:	learn: 1.3030794	test: 1.4304459	best: 1.4304459 (250)	total: 1.97s	remaining: 1m 16s
300:	learn: 1.2435550	test: 1.3892494	best: 1.3892494 (300)	total: 2.36s	remaining: 1m 16s
350:	learn: 1.1975520	test: 1.3683958	best: 1.3683958 (350)	total: 2.73s	remaining: 1m 15s
400:	learn: 1.1572018	test: 1.3467048	best: 1.3467048 (400)	total: 3.14s	remaining: 1m 15s
450:	learn: 1.1249089	test: 1.3333030	best: 1.3329007 (448)	total: 3.53s	remaining: 1m 14s
500:	learn: 1.0959765	test: 1.3190519	best: 1.3190519 (500)	total: 3.91s	remaining: 1m 14s
550:

4500:	learn: 0.2003654	test: 1.0217861	best: 1.0215055 (4488)	total: 34.5s	remaining: 42.1s
4550:	learn: 0.1968939	test: 1.0217289	best: 1.0212234 (4515)	total: 34.9s	remaining: 41.8s
4600:	learn: 0.1934062	test: 1.0191116	best: 1.0189462 (4598)	total: 35.3s	remaining: 41.4s
4650:	learn: 0.1892954	test: 1.0191035	best: 1.0189462 (4598)	total: 35.7s	remaining: 41s
4700:	learn: 0.1854405	test: 1.0175415	best: 1.0170683 (4685)	total: 36.1s	remaining: 40.6s
4750:	learn: 0.1821847	test: 1.0172338	best: 1.0162678 (4714)	total: 36.5s	remaining: 40.3s
4800:	learn: 0.1788818	test: 1.0179403	best: 1.0162678 (4714)	total: 36.9s	remaining: 39.9s
4850:	learn: 0.1756405	test: 1.0188394	best: 1.0162678 (4714)	total: 37.3s	remaining: 39.5s
4900:	learn: 0.1721219	test: 1.0186393	best: 1.0162678 (4714)	total: 37.6s	remaining: 39.2s
4950:	learn: 0.1688827	test: 1.0177273	best: 1.0162678 (4714)	total: 38s	remaining: 38.8s
5000:	learn: 0.1662100	test: 1.0155101	best: 1.0155101 (5000)	total: 38.4s	remaining



0:	learn: 3.1833405	test: 3.0301244	best: 3.0301244 (0)	total: 28.6ms	remaining: 4m 45s
50:	learn: 2.1157274	test: 2.0969079	best: 2.0969079 (50)	total: 418ms	remaining: 1m 21s
100:	learn: 1.6994431	test: 1.8124669	best: 1.8124669 (100)	total: 817ms	remaining: 1m 20s
150:	learn: 1.4894828	test: 1.6828097	best: 1.6823628 (149)	total: 1.21s	remaining: 1m 18s
200:	learn: 1.3643867	test: 1.6101813	best: 1.6101813 (200)	total: 1.62s	remaining: 1m 18s
250:	learn: 1.2860812	test: 1.5497025	best: 1.5496408 (249)	total: 1.98s	remaining: 1m 17s
300:	learn: 1.2286137	test: 1.5136662	best: 1.5136662 (300)	total: 2.37s	remaining: 1m 16s
350:	learn: 1.1871138	test: 1.4892285	best: 1.4892285 (350)	total: 2.77s	remaining: 1m 16s
400:	learn: 1.1481142	test: 1.4675977	best: 1.4675977 (400)	total: 3.15s	remaining: 1m 15s
450:	learn: 1.1147784	test: 1.4536445	best: 1.4534412 (449)	total: 3.52s	remaining: 1m 14s
500:	learn: 1.0840076	test: 1.4405147	best: 1.4405147 (500)	total: 3.91s	remaining: 1m 14s
550:



50:	learn: 2.1242180	test: 1.9176610	best: 1.9176610 (50)	total: 426ms	remaining: 1m 23s
100:	learn: 1.7079913	test: 1.5590585	best: 1.5590585 (100)	total: 815ms	remaining: 1m 19s
150:	learn: 1.5001085	test: 1.4066247	best: 1.4066247 (150)	total: 1.24s	remaining: 1m 20s
200:	learn: 1.3831464	test: 1.3451983	best: 1.3451983 (200)	total: 1.66s	remaining: 1m 20s
250:	learn: 1.3049047	test: 1.3007486	best: 1.3007486 (250)	total: 2.04s	remaining: 1m 19s
300:	learn: 1.2474469	test: 1.2683378	best: 1.2683378 (300)	total: 2.45s	remaining: 1m 18s
350:	learn: 1.2041193	test: 1.2458278	best: 1.2458278 (350)	total: 2.83s	remaining: 1m 17s
400:	learn: 1.1690030	test: 1.2279634	best: 1.2279634 (400)	total: 3.2s	remaining: 1m 16s
450:	learn: 1.1368217	test: 1.2149056	best: 1.2149056 (450)	total: 3.58s	remaining: 1m 15s
500:	learn: 1.1063748	test: 1.2035444	best: 1.2032828 (499)	total: 3.99s	remaining: 1m 15s
550:	learn: 1.0776102	test: 1.1957691	best: 1.1957691 (550)	total: 4.39s	remaining: 1m 15s
60

4550:	learn: 0.1945343	test: 1.0452815	best: 1.0450906 (4511)	total: 35.6s	remaining: 42.6s
4600:	learn: 0.1904337	test: 1.0441211	best: 1.0437875 (4583)	total: 36s	remaining: 42.2s
4650:	learn: 0.1877200	test: 1.0432985	best: 1.0428914 (4635)	total: 36.4s	remaining: 41.9s
4700:	learn: 0.1844054	test: 1.0425535	best: 1.0424809 (4699)	total: 36.8s	remaining: 41.5s
4750:	learn: 0.1811075	test: 1.0418479	best: 1.0418479 (4750)	total: 37.2s	remaining: 41.1s
4800:	learn: 0.1783449	test: 1.0428401	best: 1.0417509 (4752)	total: 37.6s	remaining: 40.7s
4850:	learn: 0.1751468	test: 1.0417370	best: 1.0416951 (4849)	total: 38s	remaining: 40.3s
4900:	learn: 0.1719968	test: 1.0410758	best: 1.0401836 (4885)	total: 38.4s	remaining: 39.9s
4950:	learn: 0.1687731	test: 1.0409834	best: 1.0400128 (4918)	total: 38.8s	remaining: 39.6s
5000:	learn: 0.1655493	test: 1.0403435	best: 1.0400128 (4918)	total: 39.2s	remaining: 39.2s
5050:	learn: 0.1627950	test: 1.0402019	best: 1.0400050 (5029)	total: 39.6s	remaining



50:	learn: 2.1348874	test: 1.9833567	best: 1.9833567 (50)	total: 462ms	remaining: 1m 30s
100:	learn: 1.7090644	test: 1.6644728	best: 1.6644728 (100)	total: 868ms	remaining: 1m 25s
150:	learn: 1.4968440	test: 1.5200333	best: 1.5200333 (150)	total: 1.25s	remaining: 1m 21s
200:	learn: 1.3699662	test: 1.4164226	best: 1.4156916 (199)	total: 1.64s	remaining: 1m 19s
250:	learn: 1.2920312	test: 1.3595795	best: 1.3593821 (249)	total: 2.01s	remaining: 1m 18s
300:	learn: 1.2326437	test: 1.2986064	best: 1.2986064 (300)	total: 2.4s	remaining: 1m 17s
350:	learn: 1.1811799	test: 1.2650891	best: 1.2650891 (350)	total: 2.77s	remaining: 1m 16s
400:	learn: 1.1453188	test: 1.2363818	best: 1.2363818 (400)	total: 3.14s	remaining: 1m 15s
450:	learn: 1.1127116	test: 1.2140382	best: 1.2140382 (450)	total: 3.51s	remaining: 1m 14s
500:	learn: 1.0859799	test: 1.2053391	best: 1.2048405 (499)	total: 3.89s	remaining: 1m 13s
550:	learn: 1.0636630	test: 1.1865869	best: 1.1865869 (550)	total: 4.26s	remaining: 1m 13s
60

4550:	learn: 0.1911715	test: 0.9107875	best: 0.9089155 (4318)	total: 34.6s	remaining: 41.5s
4600:	learn: 0.1882567	test: 0.9100018	best: 0.9089155 (4318)	total: 35s	remaining: 41.1s
Stopped by overfitting detector  (300 iterations wait)

bestTest = 0.9089154684
bestIteration = 4318

Shrink model to first 4319 iterations.
Скор для фолда(11) : 9.0 средний скор на префиксе = 9.0 это заняло = 35 сек.
Процесс обучения модели занял = 451 секунд


In [17]:


def get_cat_predict(models, test):
    result = np.zeros(len(test))
    for model in cat_models:
        predict = model.predict(test[NUM_FEATURES_COLUMNS + CATEGORICAL_FEATURES_COLUMNS])
        result += np.exp(predict) / len(models)
    return result


test_cat_predict = get_cat_predict(xgb_models, test)

test_cat_predict.min(), test_cat_predict.max(), test_cat_predict.mean()


(16447.04172594073, 1259450.8579065762, 61479.90929318597)

In [18]:
train_targets = train[TARGET_COLUMNS[0]].values

In [19]:
def minimize_arit(W):
    ypred =  W[0] * xgb_predicts + W[1] * cat_predicts
    return deviation_metric(train_targets, ypred)


W = minimize(minimize_arit, [1.0 / 2] * 2, options={'gtol': 1e-6, 'disp': True}).x
W
# 1.006250
# array([0.55692824, 0.34630855])


Optimization terminated successfully.
         Current function value: 1.012829
         Iterations: 10
         Function evaluations: 45
         Gradient evaluations: 15


array([0.62845338, 0.27159312])

In [20]:

test_submission = pd.read_csv('dataset/test_submission.csv')
test_submission['per_square_meter_price'] = test_xgb_predict * W[0] + test_cat_predict * W[1]
test_submission['per_square_meter_price'] = test_submission['per_square_meter_price'].apply(lambda x: max(1000.0, x))
test_submission.to_csv('submission.csv', index = False)

In [21]:
for city_id in [0,4,2,22]:
    xgb_predicts_spb = xgb_predicts[train.city == city_id]
    cat_predicts_spb = cat_predicts[train.city == city_id]
    train_targets_spb = train_targets[train.city == city_id]

    test_xgb_predict_spb = test_xgb_predict[test.city == city_id]
    test_cat_predict_spb = test_cat_predict[test.city == city_id]

    def minimize_arit_spb(W):
        ypred = W[0] * xgb_predicts_spb + W[1] * cat_predicts_spb
        return deviation_metric(train_targets_spb, ypred)


    W_spb = minimize(minimize_arit_spb, [1.0 / 2] * 2, options={'gtol': 1e-6, 'disp': True}).x
    print(W_spb)
    new_score = test_xgb_predict_spb * W_spb[0] + test_cat_predict_spb * W_spb[1]
    old_score = test_submission.loc[test.city == city_id, 'per_square_meter_price'].values
    test_submission.loc[test.city == city_id, 'per_square_meter_price'] =  (new_score+old_score)/2
test_submission.to_csv('submission_city.csv', index = False)

Optimization terminated successfully.
         Current function value: 0.990203
         Iterations: 10
         Function evaluations: 36
         Gradient evaluations: 12
[ 0.9809335  -0.07784708]
Optimization terminated successfully.
         Current function value: 1.675573
         Iterations: 9
         Function evaluations: 33
         Gradient evaluations: 11
[0.7116518  0.16042453]
Optimization terminated successfully.
         Current function value: 1.561361
         Iterations: 8
         Function evaluations: 30
         Gradient evaluations: 10
[0.53635383 0.35041296]
Optimization terminated successfully.
         Current function value: 0.857445
         Iterations: 10
         Function evaluations: 36
         Gradient evaluations: 12
[ 0.97625562 -0.04545534]


In [25]:
test_submission['per_square_meter_price'] = test_xgb_predict * W[0] + test_cat_predict * W[1]
test_submission['per_square_meter_price'] = test_submission['per_square_meter_price'].apply(lambda x: max(1000.0, x))

In [26]:
for realty_id in [0,1,2]:
    xgb_predicts_spb = xgb_predicts[train.realty_type == realty_id]
    cat_predicts_spb = cat_predicts[train.realty_type == realty_id]
    train_targets_spb = train_targets[train.realty_type == realty_id]

    test_xgb_predict_spb = test_xgb_predict[test.realty_type == realty_id]
    test_cat_predict_spb = test_cat_predict[test.realty_type == realty_id]

    def minimize_arit_spb(W):
        ypred = W[0] * xgb_predicts_spb + W[1] * cat_predicts_spb
        return deviation_metric(train_targets_spb, ypred)


    W_spb = minimize(minimize_arit_spb, [1.0 / 2] * 2, options={'gtol': 1e-6, 'disp': True}).x
    print(W_spb)
    new_score = test_xgb_predict_spb * W_spb[0] + test_cat_predict_spb * W_spb[1]
    old_score = test_submission.loc[test.realty_type == realty_id, 'per_square_meter_price'].values
    test_submission.loc[test.realty_type == realty_id, 'per_square_meter_price'] =  (new_score+old_score)/2

test_submission.to_csv('submission_realty.csv', index = False)

Optimization terminated successfully.
         Current function value: 1.049092
         Iterations: 10
         Function evaluations: 36
         Gradient evaluations: 12
[0.71474959 0.20057966]
Optimization terminated successfully.
         Current function value: 1.104496
         Iterations: 8
         Function evaluations: 30
         Gradient evaluations: 10
[0.60046668 0.29104654]
Optimization terminated successfully.
         Current function value: 0.828466
         Iterations: 9
         Function evaluations: 36
         Gradient evaluations: 12
[0.62227654 0.27797815]
