Задание считается успешно выполненным при получении скора RMSE ≤ 2.45⋅10e6 

В итоге помог стекинг бустингов, и усреднение многих сабмитов

In [None]:
import warnings

from hyperopt import hp
import hyperopt as hopt
from lightgbm import LGBMRegressor
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, cross_val_predict, train_test_split
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.base import BaseEstimator, ClassifierMixin
from tqdm import tqdm


warnings.filterwarnings("ignore")

In [142]:
class Stacking(BaseEstimator, ClassifierMixin):  

    def __init__(self, models, ens_model):
        self.models = models
        self.ens_model = ens_model
        self.n = len(models)
        self.valid = None
        
    def fit(self, X, y=None, p=0.25, cv=3, err=0.001, random_state=None):
        if p > 0:
            # разбиение на обучение моделей и метамодели
            train, valid, y_train, y_valid = train_test_split(X, y, test_size=p, random_state=random_state)
            
            # заполнение матрицы для обучения метамодели
            self.valid = np.zeros((valid.shape[0], self.n))
            for t, clf in tqdm(enumerate(self.models)):
                clf.fit(train, y_train)
                self.valid[:, t] = clf.predict(valid)
                
            # обучение метамодели
            self.ens_model.fit(self.valid, y_valid)
            
        else:
            # для регуляризации - берём случайные добавки
            self.valid = err*np.random.randn(X.shape[0], self.n)
            
            for t, clf in tqdm(enumerate(self.models)):
                # oob-ответы алгоритмов
                self.valid[:, t] += cross_val_predict(clf, X, y, cv=cv, method='predict')
                # но сам алгоритм надо настроить
                clf.fit(X, y)
            
            # обучение метамодели
            self.ens_model.fit(self.valid, y)    

    def predict(self, X, y=None):
        X_meta = np.zeros((X.shape[0], self.n))
        
        for t, clf in tqdm(enumerate(self.models)):
            X_meta[:, t] = clf.predict(X)
        
        a = self.ens_model.predict(X_meta)
        
        return (a)


class Meaning:
    def __init__(self, n_models, params, cat_features, param_grid, num_it):
        self.param_grid = param_grid
        self.num_it = num_it
        self.cat_features = cat_features
        self.params = params
        self.n_models = n_models
        self.models = []

    def fit(self, X_train, y_train):
        for i in tqdm(range(self.n_models)):
            if self.num_it > 1:
                params = self._find_best_params(X_train, y_train, i)
                print(params)
            else:
                params = self.params
            model = LGBMRegressor(random_state=100*i, **params)
            model.fit(X_train, y_train)
                
            self.models.append(model)
            
            
    
    def _find_best_params(self, X_train, y_train, i) -> dict:
        """Use hyperopt to find optimal model hyper-parameters."""

        def objective(pars):
            model = LGBMRegressor(random_state=100*i, **pars)
            score = cross_val_score(model, X_train, y_train, cv=3, scoring=rmse).mean()

            return score

        best_params = hopt.fmin(fn=objective, space=self.param_grid, algo=hopt.tpe.suggest, max_evals=self.num_it)

        return best_params
            
            
    def predict(self, X_test):
        y_pred = np.zeros((len(X_test), self.n_models))

        for i, model in tqdm(enumerate(self.models)): 
            y_pred[:, i] = model.predict(X_test)
            
        return np.mean(y_pred, axis=1)          


def plot_ecdf(data):
    x, y = sorted(data), np.arange(1, len(data)+1) / len(data)
    plt.figure()
    plt.scatter(x, y)
    plt.grid()
    plt.show()

    
def bias(true, pred):
    return (pred - true).sum() / true.sum()


def mean_delta(true, pred):
    return (pred - true).mean()

In [359]:
features = [
    'full_sq', 
    'full_sq_bins', 
    'life_sq', 
    'life_sq_bins', 
    'kitch_sq', 
    'kitch_sq_bins',
    'some_extra_sqr',
    'ratio_life_dash_full_sq', 
    'ration_kitchen_dash_full_sq',
    'floor', 
    'floor_bins',
    'max_floor',
    'max_floor_bins',
    'material',
    'num_room', 
    'apartment condition',
    'sub_area',
    'sub_area_bins',
    'population', 
    'indust_part',
    'preschool_facilities',
    'school_facilities', 
    'hospital_beds_raion',
    'healthcare_facilities',
    'university_num', 
    'sport_objects_facilities',
    'additional_education_facilities',
    'culture_objects_facilities',
    'shopping_centers_facilities',
    'office_num', 
    'green_part', 
    'prom_part',
    'cafe_count', 
    'church_facilities', 
    'mosque',
    'leisure_facilities',
    'year',
    'month',
    'week_of_year', 
    'day_of_week',
    'timestamp_int', 
    'build_year',
    'age', 
]

cat_features = [
       'material', 'apartment condition', 'full_sq_bins',
       'year', 'month', 'week_of_year', 'day_of_week',
]

In [372]:
# считаем данные в соответствующие датафреймы

train_main_df = pd.read_csv('./data_hw/HW_train_main_data.csv')
train_additional_df = pd.read_csv('./data_hw/HW_train_additional_data.csv')

In [373]:
test_main_df = pd.read_csv('./data_hw/HW_test_main_data.csv')
test_additional_df = pd.read_csv('./data_hw/HW_test_additional_data.csv')

In [374]:
# добавим дополнительные данные

whole_train_df = train_main_df.merge(train_additional_df, how='left', on='id')
whole_test_df = test_main_df.merge(test_additional_df, how='left', on='id')

# Data preparation

In [375]:
def prepare_features(df):   
    bins = [0, 20, 30, 40, 50, 70, 80, 90, 100, 150, 200, 250, 375, 500, 10000]
    df['full_sq_bins'] = np.searchsorted(bins, df['full_sq'].values)
    
    bins = [0, 20, 30, 40, 50, 100, 150, 201, 400, 500, 10000]
    df['life_sq_bins'] = np.searchsorted(bins, df['life_sq'].values)

    bins = [0, 5, 10, 15, 20, 40, 50, 100]
    df['floor_bins'] = np.searchsorted(bins, df['floor'].values)
    
    bins = [0, 5, 9, 13, 15, 20, 35, 42, 50, 200]
    df['max_floor_bins'] = np.searchsorted(bins, df['max_floor'].values)
    
    bins = [0, 2, 5, 10, 15, 45, 70, 80, 100, 1000]
    df['kitch_sq_bins'] = np.searchsorted(bins, df['kitch_sq'].values)
    
    bins = [0, 20, 40, 60, 70, 90, 100, 110, 1000]
    df['sub_area_bins'] = np.searchsorted(bins, df['sub_area'].values)
    
    # конвертируем колонку в datetime
    df['timestamp'] = pd.to_datetime(df['timestamp'])  
    
    # добавим дополнительные столбцы на основе имеющейся даты
    # get year  
    df['year'] = df.timestamp.dt.year

    # get month of year
    df['month'] = df.timestamp.dt.month

    # get day of week
    df['week_of_year'] = df.timestamp.dt.weekofyear

    # get week of the year
    df['day_of_week'] = df.timestamp.dt.weekday

    df['timestamp_int'] = df.timestamp.astype(int)
    
    # создадим столбец для месяца года
    df['year_month'] = df['year'].astype(str) + '_' + df['month'].astype(str)  

    # вспомним, что цена сильно зависит от площади квартиры, на основе этих данных
    # добавим столбцы для отношения площадей
    df["ratio_life_dash_full_sq"] = np.where(df["full_sq"] > 0, df["life_sq"] / df["full_sq"], -99)
    df["ration_kitchen_dash_full_sq"] = np.where(df["full_sq"] > 0, df["kitch_sq"] / df["full_sq"], -99)

    # добавим воздраст здания
    df['age'] = df["build_year"] - df['year']

    # добавим разность между общей и жилой площадью квартиры
    df['some_extra_sqr'] = df["full_sq"] - df["life_sq"]
    
    return df


def fill_nans(df):
    df.loc[df['life_sq'] == 0, 'life_sq'] = -99
    df.loc[df['floor'] == 0, 'floor'] = -99
    df.loc[df['max_floor'] == 0, 'max_floor'] = -99
    df.loc[df['build_year'] <= 1900, 'build_year'] = -99
    df.loc[df['num_room'] == 0, 'num_room'] = -99
    df.loc[df['kitch_sq'] == 0, 'kitch_sq'] = -99
    df.loc[df['some_extra_sqr'] < 0, 'some_extra_sqr'] = -99
    df.loc[df['age'] < 0, 'age'] = -99
    
    df = df.fillna(-99)
      
    return df

def change_types(df):
    df['material'] = df['material'].astype(int)
    df['apartment condition'] = df['apartment condition'].astype(int)
    
    return df

def clip_dataset(cols, test, train):
    new_train = train.copy()
    for feat in cols:
        f_min = test[feat].min()
        f_max = test[feat].max()
        new_train[feat] = new_train[feat].values.clip(f_min, f_max)

    return new_train

In [376]:
# Удаление выброса

whole_train_df = whole_train_df.loc[~whole_train_df.id.isin([85073, 67278])]

In [377]:
# Явно ошибочные значения

whole_train_df.loc[whole_train_df.id == 28125, 'build_year'] = whole_train_df.loc[whole_train_df.id == 28125, 'kitch_sq']
whole_train_df.loc[whole_train_df.kitch_sq > 1500, 'kitch_sq'] = 0

whole_train_df.loc[whole_train_df.build_year == 20052009.0, 'build_year'] = 2007.0
whole_train_df.loc[whole_train_df.build_year == 4965.0, 'build_year'] = 1965.0

In [378]:
# Замена выбросов в kitch_sq, floor, max_floor, num_room

max_kitch_sq = whole_train_df.kitch_sq.max()
whole_train_df.loc[whole_train_df.kitch_sq == max_kitch_sq, 'kitch_sq'] = max_kitch_sq / 100

max_floor = whole_train_df.floor.max()
whole_train_df.loc[whole_train_df.floor == max_floor, 'floor'] = max_floor / 10

max_floor = whole_train_df.max_floor.max()
whole_train_df.loc[whole_train_df.max_floor == max_floor, 'max_floor'] = max_floor / 10

whole_train_df.loc[whole_train_df.max_floor == 99, 'max_floor'] = 0
whole_train_df.loc[whole_train_df.num_room >= 10, 'num_room'] = 0
whole_train_df.loc[whole_train_df.kitch_sq >= 100, 'kitch_sq'] = 0

In [379]:
# Замена выброса в full_sq

full_sq_max = whole_train_df.full_sq.max()
whole_train_df.loc[whole_train_df.full_sq == full_sq_max, 'full_sq'] = full_sq_max / 100

In [380]:
# Замена выбросов в life_sq и full_sq

ids = [37439, 68638, 52414, 32115, 90717, 13539, 51155, 32184, 73754, 89638, 20359]
whole_train_df.loc[whole_train_df.id.isin(ids), 'life_sq'] = whole_train_df.loc[whole_train_df.id.isin(ids), 'life_sq'] / 10

ids=[91769, 52414, 32115, 49518, 21211, 11221, 41202, 61536,11965, 95936,71405,73754,20359]
whole_train_df.loc[whole_train_df.id.isin(ids), 'full_sq'] = whole_train_df.loc[whole_train_df.id.isin(ids), 'full_sq'] / 10

In [381]:
# Замена выброса в life_sq

whole_test_df.loc[whole_test_df.id == 71980, 'life_sq'] = whole_test_df.loc[whole_test_df.id == 71980, 'life_sq'] / 10
whole_test_df.loc[whole_test_df.id == 24392, 'life_sq'] = whole_test_df.loc[whole_test_df.id == 24392, 'life_sq'] / 100

In [384]:
# Считаем признаки

whole_train_df = change_types(fill_nans(prepare_features(whole_train_df)))
whole_test_df = change_types(fill_nans(prepare_features(whole_test_df)))

In [315]:
# Пробуем удалить выбросы

price_max = np.quantile(whole_train_df['price'].values, .97)
price_min = np.quantile(whole_train_df['price'].values, .0)

whole_train_df['price_lim'] = whole_train_df['price'].values.clip(price_min, price_max)

# Modeling

In [387]:
# Валидационный фолд

df_train, df_val, y_train, y_val = train_test_split(
    whole_train_df, whole_train_df.price,
    test_size=.33,
    random_state=42,
)
print(f'df_train: {len(df_train)} df_val: {len(df_val)}')

df_train: 19428 df_val: 9570


In [389]:
# Полный датасет

X_train, y_train = df_train[features], df_train['price']
print(X_train.shape)

(19428, 44)


In [390]:
params = {
    'learning_rate': 0.1, # 0.1
    'num_leaves': 25, # 31
    'max_depth': -1, # -1
    'min_child_samples': 20, # 20
    'subsample': 0.9,
    'colsample_bytree': 0.5,
    'n_estimators': 121, # 100
}


param_grid = {
    'learning_rate': .1,
    'max_depth': -1,
    'n_estimators': 70 + hp.randint('n_estimators', 131),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.2, 0.9),
    'subsample': hp.uniform('subsample', 0.2, 0.9),
    'num_leaves': 10 + hp.randint('num_leaves', 40),
}

model = LGBMRegressor(**params)
rmse = make_scorer(mean_squared_error, squared=False)

In [391]:
score = cross_val_score(model, X_train, y_train, cv=5, scoring=rmse).mean()
print(f'RMSE: {round(score)}')

RMSE: 2828696


In [393]:
meaning = Meaning(100, params, cat_features, param_grid, -1)
meaning.fit(X_train, y_train)


100%|██████████| 100/100 [01:04<00:00,  1.55it/s]


In [394]:
y_val_pred = meaning.predict(df_val[features])
score = mean_squared_error(y_val, y_val_pred, squared=False)
print(f'RMSE: {round(score)}')

100it [00:06, 15.77it/s]

RMSE: 2679095





# Scoring

In [395]:
features = [
    'full_sq', 
    'full_sq_bins', 
    'life_sq', 
    'life_sq_bins', 
    'kitch_sq', 
    'kitch_sq_bins',
    'some_extra_sqr',
    'ratio_life_dash_full_sq', 
    'ration_kitchen_dash_full_sq',
    'floor',
    'floor_bins',
    'max_floor',
    'max_floor_bins',
    'material',
    'num_room', 
    'apartment condition',
    'sub_area',
    'sub_area_bins',
    'population', 
    'indust_part',
    'preschool_facilities',
    'school_facilities', 
    'hospital_beds_raion',
    'healthcare_facilities',
    'university_num', 
    'sport_objects_facilities',
    'additional_education_facilities',
    'culture_objects_facilities',
    'shopping_centers_facilities',
    'office_num', 
    'green_part', 
    'prom_part',
    'cafe_count', 
    'church_facilities', 
    'mosque',
    'leisure_facilities',
    'year',
    'month',
    'week_of_year', 
    'day_of_week',
    'timestamp_int', 
    'build_year',
    'age', 
]

In [406]:
predictors = [LGBMRegressor(**
                            {
                            'learning_rate': 0.1,
                            'num_leaves': np.random.randint(5, 32),
                            'max_depth': np.random.randint(5, 32),
                            'subsample': max(0.1, np.random.rand(1)[0]),
                            'colsample_bytree': max(0.1, np.random.rand(1)[0]),
                            'n_estimators': np.random.randint(50, 250),
                            }) for i in range(20)
             ]

ens_model = LinearRegression()

In [407]:
%%time

stacking = Stacking(predictors, ens_model)
stacking.fit(whole_train_df[features], whole_train_df.price, p=0, cv=3, random_state=42)

y_test_pred = stacking.predict(whole_test_df[features])
test_main_df['predicted_price'] = y_test_pred

20it [01:00,  3.02s/it]
20it [00:00, 64.08it/s]


CPU times: user 1min 59s, sys: 1.17 s, total: 2min
Wall time: 1min


In [None]:
# test_main_df[['id', 'predicted_price']].to_csv('./data_hw/submit_stacking.csv', index=False)