In [None]:
!wget https://storage.yandexcloud.net/ds-ods/files/files/afad66cf/df_train.csv
!git clone https://github.com/nalgeon/metro.git
!pip install hdbscan geohash2

In [134]:
import numpy as np
import pandas as pd
import geopandas as gpd
from tqdm.auto import trange, tqdm
import multiprocessing as mp
from sklearn.cluster import KMeans, DBSCAN
from hdbscan import HDBSCAN
from functools import partial
from scipy import spatial
import geohash2
import json
from catboost import CatBoostClassifier, Pool, cv, CatBoostRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numba
import gc
from math import radians, sin, cos, asin, sqrt, pi, log2
from sklearn.metrics.pairwise import haversine_distances
from sklearn.model_selection import train_test_split, StratifiedKFold
from geopy.distance import geodesic, distance, great_circle, lonlat
from geopy.geocoders import Nominatim
from sklearn.base import BaseEstimator
import geohash2

In [135]:
def haversine(x,y):
    x = [x[0] * np.pi / 180, x[1] * np.pi / 180]
    y = [y[0] * np.pi / 180, y[1] * np.pi / 180]
    return haversine_distances(X=[x],Y=[y])[0][0]

def build_distance_matrix_fast(coords):
    coords = [[x[0] * np.pi / 180, x[1] * np.pi / 180] for x in coords]
    mat = haversine_distances(X=coords) * 6371.127
    mat += np.eye(mat.shape[0]) * 10e10
    return mat

def build_distance_matrix_test_fast(coords_tr,coords_ts):
    coords_tr = [[x[0] * np.pi / 180, x[1] * np.pi / 180] for x in coords_tr]
    coords_ts = [[x[0] * np.pi / 180, x[1] * np.pi / 180] for x in coords_ts]
    mat = haversine_distances(X=coords_ts,Y=coords_tr) * 6371.127
    #mat += np.eye(*mat.shape) * 10e10
    return mat

def get_dist_maper(train_data, test_data,coords='coords',group_col='area_cluster'):
    train_group, test_group = {}, {}
    
    for k,v in zip(train_data[group_col],train_data[coords]):
        if k in train_group:
            train_group[k].append(v)
        else:
            train_group[k] = [v]
            
    for k,v in zip(test_data[group_col],test_data[coords]):
        if k in test_group:
            test_group[k].append(v)
        else:
            test_group[k] = [v]

    for k in train_group:
        train_group[k] = build_distance_matrix_fast(train_group[k])
        if k in test_group:
            test_group[k] = build_distance_matrix_fast(test_group[k])
        else:
            test_group[k] = np.nan
    
    return train_group, test_group

In [136]:
metro_data = pd.read_csv('/kaggle/working/metro/data/metro.ru.csv')
metro_data = metro_data[['lat','lng']].values.tolist()
metro_data = [[x[0] * np.pi / 180, x[1] * np.pi / 180] for x in metro_data]

In [114]:
data = pd.read_csv('/kaggle/working/df_train.csv')
data['coords'] = data[['latitude','longitude']].values.tolist()
#tqdm.pandas()
data['hash'] = data['coords'].progress_apply(lambda x: geohash2.encode(*x,precision=12))
train_data, test_data = train_test_split(data,test_size=0.2,random_state=56)

  0%|          | 0/8042 [00:00<?, ?it/s]

In [115]:
area_clusters = np.digitize(data['area'].values,bins=[data['area'].quantile(i/100) for i in range(0,100,5)])
train_data['area_cluster'] = area_clusters[:train_data.shape[0]]
test_data['area_cluster'] = area_clusters[train_data.shape[0]:]

In [116]:
dist_mat = build_distance_matrix_fast(train_data[['latitude','longitude']].values.tolist())

In [117]:
test_dist_mat = build_distance_matrix_test_fast(
    train_data[['latitude','longitude']].values.tolist(),
    test_data[['latitude','longitude']].values.tolist(),
)

In [118]:
dist_tr_cls, dist_te_cls = get_dist_maper(train_data, test_data, coords='coords',group_col='area_cluster')

In [119]:
def get_ranks(data,mat,topk=100):
    idxes = np.argsort(mat,axis=1)
    top_idx = [i[:topk] for i in idxes]
    top_dists = [row[i[:topk]] for i,row in zip(idxes,mat)]
    data['cand_dist'] = top_dists
    data['cand_idxs'] = top_idx
    
def get_ranks_special(data,mat,topk=100):
    dists, idxes = [], []
    step_matrix = {k:0 for k in mat}
    idx_maper = {}
    
    for v,k in enumerate(data['area_cluster']):
        if k in idx_maper:
            idx_maper[k].append(v)
        else:
            idx_maper[k] = [v]

    for cl in data['area_cluster'].values:
        dist = mat[cl][step_matrix[cl]]
        idx = np.argsort(dist).tolist()
        idxes.append([idx_maper[cl][j] for j in idx[:topk]])
        dists.append([dist[j] for j in idx[:topk]])
    
    data['cand_dist_area'] = dists
    data['cand_idxs_area'] = idxes
    
def get_ranks_test_special(data,on_data,mat,topk=100):
    dists, idxes = [], []
    step_matrix = {k:0 for k in mat}
    idx_maper = {}
    
    for v,k in enumerate(on_data['area_cluster']):
        if k in idx_maper:
            idx_maper[k].append(v)
        else:
            idx_maper[k] = [v]

    for cl in data['area_cluster'].values:
        dist = mat[cl][step_matrix[cl]]
        idx = np.argsort(dist).tolist()
        idxes.append([idx_maper[cl][j] for j in idx[:topk]])
        dists.append([dist[j] for j in idx[:topk]])
    
    data['cand_dist_area'] = dists
    data['cand_idxs_area'] = idxes

get_ranks(train_data,dist_mat)
get_ranks(test_data,test_dist_mat)
get_ranks_special(train_data,dist_tr_cls)
get_ranks_test_special(test_data,train_data,dist_te_cls)

In [120]:
train_data['price_of_1m2'] = train_data['price'] / train_data['area']
train_label_maper = dict(zip(range(len(train_data)),train_data['price']))
train_otn_label_maper = dict(zip(range(len(train_data)),train_data['price_of_1m2']))

In [122]:
hash_maper = train_data.groupby('hash')['price'].agg('mean')

In [131]:
#def mean_without_idx(x,i):
#    return np.mean([v for k,v in x if v != i])
#
#train_data['maper_geo'] = train_data.apply(lambda x: [x['Unnamed: 0'],x['price']],axis=1)
#train_data['hash_prices'] = train_data.groupby('hash')['maper_geo'].transform(list)
#
#train_data['hash_pred'] = train_data.apply(lambda x: mean_without_idx(x['hash_prices'],x['Unnamed: 0']),axis=1)
#test_data['hash_pred'] = test_data['hash'].map(hash_maper)

TypeError: cannot unpack non-iterable int object

In [109]:
def nearest_metro(row):
    coords = (row['latitude'],row['longitude'])
    return np.min(haversine_distances(X=[coords],Y=metro_data))

def agg_from_2cols(row,dist_pref=0.0,agg=np.mean):
    return agg([i for i,j in zip(row['neigh_label'],row['cand_dist']) if j <= dist_pref])

def agg_from_2cols_otn(row,dist_pref=0.0,agg=np.mean):
    return agg([i for i,j in zip(row['neigh_label_otn'],row['cand_dist']) if j <= dist_pref])

def agg_from_2cols_area(row,dist_pref=0.0,agg=np.mean):
    return agg([i for i,j in zip(row['neigh_label_area'],row['cand_dist_area']) if j <= dist_pref])

def agg_from_2cols_otn_area(row,dist_pref=0.0,agg=np.mean):
    return agg([i for i,j in zip(row['neigh_label_otn_area'],row['cand_dist_area']) if j <= dist_pref])

def get_distance_feats_area(data):
    data['neigh_label_area'] = data['cand_idxs_area'].apply(lambda x: [train_label_maper[j] for j in x])
    data['neigh_label_otn_area'] = data['cand_idxs_area'].apply(lambda x: [train_otn_label_maper[j] for j in x])
    data['min_distance_area'] =  data['cand_dist_area'].apply(lambda x: min(x))
    data['mean_distance_area'] =  data['cand_dist_area'].apply(lambda x: np.mean(x))
    data['nearest_label_area'] =  data['neigh_label_area'].apply(lambda x: x[0])
    data['nearest_label_otn_area'] =  data['neigh_label_otn_area'].apply(lambda x: x[0])
    data['distance_zero_cnt_area'] =  data['cand_dist_area'].apply(lambda x: x.count(0.0))
    
    data['mean_label@5_area'] =   data['neigh_label_area'].apply(lambda x: np.mean(x[:5]))
    data['mean_label@10_area'] =  data['neigh_label_area'].apply(lambda x: np.mean(x[:10]))
    data['mean_label@25_area'] =  data['neigh_label_area'].apply(lambda x: np.mean(x[:10]))
    
    data['mean_label<0_area'] =   data.apply(partial(agg_from_2cols_area,dist_pref=0.0),axis=1)
    data['mean_label<0.5_area'] = data.apply(partial(agg_from_2cols_area,dist_pref=0.5),axis=1)
    data['mean_label<5_area'] =   data.apply(partial(agg_from_2cols_area,dist_pref=5.0),axis=1)
    
    data['mean_label@5_otn_area'] =   data['neigh_label_otn_area'].apply(lambda x: np.mean(x[:5]))
    data['mean_label@10_otn_area'] =  data['neigh_label_otn_area'].apply(lambda x: np.mean(x[:10]))
    data['mean_label@25_otn_area'] =  data['neigh_label_otn_area'].apply(lambda x: np.mean(x[:10]))
    
    data['mean_label<0_otn_area'] =   data.apply(partial(agg_from_2cols_otn_area,dist_pref=0.0),axis=1)
    data['mean_label<0.5_otn_area'] = data.apply(partial(agg_from_2cols_otn_area,dist_pref=0.5),axis=1)
    data['mean_label<1.0_otn_area'] = data.apply(partial(agg_from_2cols_otn_area,dist_pref=1.0),axis=1)
    data['mean_label<5_otn_area'] =   data.apply(partial(agg_from_2cols_otn_area,dist_pref=5.0),axis=1)
    
    data['center_city_cords'] = data['city'].map(city_coords)
    tqdm.pandas()
    data['nearest_metro'] = data[['latitude','longitude']].progress_apply(nearest_metro,axis=1)
    data['dist2center'] = data.apply(lambda row: haversine(row['center_city_cords'],(row['latitude'],row['longitude'])),axis=1)
    
def get_distance_feats(data):
    data['neigh_label'] = data['cand_idxs'].apply(lambda x: [train_label_maper[j] for j in x])
    data['neigh_label_otn'] = data['cand_idxs'].apply(lambda x: [train_otn_label_maper[j] for j in x])
    data['min_distance'] =  data['cand_dist'].apply(lambda x: min(x))
    data['mean_distance'] =  data['cand_dist'].apply(lambda x: np.mean(x))
    data['nearest_label'] =  data['neigh_label'].apply(lambda x: x[0])
    data['nearest_label_otn'] =  data['neigh_label_otn'].apply(lambda x: x[0])
    data['distance_zero_cnt'] =  data['cand_dist'].apply(lambda x: x.tolist().count(0.0))
    
    data['mean_label@5'] =   data['neigh_label'].apply(lambda x: np.mean(x[:5]))
    data['mean_label@10'] =  data['neigh_label'].apply(lambda x: np.mean(x[:10]))
    data['mean_label@25'] =  data['neigh_label'].apply(lambda x: np.mean(x[:10]))
    
    data['mean_label<0'] =   data.apply(partial(agg_from_2cols,dist_pref=0.0),axis=1)
    data['mean_label<0.5'] = data.apply(partial(agg_from_2cols,dist_pref=0.5),axis=1)
    data['mean_label<1.0'] = data.apply(partial(agg_from_2cols,dist_pref=1.0),axis=1)
    data['mean_label<5'] =   data.apply(partial(agg_from_2cols,dist_pref=5.0),axis=1)
    
    data['mean_label@5_otn'] =   data['neigh_label_otn'].apply(lambda x: np.mean(x[:5]))
    data['mean_label@10_otn'] =  data['neigh_label_otn'].apply(lambda x: np.mean(x[:10]))
    data['mean_label@25_otn'] =  data['neigh_label_otn'].apply(lambda x: np.mean(x[:10]))
    
    data['mean_label<0_otn'] =   data.apply(partial(agg_from_2cols_otn,dist_pref=0.0),axis=1)
    data['mean_label<0.5_otn'] = data.apply(partial(agg_from_2cols_otn,dist_pref=0.5),axis=1)
    data['mean_label<1.0_otn'] = data.apply(partial(agg_from_2cols_otn,dist_pref=1.0),axis=1)
    data['mean_label<5_otn'] =   data.apply(partial(agg_from_2cols_otn,dist_pref=5.0),axis=1)
    
    data['center_city_cords'] = data['city'].map(city_coords)
    tqdm.pandas()
    data['nearest_metro'] = data[['latitude','longitude']].progress_apply(nearest_metro,axis=1)
    data['dist2center'] = data.apply(lambda row: haversine(row['center_city_cords'],(row['latitude'],row['longitude'])),axis=1)
    
def crt_crds(df): 
    df['rot_15_x'] = (np.cos(np.radians(15)) * df['longitude']) + \
                      (np.sin(np.radians(15)) * df['latitude'])
    
    df['rot_15_y'] = (np.cos(np.radians(15)) * df['latitude']) + \
                      (np.sin(np.radians(15)) * df['longitude'])
    
    df['rot_30_x'] = (np.cos(np.radians(30)) * df['longitude']) + \
                      (np.sin(np.radians(30)) * df['latitude'])
    
    df['rot_30_y'] = (np.cos(np.radians(30)) * df['latitude']) + \
                      (np.sin(np.radians(30)) * df['longitude'])
    
    df['rot_45_x'] = (np.cos(np.radians(45)) * df['longitude']) + \
                      (np.sin(np.radians(45)) * df['latitude'])
    return df

def exp_dims_tric(df):
    coordinates = df[['latitude', 'longitude']].values
    
    emb_size = 20
    precision = 1e6 

    latlon = np.expand_dims(coordinates, axis=-1) 

    m = np.exp(np.log(precision) / emb_size) 
    angle_freq = m ** np.arange(emb_size) 
    angle_freq = angle_freq.reshape(1, 1, emb_size) 

    latlon = latlon * angle_freq 
    latlon[..., 0::2] = np.cos(latlon[..., 0::2]) 
    latlon[..., 1::2] = np.sin(latlon[..., 1::2]) 
    latlon = latlon.reshape(-1, 2 * emb_size) 

    df['exp_latlon1'] = [lat[0] for lat in latlon]
    df['exp_latlon2'] = [lat[1] for lat in latlon]
    
def get_designes_feats(coords):
    tree = spatial.cKDTree(coords)
    result = tree.query_ball_point(coords, 0.01, return_length=True)
    return result

In [69]:
cluster = KMeans(n_clusters=32)#DBSCAN(metric='haversine')
coords = pd.concat([train_data[['latitude','longitude']],test_data[['latitude','longitude']]],axis=0).values.tolist()
coords = [[x[0] * np.pi / 180, x[1] * np.pi / 180] for x in coords]
clusters = cluster.fit_predict(coords)



In [70]:
train_data['cluster'] = clusters[:train_data.shape[0]]
test_data['cluster'] = clusters[train_data.shape[0]:]

In [71]:
geolocator = Nominatim(user_agent="user_agent")
city_coords = {x:(geolocator.geocode(x).latitude, geolocator.geocode(x).longitude) for x in data['city'].unique()}

In [72]:
get_distance_feats(train_data)
get_distance_feats(test_data)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


  0%|          | 0/6433 [00:00<?, ?it/s]

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


  0%|          | 0/1609 [00:00<?, ?it/s]

In [73]:
get_distance_feats_area(train_data)
get_distance_feats_area(test_data)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


  0%|          | 0/6433 [00:00<?, ?it/s]

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


  0%|          | 0/1609 [00:00<?, ?it/s]

In [74]:
disignes = get_designes_feats(pd.concat([train_data[['latitude','longitude']],test_data[['latitude','longitude']]],axis=0)).tolist()
train_data['desinges'] = disignes[:train_data.shape[0]]
test_data['desinges'] = disignes[train_data.shape[0]:]

In [75]:
#exp_dims_tric(train_data)
#exp_dims_tric(test_data)
#crt_crds(train_data)
#crt_crds(test_data)

In [87]:
cb_params = {
    'iterations': 1000,
    'learning_rate': 0.05,
    'loss_function': 'RMSE',
    'max_depth': 5,
    'eval_metric': 'RMSE',
    'use_best_model':True,
    'task_type': 'CPU',
    'random_seed': 56,
}

params = {
    'cb_params':cb_params,
    'lgb_params': None,
    'xgb_params': None,
}

cat_cols = ['city','has_balcony','cluster']
label_col = 'price'
drop_cols = [
    'cand_dist','cand_dist_area',
    'cand_idxs','cand_idxs_area',
    'neigh_label','neigh_label_area',
    'center_city_cords','coords',
    'neigh_label_otn','neigh_label_otn_area',
    'price_of_1m2', 'area_cluster'
]

In [88]:
class EnsembleClassifier(BaseEstimator):
    def __init__(self,cb_params,lgb_params,xgb_params):
        self.cbm = CatBoostClassifier(**cb_params)
        #self.lgbm = lgb.LGBMClassifier(**lgb_params)
        #self.xgbm = xgb.XGBClassifier(**xgb_params)
    
    def fit(self,X,y,X_val,y_val,cat_features=None,verbose=False):
        train_pool = Pool(X,label=y,cat_features=cat_features)
        eval_pool = Pool(X_val,label=y_val,cat_features=cat_features)
        self.cbm.fit(train_pool,eval_set=eval_pool,verbose=verbose)
        #self.lgbm.fit(X, y,eval_set=[(X_val,y_val)],categorical_feature=cat_features,callbacks=[lgb.log_evaluation(verbose)],eval_metric=lgb_f1_score)
        #self.xgbm.fit(X,y,eval_set=[(X_val,y_val)],verbose=False)
    
    def predict_proba(self,X_test,cat_features):
        test_pool = Pool(X_test,cat_features=cat_features)
        cb_preds = self.cbm.predict_proba(test_pool)[:,1]
        #lgb_preds = self.lgbm.predict_proba(X_test)[:,1]
        #xgb_preds = self.xgbm.predict_proba(X_test)[:,1]
        return lgb_preds
    
    def predict(self,X_test,cat_features):
        test_pool = Pool(X_test,cat_features=cat_features)
        cb_preds = self.cbm.predict(test_pool)
        #lgb_preds = self.lgbm.predict_proba(X_test)
        #xgb_preds = self.xgbm.predict_proba(X_test)[:,1]
        return cb_preds
    
class EnsembleRegressor(BaseEstimator):
    def __init__(self,cb_params,lgb_params,xgb_params):
        self.cbm = CatBoostRegressor(**cb_params)
        #self.lgbm = lgb.LGBMClassifier(**lgb_params)
        #self.xgbm = xgb.XGBClassifier(**xgb_params)
    
    def fit(self,X,y,X_val,y_val,cat_features=None,verbose=False):
        train_pool = Pool(X,label=y,cat_features=cat_features)
        eval_pool = Pool(X_val,label=y_val,cat_features=cat_features)
        self.cbm.fit(train_pool,eval_set=eval_pool,verbose=verbose)
        #self.lgbm.fit(X, y,eval_set=[(X_val,y_val)],categorical_feature=cat_features,callbacks=[lgb.log_evaluation(verbose)],eval_metric=lgb_f1_score)
        #self.xgbm.fit(X,y,eval_set=[(X_val,y_val)],verbose=False)
        
    def predict(self,X_test,cat_features):
        test_pool = Pool(X_test,cat_features=cat_features)
        cb_preds = self.cbm.predict(test_pool)
        #lgb_preds = self.lgbm.predict_proba(X_test)
        #xgb_preds = self.xgbm.predict_proba(X_test)[:,1]
        return cb_preds

In [89]:
class CustomBoostKfoldWraper(BaseEstimator):
    def __init__(self,num_folds,num_repits,params,random_state=56,score_func=None):
        self.models = []
        self.params = params
        self.random_state = random_state
        self.num_folds = num_folds
        self.num_repits = num_repits
        self.score_func = score_func
        
    def fit(self,train_data,cat_features=None,drop_cols=None,label_col=None,verbose=False):
        self.scores = []
        
        for i in trange(self.num_repits):
            kfold = StratifiedKFold(self.num_folds,random_state=self.random_state+i,shuffle=True)
            for train_index, test_index in (kfold.split(train_data,train_data[label_col])):
                train_df = train_data.iloc[train_index]
                test_df = train_data.iloc[test_index]
                
                model = EnsembleRegressor(**self.params)
                model.fit(
                    X = train_df.drop([label_col]+drop_cols,axis=1),
                    y = train_df[label_col],
                    X_val = test_df.drop([label_col]+drop_cols,axis=1),
                    y_val = test_df[label_col],
                    cat_features = cat_features,
                    verbose = verbose
                )
                cb_preds = model.predict(test_df.drop([label_col]+drop_cols,axis=1),cat_features)
                avg_preds =  cb_preds#(cb_preds + lgb_preds) / 2
                self.scores += [[
                    self.score_func(
                        test_df[label_col],
                        avg_preds
                    )
                ]]
                print(self.scores[-1])
                self.models += [model]
                
        print(f"Total Score {np.mean([x[0] for x in self.scores])}")
            
    def predict(self,test_data,drop_cols=None,cat_features=None):
        preds = np.mean([
            model.predict(test_data.drop(drop_cols,axis=1),cat_features=cat_features)
            for model in self.models
        ],axis=0)
        return preds
    
    def get_feature_importance(self,type='FeatureImportance'):
        imp_0 = self.models[0].cbm.get_feature_importance(prettified=True,type=type).set_index('Feature Id')
        for i in range(1,len(self.models)):
            imp_0 += self.models[i].cbm.get_feature_importance(prettified=True,type=type).set_index('Feature Id')
        return (imp_0 / len(self.models)).sort_values(by='Importances')[::-1]

In [90]:
model = CustomBoostKfoldWraper(
    num_folds=5,
    num_repits=5,
    params=params,
    random_state=56,
    score_func=partial(mean_squared_error,squared=False)
)

In [91]:
model.fit(
    train_data=train_data,
    cat_features=cat_cols,
    drop_cols=drop_cols,
    label_col=label_col,
    verbose=350,
)

  0%|          | 0/5 [00:00<?, ?it/s]



0:	learn: 4219839.5300295	test: 3534326.3255211	best: 3534326.3255211 (0)	total: 6.53ms	remaining: 6.52s
350:	learn: 738932.3290524	test: 1073777.1071822	best: 1073559.4594861 (343)	total: 2.07s	remaining: 3.83s
700:	learn: 593535.4421995	test: 1051016.3514713	best: 1050923.2414503 (697)	total: 4.12s	remaining: 1.76s
999:	learn: 516583.5726541	test: 1044266.9099477	best: 1044266.9099477 (999)	total: 5.87s	remaining: 0us

bestTest = 1044266.91
bestIteration = 999

[1044266.9099477095]
0:	learn: 3951169.4675714	test: 4649298.2629226	best: 4649298.2629226 (0)	total: 5.84ms	remaining: 5.83s
350:	learn: 770454.6763265	test: 1663808.7045886	best: 1661203.7946914 (229)	total: 2.12s	remaining: 3.92s
700:	learn: 606611.7074544	test: 1635649.4232314	best: 1635400.1473827 (687)	total: 4.16s	remaining: 1.78s
999:	learn: 525270.5498479	test: 1637528.5566537	best: 1634035.8295606 (773)	total: 5.89s	remaining: 0us

bestTest = 1634035.83
bestIteration = 773

Shrink model to first 774 iterations.
[1634



350:	learn: 733198.9856272	test: 1075893.3526853	best: 1075893.3526853 (350)	total: 1.99s	remaining: 3.69s
700:	learn: 585787.8298255	test: 1044995.2934912	best: 1044995.2934912 (700)	total: 4.03s	remaining: 1.72s
999:	learn: 510031.3638479	test: 1034979.6775753	best: 1034873.3159249 (997)	total: 5.77s	remaining: 0us

bestTest = 1034873.316
bestIteration = 997

Shrink model to first 998 iterations.
[1034873.3159248682]
0:	learn: 3951624.1755970	test: 4651646.7777462	best: 4651646.7777462 (0)	total: 6.45ms	remaining: 6.45s
350:	learn: 768401.4394682	test: 1556802.0346108	best: 1556802.0346108 (350)	total: 2.12s	remaining: 3.92s
700:	learn: 612764.8047441	test: 1542823.5253139	best: 1541848.3017403 (651)	total: 4.18s	remaining: 1.78s
999:	learn: 533928.4606364	test: 1536069.6706887	best: 1535970.0776221 (992)	total: 5.94s	remaining: 0us

bestTest = 1535970.078
bestIteration = 992

Shrink model to first 993 iterations.
[1535970.0776221387]
0:	learn: 3957840.1422002	test: 4595560.6360627	b



350:	learn: 743324.7081138	test: 1062941.4776356	best: 1062941.4776356 (350)	total: 2.08s	remaining: 3.85s
700:	learn: 590707.3997182	test: 1039484.5894431	best: 1039472.3093206 (699)	total: 4.18s	remaining: 1.78s
999:	learn: 519861.6979960	test: 1028468.7261326	best: 1028468.7261326 (999)	total: 6.05s	remaining: 0us

bestTest = 1028468.726
bestIteration = 999

[1028468.7261326063]
0:	learn: 3951652.7459466	test: 4651173.9987249	best: 4651173.9987249 (0)	total: 6.23ms	remaining: 6.22s
350:	learn: 763475.1339206	test: 1535785.6250108	best: 1534543.5358989 (328)	total: 2.08s	remaining: 3.85s
700:	learn: 603903.4859422	test: 1524177.4433492	best: 1524177.4433492 (700)	total: 4.15s	remaining: 1.77s
999:	learn: 524516.3093884	test: 1516077.5245721	best: 1515619.1605050 (992)	total: 5.93s	remaining: 0us

bestTest = 1515619.161
bestIteration = 992

Shrink model to first 993 iterations.
[1515619.160504955]
0:	learn: 3957985.3984952	test: 4595534.3525336	best: 4595534.3525336 (0)	total: 6.38ms	



350:	learn: 747695.6192227	test: 1073524.5069196	best: 1073524.5069196 (350)	total: 2.04s	remaining: 3.77s
700:	learn: 596967.3978750	test: 1048761.9540053	best: 1048761.9540053 (700)	total: 4.18s	remaining: 1.78s
999:	learn: 524269.4198109	test: 1042811.3280805	best: 1042606.0680820 (993)	total: 5.94s	remaining: 0us

bestTest = 1042606.068
bestIteration = 993

Shrink model to first 994 iterations.
[1042606.0680819951]
0:	learn: 3951552.4661463	test: 4651555.5312055	best: 4651555.5312055 (0)	total: 6.47ms	remaining: 6.47s
350:	learn: 765027.7535043	test: 1575399.8921835	best: 1575399.8921835 (350)	total: 2.02s	remaining: 3.73s
700:	learn: 604213.6606327	test: 1563243.6763544	best: 1563243.6763544 (700)	total: 4.05s	remaining: 1.73s
999:	learn: 526236.6436371	test: 1556219.1435803	best: 1556219.1435803 (999)	total: 5.83s	remaining: 0us

bestTest = 1556219.144
bestIteration = 999

[1556219.1435802842]
0:	learn: 3957980.0921850	test: 4595377.2632145	best: 4595377.2632145 (0)	total: 6.38ms



350:	learn: 745469.4605172	test: 1047728.3174751	best: 1047561.6299138 (349)	total: 2.05s	remaining: 3.79s
700:	learn: 593754.7993786	test: 1027869.2914616	best: 1027869.2914616 (700)	total: 4.16s	remaining: 1.78s
999:	learn: 519239.1466528	test: 1017502.8559357	best: 1017399.2998826 (998)	total: 5.92s	remaining: 0us

bestTest = 1017399.3
bestIteration = 998

Shrink model to first 999 iterations.
[1017399.2998826343]
0:	learn: 3956492.8753812	test: 4658628.6509640	best: 4658628.6509640 (0)	total: 6.21ms	remaining: 6.2s
350:	learn: 767066.2257017	test: 1593299.3319833	best: 1582334.1042457 (186)	total: 2.06s	remaining: 3.82s
700:	learn: 608868.3045019	test: 1582950.9226543	best: 1582334.1042457 (186)	total: 4.17s	remaining: 1.78s
999:	learn: 527921.2610213	test: 1574645.7173260	best: 1574640.0107959 (960)	total: 5.92s	remaining: 0us

bestTest = 1574640.011
bestIteration = 960

Shrink model to first 961 iterations.
[1574640.0107958964]
0:	learn: 3954957.8453837	test: 4594586.7058013	best

In [92]:
model.get_feature_importance()

Unnamed: 0_level_0,Importances
Feature Id,Unnamed: 1_level_1
area,54.165759
mean_label@5_otn,4.25605
area_kitchen,3.473561
mean_label<1.0,3.236324
mean_label@10_otn,3.211618
mean_label<0.5_otn,3.024476
mean_label@25_otn,2.895969
mean_label<5,2.336929
nearest_label_otn,2.032525
mean_label<1.0_otn,1.915596


In [93]:
drop_cols.remove('price_of_1m2')

In [94]:
preds = model.predict(test_data,drop_cols=drop_cols,cat_features=cat_cols)

In [86]:
mean_squared_error(preds,test_data['price'],squared=False)

1239100.5993724377

In [None]:
1252445.0520686784
1239100.5993724377