In [2]:
!wget https://storage.yandexcloud.net/ds-ods/files/files/afad66cf/df_train.csv
!git clone https://github.com/nalgeon/metro.git

--2024-11-12 12:00:16--  https://storage.yandexcloud.net/ds-ods/files/files/afad66cf/df_train.csv
Resolving storage.yandexcloud.net (storage.yandexcloud.net)... 213.180.193.243, 2a02:6b8::1d9
Connecting to storage.yandexcloud.net (storage.yandexcloud.net)|213.180.193.243|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 520095 (508K) [text/csv]
Saving to: 'df_train.csv'


2024-11-12 12:00:18 (523 KB/s) - 'df_train.csv' saved [520095/520095]

Cloning into 'metro'...
remote: Enumerating objects: 594, done.[K
remote: Counting objects: 100% (74/74), done.[K
remote: Compressing objects: 100% (46/46), done.[K
remote: Total 594 (delta 54), reused 46 (delta 28), pack-reused 520 (from 1)[K
Receiving objects: 100% (594/594), 532.33 KiB | 13.65 MiB/s, done.
Resolving deltas: 100% (466/466), done.


In [3]:
pip install hdbscan

Note: you may need to restart the kernel to use updated packages.


In [52]:
import numpy as np
import pandas as pd
import geopandas as gpd
from tqdm.auto import trange, tqdm
import multiprocessing as mp
from sklearn.cluster import KMeans, DBSCAN
from hdbscan import HDBSCAN
from functools import partial
from scipy import spatial
import json
from catboost import CatBoostClassifier, Pool, cv, CatBoostRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numba
import gc
from math import radians, sin, cos, asin, sqrt, pi, log2
from sklearn.metrics.pairwise import haversine_distances
from sklearn.model_selection import train_test_split, StratifiedKFold
from geopy.distance import geodesic, distance, great_circle, lonlat
from geopy.geocoders import Nominatim
from sklearn.base import BaseEstimator

In [53]:
def haversine(x,y):
    x = [x[0] * np.pi / 180, x[1] * np.pi / 180]
    y = [y[0] * np.pi / 180, y[1] * np.pi / 180]
    return haversine_distances(X=[x],Y=[y])[0][0]

def build_distance_matrix_fast(coords):
    coords = [[x[0] * np.pi / 180, x[1] * np.pi / 180] for x in coords]
    mat = haversine_distances(X=coords) * 6371.127
    mat += np.eye(mat.shape[0]) * 10e10
    return mat

def build_distance_matrix_test_fast(coords_tr,coords_ts):
    coords_tr = [[x[0] * np.pi / 180, x[1] * np.pi / 180] for x in coords_tr]
    coords_ts = [[x[0] * np.pi / 180, x[1] * np.pi / 180] for x in coords_ts]
    mat = haversine_distances(X=coords_ts,Y=coords_tr) * 6371.127
    mat += np.eye(*mat.shape) * 10e10
    return mat

In [54]:
metro_data = pd.read_csv('/kaggle/working/metro/data/metro.ru.csv')
metro_data = metro_data[['lat','lng']].values.tolist()
metro_data = [[x[0] * np.pi / 180, x[1] * np.pi / 180] for x in metro_data]

In [55]:
data = pd.read_csv('/kaggle/working/df_train.csv')
train_data, test_data = train_test_split(data,test_size=0.2,random_state=56)

In [56]:
dist_mat = build_distance_matrix_fast(train_data[['latitude','longitude']].values.tolist())

In [57]:
test_dist_mat = build_distance_matrix_test_fast(
    train_data[['latitude','longitude']].values.tolist(),
    test_data[['latitude','longitude']].values.tolist(),
)

In [58]:
def get_ranks(data,mat,topk=100):
    idxes = np.argsort(mat,axis=1)
    top_idx = [i[:topk] for i in idxes]
    top_dists = [row[i[:topk]] for i,row in zip(idxes,mat)]
    data['cand_dist'] = top_dists
    data['cand_idxs'] = top_idx
    
get_ranks(train_data,dist_mat)
get_ranks(test_data,test_dist_mat)

In [59]:
train_data['price_of_1m2'] = train_data['price'] / train_data['area']
train_label_maper = dict(zip(range(len(train_data)),train_data['price']))
train_otn_label_maper = dict(zip(range(len(train_data)),train_data['price_of_1m2']))

In [60]:
def nearest_metro(row):
    coords = (row['latitude'],row['longitude'])
    return np.min(haversine_distances(X=[coords],Y=metro_data))

def agg_from_2cols(row,dist_pref=0.0,agg=np.mean):
    return agg([i for i,j in zip(row['neigh_label'],row['cand_dist']) if j <= dist_pref])

def agg_from_2cols_otn(row,dist_pref=0.0,agg=np.mean):
    return agg([i for i,j in zip(row['neigh_label_otn'],row['cand_dist']) if j <= dist_pref])

def get_distance_feats(data):
    data['neigh_label'] = data['cand_idxs'].apply(lambda x: [train_label_maper[j] for j in x])
    data['neigh_label_otn'] = data['cand_idxs'].apply(lambda x: [train_otn_label_maper[j] for j in x])
    data['min_distance'] =  data['cand_dist'].apply(lambda x: min(x))
    data['mean_distance'] =  data['cand_dist'].apply(lambda x: np.mean(x))
    data['nearest_label'] =  data['neigh_label'].apply(lambda x: x[0])
    data['nearest_label_otn'] =  data['neigh_label_otn'].apply(lambda x: x[0])
    data['distance_zero_cnt'] =  data['cand_dist'].apply(lambda x: x.tolist().count(0.0))
    
    data['mean_label@5'] =   data['neigh_label'].apply(lambda x: np.mean(x[:5]))
    data['mean_label@10'] =  data['neigh_label'].apply(lambda x: np.mean(x[:10]))
    data['mean_label@25'] =  data['neigh_label'].apply(lambda x: np.mean(x[:10]))
    
    data['mean_label<0'] =   data.apply(partial(agg_from_2cols,dist_pref=0.0),axis=1)
    data['mean_label<0.5'] = data.apply(partial(agg_from_2cols,dist_pref=0.5),axis=1)
    data['mean_label<1.0'] = data.apply(partial(agg_from_2cols,dist_pref=1.0),axis=1)
    data['mean_label<5'] =   data.apply(partial(agg_from_2cols,dist_pref=5.0),axis=1)
    
    data['mean_label@5_otn'] =   data['neigh_label_otn'].apply(lambda x: np.mean(x[:5]))
    data['mean_label@10_otn'] =  data['neigh_label_otn'].apply(lambda x: np.mean(x[:10]))
    data['mean_label@25_otn'] =  data['neigh_label_otn'].apply(lambda x: np.mean(x[:10]))
    
    data['mean_label<0_otn'] =   data.apply(partial(agg_from_2cols_otn,dist_pref=0.0),axis=1)
    data['mean_label<0.5_otn'] = data.apply(partial(agg_from_2cols_otn,dist_pref=0.5),axis=1)
    data['mean_label<1.0_otn'] = data.apply(partial(agg_from_2cols_otn,dist_pref=1.0),axis=1)
    data['mean_label<5_otn'] =   data.apply(partial(agg_from_2cols_otn,dist_pref=5.0),axis=1)
    
    data['center_city_cords'] = data['city'].map(city_coords)
    tqdm.pandas()
    data['nearest_metro'] = data[['latitude','longitude']].progress_apply(nearest_metro,axis=1)
    data['dist2center'] = data.apply(lambda row: haversine(row['center_city_cords'],(row['latitude'],row['longitude'])),axis=1)
    
def crt_crds(df): 
    df['rot_15_x'] = (np.cos(np.radians(15)) * df['longitude']) + \
                      (np.sin(np.radians(15)) * df['latitude'])
    
    df['rot_15_y'] = (np.cos(np.radians(15)) * df['latitude']) + \
                      (np.sin(np.radians(15)) * df['longitude'])
    
    df['rot_30_x'] = (np.cos(np.radians(30)) * df['longitude']) + \
                      (np.sin(np.radians(30)) * df['latitude'])
    
    df['rot_30_y'] = (np.cos(np.radians(30)) * df['latitude']) + \
                      (np.sin(np.radians(30)) * df['longitude'])
    
    df['rot_45_x'] = (np.cos(np.radians(45)) * df['longitude']) + \
                      (np.sin(np.radians(45)) * df['latitude'])
    return df

def exp_dims_tric(df):
    coordinates = df[['latitude', 'longitude']].values
    
    emb_size = 20
    precision = 1e6 

    latlon = np.expand_dims(coordinates, axis=-1) 

    m = np.exp(np.log(precision) / emb_size) 
    angle_freq = m ** np.arange(emb_size) 
    angle_freq = angle_freq.reshape(1, 1, emb_size) 

    latlon = latlon * angle_freq 
    latlon[..., 0::2] = np.cos(latlon[..., 0::2]) 
    latlon[..., 1::2] = np.sin(latlon[..., 1::2]) 
    latlon = latlon.reshape(-1, 2 * emb_size) 

    df['exp_latlon1'] = [lat[0] for lat in latlon]
    df['exp_latlon2'] = [lat[1] for lat in latlon]
    
def get_designes_feats(coords):
    tree = spatial.cKDTree(coords)
    result = tree.query_ball_point(coords, 0.01, return_length=True)
    return result

In [61]:
cluster = KMeans(n_clusters=32)#DBSCAN(metric='haversine')
coords = pd.concat([train_data[['latitude','longitude']],test_data[['latitude','longitude']]],axis=0).values.tolist()
coords = [[x[0] * np.pi / 180, x[1] * np.pi / 180] for x in coords]
clusters = cluster.fit_predict(coords)



In [62]:
train_data['cluster'] = clusters[:train_data.shape[0]]
test_data['cluster'] = clusters[train_data.shape[0]:]

In [63]:
geolocator = Nominatim(user_agent="user_agent")
city_coords = {x:(geolocator.geocode(x).latitude, geolocator.geocode(x).longitude) for x in data['city'].unique()}

In [64]:
get_distance_feats(train_data)
get_distance_feats(test_data)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


  0%|          | 0/6433 [00:00<?, ?it/s]

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


  0%|          | 0/1609 [00:00<?, ?it/s]

In [65]:
disignes = get_designes_feats(pd.concat([train_data[['latitude','longitude']],test_data[['latitude','longitude']]],axis=0)).tolist()
train_data['desinges'] = disignes[:train_data.shape[0]]
test_data['desinges'] = disignes[train_data.shape[0]:]

In [66]:
#exp_dims_tric(train_data)
#exp_dims_tric(test_data)
#crt_crds(train_data)
#crt_crds(test_data)

In [78]:
cb_params = {
    'iterations': 1000,
    'learning_rate': 0.05,
    'loss_function': 'RMSE',
    'max_depth': 6,
    'eval_metric': 'RMSE',
    'use_best_model':True,
    'task_type': 'CPU',
    'random_seed': 56,
}

params = {
    'cb_params':cb_params,
    'lgb_params': None,
    'xgb_params': None,
}

cat_cols = ['city','has_balcony','cluster']
label_col = 'price'
drop_cols = [
    'cand_dist',
    'cand_idxs',
    'neigh_label',
    'center_city_cords',
    'neigh_label_otn',
    'price_of_1m2'
]

In [79]:
class EnsembleClassifier(BaseEstimator):
    def __init__(self,cb_params,lgb_params,xgb_params):
        self.cbm = CatBoostClassifier(**cb_params)
        #self.lgbm = lgb.LGBMClassifier(**lgb_params)
        #self.xgbm = xgb.XGBClassifier(**xgb_params)
    
    def fit(self,X,y,X_val,y_val,cat_features=None,verbose=False):
        train_pool = Pool(X,label=y,cat_features=cat_features)
        eval_pool = Pool(X_val,label=y_val,cat_features=cat_features)
        self.cbm.fit(train_pool,eval_set=eval_pool,verbose=verbose)
        #self.lgbm.fit(X, y,eval_set=[(X_val,y_val)],categorical_feature=cat_features,callbacks=[lgb.log_evaluation(verbose)],eval_metric=lgb_f1_score)
        #self.xgbm.fit(X,y,eval_set=[(X_val,y_val)],verbose=False)
    
    def predict_proba(self,X_test,cat_features):
        test_pool = Pool(X_test,cat_features=cat_features)
        cb_preds = self.cbm.predict_proba(test_pool)[:,1]
        #lgb_preds = self.lgbm.predict_proba(X_test)[:,1]
        #xgb_preds = self.xgbm.predict_proba(X_test)[:,1]
        return lgb_preds
    
    def predict(self,X_test,cat_features):
        test_pool = Pool(X_test,cat_features=cat_features)
        cb_preds = self.cbm.predict(test_pool)
        #lgb_preds = self.lgbm.predict_proba(X_test)
        #xgb_preds = self.xgbm.predict_proba(X_test)[:,1]
        return cb_preds
    
class EnsembleRegressor(BaseEstimator):
    def __init__(self,cb_params,lgb_params,xgb_params):
        self.cbm = CatBoostRegressor(**cb_params)
        #self.lgbm = lgb.LGBMClassifier(**lgb_params)
        #self.xgbm = xgb.XGBClassifier(**xgb_params)
    
    def fit(self,X,y,X_val,y_val,cat_features=None,verbose=False):
        train_pool = Pool(X,label=y,cat_features=cat_features)
        eval_pool = Pool(X_val,label=y_val,cat_features=cat_features)
        self.cbm.fit(train_pool,eval_set=eval_pool,verbose=verbose)
        #self.lgbm.fit(X, y,eval_set=[(X_val,y_val)],categorical_feature=cat_features,callbacks=[lgb.log_evaluation(verbose)],eval_metric=lgb_f1_score)
        #self.xgbm.fit(X,y,eval_set=[(X_val,y_val)],verbose=False)
        
    def predict(self,X_test,cat_features):
        test_pool = Pool(X_test,cat_features=cat_features)
        cb_preds = self.cbm.predict(test_pool)
        #lgb_preds = self.lgbm.predict_proba(X_test)
        #xgb_preds = self.xgbm.predict_proba(X_test)[:,1]
        return cb_preds

In [80]:
class CustomBoostKfoldWraper(BaseEstimator):
    def __init__(self,num_folds,num_repits,params,random_state=56,score_func=None):
        self.models = []
        self.params = params
        self.random_state = random_state
        self.num_folds = num_folds
        self.num_repits = num_repits
        self.score_func = score_func
        
    def fit(self,train_data,cat_features=None,drop_cols=None,label_col=None,verbose=False):
        self.scores = []
        
        for i in trange(self.num_repits):
            kfold = StratifiedKFold(self.num_folds,random_state=self.random_state+i,shuffle=True)
            for train_index, test_index in (kfold.split(train_data,train_data[label_col])):
                train_df = train_data.iloc[train_index]
                test_df = train_data.iloc[test_index]
                
                model = EnsembleRegressor(**self.params)
                model.fit(
                    X = train_df.drop([label_col]+drop_cols,axis=1),
                    y = train_df[label_col],
                    X_val = test_df.drop([label_col]+drop_cols,axis=1),
                    y_val = test_df[label_col],
                    cat_features = cat_features,
                    verbose = verbose
                )
                cb_preds = model.predict(test_df.drop([label_col]+drop_cols,axis=1),cat_features)
                avg_preds =  cb_preds#(cb_preds + lgb_preds) / 2
                self.scores += [[
                    self.score_func(
                        test_df[label_col],
                        avg_preds
                    )
                ]]
                print(self.scores[-1])
                self.models += [model]
                
        print(f"Total Score {np.mean([x[0] for x in self.scores])}")
            
    def predict(self,test_data,drop_cols=None,cat_features=None):
        preds = np.mean([
            model.predict(test_data.drop(drop_cols,axis=1),cat_features=cat_features)
            for model in self.models
        ],axis=0)
        return preds
    
    def get_feature_importance(self,type='FeatureImportance'):
        imp_0 = self.models[0].cbm.get_feature_importance(prettified=True,type=type).set_index('Feature Id')
        for i in range(1,len(self.models)):
            imp_0 += self.models[i].cbm.get_feature_importance(prettified=True,type=type).set_index('Feature Id')
        return (imp_0 / len(self.models)).sort_values(by='Importances')[::-1]

In [81]:
model = CustomBoostKfoldWraper(
    num_folds=5,
    num_repits=5,
    params=params,
    random_state=56,
    score_func=partial(mean_squared_error,squared=False)
)

In [82]:
model.fit(
    train_data=train_data,
    cat_features=cat_cols,
    drop_cols=drop_cols,
    label_col=label_col,
    verbose=350,
)

  0%|          | 0/5 [00:00<?, ?it/s]



0:	learn: 4212676.8629753	test: 3533237.0365576	best: 3533237.0365576 (0)	total: 9.94ms	remaining: 9.93s
350:	learn: 724376.5789595	test: 1072837.7508063	best: 1072837.7508063 (350)	total: 2.75s	remaining: 5.08s
700:	learn: 557702.4925681	test: 1050473.9210394	best: 1050436.8374728 (677)	total: 5.43s	remaining: 2.32s
999:	learn: 473973.2712141	test: 1040459.0609532	best: 1040397.8468673 (998)	total: 8.38s	remaining: 0us

bestTest = 1040397.847
bestIteration = 998

Shrink model to first 999 iterations.
[1040397.8468673484]
0:	learn: 3951748.7016422	test: 4625214.0357888	best: 4625214.0357888 (0)	total: 8.66ms	remaining: 8.65s
350:	learn: 733846.5312968	test: 1587251.4538809	best: 1581382.8728113 (277)	total: 2.59s	remaining: 4.79s
700:	learn: 559073.3632944	test: 1578522.8115668	best: 1577001.8267856 (572)	total: 5.28s	remaining: 2.25s
999:	learn: 479719.9703320	test: 1579703.9723228	best: 1577001.8267856 (572)	total: 7.55s	remaining: 0us

bestTest = 1577001.827
bestIteration = 572

Shr



0:	learn: 4212093.5620212	test: 3534403.7381967	best: 3534403.7381967 (0)	total: 9.69ms	remaining: 9.68s
350:	learn: 727638.3264857	test: 1092459.1659997	best: 1092148.0925279 (346)	total: 2.72s	remaining: 5.03s
700:	learn: 551717.2419911	test: 1061919.0412743	best: 1061436.6630928 (690)	total: 5.35s	remaining: 2.28s
999:	learn: 478430.2601955	test: 1053671.8029875	best: 1053671.8029875 (999)	total: 7.6s	remaining: 0us

bestTest = 1053671.803
bestIteration = 999

[1053671.8029875138]
0:	learn: 3951071.6451374	test: 4633386.7652543	best: 4633386.7652543 (0)	total: 7.85ms	remaining: 7.84s
350:	learn: 724109.3187318	test: 1587301.9378417	best: 1585247.8625947 (272)	total: 2.59s	remaining: 4.79s
700:	learn: 556872.8169236	test: 1568002.0903421	best: 1568002.0903421 (700)	total: 5.28s	remaining: 2.25s
999:	learn: 475940.9154506	test: 1561867.3043783	best: 1561867.3043783 (999)	total: 7.7s	remaining: 0us

bestTest = 1561867.304
bestIteration = 999

[1561867.3043783235]
0:	learn: 3956707.4662



0:	learn: 4212350.0980690	test: 3532403.2431854	best: 3532403.2431854 (0)	total: 10.2ms	remaining: 10.2s
350:	learn: 749634.0497588	test: 1082163.5742045	best: 1081710.7228962 (335)	total: 2.64s	remaining: 4.88s
700:	learn: 558070.9428629	test: 1048229.7144284	best: 1048229.7144284 (700)	total: 5.26s	remaining: 2.24s
999:	learn: 479160.3052704	test: 1039557.6282274	best: 1039151.0383507 (970)	total: 7.51s	remaining: 0us

bestTest = 1039151.038
bestIteration = 970

Shrink model to first 971 iterations.
[1039151.0383507225]
0:	learn: 3951830.1623319	test: 4627435.7149145	best: 4627435.7149145 (0)	total: 8.61ms	remaining: 8.6s
350:	learn: 734734.0241772	test: 1522921.1118342	best: 1522921.1118342 (350)	total: 2.56s	remaining: 4.73s
700:	learn: 563255.8965191	test: 1487561.5581523	best: 1487068.1832504 (683)	total: 5.21s	remaining: 2.22s
999:	learn: 476877.0633605	test: 1487567.5716523	best: 1486024.6294256 (922)	total: 7.44s	remaining: 0us

bestTest = 1486024.629
bestIteration = 922

Shri



0:	learn: 4212571.7315346	test: 3534506.8793813	best: 3534506.8793813 (0)	total: 7.85ms	remaining: 7.85s
350:	learn: 733153.5414813	test: 1084461.6729768	best: 1084282.8698905 (320)	total: 2.67s	remaining: 4.93s
700:	learn: 561409.5298478	test: 1058722.1736260	best: 1058701.4787375 (699)	total: 5.34s	remaining: 2.28s
999:	learn: 478865.4662565	test: 1045914.1364050	best: 1045914.1364050 (999)	total: 7.56s	remaining: 0us

bestTest = 1045914.136
bestIteration = 999

[1045914.1364049865]
0:	learn: 3951248.7828889	test: 4633607.6491397	best: 4633607.6491397 (0)	total: 8.01ms	remaining: 8.01s
350:	learn: 719526.4861015	test: 1583713.9851960	best: 1583713.9851960 (350)	total: 2.54s	remaining: 4.7s
700:	learn: 556375.4646952	test: 1568737.7124092	best: 1568592.3214327 (687)	total: 5.14s	remaining: 2.19s
999:	learn: 477115.6069807	test: 1558051.7801064	best: 1557908.1140954 (985)	total: 7.66s	remaining: 0us

bestTest = 1557908.114
bestIteration = 985

Shrink model to first 986 iterations.
[155



0:	learn: 4212163.2843306	test: 3533880.3692781	best: 3533880.3692781 (0)	total: 10.2ms	remaining: 10.2s
350:	learn: 739161.4117100	test: 1031691.7090542	best: 1031533.3956461 (349)	total: 2.67s	remaining: 4.93s
700:	learn: 557878.3958853	test: 1000025.3034542	best: 1000025.3034542 (700)	total: 5.27s	remaining: 2.25s
999:	learn: 473997.2731402	test: 987574.0190719	best: 987205.0863409 (989)	total: 7.59s	remaining: 0us

bestTest = 987205.0863
bestIteration = 989

Shrink model to first 990 iterations.
[987205.0863408992]
0:	learn: 3951193.8210588	test: 4633576.2424217	best: 4633576.2424217 (0)	total: 9.88ms	remaining: 9.87s
350:	learn: 724562.5381674	test: 1571288.8411822	best: 1561699.6821619 (176)	total: 2.92s	remaining: 5.39s
700:	learn: 561851.9163054	test: 1563020.4485279	best: 1561699.6821619 (176)	total: 5.52s	remaining: 2.35s
999:	learn: 475189.3403495	test: 1558660.4369409	best: 1557610.7107299 (836)	total: 7.75s	remaining: 0us

bestTest = 1557610.711
bestIteration = 836

Shrink

In [74]:
model.get_feature_importance()

Unnamed: 0_level_0,Importances
Feature Id,Unnamed: 1_level_1
area,54.5426
mean_label@5_otn,4.666252
area_kitchen,3.4484
mean_label@25_otn,3.263296
mean_label<5,3.054205
mean_label<0.5_otn,3.002844
mean_label@10_otn,2.766402
nearest_label_otn,2.599648
mean_label<1.0,2.532413
mean_label<0.5,2.480985


In [75]:
drop_cols.remove('price_of_1m2')

In [76]:
preds = model.predict(test_data,drop_cols=drop_cols,cat_features=cat_cols)

In [77]:
mean_squared_error(preds,test_data['price'],squared=False)

1252445.0520686784

In [None]:
1252445.0520686784.6403511064