In [1]:
!wget https://storage.yandexcloud.net/ds-ods/files/files/afad66cf/df_train.csv

--2024-11-10 11:47:46--  https://storage.yandexcloud.net/ds-ods/files/files/afad66cf/df_train.csv
Resolving storage.yandexcloud.net (storage.yandexcloud.net)... 213.180.193.243, 2a02:6b8::1d9
Connecting to storage.yandexcloud.net (storage.yandexcloud.net)|213.180.193.243|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 520095 (508K) [text/csv]
Saving to: 'df_train.csv'


2024-11-10 11:47:46 (1.78 MB/s) - 'df_train.csv' saved [520095/520095]



In [50]:
!git clone https://github.com/nalgeon/metro.git

Cloning into 'metro'...
remote: Enumerating objects: 594, done.[K
remote: Counting objects: 100% (74/74), done.[K
remote: Compressing objects: 100% (46/46), done.[K
remote: Total 594 (delta 54), reused 46 (delta 28), pack-reused 520 (from 1)[K
Receiving objects: 100% (594/594), 532.33 KiB | 11.09 MiB/s, done.
Resolving deltas: 100% (466/466), done.


In [103]:
import numpy as np
import pandas as pd
from tqdm.auto import trange, tqdm
import multiprocessing as mp
from sklearn.cluster import KMeans, DBSCAN
from functools import partial
import json
from catboost import CatBoostClassifier, Pool, cv, CatBoostRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numba
import gc
from math import radians, sin, cos, asin, sqrt, pi, log2
from sklearn.model_selection import train_test_split, StratifiedKFold
from geopy.distance import geodesic, distance, great_circle, lonlat
from geopy.geocoders import Nominatim
from sklearn.base import BaseEstimator

In [104]:
#@numba.jit()
def haversine(origin, destination, units='km'):
    # Radian deltas
    origin_lat = radians(float(origin[0]))
    origin_lon = radians(float(origin[1]))
    destination_lat = radians(float(destination[0]))
    destination_lon = radians(float(destination[1]))
    lat_delta = destination_lat - origin_lat
    lon_delta = destination_lon - origin_lon

    # Radius of earth in meters
    r = 6378127

    # Haversine formula
    a = sin(lat_delta / 2) ** 2 + cos(origin_lat) * \
        cos(destination_lat) * sin(lon_delta / 2) ** 2
    c = 2 * asin(sqrt(a))
    meters_traveled = c * r

    scaling_factors = {
        "m:": 1,
        "km": 1 / 1000,
        "ft": 3.2808,  # meters to feet
        "mi:": 0.000621371  # meters to miles
    }

    return meters_traveled * scaling_factors[units]

def euclid(c1,c2):
    (x0,y0), (x1,y1) = c1,c2
    return ((x1-x0)**2 + (y1-y0)**2)**0.5

def set_from_iterator(j,mat,coords,i):
    mat[i,j] = mat[j,i] = great_circle(coords[i],coords[j]).km
    return 0

def build_distance_matrix(coords):
    mat = np.ones((len(coords),len(coords))) * np.inf
    for i in trange(len(coords)):
        for j in range(i+1,len(coords)):
            mat[i,j] = mat[j,i] = haversine(coords[i],coords[j])
    return mat

def build_distance_matrix_test(coords_tr,coords_ts):
    mat = np.ones((len(coords_ts),len(coords_tr)))
    for i in trange(len(coords_ts)):
        for j in range(i+1,len(coords_tr)):
            mat[i,j] = haversine(coords_ts[i],coords_tr[j])
    return mat

In [105]:
metro_data = pd.read_csv('/kaggle/working/metro/data/metro.ru.csv')
metro_data = metro_data[['lat','lng']].values.tolist()

In [106]:
data = pd.read_csv('/kaggle/working/df_train.csv')
train_data, test_data = train_test_split(data,test_size=0.2,random_state=56)

In [107]:
dist_mat = build_distance_matrix(train_data[['latitude','longitude']].values.tolist())

  0%|          | 0/6433 [00:00<?, ?it/s]

In [108]:
test_dist_mat = build_distance_matrix_test = build_distance_matrix_test(
    train_data[['latitude','longitude']].values.tolist(),
    test_data[['latitude','longitude']].values.tolist(),
)

  0%|          | 0/1609 [00:00<?, ?it/s]

In [109]:
def get_ranks(data,mat,topk=100):
    idxes = np.argsort(mat,axis=1)
    top_idx = [i[:topk] for i in idxes]
    top_dists = [row[i[:topk]] for i,row in zip(idxes,mat)]
    data['cand_dist'] = top_dists
    data['cand_idxs'] = top_idx
    
get_ranks(train_data,dist_mat)
get_ranks(test_data,test_dist_mat)

In [110]:
train_data['price_of_1m2'] = train_data['price']
train_label_maper = dict(zip(range(len(train_data)),train_data['price']))
train_otn_label_maper = dict(zip(range(len(train_data)),train_data['price_of_1m2']))

In [111]:
def nearest_metro(row):
    return np.min([haversine((row['latitude'],row['longitude']),mtr) for mtr in metro_data])

def agg_from_2cols(row,dist_pref=0.0,agg=np.mean):
    return agg([i for i,j in zip(row['neigh_label'],row['cand_dist']) if j <= dist_pref])

def agg_from_2cols_otn(row,dist_pref=0.0,agg=np.mean):
    return agg([i for i,j in zip(row['neigh_label_otn'],row['cand_dist']) if j <= dist_pref])

def get_distance_feats(data):
    data['neigh_label'] = data['cand_idxs'].apply(lambda x: [train_label_maper[j] for j in x])
    data['neigh_label_otn'] = data['cand_idxs'].apply(lambda x: [train_otn_label_maper[j] for j in x])
    data['min_distance'] =  data['cand_dist'].apply(lambda x: min(x))
    data['mean_distance'] =  data['cand_dist'].apply(lambda x: np.mean(x))
    data['nearest_label'] =  data['neigh_label'].apply(lambda x: x[0])
    data['nearest_label_otn'] =  data['neigh_label_otn'].apply(lambda x: x[0])
    data['distance_zero_cnt'] =  data['cand_dist'].apply(lambda x: x.tolist().count(0.0))
    
    data['mean_label@5'] =   data['neigh_label'].apply(lambda x: np.mean(x[:5]))
    data['mean_label@10'] =  data['neigh_label'].apply(lambda x: np.mean(x[:10]))
    data['mean_label@25'] =  data['neigh_label'].apply(lambda x: np.mean(x[:10]))
    
    data['mean_label<0'] =   data.apply(partial(agg_from_2cols,dist_pref=0.0),axis=1)
    data['mean_label<0.5'] = data.apply(partial(agg_from_2cols,dist_pref=0.5),axis=1)
    data['mean_label<5'] =   data.apply(partial(agg_from_2cols,dist_pref=5.0),axis=1)
    
    data['mean_label@5_otn'] =   data['neigh_label_otn'].apply(lambda x: np.mean(x[:5]))
    data['mean_label@10_otn'] =  data['neigh_label_otn'].apply(lambda x: np.mean(x[:10]))
    data['mean_label@25_otn'] =  data['neigh_label_otn'].apply(lambda x: np.mean(x[:10]))
    
    data['mean_label<0_otn'] =   data.apply(partial(agg_from_2cols_otn,dist_pref=0.0),axis=1)
    data['mean_label<0.5_otn'] = data.apply(partial(agg_from_2cols_otn,dist_pref=0.5),axis=1)
    data['mean_label<5_otn'] =   data.apply(partial(agg_from_2cols_otn,dist_pref=5.0),axis=1)
    
    data['center_city_cords'] = data['city'].map(city_coords)
    tqdm.pandas()
    data['nearest_metro'] = data[['latitude','longitude']].progress_apply(nearest_metro,axis=1)
    data['dist2center'] = data.apply(lambda row: haversine(row['center_city_cords'],(row['latitude'],row['longitude'])),axis=1)

In [112]:
cluster = DBSCAN(metric=haversine)
clusters = cluster.fit_predict(pd.concat([train_data[['latitude','longitude']],test_data[['latitude','longitude']]],axis=0))

In [113]:
train_data['cluster'] = clusters[:train_data.shape[0]]
test_data['cluster'] = clusters[train_data.shape[0]:]

In [114]:
geolocator = Nominatim(user_agent="user_agent")
city_coords = {x:(geolocator.geocode(x).latitude, geolocator.geocode(x).longitude) for x in data['city'].unique()}

In [115]:
get_distance_feats(train_data)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


  0%|          | 0/6433 [00:00<?, ?it/s]

In [116]:
get_distance_feats(test_data)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


  0%|          | 0/1609 [00:00<?, ?it/s]

In [117]:
cb_params = {
    'iterations': 1000,
    'learning_rate': 0.05,
    'loss_function': 'RMSE',
    'max_depth': 5,
    'eval_metric': 'RMSE',
    'use_best_model':True,
    'task_type': 'CPU',
    'random_seed': 56,
}

params = {
    'cb_params':cb_params,
    'lgb_params': None,
    'xgb_params': None,
}

cat_cols = ['city','has_balcony','cluster']
label_col = 'price'
drop_cols = [
    'cand_dist',
    'cand_idxs',
    'neigh_label',
    'center_city_cords',
    'neigh_label_otn',
    'price_of_1m2'
]

In [118]:
class EnsembleClassifier(BaseEstimator):
    def __init__(self,cb_params,lgb_params,xgb_params):
        self.cbm = CatBoostClassifier(**cb_params)
        #self.lgbm = lgb.LGBMClassifier(**lgb_params)
        #self.xgbm = xgb.XGBClassifier(**xgb_params)
    
    def fit(self,X,y,X_val,y_val,cat_features=None,verbose=False):
        train_pool = Pool(X,label=y,cat_features=cat_features)
        eval_pool = Pool(X_val,label=y_val,cat_features=cat_features)
        self.cbm.fit(train_pool,eval_set=eval_pool,verbose=verbose)
        #self.lgbm.fit(X, y,eval_set=[(X_val,y_val)],categorical_feature=cat_features,callbacks=[lgb.log_evaluation(verbose)],eval_metric=lgb_f1_score)
        #self.xgbm.fit(X,y,eval_set=[(X_val,y_val)],verbose=False)
    
    def predict_proba(self,X_test,cat_features):
        test_pool = Pool(X_test,cat_features=cat_features)
        cb_preds = self.cbm.predict_proba(test_pool)[:,1]
        #lgb_preds = self.lgbm.predict_proba(X_test)[:,1]
        #xgb_preds = self.xgbm.predict_proba(X_test)[:,1]
        return lgb_preds
    
    def predict(self,X_test,cat_features):
        test_pool = Pool(X_test,cat_features=cat_features)
        cb_preds = self.cbm.predict(test_pool)
        #lgb_preds = self.lgbm.predict_proba(X_test)
        #xgb_preds = self.xgbm.predict_proba(X_test)[:,1]
        return cb_preds
    
class EnsembleRegressor(BaseEstimator):
    def __init__(self,cb_params,lgb_params,xgb_params):
        self.cbm = CatBoostRegressor(**cb_params)
        #self.lgbm = lgb.LGBMClassifier(**lgb_params)
        #self.xgbm = xgb.XGBClassifier(**xgb_params)
    
    def fit(self,X,y,X_val,y_val,cat_features=None,verbose=False):
        train_pool = Pool(X,label=y,cat_features=cat_features)
        eval_pool = Pool(X_val,label=y_val,cat_features=cat_features)
        self.cbm.fit(train_pool,eval_set=eval_pool,verbose=verbose)
        #self.lgbm.fit(X, y,eval_set=[(X_val,y_val)],categorical_feature=cat_features,callbacks=[lgb.log_evaluation(verbose)],eval_metric=lgb_f1_score)
        #self.xgbm.fit(X,y,eval_set=[(X_val,y_val)],verbose=False)
        
    def predict(self,X_test,cat_features):
        test_pool = Pool(X_test,cat_features=cat_features)
        cb_preds = self.cbm.predict(test_pool)
        #lgb_preds = self.lgbm.predict_proba(X_test)
        #xgb_preds = self.xgbm.predict_proba(X_test)[:,1]
        return cb_preds

In [119]:
class CustomBoostKfoldWraper(BaseEstimator):
    def __init__(self,num_folds,num_repits,params,random_state=56,score_func=None):
        self.models = []
        self.params = params
        self.random_state = random_state
        self.num_folds = num_folds
        self.num_repits = num_repits
        self.score_func = score_func
        
    def fit(self,train_data,cat_features=None,drop_cols=None,label_col=None,verbose=False):
        self.scores = []
        
        for i in trange(self.num_repits):
            kfold = StratifiedKFold(self.num_folds,random_state=self.random_state+i,shuffle=True)
            for train_index, test_index in (kfold.split(train_data,train_data[label_col])):
                train_df = train_data.iloc[train_index]
                test_df = train_data.iloc[test_index]
                
                model = EnsembleRegressor(**self.params)
                model.fit(
                    X = train_df.drop([label_col]+drop_cols,axis=1),
                    y = train_df[label_col],
                    X_val = test_df.drop([label_col]+drop_cols,axis=1),
                    y_val = test_df[label_col],
                    cat_features = cat_features,
                    verbose = verbose
                )
                cb_preds = model.predict(test_df.drop([label_col]+drop_cols,axis=1),cat_features)
                avg_preds =  cb_preds#(cb_preds + lgb_preds) / 2
                self.scores += [[
                    self.score_func(
                        test_df[label_col],
                        avg_preds
                    )
                ]]
                print(self.scores[-1])
                self.models += [model]
                
        print(f"Total Score {np.mean([x[0] for x in self.scores])}")
            
    def predict(self,test_data,drop_cols=None,cat_features=None):
        preds = np.mean([
            model.predict(test_data.drop(drop_cols,axis=1),cat_features=cat_features)
            for model in self.models
        ],axis=0)
        return preds
    
    def get_feature_importance(self,type='FeatureImportance'):
        imp_0 = self.models[0].cbm.get_feature_importance(prettified=True,type=type).set_index('Feature Id')
        for i in range(1,len(self.models)):
            imp_0 += self.models[i].cbm.get_feature_importance(prettified=True,type=type).set_index('Feature Id')
        return (imp_0 / len(self.models)).sort_values(by='Importances')[::-1]

In [120]:
model = CustomBoostKfoldWraper(
    num_folds=5,
    num_repits=5,
    params=params,
    random_state=56,
    score_func=partial(mean_squared_error,squared=False)
)

In [121]:
model.fit(
    train_data=train_data,
    cat_features=cat_cols,
    drop_cols=drop_cols,
    label_col=label_col,
    verbose=350,
)

  0%|          | 0/5 [00:00<?, ?it/s]



0:	learn: 4223032.9904315	test: 3538211.8999404	best: 3538211.8999404 (0)	total: 6.19ms	remaining: 6.18s
350:	learn: 811992.6459973	test: 1182314.7339343	best: 1182314.7339343 (350)	total: 1.86s	remaining: 3.44s
700:	learn: 635340.2408915	test: 1147860.9293096	best: 1147799.5164018 (698)	total: 4.14s	remaining: 1.77s
999:	learn: 557405.9348994	test: 1128943.6745454	best: 1128627.8676813 (993)	total: 5.7s	remaining: 0us

bestTest = 1128627.868
bestIteration = 993

Shrink model to first 994 iterations.
[1128627.86768126]
0:	learn: 3945246.5344044	test: 4618059.9924620	best: 4618059.9924620 (0)	total: 5.57ms	remaining: 5.56s
350:	learn: 807021.5862602	test: 1650468.2805503	best: 1650468.2805503 (350)	total: 1.79s	remaining: 3.32s
700:	learn: 638702.9147592	test: 1621548.7532158	best: 1621548.7532158 (700)	total: 3.61s	remaining: 1.54s
999:	learn: 564951.7011367	test: 1620783.6029718	best: 1618548.2983131 (756)	total: 5.21s	remaining: 0us

bestTest = 1618548.298
bestIteration = 756

Shrink



350:	learn: 826982.9064147	test: 1144191.5426983	best: 1143895.4960069 (349)	total: 1.8s	remaining: 3.32s
700:	learn: 634698.3470569	test: 1102028.7764422	best: 1101803.7383988 (691)	total: 3.66s	remaining: 1.56s
999:	learn: 554553.6519920	test: 1086217.1149708	best: 1086217.1149708 (999)	total: 5.21s	remaining: 0us

bestTest = 1086217.115
bestIteration = 999

[1086217.114970808]
0:	learn: 3945154.0400668	test: 4619126.6338077	best: 4619126.6338077 (0)	total: 5.82ms	remaining: 5.81s
350:	learn: 820975.5677494	test: 1702865.1341075	best: 1694638.2132564 (272)	total: 1.98s	remaining: 3.67s
700:	learn: 649655.3567520	test: 1680548.8543162	best: 1680417.9993163 (690)	total: 4.12s	remaining: 1.75s
999:	learn: 570142.4798902	test: 1677634.4689336	best: 1676866.8421915 (981)	total: 5.73s	remaining: 0us

bestTest = 1676866.842
bestIteration = 981

Shrink model to first 982 iterations.
[1676866.8421914687]
0:	learn: 3955598.2269270	test: 4600633.5009827	best: 4600633.5009827 (0)	total: 7.46ms	r



350:	learn: 837413.1745820	test: 1161845.2784943	best: 1161845.2784943 (350)	total: 1.81s	remaining: 3.35s
700:	learn: 641445.4982357	test: 1115167.9857538	best: 1115092.5816136 (699)	total: 3.64s	remaining: 1.55s
999:	learn: 562184.2059697	test: 1094492.8973415	best: 1094492.8973415 (999)	total: 5.23s	remaining: 0us

bestTest = 1094492.897
bestIteration = 999

[1094492.8973414558]
0:	learn: 3944784.8997159	test: 4618502.3466492	best: 4618502.3466492 (0)	total: 5.48ms	remaining: 5.47s
350:	learn: 808414.4148587	test: 1660323.1956840	best: 1660166.5265858 (340)	total: 1.84s	remaining: 3.41s
700:	learn: 635088.0557550	test: 1642580.6055208	best: 1641358.2580702 (617)	total: 3.66s	remaining: 1.56s
999:	learn: 555369.6485117	test: 1637371.6428063	best: 1637352.8732711 (968)	total: 5.24s	remaining: 0us

bestTest = 1637352.873
bestIteration = 968

Shrink model to first 969 iterations.
[1637352.8732710897]
0:	learn: 3955553.1232204	test: 4600542.7361466	best: 4600542.7361466 (0)	total: 6.81ms



0:	learn: 4222070.4322046	test: 3536932.4796436	best: 3536932.4796436 (0)	total: 6.23ms	remaining: 6.22s
350:	learn: 835171.8634286	test: 1176141.4066737	best: 1176141.4066737 (350)	total: 1.8s	remaining: 3.33s
700:	learn: 641051.1725361	test: 1132771.6409322	best: 1132325.3740865 (683)	total: 3.61s	remaining: 1.54s
999:	learn: 556409.3390836	test: 1121037.3229562	best: 1121020.5627204 (997)	total: 5.17s	remaining: 0us

bestTest = 1121020.563
bestIteration = 997

Shrink model to first 998 iterations.
[1121020.562720365]
0:	learn: 3944942.2002505	test: 4618669.0376859	best: 4618669.0376859 (0)	total: 5.69ms	remaining: 5.68s
350:	learn: 811416.4037369	test: 1672362.7554205	best: 1672095.2433785 (250)	total: 1.82s	remaining: 3.36s
700:	learn: 641529.8184883	test: 1649576.9120633	best: 1649564.9883540 (698)	total: 3.63s	remaining: 1.55s
999:	learn: 566006.6711718	test: 1645904.0147205	best: 1645904.0147205 (999)	total: 5.19s	remaining: 0us

bestTest = 1645904.015
bestIteration = 999

[1645



0:	learn: 4222466.7215894	test: 3537837.6656274	best: 3537837.6656274 (0)	total: 7.45ms	remaining: 7.44s
350:	learn: 842291.1693498	test: 1162235.5441399	best: 1162235.5441399 (350)	total: 1.83s	remaining: 3.38s
700:	learn: 649058.0130129	test: 1119869.6319204	best: 1119869.6319204 (700)	total: 3.63s	remaining: 1.55s
999:	learn: 570266.8234301	test: 1100616.9456191	best: 1100615.9166104 (994)	total: 5.18s	remaining: 0us

bestTest = 1100615.917
bestIteration = 994

Shrink model to first 995 iterations.
[1100615.9166103841]
0:	learn: 3944993.4979619	test: 4618758.7689216	best: 4618758.7689216 (0)	total: 5.71ms	remaining: 5.7s
350:	learn: 808214.5350423	test: 1727387.1892941	best: 1726485.8194997 (315)	total: 1.78s	remaining: 3.29s
700:	learn: 635495.2267728	test: 1724278.6662233	best: 1722438.0416537 (648)	total: 3.62s	remaining: 1.54s
999:	learn: 555665.6690744	test: 1719068.0046654	best: 1718754.8705687 (975)	total: 5.16s	remaining: 0us

bestTest = 1718754.871
bestIteration = 975

Shri

In [124]:
model.get_feature_importance()

Unnamed: 0_level_0,Importances
Feature Id,Unnamed: 1_level_1
area,50.621408
mean_label@5_otn,6.022405
area_kitchen,4.594558
mean_label@25_otn,4.589347
mean_label<5_otn,3.718364
mean_label<0.5_otn,3.314126
mean_label@10_otn,3.025137
mean_label<5,2.937351
rooms_num,2.328013
nearest_label_otn,2.261272


In [125]:
train_data['area_sq'] = train_data['area'].apply(sqrt)
train_data['area_lg'] = train_data['area'].apply(log2)

In [126]:
drop_cols.remove('price_of_1m2')

In [127]:
preds = model.predict(test_data,drop_cols=drop_cols,cat_features=cat_cols)

In [128]:
mean_squared_error(preds,test_data['price'],squared=False)

1387493.9285109432