In [288]:
import sys

sys.path.append('../../price-estimator-rest-api/scripts/')
import quadkey
sys.path.append('../../price-estimator-rest-api/proto.out')

import json
import logging

import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from config import *
from realty.prediction import price_prediction_pb2
from realty.offer import common_pb2
from time import time
from joblib import delayed, Parallel
from tqdm import tqdm_notebook
import requests
from sklearn.model_selection import ParameterGrid
import datetime


import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 1000)

import warnings
warnings.filterwarnings("ignore") 

## vkokhtev

In [241]:
# Load the raw offers table (tab-separated).
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 in favour of
# `on_bad_lines`; kept as-is because the pandas version used here is not visible.
offers = pd.read_csv('./rooms.tsv', sep='\t', error_bad_lines=False)

# A missing apartment flag is treated as False.
offers['is_apartment'] = offers['is_apartment'].fillna(False)

# For the remaining optional attributes a missing value is replaced with 0.
for zero_default_column in ['kitchen_area', 'living_area', 'renovation', 'balcony', 'parking']:
    offers[zero_default_column] = offers[zero_default_column].fillna(0)

In [242]:
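# NOTE: everything in this cell is disabled. When enabled, it geocodes every distinct
# `unified_address` and writes './offers_addr_loc_names_coords.tsv', the file that a
# later cell loads with pd.read_csv.
# def address_to_locality(string):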
#     try:
#         r = requests.get('http://geocode-net.datatesting.int01e.tst.maps.yandex.ru/1.x/?format=json&geocode={}&results=1'.format(string))
#         obj = r.json()['response']['GeoObjectCollection']['featureMember'][0]['GeoObject']['metaDataProperty']['GeocoderMetaData']
#     except:
#         obj = dict({'text':'geocode_error'})
#     try:
#         coord = obj['InternalToponymInfo']['Point']['pos']
#     except:
#         coord = '0 0'
#     try:
#         locality_name = obj['AddressDetails']['Country']['AdministrativeArea']['SubAdministrativeArea']['Locality']['LocalityName']
#     except:
#         try:
#             locality_name = obj['AddressDetails']['Country']['AdministrativeArea']['Locality']['LocalityName']
#         except:
#             locality_name = 'geocode_error'
#     try:
#         region = obj['AddressDetails']['Country']['AdministrativeArea']['AdministrativeAreaName']
#     except:
#         region = 'geocode_error'
#     if obj['text'] == string:
#         return [string, region, locality_name, coord]
#     else:
#         return [obj['text'], region, locality_name, coord]

# def choose_address(geo_adr, db_adr):
#     count = 0
#     if (geo_adr == db_adr) | ('ё' in (geo_adr + db_adr)):
#         return [geo_adr, count]
#     if ("ЖК" in db_adr) | ('geocode_error' in geo_adr) | (len(geo_adr.split(',')) < 2):
#         return [db_adr, count]
#     geo_pred = ' '.join(geo_adr.split(' ')[:-1])
#     db_pred = ' '.join(db_adr.split(' ')[:-1])
#     if geo_pred == db_pred:
#         count += 1
#         return [geo_adr, count]
#     return [geo_adr, count]

# offers_addess_pd = pd.unique(offers['unified_address'])

# offers_addresses = Parallel(n_jobs=40, verbose=0)(delayed(address_to_locality)(address) for address in tqdm_notebook(offers_addess_pd))

# offers_addr = pd.DataFrame(offers_addresses, columns=['geocoder_address','region','locality_name', 'coords'])
# offers_addr['offers_address'] = offers_addess_pd

# offers_addr['correct_address'], offers_addr['count'] = '', 0
# offers_addr['correct_address'], offers_addr['count'] = map(list, zip(*([choose_address(geo_adr, db_adr) for geo_adr, db_adr \
#                                 in tqdm_notebook(zip(offers_addr['geocoder_address'],offers_addr['offers_address']))])))
# dubl_offers = offers_addr[offers_addr['count'] == 1]
# dubl_offers['correct_address'] = dubl_offers['offers_address']
# offers_addr['count'] = 0
# offers_addr = pd.concat([offers_addr,dubl_offers], axis=0)
# for col in ['geocoder_address', 'offers_address', 'correct_address']:
#     offers_addr[col] = offers_addr[col].apply(lambda x: x.split(', подъезд')[0])

# offers_addr.to_csv('./offers_addr_loc_names_coords.tsv',sep='\t', index=None)

In [None]:
def get_quadkeys(lat, lon):
    """Return a pair of quadkeys for a coordinate.

    Args:
        lat: latitude passed through to ``quadkey.latlon2quadkey``.
        lon: longitude passed through to ``quadkey.latlon2quadkey``.

    Returns:
        ``(city_quadkey, region_quadkey)``: the quadkey computed at zoom 15 and
        its first 10 characters.
    """
    detailed_key = quadkey.latlon2quadkey(lat, lon, zoom=15)
    return detailed_key, detailed_key[:10]

def error_percentage(y_test, y_pred):
    """Share of predictions whose relative error against the actual value is below 15%.

    Values are compared positionally (element i of ``y_pred`` against element i of
    ``y_test``), so any mix of lists, numpy arrays and pandas Series is accepted.
    The previous implementation called ``.apply`` on the ratio and therefore only
    worked when at least one argument was a pandas Series.

    Args:
        y_test: actual values (array-like).
        y_pred: predicted values (array-like of the same length).

    Returns:
        Fraction in [0, 1] of elements with ``|y_pred / y_test - 1| < 0.15``.

    Raises:
        ZeroDivisionError: if ``y_pred`` is empty.
    """
    actual = np.asarray(y_test, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    relative_error = np.abs(predicted / actual - 1)
    within_tolerance = int((relative_error < 0.15).sum())
    return within_tolerance / predicted.shape[0]

def mean_absolute_percentage_error(y_test, y_pred):
    """Mean absolute percentage error of ``y_pred`` relative to ``y_test``.

    Both inputs are converted to numpy arrays, so the comparison is positional.

    Args:
        y_test: actual values (array-like); used as the denominator.
        y_pred: predicted values (array-like).

    Returns:
        ``mean(|(y_test - y_pred) / y_test|) * 100``.
    """
    actual = np.array(y_test)
    predicted = np.array(y_pred)
    relative_error = (actual - predicted) / actual
    return np.abs(relative_error).mean() * 100

# Address lookup table (the commented-out geocoding code earlier in this notebook
# is what writes this file).
offers_addr = pd.read_csv('./offers_addr_loc_names_coords.tsv',sep='\t')
# Remember the original offer columns so they can be re-selected after the merge.
cols = offers.columns

offers = offers.merge(offers_addr, left_on='unified_address', right_on='offers_address')

# Keep only rows with a resolved address and a known subject federation.
adr_mask = offers['correct_address'].ne('geocode_error')
sf_mask = offers['subject_federation_id'].notna()
offers = offers.loc[adr_mask & sf_mask]

# Restore the plain 'locality_name' label from the merge-suffixed column so the
# original column list can be selected, then append the geocoder-derived columns.
offers = offers.rename(columns={'locality_name_y':'locality_name'})
offers = offers[list(cols) + ['region', 'coords', 'correct_address']]

# Drop columns that are not carried into the join with the house base.
offers = offers.drop(['unified_address','total_area',
                      'subject_federation_id', 'locality_name', 'region'], axis=1)

housebase = pd.read_csv('../../data/database/data/housebase_improved_all.tsv', sep='\t')

In [248]:
# Join offers with the house base on the resolved address, parse the first
# exposition date and order rows chronologically.
df = (
    offers
    .merge(housebase, how='inner', on='correct_address')
    .assign(first_day_exposition=lambda merged: pd.to_datetime(merged['first_day_exposition']))
    .sort_values('first_day_exposition')
)

In [193]:
class PredictionModelContainer:
    """A loaded CatBoost regressor together with the feature columns it expects.

    Attributes:
        model: the estimator loaded from ``model_path``.
        factors: column names selected from the input frame before predicting.
    """

    def __init__(self, model_path, factors):
        """Load the model stored at ``model_path`` (CatBoost binary format).

        Args:
            model_path: path to the serialized model file.
            factors: list of feature column names, in the order the model expects.
        """
        # Load in two steps so the stored object is always the estimator itself,
        # without relying on load_model() returning it.
        self.model = CatBoostRegressor()
        self.model.load_model(model_path, format='catboost')
        self.factors = factors

    def predict(self, df):
        """Predict for every row of ``df`` and return the cubed model output.

        The raw model output is raised to the third power; presumably the models
        were fitted on the cube root of the price — confirm against the training code.

        Args:
            df: frame containing at least the columns listed in ``self.factors``.

        Returns:
            numpy array with one value per row of ``df``.
        """
        start_time = time()
        X = df[self.factors]
        result = np.power(self.model.predict(X), 3)
        end_time = time()

        # Lazy %-style arguments: the array is only rendered when INFO is enabled.
        logging.info('predicted value: %s', result)
        logging.info('predict time: %s', end_time - start_time)

        return result

class PricePredictor:
    """Holds one PredictionModelContainer per (region group, offer type) pair.

    Region groups are MSK_MO, SPB_LO and "regions"; offer types are rent and sell.
    Model paths and factor lists come from the ``*_MODEL_PATH`` / ``*_FACTORS``
    constants, resolved relative to the REST API model directory.
    """

    def __init__(self):
        prefix = './../../price-estimator-rest-api/model/'

        def load(relative_path, factors):
            # Build a container for a model file that lives under `prefix`.
            return PredictionModelContainer(prefix + relative_path, factors)

        self.msk_mo_rent_model = load(MSK_MO_RENT_MODEL_PATH, MSK_MO_RENT_FACTORS)
        self.msk_mo_sell_model = load(MSK_MO_SELL_MODEL_PATH, MSK_MO_SELL_FACTORS)
        self.spb_lo_rent_model = load(SPB_LO_RENT_MODEL_PATH, SPB_LO_RENT_FACTORS)
        self.spb_lo_sell_model = load(SPB_LO_SELL_MODEL_PATH, SPB_LO_SELL_FACTORS)
        self.regions_rent_model = load(REGIONS_RENT_MODEL_PATH, REGIONS_RENT_FACTORS)
        self.regions_sell_model = load(REGIONS_SELL_MODEL_PATH, REGIONS_SELL_FACTORS)

In [278]:
# Evaluate the already-trained production models on the merged offers frame `df`.
# For every (subject federation group, offer type) pair the predictions are scaled
# by each candidate factor alpha in [0, 1]; the best MAPE and the share of
# predictions within 15% at that same alpha are printed.
price_pred = PricePredictor()

pred_dict = dict()

# Candidate multiplicative corrections applied to the raw predictions.
alphas = np.linspace(0, 1, 10000)

# 0 is used here as a stand-in for "every subject federation other than 1 and 10174".
for s_f_id in [1, 10174, 0]:
    for off_t in [1, 2]:
        subject_federation_id = s_f_id
        offer_type = common_pb2.OfferType.Name(off_t)

        # Pick the model pair for the region group ...
        if subject_federation_id == MSK_MO_SUBJECT_FEDERATION_ID:
            rent_model = price_pred.msk_mo_rent_model
            sell_model = price_pred.msk_mo_sell_model
        elif subject_federation_id == PITER_LO_SUBJECT_FEDERATION_ID:
            rent_model = price_pred.spb_lo_rent_model
            sell_model = price_pred.spb_lo_sell_model
        else:
            rent_model = price_pred.regions_rent_model
            sell_model = price_pred.regions_sell_model

        # ... then the model for the offer type. Fail loudly on an unexpected type
        # instead of silently reusing the model from the previous iteration.
        if offer_type == RENT:
            model = rent_model
        elif offer_type == SELL:
            model = sell_model
        else:
            raise ValueError('unexpected offer type: {}'.format(offer_type))

        mask = df['year'] > -1
        mask = mask & (df['offer_type'] == off_t)
        if s_f_id == 0:
            mask = mask & (df['subject_federation_id'] != 1) \
                        & (df['subject_federation_id'] != 10174)
        else:
            mask = mask & (df['subject_federation_id'] == s_f_id)

        X = df[mask]

        y_test = X['last_price']
        y_pred = model.predict(X)

        values = [mean_absolute_percentage_error(y_test, y_pred*alpha) for alpha in alphas]
        # Use the alpha that actually produced the minimum: the grid step is 1/9999,
        # so argmin/10000 would be a slightly different factor.
        best_alpha = alphas[np.argmin(values)]

        print(s_f_id, offer_type, min(values), error_percentage(y_test, y_pred*best_alpha))

1 SELL 24.804450752993233 0.370595382746051
1 RENT 17.269170455309947 0.5454895913646878
10174 SELL 25.065290469053696 0.3574010654490107
10174 RENT 16.122629330314314 0.5509945877556758
0 SELL 24.15689935140137 0.3756166653819471
0 RENT 25.906599866501367 0.3599845286772019


In [285]:
def splitting_data(X, y, time_column, time):
    """Split features and target by a cut-off on ``time_column``.

    Rows with ``X[time_column] < time`` go to the train part, rows with
    ``X[time_column] >= time`` go to the test part; ``y`` is selected by the
    index labels of the corresponding rows.

    Args:
        X: feature frame containing ``time_column``.
        y: target Series indexed like ``X``.
        time_column: name of the column to compare against the cut-off.
        time: cut-off value.

    Returns:
        ``(X_train, y_train, X_test, y_test)``.
    """
    before_cutoff = X[time_column] < time
    from_cutoff = X[time_column] >= time
    X_train, X_test = X[before_cutoff], X[from_cutoff]
    return X_train, y.loc[X_train.index], X_test, y.loc[X_test.index]

# Cut-offs for the time-based splits: training_model() holds out rows at or after
# split_date_test, pipeline() holds out rows at or after split_date_val.
# NOTE(review): these strings are day/month-ambiguous. With pandas' default
# dayfirst=False they presumably parse as 2018-05-05 and 2018-09-05 — confirm that
# this is the intent (an explicit format= would remove the ambiguity).
split_date_test = pd.to_datetime('05.05.2018')
split_date_val = pd.to_datetime('09.05.2018')

def down_iter(size):
    """Return ``size`` shares that halve at each step and sum to 1.

    Each share is half of what is still unassigned, and the last share is
    repeated so the total is 1, e.g. ``down_iter(5)`` gives
    ``[0.5, 0.25, 0.125, 0.0625, 0.0625]``.

    Built iteratively; the previous recursive version never terminated for
    ``size < 1`` and rebuilt the whole list at every level.

    Args:
        size: number of shares, an integer >= 1.

    Returns:
        list of ``size`` numbers; ``[1]`` when ``size == 1``.

    Raises:
        ValueError: if ``size`` is smaller than 1.
    """
    if size < 1:
        raise ValueError('size must be >= 1, got {}'.format(size))
    if size == 1:
        return [1]

    shares = []
    remaining = 1.0
    for _ in range(size - 1):
        remaining /= 2
        shares.append(remaining)
    # The final share takes whatever is left, which equals the previous share.
    shares.append(remaining)
    return shares

def train_catboost(model, X_train, y_train, X_test, y_test, cat_features, lr_down_rate=5):
    """Fit ``model`` in ``lr_down_rate`` stages with a shrinking learning rate.

    The model's configured ``iterations`` are divided between stages using the
    shares from ``down_iter``. At stage ``i`` the learning rate is the configured
    one divided by ``1.9 ** i``, and ``iterations`` is set to the cumulative tree
    count reached so far before calling ``fit`` again.

    NOTE(review): ``X_test`` and ``y_test`` are accepted but not used here, so no
    evaluation set is passed to ``fit`` — confirm whether ``early_stopping_rounds``
    has any effect without one.

    Args:
        model: estimator exposing ``get_params`` / ``set_params`` / ``fit``.
        X_train: training features; iterating over it must yield column names.
        y_train: training target.
        X_test: unused.
        y_test: unused.
        cat_features: collection of column names to treat as categorical.
        lr_down_rate: number of training stages.

    Returns:
        The same ``model`` object after the last ``fit`` call.
    """
    total_iterations = model.get_params()['iterations']
    base_learning_rate = model.get_params()['learning_rate']

    # Share of the total tree budget given to every stage, e.g. for 5 stages:
    # 0.5, 0.25, 0.125, 0.0625, 0.0625.
    stage_shares = np.array(down_iter(lr_down_rate))
    stage_rates = [base_learning_rate / (1.9 ** stage) for stage in range(lr_down_rate)]
    stage_trees = (stage_shares * total_iterations).astype(int)

    # fit() receives the positions of the categorical columns rather than their names.
    cat_feature_positions = [pos for pos, name in enumerate(X_train) if name in cat_features]

    trees_so_far = 0
    for n_trees, lr in zip(stage_trees, stage_rates):
        trees_so_far += n_trees
        print(n_trees, lr)
        model.set_params(iterations=trees_so_far, learning_rate=lr)
        model.fit(X_train, y_train, metric_period=300,
                  early_stopping_rounds=1, cat_features=cat_feature_positions)
    return model

def training_model(X, y, model_parameters, cat_columns):
    """Train one CatBoost model on a time-based split and score it on the held-out part.

    ``X`` is split on 'first_day_exposition' at the global ``split_date_test``;
    that column is then removed from both parts. ``y`` is expected on the
    cube-root scale: predictions and the held-out target are cubed before the
    metrics are computed.

    The caller's ``model_parameters`` dict is no longer modified. The previous
    version deleted keys from it, so passing the same dict twice raised KeyError.

    Args:
        X: feature frame including 'first_day_exposition'.
        y: target Series indexed like ``X``.
        model_parameters: CatBoostRegressor keyword arguments plus the required
            extra key 'lr_count' (number of learning-rate stages). When
            'lr_count' is 1, 'save_snapshot' is dropped if present.
        cat_columns: names of the categorical feature columns.

    Returns:
        ``(trained_model, ep, mape)``: the fitted model, the share of held-out
        predictions within 15%, and the held-out MAPE.

    Raises:
        KeyError: if 'lr_count' is missing from ``model_parameters``.
    """
    X_train, y_train, X_test, y_test = splitting_data(X, y, 'first_day_exposition', split_date_test)
    X_train = X_train.drop('first_day_exposition', axis=1)
    X_test = X_test.drop('first_day_exposition', axis=1)

    # Work on a copy so the caller's dict keeps 'lr_count' / 'save_snapshot'.
    catboost_params = dict(model_parameters)
    lr_count = catboost_params.pop('lr_count')
    if lr_count == 1:
        # Single-stage training does not resume between fit() calls.
        catboost_params.pop('save_snapshot', None)

    model = CatBoostRegressor(**catboost_params)
    trained_model = train_catboost(model, X_train, y_train, X_test, y_test, cat_columns, lr_down_rate=lr_count)

    # Back from the cube-root scale to prices before scoring.
    y_pred = np.power(trained_model.predict(X_test), 3)
    y_test = np.power(y_test, 3)
    ep = error_percentage(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    print(mape)
    return trained_model, ep, mape

def pipeline(X, offer_type, sfId_arr, columns, cat_columns, model_parameters):
    """Run a hyper-parameter sweep for one offer type and one group of regions.

    Rows are kept when their 'subject_federation_id' is 0 or one of ``sfId_arr``
    and their 'offer_type' matches. The target is the cube root of 'last_price'.
    Rows at or after the global ``split_date_val`` form the validation part; the
    rest is passed to ``training_model``, which does its own train/test split.

    Fixes relative to the previous version:
      * ``os`` is imported here, so the snapshot is really removed between
        runs (the NameError used to be hidden by a bare ``except``);
      * validation metrics are called as ``(actual, predicted)``, matching the
        metric signatures and the test metrics from ``training_model``;
      * ``trained_model`` is ``None`` instead of unbound when the grid is empty.

    Args:
        X: full data frame with 'subject_federation_id', 'offer_type',
            'last_price' and every name in ``columns``.
        offer_type: value compared against the 'offer_type' column.
        sfId_arr: iterable of subject federation ids to include (0 is always included).
        columns: feature columns to keep; must contain 'first_day_exposition'.
        cat_columns: names of the categorical feature columns.
        model_parameters: iterable of parameter dicts, one per model to train.

    Returns:
        ``(results, ep_best, mape_best, trained_model)`` where ``results`` is a
        list of ``[model_name, mape, mape_val, ep, ep_val]`` rows, ``ep_best`` /
        ``mape_best`` are ``[test_metric, val_metric, model_name]`` for the best
        test score, and ``trained_model`` is the last model trained (or None).
    """
    import os  # local import: the notebook's import cell does not import os

    mask = (X['subject_federation_id'] == 0)
    for i in sfId_arr:
        mask = mask | (X['subject_federation_id'] == i)
    pool = X[mask]

    pool = pool[pool['offer_type'] == offer_type]
    target = np.cbrt(pool['last_price'])
    pool = pool[columns]
    pool_train, target_train, pool_val, target_val = splitting_data(
        pool, target, 'first_day_exposition', split_date_val)
    pool_val = pool_val.drop('first_day_exposition', axis=1)
    # Validation metrics are computed on the price scale, not the cube-root scale.
    target_val = np.power(target_val, 3)

    results = []
    mape_best = [100, 100, '']
    ep_best = [0, 0, '']
    trained_model = None

    for params in tqdm_notebook(model_parameters):
        model_name = '_'.join([str(s) for s in params.values()])
        trained_model, ep, mape = training_model(pool_train, target_train, params, cat_columns)

        # Remove the snapshot, if one was written, so the next parameter set
        # does not resume from this one (presumably CatBoost's default snapshot
        # location — confirm against the train_dir in use).
        try:
            os.remove('./catboost_info/experiment.cbsnapshot')
        except FileNotFoundError:
            pass

        val_predict = np.power(trained_model.predict(pool_val), 3)
        mape_val = mean_absolute_percentage_error(target_val, val_predict)
        ep_val = error_percentage(target_val, val_predict)

        # Model selection is driven by the test metrics; validation is reported alongside.
        if mape <= mape_best[0]:
            mape_best = [mape, mape_val, model_name]

        if ep >= ep_best[0]:
            ep_best = [ep, ep_val, model_name]

        results.append([model_name, mape, mape_val, ep, ep_val])

    return results, ep_best, mape_best, trained_model

# Hyper-parameter grids consumed by train(): the "sell" grid when offer_type == 1,
# the "rent" grid otherwise. 'lr_count' is not a CatBoost argument: training_model()
# removes it (and 'save_snapshot' when lr_count == 1) before building the model.
# NOTE(review): every learning rate above 1 makes CatBoost emit "learning rate is
# greater than 1. You probably need to decrease learning rate." (see the recorded
# output of the training cell) — confirm that these values are intended.
param_grid_catboost_sell = ParameterGrid({
    'save_snapshot': [True],
    'lr_count': [1],
    'learning_rate': [50, 100, 200],
    'iterations': [1000],
    'depth': [7],
    'thread_count': [48],
    'random_seed': [0],
    'border_count': [100],
    'has_time': [True],
    'counter_calc_method': ['SkipTest'],
    'loss_function': ['MAPE'],
    'logging_level': ['Verbose'],
})

param_grid_catboost_rent = ParameterGrid({
    'save_snapshot': [True],
    'lr_count': [1],
    'learning_rate': [1, 3, 10, 20],
    'iterations': [1000],
    'depth': [4, 7, 9],
    'thread_count': [48],
    'random_seed': [0],
    'border_count': [100],
    'has_time': [True],
    'counter_calc_method': ['SkipTest'],
    'loss_function': ['MAPE'],
    'logging_level': ['Silent'],
})


def train(offer_type, name, subject_arr, columns, cat_columns, data=None):
    """Run the hyper-parameter sweep for one model family and save the metrics table.

    Uses ``param_grid_catboost_sell`` when ``offer_type == 1`` and
    ``param_grid_catboost_rent`` otherwise, writes the per-model metrics to
    './results_<name>.tsv' and prints the elapsed wall-clock time.

    Args:
        offer_type: offer type code forwarded to ``pipeline``.
        name: suffix for the results file name.
        subject_arr: subject federation ids forwarded to ``pipeline``.
        columns: feature columns forwarded to ``pipeline``.
        cat_columns: categorical feature columns forwarded to ``pipeline``.
        data: frame to train on. Defaults to the notebook-global ``X``, which is
            what this function always used before; passing it explicitly removes
            the dependence on cell execution order.

    Returns:
        Always 0.
    """
    if data is None:
        # Backward-compatible fallback to the global frame defined in another cell.
        data = X

    start_time = time()
    params = param_grid_catboost_sell if offer_type == 1 else param_grid_catboost_rent

    results, _, _, _ = pipeline(data, offer_type, subject_arr, columns, cat_columns, params)
    results = pd.DataFrame(results, columns=['model_name','MAPE_test','MAPE_val','EP_test','EP_val'])
    results.to_csv('./results_{}.tsv'.format(name), sep='\t',index=None)

    end_time = time()

    print(str(datetime.timedelta(seconds=end_time-start_time)))

    return 0

In [286]:
# Working copy of the merged frame `df`; train() reads this global `X` by name,
# so this cell has to run before train() is called.
X = df.copy()

In [287]:
# Subject federation ids for the "regions" model: every id present in the data
# except 1 and 10174, which are evaluated as separate groups earlier in the notebook.
sfID = list(X['subject_federation_id'].unique())
for separately_modelled_id in (1, 10174):
    sfID.remove(separately_modelled_id)

# Features of the existing regions rent model plus the date column used for splitting.
columns_rent_reg = price_pred.regions_rent_model.factors + ['first_day_exposition']
cat_columns_rent_reg = [
    'studio',
    'is_apartment',
    'rooms_offered',
    'renovation',
    'balcony',
    'series_name',
    'building_type_str',
    'heatingType',
    'expectDemolition',
    'subject_federation_id',
    'locality_name',
    'city_quadkey',
    'region_quadkey',
]

# offer_type 2 selects the rent grid inside train().
code = train(2, 'rent_reg', sfID, columns_rent_reg, cat_columns_rent_reg)

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))

1000 1.0
16.888883582431365
1000 3.0


learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.


16.835884523267964
1000 10.0


learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.


17.717888995871885
1000 20.0


learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.


19.559446094840233
1000 1.0
16.741488623503802
1000 3.0


learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.


16.711187965912472
1000 10.0


learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.


18.76219806919209
1000 20.0


learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.


24.44779057958254
1000 1.0
16.717177915931234
1000 3.0


learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.


16.798432454399986
1000 10.0


learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.


19.736726148694743
1000 20.0


learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.
learning rate is greater than 1. You probably need to decrease learning rate.


27.745902691509045
0:08:19.077208
