## Model hosted here https://russian-housing-market.herokuapp.com/

In [56]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.pipeline import make_pipeline
from sklearn.base import TransformerMixin
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib
import geopy
from geopy.geocoders import Yandex
import pickle
import os

df = pd.read_csv('./data/train.csv')

y = df['price_doc'].values
df = df[['full_sq', 'life_sq', 'build_year', 'num_room', 'sub_area']]

df.head(10)

Unnamed: 0,full_sq,life_sq,build_year,num_room,sub_area
0,43,27.0,,,Bibirevo
1,34,19.0,,,Nagatinskij Zaton
2,43,29.0,,,Tekstil'shhiki
3,89,50.0,,,Mitino
4,77,77.0,,,Basmannoe
5,67,46.0,,,Nizhegorodskoe
6,25,14.0,,,Sokol'niki
7,44,44.0,,,Bibirevo
8,42,27.0,,,Koptevo
9,36,21.0,,,Kuncevo


In [57]:
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.1, random_state=42)

In [58]:
geolocator = Yandex()

def do_geocode(address, trial=0):
    try:
        location = geolocator.geocode(address)
        if location == None:
            return "NaN", "NaN"
        return location.latitude, location.longitude
    except Exception as e:
        if trial > 3:
            return "NaN", "NaN"
        else:
            return do_geocode(address, trial + 1)
        time.sleep(2)
        
def load_pickle(path):
    with open(path, 'rb') as f:
        return pickle.load(f)
    
def save_pickle(obj, path):
    with open(path, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

In [59]:
class GeoDataTransformer(TransformerMixin):
    def __init__(self):
        self.cache = {}
        self.pickle_path = './data/locations.pkl'
        self.imp = Imputer(strategy='median')
        
        if os.path.isfile(self.pickle_path):
            self.cache = load_pickle(self.pickle_path)

    def fit(self, df, y=None):
        return self
                
    def transform(self, df):
        df_copy = df.copy()
        locations = []
        for sub_area in df['sub_area']:
            if sub_area in self.cache:
                latitude, longitude = self.cache[sub_area]
                if latitude == 'NA' or longitude == 'NA':
                    locations.append(['NaN', 'NaN'])
                else:
                    locations.append([latitude, longitude])
            else:
                latitude, longitude = do_geocode(sub_area)
                self.cache[sub_area] = [latitude, longitude]
                locations.append([latitude, longitude])
        
        locations = np.array(locations)
        locations = self.imp.fit_transform(locations)
        
        lat = locations[:, 0]
        lon = locations[:, 1]

        df_copy['x'] = np.cos(lat) * np.cos(lon)
        df_copy['y'] = np.cos(lat) * np.sin(lon)
        df_copy['z'] = np.sin(lat)
        
        df_copy = df_copy.drop(columns=['sub_area'])
        
        if not os.path.isfile(self.pickle_path):
            save_pickle(cache, self.pickle_path)
            
        return df_copy

class MyRegression(TransformerMixin):
    def fit(self, df, y=None):
        self.model = RidgeCV(alphas=(0.01, 0.1, 0.5, 1, 5, 10, 50, 100, 500))
        self.model.fit(df, y)
        return self
    
    def transform(self, df):
        df_copy = df.copy()
        values = self.model.predict(df)
        df_copy = np.column_stack([df_copy, values])
        return df_copy
    
    
class MyXGBoost(TransformerMixin):
    def fit(self, df, y=None):
        xgb_model = xgb.XGBRegressor()
        params = {
            'gamma': [0.5, 1, 2],
            'max_depth': [3, 4, 5]
        }
        grid_search = GridSearchCV(
            xgb_model,
            param_grid=params,
            n_jobs=4
        )
        self.model = grid_search
        self.model.fit(df, y)
        return self
    
    def transform(self, df):
        df_copy = df.copy()
        values = self.model.predict(df)
        df_copy = np.column_stack([df_copy, values])
        return df_copy

pipe = make_pipeline(
    GeoDataTransformer(),
    Imputer(strategy='mean'),
    StandardScaler(),
    MyRegression(),
    MyXGBoost(),
    xgb.XGBRegressor(max_depth=3)
)

model = pipe.fit(X_train, y_train)

Exception ignored in: <bound method DMatrix.__del__ of <xgboost.core.DMatrix object at 0x7f95e062fd30>>
Traceback (most recent call last):
  File "/home/andranik/anaconda3/lib/python3.6/site-packages/xgboost/core.py", line 366, in __del__
    if self.handle is not None:
AttributeError: 'DMatrix' object has no attribute 'handle'


In [60]:
predictions = np.abs(model.predict(X_test))
mean_squared_log_error(y_test, predictions)

0.24812860962807196

In [61]:
model.MyRegression = MyRegression

joblib.dump(model, 'model.pkl')

['model.pkl']