In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import LabelEncoder, Imputer, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.pipeline import make_pipeline, make_union
from sklearn.base import TransformerMixin
from sklearn.model_selection import GridSearchCV
import geopy
from geopy.geocoders import Yandex
import time
import pickle
import os

df = pd.read_csv('./data/train.csv')

y = df['price_doc'].values
df = df.drop(columns=['price_doc'])

df.head(10)

Unnamed: 0,id,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,...,cafe_count_5000_price_1500,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000
0,1,2011-08-20,43,27.0,4.0,,,,,,...,40,9,4,0,13,22,1,0,52,4
1,2,2011-08-23,34,19.0,3.0,,,,,,...,36,15,3,0,15,29,1,10,66,14
2,3,2011-08-27,43,29.0,2.0,,,,,,...,25,10,3,0,11,27,0,4,67,10
3,4,2011-09-01,89,50.0,9.0,,,,,,...,15,11,2,1,4,4,0,0,26,3
4,5,2011-09-05,77,77.0,4.0,,,,,,...,552,319,108,17,135,236,2,91,195,14
5,6,2011-09-06,67,46.0,14.0,,,,,,...,155,62,14,1,53,78,1,20,113,17
6,7,2011-09-08,25,14.0,10.0,,,,,,...,144,81,16,3,38,80,1,27,127,8
7,8,2011-09-09,44,44.0,5.0,,,,,,...,36,9,4,0,11,18,1,0,47,4
8,9,2011-09-10,42,27.0,5.0,,,,,,...,69,19,8,1,18,34,1,3,85,11
9,10,2011-09-13,36,21.0,9.0,,,,,,...,30,19,13,0,10,20,1,3,67,1


In [2]:
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.3, random_state=42)

In [3]:
geolocator = Yandex()

def do_geocode(address, trial=0):
    try:
        location = geolocator.geocode(address)
        if location == None:
            return "NaN", "NaN"
        return location.latitude, location.longitude
    except Exception as e:
        if trial > 3:
            return "NaN", "NaN"
        else:
            return do_geocode(address, trial + 1)
        time.sleep(2)
        
def load_pickle(path):
    with open(path, 'rb') as f:
        return pickle.load(f)
    
def save_pickle(obj, path):
    with open(path, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

In [4]:
class GeoDataTransformer(TransformerMixin):
    def __init__(self):
        self.cache = {}
        self.pickle_path = './data/locations.pkl'
        self.imp = Imputer(strategy='median')
        
        if os.path.isfile(self.pickle_path):
            self.cache = load_pickle(self.pickle_path)

    def fit(self, df, y=None):
        return self
                
    def transform(self, df):
        df_copy = df.copy()
        locations = []
        for sub_area in df['sub_area']:
            if sub_area in self.cache:
                latitude, longitude = self.cache[sub_area]
                if latitude == 'NA' or longitude == 'NA':
                    locations.append(['NaN', 'NaN'])
                else:
                    locations.append([latitude, longitude])
            else:
                latitude, longitude = do_geocode(sub_area)
                self.cache[sub_area] = [latitude, longitude]
                locations.append([latitude, longitude])
        
        locations = np.array(locations)
        locations = self.imp.fit_transform(locations)
        
        lat = locations[:, 0]
        lon = locations[:, 1]

        df_copy['x'] = np.cos(lat) * np.cos(lon)
        df_copy['y'] = np.cos(lat) * np.sin(lon)
        df_copy['z'] = np.sin(lat)
        
        df_copy = df_copy.drop(columns=['sub_area'])
        
        if not os.path.isfile(self.pickle_path):
            save_pickle(cache, self.pickle_path)
            
        return df_copy

class MultiColumnLabelEncoder(TransformerMixin):
    def __init__(self, columns):
        self.encoders = {}
        self.columns = columns
    def fit(self, df, y=None):
        for column in self.columns:
            self.encoders[column] = LabelEncoder().fit(df[column])
        return self
    def transform(self, df):
        df_copy = df.copy()
        for column in self.columns:
            df_copy[column] = self.encoders[column].transform(df_copy[column].fillna(df[column].iloc[0]))
        return df_copy
    
class DateTransformer(TransformerMixin):    
    def fit(self, df, y=None):
        return self
    
    def transform(self, df):
        df_copy = df.copy()
        df_copy['timestamp'] = pd.to_datetime(df['timestamp'])
        df_copy['timestamp'] = pd.to_numeric(df_copy['timestamp'])
        return df_copy
    
class UnnecessaryColumnsDroper(TransformerMixin):
    def fit(self, df, y=None):
        return self
    
    def transform(self, df):
        df_copy = df.copy()
        df_copy = df_copy.drop(columns=['id'], axis=1)
        return df_copy
    
class FeatureSelector(TransformerMixin):
    def __init__(self):
        to_drop = []
    
    def fit(self, df, y=None):
        df = pd.DataFrame(df)
        corr_matrix = df.corr().abs()
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
        to_drop = [column for column in upper.columns if any(upper[column] > 0.90)]
        self.to_drop = to_drop
        return self
    
    def transform(self, df):
        df = pd.DataFrame(df)
        df = df.drop(df.columns[self.to_drop], axis=1)
        return df
    
class MyRegression(TransformerMixin):
    def fit(self, df, y=None):
        self.model = RidgeCV(alphas=(0.01, 0.1, 0.5, 1, 5, 10, 50, 100, 500))
        self.model.fit(df, y)
        return self
    
    def transform(self, df):
        df_copy = df.copy()
        values = self.model.predict(df)
        df_copy = np.column_stack([df_copy, values])
        return df_copy
    
    
class MyXGBoost(TransformerMixin):
    def fit(self, df, y=None):
        xgb_model = xgb.XGBRegressor()
        params = {
            'gamma': [0.5, 1, 2],
            'max_depth': [3, 4, 5]
        }
        grid_search = GridSearchCV(
            xgb_model,
            param_grid=params,
            n_jobs=4
        )
        self.model = grid_search
        self.model.fit(df, y)
        return self
    
    def transform(self, df):
        df_copy = df.copy()
        values = self.model.predict(df)
        df_copy = np.column_stack([df_copy, values])
        return df_copy
    
columns_to_encode = [
    'product_type',
    'culture_objects_top_25',
    'thermal_power_plant_raion',
    'incineration_raion',
    'oil_chemistry_raion',
    'radiation_raion',
    'railroad_terminal_raion',
    'railroad_1line',
    'big_market_raion',
    'nuclear_reactor_raion',
    'detention_facility_raion',
    'water_1line',
    'big_road1_1line',
    'ecology',
]

pipe = make_pipeline(
    GeoDataTransformer(),
    MultiColumnLabelEncoder(columns_to_encode),
    DateTransformer(),
    UnnecessaryColumnsDroper(),
    Imputer(strategy='median'),
    FeatureSelector(),
    StandardScaler(),
    MyRegression(),
    MyXGBoost(),
    xgb.XGBRegressor(max_depth=3)
)

model = pipe.fit(X_train, y_train)

In [5]:
predictions = np.abs(model.predict(X_test))
mean_squared_log_error(y_test, predictions)

0.22096666832438175

In [6]:
df_test = pd.read_csv('./data/test.csv')
model = pipe.fit(df, y)
result = pd.DataFrame(df_test['id'])
result['price_doc'] = np.abs(model.predict(df_test))
result.to_csv('submission.csv', index=False)

# Score on Kaggle: 0.33478