In [1]:
import pandas as pd
import numpy as np
import gc
np.random.seed(0)

import matplotlib.pyplot as plt
import seaborn as sns
import scikitplot as skplt

import sys
import re

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import learning_curve, train_test_split, KFold

from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

import xgbfir
import eli5
%matplotlib inline

In [2]:
# Read the labelled training rows and the test rows (the test file carries no price).
train = pd.read_hdf('../input/property.train.h5')
test = pd.read_hdf('../input/property.test.h5')

# Stack both sets into one frame; the cells below clean and encode `df_all` as a whole.
df_all = pd.concat(objs=[train, test], sort=False)

In [3]:
# Normalise the free-text 'Security:' column into a handful of categories.
# The work is done on a standalone Series that is written back once at the end.
# The previous `df_all['Security:'].fillna('-1', inplace=True)` was chained in-place
# mutation: pandas 2.x warns about it and, under copy-on-write, it does not update
# `df_all` at all, which would leave NaNs behind and break the `.str.contains` masks.
security = df_all['Security:'].str.lower().fillna('-1')  # '-1' marks a missing value

# Order matters: each rule only sees values that earlier rules have not replaced.
security.loc[security.str.contains('24|round')] = 'full'

condition = security.isin(['provided', 'is'])
security.loc[condition] = 'yes'
security.loc[security.str.contains('security|guard|protected|secure')] = 'yes'

condition = security.isin(['fenced area', 'enclosed courtyard', 'ogorojennaja territory'])
security.loc[condition] = 'closed area'

security.loc[security == 'not allowed'] = 'no'
security.loc[security == 'cat t'] = 'video surveillance'

condition = security.isin(['concierge, intercom', 'concierge. doorphone.'])
security.loc[condition] = 'concierge'

# Anything that did not end up in one of the known categories is lumped into 'some'.
condition = security.isin(['-1', 'yes', 'closed area', 'full', 'video surveillance', 'concierge', 'no'])
security.loc[~condition] = 'some'

df_all['Security:'] = security

In [4]:
# These columns hold lists; add a '<name>_str' twin with the items joined by commas.
col_list = ['breadcrumbs', 'date', 'geo_block', 'owner']
for column in col_list:
    joined_name = '{}_str'.format(column)
    df_all[joined_name] = df_all[column].map(','.join)

In [5]:
# Owner strings containing '@' are all relabelled 'priv'.
has_at_sign = df_all['owner_str'].str.contains('@')
df_all.loc[has_at_sign, 'owner_str'] = 'priv'

# Owner values that occur fewer than 5 times are merged into a single 'other' bucket.
counts = df_all['owner_str'].value_counts()
mask = df_all['owner_str'].isin(counts.loc[counts.lt(5)].index)
df_all.loc[mask, 'owner_str'] = 'other'

In [6]:
def bread_geo_func(row):
    """Merge a row's 'breadcrumbs' and 'geo_block' lists into one location string.

    Both lists are comma-joined, lower-cased and stripped at the ends, then glued
    together. Trailing dots are removed from a few street-type abbreviations, the
    result is split on commas, duplicates and the two Moscow labels are dropped, and
    the remaining tokens are returned sorted and comma-joined.
    """
    crumbs = ','.join(row['breadcrumbs']).lower().strip()
    geo = ','.join(row['geo_block']).lower().strip()
    merged = crumbs + ',' + geo

    # Same replacements, in the same order, as a chain of str.replace calls.
    for dotted, plain in (('ул.', 'ул'), ('пер.', 'пер'), ('пр-кт.', 'пр-кт'),
                          ('проезд.', 'проезд'), ('б-р.', 'б-р')):
        merged = merged.replace(dotted, plain)

    tokens = set(merged.split(','))
    tokens -= {'москва', 'г. москва'}
    return ','.join(sorted(tokens))

# Row-wise: build the merged location string from the two list-valued columns.
df_all['bread_geo_str'] = df_all[['breadcrumbs', 'geo_block']].apply(bread_geo_func, axis='columns')

In [7]:
def metro_func(row):
    """Return the comma-joined tokens of 'bread_geo_str' that contain 'мцк ', else 'missing'."""
    matched = ','.join(part for part in row['bread_geo_str'].split(',') if 'мцк ' in part)
    # A matching token is never empty, so an empty join means nothing matched.
    return matched or 'missing'
# Row-wise: pull the 'мцк ' tokens out of the merged location string.
df_all['metro_str'] = df_all[['bread_geo_str']].apply(metro_func, axis='columns')

In [8]:
def station_func(row):
    """Return the comma-joined tokens of 'bread_geo_str' that contain 'м. ', else 'missing'."""
    matched = ','.join(part for part in row['bread_geo_str'].split(',') if 'м. ' in part)
    # A matching token is never empty, so an empty join means nothing matched.
    return matched or 'missing'
# Row-wise: pull the 'м. ' tokens out of the merged location string.
df_all['station_str'] = df_all[['bread_geo_str']].apply(station_func, axis='columns')

In [9]:
def street_func(row):
    """Return the comma-joined tokens of 'bread_geo_str' that contain a street marker.

    A token is kept when any entry of `street_ext` occurs in it as a substring.
    Most markers carry a trailing space, so they only match when followed by more
    text. Returns 'missing' when no token matches.
    """
    street_ext = ['ул ', 'б-р', 'пер ', 'пр-кт', 'аллея ', 'проезд ', 'ш ', 'пл ', 'наб ']
    found = []
    for part in row['bread_geo_str'].split(','):
        for marker in street_ext:
            if marker in part:
                found.append(part)
                break  # one hit is enough; keep each token once
    if not found:
        return 'missing'
    return ','.join(found)
# Row-wise: pull the street-like tokens out of the merged location string.
df_all['street_str'] = df_all[['bread_geo_str']].apply(street_func, axis='columns')

In [14]:
# Bucket rare stations and streets: a value seen fewer than 5 times becomes 'other'.
for rare_col in ('station_str', 'street_str'):
    counts = df_all[rare_col].value_counts()
    mask = df_all[rare_col].isin(counts.loc[counts.lt(5)].index)
    df_all.loc[mask, rare_col] = 'other'

In [10]:
# Integer-encode every raw attribute column (name contains ':') and every derived
# '*_str' column into a '<name>_cat' twin.
# Columns already ending in '_cat' are skipped: they still contain ':' or '_str', so
# without this guard a second run of the cell would add '<name>_cat_cat' columns,
# which the numeric feature selection would then pick up as extra features.
col_to_factorize = [
    column for column in df_all.columns
    if ((':' in column) or ('_str' in column)) and not column.endswith('_cat')
]
for column in col_to_factorize:
    # factorize() returns (codes, uniques); only the integer codes are kept.
    df_all['{}_cat'.format(column)] = df_all[column].factorize()[0]

In [15]:
# Split the combined frame back apart: rows with a price are training data,
# rows without one form the test set.
has_price = df_all['price'].notna()
train = df_all[has_price]
test = df_all[~has_price].drop(columns=['price'])
test['id'] = test['id'].astype('int64')

In [12]:
# Candidate regressors as (label, estimator) pairs.
models = [
    ('XGB', XGBRegressor(n_estimators=100, max_depth=7, learning_rate=0.2, random_state=0)),
    ('CB', CatBoostRegressor(verbose=False, depth=7)),
]

def get_feats(df, del_list=None):
    """Return the names of the numeric columns of `df` that may be used as features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to pick feature columns from.
    del_list : iterable of column names, a single column name, or None
        Extra columns to exclude on top of the always-excluded 'price' and 'id'.

    Returns
    -------
    list
        Numeric column names in their original column order.
    """
    # None instead of a mutable `[]` default; a bare string is treated as one name
    # rather than being iterated character by character.
    if del_list is None:
        del_list = ()
    elif isinstance(del_list, str):
        del_list = (del_list,)

    black_list = {'price', 'id'}
    black_list.update(del_list)

    numeric_cols = df.select_dtypes(include=['number']).columns
    return [f for f in numeric_cols if f not in black_list]

def run_cv(model, X, y, folds=3, cv_type=KFold, success_metric=mean_absolute_error):
    """Cross-validate `model` on a log-transformed target and score on the original scale.

    For every fold the model is fitted on log(y_train); its predictions are mapped
    back with exp() and compared with the untransformed y_test.

    Parameters
    ----------
    model : object with fit(X, y) and predict(X)
        Re-fitted on every fold.
    X : array-like or pandas.DataFrame
        Feature matrix; rows are selected positionally.
    y : array-like or pandas.Series
        Strictly positive targets, aligned with the rows of X.
    folds : int
        Number of splits handed to `cv_type`.
    cv_type : class
        Splitter, constructed as cv_type(n_splits=folds, random_state=0, shuffle=True).
    success_metric : callable
        Called as success_metric(y_true, y_pred).

    Returns
    -------
    tuple
        (list of per-fold scores, mean of the scores, standard deviation of the scores)

    Raises
    ------
    ValueError
        If any target is <= 0, because log() would turn it into -inf or NaN.
    """
    if np.any(np.asarray(y) <= 0):
        raise ValueError('run_cv fits on log(y); all target values must be > 0')

    def take_rows(data, idx):
        # The splitter yields positions. Plain `data[idx]` on a DataFrame selects
        # columns, and on a Series selects by label, so pandas objects go via .iloc.
        return data.iloc[idx] if hasattr(data, 'iloc') else data[idx]

    cv = cv_type(n_splits=folds, random_state=0, shuffle=True)

    scores = []
    for train_idx, test_idx in cv.split(X):
        X_train, X_test = take_rows(X, train_idx), take_rows(X, test_idx)
        y_train, y_test = take_rows(y, train_idx), take_rows(y, test_idx)

        model.fit(X_train, np.log(y_train))

        # exp() of a real number is never negative, so no clamping at zero is needed.
        y_pred = np.exp(model.predict(X_test))

        scores.append(success_metric(y_test, y_pred))

    return scores, np.mean(scores), np.std(scores)

def run_models(train, feats, models):
    """Cross-validate each (name, model) pair on `train[feats]` against `train['price']`.

    Prints the feature list first, then one line per model with the per-fold scores,
    their mean and their standard deviation as returned by `run_cv`.
    """
    print(feats)
    features = train[feats].values
    target = train['price'].values

    report = 'model {}- each fold score: {}; mean: {:.2f}, std: {:.2f}'
    for label, estimator in models:
        fold_scores, mean_score, std_score = run_cv(estimator, features, target)
        print(report.format(label, fold_scores, mean_score, std_score))

def submit(model, train, test, feats, file='model_.csv'):
    """Fit `model` on log(price), predict prices for `test` and write a submission CSV.

    Parameters
    ----------
    model : object with fit(X, y) and predict(X)
    train : pandas.DataFrame
        Must contain the columns in `feats` and a 'price' column.
    test : pandas.DataFrame
        Must contain the columns in `feats` and an 'id' column. A 'price' column
        holding the predictions is added to this frame in place.
    feats : list
        Feature column names.
    file : str
        File name of the CSV, written to '../output/<file>' with columns id, price.
    """
    import os  # function-scope import keeps this cell self-contained

    out_path = '../output/{}'.format(file)
    # Create the output folder before the potentially long fit, so that a missing
    # directory can no longer make the run fail only after training has finished.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    X_train = train[feats]
    y_train = np.log(train['price'])
    X_test = test[feats]

    model.fit(X_train, y_train)
    # Back from log-price to price; exp() is never negative, so no clamping is needed.
    y_pred = np.exp(model.predict(X_test))

    test['price'] = y_pred
    test[['id', 'price']].to_csv(out_path, index=False)
    print('submit for {} \ndone'.format(model))

In [15]:
# Experiment: owners containing '@' collapsed to 'priv', owners seen fewer than 5 times
# merged into 'other', merged bread_geo_str added; breadcrumbs_str_cat is left out.
del_list = ['breadcrumbs_str_cat']
feats = get_feats(df=train, del_list=del_list)
run_models(train=train, feats=feats, models=models)

['Security:_cat', 'Building type:_cat', 'Object type:_cat', 'Ad type:_cat', 'Commission agent:_cat', 'Construction phase:_cat', 'Housing class:_cat', 'Elevator:_cat', 'Bathroom type:_cat', 'Balcony type:_cat', 'Mortgage possible:_cat', 'The view from the window:_cat', 'Garbage chute:_cat', 'Repair:_cat', 'Fridge:_cat', 'Phone:_cat', 'Furniture:_cat', 'Free layout:_cat', 'It is possible to bargain:_cat', 'Floor covering:_cat', 'Room type:_cat', 'Internet:_cat', 'Kitchen furniture:_cat', 'TV:_cat', 'Washing machine:_cat', 'Foundation type:_cat', 'Overlap type:_cat', 'Type of the building:_cat', 'Playground:_cat', 'Class:_cat', 'date_str_cat', 'geo_block_str_cat', 'owner_str_cat', 'bread_geo_str_cat']
model XGB- score: 5.753284532111753
model CB- score: 5.750923376738445


In [14]:
# Experiment: 'priv' / rare-owner bucketing plus bread_geo_str, without
# breadcrumbs_str_cat; scored with KFold, reporting the score of every fold.
del_list = ['breadcrumbs_str_cat']
feats = get_feats(df=train, del_list=del_list)
run_models(train=train, feats=feats, models=models)

['Security:_cat', 'Building type:_cat', 'Object type:_cat', 'Ad type:_cat', 'Commission agent:_cat', 'Construction phase:_cat', 'Housing class:_cat', 'Elevator:_cat', 'Bathroom type:_cat', 'Balcony type:_cat', 'Mortgage possible:_cat', 'The view from the window:_cat', 'Garbage chute:_cat', 'Repair:_cat', 'Fridge:_cat', 'Phone:_cat', 'Furniture:_cat', 'Free layout:_cat', 'It is possible to bargain:_cat', 'Floor covering:_cat', 'Room type:_cat', 'Internet:_cat', 'Kitchen furniture:_cat', 'TV:_cat', 'Washing machine:_cat', 'Foundation type:_cat', 'Overlap type:_cat', 'Type of the building:_cat', 'Playground:_cat', 'Class:_cat', 'date_str_cat', 'geo_block_str_cat', 'owner_str_cat', 'bread_geo_str_cat']
model XGB- each fold score: [5.710966336553063, 5.555384656664529, 5.264931342382497]; mean: 5.51, std: 0.18
model CB- each fold score: [5.657333948992947, 5.446270628619712, 5.218039615381332]; mean: 5.44, std: 0.18


In [13]:
# Experiment: additionally uses the metro_str, station_str and street_str features.
del_list = ['breadcrumbs_str_cat']
feats = get_feats(df=train, del_list=del_list)
run_models(train=train, feats=feats, models=models)

['Security:_cat', 'Building type:_cat', 'Object type:_cat', 'Ad type:_cat', 'Commission agent:_cat', 'Construction phase:_cat', 'Housing class:_cat', 'Elevator:_cat', 'Bathroom type:_cat', 'Balcony type:_cat', 'Mortgage possible:_cat', 'The view from the window:_cat', 'Garbage chute:_cat', 'Repair:_cat', 'Fridge:_cat', 'Phone:_cat', 'Furniture:_cat', 'Free layout:_cat', 'It is possible to bargain:_cat', 'Floor covering:_cat', 'Room type:_cat', 'Internet:_cat', 'Kitchen furniture:_cat', 'TV:_cat', 'Washing machine:_cat', 'Foundation type:_cat', 'Overlap type:_cat', 'Type of the building:_cat', 'Playground:_cat', 'Class:_cat', 'date_str_cat', 'geo_block_str_cat', 'owner_str_cat', 'bread_geo_str_cat', 'metro_str_cat', 'station_str_cat', 'street_str_cat']
model XGB- each fold score: [5.6066832861778035, 5.456967897545102, 5.1496763914238315]; mean: 5.40, std: 0.19
model CB- each fold score: [5.525701340869351, 5.373652546512391, 5.072295367359039]; mean: 5.32, std: 0.19


In [16]:
# Experiment: metro / station / street features, with station and street values that
# occur fewer than 5 times merged into 'other'.
del_list = ['breadcrumbs_str_cat']
feats = get_feats(df=train, del_list=del_list)
run_models(train=train, feats=feats, models=models)

['Security:_cat', 'Building type:_cat', 'Object type:_cat', 'Ad type:_cat', 'Commission agent:_cat', 'Construction phase:_cat', 'Housing class:_cat', 'Elevator:_cat', 'Bathroom type:_cat', 'Balcony type:_cat', 'Mortgage possible:_cat', 'The view from the window:_cat', 'Garbage chute:_cat', 'Repair:_cat', 'Fridge:_cat', 'Phone:_cat', 'Furniture:_cat', 'Free layout:_cat', 'It is possible to bargain:_cat', 'Floor covering:_cat', 'Room type:_cat', 'Internet:_cat', 'Kitchen furniture:_cat', 'TV:_cat', 'Washing machine:_cat', 'Foundation type:_cat', 'Overlap type:_cat', 'Type of the building:_cat', 'Playground:_cat', 'Class:_cat', 'date_str_cat', 'geo_block_str_cat', 'owner_str_cat', 'bread_geo_str_cat', 'metro_str_cat', 'station_str_cat', 'street_str_cat']
model XGB- each fold score: [5.66342635617137, 5.470616632640759, 5.08100621244768]; mean: 5.41, std: 0.24
model CB- each fold score: [5.492454175083386, 5.373635823152503, 5.0866895466069515]; mean: 5.32, std: 0.17


In [None]:
# Correlation heatmap of the selected features together with the target price.
plt.rcParams['figure.figsize'] = (30, 15)
corr_matrix = train[[*feats, 'price']].corr()
sns.heatmap(corr_matrix, annot=True, vmin=-1., vmax=1., linewidths=.8, cmap="YlGnBu");

In [17]:
# Fit the CatBoost entry of `models` on the full training set and write the submission.
model = dict(models)['CB']
submit(model=model, train=train, test=test, feats=feats, file='ver4_cb.csv')

submit for <catboost.core.CatBoostRegressor object at 0x7ff7ea6ec2e0> 
done


public score on Kaggle: 5.18976