In [1]:
import pandas as pd
import numpy as np
import gc
np.random.seed(0)

import matplotlib.pyplot as plt
import seaborn as sns
import scikitplot as skplt

import sys
import re

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import learning_curve, train_test_split, KFold

from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

import xgbfir
%matplotlib inline

In [2]:
# Read both splits, then stack them so the feature engineering below is
# applied to train and test rows in a single pass.
train, test = (
    pd.read_hdf('../input/property.train.h5'),
    pd.read_hdf('../input/property.test.h5'),  # test set without price
)

# sort=False keeps the column order as encountered instead of sorting it.
df_all = pd.concat((train, test), sort=False)

In [3]:
# Columns holding lists are flattened into one comma-separated string each so
# they can be label-encoded like the other categorical columns.
# NOTE(review): assumes every value in these columns is an iterable of
# strings; a missing value (NaN) would make ','.join fail -- confirm.
col_list = ['breadcrumbs', 'date', 'geo_block', 'owner']
for column in col_list:
    df_all['{}_str'.format(column)] = df_all[column].map(lambda x: ','.join(x))

# Label-encode every column whose name contains ':' and every joined '_str'
# column. Columns that already end in '_cat' are skipped: they are outputs of
# this very loop, and without the guard a re-run of the cell would stack
# '..._cat_cat' columns onto df_all (which would then be picked up as extra
# numeric features further down).
col_to_factorize = [
    column for column in df_all.columns
    if ((':' in column) or ('_str' in column)) and not column.endswith('_cat')
]
for column in col_to_factorize:
    # factorize() returns (codes, uniques); only the integer codes are kept.
    df_all['{}_cat'.format(column)] = df_all[column].factorize()[0]

In [4]:
# Rows with a known price form the training set.
train = df_all[df_all['price'].notna()]

# Rows without a price are the unlabeled test set: drop the empty target
# column and store the id as a 64-bit integer.
test = (
    df_all[df_all['price'].isna()]
    .drop(columns=['price'])
    .astype({'id': 'int64'})
)

In [5]:
# Every numeric column is used as a feature, except the target and the row id.
black_list = ['price', 'id']
feats = [
    name
    for name in train.select_dtypes(include=['number']).columns
    if name not in black_list
]

In [11]:
# (label, estimator) pairs to compare. All trees are capped at depth 7 and
# every estimator gets an explicit seed of 0 so the comparison is repeatable.
models = [
    ('DT', DecisionTreeRegressor(max_depth=7, random_state=0)),
    ('RF', RandomForestRegressor(max_depth=7, random_state=0)),
    ('XGB', XGBRegressor(max_depth=7, n_estimators=100, learning_rate=0.2, random_state=0)),
    # random_seed is spelled out to match the other estimators rather than
    # relying on the library default.
    ('CB', CatBoostRegressor(depth=7, verbose=False, random_seed=0)),
]

def run_models(train, feats, models):
    """Fit each model on a hold-out split and print its mean absolute error.

    The target is log-transformed for fitting and predictions are mapped back
    with exp() before scoring, so the printed error is in the original price
    units.

    Parameters
    ----------
    train : DataFrame containing the columns in `feats` and a 'price' column.
    feats : list of column names used as model inputs.
    models : iterable of (name, estimator) pairs; each estimator must provide
        fit(X, y) and predict(X).

    Returns
    -------
    None. The feature list and one score line per model are printed.
    """
    print(feats)

    # The split depends only on the data and a fixed seed, so it is computed
    # once and shared by all models instead of being redone per iteration.
    X_train, X_test, y_train, y_test = train_test_split(
        train[feats], train['price'], test_size=0.3, random_state=0)
    y_train_log = np.log(y_train)

    for model_name, model in models:
        model.fit(X_train, y_train_log)

        # exp() undoes the log transform; its result is never negative, so no
        # clipping to zero is needed afterwards.
        y_pred = np.exp(model.predict(X_test))

        score = mean_absolute_error(y_test, y_pred)
        print('model {}- score: {}'.format(model_name,  score))

def submit(model, train, test, feats, file='model_.csv'):
    """Fit `model` on all of `train` and write test predictions to a CSV.

    The model is trained on log(price); predictions are mapped back with
    exp() and written with columns 'id' and 'price' to '../output/<file>'.

    Parameters
    ----------
    model : estimator providing fit(X, y) and predict(X).
    train : DataFrame containing the columns in `feats` and a 'price' column.
    test : DataFrame containing the columns in `feats` and an 'id' column.
        It is not modified.
    feats : list of column names used as model inputs.
    file : name of the CSV file created inside '../output/'.

    Returns
    -------
    None. A confirmation message is printed once the file is written.
    """
    import os  # local import keeps this block self-contained

    path = '../output/{}'.format(file)
    # Make sure the target directory exists before the (potentially long) fit,
    # so a missing folder cannot waste the training time.
    os.makedirs(os.path.dirname(path), exist_ok=True)

    model.fit(train[feats], np.log(train['price']))

    # exp() undoes the log transform; its result is never negative, so no
    # clipping to zero is needed afterwards.
    y_pred = np.exp(model.predict(test[feats]))

    # Build a separate frame instead of adding a 'price' column to the
    # caller's `test`, which would silently change it for later cells.
    submission = test[['id']].assign(price=y_pred)
    submission.to_csv(path, index=False)
    print('submit for {} \ndone'.format(model))

In [12]:
# Compare all candidate models on the same hold-out split.
run_models(train=train, feats=feats, models=models)

['Security:_cat', 'Building type:_cat', 'Object type:_cat', 'Ad type:_cat', 'Commission agent:_cat', 'Construction phase:_cat', 'Housing class:_cat', 'Elevator:_cat', 'Bathroom type:_cat', 'Balcony type:_cat', 'Mortgage possible:_cat', 'The view from the window:_cat', 'Garbage chute:_cat', 'Repair:_cat', 'Fridge:_cat', 'Phone:_cat', 'Furniture:_cat', 'Free layout:_cat', 'It is possible to bargain:_cat', 'Floor covering:_cat', 'Room type:_cat', 'Internet:_cat', 'Kitchen furniture:_cat', 'TV:_cat', 'Washing machine:_cat', 'Foundation type:_cat', 'Overlap type:_cat', 'Type of the building:_cat', 'Playground:_cat', 'Class:_cat', 'breadcrumbs_str_cat', 'date_str_cat', 'geo_block_str_cat', 'owner_str_cat']
model DT- score: 7.389944709162244
model RF- score: 7.276891267235837
model XGB- score: 5.815427705450044
model CB- score: 5.740346089925315


In [14]:
# Pick the CatBoost estimator by its label rather than by list position, so
# the choice survives reordering or extending `models`.
model = dict(models)['CB']
submit(model, train, test, feats, 'simple_cb.csv')

submit for <catboost.core.CatBoostRegressor object at 0x7f37cc45e940> 
done


public score on Kaggle: 5.28831