In [46]:
import os
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import cohen_kappa_score, accuracy_score, mean_absolute_error, f1_score
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
from tqdm import tqdm
import xgboost as xgb
import catboost as ctb
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import gc
from datetime import datetime

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [47]:
seed = 2020

In [48]:
df_feature = pd.read_pickle('feature.pickle')

In [49]:
from sklearn.preprocessing import LabelEncoder
for f in tqdm(df_feature.select_dtypes('object')):
    lbl = LabelEncoder()
    df_feature[f] = lbl.fit_transform(df_feature[f].astype(str))

  0%|          | 0/200000 [00:00<?, ?it/s]


In [50]:
df_test = df_feature[df_feature['price'].isnull()].copy()
df_train = df_feature[df_feature['price'].notnull()].copy()

In [51]:
ycol = 'price'
feature_names = list(
    filter(lambda x: x not in [ycol, 'SaleID', 'regDate', 'creatDate', 'creatDate_year', 'creatDate_month'], df_train.columns))

model = ctb.CatBoostRegressor(
    learning_rate=0.08,
    depth=10,
    subsample=0.75,
    n_estimators=100000,
    loss_function='RMSE',
    random_seed=seed,
)

oof = []
prediction = df_test[['SaleID']]
prediction['price'] = 0
df_importance_list = []

kfold = KFold(n_splits=5, shuffle=True, random_state=seed)
for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(df_train[feature_names])):
    X_train = df_train.iloc[trn_idx][feature_names]
    Y_train = df_train.iloc[trn_idx][ycol]

    X_val = df_train.iloc[val_idx][feature_names]
    Y_val = df_train.iloc[val_idx][ycol]

    print('\nFold_{} Training ================================\n'.format(fold_id+1))

    ctb_model = model.fit(X_train,
                          Y_train,
                          verbose=1000,
                          early_stopping_rounds=500)

    pred_val = ctb_model.predict(
        X_val)
    df_oof = df_train.iloc[val_idx][['SaleID', ycol]].copy()
    df_oof['pred'] = pred_val
    oof.append(df_oof)

    pred_test = ctb_model.predict(
        df_test[feature_names])
    prediction['price'] += pred_test / 5

    df_importance = pd.DataFrame({
        'column': feature_names,
        'importance': ctb_model.feature_importances_,
    })
    df_importance_list.append(df_importance)

    del ctb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
    gc.collect()






Custom logger is already specified. Specify more than one logger at same time is not thread safe.

0:	learn: 1.1320714	total: 373ms	remaining: 1h 2m 6s
1000:	learn: 0.1316481	total: 6m 4s	remaining: 54m 36s
2000:	learn: 0.0945439	total: 12m 8s	remaining: 48m 32s
3000:	learn: 0.0719526	total: 17m 58s	remaining: 41m 56s
4000:	learn: 0.0570824	total: 23m 44s	remaining: 35m 36s
5000:	learn: 0.0463233	total: 29m 33s	remaining: 29m 32s
6000:	learn: 0.0384849	total: 35m 22s	remaining: 23m 34s
7000:	learn: 0.0324109	total: 41m 15s	remaining: 17m 40s
8000:	learn: 0.0276795	total: 47m 9s	remaining: 11m 47s
9000:	learn: 0.0237888	total: 53m 4s	remaining: 5m 53s
9999:	learn: 0.0206946	total: 59m 5s	remaining: 0us


0:	learn: 1.1315348	total: 446ms	remaining: 1h 14m 16s
1000:	learn: 0.1304567	total: 5m 59s	remaining: 53m 53s
2000:	learn: 0.0932892	total: 11m 57s	remaining: 47m 49s
3000:	learn: 0.0716795	total: 17m 55s	remaining: 41m 47s
4000:	learn: 0.0568200	total: 24m 5s	remaining: 36m 7s
5000:	learn: 0.0463679	total: 30m 5s	remaining: 30m 4s
6000:	learn: 0.0384532	total: 36m	remaining: 23m 59

In [52]:
df_importance = pd.concat(df_importance_list)
df_importance = df_importance.groupby(['column'])['importance'].agg(
    'mean').sort_values(ascending=False).reset_index()
# df_importance

In [53]:
df_oof = pd.concat(oof)
df_oof[ycol] = np.expm1(df_oof[ycol])
df_oof['pred'] = np.expm1(df_oof['pred'])
mae = mean_absolute_error(df_oof[ycol], df_oof['pred'])
print('mae:', mae)
df_oof.to_csv('ctb_oof.csv'.format(mae), index=False, encoding='utf-8')

mae: 456.0954469643469


In [54]:
prediction['price'] = np.expm1(prediction['price'])
sub = prediction.copy(deep=True)
sub.to_csv('sub/ctb_{}.csv'.format(mae), index=False, encoding='utf-8')
sub.to_csv('ctb.csv'.format(mae), index=False, encoding='utf-8')

In [55]:
sub.head()

Unnamed: 0,SaleID,price
0,200000,1255.680901
1,200001,1951.761372
2,200002,8759.420316
3,200003,1023.98363
4,200004,2013.304828
