In [1]:
import os
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import cohen_kappa_score, accuracy_score, mean_absolute_error, f1_score
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
from tqdm import tqdm
import xgboost as xgb
import catboost as ctb
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import gc
from datetime import datetime

# Silence every warning category for the rest of the session and remove
# pandas' display truncation so frames print all columns and all rows.
warnings.filterwarnings(action='ignore')
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

ModuleNotFoundError: No module named 'catboost'

In [None]:
# Random seed shared by the KFold shuffle and the CatBoost model so that
# repeated runs produce the same folds and the same trees.
seed = 2020

In [None]:
# Load the pre-built feature table from the working directory.
# NOTE(review): unpickling can execute arbitrary code — only load a
# 'feature.pickle' produced by a trusted step of this pipeline.
# Presumably train and test rows are stacked in this frame, with test rows
# carrying a null 'price' — the train/test split relies on that; confirm
# against the feature-engineering step.
df_feature = pd.read_pickle('feature.pickle')

In [None]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every object-dtype column of df_feature in place.
# Iterate over `.columns` (not the DataFrame itself): tqdm derives its total
# from len(iterable), and len(DataFrame) is the row count, which made the
# progress bar report "n_encoded / n_rows" instead of "n_encoded / n_columns".
object_columns = df_feature.select_dtypes('object').columns
for col in tqdm(object_columns):
    encoder = LabelEncoder()
    # astype(str) turns missing values into the literal string 'nan', so they
    # are encoded as one more category rather than raising on mixed types.
    df_feature[col] = encoder.fit_transform(df_feature[col].astype(str))

In [None]:
# Rows with a known price form the training set; rows with a missing price
# are the ones to be predicted. Both are independent copies of df_feature.
has_price = df_feature['price'].notnull()
df_train = df_feature[has_price].copy()
df_test = df_feature[~has_price].copy()

In [None]:
# 5-fold CatBoost regression on df_train.
# Produces:
#   oof                 - list of per-fold frames (SaleID, target, 'pred')
#   prediction          - test-set frame whose 'price' is the fold-averaged prediction
#   df_importance_list  - list of per-fold feature-importance frames
ycol = 'price'
# Target, identifier and date columns that are kept out of the model inputs.
excluded_columns = {ycol, 'SaleID', 'regDate', 'creatDate', 'creatDate_year', 'creatDate_month'}
feature_names = [col for col in df_train.columns if col not in excluded_columns]

model = ctb.CatBoostRegressor(
    learning_rate=0.08,
    depth=10,
    subsample=0.75,
    # Upper bound only: early stopping on the validation fold is expected to
    # end training long before this many trees are built.
    n_estimators=100000,
    loss_function='RMSE',
    random_seed=seed,
)

oof = []
# .copy() so that adding the 'price' column does not write into a slice of
# df_test (chained assignment / SettingWithCopy).
prediction = df_test[['SaleID']].copy()
prediction['price'] = 0.0
df_importance_list = []

kfold = KFold(n_splits=5, shuffle=True, random_state=seed)
for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(df_train[feature_names])):
    X_train = df_train.iloc[trn_idx][feature_names]
    Y_train = df_train.iloc[trn_idx][ycol]

    X_val = df_train.iloc[val_idx][feature_names]
    Y_val = df_train.iloc[val_idx][ycol]

    print('\nFold_{} Training ================================\n'.format(fold_id+1))

    # eval_set is required for early_stopping_rounds to have any effect:
    # without a validation set CatBoost has no metric to monitor and would
    # train all n_estimators iterations on every fold.
    # fit() returns the estimator itself, so ctb_model aliases `model`.
    ctb_model = model.fit(X_train,
                          Y_train,
                          eval_set=(X_val, Y_val),
                          verbose=1000,
                          early_stopping_rounds=500)

    # Out-of-fold predictions for the rows held out in this fold.
    pred_val = ctb_model.predict(X_val)
    df_oof = df_train.iloc[val_idx][['SaleID', ycol]].copy()
    df_oof['pred'] = pred_val
    oof.append(df_oof)

    # Accumulate the test prediction as an equal-weight average over folds;
    # the divisor follows the splitter instead of a hard-coded 5.
    pred_test = ctb_model.predict(df_test[feature_names])
    prediction['price'] += pred_test / kfold.n_splits

    df_importance = pd.DataFrame({
        'column': feature_names,
        'importance': ctb_model.feature_importances_,
    })
    df_importance_list.append(df_importance)

    # Drop per-fold references before the next fold to limit peak memory.
    del ctb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
    gc.collect()


In [None]:
# Average each feature's importance across the per-fold frames and rank the
# features from most to least important.
df_importance = (
    pd.concat(df_importance_list)
    .groupby(['column'])['importance']
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)

In [None]:
# Stack the per-fold out-of-fold frames, apply expm1 to both the target and
# the prediction, report the MAE, and save the result.
# NOTE(review): expm1 presumably undoes a log1p transform applied to the
# target when the features were built — confirm upstream.
df_oof = pd.concat(oof)
actual_price = np.expm1(df_oof[ycol])
predicted_price = np.expm1(df_oof['pred'])
df_oof[ycol] = actual_price
df_oof['pred'] = predicted_price

mae = mean_absolute_error(actual_price, predicted_price)
print('mae:', mae)

df_oof.to_csv('ctb_oof.csv', index=False, encoding='utf-8')

In [None]:
# Apply expm1 to the fold-averaged test predictions and write the submission
# twice: once under sub/ tagged with the OOF MAE, once as ctb.csv.
# NOTE: this overwrites prediction['price'] in place, so re-running this cell
# without re-running the training cell would apply expm1 a second time.
prediction['price'] = np.expm1(prediction['price'])
sub = prediction.copy(deep=True)

# to_csv does not create missing directories; make sure sub/ exists so the
# export does not fail after the whole training run has completed.
os.makedirs('sub', exist_ok=True)
sub.to_csv('sub/ctb_{}.csv'.format(mae), index=False, encoding='utf-8')
sub.to_csv('ctb.csv', index=False, encoding='utf-8')

In [None]:
# Preview the first rows of the submission frame as a quick sanity check.
sub.head()