In [1]:
import os
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import cohen_kappa_score, accuracy_score, mean_absolute_error, f1_score
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
from tqdm import tqdm
import xgboost as xgb
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import gc
from datetime import datetime

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
seed = 2020

In [3]:
df_feature = pd.read_pickle('feature.pickle')

In [4]:
from sklearn.preprocessing import LabelEncoder
for f in tqdm(df_feature.select_dtypes('object')):
    lbl = LabelEncoder()
    df_feature[f] = lbl.fit_transform(df_feature[f].astype(str))

  0%|          | 0/200000 [00:00<?, ?it/s]


In [5]:
df_test = df_feature[df_feature['price'].isnull()].copy()
df_train = df_feature[df_feature['price'].notnull()].copy()

In [6]:
ycol = 'price'
feature_names = list(
    filter(lambda x: x not in [ycol, 'SaleID', 'regDate', 'creatDate', 'creatDate_year', 'creatDate_month'], df_train.columns))

model = xgb.XGBRegressor(num_leaves=64,
                         max_depth=8,
                         learning_rate=0.08,
                         n_estimators=10000000,
                         subsample=0.75,
                         feature_fraction=0.75,
                         reg_alpha=0.7,
                         reg_lambda=1.2,
                         random_state=seed,
                         metric=None,
                         tree_method='hist'
                         )

oof = []
prediction = df_test[['SaleID']]
prediction['price'] = 0
df_importance_list = []

kfold = KFold(n_splits=5, shuffle=True, random_state=seed)
for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(df_train[feature_names])):
    X_train = df_train.iloc[trn_idx][feature_names]
    Y_train = df_train.iloc[trn_idx][ycol]

    X_val = df_train.iloc[val_idx][feature_names]
    Y_val = df_train.iloc[val_idx][ycol]

    print('\nFold_{} Training ================================\n'.format(fold_id+1))

    lgb_model = model.fit(X_train,
                          Y_train,
                          eval_set=[(X_train, Y_train), (X_val, Y_val)],
                          verbose=1000,
                          eval_metric='mae',
                          early_stopping_rounds=500)

    pred_val = lgb_model.predict(
        X_val)
    df_oof = df_train.iloc[val_idx][['SaleID', ycol]].copy()
    df_oof['pred'] = pred_val
    oof.append(df_oof)

    pred_test = lgb_model.predict(
        df_test[feature_names])
    prediction['price'] += pred_test / 5

    df_importance = pd.DataFrame({
        'column': feature_names,
        'importance': lgb_model.feature_importances_,
    })
    df_importance_list.append(df_importance)

    del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
    gc.collect()



Parameters: { "feature_fraction", "num_leaves" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-mae:6.93086	validation_1-mae:6.94007
[1000]	validation_0-mae:0.04204	validation_1-mae:0.11405
[2000]	validation_0-mae:0.02531	validation_1-mae:0.11285
[3000]	validation_0-mae:0.01902	validation_1-mae:0.11232
[4000]	validation_0-mae:0.01571	validation_1-mae:0.11207
[5000]	validation_0-mae:0.01369	validation_1-mae:0.11197
[6000]	validation_0-mae:0.01231	validation_1-mae:0.11188
[7000]	validation_0-mae:0.01129	validation_1-mae:0.11183
[8000]	validation_0-mae:0.01050	validation_1-mae:0.11179
[9000]	validation_0-mae:0.00985	validation_1-mae:0.11176
[10000]	validation_0-mae:0.00933	validation_1-mae:0.11175
[10128]	validation_0-mae:0.00927	val

In [1]:
df_importance = pd.concat(df_importance_list)
df_importance = df_importance.groupby(['column'])['importance'].agg(
    'mean').sort_values(ascending=False).reset_index()
df_importance

NameError: name 'pd' is not defined

In [8]:
df_oof = pd.concat(oof)
df_oof[ycol] = np.expm1(df_oof[ycol])
df_oof['pred'] = np.expm1(df_oof['pred'])
mae = mean_absolute_error(df_oof[ycol], df_oof['pred'])
print('mae:', mae)
df_oof.to_csv('xgb_oof.csv'.format(mae), index=False, encoding='utf-8')

mae: 480.0760716578102


In [9]:
prediction['price'] = np.expm1(prediction['price'])
sub = prediction.copy(deep=True)
sub.to_csv('sub/xgb_{}.csv'.format(mae), index=False, encoding='utf-8')
sub.to_csv('xgb.csv'.format(mae), index=False, encoding='utf-8')

In [10]:
sub.head()

Unnamed: 0,SaleID,price
0,200000,1263.522947
1,200001,1986.425167
2,200002,8603.206575
3,200003,999.216484
4,200004,2018.290815
