In [55]:
import os
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import cohen_kappa_score, accuracy_score, mean_absolute_error, f1_score
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
from tqdm import tqdm
import lightgbm as lgb
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import gc
from datetime import datetime
import itertools

# Silence every warning for the rest of the session. This also hides pandas
# chained-assignment warnings and library deprecation notices, so drop this
# line when debugging.
warnings.filterwarnings('ignore')
# Show all columns and all rows whenever a DataFrame is displayed (no truncation).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [56]:
# Shared random seed; it is passed to both the KFold splitter and the LightGBM model.
seed = 2020

In [57]:
# Load the pre-built feature table from disk.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
df_feature = pd.read_pickle('feature.pickle')

In [58]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every object-dtype column of the feature table.
# Iterate over the column *names*: wrapping the DataFrame itself makes tqdm use
# len(df) (the row count) as its total, although the loop runs once per column.
for f in tqdm(df_feature.select_dtypes('object').columns):
    lbl = LabelEncoder()
    # astype(str) turns every value (including NaN) into a string before encoding.
    df_feature[f] = lbl.fit_transform(df_feature[f].astype(str))

  0%|          | 0/200000 [00:00<?, ?it/s]


In [59]:
# Rows with a known `price` form the training set; rows where it is missing are
# the ones to predict. Both are copies, so later column assignments do not
# write back into `df_feature`.
df_test = df_feature.loc[df_feature['price'].isna()].copy()
df_train = df_feature.loc[df_feature['price'].notna()].copy()

In [60]:
# Target column, and the model inputs: every training column except the target,
# the row identifier and the date columns excluded below.
ycol = 'price'
feature_names = [
    col for col in df_train.columns
    if col not in (ycol, 'SaleID', 'regDate', 'creatDate', 'creatDate_year', 'creatDate_month')
]


# LightGBM regressor reused for every cross-validation fold.
# 0.08 — author's note; presumably the learning rate set below (confirm).
# n_estimators is set very high on purpose: the number of trees actually used is
# decided by the early stopping passed to fit().
# NOTE(review): LightGBM documents that row subsampling only takes effect when
# subsample_freq (bagging_freq) is > 0; it is not set here, so subsample=0.75
# may be a no-op — confirm whether bagging was intended.
# metric=None leaves the metric unset here; 'mae' is supplied via eval_metric in fit().
model = lgb.LGBMRegressor(num_leaves=64,
                          max_depth=8,
                          learning_rate=0.08,
                          n_estimators=10000000,
                          subsample=0.75,
                          feature_fraction=0.75,
                          reg_alpha=0.7,
                          reg_lambda=1.2,
                          random_state=seed,
                          metric=None
                          )

# Per-fold out-of-fold prediction frames, concatenated later for the CV score.
oof = []
# Test-set predictions, built up as the average over folds.
# .copy() makes this an independent frame, so adding the `price` column is not a
# chained assignment on a slice of df_test (which raises SettingWithCopyWarning,
# normally hidden here by the global warnings filter).
prediction = df_test[['SaleID']].copy()
prediction['price'] = 0.0  # float accumulator; each fold adds its share
# Per-fold feature-importance frames.
df_importance_list = []

# K-fold cross-validation: for each fold, fit on the training part, predict the
# held-out part (out-of-fold) and add this fold's share of the test prediction
# to the running average in `prediction['price']`.
kfold = KFold(n_splits=5, shuffle=True, random_state=seed)
for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(df_train[feature_names])):
    X_train = df_train.iloc[trn_idx][feature_names]
    Y_train = df_train.iloc[trn_idx][ycol]

    X_val = df_train.iloc[val_idx][feature_names]
    Y_val = df_train.iloc[val_idx][ycol]

    print('\nFold_{} Training ================================\n'.format(fold_id+1))

    # Log every 500 rounds; stop once the validation score has not improved
    # for 500 rounds.
    lgb_model = model.fit(X_train,
                          Y_train,
                          eval_names=['train', 'valid'],
                          eval_set=[(X_train, Y_train), (X_val, Y_val)],
                          verbose=500,
                          eval_metric='mae',
                          early_stopping_rounds=500)

    # Out-of-fold predictions at the best iteration found by early stopping.
    pred_val = lgb_model.predict(
        X_val, num_iteration=lgb_model.best_iteration_)
    df_oof = df_train.iloc[val_idx][['SaleID', ycol]].copy()
    df_oof['pred'] = pred_val
    oof.append(df_oof)

    pred_test = lgb_model.predict(
        df_test[feature_names], num_iteration=lgb_model.best_iteration_)
    # Divide by the splitter's own fold count (not a literal 5) so the average
    # stays correct if n_splits is ever changed.
    prediction['price'] += pred_test / kfold.n_splits

    df_importance = pd.DataFrame({
        'column': feature_names,
        'importance': lgb_model.feature_importances_,
    })
    df_importance_list.append(df_importance)

    # Drop the per-fold objects before the next iteration to limit memory use.
    del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
    gc.collect()



[500]	train's l1: 0.0883196	train's l2: 0.0223962	valid's l1: 0.117324	valid's l2: 0.0470408
[1000]	train's l1: 0.0699534	train's l2: 0.0138526	valid's l1: 0.113933	valid's l2: 0.0468162


[500]	train's l1: 0.0879916	train's l2: 0.0223223	valid's l1: 0.117846	valid's l2: 0.0483072
[1000]	train's l1: 0.0697099	train's l2: 0.0137978	valid's l1: 0.114546	valid's l2: 0.0481771


[500]	train's l1: 0.0880991	train's l2: 0.0223374	valid's l1: 0.117858	valid's l2: 0.0491491
[1000]	train's l1: 0.0698338	train's l2: 0.0138554	valid's l1: 0.114504	valid's l2: 0.0488563


[500]	train's l1: 0.0885117	train's l2: 0.0224562	valid's l1: 0.116999	valid's l2: 0.0488622
[1000]	train's l1: 0.0699186	train's l2: 0.0137194	valid's l1: 0.113923	valid's l2: 0.048845


[500]	train's l1: 0.0879974	train's l2: 0.0222328	valid's l1: 0.117439	valid's l2: 0.0469434
[1000]	train's l1: 0.0698038	train's l2: 0.0137864	valid's l1: 0.114382	valid's l2: 0.0469582


In [61]:
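# Reference scores noted by the author (presumably validation MAE from earlier runs — TODO confirm):
# 0.107853
# 0.106296
# 0.107481
# 0.106911
# 0.106629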

In [62]:
# Average each feature's importance over the folds and rank features from most
# to least important.
df_importance = (
    pd.concat(df_importance_list)
    .groupby(['column'])['importance']
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)
df_importance.head()

Unnamed: 0,column,importance
0,car_age_day,1352.0
1,v_1_add_v_3,1289.0
2,regionCode,1195.4
3,power,943.4
4,v_0_add_v_12_add_v_14,935.2


In [63]:
# Stack the per-fold out-of-fold frames, then apply expm1 to both the target and
# the prediction before scoring (expm1 is the inverse of log1p; assumes `price`
# was log1p-transformed upstream — TODO confirm).
df_oof = pd.concat(oof).assign(
    **{ycol: lambda frame: np.expm1(frame[ycol])},
    pred=lambda frame: np.expm1(frame['pred']),
)
mae = mean_absolute_error(df_oof[ycol], df_oof['pred'])
print('mae:', mae)
# The file name is fixed (the literal has no placeholder to fill in).
df_oof.to_csv('lgb_oof.csv', index=False, encoding='utf-8')

mae: 506.7871954319213


In [64]:
# Apply expm1 to the fold-averaged test predictions (inverse of log1p; assumes
# the model was trained on a log1p target — TODO confirm).
# NOTE: `prediction` is rewritten in place, so re-running this cell on its own
# would apply expm1 twice — re-run the training cell first.
prediction['price'] = np.expm1(prediction['price'])
sub = prediction.copy(deep=True)
# DataFrame.to_csv does not create missing directories; make sure `sub/` exists.
os.makedirs('sub', exist_ok=True)
# One copy tagged with the CV MAE, plus a fixed-name copy of the latest run.
sub.to_csv('sub/lgb_{}.csv'.format(mae), index=False, encoding='utf-8')
sub.to_csv('lgb.csv', index=False, encoding='utf-8')

In [65]:
# Preview the first rows of the submission frame.
sub.head()

Unnamed: 0,SaleID,price
0,200000,1260.218549
1,200001,1946.626806
2,200002,8230.20099
3,200003,1120.572405
4,200004,1994.787633


In [66]:
# Sanity check: mean predicted price of this submission.
# 5930.6270 is a reference value noted by the author (its origin is not shown here).
sub['price'].mean()

5828.528734210902