In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd

from tqdm import tqdm

import warnings
from pathlib import Path

from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

import xgboost as xgb

from scipy.optimize import minimize

In [3]:
pd.set_option('max_columns', 100)
pd.set_option('display.precision', 4)

warnings.filterwarnings('ignore')

# 데이터 로드

In [4]:
# 로컬 버전

# data_dir = Path('../input/dankook')
# feature_dir = Path('../output/feature')
# val_dir = Path('../output/oof_pred')
# test_dir = Path('../output/test_pred')
# sub_dir = Path('../output/sub')


# train_file = data_dir / 'train.csv'
# test_file = data_dir / 'test.csv'
# sample_file = data_dir / 'sample_submission.csv'

In [5]:
# 코렙 

from google.colab import drive
drive.mount('/content/drive')

data_dir = Path('/content/drive/My Drive/Colab Notebooks/input/dankook')
feature_dir = Path('/content/drive/My Drive/Colab Notebooks/output/feature')
val_dir = Path('/content/drive/My Drive/Colab Notebooks/output/oof_pred')
test_dir = Path('/content/drive/My Drive/Colab Notebooks/output/test_pred')
sub_dir = Path('/content/drive/My Drive/Colab Notebooks/output/sub')

train_file = data_dir / 'train.csv'
test_file = data_dir / 'test.csv'
sample_file = data_dir / 'sample_submission.csv'

Mounted at /content/drive


In [6]:
algorithm_name = 'xgbcv'
feature_name = 'stacking1'
model_name = f'{algorithm_name}_{feature_name}_1'

feature_Ver1_file = feature_dir / f'{feature_name}_Ver1.csv'
feature_Ver2_file = feature_dir / f'{feature_name}_Ver2.csv'
feature_target_file = feature_dir / f'feature_target.csv'

stacking1_oof_pred_file = val_dir / f'{model_name}_oof_pred.csv'
stacking1_test_pred_file = test_dir / f'{model_name}_test_pred.csv'
stacking1_submission_file = sub_dir / f'{model_name}_submission.csv'

In [7]:
SEED = 2020
num_class = 3
n_splits = 5
target_column = 'class'

# Stacking Feature 생성

In [8]:
model_names = ['rfcv_feature','xgbcv_feature','lgbmcv_feature']
all_oof = []
all_test = []
feature_names = []

for model in model_names:
    all_oof.append(np.loadtxt(val_dir / f'{model}_oof_pred_ver1.csv',delimiter=','))
    all_oof.append(np.loadtxt(val_dir / f'{model}_oof_pred_ver2.csv', delimiter=','))
    all_test.append(np.loadtxt(test_dir / f'{model}_test_pred_ver1.csv',delimiter=','))
    all_test.append(np.loadtxt(test_dir / f'{model}_test_pred_ver2.csv', delimiter=','))
    feature_names += [f'{model}_ver1_class0', f'{model}_ver1_class1', f'{model}_ver1_class2',f'{model}_ver2_class0',f'{model}ver2_class1',f'{model}ver2_class2']

all_oof = np.column_stack(all_oof)
all_test = np.column_stack(all_test)
all_oof.shape, all_test.shape, feature_names

((319923, 18),
 (80000, 18),
 ['rfcv_feature_ver1_class0',
  'rfcv_feature_ver1_class1',
  'rfcv_feature_ver1_class2',
  'rfcv_feature_ver2_class0',
  'rfcv_featurever2_class1',
  'rfcv_featurever2_class2',
  'xgbcv_feature_ver1_class0',
  'xgbcv_feature_ver1_class1',
  'xgbcv_feature_ver1_class2',
  'xgbcv_feature_ver2_class0',
  'xgbcv_featurever2_class1',
  'xgbcv_featurever2_class2',
  'lgbmcv_feature_ver1_class0',
  'lgbmcv_feature_ver1_class1',
  'lgbmcv_feature_ver1_class2',
  'lgbmcv_feature_ver2_class0',
  'lgbmcv_featurever2_class1',
  'lgbmcv_featurever2_class2'])

In [9]:
y = pd.read_csv(feature_target_file, index_col=0, usecols=['id',target_column]).values.flatten()
y.shape

(319923,)

# Stacking

- 각 oof마다 fold별로 logloos의 변동이 있으므로 최대한 정보를 뽑아내고자 스태킹을 함.

In [10]:
# Xgboost
stacking_params = {
    'n_jobs' : -1, 
    'n_estimators': 100,
    'eval_metric': 'mlogloss',
    'eta': 0.3, # learning_rate
    'booster': 'gbtree',
    'objective': 'multi:softprob',
    'num_class': 3,
    'random_state': 2020,
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor'
}

In [11]:
mlogloss = []

stk_oof_pred = np.zeros((all_oof.shape[0],num_class))
stk_test_pred = np.zeros((all_test.shape[0],num_class))

kFold = StratifiedKFold(n_splits=n_splits, random_state=2020, shuffle=True)
for fold, (trn_idx, val_idx) in enumerate(kFold.split(all_oof,y)):
    X_train, X_val = all_oof[trn_idx], all_oof[val_idx]
    y_train, y_val = y[trn_idx], y[val_idx]

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)
    watchlist = [(dtrain,'train'), (dval, 'val')]

    xgb_clf = xgb.train(stacking_params, dtrain, 5000, evals=watchlist, early_stopping_rounds=50, verbose_eval=5000)

    dtest = xgb.DMatrix(all_test)
    stk_test_pred += xgb_clf.predict(dtest) / n_splits
    stk_oof_pred[val_idx] = xgb_clf.predict(dval)
    mlogloss.append(xgb_clf.best_score)

print('mean logloss= ', np.mean(mlogloss))

[0]	train-mlogloss:0.756901	val-mlogloss:0.757914
Multiple eval metrics have been passed: 'val-mlogloss' will be used for early stopping.

Will train until val-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[35]	train-mlogloss:0.144442	val-mlogloss:0.153927

[0]	train-mlogloss:0.757122	val-mlogloss:0.757042
Multiple eval metrics have been passed: 'val-mlogloss' will be used for early stopping.

Will train until val-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[39]	train-mlogloss:0.143881	val-mlogloss:0.151902

[0]	train-mlogloss:0.757028	val-mlogloss:0.757459
Multiple eval metrics have been passed: 'val-mlogloss' will be used for early stopping.

Will train until val-mlogloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[41]	train-mlogloss:0.143131	val-mlogloss:0.152079

[0]	train-mlogloss:0.756861	val-mlogloss:0.757276
Multiple eval metrics have been passed: 'val-mlogloss' will be used for early stopping.

Will train until val-mlogloss 

# Ensemble

- cv score가 안정적이여서 Stacking 예측값와 함께 가중치 최적화함
- 가중치 최적화에는 scipy의 minimize 사용

In [12]:
def log_loss_func(weights):
    final_prediction = 0
    for weight, prediction in zip(weights, oof_predictions):
        final_prediction += weight*prediction

    return log_loss(y, final_prediction)

In [13]:
model_names = ['rfcv_feature','xgbcv_feature','lgbmcv_feature']
oof_predictions = []
test_predictions = []
feature_names = []

for model in model_names:
    oof_predictions.append(np.loadtxt(val_dir / f'{model}_oof_pred_ver1.csv',delimiter=','))
    oof_predictions.append(np.loadtxt(val_dir / f'{model}_oof_pred_ver2.csv', delimiter=','))
    test_predictions.append(np.loadtxt(test_dir / f'{model}_test_pred_ver1.csv',delimiter=','))
    test_predictions.append(np.loadtxt(test_dir / f'{model}_test_pred_ver2.csv', delimiter=','))
    feature_names += [f'{model}_ver1_class0', f'{model}_ver1_class1', f'{model}_ver1_class2',f'{model}_ver2_class0',f'{model}ver2_class1',f'{model}ver2_class2']

oof_predictions.append(stk_oof_pred)
test_predictions.append(stk_test_pred)

In [14]:
best_scores = []
weights = []

iteration = 100 # 여러 번 구한 가중치 값 중 Best score를 선택
for i in tqdm(range(iteration)):
    starting_values = np.random.uniform(size=len(oof_predictions))
    bounds = [(0,1)]*len(oof_predictions)
    cons = ({'type':'eq','fun':lambda w: 1-sum(w)})

    res = minimize(log_loss_func,
                   starting_values,
                   method = 'SLSQP',
                   bounds = bounds,
                   constraints = cons,
                   options={'maxiter': 500})
    best_scores.append(res['fun'])
    weights.append(res['x'])

weights = weights[np.argmin(best_scores)]
print('\n')
print('Ensemble score: {}'.format(np.min(best_scores)))
print('Best Weights: {}'.format(weights))

100%|██████████| 100/100 [30:15<00:00, 18.16s/it]



Ensemble score: 0.15330105977979677
Best Weights: [1.39957455e-01 2.25316168e-02 2.85779613e-01 4.48084248e-01
 1.00830802e-17 1.73472348e-18 1.03647067e-01]





In [15]:
final_prediction = 0

# 가중치 적용
for weight, p in zip(weights, test_predictions):
    final_prediction += weight*p

print(final_prediction)

[[1.56306998e-06 5.21625202e-02 2.75571213e-01]
 [3.27723831e-01 4.76528340e-06 6.70147741e-06]
 [5.05853423e-06 2.22960972e-02 3.05434150e-01]
 ...
 [3.27734762e-01 3.44037724e-07 1.95051959e-07]
 [3.80238768e-06 1.97549282e-02 3.07976567e-01]
 [3.27734133e-01 8.09674076e-07 3.56653332e-07]]


# 제출 파일 및 기타 파일 생성

In [16]:
# submission 파일 생성

sub = pd.read_csv(sample_file)
sub[target_column] = np.argmax(final_prediction, axis=1)
sub.to_csv(stacking1_submission_file, index=False, encoding='utf-8-sig')

In [17]:
# stcking1_oof_pred 파일 생성

np.savetxt(stacking1_oof_pred_file, stk_oof_pred, fmt='%.18f',delimiter=',')

In [18]:
# stacking1_test_pred 파일 생성

np.savetxt(stacking1_test_pred_file, stk_test_pred, fmt='%.18f', delimiter=',')

In [19]:
0.93625

0.93625