In [None]:
# Enable IPython's autoreload extension in mode 2 (reload all modules before
# executing each cell).
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [None]:
import numpy as np
import pandas as pd

from tqdm import tqdm

import warnings
from pathlib import Path

from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

from sklearn.linear_model import LogisticRegression

from scipy.optimize import minimize

In [None]:
# Notebook display settings.
# The fully qualified option name is required: the abbreviated 'max_columns'
# pattern is ambiguous on recent pandas releases (it also matches
# 'styler.render.max_columns') and raises OptionError there.
pd.set_option('display.max_columns', 100)
pd.set_option('display.precision', 4)

# Suppress every warning to keep cell output short.
# NOTE: this is a blanket filter, so warnings raised by the models below are hidden too.
warnings.filterwarnings('ignore')

# 데이터 로드

In [None]:
# Local version (currently disabled): relative-path configuration for the same
# directory and file variables.

# data_dir = Path('../input/dankook')
# feature_dir = Path('../output/feature')
# val_dir = Path('../output/oof_pred')
# test_dir = Path('../output/test_pred')
# sub_dir = Path('../output/sub')


# train_file = data_dir / 'train.csv'
# test_file = data_dir / 'test.csv'
# sample_file = data_dir / 'sample_submission.csv'

In [None]:
# Colab version: mount Google Drive, then point every directory at the Drive copy.

from google.colab import drive
drive.mount('/content/drive')

# Directories for input data, features, OOF predictions, test predictions and submissions.
data_dir, feature_dir, val_dir, test_dir, sub_dir = map(Path, (
    '/content/drive/My Drive/Colab Notebooks/input/dankook',
    '/content/drive/My Drive/Colab Notebooks/output/feature',
    '/content/drive/My Drive/Colab Notebooks/output/oof_pred',
    '/content/drive/My Drive/Colab Notebooks/output/test_pred',
    '/content/drive/My Drive/Colab Notebooks/output/sub',
))

# CSV files located directly under the input data directory.
train_file, test_file, sample_file = (
    data_dir / csv_name
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Identifiers from which every file name of this run is derived.
algorithm_name, feature_name = 'lrcv', 'stacking2'
model_name = f'{algorithm_name}_{feature_name}_1'

# Feature files under feature_dir.
feature_Ver1_file = feature_dir.joinpath(f'{feature_name}_Ver1.csv')
feature_Ver2_file = feature_dir.joinpath(f'{feature_name}_Ver2.csv')
feature_target_file = feature_dir.joinpath('feature_target.csv')

# Output files of this notebook: OOF predictions, test predictions, submission.
stacking1_oof_pred_file = val_dir.joinpath(f'{model_name}_oof_pred.csv')
stacking1_test_pred_file = test_dir.joinpath(f'{model_name}_test_pred.csv')
stacking1_submission_file = sub_dir.joinpath(f'{model_name}_submission.csv')

In [None]:
# Settings shared by the cells below:
# random seed, number of target classes, number of CV folds.
SEED, num_class, n_splits = 2020, 3, 5
# Name of the label column in the target file.
target_column = 'class'

# Stacking Feature 생성

In [None]:
def load_data(model_names, oof_list, test_list, feature_names=None,
              oof_dir=None, test_pred_dir=None):
    """Load the saved OOF and test predictions of each model into the given lists.

    For every name in ``model_names`` the files ``{name}_oof_pred.csv`` and
    ``{name}_test_pred.csv`` are read with ``np.loadtxt`` (comma-delimited) and
    appended to ``oof_list`` and ``test_list`` respectively.

    Parameters
    ----------
    model_names : iterable of str
        File-name prefixes of the models to load.
    oof_list : list
        Mutated in place: one OOF prediction array is appended per model.
    test_list : list
        Mutated in place: one test prediction array is appended per model.
    feature_names : list of str, optional
        When given, mutated in place: ``{name}_class0``, ``{name}_class1`` and
        ``{name}_class2`` are appended per model, in that order.
    oof_dir, test_pred_dir : path-like, optional
        Directories to read from. Default to the notebook-level ``val_dir``
        and ``test_dir``.

    Returns
    -------
    None
    """
    # Fall back to the notebook-level directories when none are supplied.
    oof_dir = val_dir if oof_dir is None else Path(oof_dir)
    test_pred_dir = test_dir if test_pred_dir is None else Path(test_pred_dir)

    for model in model_names:
        oof_list.append(np.loadtxt(oof_dir / f'{model}_oof_pred.csv', delimiter=','))
        test_list.append(np.loadtxt(test_pred_dir / f'{model}_test_pred.csv', delimiter=','))
        if feature_names is not None:
            # Extend with a list, not a set: a set has no defined iteration
            # order, so the names would not follow the class0/1/2 order.
            feature_names += [f'{model}_class0', f'{model}_class1', f'{model}_class2']

In [None]:
# File-name prefixes of the models whose saved predictions become stacking features.
model_names = [
    'xgbcv_stacking1_1', 'xgbcv_stacking1_2', 'xgbcv_stacking1_3', 'xgbcv_stacking1_4',
    'xgbcv_stacking1_5', 'xgbcv_stacking1_6', 'xgbcv_stacking1_7', 'xgbcv_stacking1_8',
    'lgbmcv_stacking1_1', 'lgbmcv_stacking1_2', 'lgbmcv_stacking1_3', 'lgbmcv_stacking1_4',
    'rfcv_stacking1_1', 'lrcv_stacking1_1', 'gbcv_stacking1_1', 'etscv_stacking1_1',
    'adacv_stacking1_1',
]

# load_data appends to these lists in place.
all_oof, all_test, feature_names = [], [], []
load_data(model_names, all_oof, all_test, feature_names)

# Place every model's prediction columns side by side.
all_oof, all_test = np.column_stack(all_oof), np.column_stack(all_test)
all_oof.shape, all_test.shape, feature_names

((319923, 51),
 (80000, 51),
 ['xgbcv_stacking1_1_class1',
  'xgbcv_stacking1_1_class2',
  'xgbcv_stacking1_1_class0',
  'xgbcv_stacking1_2_class1',
  'xgbcv_stacking1_2_class0',
  'xgbcv_stacking1_2_class2',
  'xgbcv_stacking1_3_class1',
  'xgbcv_stacking1_3_class0',
  'xgbcv_stacking1_3_class2',
  'xgbcv_stacking1_4_class0',
  'xgbcv_stacking1_4_class2',
  'xgbcv_stacking1_4_class1',
  'xgbcv_stacking1_5_class0',
  'xgbcv_stacking1_5_class1',
  'xgbcv_stacking1_5_class2',
  'xgbcv_stacking1_6_class2',
  'xgbcv_stacking1_6_class0',
  'xgbcv_stacking1_6_class1',
  'xgbcv_stacking1_7_class0',
  'xgbcv_stacking1_7_class1',
  'xgbcv_stacking1_7_class2',
  'xgbcv_stacking1_8_class1',
  'xgbcv_stacking1_8_class2',
  'xgbcv_stacking1_8_class0',
  'lgbmcv_stacking1_1_class1',
  'lgbmcv_stacking1_1_class0',
  'lgbmcv_stacking1_1_class2',
  'lgbmcv_stacking1_2_class0',
  'lgbmcv_stacking1_2_class1',
  'lgbmcv_stacking1_2_class2',
  'lgbmcv_stacking1_3_class2',
  'lgbmcv_stacking1_3_class1',
  '

In [None]:
# Read the 'id' and target columns, index by the first of them, and flatten
# the remaining values into a 1-D array.
y = (
    pd.read_csv(feature_target_file, usecols=['id', target_column], index_col=0)
    .to_numpy()
    .flatten()
)
y.shape

(319923,)

# Stacking

- 각 oof마다 fold별로 logloss의 변동이 있으므로 최대한 정보를 뽑아내고자 스태킹을 함.

In [None]:
# Keyword arguments passed to LogisticRegression for the stacking model.
lr_params = dict(
    multi_class='multinomial',
    penalty='elasticnet',
    l1_ratio=0.01,
    solver='saga',
    C=1.0,
    max_iter=100000,
    class_weight=None,
    verbose=0,
    n_jobs=-1,
    random_state=SEED,
)

In [None]:
# Validation log loss of each fold.
mlogloss = []

# Out-of-fold predictions and fold-averaged test predictions, one column per class.
stk_oof_pred = np.zeros((all_oof.shape[0], num_class))
stk_test_pred = np.zeros((all_test.shape[0], num_class))

# Seed the splitter from the shared SEED constant instead of repeating the
# literal, so changing SEED changes the folds as well.
kFold = StratifiedKFold(n_splits=n_splits, random_state=SEED, shuffle=True)
for fold, (trn_idx, val_idx) in enumerate(kFold.split(all_oof, y)):
    print(f'Traning model for CV #{fold+1}')
    X_train, X_val = all_oof[trn_idx], all_oof[val_idx]
    y_train, y_val = y[trn_idx], y[val_idx]

    lr_clf = LogisticRegression(**lr_params)
    lr_clf.fit(X_train, y_train)

    # Each fold contributes 1/n_splits of the final test prediction.
    stk_test_pred += lr_clf.predict_proba(all_test) / n_splits
    # Plain assignment: these rows are still zero, since every row is
    # validated in exactly one fold.
    stk_oof_pred[val_idx] = lr_clf.predict_proba(X_val)

    # Compute the fold score once and reuse it for the list and the log line.
    fold_logloss = log_loss(y_val, stk_oof_pred[val_idx])
    mlogloss.append(fold_logloss)
    print(f'{fold+1} logloss = {fold_logloss}')

print('mean logloss= ', np.mean(mlogloss))

Traning model for CV #1
1 logloss = 0.15548827313789104
Traning model for CV #2
2 logloss = 0.15413176289317373
Traning model for CV #3
3 logloss = 0.1531388666983933
Traning model for CV #4
4 logloss = 0.15532137300163965
Traning model for CV #5
5 logloss = 0.15397486311830047
mean logloss=  0.15441102776987964


# Ensemble

- cv score가 안정적이어서 Stacking 예측값과 함께 가중치를 최적화함
- 가중치 최적화에는 scipy의 minimize 사용

In [None]:
def log_loss_func(weights):
    """Return the log loss of the weighted sum of the OOF predictions.

    Each entry of ``weights`` is paired by position with an entry of the
    notebook-level ``oof_predictions``; the weighted predictions are summed
    and scored against the notebook-level ``y`` with ``log_loss``.
    """
    blended = sum(w * oof for w, oof in zip(weights, oof_predictions))
    return log_loss(y, blended)

In [None]:
# Candidate predictions for the weighted ensemble; load_data appends in place.
oof_predictions = []
test_predictions = []

model_names = ['xgbcv_stacking1_1', 'xgbcv_stacking1_2', 'xgbcv_stacking1_3', 'xgbcv_stacking1_4','xgbcv_stacking1_5', 'xgbcv_stacking1_6', 'xgbcv_stacking1_7', 'xgbcv_stacking1_8',
               'lgbmcv_stacking1_1','lgbmcv_stacking1_2','lgbmcv_stacking1_3','lgbmcv_stacking1_4',
               'rfcv_stacking1_1','lrcv_stacking1_1','gbcv_stacking1_1','etscv_stacking1_1','adacv_stacking1_1']
load_data(model_names,oof_predictions, test_predictions)

# The stacking model's own predictions are the last candidate.
oof_predictions.append(stk_oof_pred)
test_predictions.append(stk_test_pred)

# Show counts and the distinct array shapes only; echoing the full arrays
# floods the cell output without adding information.
(len(oof_predictions), {p.shape for p in oof_predictions},
 len(test_predictions), {p.shape for p in test_predictions})

(18, [array([[9.99995112e-01, 2.72299508e-06, 2.17522825e-06],
         [4.74168657e-04, 5.37755191e-01, 4.61770654e-01],
         [9.99741137e-01, 1.66911763e-04, 9.19709055e-05],
         ...,
         [9.99995112e-01, 2.72299508e-06, 2.17522825e-06],
         [9.99997854e-01, 1.05149536e-06, 1.07823098e-06],
         [9.99985695e-01, 5.12483257e-06, 9.21028095e-06]]),
  array([[9.99998212e-01, 8.02710645e-07, 9.54504230e-07],
         [1.21305883e-03, 5.26216209e-01, 4.72570747e-01],
         [9.99936461e-01, 4.05087958e-05, 2.30528822e-05],
         ...,
         [9.99998450e-01, 4.60678990e-07, 1.05475306e-06],
         [9.99997735e-01, 1.05631364e-06, 1.15372768e-06],
         [9.99996305e-01, 1.05324204e-06, 2.68697909e-06]]),
  array([[9.99997735e-01, 1.36645406e-06, 8.50908236e-07],
         [1.15951675e-03, 4.71601397e-01, 5.27239144e-01],
         [9.99813139e-01, 8.85812260e-05, 9.83299906e-05],
         ...,
         [9.99998331e-01, 8.53971983e-07, 8.01132956e-07],
      

In [None]:
best_scores = []
candidate_weights = []

iteration = 100  # number of random restarts; the best-scoring weights are kept

# Seeded generator so the random starting points, and therefore the selected
# weights, are the same on every run.
rng = np.random.default_rng(SEED)

# Loop-invariant optimizer settings: each weight lies in [0, 1] and the
# weights must sum to 1.
n_models = len(oof_predictions)
bounds = [(0, 1)] * n_models
cons = ({'type': 'eq', 'fun': lambda w: 1 - np.sum(w)})

for i in tqdm(range(iteration)):
    starting_values = rng.uniform(size=n_models)

    res = minimize(log_loss_func,
                   starting_values,
                   method='SLSQP',
                   bounds=bounds,
                   constraints=cons,
                   options={'maxiter': 500})
    best_scores.append(res['fun'])
    candidate_weights.append(res['x'])

# Keep the weight vector from the restart with the lowest log loss.
weights = candidate_weights[np.argmin(best_scores)]
print('\n')
print('Ensemble score: {}'.format(np.min(best_scores)))
print('Best Weights: {}'.format(weights))

100%|██████████| 100/100 [1:33:15<00:00, 55.96s/it]



Ensemble score: 0.15010975700835427
Best Weights: [2.92578068e-02 3.90816807e-02 0.00000000e+00 1.08864733e-01
 8.85893759e-02 8.44552507e-02 9.84547595e-02 8.20115029e-02
 2.60208521e-18 6.93889390e-18 3.46944695e-18 2.16840434e-18
 4.11255674e-02 0.00000000e+00 3.02128384e-01 1.19676557e-01
 0.00000000e+00 6.35438185e-03]





In [None]:
# Apply the weights: sum each test prediction multiplied by its paired weight.
final_prediction = sum(w * pred for w, pred in zip(weights, test_predictions))

print(final_prediction)

[[1.55459869e-04 1.62720904e-01 8.37123635e-01]
 [9.99888868e-01 3.50339418e-05 7.61009399e-05]
 [1.39507074e-04 8.89575272e-02 9.10902971e-01]
 ...
 [9.99896807e-01 3.01997906e-05 7.29920794e-05]
 [1.21576575e-04 5.17715487e-02 9.48106878e-01]
 [9.99895243e-01 3.07012403e-05 7.40538125e-05]]


# 제출 파일 및 기타 파일 생성

In [None]:
# Create the submission file from the weighted-ensemble prediction.
# (Alternative kept from the original cell: pass stk_test_pred to np.argmax
# instead of final_prediction to submit the stacking model on its own.)

sub = pd.read_csv(sample_file).assign(
    **{target_column: np.argmax(final_prediction, axis=1)}
)
sub.to_csv(stacking1_submission_file, encoding='utf-8-sig', index=False)

In [None]:
# Write the stacking model's out-of-fold predictions to stacking1_oof_pred_file.

np.savetxt(fname=stacking1_oof_pred_file, X=stk_oof_pred, delimiter=',', fmt='%.18f')

In [None]:
# Write the stacking model's test predictions to stacking1_test_pred_file.

np.savetxt(fname=stacking1_test_pred_file, X=stk_test_pred, delimiter=',', fmt='%.18f')