In [1]:
# Build LightGBM with GPU support from source, then install its Python package.
# - --recursive also fetches the git submodules the build depends on.
# - cmake runs inside build/ against the parent source tree (out-of-source build);
#   previously build/ was created but never used and cmake received no source path.
# - --precompile makes setup.py reuse the library built here instead of recompiling it.
!git clone --recursive https://github.com/Microsoft/LightGBM \
&& cd LightGBM \
&& mkdir build \
&& cd build \
&& cmake -DUSE_GPU=1 .. \
&& make -j$(nproc) \
&& sudo apt-get -y install python-pip \
&& sudo -H pip install setuptools pandas numpy scipy scikit-learn -U \
&& cd /content/LightGBM/python-package \
&& sudo python setup.py install --precompile

Cloning into 'LightGBM'...
remote: Enumerating objects: 19510, done.[K
remote: Total 19510 (delta 0), reused 0 (delta 0), pack-reused 19510[K
Receiving objects: 100% (19510/19510), 15.67 MiB | 28.00 MiB/s, done.
Resolving deltas: 100% (14252/14252), done.
-- The C compiler identification is GNU 7.5.0
-- The CXX compiler identification is GNU 7.5.0
-- Check for working C compiler: /usr/bin/cc
-- Check for working C compiler: /usr/bin/cc -- works
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Detecting C compile features
-- Detecting C compile features - done
-- Check for working CXX compiler: /usr/bin/c++
-- Check for working CXX compiler: /usr/bin/c++ -- works
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Found OpenMP_C: -fopenmp (found version "4.5") 
-- Found OpenMP_CXX: -fopenmp (found version "4.5") 
-- Found OpenMP: TRUE (found version "4.5") 

In [2]:
# Load (or reload) the autoreload extension and reload all modules before each cell runs.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
import numpy as np
import pandas as pd

from tqdm import tqdm

import warnings
from pathlib import Path

from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

import lightgbm as lgbm

from scipy.optimize import minimize

In [4]:
# Display options: show up to 100 columns and 4 digits of float precision.
# The fully-qualified key is required: the bare 'max_columns' pattern also matches
# 'styler.render.max_columns' on pandas >= 1.4, which makes set_option raise OptionError.
pd.set_option('display.max_columns', 100)
pd.set_option('display.precision', 4)

# Silence every warning for the rest of the session to keep cell output readable.
warnings.filterwarnings('ignore')

# 데이터 로드

In [5]:
# Local version (paths for running outside Colab)

# data_dir = Path('../input/dankook')
# feature_dir = Path('../output/feature')
# val_dir = Path('../output/oof_pred')
# test_dir = Path('../output/test_pred')
# sub_dir = Path('../output/sub')


# train_file = data_dir / 'train.csv'
# test_file = data_dir / 'test.csv'
# sample_file = data_dir / 'sample_submission.csv'

In [6]:
# Colab version: mount Google Drive, then point every project directory at it.

from google.colab import drive
drive.mount('/content/drive')

# Input data, engineered features, OOF predictions, test predictions, submissions.
data_dir, feature_dir, val_dir, test_dir, sub_dir = map(Path, (
    '/content/drive/My Drive/Colab Notebooks/input/dankook',
    '/content/drive/My Drive/Colab Notebooks/output/feature',
    '/content/drive/My Drive/Colab Notebooks/output/oof_pred',
    '/content/drive/My Drive/Colab Notebooks/output/test_pred',
    '/content/drive/My Drive/Colab Notebooks/output/sub',
))

# Raw competition files, all located directly under data_dir.
train_file, test_file, sample_file = (
    data_dir / csv_name
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

Mounted at /content/drive


In [7]:
# Identifiers that determine the name of every artefact written by this notebook.
algorithm_name, feature_name = 'lgbmcv', 'stacking1'
model_name = f'{algorithm_name}_{feature_name}_3'

# Feature files, resolved under feature_dir.
feature_Ver1_file = feature_dir.joinpath(f'{feature_name}_Ver1.csv')
feature_Ver2_file = feature_dir.joinpath(f'{feature_name}_Ver2.csv')
feature_target_file = feature_dir.joinpath('feature_target.csv')

# Output files of the stacking model: OOF predictions, test predictions, submission.
stacking1_oof_pred_file = val_dir.joinpath(f'{model_name}_oof_pred.csv')
stacking1_test_pred_file = test_dir.joinpath(f'{model_name}_test_pred.csv')
stacking1_submission_file = sub_dir.joinpath(f'{model_name}_submission.csv')

In [8]:
# Experiment-wide settings shared by the cells below.
SEED: int = 2020              # random seed passed to the model parameters
num_class: int = 3            # width of the prediction arrays
n_splits: int = 5             # number of cross-validation folds
target_column: str = 'class'  # name of the label column

# Stacking Feature 생성

In [9]:
def load_data(model_names, oof_list, test_list, feature_names=None, number_of_versions=None):
    """Append the saved OOF and test predictions of each model to the given lists.

    For every model in ``model_names`` and every selected prediction version ``N``,
    the files ``{model}_oof_pred_ver{N}.csv`` (under ``val_dir``) and
    ``{model}_test_pred_ver{N}.csv`` (under ``test_dir``) are read with
    ``np.loadtxt`` and appended to ``oof_list`` / ``test_list`` in place.

    Args:
        model_names: Iterable of model-name prefixes used in the file names.
        oof_list: List that receives one array per (model, version) OOF file.
        test_list: List that receives one array per (model, version) test file.
        feature_names: Optional list extended in place with three column names
            (``{model}_ver{N}_class0`` .. ``class2``) per loaded version.
            Nothing is recorded when it is None.
        number_of_versions: Code selecting which versions are loaded:
            ``None`` or ``1`` -> ver1; ``2`` -> ver1 and ver2;
            ``2.1`` -> ver1 and ver3; ``3`` -> ver1, ver2 and ver3.

    Returns:
        None. Results are delivered by mutating the list arguments.

    Raises:
        ValueError: If ``number_of_versions`` is not one of the supported codes.
            Such a call used to load nothing without any error.
    """
    # Version numbers to read for each supported code (2.1 means "ver1 + ver3").
    versions_by_code = {None: (1,), 1: (1,), 2: (1, 2), 2.1: (1, 3), 3: (1, 2, 3)}
    try:
        versions = versions_by_code[number_of_versions]
    except (KeyError, TypeError):
        raise ValueError(
            f'unsupported number_of_versions: {number_of_versions!r} '
            '(expected None, 1, 2, 2.1 or 3)'
        ) from None

    # The generated column names assume three class columns per prediction file.
    classes_per_file = 3

    for model in model_names:
        # Read order per model is unchanged: all OOF files first, then all test files.
        oof_list.extend(
            np.loadtxt(val_dir / f'{model}_oof_pred_ver{ver}.csv', delimiter=',')
            for ver in versions
        )
        test_list.extend(
            np.loadtxt(test_dir / f'{model}_test_pred_ver{ver}.csv', delimiter=',')
            for ver in versions
        )
        if feature_names is not None:
            feature_names += [
                f'{model}_ver{ver}_class{cls}'
                for ver in versions
                for cls in range(classes_per_file)
            ]

In [10]:
# Accumulators that load_data fills in place.
all_oof, all_test, feature_names = [], [], []

# Groups of base models paired with the version code passed to load_data.
stacking_inputs = [
    (['lrcv_feature', 'etscv_feature', 'rfcv_feature', 'gbcv_feature','xgbcv_feature','lgbmcv_feature','adacv_feature_2'], 3),
    (['lrcv_polynomial_feature','rfcv_polynomial_feature','etscv_polynomial_feature','gbcv_polynomial_feature','adacv_polynomial_feature_2'], 3),
    (['xgbcv_polynomial_feature','lgbmcv_polynomial_feature'], 2.1),
]
for model_names, version_code in stacking_inputs:
    load_data(model_names, all_oof, all_test, feature_names, version_code)

# Join the per-file prediction arrays side by side into one feature matrix each.
all_oof = np.column_stack(all_oof)
all_test = np.column_stack(all_test)

In [11]:
# Read only the id and label columns; the first of them (presumably id) becomes the index,
# which leaves a single label column that is flattened into a 1-D array.
target_frame = pd.read_csv(feature_target_file, usecols=['id', target_column], index_col=0)
y = target_frame.values.flatten()
y.shape

(319923,)

# Stacking

- 각 oof마다 fold별로 logloss의 변동이 있으므로 최대한 정보를 뽑아내고자 스태킹을 함.

In [12]:
# LightGBM parameters for the stacking model.
# NOTE(review): LightGBM gives a `num_iterations` entry in params precedence over the
# `num_boost_round` argument of `lgbm.train`, so this value (100) is the effective cap on
# boosting rounds — confirm that is intended.
lgbm_params = {
    'num_threads': -1, # aliases: n_jobs
    'num_iterations': 100, # aliases: n_estimators
    'metric': 'multi_logloss',
    'learning_rate': 0.3, # aliases: eta
    'boosting': 'gbdt', # aliases: boosting_type
    'objective': 'multiclass', # aliases: softmax
    'num_class': num_class, # shared constant, so it cannot drift from the prediction array width
    'random_state': SEED,
    'device_type': 'gpu', # aliases: device
    'gpu_use_dp': 'true',
    'verbosity': 0, # aliases: verbose
}

In [13]:
# Train the LightGBM stacking model with stratified K-fold CV on the stacked OOF features.
mlogloss = []  # best validation multi_logloss of each fold

# Rows of stk_oof_pred are filled by the fold in which they were held out;
# stk_test_pred accumulates the average of the per-fold test predictions.
stk_oof_pred = np.zeros((all_oof.shape[0],num_class))
stk_test_pred = np.zeros((all_test.shape[0],num_class))

# Seed the splitter from the shared SEED constant rather than a duplicated literal,
# so the folds and the model parameters cannot drift apart.
kFold = StratifiedKFold(n_splits=n_splits, random_state=SEED, shuffle=True)
for fold, (trn_idx, val_idx) in enumerate(kFold.split(all_oof,y)):
    X_train, X_val = all_oof[trn_idx], all_oof[val_idx]
    y_train, y_val = y[trn_idx], y[val_idx]

    dtrain = lgbm.Dataset(X_train, label=y_train)
    dval = lgbm.Dataset(X_val, label=y_val)

    # valid_sets order fixes the names reported by LightGBM: dtrain -> 'training', dval -> 'valid_1'.
    # NOTE(review): lgbm_params also sets `num_iterations`; LightGBM uses that value in
    # preference to num_boost_round, so 5000 is likely not the effective cap — confirm.
    lgbm_clf = lgbm.train(params=lgbm_params, train_set=dtrain, num_boost_round=5000,valid_sets=[dtrain,dval], early_stopping_rounds=50, verbose_eval=5000)
    mlogloss.append(lgbm_clf.best_score['valid_1']['multi_logloss'])

    # Each fold contributes 1/n_splits of the final test prediction.
    stk_test_pred += lgbm_clf.predict(all_test) / n_splits
    stk_oof_pred[val_idx] = lgbm_clf.predict(X_val)

print('mean logloss= ', np.mean(mlogloss))

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[20]	training's multi_logloss: 0.139189	valid_1's multi_logloss: 0.15446
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[17]	training's multi_logloss: 0.142971	valid_1's multi_logloss: 0.153959
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[16]	training's multi_logloss: 0.14363	valid_1's multi_logloss: 0.155158
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[20]	training's multi_logloss: 0.140505	valid_1's multi_logloss: 0.153952
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[18]	training's multi_logloss: 0.143288	valid_1's multi_logloss: 0.154996
mean logloss=  0.1545048718917065


In [14]:
# Create the submission file: the predicted class is the arg-max of the averaged test predictions.

sub = pd.read_csv(sample_file)
predicted_class = stk_test_pred.argmax(axis=1)
sub[target_column] = predicted_class
sub.to_csv(stacking1_submission_file, encoding='utf-8-sig', index=False)

In [15]:
# Write the stacking model's out-of-fold predictions to stacking1_oof_pred_file.

np.savetxt(
    fname=stacking1_oof_pred_file,
    X=stk_oof_pred,
    delimiter=',',
    fmt='%.18f',
)

In [16]:
# Write the stacking model's fold-averaged test predictions to stacking1_test_pred_file.

np.savetxt(
    fname=stacking1_test_pred_file,
    X=stk_test_pred,
    delimiter=',',
    fmt='%.18f',
)