# 라이브러리 로드

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd

import warnings
from pathlib import Path

from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import StratifiedKFold

import lightgbm as lgbm

In [3]:
pd.set_option('max_columns', 100)
pd.set_option('display.precision', 4)

warnings.filterwarnings('ignore')

# 학습데이터 로드

In [4]:
data_dir = Path('../data/dacon-novel-author-classification')
feature_dir = Path('../build/feature')
val_dir = Path('../build/val')
tst_dir = Path('../build/tst')
sub_dir = Path('../build/sub')

trn_file = data_dir / 'train.csv'
tst_file = data_dir / 'test_x.csv'
sample_file = data_dir / 'sample_submission.csv'

target_col = 'author'
n_fold = 5
n_class = 5
seed = 2020

In [5]:
algorithm_name = 'lgbm'

feature_names= ['stacking-layer2']

feature_target_file = feature_dir / f'feature_target.csv'

model_names = []
for feature_name in feature_names:
    model_names.append(f'{algorithm_name}_{feature_name}')
    
stacking_oof_pred_files=[]
for model_name in model_names:
    stacking_oof_pred_files.append( val_dir / f'{model_name}_oof_pred.csv')
    
stacking_test_pred_files=[]
for model_name in model_names:
    stacking_test_pred_files.append( tst_dir / f'{model_name}_test_pred.csv')
    
stacking_submission_files=[]
for model_name in model_names:
    stacking_submission_files.append( sub_dir / f'{model_name}_submission.csv')

# Stacking feature 생성

In [6]:
def load_feature(model_names, number_of_ver=None, kind=None):
    oof_list = []
    test_list = []
    
    if number_of_ver==None or kind==None:
        print('error')
        return None
    
    # 딥러닝 시리즈 4가지 버전
    if kind == 0:
        for model in model_names:
            print(f'load {model}_cv')
            for i in range(1,number_of_ver+1):
                oof_list.append(np.loadtxt(val_dir / f'{model}_oof_pred_ver{i}.csv', delimiter=','))
                test_list.append(np.loadtxt(tst_dir / f'{model}_test_pred_ver{i}.csv', delimiter=','))
    
    # 로지스틱 회귀 6가지 버전
    elif kind == 1:
        for model in model_names:
            print(f'load {model}_cv')
            for i in range(1, number_of_ver+1):
                oof_list.append(np.loadtxt(val_dir / f'{model}_oof_pred_ver{i}.csv', delimiter=','))
                test_list.append(np.loadtxt(tst_dir / f'{model}_test_pred_ver{i}.csv', delimiter=','))

    # 신경망 기반 불용어 처리 21가지 버전 또는 머신러닝 기반 불용어 처리 18가지 버전
    elif kind == 2:
        for model in model_names:
            print(f'load {model}_cv')
            if model.find('feature') != -1:
                for i in range(2,5):
                    oof_list.append(np.loadtxt(val_dir / f'{model}_oof_pred_ver{i}.csv',delimiter=','))
                    test_list.append(np.loadtxt(tst_dir / f'{model}_test_pred_ver{i}.csv', delimiter=','))
            elif model.find('tfidf') != -1:
                for i in range(1,4):
                    oof_list.append(np.loadtxt(val_dir / f'{model}_oof_pred_ver{i}.csv',delimiter=','))
                    test_list.append(np.loadtxt(tst_dir / f'{model}_test_pred_ver{i}.csv', delimiter=','))
            elif model.find('hashing') != -1:
                for i in range(1,4):
                    oof_list.append(np.loadtxt(val_dir / f'{model}_oof_pred_ver{i}.csv',delimiter=','))
                    test_list.append(np.loadtxt(tst_dir / f'{model}_test_pred_ver{i}.csv', delimiter=','))
            elif model.find('bow') != -1:
                for i in range(1,4):
                    oof_list.append(np.loadtxt(val_dir / f'{model}_oof_pred_ver{i}.csv',delimiter=','))
                    test_list.append(np.loadtxt(tst_dir / f'{model}_test_pred_ver{i}.csv', delimiter=','))
            else:
                print('not found')
    
    # 신경만 기반 불용어 처리 X 13가지 버전 또는 머신러닝 기반 불용어 처리 X 18가지 버전
    elif kind == 3:
        for model in model_names:
            print(f'load {model}_cv')
            if model.find('feature') != -1:
                for i in range(1,2):
                    oof_list.append(np.loadtxt(val_dir / f'{model}_oof_pred_ver{i}.csv',delimiter=','))
                    test_list.append(np.loadtxt(tst_dir / f'{model}_test_pred_ver{i}.csv', delimiter=','))
            elif model.find('tfidf') != -1:
                for i in range(4,7):
                    oof_list.append(np.loadtxt(val_dir / f'{model}_oof_pred_ver{i}.csv',delimiter=','))
                    test_list.append(np.loadtxt(tst_dir / f'{model}_test_pred_ver{i}.csv', delimiter=','))
            elif model.find('hashing') != -1:
                for i in range(4,7):
                    oof_list.append(np.loadtxt(val_dir / f'{model}_oof_pred_ver{i}.csv',delimiter=','))
                    test_list.append(np.loadtxt(tst_dir / f'{model}_test_pred_ver{i}.csv', delimiter=','))
            elif model.find('bow') != -1:
                for i in range(4,7):
                    oof_list.append(np.loadtxt(val_dir / f'{model}_oof_pred_ver{i}.csv',delimiter=','))
                    test_list.append(np.loadtxt(tst_dir / f'{model}_test_pred_ver{i}.csv', delimiter=','))
            else:
                print('not found')
                
    # 첫번째 레이어를 학습하기 위한 데이터셋 모두 가져오기 버전
    elif kind == 4:
        for model in model_names:
            print(f'load {model}_cv')
            if model.find('feature') != -1:
                for i in range(1,5):
                    oof_list.append(np.loadtxt(val_dir / f'{model}_oof_pred_ver{i}.csv',delimiter=','))
                    test_list.append(np.loadtxt(tst_dir / f'{model}_test_pred_ver{i}.csv', delimiter=','))
            elif model.find('tfidf') != -1:
                for i in range(1,7):
                    oof_list.append(np.loadtxt(val_dir / f'{model}_oof_pred_ver{i}.csv',delimiter=','))
                    test_list.append(np.loadtxt(tst_dir / f'{model}_test_pred_ver{i}.csv', delimiter=','))
            elif model.find('hashing') != -1:
                for i in range(1,7):
                    oof_list.append(np.loadtxt(val_dir / f'{model}_oof_pred_ver{i}.csv',delimiter=','))
                    test_list.append(np.loadtxt(tst_dir / f'{model}_test_pred_ver{i}.csv', delimiter=','))
            elif model.find('bow') != -1:
                for i in range(1,7):
                    oof_list.append(np.loadtxt(val_dir / f'{model}_oof_pred_ver{i}.csv',delimiter=','))
                    test_list.append(np.loadtxt(tst_dir / f'{model}_test_pred_ver{i}.csv', delimiter=','))
            else:
                print('not found')
    
    # 두번째 레이어를 학습하기 위한 데이터셋 모두 가져오기 버전
    elif kind == 5:
        for model in model_names:
            print(f'load {model}_cv')
            if model.find('stacking') != -1:
                for feature in ['stopwords-yes-nn','stopwords-no-nn','stopwords-no-ml', 'stopwords-no-ml'] :
                    oof_list.append(np.loadtxt(val_dir / f'{model}-{feature}_oof_pred.csv',delimiter=','))
                    test_list.append(np.loadtxt(tst_dir / f'{model}-{feature}_test_pred.csv',delimiter=','))
            elif model.find('all') != -1:
                oof_list.append(np.loadtxt(val_dir / f'{model}_oof_pred.csv',delimiter=','))
                test_list.append(np.loadtxt(tst_dir / f'{model}_test_pred.csv',delimiter=','))
    
    # 두번째 레이어를 학습하기 위한 PCA 데이터셋 + xgb/lgbm tfidf 데이터셋 가져오기 버전 - 하나씩
    elif kind ==6:
        for model in model_names:
            print(f'load {model}_cv')
            oof_list.append(np.loadtxt(val_dir / f'{model}_oof_pred.csv', delimiter=','))
            test_list.append(np.loadtxt(tst_dir / f'{model}_test_pred.csv', delimiter=','))
            
                        
        
    
    return oof_list, test_list

In [7]:
model_names = ['mlp_tfidf-pca', 'lr_tfidf-pca', 'cnn_tfidf-pca', 'lgbm_tfidf','lgbm_tfidf-pca']

tmp_oof, tmp_test = load_feature(model_names, -1, 6)
tmp_oof = np.concatenate(tmp_oof, axis=1)
tmp_test = np.concatenate(tmp_test, axis=1)
print(f'shape : {tmp_oof.shape}, {tmp_test.shape}')

load mlp_tfidf-pca_cv
load lr_tfidf-pca_cv
load cnn_tfidf-pca_cv
load lgbm_tfidf_cv
load lgbm_tfidf-pca_cv
shape : (54879, 25), (19617, 25)


In [8]:
model_names = ['ets_stacking-layer1','gb_stacking-layer1','lgbm_stacking-layer1','mlp_stacking-layer1',
              'rf_stacking-layer1','xgb_stacking-layer1','ada_stacking-layer1',
              'xgb_all','lgbm_all']

all_oof, all_test = load_feature(model_names, -1, 5)
all_oof = np.concatenate(all_oof, axis=1)
all_test = np.concatenate(all_test, axis=1)
print(f'shape : {all_oof.shape}, {all_test.shape}')

load ets_stacking-layer1_cv
load gb_stacking-layer1_cv
load lgbm_stacking-layer1_cv
load mlp_stacking-layer1_cv
load rf_stacking-layer1_cv
load xgb_stacking-layer1_cv
load ada_stacking-layer1_cv
load xgb_all_cv
load lgbm_all_cv
shape : (54879, 150), (19617, 150)


In [9]:
all_oof = np.concatenate([all_oof, tmp_oof], axis=1)
all_test = np.concatenate([all_test, tmp_test], axis=1)
print(f'shape : {all_oof.shape}, {all_test.shape}')

shape : (54879, 175), (19617, 175)


In [10]:
y = pd.read_csv(feature_target_file, index_col=0, usecols=['index',target_col]).values.flatten()
y.shape

(54879,)

# 스태킹

- 각 oof 마다 fold별로 logloos 변동이 있으므로 최대한 정보를 뽑아내고자 스태킹을 함.

In [11]:
# lgbm
lgbm_params ={
 'bagging_fraction': 0.9859748005148201,
 'bagging_freq': 5,
 'boosting': 'gbdt',
 'class_weight': None,
 'feature_fraction': 0.6159082178130053,
 'lambda_l1': 0.11292818971664109,
 'lambda_l2': 0,
 'learning_rate': 0.054296605938283865,
 'max_depth': 6,
 'metric': 'multi_logloss',
 'min_child_weight': 2.316612849473689e-07,
 'min_data_in_leaf': 327,
 'n_estimators': 1000,
 'n_jobs': -1,
 'num_class': 5,
 'num_leaves': 93,
 'objective': 'multiclass',
 'random_state': 2020,
 'subsample_for_bin': 60000,
 'verbosity': 0
}

 # bagging_fraction 파라미터가 설정되어 있지만, 실제로는 사용하지 않았음...
 # bagging_freq = 1 로 설정해야지 사용이 가능한데...
 # bagging_fraction을 사용하지 않고, 93.835 라는 결과를 낸 것임.
 # hyperopt를 다시 돌려서 튜닝을 해야 할듯...
 # 그리고 그때 나온 결과가 어떻게 되냐에 따라서 어떤 파라미터를 사용할지 결정을 해야 할듯..
 

In [12]:
datasets = [(all_oof, all_test, y)]

mlogloss = []

lgbm_oof_preds = []
lgbm_test_preds = []

cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)

for number, (X, test , y) in enumerate(datasets, 1):
    print(f'start : {number}')
    
    lgbm_oof_pred = np.zeros((X.shape[0], n_class))
    lgbm_test_pred = np.zeros((test.shape[0], n_class))
    
    for i, (i_trn, i_val) in enumerate(cv.split(X, y), 1):
        print(f'training model for CV #{i}')
        
        X_train , X_val = X[i_trn], X[i_val]
        y_train, y_val = y[i_trn], y[i_val]
        
        dtrain = lgbm.Dataset(X_train, label=y_train)
        dval = lgbm.Dataset(X_val, label=y_val)
        watchlist = [(dtrain, 'train'), (dval, 'val')]
        
        clf = lgbm.train(params=lgbm_params, train_set=dtrain, num_boost_round=5000,
                         valid_sets=[dtrain,dval], early_stopping_rounds=50, verbose_eval=5000)
        
        lgbm_oof_pred[i_val, :] = clf.predict(X_val)
        lgbm_test_pred += clf.predict(test) / n_fold
        mlogloss.append(clf.best_score['valid_1']['multi_logloss'])
    lgbm_oof_preds.append(lgbm_oof_pred)
    lgbm_test_preds.append(lgbm_test_pred)
    
    print(f'end : {number}')

start : 1
training model for CV #1
You can set `force_col_wise=true` to remove the overhead.
Training until validation scores don't improve for 50 rounds










Early stopping, best iteration is:
[95]	training's multi_logloss: 0.327987	valid_1's multi_logloss: 0.400095


training model for CV #2
You can set `force_col_wise=true` to remove the overhead.
Training until validation scores don't improve for 50 rounds












Early stopping, best iteration is:
[94]	training's multi_logloss: 0.328122	valid_1's multi_logloss: 0.402999
training model for CV #3
You can set `force_col_wise=true` to remove the overhead.
Training until validation scores don't improve for 50 rounds












Early stopping, best iteration is:
[105]	training's multi_logloss: 0.32543	valid_1's multi_logloss: 0.38388
training model for CV #4
You can set `force_col_wise=true` to remove the overhead.
Training until validation scores don't improve for 50 rounds














Early stopping, best iteration is:
[107]	training's multi_logloss: 0.318629	valid_1's multi_logloss: 0.401723
training model for CV #5
You can set `force_col_wise=true` to remove the overhead.
Training until validation scores don't improve for 50 rounds










Early stopping, best iteration is:
[87]	training's multi_logloss: 0.331827	valid_1's multi_logloss: 0.400826
end : 1


In [13]:
for i,j in enumerate(lgbm_oof_preds,1):
    print(f'logloss = {log_loss(pd.get_dummies(y),j):8.4f}')
    print(f'accuracy = {accuracy_score(y, np.argmax(j,axis=1))*100:8.4f}')
print('mean logloss = ',np.mean(mlogloss))

logloss =   0.3979
accuracy =  85.7031
mean logloss =  0.397904542629652


# 제출 파일 및 기타 파일 생성

In [14]:
# submission 파일 생성
sub = pd.read_csv(sample_file,index_col=0)

for filename, test_pred in zip(stacking_submission_files, lgbm_test_preds):
    sub[sub.columns] = test_pred
    sub.to_csv(filename)

In [15]:
# stacking_oof_pred 파일 생성

for filename, oof_pred in zip(stacking_oof_pred_files, lgbm_oof_preds):
    np.savetxt(filename, oof_pred, fmt='%.18f', delimiter=',')

In [16]:
# stacking_test_pred 파일 생성

for filename, test_pred in zip(stacking_test_pred_files, lgbm_test_preds):
    np.savetxt(filename, test_pred, fmt='%.18f', delimiter=',')