# 라이브러리 로드

In [1]:
# IPython magics: (re)load the autoreload extension in mode 2 so edited
# modules are re-imported before each cell runs, and render matplotlib
# figures inline in the notebook.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd

import warnings
from pathlib import Path

from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import StratifiedKFold

import tensorflow as tf
from tensorflow.keras import Input, Model, Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalMaxPooling1D, Conv1D, Dropout, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import plot_model, to_categorical
from tensorflow.keras.optimizers import Adam

In [3]:
# Notebook display settings: show up to 100 columns and 4 decimal places.
# The option name is fully qualified on purpose: the abbreviated pattern
# 'max_columns' is ambiguous on pandas versions that also define
# 'styler.render.max_columns', where set_option raises OptionError.
pd.set_option('display.max_columns', 100)
pd.set_option('display.precision', 4)

# Suppress all warnings to keep the cell output readable.
warnings.filterwarnings('ignore')

# 학습데이터 로드

In [4]:
# Input data location and the build sub-directories used by this notebook
# (all relative to the notebook's working directory).
build_dir = Path('../build')
data_dir = Path('../data/dacon-novel-author-classification')
feature_dir = build_dir / 'feature'  # feature/target files
val_dir = build_dir / 'val'          # out-of-fold (validation) predictions
tst_dir = build_dir / 'tst'          # test-set predictions
sub_dir = build_dir / 'sub'          # submission files

# Competition files.
trn_file = data_dir.joinpath('train.csv')
tst_file = data_dir.joinpath('test_x.csv')
sample_file = data_dir.joinpath('sample_submission.csv')

# Modelling constants: label column, number of CV folds, number of target
# classes and the random seed used for the fold split.
target_col = 'author'
n_fold, n_class = 5, 5
seed = 2020

In [5]:
# Name of the stacking learner; it prefixes every output file name.
algorithm_name = 'mlp'

# One entry per stacked feature set. The order matters: the output file lists
# built below are later zipped with the per-dataset predictions.
feature_names = [
    'stacking-layer1-stopwords-yes-nn',
    'stacking-layer1-stopwords-no-nn',
    'stacking-layer1-stopwords-yes-ml',
    'stacking-layer1-stopwords-no-ml',
]

# File holding the training target column.
feature_target_file = feature_dir / 'feature_target.csv'

# '<algorithm>_<feature set>' identifiers, one per feature set.
model_names = [f'{algorithm_name}_{feature_name}' for feature_name in feature_names]

# Output paths, one per model: OOF predictions, test predictions, submission.
stacking_oof_pred_files = [val_dir / f'{model_name}_oof_pred.csv' for model_name in model_names]
stacking_test_pred_files = [tst_dir / f'{model_name}_test_pred.csv' for model_name in model_names]
stacking_submission_files = [sub_dir / f'{model_name}_submission.csv' for model_name in model_names]

# Stacking feature 생성

In [6]:
# Version ranges (start inclusive, stop exclusive) to load per feature family
# for the family-aware kinds. A model belongs to the first family whose name
# occurs as a substring of the model name, checked in the order listed.
_VERSION_RANGES = {
    # kind 2: versions produced with stop-word processing
    2: (('feature', (2, 5)), ('tfidf', (1, 4)), ('hashing', (1, 4)), ('bow', (1, 4))),
    # kind 3: versions produced without stop-word processing
    3: (('feature', (1, 2)), ('tfidf', (4, 7)), ('hashing', (4, 7)), ('bow', (4, 7))),
    # kind 4: every version
    4: (('feature', (1, 5)), ('tfidf', (1, 7)), ('hashing', (1, 7)), ('bow', (1, 7))),
}


def _load_versions(model, versions, oof_list, test_list):
    """Append the OOF and test prediction arrays of `model` for each version."""
    for ver in versions:
        oof_list.append(np.loadtxt(val_dir / f'{model}_oof_pred_ver{ver}.csv', delimiter=','))
        test_list.append(np.loadtxt(tst_dir / f'{model}_test_pred_ver{ver}.csv', delimiter=','))


def load_feature(model_names, number_of_ver=None, kind=None):
    """Load saved level-1 predictions to use as stacking features.

    For every selected (model, version) pair this reads
    ``val_dir / '<model>_oof_pred_ver<i>.csv'`` and
    ``tst_dir / '<model>_test_pred_ver<i>.csv'`` with ``np.loadtxt``.

    Args:
        model_names: iterable of model name prefixes, e.g. ``'cnn_tfidf'``.
        number_of_ver: number of versions to load per model; only used when
            ``kind`` is 0 or 1 (versions ``1..number_of_ver``). It must still
            be non-None for the other kinds, where its value is ignored.
        kind: selects which versions are loaded.
            0, 1 -- versions ``1..number_of_ver`` of every model.
            2    -- versions with stop-word processing (per-family ranges).
            3    -- versions without stop-word processing (per-family ranges).
            4    -- all versions of every family.
            For kinds 2-4 a model whose name contains none of 'feature',
            'tfidf', 'hashing', 'bow' is skipped after printing 'not found'.

    Returns:
        ``(oof_list, test_list)``: two parallel lists of arrays in load order.
        Both lists are empty when ``kind`` is not one of 0-4.
        ``None`` (after printing 'error') when ``number_of_ver`` or ``kind``
        is None; callers that unpack the result must not rely on that path.

    Raises:
        Whatever ``np.loadtxt`` raises when an expected file is missing or
        malformed.
    """
    if number_of_ver is None or kind is None:
        print('error')
        return None

    oof_list = []
    test_list = []

    if kind in (0, 1):
        # Kinds 0 and 1 load the same file pattern: a fixed number of
        # versions for every model.
        for model in model_names:
            print(f'load {model}_cv')
            _load_versions(model, range(1, number_of_ver + 1), oof_list, test_list)
    elif kind in _VERSION_RANGES:
        for model in model_names:
            print(f'load {model}_cv')
            for family, (start, stop) in _VERSION_RANGES[kind]:
                if family in model:
                    _load_versions(model, range(start, stop), oof_list, test_list)
                    break
            else:
                # No known feature family in the model name: nothing loaded.
                print('not found')

    return oof_list, test_list

In [7]:
# Level-1 models whose saved predictions become stacking features, split
# into neural-network based and classical machine-learning based groups.
nn_model_names = ['cnn_feature', 'lstm_feature', 'mlp_feature', 'transformer_feature', 'transformer_v2_feature',
                  'cnn_tfidf', 'cnn_hashing', 'cnn_bow']

ml_model_names = ['mlp_tfidf', 'mlp_hashing', 'mlp_bow', 'lr_tfidf', 'lr_hashing', 'lr_bow']


trash = -1  # placeholder: load_feature ignores number_of_ver for kind 2 and 3
stopwords_yes_kind = 2  # load the versions built with stop-word processing
stopwords_no_kind = 3  # load the versions built without stop-word processing


# stopwords-yes-nn: concatenate every loaded prediction array column-wise
nn_yes_oof, nn_yes_test = load_feature(nn_model_names, trash, stopwords_yes_kind)
nn_yes_oof = np.concatenate(nn_yes_oof, axis=1)
nn_yes_test = np.concatenate(nn_yes_test, axis=1)
print(f'nn_yes shape : {nn_yes_oof.shape}, {nn_yes_test.shape}')

# stopwords-no-nn
nn_no_oof, nn_no_test = load_feature(nn_model_names, trash, stopwords_no_kind)
nn_no_oof = np.concatenate(nn_no_oof, axis=1)
nn_no_test = np.concatenate(nn_no_test, axis=1)
# Label fixed: this line reports the nn_no arrays (was printed as 'nn_yes').
print(f'nn_no shape : {nn_no_oof.shape}, {nn_no_test.shape}')


# stopwords-yes-ml
ml_yes_oof, ml_yes_test = load_feature(ml_model_names, trash, stopwords_yes_kind)
ml_yes_oof = np.concatenate(ml_yes_oof, axis=1)
ml_yes_test = np.concatenate(ml_yes_test, axis=1)
# Label fixed: this line reports the ml_yes arrays (was printed as 'nn_yes').
print(f'ml_yes shape : {ml_yes_oof.shape}, {ml_yes_test.shape}')


# stopwords-no-ml
ml_no_oof, ml_no_test = load_feature(ml_model_names, trash, stopwords_no_kind)
ml_no_oof = np.concatenate(ml_no_oof, axis=1)
ml_no_test = np.concatenate(ml_no_test, axis=1)
# Label fixed: this line reports the ml_no arrays (was printed as 'nn_yes').
print(f'ml_no shape : {ml_no_oof.shape}, {ml_no_test.shape}')

load cnn_feature_cv
load lstm_feature_cv
load mlp_feature_cv
load transformer_feature_cv
load transformer_v2_feature_cv
load cnn_tfidf_cv
load cnn_hashing_cv
load cnn_bow_cv
nn_yes shape : (54879, 120), (19617, 120)
load cnn_feature_cv
load lstm_feature_cv
load mlp_feature_cv
load transformer_feature_cv
load transformer_v2_feature_cv
load cnn_tfidf_cv
load cnn_hashing_cv
load cnn_bow_cv
nn_yes shape : (54879, 70), (19617, 70)
load mlp_tfidf_cv
load mlp_hashing_cv
load mlp_bow_cv
load lr_tfidf_cv
load lr_hashing_cv
load lr_bow_cv
nn_yes shape : (54879, 90), (19617, 90)
load mlp_tfidf_cv
load mlp_hashing_cv
load mlp_bow_cv
load lr_tfidf_cv
load lr_hashing_cv
load lr_bow_cv
nn_yes shape : (54879, 90), (19617, 90)


In [8]:
# Read only the 'index' and target columns from the target file; the first
# of the selected columns becomes the row index, and the remaining values
# are flattened to a 1-D label array.
target_frame = pd.read_csv(
    feature_target_file,
    usecols=['index', target_col],
    index_col=0,
)
y = target_frame.values.flatten()
y.shape

(54879,)

# 스태킹

- 각 oof 마다 fold별로 logloss 변동이 있으므로 최대한 정보를 뽑아내고자 스태킹을 함.

In [9]:
# Stratified K-fold splitter: n_fold shuffled splits with a fixed random
# seed so the fold assignment is reproducible.
cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)

In [10]:
def get_model(number):
    """Build and compile the MLP used as the stacking model.

    The network is Dense(256, relu) -> Dropout(0.1) -> Dense(256, relu)
    -> Dense(n_class, softmax), compiled with categorical cross-entropy
    loss and the Adam optimizer.

    Args:
        number: number of input features (width of one input row).

    Returns:
        The compiled Keras ``Model``.
    """
    feature_input = Input(shape=(number,))

    hidden = Dense(256, activation='relu')(feature_input)
    hidden = Dropout(0.1)(hidden)
    hidden = Dense(256, activation='relu')(hidden)

    class_probs = Dense(n_class, activation='softmax')(hidden)

    mlp = Model(inputs=feature_input, outputs=class_probs)
    mlp.compile(optimizer='adam', loss='categorical_crossentropy')
    return mlp

In [12]:
# (train features, test features, labels) for each stacked feature set, in
# the same order as `feature_names`, so the resulting prediction lists line
# up with the output file lists when they are zipped together later.
datasets = [(nn_yes_oof, nn_yes_test, y),
            (nn_no_oof, nn_no_test, y),
            (ml_yes_oof, ml_yes_test, y),
            (ml_no_oof, ml_no_test, y)]

# Per-fold validation log loss, accumulated over every dataset and fold.
mlogloss = []

# One OOF / test prediction matrix per dataset.
mlp_oof_preds = []
mlp_test_preds = []

cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)

for number, (X, test, y) in enumerate(datasets, 1):
    print(f'start : {number}')

    mlp_oof_pred = np.zeros((X.shape[0], n_class))
    mlp_test_pred = np.zeros((test.shape[0], n_class))

    for i, (i_trn, i_val) in enumerate(cv.split(X, y), 1):
        print(f'training model for CV #{i}')

        # A fresh model is built for every fold of every dataset. Keras keeps
        # global state for each model it creates, so release the state left
        # by the previous fold to stop memory from growing across the loop.
        tf.keras.backend.clear_session()

        es = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=3,
                           verbose=1, mode='min', baseline=None, restore_best_weights=True)

        clf = get_model(X.shape[1])
        # num_classes is passed explicitly so the one-hot width always equals
        # the n_class-wide softmax output, instead of being inferred from the
        # largest label that happens to be present in the fold.
        clf.fit(X[i_trn],
                to_categorical(y[i_trn], num_classes=n_class),
                validation_data=(X[i_val], to_categorical(y[i_val], num_classes=n_class)),
                epochs=100,
                batch_size=512,
                callbacks=[es])

        # Out-of-fold predictions fill the validation rows; test predictions
        # are averaged over the folds.
        mlp_oof_pred[i_val, :] = clf.predict(X[i_val])
        mlp_test_pred += clf.predict(test) / n_fold
        mlogloss.append(log_loss(y[i_val], mlp_oof_pred[i_val]))
    mlp_oof_preds.append(mlp_oof_pred)
    mlp_test_preds.append(mlp_test_pred)

    print(f'end : {number}')

start : 1
training model for CV #1
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 00010: early stopping
training model for CV #2
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 00009: early stopping
training model for CV #3
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 00010: early stopping
training model for CV #4
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 00009: early stopping
training model for CV #5
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 00008: early stopping
end : 1
start : 2
training model for CV #1
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 00009: early s

Epoch 6/100
Epoch 7/100
Epoch 00007: early stopping
training model for CV #4
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 00011: early stopping
training model for CV #5
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 00013: early stopping
end : 2
start : 3
training model for CV #1
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 00012: early stopping
training model for CV #2
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 00013: early stopping
training model for CV #3
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/

Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 00013: early stopping
training model for CV #5
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 00007: early stopping
end : 3
start : 4
training model for CV #1
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 00016: early stopping
training model for CV #2
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 00009: early stopping
training model for CV #3
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 00009: early stopping
training model for CV #4
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 00006: early stopping
training model for CV #5
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 

In [13]:
# Print the overall out-of-fold log loss and accuracy (in percent) of each
# stacking model, then the mean of the per-fold log losses collected during
# training across all models.
for oof_pred in mlp_oof_preds:
    oof_logloss = log_loss(pd.get_dummies(y), oof_pred)
    print(f'logloss = {oof_logloss:8.4f}')
    oof_accuracy = accuracy_score(y, np.argmax(oof_pred, axis=1)) * 100
    print(f'accuracy = {oof_accuracy:8.4f}')
print('mean logloss = ', np.mean(mlogloss))

logloss =   0.5196
accuracy =  81.2278
logloss =   0.5388
accuracy =  80.5481
logloss =   0.5976
accuracy =  78.0408
logloss =   0.5120
accuracy =  81.5631
mean logloss =  0.5420083747910155


# 제출 파일 및 기타 파일 생성

In [14]:
# Write one submission file per stacking model, using the sample submission
# as the template for the index and column layout.
sub = pd.read_csv(sample_file, index_col=0)

for filename, test_pred in zip(stacking_submission_files, mlp_test_preds):
    # Create the output directory if needed; to_csv does not create it.
    filename.parent.mkdir(parents=True, exist_ok=True)
    # Overwrite every column of the template with this model's predictions
    # (assumes the template has one column per predicted class).
    sub[sub.columns] = test_pred
    sub.to_csv(filename)

In [15]:
# Save each stacking model's out-of-fold predictions as a comma-separated
# text file with 18 decimal places.
for oof_path, oof_values in zip(stacking_oof_pred_files, mlp_oof_preds):
    np.savetxt(oof_path, oof_values, delimiter=',', fmt='%.18f')

In [16]:
# Save each stacking model's fold-averaged test predictions as a
# comma-separated text file with 18 decimal places.
for test_path, test_values in zip(stacking_test_pred_files, mlp_test_preds):
    np.savetxt(test_path, test_values, delimiter=',', fmt='%.18f')