## 라이브러리 import 및 설정

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import gc
from tensorflow.keras.backend import clear_session

import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import HashingVectorizer

from nltk.corpus import stopwords

from pathlib import Path

import tensorflow as tf
from tensorflow.keras import Input, Model, Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalMaxPooling1D, Conv1D, Dropout, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import plot_model, to_categorical
from tensorflow.keras.optimizers import Adam

import warnings

In [3]:
# Notebook-wide display settings.
# Use the fully-qualified option name: the bare 'max_columns' pattern is
# ambiguous on newer pandas (it also matches 'styler.render.max_columns')
# and raises OptionError there.
pd.set_option('display.max_columns', 100)
pd.set_option("display.precision", 4)
# Silence all warnings for the rest of the notebook.
warnings.simplefilter('ignore')

## 학습데이터 로드

In [4]:
# Directories: raw competition data and the build outputs
# (features, validation predictions, test predictions, submissions).
data_dir = Path('../data/dacon-novel-author-classification')
feature_dir = Path('../build/feature')
val_dir = Path('../build/val')
tst_dir = Path('../build/tst')
sub_dir = Path('../build/sub')

# Input files inside the data directory.
trn_file = data_dir.joinpath('train.csv')
tst_file = data_dir.joinpath('test_x.csv')
sample_file = data_dir.joinpath('sample_submission.csv')

# Experiment constants: label column, CV folds, number of classes, RNG seed.
target_col = 'author'
n_fold, n_class = 5, 5
seed = 2020

In [5]:
# Every artefact written by this notebook is named <algo>_<feature>.
algo_name = 'cnn'
feature_name = 'hashing'
model_name = f'{algo_name}_{feature_name}'

feature_file = feature_dir.joinpath(f'{feature_name}.csv')

# Out-of-fold (validation) prediction files, one per feature version.
p_val_ver1_file = val_dir.joinpath(f'{model_name}_oof_pred_ver1.csv')
p_val_ver2_file = val_dir.joinpath(f'{model_name}_oof_pred_ver2.csv')
p_val_ver3_file = val_dir.joinpath(f'{model_name}_oof_pred_ver3.csv')
p_val_ver4_file = val_dir.joinpath(f'{model_name}_oof_pred_ver4.csv')
p_val_ver5_file = val_dir.joinpath(f'{model_name}_oof_pred_ver5.csv')
p_val_ver6_file = val_dir.joinpath(f'{model_name}_oof_pred_ver6.csv')

# Test prediction files, one per feature version.
p_tst_ver1_file = tst_dir.joinpath(f'{model_name}_test_pred_ver1.csv')
p_tst_ver2_file = tst_dir.joinpath(f'{model_name}_test_pred_ver2.csv')
p_tst_ver3_file = tst_dir.joinpath(f'{model_name}_test_pred_ver3.csv')
p_tst_ver4_file = tst_dir.joinpath(f'{model_name}_test_pred_ver4.csv')
p_tst_ver5_file = tst_dir.joinpath(f'{model_name}_test_pred_ver5.csv')
p_tst_ver6_file = tst_dir.joinpath(f'{model_name}_test_pred_ver6.csv')

# Submission files, one per feature version.
sub_ver1_file = sub_dir.joinpath(f'{model_name}_ver1.csv')
sub_ver2_file = sub_dir.joinpath(f'{model_name}_ver2.csv')
sub_ver3_file = sub_dir.joinpath(f'{model_name}_ver3.csv')
sub_ver4_file = sub_dir.joinpath(f'{model_name}_ver4.csv')
sub_ver5_file = sub_dir.joinpath(f'{model_name}_ver5.csv')
sub_ver6_file = sub_dir.joinpath(f'{model_name}_ver6.csv')

In [6]:
# Load the training set, using the first CSV column as the row index,
# then show its shape and first rows.
trn = pd.read_csv(trn_file, index_col=0)
print(trn.shape)
trn.head()

(54879, 2)


Unnamed: 0_level_0,text,author
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"He was almost choking. There was so much, so m...",3
1,"“Your sister asked for it, I suppose?”",2
2,"She was engaged one day as she walked, in per...",1
3,"The captain was in the porch, keeping himself ...",4
4,"“Have mercy, gentlemen!” odin flung up his han...",3


In [7]:
# Load the test set, using the first CSV column as the row index,
# then show its shape and first rows.
tst = pd.read_csv(tst_file, index_col=0)
print(tst.shape)
tst.head()

(19617, 1)


Unnamed: 0_level_0,text
index,Unnamed: 1_level_1
0,“Not at all. I think she is one of the most ch...
1,"""No,"" replied he, with sudden consciousness, ""..."
2,As the lady had stated her intention of scream...
3,“And then suddenly in the silence I heard a so...
4,His conviction remained unchanged. So far as I...


## 단어 토큰화 비교, 어간 추출과 표제어 추출 비교

In [8]:
# NLTK word tokenizer.
# -> With ASCII apostrophes, splits "Don't" into "Do" + "n't" and "Jone's" into "Jone" + "'s".
from nltk.tokenize import word_tokenize

# NLTK punctuation-based word tokenizer.
# -> Splits "Don't" into "Don" + "'" + "t" and "Jone's" into "Jone" + "'" + "s".
from nltk.tokenize import WordPunctTokenizer

# Keras word tokenizer.
# -> Lower-cases every letter and strips punctuation such as periods, commas and exclamation marks.
# -> Apostrophes inside words such as "don't" or "jone's" are preserved.
from tensorflow.keras.preprocessing.text import text_to_word_sequence


# NLTK stemming and lemmatization.

# -> WordNetLemmatizer returns the dictionary base form (lemma); it is slower and more complex.
# -> Without a part-of-speech (POS) hint it may fail to return the proper lemma.
# -> Getting the correct lemma requires knowing how the word is used.
from nltk.stem import WordNetLemmatizer

# -> Porter stemmer: applies a set of representative grammar rules to extract the stem, i.e. the core part that carries the word's meaning.
# -> Removes English suffixes so that only the stem remains.
from nltk.stem import PorterStemmer

# -> LancasterStemmer is similar to the Porter stemmer but uses a different algorithm.
from nltk.stem import LancasterStemmer

# -> An improved version of the Porter stemmer (often called "Porter2").
from nltk.stem.snowball import SnowballStemmer

### nltk의 word_tokenize를 통해서 토큰화를 진행 후, 어간 추출 및 표제어 추출

In [9]:
# Sample sentence for the word_tokenize experiments: the training text at index 4.
s_1 = trn.text[4]
print(s_1)

“Have mercy, gentlemen!” odin flung up his hands. “Don’t write that, anyway; have some shame. Here I’ve torn my heart asunder before you, and you seize the opportunity and are fingering the wounds in both halves.... Oh, my God!”


In [10]:
# Tokenize the sample sentence with NLTK's word_tokenize.
tokenized_word_1 = word_tokenize(s_1)
print(tokenized_word_1)

['“', 'Have', 'mercy', ',', 'gentlemen', '!', '”', 'odin', 'flung', 'up', 'his', 'hands', '.', '“', 'Don', '’', 't', 'write', 'that', ',', 'anyway', ';', 'have', 'some', 'shame', '.', 'Here', 'I', '’', 've', 'torn', 'my', 'heart', 'asunder', 'before', 'you', ',', 'and', 'you', 'seize', 'the', 'opportunity', 'and', 'are', 'fingering', 'the', 'wounds', 'in', 'both', 'halves', '....', 'Oh', ',', 'my', 'God', '!', '”']


In [11]:
# Lemmatize every word_tokenize token (no part-of-speech argument is passed).
lemmatizer = WordNetLemmatizer()
tokenized_lemmatizer_word_1 = list(map(lemmatizer.lemmatize, tokenized_word_1))
print(tokenized_lemmatizer_word_1)

['“', 'Have', 'mercy', ',', 'gentleman', '!', '”', 'odin', 'flung', 'up', 'his', 'hand', '.', '“', 'Don', '’', 't', 'write', 'that', ',', 'anyway', ';', 'have', 'some', 'shame', '.', 'Here', 'I', '’', 've', 'torn', 'my', 'heart', 'asunder', 'before', 'you', ',', 'and', 'you', 'seize', 'the', 'opportunity', 'and', 'are', 'fingering', 'the', 'wound', 'in', 'both', 'half', '....', 'Oh', ',', 'my', 'God', '!', '”']


In [12]:
# Porter-stem every word_tokenize token.
porterStemmer = PorterStemmer()
tokenized_porter_word_1 = list(map(porterStemmer.stem, tokenized_word_1))
print(tokenized_porter_word_1)

['“', 'have', 'merci', ',', 'gentlemen', '!', '”', 'odin', 'flung', 'up', 'hi', 'hand', '.', '“', 'don', '’', 't', 'write', 'that', ',', 'anyway', ';', 'have', 'some', 'shame', '.', 'here', 'I', '’', 've', 'torn', 'my', 'heart', 'asund', 'befor', 'you', ',', 'and', 'you', 'seiz', 'the', 'opportun', 'and', 'are', 'finger', 'the', 'wound', 'in', 'both', 'halv', '....', 'Oh', ',', 'my', 'god', '!', '”']


In [13]:
# Lancaster-stem every word_tokenize token.
lancasterStemmer = LancasterStemmer()
tokenized_lancaster_word_1 = list(map(lancasterStemmer.stem, tokenized_word_1))
print(tokenized_lancaster_word_1)

['“', 'hav', 'mercy', ',', 'gentlem', '!', '”', 'odin', 'flung', 'up', 'his', 'hand', '.', '“', 'don', '’', 't', 'writ', 'that', ',', 'anyway', ';', 'hav', 'som', 'sham', '.', 'her', 'i', '’', 've', 'torn', 'my', 'heart', 'asund', 'bef', 'you', ',', 'and', 'you', 'seiz', 'the', 'opportun', 'and', 'ar', 'fing', 'the', 'wound', 'in', 'both', 'halv', '....', 'oh', ',', 'my', 'god', '!', '”']


In [14]:
# Snowball-stem (English) every word_tokenize token.
snowballStemmer = SnowballStemmer("english")
tokenized_snowball_word_1 = list(map(snowballStemmer.stem, tokenized_word_1))
print(tokenized_snowball_word_1)

['“', 'have', 'merci', ',', 'gentlemen', '!', '”', 'odin', 'flung', 'up', 'his', 'hand', '.', '“', 'don', '’', 't', 'write', 'that', ',', 'anyway', ';', 'have', 'some', 'shame', '.', 'here', 'i', '’', 've', 'torn', 'my', 'heart', 'asund', 'befor', 'you', ',', 'and', 'you', 'seiz', 'the', 'opportun', 'and', 'are', 'finger', 'the', 'wound', 'in', 'both', 'halv', '....', 'oh', ',', 'my', 'god', '!', '”']


### nltk의 WordPunctTokenizer를 통해서 토큰화를 진행 후, 어간 추출 및 표제어 추출

In [15]:
# Sample sentence for the WordPunctTokenizer experiments: the training text at index 4.
s_2 = trn.text[4]
print(s_2)

“Have mercy, gentlemen!” odin flung up his hands. “Don’t write that, anyway; have some shame. Here I’ve torn my heart asunder before you, and you seize the opportunity and are fingering the wounds in both halves.... Oh, my God!”


In [16]:
# Tokenize the sample sentence with NLTK's WordPunctTokenizer.
# The tokenizer instance is reused later when building the hashing features.
wordPunctTokenizer = WordPunctTokenizer()
tokenized_word_2 = wordPunctTokenizer.tokenize(s_2)
print(tokenized_word_2)

['“', 'Have', 'mercy', ',', 'gentlemen', '!”', 'odin', 'flung', 'up', 'his', 'hands', '.', '“', 'Don', '’', 't', 'write', 'that', ',', 'anyway', ';', 'have', 'some', 'shame', '.', 'Here', 'I', '’', 've', 'torn', 'my', 'heart', 'asunder', 'before', 'you', ',', 'and', 'you', 'seize', 'the', 'opportunity', 'and', 'are', 'fingering', 'the', 'wounds', 'in', 'both', 'halves', '....', 'Oh', ',', 'my', 'God', '!”']


In [17]:
# Lemmatize every WordPunctTokenizer token (no part-of-speech argument is passed).
lemmatizer = WordNetLemmatizer()
tokenized_lemmatizer_word_2 = list(map(lemmatizer.lemmatize, tokenized_word_2))
print(tokenized_lemmatizer_word_2)

['“', 'Have', 'mercy', ',', 'gentleman', '!”', 'odin', 'flung', 'up', 'his', 'hand', '.', '“', 'Don', '’', 't', 'write', 'that', ',', 'anyway', ';', 'have', 'some', 'shame', '.', 'Here', 'I', '’', 've', 'torn', 'my', 'heart', 'asunder', 'before', 'you', ',', 'and', 'you', 'seize', 'the', 'opportunity', 'and', 'are', 'fingering', 'the', 'wound', 'in', 'both', 'half', '....', 'Oh', ',', 'my', 'God', '!”']


In [18]:
# Porter-stem every WordPunctTokenizer token.
porterStemmer = PorterStemmer()
tokenized_porter_word_2 = list(map(porterStemmer.stem, tokenized_word_2))
print(tokenized_porter_word_2)

['“', 'have', 'merci', ',', 'gentlemen', '!”', 'odin', 'flung', 'up', 'hi', 'hand', '.', '“', 'don', '’', 't', 'write', 'that', ',', 'anyway', ';', 'have', 'some', 'shame', '.', 'here', 'I', '’', 've', 'torn', 'my', 'heart', 'asund', 'befor', 'you', ',', 'and', 'you', 'seiz', 'the', 'opportun', 'and', 'are', 'finger', 'the', 'wound', 'in', 'both', 'halv', '....', 'Oh', ',', 'my', 'god', '!”']


In [19]:
# Lancaster-stem every WordPunctTokenizer token.
lancasterStemmer = LancasterStemmer()
tokenized_lancaster_word_2 = list(map(lancasterStemmer.stem, tokenized_word_2))
print(tokenized_lancaster_word_2)

['“', 'hav', 'mercy', ',', 'gentlem', '!”', 'odin', 'flung', 'up', 'his', 'hand', '.', '“', 'don', '’', 't', 'writ', 'that', ',', 'anyway', ';', 'hav', 'som', 'sham', '.', 'her', 'i', '’', 've', 'torn', 'my', 'heart', 'asund', 'bef', 'you', ',', 'and', 'you', 'seiz', 'the', 'opportun', 'and', 'ar', 'fing', 'the', 'wound', 'in', 'both', 'halv', '....', 'oh', ',', 'my', 'god', '!”']


In [20]:
# Snowball-stem (English) every WordPunctTokenizer token.
snowballStemmer = SnowballStemmer("english")
tokenized_snowball_word_2 = list(map(snowballStemmer.stem, tokenized_word_2))
print(tokenized_snowball_word_2)

['“', 'have', 'merci', ',', 'gentlemen', '!”', 'odin', 'flung', 'up', 'his', 'hand', '.', '“', 'don', '’', 't', 'write', 'that', ',', 'anyway', ';', 'have', 'some', 'shame', '.', 'here', 'i', '’', 've', 'torn', 'my', 'heart', 'asund', 'befor', 'you', ',', 'and', 'you', 'seiz', 'the', 'opportun', 'and', 'are', 'finger', 'the', 'wound', 'in', 'both', 'halv', '....', 'oh', ',', 'my', 'god', '!”']


### keras의 text_to_word_sequence를 통해서 토큰화를 진행 후, 어간 추출 및 표제어 추출

In [21]:
# Sample sentence for the Keras text_to_word_sequence experiments: the training text at index 4.
s_3 = trn.text[4]
print(s_3)

“Have mercy, gentlemen!” odin flung up his hands. “Don’t write that, anyway; have some shame. Here I’ve torn my heart asunder before you, and you seize the opportunity and are fingering the wounds in both halves.... Oh, my God!”


In [22]:
# Tokenize the sample sentence with Keras' text_to_word_sequence.
tokenized_word_3 = text_to_word_sequence(s_3)
print(tokenized_word_3)

['“have', 'mercy', 'gentlemen', '”', 'odin', 'flung', 'up', 'his', 'hands', '“don’t', 'write', 'that', 'anyway', 'have', 'some', 'shame', 'here', 'i’ve', 'torn', 'my', 'heart', 'asunder', 'before', 'you', 'and', 'you', 'seize', 'the', 'opportunity', 'and', 'are', 'fingering', 'the', 'wounds', 'in', 'both', 'halves', 'oh', 'my', 'god', '”']


In [23]:
# Lemmatize every text_to_word_sequence token (no part-of-speech argument is passed).
lemmatizer = WordNetLemmatizer()
tokenized_lemmatizer_word_3 = list(map(lemmatizer.lemmatize, tokenized_word_3))
print(tokenized_lemmatizer_word_3)

['“have', 'mercy', 'gentleman', '”', 'odin', 'flung', 'up', 'his', 'hand', '“don’t', 'write', 'that', 'anyway', 'have', 'some', 'shame', 'here', 'i’ve', 'torn', 'my', 'heart', 'asunder', 'before', 'you', 'and', 'you', 'seize', 'the', 'opportunity', 'and', 'are', 'fingering', 'the', 'wound', 'in', 'both', 'half', 'oh', 'my', 'god', '”']


In [24]:
# Porter-stem every text_to_word_sequence token.
porterStemmer = PorterStemmer()
tokenized_porter_word_3 = list(map(porterStemmer.stem, tokenized_word_3))
print(tokenized_porter_word_3)

['“have', 'merci', 'gentlemen', '”', 'odin', 'flung', 'up', 'hi', 'hand', '“don’t', 'write', 'that', 'anyway', 'have', 'some', 'shame', 'here', 'i’v', 'torn', 'my', 'heart', 'asund', 'befor', 'you', 'and', 'you', 'seiz', 'the', 'opportun', 'and', 'are', 'finger', 'the', 'wound', 'in', 'both', 'halv', 'oh', 'my', 'god', '”']


In [25]:
# Lancaster-stem every text_to_word_sequence token.
lancasterStemmer = LancasterStemmer()
tokenized_lancaster_word_3 = list(map(lancasterStemmer.stem, tokenized_word_3))
print(tokenized_lancaster_word_3)

['“have', 'mercy', 'gentlem', '”', 'odin', 'flung', 'up', 'his', 'hand', '“don’t', 'writ', 'that', 'anyway', 'hav', 'som', 'sham', 'her', 'i’ve', 'torn', 'my', 'heart', 'asund', 'bef', 'you', 'and', 'you', 'seiz', 'the', 'opportun', 'and', 'ar', 'fing', 'the', 'wound', 'in', 'both', 'halv', 'oh', 'my', 'god', '”']


In [26]:
# Snowball-stem (English) every text_to_word_sequence token.
snowballStemmer = SnowballStemmer("english")
tokenized_snowball_word_3 = list(map(snowballStemmer.stem, tokenized_word_3))
print(tokenized_snowball_word_3)

['“have', 'merci', 'gentlemen', '”', 'odin', 'flung', 'up', 'his', 'hand', "“don't", 'write', 'that', 'anyway', 'have', 'some', 'shame', 'here', "i'v", 'torn', 'my', 'heart', 'asund', 'befor', 'you', 'and', 'you', 'seiz', 'the', 'opportun', 'and', 'are', 'finger', 'the', 'wound', 'in', 'both', 'halv', 'oh', 'my', 'god', '”']


## Hashing 피쳐 생성

- nltk의 word_tokenize 사용, stopword 제거

In [27]:
# Version 1: word_tokenize tokens, NLTK English stop words removed,
# 1- to 3-grams hashed into 2**10 feature columns.
vec = HashingVectorizer(
    tokenizer=word_tokenize,
    stop_words=stopwords.words('english'),
    ngram_range=(1, 3),
    n_features=2**10,
)
# Densify the sparse outputs for train and test.
X_1 = vec.fit_transform(trn['text']).toarray()
X_tst_1 = vec.transform(tst['text']).toarray()
print(X_1.shape, X_tst_1.shape)

(54879, 1024) (19617, 1024)


In [28]:
# Peek at the first 50 feature values of the first training row (version 1).
X_1[0, :50]

array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.09950372,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        , -0.09950372,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        , -0.09950372,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ])

- nltk의 WordPunctTokenizer 사용, stopword 제거

In [29]:
# Version 2: WordPunctTokenizer tokens, NLTK English stop words removed,
# 1- to 3-grams hashed into 2**10 feature columns.
vec = HashingVectorizer(
    tokenizer=wordPunctTokenizer.tokenize,
    stop_words=stopwords.words('english'),
    ngram_range=(1, 3),
    n_features=2**10,
)
# Densify the sparse outputs for train and test.
X_2 = vec.fit_transform(trn['text']).toarray()
X_tst_2 = vec.transform(tst['text']).toarray()
print(X_2.shape, X_tst_2.shape)

(54879, 1024) (19617, 1024)


In [30]:
# Peek at the first 50 feature values of the first training row (version 2).
X_2[0, :50]

array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.09950372,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        , -0.09950372,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        , -0.09950372,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ])

- keras의 text_to_word_sequence 사용, stopword 제거

In [31]:
# Version 3: Keras text_to_word_sequence tokens, NLTK English stop words removed,
# 1- to 3-grams hashed into 2**10 feature columns.
vec = HashingVectorizer(
    tokenizer=text_to_word_sequence,
    stop_words=stopwords.words('english'),
    ngram_range=(1, 3),
    n_features=2**10,
)
# Densify the sparse outputs for train and test.
X_3 = vec.fit_transform(trn['text']).toarray()
X_tst_3 = vec.transform(tst['text']).toarray()
print(X_3.shape, X_tst_3.shape)

(54879, 1024) (19617, 1024)


In [32]:
# Peek at the first 50 feature values of the first training row (version 3).
X_3[0, :50]

array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        , -0.13245324,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        , -0.13245324,
        0.        ,  0.        ,  0.        ,  0.        , -0.13245324,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ])

- nltk의 word_tokenize 사용, stopword 제거 안 함

In [33]:
# Version 4: word_tokenize tokens, no stop-word list passed,
# 1- to 3-grams hashed into 2**10 feature columns.
vec = HashingVectorizer(
    tokenizer=word_tokenize,
    ngram_range=(1, 3),
    n_features=2**10,
)
# Densify the sparse outputs for train and test.
X_4 = vec.fit_transform(trn['text']).toarray()
X_tst_4 = vec.transform(tst['text']).toarray()
print(X_4.shape, X_tst_4.shape)

(54879, 1024) (19617, 1024)


In [34]:
# Peek at the first 50 feature values of the first training row (version 4).
X_4[0, :50]

array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.06917145,  0.        ,
        0.        ,  0.06917145,  0.        , -0.06917145, -0.06917145,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        , -0.06917145,
        0.        , -0.06917145,  0.        ,  0.        ,  0.06917145,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.06917145,  0.        ,  0.06917145,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ])

- nltk의 WordPunctTokenizer 사용, stopword 제거 안 함

In [35]:
# Version 5: WordPunctTokenizer tokens, no stop-word list passed,
# 1- to 3-grams hashed into 2**10 feature columns.
vec = HashingVectorizer(
    tokenizer=wordPunctTokenizer.tokenize,
    ngram_range=(1, 3),
    n_features=2**10,
)
# Densify the sparse outputs for train and test.
X_5 = vec.fit_transform(trn['text']).toarray()
X_tst_5 = vec.transform(tst['text']).toarray()
print(X_5.shape, X_tst_5.shape)

(54879, 1024) (19617, 1024)


In [36]:
# Peek at the first 50 feature values of the first training row (version 5).
X_5[0, :50]

array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.06917145,  0.        ,
        0.        ,  0.06917145,  0.        , -0.06917145, -0.06917145,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        , -0.06917145,
        0.        , -0.06917145,  0.        ,  0.        ,  0.06917145,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.06917145,  0.        ,  0.06917145,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ])

- keras의 text_to_word_sequence 사용, stopword 제거 안 함

In [37]:
# Version 6: Keras text_to_word_sequence tokens, no stop-word list passed,
# 1- to 3-grams hashed into 2**10 feature columns.
vec = HashingVectorizer(
    tokenizer=text_to_word_sequence,
    ngram_range=(1, 3),
    n_features=2**10,
)
# Densify the sparse outputs for train and test.
X_6 = vec.fit_transform(trn['text']).toarray()
X_tst_6 = vec.transform(tst['text']).toarray()
print(X_6.shape, X_tst_6.shape)

(54879, 1024) (19617, 1024)


In [38]:
# Peek at the first 50 feature values of the first training row (version 6).
X_6[0, :50]

array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.07881104,  0.        , -0.07881104,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        , -0.07881104,
        0.        , -0.07881104,  0.        ,  0.        ,  0.07881104,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.07881104,  0.        ,  0.07881104,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ])

## cnn 모델 학습

In [39]:
# Stratified, shuffled K-fold splitter with n_fold splits; seeded so the folds are reproducible.
cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)

In [40]:
def get_model(number):
    """Build and compile the 1-D CNN classifier for one feature matrix.

    Args:
        number: Length of each input feature vector; the network input has
            shape ``(batch, number, 1)``.

    Returns:
        A compiled Keras ``Model`` ending in an ``n_class``-way softmax,
        using categorical cross-entropy loss and the Adam optimizer.
    """
    feature_input = Input(batch_shape=(None, number, 1))

    # Two identical strided convolutions, then global max pooling and a dense head.
    hidden = feature_input
    for _ in range(2):
        hidden = Conv1D(128, 7, padding="valid", activation="relu", strides=3)(hidden)
    hidden = GlobalMaxPooling1D()(hidden)
    hidden = Dense(128, activation='relu')(hidden)
    class_probs = Dense(n_class, activation='softmax')(hidden)

    network = Model(inputs=feature_input, outputs=class_probs)
    network.compile(loss='categorical_crossentropy', optimizer='adam')
    return network

In [41]:
# Target labels (the author column) as a NumPy array; display its shape.
y = trn.author.values
y.shape

(54879,)

In [42]:
# Out-of-fold (validation) and test prediction buffers, one pair per feature
# version. They stay individually named because later cells read them by name
# when writing the submission and prediction files.
p_val_ver1 = np.zeros((X_1.shape[0], n_class))
p_tst_ver1 = np.zeros((X_tst_1.shape[0], n_class))
p_val_ver2 = np.zeros((X_2.shape[0], n_class))
p_tst_ver2 = np.zeros((X_tst_2.shape[0], n_class))
p_val_ver3 = np.zeros((X_3.shape[0], n_class))
p_tst_ver3 = np.zeros((X_tst_3.shape[0], n_class))
p_val_ver4 = np.zeros((X_4.shape[0], n_class))
p_tst_ver4 = np.zeros((X_tst_4.shape[0], n_class))
p_val_ver5 = np.zeros((X_5.shape[0], n_class))
p_tst_ver5 = np.zeros((X_tst_5.shape[0], n_class))
p_val_ver6 = np.zeros((X_6.shape[0], n_class))
p_tst_ver6 = np.zeros((X_tst_6.shape[0], n_class))

# (train features, test features, OOF buffer, test buffer) for versions 1..6.
feature_sets = [
    (X_1, X_tst_1, p_val_ver1, p_tst_ver1),
    (X_2, X_tst_2, p_val_ver2, p_tst_ver2),
    (X_3, X_tst_3, p_val_ver3, p_tst_ver3),
    (X_4, X_tst_4, p_val_ver4, p_tst_ver4),
    (X_5, X_tst_5, p_val_ver5, p_tst_ver5),
    (X_6, X_tst_6, p_val_ver6, p_tst_ver6),
]

for X, test, p_val, p_tst in feature_sets:
    for i_cv, (i_trn, i_val) in enumerate(cv.split(X, y), 1):
        print(f'Training model for CV #{i_cv}')

        # New callback per fold: stop once val_loss has not improved by at
        # least 0.001 for 3 epochs and roll back to the best weights.
        es = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=3,
                           verbose=1, mode='min', baseline=None, restore_best_weights=True)

        clf = get_model(X.shape[1])
        # num_classes is pinned so the one-hot width always matches the
        # n_class-way softmax, whatever labels a fold happens to contain.
        clf.fit(X[i_trn],
                to_categorical(y[i_trn], num_classes=n_class),
                validation_data=(X[i_val], to_categorical(y[i_val], num_classes=n_class)),
                epochs=100,
                batch_size=512,
                callbacks=[es])

        # Predict: fill this fold's out-of-fold rows and accumulate the test
        # predictions. The test average is over folds, so divide by n_fold
        # (previously n_class, which only worked because both equal 5).
        # Both updates are in place, so the named buffers above are modified.
        p_val[i_val, :] = clf.predict(X[i_val])
        p_tst += clf.predict(test) / n_fold

        # Release the model and the Keras graph state before the next fold.
        del clf
        clear_session()
        gc.collect()

    print("Training has finished")
    print("*"*100)


# Cross-validated accuracy and log loss of the out-of-fold predictions,
# labelled with the actual algorithm name instead of the stale "lr".
y_onehot = pd.get_dummies(y)
oof_predictions = (p_val_ver1, p_val_ver2, p_val_ver3,
                   p_val_ver4, p_val_ver5, p_val_ver6)
for version, p_val in enumerate(oof_predictions, 1):
    print(f'{algo_name} ver{version} Accuracy (CV): {accuracy_score(y, np.argmax(p_val, axis=1)) * 100:8.4f}%')
    print(f'{algo_name} ver{version} Log Loss (CV): {log_loss(y_onehot, p_val):8.4f}')

Training model for CV #1
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 00021: early stopping
Training model for CV #2
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 00023: early stopping
Training model for CV #3
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 00022: early stopping
Training m

Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 00020: early stopping
Training model for CV #5
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 00022: early stopping
Training has finished
****************************************************************************************************
Training model for CV #1
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 00015: early stopping
Training model for CV #2
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoc

Epoch 13/100
Epoch 14/100
Epoch 00014: early stopping
Training model for CV #4
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 00013: early stopping
Training model for CV #5
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 00015: early stopping
Training has finished
****************************************************************************************************
Training model for CV #1
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 00012: early stopping
Training model for CV #2
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 00015: early stopping
Training model for CV #5
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 00016: early stopping
Training has finished
****************************************************************************************************
Training model for CV #1
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 00024: early stopping
Training model for CV #2
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 00026: early stopping
Training model for CV #4
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 00022: early stopping
Training model for CV #5
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 00021: early stopping
Training

Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 00022: early stopping
Training model for CV #2
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 00021: early stopping
Training model for CV #3
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 00026: early stopping
Training model for CV #4
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5

Epoch 15/100
Epoch 16/100
Epoch 00016: early stopping
Training model for CV #5
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 00025: early stopping
Training has finished
****************************************************************************************************
Training model for CV #1
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 00015: early stopping
Training model for CV #2
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoc

Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 00015: early stopping
Training model for CV #4
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 00020: early stopping
Training model for CV #5
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 00022: early stopping
Training has finished
****************************************************************************************************
lr ver1 Accuracy (CV):  46.6444%
lr ver1 Log Loss (CV):   1.3128
lr ver2 Accuracy (CV):  42.4042%
lr ver2 Log Loss (CV):   1.3840
lr ver3 Accuracy (CV):  35.7678%


## 제출 파일 생성 및 기타 파일 생성

In [43]:
# Create one submission file per feature version.

sub = pd.read_csv(sample_file, index_col=0)

for test_pred, out_path in (
    (p_tst_ver1, sub_ver1_file),  # Ver1
    (p_tst_ver2, sub_ver2_file),  # Ver2
    (p_tst_ver3, sub_ver3_file),  # Ver3
    (p_tst_ver4, sub_ver4_file),  # Ver4
    (p_tst_ver5, sub_ver5_file),  # Ver5
    (p_tst_ver6, sub_ver6_file),  # Ver6
):
    # Overwrite every column of the sample submission with this version's
    # test predictions, then write it out.
    sub[sub.columns] = test_pred
    sub.to_csv(out_path)

In [44]:
# Write the p_val (out-of-fold) predictions, one CSV per feature version.

for oof_pred, out_path in (
    (p_val_ver1, p_val_ver1_file),  # Ver1
    (p_val_ver2, p_val_ver2_file),  # Ver2
    (p_val_ver3, p_val_ver3_file),  # Ver3
    (p_val_ver4, p_val_ver4_file),  # Ver4
    (p_val_ver5, p_val_ver5_file),  # Ver5
    (p_val_ver6, p_val_ver6_file),  # Ver6
):
    np.savetxt(out_path, oof_pred, fmt='%.18f', delimiter=',')

In [45]:
# Write the p_tst (test) predictions, one CSV per feature version.

for test_pred, out_path in (
    (p_tst_ver1, p_tst_ver1_file),  # Ver1
    (p_tst_ver2, p_tst_ver2_file),  # Ver2
    (p_tst_ver3, p_tst_ver3_file),  # Ver3
    (p_tst_ver4, p_tst_ver4_file),  # Ver4
    (p_tst_ver5, p_tst_ver5_file),  # Ver5
    (p_tst_ver6, p_tst_ver6_file),  # Ver6
):
    np.savetxt(out_path, test_pred, fmt='%.18f', delimiter=',')