## 라이브러리 import 및 설정

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import re
import os

from pathlib import Path
import warnings

from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import pos_tag

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import StratifiedKFold

import tensorflow as tf
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, LSTM, Dropout
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import plot_model, to_categorical
from tensorflow.keras.optimizers import Adam

In [3]:
# Notebook-wide display and warning settings.
# The fully-qualified option name is required: the bare 'max_columns' pattern
# is ambiguous on pandas versions that also define 'styler.render.max_columns'
# and raises OptionError there.
pd.set_option('display.max_columns', 100)
pd.set_option("display.precision", 4)
# Suppress every warning for the rest of the session.
warnings.simplefilter('ignore')

In [4]:
# Detect GPUs and, when any exist, make only the first one visible to TensorFlow.
gpus = tf.config.experimental.list_physical_devices('GPU')
if not gpus:
    print('No GPU detected')
else:
    try:
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    except RuntimeError as err:
        # Raised when visible devices are changed after the GPUs have
        # already been initialized; report it and carry on.
        print(err)
    else:
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")

1 Physical GPUs, 1 Logical GPU


In [5]:
# Same display/warning settings as cell In [3], applied again.
# The fully-qualified option name is required: the bare 'max_columns' pattern
# is ambiguous on pandas versions that also define 'styler.render.max_columns'
# and raises OptionError there.
pd.set_option('display.max_columns', 100)
pd.set_option("display.precision", 4)
# Suppress every warning for the rest of the session.
warnings.simplefilter('ignore')

## 학습데이터 로드 및 GloVe 임베딩 로드

http://nlp.stanford.edu/data/glove.6B.zip 를 다운받아 `data_dir`에 압축을 푼다. 이 노트북은 그중 100차원 벡터 파일인 `glove.6B.100d.txt`만 사용한다.

In [6]:
# Experiment-wide constants.
target_col = 'author'
n_fold = 5
n_class = 5
seed = 2020

# Raw inputs are read from data_dir; generated artifacts go under ../build.
data_dir = Path('../data/dacon-novel-author-classification')
build_dir = Path('../build')
feature_dir, val_dir, tst_dir, sub_dir = (
    build_dir / name for name in ('feature', 'val', 'tst', 'sub')
)

# Create the output directories up front (no error if they already exist).
dirs = [feature_dir, val_dir, tst_dir, sub_dir]
for d in dirs:
    os.makedirs(d, exist_ok=True)

# Input files expected inside data_dir.
trn_file = data_dir / 'train.csv'
tst_file = data_dir / 'test_x.csv'
sample_file = data_dir / 'sample_submission.csv'
glove_file = data_dir / 'glove.6B.100d.txt'

In [7]:
# The model identifier is "<algorithm>_<feature set>"; every output file name
# below is derived from it (or from the feature name alone).
algo_name = 'lstm'
feature_name = 'lemmatization-glove-emb'
model_name = '_'.join([algo_name, feature_name])

feature_file = feature_dir / (feature_name + '.csv')

# Validation-set and test-set prediction dumps.
p_val_file = val_dir / (model_name + '.val.csv')
p_tst_file = tst_dir / (model_name + '.tst.csv')

# Final submission CSV.
sub_file = sub_dir / (model_name + '.csv')

In [8]:
# Optional one-time setup, left disabled: download the GloVe archive into
# data_dir and unpack it there. Uncomment to run.
# os.system(f'wget http://nlp.stanford.edu/data/glove.6B.zip -P {data_dir}')
# os.system(f'unzip {data_dir}/glove.6B.zip -d {data_dir}')

In [10]:
# Parse the GloVe text file into a {word: vector} lookup.
# Each line holds a token followed by its space-separated float components.
embeddings_index = {}
# Read explicitly as UTF-8 so parsing does not depend on the platform's
# default locale encoding (the vocabulary contains non-ASCII tokens).
with open(glove_file, encoding='utf-8') as f:
    for line in f:
        # Split off the token; the remainder is the vector text.
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print(f'Found {len(embeddings_index)} word vectors.')

Found 400000 word vectors.


In [11]:
# Load the training set; the first CSV column becomes the index.
trn = pd.read_csv(trn_file, index_col=0)
print(trn.shape)
# Preview the first rows.
trn.head()

(54879, 2)


Unnamed: 0_level_0,text,author
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"He was almost choking. There was so much, so m...",3
1,"“Your sister asked for it, I suppose?”",2
2,"She was engaged one day as she walked, in per...",1
3,"The captain was in the porch, keeping himself ...",4
4,"“Have mercy, gentlemen!” odin flung up his han...",3


In [12]:
# Load the test set; the first CSV column becomes the index.
tst = pd.read_csv(tst_file, index_col=0)
print(tst.shape)
# Preview the first rows.
tst.head()

(19617, 1)


Unnamed: 0_level_0,text
index,Unnamed: 1_level_1
0,“Not at all. I think she is one of the most ch...
1,"""No,"" replied he, with sudden consciousness, ""..."
2,As the lady had stated her intention of scream...
3,“And then suddenly in the silence I heard a so...
4,His conviction remained unchanged. So far as I...


## 데이터 전처리 

In [13]:
# Helper that strips punctuation and other symbols from a string.
def alpha_num(text):
    """Return `text` keeping only ASCII letters, digits, apostrophes and spaces.

    Every other character (punctuation, typographic quotes, tabs, newlines,
    ...) is deleted rather than replaced, so the tokens on either side of a
    removed symbol are joined together.
    """
    disallowed = re.compile(r"[^A-Za-z0-9' ]")
    return disallowed.sub('', text)

In [14]:
# Apply the preprocessing to both splits: lowercase first, then drop symbols.
for frame in (trn, tst):
    frame['text'] = frame['text'].str.lower().apply(alpha_num)

In [15]:
# Inspect the training frame after lowercasing and symbol removal.
trn

Unnamed: 0_level_0,text,author
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,he was almost choking there was so much so muc...,3
1,your sister asked for it i suppose,2
2,she was engaged one day as she walked in peru...,1
3,the captain was in the porch keeping himself c...,4
4,have mercy gentlemen odin flung up his hands d...,3
...,...,...
54874,is that you mr smith odin whispered i hardly d...,2
54875,i told my plan to the captain and between us w...,4
54876,your sincere wellwisher friend and sister luc...,1
54877,then you wanted me to lend you money,3


In [16]:
# Pull plain numpy arrays out of the frames: text inputs for both splits and
# the author labels for the training split.
X_trn, X_tst = (frame['text'].values for frame in (trn, tst))
y = trn['author'].values
print(X_trn.shape, X_tst.shape, y.shape)

(54879,) (19617,) (54879,)


In [17]:
# Quick look at the raw training texts that will be fed to the vectorizer.
X_trn

array(['he was almost choking there was so much so much he wanted to say but strange exclamations were all that came from his lips the pole gazed fixedly at him at the bundle of notes in his hand looked at odin and was in evident perplexity',
       'your sister asked for it i suppose',
       ' she was engaged one day as she walked in perusing janes last letter and dwelling on some passages which proved that jane had not written in spirits when instead of being again surprised by mr odin she saw on looking up that odin was meeting her putting away the letter immediately and forcing a smile she said',
       ..., ' your sincere wellwisher friend and sister lucy odin',
       'then you wanted me to lend you money',
       'it certainly had not occurred to me before but i said yes i should like that'],
      dtype=object)

In [18]:
# Learn the vocabulary from the training texts only, streamed in batches of 128.
# The vectorizer keeps at most 20000 tokens and emits sequences of length 500.
text_ds = tf.data.Dataset.from_tensor_slices(X_trn).batch(128)
vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=500)
vectorizer.adapt(text_ds)

In [19]:
# Show the first five vocabulary entries.
vectorizer.get_vocabulary()[:5]

['', '[UNK]', 'the', 'and', 'to']

In [20]:
# Map every vocabulary token to its integer id (its position in the vocabulary).
voc = vectorizer.get_vocabulary()
word_index = {token: idx for idx, token in enumerate(voc)}

In [21]:
# Build the embedding matrix: row i holds the GloVe vector of vocabulary token i.
embedding_dim = 100
# NOTE(review): the +2 leaves two all-zero rows past the end of the
# vocabulary; their purpose is not evident from this notebook.
num_tokens = len(voc) + 2
embedding_matrix = np.zeros((num_tokens, embedding_dim))

hits = 0
misses = 0
for token, row in word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        # No GloVe vector for this token: its row stays all-zeros.
        # This also covers the '' (padding) and '[UNK]' entries.
        misses += 1
        continue
    embedding_matrix[row] = vector
    hits += 1
print(f"Converted {hits} words ({misses} misses)")

Converted 18190 words (1810 misses)


In [22]:
# Embedding layer initialised from the GloVe matrix and kept frozen, so the
# pre-trained vectors are not updated during training.
embedding_layer = Embedding(
    input_dim=num_tokens,
    output_dim=embedding_dim,
    embeddings_initializer=Constant(embedding_matrix),
    trainable=False,
)

## 케라스 모델 학습

In [23]:
# Stratified, shuffled K-fold splitter; the fixed random_state makes the fold
# assignment repeatable.
cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)

In [24]:
def get_model():
    """Build and compile the stacked bidirectional-LSTM author classifier.

    The network takes raw strings, runs them through the module-level
    `vectorizer` and the frozen module-level `embedding_layer`, then through
    two Bidirectional LSTM(64) layers and a softmax layer with `n_class`
    outputs.

    Returns:
        A Keras `Model` compiled with categorical cross-entropy loss and the
        Adam optimizer (learning rate 0.01).
    """
    # One raw string per sample; tokenization happens inside the model.
    text_input = Input(shape=(1,), dtype=tf.string)
    token_ids = vectorizer(text_input)
    hidden = embedding_layer(token_ids)

    # The first recurrent layer returns the full sequence so that the second
    # one can consume it; the second returns only its final state.
    hidden = Bidirectional(LSTM(64, return_sequences=True))(hidden)
    hidden = Bidirectional(LSTM(64))(hidden)
    class_probs = Dense(n_class, activation="softmax")(hidden)

    model = Model(inputs=text_input, outputs=class_probs)
    model.compile(optimizer=Adam(learning_rate=.01),
                  loss='categorical_crossentropy')
    return model

In [None]:
# p_val collects out-of-fold predictions for every training row;
# p_tst accumulates the average of the per-fold test predictions.
p_val = np.zeros((X_trn.shape[0], n_class))
p_tst = np.zeros((X_tst.shape[0], n_class))

for fold, (trn_idx, val_idx) in enumerate(cv.split(X_trn, y), 1):
    print(f'training model for CV #{fold}')

    # Stop once val_loss has not improved by at least 0.001 for 3 epochs,
    # then roll back to the best weights seen.
    early_stop = EarlyStopping(
        monitor='val_loss',
        min_delta=0.001,
        patience=3,
        verbose=1,
        mode='min',
        baseline=None,
        restore_best_weights=True,
    )

    # A fresh model per fold, trained on the fold's training part and
    # validated on its held-out part.
    clf = get_model()
    clf.fit(
        x=X_trn[trn_idx],
        y=to_categorical(y[trn_idx]),
        validation_data=(X_trn[val_idx], to_categorical(y[val_idx])),
        epochs=200,
        batch_size=1024,
        callbacks=[early_stop],
    )

    p_val[val_idx, :] = clf.predict(X_trn[val_idx])
    p_tst += clf.predict(X_tst) / n_fold

print("Training has finished")
print("*"*100)

# Cross-validated metrics computed from the out-of-fold predictions.
print(f'Accuracy (CV): {accuracy_score(y, np.argmax(p_val, axis=1)) * 100:8.4f}%')
print(f'Log Loss (CV): {log_loss(pd.get_dummies(y), p_val):8.4f}')

training model for CV #1
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 00011: early stopping
training model for CV #2
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 00012: early stopping
training model for CV #3
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 00011: early stopping
training model for CV #4
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 00010: early stopping
training model for CV #5
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 00010: early stopping


## 시각화

In [None]:
# Print the layer-by-layer summary of the model from the last CV fold.
clf.summary()

## 제출 파일 생성 및 기타 파일 생성

In [None]:
# Create the submission file.

# Start from the sample submission (first column as index), overwrite all of
# its columns with the fold-averaged test predictions, and save it.
sub = pd.read_csv(sample_file, index_col=0)
sub[sub.columns] = p_tst
sub.to_csv(sub_file)

In [None]:
# Save p_val, the out-of-fold (OOF) predictions on the training set, as CSV.

np.savetxt(p_val_file, p_val, fmt='%.18f', delimiter=',')

In [None]:
# Save p_tst, the fold-averaged predictions on the test set, as CSV.

np.savetxt(p_tst_file, p_tst, fmt='%.18f', delimiter=',')