In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import re

from pathlib import Path
import warnings

from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import pos_tag

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.optimizers import RMSprop

In [3]:
# Notebook-wide display and warning settings.
# The option is addressed by its full name: pandas resolves short names by
# pattern matching and raises OptionError when the pattern matches more than
# one registered option, which 'max_columns' does on pandas versions that also
# register a styler option ending in 'max_columns'.
pd.set_option('display.max_columns', 100)
pd.set_option('display.precision', 4)
# Suppresses every Python warning for the rest of the session.
warnings.simplefilter('ignore')

## 학습데이터 로드

In [4]:
# Experiment constants.
target_col = 'author'
n_fold = 5
n_class = 5
seed = 2020

# Directory layout: the competition data folder plus the build sub-folders.
data_dir = Path('../data/dacon-novel-author-classification')
feature_dir, val_dir, tst_dir, sub_dir = map(
    Path,
    ('../build/feature', '../build/val', '../build/tst', '../build/sub'),
)

# Files inside the data folder.
trn_file = data_dir.joinpath('train.csv')
tst_file = data_dir.joinpath('test_x.csv')
sample_file = data_dir.joinpath('sample_submission.csv')

In [5]:
# Names identifying this experiment; model_name joins them with an underscore.
algo_name, feature_name = 'lstm', 'lemmatization-emb'
model_name = f'{algo_name}_{feature_name}'

# Artefact paths derived from those names and the build directories.
feature_file = feature_dir.joinpath(f'{feature_name}.csv')
p_val_file = val_dir.joinpath(f'{model_name}.val.csv')
p_tst_file = tst_dir.joinpath(f'{model_name}.tst.csv')
sub_file = sub_dir.joinpath(f'{model_name}.csv')

In [6]:
# Load the training CSV, using its first column as the row index, then show
# its shape and first rows.
trn = pd.read_csv(trn_file, index_col=0)
print(trn.shape)
trn.head()

(54879, 2)


Unnamed: 0_level_0,text,author
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"He was almost choking. There was so much, so m...",3
1,"“Your sister asked for it, I suppose?”",2
2,"She was engaged one day as she walked, in per...",1
3,"The captain was in the porch, keeping himself ...",4
4,"“Have mercy, gentlemen!” odin flung up his han...",3


In [7]:
# Load the test CSV the same way (first column as the row index), then show
# its shape and first rows.
tst = pd.read_csv(tst_file, index_col=0)
print(tst.shape)
tst.head()

(19617, 1)


Unnamed: 0_level_0,text
index,Unnamed: 1_level_1
0,“Not at all. I think she is one of the most ch...
1,"""No,"" replied he, with sudden consciousness, ""..."
2,As the lady had stated her intention of scream...
3,“And then suddenly in the silence I heard a so...
4,His conviction remained unchanged. So far as I...


# 데이터 전처리

In [8]:
def alpha_num(text):
    """Return `text` keeping only ASCII letters, digits, apostrophes and spaces.

    Every other character (punctuation, tabs, newlines, non-ASCII letters) is
    deleted outright rather than replaced by a space.
    """
    disallowed = re.compile(r"[^A-Za-z0-9' ]")
    return disallowed.sub('', text)

In [9]:
# Apply the preprocessing to both splits: lower-case the text column, then
# strip every character that alpha_num removes.
trn['text'] = trn['text'].str.lower().apply(alpha_num)
tst['text'] = tst['text'].str.lower().apply(alpha_num)

In [10]:
# 토큰화 및 품사 정보를 이용해서 표제어 추출

def get_wordnet_pos(word):
    """Map a POS tag string to the matching WordNet POS constant.

    Despite its name, `word` is the tag produced by `pos_tag`. It is matched
    by prefix: 'J' -> wordnet.ADJ, 'V' -> wordnet.VERB, 'N' -> wordnet.NOUN,
    'R' -> wordnet.ADV. Anything else (including an empty string) gives None.
    """
    # (prefix, attribute name on `wordnet`) pairs, checked in this order.
    prefix_to_attr = (('J', 'ADJ'), ('V', 'VERB'), ('N', 'NOUN'), ('R', 'ADV'))
    for prefix, attr in prefix_to_attr:
        if word.startswith(prefix):
            return getattr(wordnet, attr)
    return None
    
def get_lemmatization(docs):
    """Tokenize, POS-tag and lemmatize every document in `docs`.

    Each document is split with `word_tokenize` and tagged with `pos_tag`.
    A token whose tag maps to a WordNet POS (via `get_wordnet_pos`) is
    lemmatized with that POS; a token with no mapping is kept unchanged.

    Returns a list holding one list of tokens per input document, in order.
    """
    lemmatizer = WordNetLemmatizer()

    def lemmatize_sentence(sentence):
        # Lemmatize one document and return its tokens as a list.
        lemmas = []
        for token, tag in pos_tag(word_tokenize(sentence)):
            wn_pos = get_wordnet_pos(tag)
            if wn_pos is None:
                lemmas.append(token)
            else:
                lemmas.append(lemmatizer.lemmatize(token, wn_pos))
        return lemmas

    return [lemmatize_sentence(sentence) for sentence in docs]

# Lemmatize the cleaned text of both splits. Each result is a list with one
# list of tokens per row.
trn_doc = get_lemmatization(trn['text'])
tst_doc = get_lemmatization(tst['text'])

In [11]:
# Build the model inputs: one space-joined string per lemmatized document,
# plus the author labels of the training rows.
X_train = np.array(list(map(" ".join, trn_doc)))
X_test = np.array(list(map(" ".join, tst_doc)))
y_train = np.array(list(trn['author']))

In [12]:
# Inspect the lemmatized training strings.
X_train

array(['he be almost choke there be so much so much he want to say but strange exclamation be all that come from his lip the pole gaze fixedly at him at the bundle of note in his hand look at odin and be in evident perplexity',
       'your sister ask for it i suppose',
       'she be engage one day as she walk in peruse janes last letter and dwelling on some passage which prove that jane have not write in spirit when instead of be again surprise by mr odin she saw on look up that odin be meet her put away the letter immediately and force a smile she say',
       ..., 'your sincere wellwisher friend and sister lucy odin',
       'then you want me to lend you money',
       'it certainly have not occur to me before but i say yes i should like that'],
      dtype='<U2342')

# **모델링**

In [13]:
# Hyperparameters used by the tokenizer, the padding step and the embedding layer.
vocab_size, embedding_dim, max_length = 30000, 128, 500
padding_type = 'post'
# oov_tok = "<OOV>"  # out-of-vocabulary token, currently disabled

In [14]:
# Fit the tokenizer on the training text only and keep its word -> index map.
# No OOV token is configured (the oov_token argument is left disabled).
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(texts=X_train)
word_index = tokenizer.word_index

In [15]:
# Convert each text into a sequence of word indices with the fitted tokenizer.
train_sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)

# Pad both splits with identical settings so they share one input shape.
pad_kwargs = dict(maxlen=max_length, padding=padding_type)
train_padded = pad_sequences(train_sequences, **pad_kwargs)
test_padded = pad_sequences(test_sequences, **pad_kwargs)

In [16]:
# Seed NumPy and TensorFlow before any weights are created to reduce
# run-to-run variation; `seed` is defined in the config cell but was never
# applied anywhere.
np.random.seed(seed)
tf.random.set_seed(seed)

# Lightweight text classifier:
# embedding -> two stacked bidirectional LSTMs -> two dense layers -> softmax.
# The output width comes from n_class instead of a hard-coded 5.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True)),
    tf.keras.layers.Dropout(rate=0.2),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128)),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(rate=0.2),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(rate=0.2),
    tf.keras.layers.Dense(n_class, activation='softmax')
])

In [17]:
# Early stopping on validation loss, configured with patience=7 and
# min_delta=0.0001; restore_best_weights=True asks the callback to put back
# the best weights when it stops training.
es = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    min_delta=0.0001,
    patience=7,
    verbose=1,
    baseline=None,
    restore_best_weights=True,
)

In [18]:
# Compile with sparse categorical cross-entropy, since y_train holds integer
# class ids rather than one-hot vectors.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=RMSprop(),
              metrics=['accuracy'])

# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 128)          3840000   
_________________________________________________________________
bidirectional (Bidirectional (None, 500, 256)          263168    
_________________________________________________________________
dropout (Dropout)            (None, 500, 256)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               394240    
_________________________________________________________________
dense (Dense)                (None, 256)               65792     
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               3

In [19]:
# Train for at most num_epochs with validation_split=0.2; the early-stopping
# callback `es` may end training sooner.
num_epochs = 200
history = model.fit(
    x=train_padded,
    y=y_train,
    validation_split=0.2,
    epochs=num_epochs,
    callbacks=[es],
    verbose=1,
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 00011: early stopping


In [20]:
# Predict class probabilities for the padded test sequences. The model ends
# in a softmax layer, so predict() already returns probabilities; it replaces
# the deprecated predict_proba(), whose own warning says to use predict().
pred = model.predict(test_padded)

Instructions for updating:
Please use `model.predict()` instead.


In [21]:
# Inspect the predicted class probabilities (one row per test sample).
pred

array([[1.90020762e-02, 6.55193806e-01, 2.71328539e-01, 4.41987254e-02,
        1.02768140e-02],
       [1.48093119e-01, 3.93888623e-01, 1.54965362e-02, 3.56710739e-02,
        4.06850666e-01],
       [9.94663239e-01, 2.85558240e-03, 3.82325816e-04, 1.24360586e-03,
        8.55244580e-04],
       ...,
       [6.33520528e-07, 9.99999285e-01, 1.95397742e-09, 7.56607861e-08,
        1.50890678e-08],
       [2.85093301e-05, 9.99966145e-01, 1.16811066e-07, 2.12656937e-06,
        3.05416393e-06],
       [9.73700702e-01, 8.16983450e-03, 3.62500455e-03, 6.78708777e-03,
        7.71729741e-03]], dtype=float32)

In [22]:
# Build the submission: read the sample submission (first column as index)
# and overwrite all of its columns with the predicted probabilities.
sub = pd.read_csv(sample_file,index_col=0)
sub[sub.columns] = pred

In [23]:
# Preview the first rows of the filled-in submission frame.
sub.head()

Unnamed: 0_level_0,0,1,2,3,4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.019,0.6552,0.2713,0.0442,0.0103
1,0.1481,0.3939,0.0155,0.0357,0.4069
2,0.9947,0.0029,0.0004,0.0012,0.0009
3,0.0001,0.0016,0.9976,0.0006,0.0002
4,0.9778,0.0071,0.0018,0.0043,0.009


In [24]:
# Write the submission CSV to the current working directory.
# NOTE(review): `sub_file` is defined in the config cell but is not used here;
# confirm whether this hard-coded filename is the intended output path.
sub.to_csv('submission_2.csv')