In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
from matplotlib import rcParams, pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path
import re
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import StratifiedKFold
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, GlobalMaxPooling1D, Conv1D, Dropout, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import plot_model, to_categorical
from tensorflow.keras.optimizers import Adam
import warnings
import nltk
warnings.filterwarnings(action='ignore')

In [4]:
import os
os.chdir('/content/drive/My Drive/Colab Notebooks/오픈소스_기말')

In [5]:

train = pd.read_csv('data/train.csv', encoding = 'utf-8')
test = pd.read_csv('data/test_x.csv', encoding = 'utf-8')
sample_submission = pd.read_csv('data/sample_submission.csv', encoding = 'utf-8')

In [6]:
def alpha_num(text):
    return re.sub(r'[^A-Za-z0-9 ]', '', text)

train['text']=train['text'].apply(alpha_num)

In [7]:
# 불용어 제거해주는 함수
def remove_stopwords(text):
    final_text = []
    for i in text.split():
        if i.strip().lower() not in stopwords:
            final_text.append(i.strip())
    return " ".join(final_text)

# 불용어
stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", 
             "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", 
             "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", 
             "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", 
             "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", 
             "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", 
             "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", 
             "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", 
             "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", 
             "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", 
             "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]

In [8]:
#전처리 적용
train['text'] = train['text'].str.lower()
test['text'] = test['text'].str.lower()
train['text'] = train['text'].apply(alpha_num).apply(remove_stopwords)
test['text'] = test['text'].apply(alpha_num).apply(remove_stopwords)

In [32]:
# train test 분리
X_train = train['text'].values
X_test = test['text'].values
y = train['author'].values
print(X_train.shape, X_test.shape, y.shape)

(54879,) (19617,) (54879,)


In [33]:
X_train[:3]

array(['almost choking much much wanted say strange exclamations came lips pole gazed fixedly bundle notes hand looked odin evident perplexity',
       'sister asked suppose',
       'engaged one day walked perusing janes last letter dwelling passages proved jane not written spirits instead surprised mr odin saw looking odin meeting putting away letter immediately forcing smile said'],
      dtype=object)

In [40]:
#파라미터 설정
vocab_size = 20000
embedding_dim = 64
max_length = 500
padding_type='post'
drop_rate = 0.2
# oov_tok = "<OOV>"

In [41]:
#tokenizer에 fit
tokenizer = Tokenizer(num_words = vocab_size)#, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

In [42]:
#데이터를 sequence로 변환해주고 padding 해줍니다.
train_sequences = tokenizer.texts_to_sequences(X_train)
trn = pad_sequences(train_sequences, padding=padding_type, maxlen=max_length)

test_sequences = tokenizer.texts_to_sequences(X_test)
tst = pad_sequences(test_sequences, padding=padding_type, maxlen=max_length)

print(trn.shape, tst.shape)

(54879, 500) (19617, 500)


In [43]:
n_fold = 5
n_class = 5
cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=42)

In [49]:
def get_model():
  """
  model = Sequential([
                      Embedding(vocab_size, embedding_dim, input_length=max_length),
                      Dropout(drop_rate),
                      Conv1D(128, 7, padding="valid", activation="relu", strides=3),
                      Conv1D(128, 7, padding="valid", activation="relu", strides=3),
                      GlobalMaxPooling1D(),
                      Dense(128, activation='relu'),
                      Dropout(drop_rate),
                      Dense(64, activation='relu'),
                      Dropout(drop_rate),
                      Dense(n_class, activation='softmax')
                      ])
    
    # compile model
  model.compile(loss='categorical_crossentropy',
                optimizer=Adam(learning_rate=.0003))
  return model
  """
  model = Sequential([
        Embedding(vocab_size, embedding_dim, input_length=max_length),
        Bidirectional(LSTM(64, return_sequences=True)),
        Bidirectional(LSTM(64)),
        Dense(n_class, activation='softmax')
    ])
    
  model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=.01))
  return model
  

In [50]:
# fit model
num_epochs = 10

p_val = np.zeros((train_padded.shape[0], n_class))
p_tst = np.zeros((test_padded.shape[0], n_class))

for i, (i_trn, i_val) in enumerate(cv.split(trn, y), 1):
  print(f'training model for CV #{i}')
  model = get_model()
  es = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=3, verbose=1,
                     mode='min', restore_best_weights=True)
  model.fit(trn[i_trn], to_categorical(y[i_trn]), 
            validation_data=(trn[i_val], to_categorical(y[i_trn])),
            epochs=num_epochs, callbacks=[es], verbose=1)
  p_val[i_val, :] = model.predict(trn[i_val])
  p_tst += model.predict(tst) / n_fold


"""
model = get_model()
es = EarlyStopping(monitor='val_loss', 
                   min_delta=0.000001,
                   patience=3,
                   verbose=1,
                   mode='min',
                   restore_best_weights=True)
rlr = ReduceLROnPlateau(monitor='val_loss', 
                        factor=0.3,
                        patience=3, 
                        min_lr=1e-6, 
                        mode='min', 
                        verbose=1)
history = model.fit(train_padded, y_train, 
                    epochs=num_epochs,
                    verbose=2, 
                    validation_split=0.2,
                    callbacks=[es, rlr])
"""

training model for CV #1
Epoch 1/10

ValueError: ignored

In [None]:
print(f'Accuracy (CV): {accuracy_score(y, np.argmax(p_val, axis=1)) * 100:8.4f}%')
print(f'Log Loss (CV): {log_loss(pd.get_dummies(y), p_val):8.4f}')

In [None]:
np.savetxt(p_val_file, p_val, fmt='%.6f', delimiter=',')
np.savetxt(p_tst_file, p_tst, fmt='%.6f', delimiter=',')

# 시각화

In [None]:
# model summary
print(model.summary())

In [None]:
plot_model(clf)

In [None]:
pred = model.predict_proba(test_padded)
# submission
sample_submission[['0','1','2','3','4']] = pred
sample_submission

Instructions for updating:
Please use `model.predict()` instead.


In [None]:
sample_submission.to_csv('submission.csv', index = False, encoding = 'utf-8')