In [1]:
# Mount Google Drive inside the Colab runtime at the path below.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
import pandas as pd
import warnings 
warnings.filterwarnings(action='ignore')
import numpy as np
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import re

In [3]:
import os

# Switch the working directory to the project folder on the mounted Drive so
# that the relative 'data/...' paths used by later cells resolve.
PROJECT_DIR = '/content/drive/My Drive/Colab Notebooks/오픈소스_기말'
os.chdir(PROJECT_DIR)

In [4]:
def _read_utf8_csv(path):
    """Read a UTF-8 encoded CSV file into a DataFrame."""
    return pd.read_csv(path, encoding='utf-8')


# Training data, test inputs, and the submission template.
train = _read_utf8_csv('data/train.csv')
test = _read_utf8_csv('data/test_x.csv')
sample_submission = _read_utf8_csv('data/sample_submission.csv')

In [5]:
# Matches any single character that is not an ASCII letter, a digit, or a space.
_NON_ALNUM_SPACE = re.compile(r'[^A-Za-z0-9 ]')


def alpha_num(text):
    """Return `text` with everything except ASCII letters, digits and spaces removed.

    Tabs, newlines, punctuation (including apostrophes) and non-ASCII
    characters are all dropped; plain spaces are kept as they are.
    """
    return _NON_ALNUM_SPACE.sub('', text)

# Clean both splits here, not just `train`. With only `train` cleaned at this
# point, train went through alpha_num -> lower -> alpha_num while test went
# through lower -> alpha_num. Those orders differ for non-ASCII characters
# whose lowercase form contains ASCII letters, so the two splits could end up
# with slightly different text. alpha_num is idempotent, so the later
# preprocessing cell applying it again is harmless.
train['text'] = train['text'].apply(alpha_num)
test['text'] = test['text'].apply(alpha_num)

In [6]:
# Function that removes stopwords
def remove_stopwords(text):
    """Drop stopwords from `text` and return the remaining words joined by single spaces.

    The text is split on whitespace; a word is discarded when its lowercase
    form appears in the module-level `stopwords` list. Kept words retain
    their original casing.

    Args:
        text: The string to filter.

    Returns:
        The filtered string (empty if every word was a stopword).
    """
    # Build a set once per call so each word is an O(1) lookup instead of a
    # linear scan of the list. Reading `stopwords` at call time keeps the
    # function in sync with any later change to the list.
    stop_set = set(stopwords)
    # str.split() with no argument already discards surrounding whitespace,
    # so the words need no extra strip().
    return " ".join(word for word in text.split() if word.lower() not in stop_set)

# Stopwords
# NOTE(review): the preprocessing cell runs alpha_num (which deletes
# apostrophes) before remove_stopwords, so contraction entries such as "he'd"
# cannot match in that pipeline. They only match when this function is called
# on text that still contains apostrophes.
stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", 
             "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", 
             "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", 
             "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", 
             "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", 
             "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", 
             "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", 
             "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", 
             "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", 
             "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", 
             "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]

In [7]:
# Apply preprocessing to both splits: lowercase, keep only letters/digits/
# spaces, then drop stopwords. The 'text' column is overwritten in place.
for frame in (train, test):
    frame['text'] = (
        frame['text']
        .str.lower()
        .apply(alpha_num)
        .apply(remove_stopwords)
    )

In [8]:
# Pull the model inputs and labels out of the DataFrames as NumPy arrays:
# cleaned text for train/test and the `author` column as the training target.
X_train = np.array(train['text'].tolist())
X_test = np.array(test['text'].tolist())
y_train = np.array(train['author'].tolist())

In [38]:
# Hyperparameter settings

# -- text encoding --
vocab_size = 20000       # passed to the Tokenizer as num_words and to the Embedding layer
max_length = 1000        # maxlen used when padding sequences
padding_type = 'post'    # padding side passed to pad_sequences
oov_tok = "<OOV>"        # out-of-vocabulary token string

# -- network --
embedding_dim = 32       # Embedding output size
drop_rate = 0.2          # Dropout rate

In [39]:
# Fit the tokenizer on the training texts (the test texts are not used here).
# num_words is set to vocab_size. The oov_token argument is commented out on
# the constructor call below, so `oov_tok` is not passed to the Tokenizer.
tokenizer = Tokenizer(num_words = vocab_size)#, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)
# Word -> integer index mapping built by the fit above.
word_index = tokenizer.word_index

In [40]:
# Convert the texts to integer sequences and pad them to a common length.
def _pad(sequences):
    """Pad `sequences` to max_length using the configured padding side."""
    return pad_sequences(sequences, maxlen=max_length, padding=padding_type)


train_sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)

train_padded = _pad(train_sequences)
test_padded = _pad(test_sequences)

In [41]:
def get_model():
  """Build and compile a small text-classification network.

  Layers: Embedding(vocab_size, embedding_dim) over inputs of length
  max_length -> GlobalAveragePooling1D -> Dense(24, relu) ->
  Dropout(drop_rate) -> Dense(5, softmax).

  Reads the module-level settings vocab_size, embedding_dim, max_length and
  drop_rate at call time. The layer summary is printed as a side effect.

  Returns:
    The compiled tf.keras.Sequential model (Adam optimizer,
    sparse categorical cross-entropy loss, accuracy metric).
  """
  # Lightweight NLP model
  model = tf.keras.Sequential([
      tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
      tf.keras.layers.GlobalAveragePooling1D(),
      tf.keras.layers.Dense(24, activation='relu'),
      tf.keras.layers.Dropout(drop_rate),
      tf.keras.layers.Dense(5, activation='softmax')
  ])

  # compile model
  # NOTE(review): labels are passed straight from the `author` column;
  # presumably integer class ids 0-4 to match this loss — confirm.
  model.compile(loss='sparse_categorical_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])

  # model summary
  # summary() prints the table itself and returns None; wrapping it in
  # print() emitted a stray "None" line after the table.
  model.summary()

  return model

In [42]:
# Cross-validation setup: number of folds, number of target classes, and a
# shuffled stratified splitter with a fixed seed.
n_fold, n_class = 2, 5
cv = StratifiedKFold(shuffle=True, n_splits=n_fold, random_state=42)

In [45]:
# fit model
num_epochs = 15


def _make_callbacks(es_min_delta):
  """Return an (EarlyStopping, ReduceLROnPlateau) pair, both monitoring val_loss.

  Args:
    es_min_delta: Minimum val_loss decrease that EarlyStopping counts as an
      improvement.
  """
  early_stop = EarlyStopping(monitor='val_loss',
                             min_delta=es_min_delta,
                             patience=3,
                             verbose=1,
                             mode='min',
                             restore_best_weights=True)
  # NOTE(review): this patience equals the EarlyStopping patience above, so on
  # a plain plateau both callbacks trigger on the same epoch — consider a
  # smaller value here if the learning-rate drop is meant to act first.
  reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                                factor=0.3,
                                patience=3,
                                min_lr=1e-6,
                                mode='min',
                                verbose=1)
  return early_stop, reduce_lr


def train_with_cv(es_min_delta=0.001):
  """Train one model per stratified fold and collect predictions.

  This is the cross-validation experiment that used to sit in this cell as a
  disabled string literal. It is NOT called by the notebook; the single fit
  below is what runs. Two problems from the disabled version are fixed: the
  callbacks are now passed to fit(), and only validation_data is supplied
  (the held-out fold) rather than validation_data together with
  validation_split.

  Uses the module-level cv, n_fold, n_class, train_padded, test_padded,
  y_train, num_epochs and get_model.

  Returns:
    (p_val, p_tst): out-of-fold predictions for the training rows with shape
    (n_train, n_class), and test predictions averaged over folds with shape
    (n_test, n_class).
  """
  p_val = np.zeros((train_padded.shape[0], n_class))
  p_tst = np.zeros((test_padded.shape[0], n_class))

  for i, (i_trn, i_val) in enumerate(cv.split(train_padded, y_train), 1):
    print(f'training model for CV #{i}')
    fold_model = get_model()
    fold_es, fold_rlr = _make_callbacks(es_min_delta)
    fold_model.fit(train_padded[i_trn], y_train[i_trn],
                   validation_data=(train_padded[i_val], y_train[i_val]),
                   epochs=num_epochs,
                   verbose=2,
                   callbacks=[fold_es, fold_rlr])
    p_val[i_val, :] = fold_model.predict(train_padded[i_val])
    p_tst += fold_model.predict(test_padded) / n_fold

  return p_val, p_tst


# Single fit on the full training set, holding out 20% for validation.
model = get_model()
es, rlr = _make_callbacks(es_min_delta=0.000001)
history = model.fit(train_padded, y_train,
                    epochs=num_epochs,
                    verbose=2,
                    validation_split=0.2,
                    callbacks=[es, rlr])

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 1000, 32)          640000    
_________________________________________________________________
global_average_pooling1d_11  (None, 32)                0         
_________________________________________________________________
dense_22 (Dense)             (None, 24)                792       
_________________________________________________________________
dropout_1 (Dropout)          (None, 24)                0         
_________________________________________________________________
dense_23 (Dense)             (None, 5)                 125       
Total params: 640,917
Trainable params: 640,917
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/15
1372/1372 - 15s - loss: 1.5723 - accuracy: 0.2695 - val_loss: 1.5662 - val_accuracy: 0.2680
Epoch

In [46]:
# Class probabilities for the test set. The model's last layer is a softmax,
# so predict() already returns one probability per class. predict_proba() was
# a deprecated alias on Sequential models and is not available in newer
# TensorFlow releases.
pred = model.predict(test_padded)
# submission: one probability column per class
sample_submission[['0','1','2','3','4']] = pred
sample_submission

Unnamed: 0,index,0,1,2,3,4
0,0,0.011979,0.877415,1.467129e-02,3.194911e-02,6.398524e-02
1,1,0.194985,0.598486,4.642662e-02,1.920685e-02,1.408957e-01
2,2,0.998123,0.000562,1.175205e-03,2.378594e-06,1.366934e-04
3,3,0.000002,0.000002,9.959502e-01,3.806567e-08,4.046233e-03
4,4,0.861440,0.071313,1.312528e-02,5.191208e-02,2.209895e-03
...,...,...,...,...,...,...
19612,19612,0.000002,0.999998,3.314313e-17,2.087472e-12,7.094321e-12
19613,19613,0.000062,0.015610,4.466674e-02,5.009388e-08,9.396614e-01
19614,19614,0.008890,0.989792,1.399330e-05,2.797556e-04,1.023737e-03
19615,19615,0.007264,0.990213,4.521116e-05,2.085479e-04,2.269156e-03


In [47]:
# Write the filled-in submission table to disk without the DataFrame index.
SUBMISSION_PATH = 'submission.csv'
sample_submission.to_csv(SUBMISSION_PATH, encoding='utf-8', index=False)