In [None]:
# Reload the autoreload extension and switch it to mode 2, so edits to
# imported modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import numpy as np
import pandas as pd

from tqdm import tqdm
import itertools

import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns

import warnings
from pathlib import Path

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import re

In [None]:
# pandas display options.
# The fully-qualified option name is required: on pandas >= 1.4 the bare
# 'max_columns' pattern also matches 'styler.render.max_columns' and
# set_option raises OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_columns', 100)
pd.set_option('display.precision', 4)

# Plot defaults: large figures, fivethirtyeight style, enlarged seaborn fonts.
rcParams['figure.figsize'] = (16, 8)
plt.style.use('fivethirtyeight')
sns.set(font_scale=2.5)

# Silence all warnings for the rest of the session (this also hides
# deprecation notices from the libraries used below).
warnings.filterwarnings('ignore')

In [None]:
# Google Colab setup: mount Google Drive and define the project paths.

from google.colab import drive
drive.mount('/content/drive')

# Single project root so the Drive location only has to be changed in one place.
project_dir = Path('/content/drive/My Drive/kaggle/Dacon-Novel-author-classification-AI')
build_dir = project_dir / 'build'

data_dir = project_dir / 'data' / 'open'
metric_dir = build_dir / 'metric'
model_dir = build_dir / 'model'
feature_dir = build_dir / 'feature'
val_dir = build_dir / 'val'
tst_dir = build_dir / 'tst'
sub_dir = build_dir / 'sub'

# The submission CSV is written into sub_dir at the end of the notebook;
# create it up front so that write cannot fail on a missing directory.
sub_dir.mkdir(parents=True, exist_ok=True)

trn_file = data_dir / 'train.csv'
tst_file = data_dir / 'test_x.csv'
sample_file = data_dir / 'sample_submission.csv'
submission_file = sub_dir / 'submission.csv'

In [None]:
# Load the training set, the test set and the submission template (all UTF-8 CSVs).
trn, tst, sample_submission = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in (trn_file, tst_file, sample_file)
)

In [None]:
# Display the training frame as loaded from train.csv.
trn

In [None]:
# Display the test frame as loaded from test_x.csv.
tst

In [None]:
# Display the submission template as loaded from sample_submission.csv.
sample_submission

In [None]:
# Matches every character that is not an ASCII letter, a digit or a space.
_NON_ALNUM_SPACE = re.compile(r'[^A-Za-z0-9 ]')


def alpha_num(text):
    """Return `text` with everything except ASCII letters, digits and spaces removed.

    Removed characters are deleted outright (not replaced by a space), so
    "he'd" becomes "hed" and "a-b" becomes "ab".
    """
    return _NON_ALNUM_SPACE.sub('', text)

# Strip non-alphanumeric characters from every training text (in place on `trn`).
trn['text'] = trn['text'].map(alpha_num)

In [None]:
# Display the training frame after alpha_num has been applied to its 'text' column.
trn

In [None]:
# Helper that removes stopwords from a text
def remove_stopwords(text):
    """Return `text` with every stopword token dropped.

    The text is split on whitespace. Each token is lower-cased for the
    comparison only, so surviving tokens keep their original casing. The
    remaining tokens are re-joined with single spaces.

    NOTE: entries containing an apostrophe (e.g. "he'd") can only match while
    the apostrophe is still present in `text`, i.e. not once `alpha_num`
    (which deletes apostrophes) has been applied.
    """
    # str.split() already yields whitespace-free tokens, so no strip() is needed.
    return " ".join(tok for tok in text.split() if tok.lower() not in _STOPWORD_SET)

# Stopwords
stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as",
             "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could",
             "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has",
             "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself",
             "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself",
             "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours",
             "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that",
             "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll",
             "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll",
             "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom",
             "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]

# Hashed copy of `stopwords` so each token lookup is O(1) instead of a list scan.
# Rebuild it if `stopwords` is edited afterwards.
_STOPWORD_SET = frozenset(stopwords)

In [None]:
# Normalise both splits identically: lower-case, keep only [A-Za-z0-9 ],
# then drop stopwords. The 'text' column of each frame is overwritten in place.
for frame in (trn, tst):
    lowered = frame['text'].str.lower()
    frame['text'] = lowered.apply(alpha_num).apply(remove_stopwords)

In [None]:
# Training texts and author labels as NumPy arrays.
X = np.array(trn['text'].tolist())
y = np.array(trn['author'].tolist())

# `tst` is rebound from the test DataFrame to an array of its texts (later
# cells expect the array under this name). Guard the conversion so re-running
# this cell does not fail on indexing an ndarray with 'text'.
if isinstance(tst, pd.DataFrame):
    tst = np.array(tst['text'].tolist())

In [None]:
# Text-model hyperparameters.
vocab_size = 20000       # passed as Tokenizer num_words and as the Embedding input size
embedding_dim = 16       # Embedding output size
max_length = 500         # pad_sequences maxlen and Embedding input_length
padding_type = 'post'    # pad_sequences padding mode

In [None]:
# Build the tokenizer from the training texts only (the test texts are not used for fitting).
tokenizer = Tokenizer(num_words = vocab_size)
tokenizer.fit_on_texts(X)
# Keep a handle on the tokenizer's word_index attribute for inspection.
word_index = tokenizer.word_index

In [None]:
# Encode train and test texts with the fitted tokenizer ...
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (X, tst)
)

# ... then pad both sets of sequences to the same fixed length.
train_padded, test_padded = (
    pad_sequences(seqs, padding=padding_type, maxlen=max_length)
    for seqs in (train_sequences, test_sequences)
)

In [None]:
# Averaged-embedding classifier, assembled layer by layer.
model = tf.keras.Sequential()
# Token ids -> embedding vectors of size embedding_dim.
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
# Average the embeddings over the sequence axis.
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(24, activation='relu'))
# Softmax over the 5 output classes.
model.add(tf.keras.layers.Dense(5, activation='softmax'))

In [None]:
# Compile the model. The sparse categorical loss takes the integer labels in
# `y` directly, without one-hot encoding.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Model summary. summary() prints the table itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None").
model.summary()

In [None]:
# Train the model, holding out 20% of the training data for validation.
num_epochs = 20
fit_options = dict(epochs=num_epochs, verbose=2, validation_split=0.2)
history = model.fit(train_padded, y, **fit_options)

In [None]:
# Predict class probabilities for the test set.
# `Sequential.predict_proba` was removed in TensorFlow 2.6; because the last
# layer is a softmax, `predict` already returns the per-class probabilities.
pred = model.predict(test_padded)

In [None]:
# Display the model's predictions for the test set.
pred

In [None]:
# Fill the five class columns of the submission template with the predictions.
class_columns = ['0', '1', '2', '3', '4']
sample_submission[class_columns] = pred
sample_submission

In [None]:
# Write the filled submission to submission_file as UTF-8 CSV, without the index column.
sample_submission.to_csv(submission_file, index = False, encoding = 'utf-8')