In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, GlobalAveragePooling1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout
import re
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
import os
import easydict

Using TensorFlow backend.


In [2]:
# Report the GPU device name TensorFlow returns, followed by the installed TensorFlow version.
# NOTE(review): the recorded output shows an empty device line, which suggests no GPU was found.
device = tf.test.gpu_device_name()
print(device, tf.__version__, sep='\n')


2.0.0


In [3]:
# Load the training set, the unlabeled test set and the submission template.
train, test, sample_submission = [
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in ('open/train.csv', 'open/test_x.csv', 'open/sample_submission.csv')
]

In [4]:
# Drop the redundant 'index' column from the training frame.
# The frame is re-bound instead of mutated in place, and errors='ignore' tolerates the column
# already being gone, so re-running this cell no longer raises KeyError.
train = train.drop(columns='index', errors='ignore')
train.head(2)

Unnamed: 0,text,author
0,"He was almost choking. There was so much, so m...",3
1,"“Your sister asked for it, I suppose?”",2


In [5]:
# Drop the redundant 'index' column from the test frame.
# The frame is re-bound instead of mutated in place, and errors='ignore' tolerates the column
# already being gone, so re-running this cell no longer raises KeyError.
test = test.drop(columns='index', errors='ignore')
test.head(2)

Unnamed: 0,text
0,“Not at all. I think she is one of the most ch...
1,"""No,"" replied he, with sudden consciousness, ""..."


In [6]:
# Preview the first two rows of the submission template.
sample_submission.iloc[:2]

Unnamed: 0,index,0,1,2,3,4
0,0,0,0,0,0,0
1,1,0,0,0,0,0


# 전처리

In [7]:
# Stop words
# Hand-maintained English stop-word list, written as whitespace-separated words.
# NOTE(review): the preprocessing cell runs alpha_num (which deletes apostrophes) before
# remove_stopwords, so contraction entries such as "he'd" can never match the cleaned text.
basic_stopwords = set("""
    a about above after again against all am an and any are as
    at be because been before being below between both but by could
    did do does doing down during each few for from further had has
    have having he he'd he'll he's her here here's hers herself him himself
    his how how's i i'd i'll i'm i've if in into is it it's its itself
    let's me more most my myself nor of on once only or other ought our ours
    ourselves out over own same she she'd she'll she's should so some such than that
    that's the their theirs them themselves then there there's these they they'd they'll
    they're they've this those through to too under until up very was we we'd we'll
    we're we've were what what's when when's where where's which while who who's whom
    why why's with would you you'd you'll you're you've your yours yourself yourselves
""".split())

# NLTK's English stop-word corpus (fetched on first use).
nltk.download('stopwords')
nltk_stopwords = set(stopwords.words('english'))

# Final list = union of both sources.
final_stopwords = basic_stopwords | nltk_stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\weroo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# Strip punctuation and other symbols from a text
def alpha_num(text):
    """Return ``text`` with everything except ASCII letters, digits and spaces removed.

    Removed characters are deleted rather than replaced by a space, so the parts on
    either side of a hyphen or apostrophe are joined ("well-wisher" -> "wellwisher").
    """
    disallowed = re.compile(r'[^A-Za-z0-9 ]')
    return disallowed.sub('', text)

# Remove stop words from a text
def remove_stopwords(text: str, stop_words=None) -> str:
    """Return ``text`` without its stop words, remaining words joined by single spaces.

    Words are obtained by splitting on whitespace and are compared case-insensitively
    (each word is lower-cased for the lookup); surviving words keep their original case.

    Args:
        text: The text to filter.
        stop_words: Optional collection of lower-case stop words. When omitted, the
            notebook-level ``final_stopwords`` set is used, which is the original behavior.

    Returns:
        The filtered text; an empty string when no word survives.
    """
    if stop_words is None:
        stop_words = final_stopwords
    # str.split() already discards surrounding whitespace, so no per-word strip() is needed.
    return " ".join(word for word in text.split() if word.lower() not in stop_words)
    
# Apply the preprocessing to both frames: strip symbols -> lower-case -> drop stop words.
# alpha_num used to run a second time after lower-casing; its output contains only ASCII
# letters, digits and spaces, so that second pass was a no-op and has been removed.
# NOTE(review): because apostrophes are stripped before the stop-word lookup, contractions
# ("don't" -> "dont") are not recognised as stop words.
for frame in (train, test):
    frame['text'] = (
        frame['text']
        .apply(alpha_num)
        .str.lower()
        .apply(remove_stopwords)
    )

In [9]:
# Pull the cleaned texts and the author labels out of the DataFrames as NumPy arrays.
# (No splitting happens here; the hold-out split is done later, after tokenizing and padding.)
X_train, X_test = (frame['text'].values for frame in (train, test))
y_train = train['author'].values

In [10]:
# Inspect the cleaned training texts.
X_train

array(['almost choking much much wanted say strange exclamations came lips pole gazed fixedly bundle notes hand looked odin evident perplexity',
       'sister asked suppose',
       'engaged one day walked perusing janes last letter dwelling passages proved jane written spirits instead surprised mr odin saw looking odin meeting putting away letter immediately forcing smile said',
       ..., 'sincere wellwisher friend sister lucy odin',
       'wanted lend money', 'certainly occurred said yes like'],
      dtype=object)

In [11]:
# Number of training texts.
X_train.shape[0]

54879

# 토크나이징

In [12]:
# Fit a word-level tokenizer on the cleaned training texts.
# num_words caps how many of the most frequent words keep their own id when texts are
# converted; other words are mapped to the "<OOV>" token.
# The backslash in `filters` is written as '\\' because '\]' is an invalid escape sequence
# (DeprecationWarning on older Pythons, SyntaxWarning from 3.12). The resulting string is
# character-for-character the same as before.
tokenizer = Tokenizer(
    num_words=30000,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~',
    lower=True,
    oov_token="<OOV>",
)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 47118 unique tokens.


In [13]:
# Convert every training text into its list of integer word ids, then print the first ten
# encoded sequences followed by the ten texts they were produced from.
X_token = tokenizer.texts_to_sequences(X_train)
for preview in (X_token, X_train):
    print(preview[:10])

[[135, 7244, 17, 17, 310, 18, 223, 3058, 32, 446, 2725, 1195, 5011, 1694, 1200, 41, 53, 2, 1207, 2818], [211, 54, 215], [675, 4, 55, 244, 13184, 4732, 34, 188, 4528, 4733, 1225, 479, 618, 493, 692, 495, 5, 2, 73, 101, 2, 664, 855, 46, 188, 418, 5310, 283, 3], [245, 5012, 867, 1230, 39, 8427, 823, 1497, 115, 248, 20, 1140, 610, 5311, 496, 2, 61, 1728, 129, 393, 2158, 1782, 2115, 1619, 610, 100, 3746, 6404, 1894, 131, 1783], [1374, 340, 2, 1548, 100, 27, 482, 1471, 968, 205, 2031, 98, 6651, 4000, 884, 16684, 6964, 9539, 77, 194], [9, 3410, 3, 3167, 1231, 20, 710], [655, 425, 1549, 388, 238, 258, 6405, 303, 70, 10], [1089, 595, 7, 25535, 3, 11, 937, 7, 6142, 68, 3344, 1761, 7, 12, 35, 651, 25536, 996, 696, 6, 1498, 297, 5149, 11983, 6965, 55, 737, 3002, 28, 80, 968, 16685, 31, 332, 25537, 5907, 1679], [34, 1968, 87, 3], [15, 170, 23, 849, 24, 1030, 5312, 268, 1916, 156, 79, 2013, 79, 2772, 79, 19831, 1333]]
['almost choking much much wanted say strange exclamations came lips pole gazed fi

In [14]:
# Bring every training sequence to exactly 500 ids: shorter ones are padded at the end,
# longer ones are cut at the end.
pad_options = dict(maxlen=500, padding='post', truncating='post')
X_token = pad_sequences(X_token, **pad_options)
X_token.shape

(54879, 500)

In [15]:
# Encode the test texts with the tokenizer fitted on the training texts and pad/cut them
# to the same fixed length of 500 used for the training sequences.
X_token_test = pad_sequences(
    tokenizer.texts_to_sequences(X_test),
    maxlen=500,
    padding='post',
    truncating='post',
)
X_token_test.shape

(19617, 500)

In [16]:
# One-hot encode the author labels (one column per distinct author id).
# The encoding is computed from train['author'] instead of overwriting y_train with a
# function of itself: the old form failed when the cell was re-run, because y_train was
# by then already a 2-D matrix. dtype is pinned to uint8 so the result does not depend on
# the pandas version (newer releases default get_dummies to bool).
y_train = pd.get_dummies(train['author'], dtype=np.uint8).values
y_train

array([[0, 0, 0, 1, 0],
       [0, 0, 1, 0, 0],
       [0, 1, 0, 0, 0],
       ...,
       [0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0]], dtype=uint8)

In [17]:
# Hold out 10% of the padded training sequences (fixed seed) for a final evaluation.
# NOTE(review): X_train / X_test are rebound here from raw text arrays to padded id matrices,
# so the tokenizer cells above cannot be re-run after this cell without restarting from the
# array-extraction cell.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_token,
    y_train,
    test_size=0.1,
    random_state=42,
)
for features, labels in ((X_train, Y_train), (X_test, Y_test)):
    print(features.shape, labels.shape)

(49391, 500) (49391, 5)
(5488, 500) (5488, 5)


In [18]:
# Embedding -> LSTM -> Dense classifier with a 5-way softmax output.
model = Sequential()
# mask_zero=True marks id 0 as padding. The sequences are post-padded to length 500, so
# without masking the LSTM's final state is computed after a long run of padding steps.
# With the mask, padded timesteps are skipped. The parameter count is unchanged.
model.add(Embedding(30000, 20, input_length=500, mask_zero=True))
model.add(SpatialDropout1D(0.1))
model.add(LSTM(20, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(16, activation='relu'))
model.add(Dense(5, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print() added a
# stray "None" line to the output.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 20)           600000    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 500, 20)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 20)                3280      
_________________________________________________________________
dense_1 (Dense)              (None, 16)                336       
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 85        
Total params: 603,701
Trainable params: 603,701
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# Train for 20 epochs, with 10% of the training rows set aside by Keras for validation.
# Early stopping is currently disabled; to enable it, pass
# callbacks=[EarlyStopping(monitor='val_loss', patience=2, min_delta=0.0001)] to fit().
fit_options = dict(epochs=20, validation_split=0.1)
history = model.fit(X_train, Y_train, **fit_options)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 44451 samples, validate on 4940 samples
Epoch 1/20
Epoch 2/20

In [None]:
# Score the model on the hold-out rows; the first two returned values are reported as
# loss and accuracy.
accr = model.evaluate(X_test, Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(*accr[:2]))

In [None]:
# Training vs. validation loss per epoch.
# The second curve comes from fit()'s validation_split, not from the separate hold-out
# set, so it is labelled 'validation' rather than 'test'.
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='train')
ax.plot(history.history['val_loss'], label='validation')
ax.set(title='Loss', xlabel='epoch', ylabel='loss')
ax.legend()
plt.show()

In [None]:
# Training vs. validation accuracy per epoch.
# The second curve comes from fit()'s validation_split, not from the separate hold-out
# set, so it is labelled 'validation' rather than 'test'.
fig, ax = plt.subplots()
ax.plot(history.history['accuracy'], label='train')
ax.plot(history.history['val_accuracy'], label='validation')
ax.set(title='Accuracy', xlabel='epoch', ylabel='accuracy')
ax.legend()
plt.show()

In [None]:
# Class probabilities for the competition test texts.
# Sequential.predict_proba is deprecated and absent from newer TensorFlow/Keras releases;
# predict() returns the same softmax outputs for this model.
pred = model.predict(X_token_test)

# submission
# Fill the five class-probability columns of the template, then show a preview only
# instead of dumping the whole frame.
sample_submission[['0','1','2','3','4']] = pred
sample_submission.head()

In [None]:
# Write the filled-in submission table to disk without the pandas row index.
submission_path = 'submission_temp.csv'
sample_submission.to_csv(submission_path, index=False, encoding='utf-8')