In [1]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import pickle
from time import time
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Bidirectional, LSTM, Dropout

from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

from sklearn.model_selection import KFold

In [3]:
def tokenizer(t):
    return t.split()
def tokenizer_porter(t):
    return [porter.stem(w) for w in t.split()]

In [8]:
trn = pd.read_csv('./train.csv')
trn

Unnamed: 0,index,text,author
0,0,"He was almost choking. There was so much, so m...",3
1,1,"“Your sister asked for it, I suppose?”",2
2,2,"She was engaged one day as she walked, in per...",1
3,3,"The captain was in the porch, keeping himself ...",4
4,4,"“Have mercy, gentlemen!” odin flung up his han...",3
...,...,...,...
54874,54874,"“Is that you, Mr. Smith?” odin whispered. “I h...",2
54875,54875,"I told my plan to the captain, and between us ...",4
54876,54876,"""Your sincere well-wisher, friend, and sister...",1
54877,54877,“Then you wanted me to lend you money?”,3


In [14]:
trn2 = pd.read_csv('./dlt_stopwords.csv')
trn2

Unnamed: 0,index,text,author
0,0,"almost choking. much, much wanted say, strange...",3
1,1,"“your sister asked it, suppose?”",2
2,2,"engaged one day walked, perusing jane’s last l...",1
3,3,"captain porch, keeping carefully way treachero...",4
4,4,"“have mercy, gentlemen!” odin flung hands. “do...",3
...,...,...,...
54874,54874,"“is you, mr. smith?” odin whispered. “i hardly...",2
54875,54875,"told plan captain, us settled details accompli...",4
54876,54876,"""your sincere well-wisher, friend, sister, ""lu...",1
54877,54877,“then wanted lend money?”,3


In [3]:
tst = pd.read_csv('./dlt_all_tst.csv')
tst

Unnamed: 0,index,text,stopwords
0,0,“Not at all. I think she is one of the most ch...,“Not all. think one charming young ladies ever...
1,1,"""No,"" replied he, with sudden consciousness, ""...","""No,"" replied he, sudden consciousness, ""not f..."
2,2,As the lady had stated her intention of scream...,"lady stated intention screaming, course would ..."
3,3,“And then suddenly in the silence I heard a so...,“And suddenly silence heard sound sent heart m...
4,4,His conviction remained unchanged. So far as I...,conviction remained unchanged. far know--and b...
...,...,...,...
19612,19612,"At the end of another day or two, odin growing...","end another day two, odin growing visibly stro..."
19613,19613,"All afternoon we sat together, mostly in silen...","afternoon sat together, mostly silence, watchi..."
19614,19614,"odin, having carried his thanks to odin, proc...","odin, carried thanks odin, proceeded happiness..."
19615,19615,"Soon after this, upon odin's leaving the room,...","Soon this, upon odin's leaving room, ""Mama,"" s..."


In [15]:
temp_trn = pd.DataFrame(trn['text'])
temp_trn

Unnamed: 0,text
0,"He was almost choking. There was so much, so m..."
1,"“Your sister asked for it, I suppose?”"
2,"She was engaged one day as she walked, in per..."
3,"The captain was in the porch, keeping himself ..."
4,"“Have mercy, gentlemen!” odin flung up his han..."
...,...
54874,"“Is that you, Mr. Smith?” odin whispered. “I h..."
54875,"I told my plan to the captain, and between us ..."
54876,"""Your sincere well-wisher, friend, and sister..."
54877,“Then you wanted me to lend you money?”


In [16]:
temp_trn2 = pd.DataFrame(trn2['text'])

In [6]:
temp_trn2.rename(columns = {'stopwords' : 'text'}, inplace = True)
temp_trn2


Unnamed: 0,text
0,"almost choking. much, much wanted say, strange..."
1,"“Your sister asked it, suppose?”"
2,"engaged one day walked, perusing Jane’s last l..."
3,"captain porch, keeping carefully way treachero..."
4,"“Have mercy, gentlemen!” odin flung hands. “Do..."
...,...
54874,"“Is you, Mr. Smith?” odin whispered. “I hardly..."
54875,"told plan captain, us settled details accompli..."
54876,"""Your sincere well-wisher, friend, sister, ""LU..."
54877,“Then wanted lend money?”


In [112]:
temp_trn3 = pd.DataFrame(trn2['text'])
temp_trn3

Unnamed: 0,text
0,"almost choking. much, much wanted say, strange..."
1,"“your sister asked it, suppose?”"
2,"engaged one day walked, perusing jane’s last l..."
3,"captain porch, keeping carefully way treachero..."
4,"“have mercy, gentlemen!” flung hands. “don’t w..."
...,...
54874,"“is you, mr. smith?” whispered. “i hardly dare..."
54875,"told plan captain, us settled details accompli..."
54876,"""your sincere well-wisher, friend, sister, ""lu..."
54877,“then wanted lend money?”


In [7]:
temp_author = pd.DataFrame(trn['author'])

In [17]:
final_trn = pd.concat([temp_trn, temp_trn2], ignore_index = True)

In [18]:
final_trn

Unnamed: 0,text
0,"He was almost choking. There was so much, so m..."
1,"“Your sister asked for it, I suppose?”"
2,"She was engaged one day as she walked, in per..."
3,"The captain was in the porch, keeping himself ..."
4,"“Have mercy, gentlemen!” odin flung up his han..."
...,...
109753,"“is you, mr. smith?” odin whispered. “i hardly..."
109754,"told plan captain, us settled details accompli..."
109755,"""your sincere well-wisher, friend, sister, ""lu..."
109756,“then wanted lend money?”


In [19]:
final_trn.to_csv('mydata.csv', index = False, encoding = 'utf-8')

In [113]:
final_trn2 = pd.concat([final_trn, temp_trn3], ignore_index = True)
final_trn2

Unnamed: 0,text
0,"He was almost choking. There was so much, so m..."
1,"“Your sister asked for it, I suppose?”"
2,"She was engaged one day as she walked, in per..."
3,"The captain was in the porch, keeping himself ..."
4,"“Have mercy, gentlemen!” odin flung up his han..."
...,...
164632,"“is you, mr. smith?” whispered. “i hardly dare..."
164633,"told plan captain, us settled details accompli..."
164634,"""your sincere well-wisher, friend, sister, ""lu..."
164635,“then wanted lend money?”


In [115]:
final_author = pd.concat([final_author, temp_author], ignore_index = True)
final_author

Unnamed: 0,author
0,3
1,2
2,1
3,4
4,3
...,...
164632,2
164633,4
164634,1
164635,3


In [10]:
final_author.tail()

Unnamed: 0,author
109753,2
109754,4
109755,1
109756,3
109757,0


In [68]:
temp_tst1 = pd.DataFrame(tst['text'])

In [71]:
temp_tst2 = pd.DataFrame(tst['stopwords'])
temp_tst2.rename(columns = {'stopwords' : 'text'}, inplace = True)

In [72]:
final_tst = pd.concat([temp_tst1, temp_tst2], ignore_index = True)
final_tst

Unnamed: 0,text
0,“Not at all. I think she is one of the most ch...
1,"""No,"" replied he, with sudden consciousness, ""..."
2,As the lady had stated her intention of scream...
3,“And then suddenly in the silence I heard a so...
4,His conviction remained unchanged. So far as I...
...,...
39229,"end another day two, odin growing visibly stro..."
39230,"afternoon sat together, mostly silence, watchi..."
39231,"odin, carried thanks odin, proceeded happiness..."
39232,"Soon this, upon odin's leaving room, ""Mama,"" s..."


In [124]:
tst

Unnamed: 0,index,text,stopwords
0,0,“Not at all. I think she is one of the most ch...,“Not all. think one charming young ladies ever...
1,1,"""No,"" replied he, with sudden consciousness, ""...","""No,"" replied he, sudden consciousness, ""not f..."
2,2,As the lady had stated her intention of scream...,"lady stated intention screaming, course would ..."
3,3,“And then suddenly in the silence I heard a so...,“And suddenly silence heard sound sent heart m...
4,4,His conviction remained unchanged. So far as I...,conviction remained unchanged. far know--and b...
...,...,...,...
19612,19612,"At the end of another day or two, odin growing...","end another day two, odin growing visibly stro..."
19613,19613,"All afternoon we sat together, mostly in silen...","afternoon sat together, mostly silence, watchi..."
19614,19614,"odin, having carried his thanks to odin, proc...","odin, carried thanks odin, proceeded happiness..."
19615,19615,"Soon after this, upon odin's leaving the room,...","Soon this, upon odin's leaving room, ""Mama,"" s..."


In [12]:
temp_y = trn['author']

In [13]:
temp_trn.drop(['author'], axis = 1, inplace = True)
temp_trn

Unnamed: 0,index,text,stopwords
0,0,"He was almost choking. There was so much, so m...","almost choking. much, much wanted say, strange..."
1,1,"“Your sister asked for it, I suppose?”","“Your sister asked it, suppose?”"
2,2,"She was engaged one day as she walked, in per...","engaged one day walked, perusing Jane’s last l..."
3,3,"The captain was in the porch, keeping himself ...","captain porch, keeping carefully way treachero..."
4,4,"“Have mercy, gentlemen!” odin flung up his han...","“Have mercy, gentlemen!” odin flung hands. “Do..."
...,...,...,...
54874,54874,"“Is that you, Mr. Smith?” odin whispered. “I h...","“Is you, Mr. Smith?” odin whispered. “I hardly..."
54875,54875,"I told my plan to the captain, and between us ...","told plan captain, us settled details accompli..."
54876,54876,"""Your sincere well-wisher, friend, and sister...","""Your sincere well-wisher, friend, sister, ""LU..."
54877,54877,“Then you wanted me to lend you money?”,“Then wanted lend money?”


In [122]:
final_trn2

Unnamed: 0,text
0,"He was almost choking. There was so much, so m..."
1,"“Your sister asked for it, I suppose?”"
2,"She was engaged one day as she walked, in per..."
3,"The captain was in the porch, keeping himself ..."
4,"“Have mercy, gentlemen!” odin flung up his han..."
...,...
164632,"“is you, mr. smith?” whispered. “i hardly dare..."
164633,"told plan captain, us settled details accompli..."
164634,"""your sincere well-wisher, friend, sister, ""lu..."
164635,“then wanted lend money?”


In [15]:
temp_trn.drop(['index'], axis = 1, inplace = True)

In [116]:
X_train = np.array([x for x in final_trn2['text']])
X_test = np.array([x for x in tst['text']])
y_train = np.array([x for x in final_author['author']])

In [74]:
X_train

array(['He was almost choking. There was so much, so much he wanted to say, but strange exclamations were all that came from his lips. The Pole gazed fixedly at him, at the bundle of notes in his hand; looked at odin, and was in evident perplexity.',
       '“Your sister asked for it, I suppose?”',
       ' She was engaged one day as she walked, in perusing Jane’s last letter, and dwelling on some passages which proved that Jane had not written in spirits, when, instead of being again surprised by Mr. odin, she saw on looking up that odin was meeting her. Putting away the letter immediately and forcing a smile, she said:',
       ..., '"Your sincere well-wisher, friend, sister, "LUCY odin.',
       '“Then wanted lend money?”',
       'certainly occurred before, said, Yes, like that.'], dtype='<U2500')

In [123]:
#파라미터 설정
vocab_size = 25000
embedding_dim = 8
max_length = 1000
padding_type='post'
#oov_tok = "<OOV>"

#tokenizer에 fit
tokenizer = Tokenizer(num_words = vocab_size)#, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

#데이터를 sequence로 변환해주고 padding 해줍니다.
train_sequences = tokenizer.texts_to_sequences(X_train)
train_padded = pad_sequences(train_sequences, padding=padding_type, maxlen=max_length)

test_sequences = tokenizer.texts_to_sequences(X_test)
test_padded = pad_sequences(test_sequences, padding=padding_type, maxlen=max_length)

In [30]:
help(tokenizer.texts_to_sequences)

Help on method texts_to_sequences in module keras_preprocessing.text:

texts_to_sequences(texts) method of keras_preprocessing.text.Tokenizer instance
    Transforms each text in texts to a sequence of integers.
    
    Only top `num_words-1` most frequent words will be taken into account.
    Only words known by the tokenizer will be taken into account.
    
    # Arguments
        texts: A list of texts (strings).
    
    # Returns
        A list of sequences.



In [32]:
tokenizer = Tokenizer(num_words = vocab_size, filters=None)#, oov_token=oov_tok)

In [21]:
help(tokenizer)

Help on Tokenizer in module keras_preprocessing.text object:

class Tokenizer(builtins.object)
 |  Tokenizer(num_words=None, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ', char_level=False, oov_token=None, document_count=0, **kwargs)
 |  
 |  Text tokenization utility class.
 |  
 |  This class allows to vectorize a text corpus, by turning each
 |  text into either a sequence of integers (each integer being the index
 |  of a token in a dictionary) or into a vector where the coefficient
 |  for each token could be binary, based on word count, based on tf-idf...
 |  
 |  # Arguments
 |      num_words: the maximum number of words to keep, based
 |          on word frequency. Only the most common `num_words-1` words will
 |          be kept.
 |      filters: a string where each element is a character that will be
 |          filtered from the texts. The default is all punctuation, plus
 |          tabs and line breaks, minus the `'` character.
 |      lower: boolea

In [125]:
#가벼운 NLP모델 생성
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    Bidirectional(LSTM(8, dropout=0.5)),
#     tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.Dense(5, activation='softmax')
])

# compile model
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# model summary
print(model.summary())

Model: "sequential_15"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_15 (Embedding)     (None, 1000, 8)           200000    
_________________________________________________________________
bidirectional_15 (Bidirectio (None, 16)                1088      
_________________________________________________________________
dense_30 (Dense)             (None, 8)                 136       
_________________________________________________________________
dense_31 (Dense)             (None, 5)                 45        
Total params: 201,269
Trainable params: 201,269
Non-trainable params: 0
_________________________________________________________________
None


In [126]:
es = EarlyStopping(monitor='val_loss', mode='min', verbose=2, patience=4)
mc = ModelCheckpoint('best_model.h5', monitor='val_accuracy', mode='max', verbose=2, save_best_only=True)

In [127]:
kf = KFold(5, True)



In [128]:
acc_per_fold = []
loss_per_fold = []
models = []
for train_index, test_index in kf.split(train_padded):
    # fit model
    num_epochs = 50
    history = model.fit(train_padded[train_index], y_train[train_index], 
                        epochs=num_epochs,callbacks = [es, mc], verbose=2, 
                        validation_split=0.2)

    scores = model.evaluate(train_padded[test_index], y_train[test_index])

    acc_per_fold.append(scores[1])
    loss_per_fold.append(scores[0])
    models.append(model)

Epoch 1/50

Epoch 00001: val_accuracy improved from -inf to 0.59232, saving model to best_model.h5
3293/3293 - 784s - loss: 1.3100 - accuracy: 0.4414 - val_loss: 1.0066 - val_accuracy: 0.5923
Epoch 2/50

Epoch 00002: val_accuracy improved from 0.59232 to 0.75704, saving model to best_model.h5
3293/3293 - 777s - loss: 0.9014 - accuracy: 0.6479 - val_loss: 0.6741 - val_accuracy: 0.7570
Epoch 3/50

Epoch 00003: val_accuracy improved from 0.75704 to 0.79891, saving model to best_model.h5
3293/3293 - 766s - loss: 0.7154 - accuracy: 0.7302 - val_loss: 0.5667 - val_accuracy: 0.7989
Epoch 4/50

Epoch 00004: val_accuracy improved from 0.79891 to 0.82530, saving model to best_model.h5
3293/3293 - 764s - loss: 0.6255 - accuracy: 0.7681 - val_loss: 0.4900 - val_accuracy: 0.8253
Epoch 5/50

Epoch 00005: val_accuracy improved from 0.82530 to 0.82928, saving model to best_model.h5
3293/3293 - 764s - loss: 0.5772 - accuracy: 0.7862 - val_loss: 0.4721 - val_accuracy: 0.8293
Epoch 6/50

Epoch 00006: val

Epoch 46/50

Epoch 00046: val_accuracy did not improve from 0.90354
3293/3293 - 697s - loss: 0.3498 - accuracy: 0.8695 - val_loss: 0.2738 - val_accuracy: 0.9029
Epoch 47/50

Epoch 00047: val_accuracy did not improve from 0.90354
3293/3293 - 701s - loss: 0.3494 - accuracy: 0.8695 - val_loss: 0.2788 - val_accuracy: 0.9005
Epoch 48/50

Epoch 00048: val_accuracy did not improve from 0.90354
3293/3293 - 709s - loss: 0.3468 - accuracy: 0.8702 - val_loss: 0.2831 - val_accuracy: 0.8981
Epoch 49/50

Epoch 00049: val_accuracy did not improve from 0.90354
3293/3293 - 687s - loss: 0.3439 - accuracy: 0.8719 - val_loss: 0.2795 - val_accuracy: 0.9005
Epoch 00049: early stopping
Epoch 1/50

Epoch 00001: val_accuracy did not improve from 0.90354
3293/3293 - 685s - loss: 0.3741 - accuracy: 0.8619 - val_loss: 0.2726 - val_accuracy: 0.9016
Epoch 2/50

Epoch 00002: val_accuracy did not improve from 0.90354
3293/3293 - 684s - loss: 0.3637 - accuracy: 0.8648 - val_loss: 0.2730 - val_accuracy: 0.9020
Epoch 3/

In [129]:
models

[<tensorflow.python.keras.engine.sequential.Sequential at 0x7fe8189ed790>,
 <tensorflow.python.keras.engine.sequential.Sequential at 0x7fe8189ed790>,
 <tensorflow.python.keras.engine.sequential.Sequential at 0x7fe8189ed790>,
 <tensorflow.python.keras.engine.sequential.Sequential at 0x7fe8189ed790>,
 <tensorflow.python.keras.engine.sequential.Sequential at 0x7fe8189ed790>]

In [81]:
loss_per_fold

[0.4155585765838623,
 0.20002619922161102,
 0.13394342362880707,
 0.10408391803503036,
 0.07673414796590805]

In [48]:
help(model.fit)

Help on method fit in module tensorflow.python.keras.engine.training:

fit(x=None, y=None, batch_size=None, epochs=1, verbose=1, callbacks=None, validation_split=0.0, validation_data=None, shuffle=True, class_weight=None, sample_weight=None, initial_epoch=0, steps_per_epoch=None, validation_steps=None, validation_batch_size=None, validation_freq=1, max_queue_size=10, workers=1, use_multiprocessing=False) method of tensorflow.python.keras.engine.sequential.Sequential instance
    Trains the model for a fixed number of epochs (iterations on a dataset).
    
    Arguments:
        x: Input data. It could be:
          - A Numpy array (or array-like), or a list of arrays
            (in case the model has multiple inputs).
          - A TensorFlow tensor, or a list of tensors
            (in case the model has multiple inputs).
          - A dict mapping input names to the corresponding array/tensors,
            if the model has named inputs.
          - A `tf.data` dataset. Should return

In [40]:
# fit model
num_epochs = 20
history = model.fit(train_padded, y_train, 
                    epochs=num_epochs,callbacks = [es, mc], verbose=2, 
                    validation_split=0.2)

Epoch 1/20

Epoch 00001: val_accuracy improved from -inf to 0.72681, saving model to best_model.h5
2744/2744 - 347s - loss: 1.0989 - accuracy: 0.5566 - val_loss: 0.7413 - val_accuracy: 0.7268
Epoch 2/20

Epoch 00002: val_accuracy improved from 0.72681 to 0.78690, saving model to best_model.h5
2744/2744 - 341s - loss: 0.7339 - accuracy: 0.7228 - val_loss: 0.5913 - val_accuracy: 0.7869
Epoch 3/20

Epoch 00003: val_accuracy improved from 0.78690 to 0.81683, saving model to best_model.h5
2744/2744 - 358s - loss: 0.6211 - accuracy: 0.7688 - val_loss: 0.5161 - val_accuracy: 0.8168
Epoch 4/20

Epoch 00004: val_accuracy improved from 0.81683 to 0.83113, saving model to best_model.h5
2744/2744 - 363s - loss: 0.5636 - accuracy: 0.7930 - val_loss: 0.4709 - val_accuracy: 0.8311
Epoch 5/20

Epoch 00005: val_accuracy improved from 0.83113 to 0.84156, saving model to best_model.h5
2744/2744 - 343s - loss: 0.5244 - accuracy: 0.8082 - val_loss: 0.4409 - val_accuracy: 0.8416
Epoch 6/20

Epoch 00006: val

In [104]:
X_test = np.array([x for x in tst['stopwords']])

In [105]:
test_sequences = tokenizer.texts_to_sequences(X_test)
test_padded = pad_sequences(test_sequences, padding=padding_type, maxlen=max_length)

In [130]:
# predict values
pred = models[0].predict_proba(test_padded)

In [131]:
sub = pd.read_csv('./sample_submission.csv', encoding = 'utf-8')

In [132]:
# submission
sub[['0','1','2','3','4']] = pred
sub.to_csv('muchdata.csv', index = False, encoding = 'utf-8')

In [110]:
test_padded

array([[ 898,   36,   78, ...,    0,    0,    0],
       [  48,  195,   12, ...,    0,    0,    0],
       [ 202, 3752, 1440, ...,    0,    0,    0],
       ...,
       [   3,  581, 2041, ...,    0,    0,    0],
       [ 193,   40,   41, ...,    0,    0,    0],
       [ 729, 6693,   42, ...,    0,    0,    0]], dtype=int32)