In [88]:
from gensim.models import FastText
import xgboost as xgb
import pandas as pd
from sklearn.model_selection import train_test_split
import sentencepiece as spm
from tqdm import tqdm
import regex as re
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
import itertools
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from sklearn.preprocessing import OneHotEncoder

In [106]:
# Locations of the saved tokenizer and sub-word embedding model.
SPM_MODEL_PATH = 'Tokenizer/all_reviews/global_spm.model'
FASTTEXT_MODEL_PATH = '0806_fasttext_model_all_reviews_bigram_skipgram_epoch=5_WS=5_alpha=0.05_mincount=150_maxn=4'

# SentencePiece tokenizer used to turn review text into piece ids.
sp = spm.SentencePieceProcessor()
sp.load(SPM_MODEL_PATH)

# FastText model queried later for one vector per tokenizer piece.
ft_model = FastText.load(FASTTEXT_MODEL_PATH)

In [107]:
def clean_reivew(review):
    """Normalize a raw review string before tokenization.

    Steps, in order:
      1. Remove parenthesized ``(...)`` and square-bracketed ``[...]`` spans
         together with their contents.
      2. Keep only Hangul syllables (가-힣), spaces and the punctuation marks
         ``.``, ``!`` and ``?``; every other character is dropped.
      3. Collapse each run of a repeated punctuation mark into a single mark.

    Args:
        review: Raw review text.

    Returns:
        The cleaned review string (may be empty).
    """
    # Bracket removal must run BEFORE the character filter: the filter strips
    # the bracket characters themselves, after which these patterns could
    # never match and the bracketed text would stay in the review.
    review = re.sub(r'\([^)]*\)', '', review)      # drop "(...)" spans
    review = re.sub(r'\[[^\]]*\]', '', review)     # drop "[...]" spans

    # Whitelist filter: Hangul syllables, '.', '!', '?' and spaces only.
    review = re.sub(r'[^가-힣.!? ]', '', review)

    # Collapse punctuation last so runs created by the removals above
    # (e.g. "좋아요.(광고).") are merged as well.
    review = re.sub(r'!+', '!', review)
    review = re.sub(r'\?+', '?', review)
    review = re.sub(r'\.+', '.', review)

    return review

In [108]:
# Load the review spreadsheet, using its first column as the DataFrame index.
reviews = pd.read_excel('200428_밀착력_with_rating.xlsx', index_col=0)

In [109]:
# Display the loaded review table.
reviews

Unnamed: 0,구분,rating,리뷰
0,1,1,상품 후기가 좋아서 구매했는데 그저 그런 펜슬 붓 나쁘지 않음 얇게 잘 그려져요 근...
1,1,1,진짜별로임 선스틱인데 너무매트해서 바르면 피부에밀착되지않고 밀리는현상이있음 얼굴에는...
2,1,1,우선 발림성이 무겁고 백탁현상이 심한거 같습니다 저만 그런건지 다른사람도 발랐을때 ...
3,1,1,원래 쓰던거랑 발림성이 다름 2년동안 썼는데 여기껀 좀더 꾸덕하고 파운데이션이 낌 ...
4,1,1,홈쇼핑에 속았어요 커버력 발림성 전혀 광고하곤 달라요 밀림 오지구여 모공끼임 쪄네요
...,...,...,...
11197,1,1,접착력이 낮고 효과가 별로 없음 여름에 땀흘리는건 당연한데 땀흘리면 떨어짐 이게 무...
11198,1,2,아니 이때까지 몇번을삿는데 이번에온거는 색깔도틀리고재질도틀리고 왜이렇게잘떨어지나요?...
11199,2,2,양은많은데..좀유분이적고뻣뻣한느낌이들어요
11200,2,3,향이 너무 강해요 지속력은 오래가는데 향기가 강하다 보니 조금 힘들어요


In [110]:
# Values of the '리뷰' column; these are the raw inputs passed to the cleaner/tokenizer.
X_raw = reviews['리뷰'].values

In [111]:
# Clean every raw review, then encode the cleaned text as SentencePiece ids.
# tqdm wraps the raw array so progress is reported per review.
X = [sp.EncodeAsIds(cleaned) for cleaned in map(clean_reivew, tqdm(X_raw))]

100%|██████████| 11115/11115 [00:00<00:00, 15376.60it/s]


In [112]:
# Fixed sequence length: shorter id lists are padded at the end,
# longer ones are cut at the end.
MAX_SEQUENCE_LENGTH = 100
X = pad_sequences(
    X,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='post',
    truncating='post',
)

In [113]:
# Inspect the first padded id sequence.
X[0]

array([  390,  5280,   539,   794,  3130,   367, 20548,  6271,  2852,
        7114,  9576,    20, 17131,    55,   488, 14127,    15,  3130,
         367,  7007,   121,   890,  4381,  4515,  9490, 26618,   459,
       25799,  6873,   171, 10477,  1680, 28230,   102,  3761,   246,
        1048,  1528,  6628,    17,    20,  8885,  1528,   270, 25485,
          10,  1153, 15408,  1528,  1586, 24308, 24092, 14267, 16163,
        6405,   316,  1387,   505,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0], dtype=int32)

In [114]:
# Wrap each '구분' label in its own single-element list so the labels form
# the 2-D (n_samples, 1) layout expected by the encoder in the next step.
y = [[label] for label in np.array(reviews['구분'].values)]

In [115]:
# Fit a one-hot encoder on the label column and replace the labels with
# their dense one-hot representation.
ohe = OneHotEncoder()
ohe.fit(y)
y_sparse = ohe.transform(y)
y = y_sparse.toarray()

In [116]:
# Hold out 30% of the samples for evaluation; the fixed seed keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=7,
)

In [117]:
# Build the embedding matrix: one FastText vector per tokenizer piece.
# The tokenizer `sp` was already loaded near the top of the notebook, so it is
# not reloaded here.

# Each line of the .vocab file is "<piece>\t<score>"; only the piece is kept.
# The encoding is given explicitly so the Korean pieces are decoded the same
# way on every platform instead of depending on the locale default.
with open('Tokenizer/all_reviews/global_spm.vocab', 'r', encoding='utf-8') as f:
    vocab = [line.split('\t')[0] for line in f]

# The Embedding layer is indexed by tokenizer ids, so the matrix needs exactly
# one row per piece (assumes the file lists pieces in id order — TODO confirm).
assert len(vocab) == sp.GetPieceSize(), 'vocab file and tokenizer model disagree on vocabulary size'

embedding_matrix = np.asarray([ft_model.wv.get_vector(word) for word in vocab])
embedding_matrix.shape

(32000, 384)

In [118]:
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional, SpatialDropout1D, concatenate, GlobalMaxPooling1D, Conv1D, GlobalAveragePooling1D, Activation, Input,Lambda, Flatten
from tensorflow.keras import callbacks
from tensorflow.keras import backend as K
import tensorflow as tf

In [119]:
# Bidirectional-LSTM classifier on top of frozen FastText embeddings.
# Sizes that have to agree with the data are read from the arrays rather than
# repeated as literals, so a change in padding length or in the number of
# label classes cannot silently disagree with the network definition.
vocab_size, embedding_dim = embedding_matrix.shape
sequence_length = X_train.shape[1]   # width of the padded id sequences
num_classes = y_train.shape[1]       # one column per one-hot encoded label

model = Sequential()

# NOTE(review): padded positions (id 0) are embedded and fed to the LSTM like
# any other token because mask_zero is not enabled — confirm this is intended.
model.add(Embedding(input_dim=vocab_size, weights=[embedding_matrix], output_dim=embedding_dim, input_length=sequence_length, trainable=False))
model.add(Dropout(0.3))
model.add(Bidirectional(LSTM(256)))
model.add(Dropout(0.3))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(num_classes, activation='softmax'))
model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 100, 384)          12288000  
_________________________________________________________________
dropout_15 (Dropout)         (None, 100, 384)          0         
_________________________________________________________________
bidirectional_6 (Bidirection (None, 512)               1312768   
_________________________________________________________________
dropout_16 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 128)               65664     
_________________________________________________________________
dropout_17 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 4)                

In [120]:
# Adam optimizer with categorical cross-entropy (labels are one-hot);
# accuracy is tracked as the reporting metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [121]:
# Train for 10 epochs on the training split in mini-batches of 64.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=10,
)

Train on 7780 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f9039cb4e10>

In [122]:
# Loss and accuracy on the held-out test split.
model.evaluate(x=X_test, y=y_test)



[0.7629001637091344, 0.69025487]

#0806_fasttext_model_all_reviews_bigram_skipgram_epoch=5_WS=5_alpha=0.05_mincount=100_maxn=4

Train on 7780 samples
Epoch 1/10
7780/7780 [==============================] - 55s 7ms/sample - loss: 0.9714 - acc: 0.5761
Epoch 2/10
7780/7780 [==============================] - 53s 7ms/sample - loss: 0.8722 - acc: 0.6198
Epoch 3/10
7780/7780 [==============================] - 53s 7ms/sample - loss: 0.8267 - acc: 0.6488
Epoch 4/10
7780/7780 [==============================] - 111s 14ms/sample - loss: 0.7987 - acc: 0.6483
Epoch 5/10
7780/7780 [==============================] - 172s 22ms/sample - loss: 0.7550 - acc: 0.6725
Epoch 6/10
7780/7780 [==============================] - 179s 23ms/sample - loss: 0.7465 - acc: 0.6823
Epoch 7/10
7780/7780 [==============================] - 120s 15ms/sample - loss: 0.7006 - acc: 0.6979
Epoch 8/10
7780/7780 [==============================] - 54s 7ms/sample - loss: 0.6736 - acc: 0.7121
Epoch 9/10
7780/7780 [==============================] - 57s 7ms/sample - loss: 0.6477 - acc: 0.7222
Epoch 10/10
7780/7780 [==============================] - 53s 7ms/sample - loss: 0.6261 - acc: 0.7341
<tensorflow.python.keras.callbacks.History at 0x7f903a5aac10>
model.evaluate(X_test,y_test)
3335/3335 [==============================] - 7s 2ms/sample - loss: 0.7604 - acc: 0.6858
[0.7604041670215899, 0.6857571]

WS=2

Train on 7780 samples
Epoch 1/10
7780/7780 [==============================] - 57s 7ms/sample - loss: 0.9834 - acc: 0.5753
Epoch 2/10
7780/7780 [==============================] - 57s 7ms/sample - loss: 0.8781 - acc: 0.6194
Epoch 3/10
7780/7780 [==============================] - 56s 7ms/sample - loss: 0.8342 - acc: 0.6365
Epoch 4/10
7780/7780 [==============================] - 57s 7ms/sample - loss: 0.7919 - acc: 0.6608
Epoch 5/10
7780/7780 [==============================] - 55s 7ms/sample - loss: 0.7686 - acc: 0.6680
Epoch 6/10
7780/7780 [==============================] - 56s 7ms/sample - loss: 0.7412 - acc: 0.6846
Epoch 7/10
7780/7780 [==============================] - 56s 7ms/sample - loss: 0.7226 - acc: 0.6914
Epoch 8/10
7780/7780 [==============================] - 55s 7ms/sample - loss: 0.7026 - acc: 0.6973
Epoch 9/10
7780/7780 [==============================] - 56s 7ms/sample - loss: 0.6854 - acc: 0.7055
Epoch 10/10
7780/7780 [==============================] - 56s 7ms/sample - loss: 0.6501 - acc: 0.7220
<tensorflow.python.keras.callbacks.History at 0x7f903bdf8550>
model.evaluate(X_test,y_test)
3335/3335 [==============================] - 10s 3ms/sample - loss: 0.8219 - acc: 0.6528
[0.8219285628606176, 0.6527736]

Epoch 1/10
7780/7780 [==============================] - 57s 7ms/sample - loss: 0.9758 - acc: 0.5744
Epoch 2/10
7780/7780 [==============================] - 56s 7ms/sample - loss: 0.8727 - acc: 0.6171
Epoch 3/10
7780/7780 [==============================] - 56s 7ms/sample - loss: 0.8374 - acc: 0.6289
Epoch 4/10
7780/7780 [==============================] - 56s 7ms/sample - loss: 0.7964 - acc: 0.6577
Epoch 5/10
7780/7780 [==============================] - 56s 7ms/sample - loss: 0.7755 - acc: 0.6731
Epoch 6/10
7780/7780 [==============================] - 56s 7ms/sample - loss: 0.7717 - acc: 0.6679
Epoch 7/10
7780/7780 [==============================] - 55s 7ms/sample - loss: 0.7364 - acc: 0.6850
Epoch 8/10
7780/7780 [==============================] - 57s 7ms/sample - loss: 0.7208 - acc: 0.6923
Epoch 9/10
7780/7780 [==============================] - 57s 7ms/sample - loss: 0.6951 - acc: 0.7054
Epoch 10/10
7780/7780 [==============================] - 56s 7ms/sample - loss: 0.6792 - acc: 0.7150

model.evaluate(X_test,y_test)
3335/3335 [==============================] - 9s 3ms/sample - loss: 0.7890 - acc: 0.6585
[0.7890483795792267, 0.65847075]

In [384]:
# Predict class probabilities for the first test sample, kept as a batch of one.
sample_probs = model.predict(X_test[:1])
# Turn the probability vector back into an integer class index.
y_predicted = np.argmax(sample_probs, axis=-1)
# Do the same for the one-hot encoded ground-truth label of that sample.
true = y_test[0].argmax(axis=-1)

In [385]:
# Predicted class index for the first test sample.
y_predicted

array([0])

In [386]:
# Ground-truth class index for the first test sample.
true

0

In [381]:
# Decode the first test sample back to text pieces, skipping id 0
# (the filler value visible in the padded sequences).
[sp.Decode(int(piece_id)) for piece_id in X_test[0] if piece_id != 0]

['붙',
 '혔',
 '을때',
 '검정색',
 '이라',
 '살인',
 '마',
 '같았',
 '는데',
 '보습',
 '하고',
 '향은',
 '좋네요',
 '재구매',
 '예정입니다']