In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Prepare Data

In [2]:
CHARS = [
  '\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+',
  ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8',
  '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E',
  'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
  'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_',
  'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
  'n', 'o', 'other', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y',
  'z', '}', '~', 'ก', 'ข', 'ฃ', 'ค', 'ฅ', 'ฆ', 'ง', 'จ', 'ฉ', 'ช',
  'ซ', 'ฌ', 'ญ', 'ฎ', 'ฏ', 'ฐ', 'ฑ', 'ฒ', 'ณ', 'ด', 'ต', 'ถ', 'ท',
  'ธ', 'น', 'บ', 'ป', 'ผ', 'ฝ', 'พ', 'ฟ', 'ภ', 'ม', 'ย', 'ร', 'ฤ',
  'ล', 'ว', 'ศ', 'ษ', 'ส', 'ห', 'ฬ', 'อ', 'ฮ', 'ฯ', 'ะ', 'ั', 'า',
  'ำ', 'ิ', 'ี', 'ึ', 'ื', 'ุ', 'ู', 'ฺ', 'เ', 'แ', 'โ', 'ใ', 'ไ',
  'ๅ', 'ๆ', '็', '่', '้', '๊', '๋', '์', 'ํ', '๐', '๑', '๒', '๓',
  '๔', '๕', '๖', '๗', '๘', '๙', '‘', '’', '\ufeff'
]
CHARS_MAP = {v: k for k, v in enumerate(CHARS)}

In [3]:
def create_df_with_context(df, n_pad):
    for i in range(n_pad):
        df['char-{}'.format(i+1)] = df['char'].shift(i + 1)
        df['char{}'.format(i+1)] = df['char'].shift(-i - 1)
    return df[n_pad: -n_pad]

In [4]:
def prepare_best_data(option = 'train', n_pad = 10):
    pad = [{'char': ' ', 'target': True}]
    df_pad = pd.DataFrame(pad * n_pad)
    
    df_list = []
    article_types = ['article', 'encyclopedia', 'news', 'novel']
    for article_type in article_types:
        df_list.append(pd.read_csv(os.path.join('../../corpora/BEST', option, 'df_best_{}_{}.csv'.format(article_type, option))))
  
    df_list = [df_pad] + df_list + [df_pad]
    df = pd.concat(df_list)

    df['char'] = df['char'].map(lambda x: CHARS_MAP.get(x, CHARS_MAP['other']))
    
    df_with_context = create_df_with_context(df, n_pad)

    char_row = ['char' + str(i + 1) for i in range(n_pad)] + ['char-' + str(i + 1) for i in range(n_pad)] + ['char']

    x_char = df_with_context[char_row].as_matrix()
    y = df_with_context['target'].astype(int).as_matrix()

    return x_char, y

In [5]:
x_train_char, y_train = prepare_best_data('train', 10)
x_val_char, y_val = prepare_best_data('val', 10)
x_test_char, y_test = prepare_best_data('test', 10)

print('Training data shape          :', x_train_char.shape)
print('Training data labels shape   :', y_train.shape)
print('Validation data shape        :', x_val_char.shape)
print('Validation data labels shape :', y_val.shape)
print('Test data shape              :', x_test_char.shape)
print('Test data labels shape       :', y_test.shape)

Training data shape          : (16461637, 21)
Training data labels shape   : (16461637,)
Validation data shape        : (2035694, 21)
Validation data labels shape : (2035694,)
Test data shape              : (2271932, 21)
Test data labels shape       : (2271932,)


In [6]:
# Print some entry from the data to make sure it is the same as what you think.
print('First 3 features: ', x_train_char[:3])
print('First 30 class labels', y_train[:30])

First 3 features:  [[ 112.  140.  114.  148.  130.  142.   94.  142.  128.  128.    1.    1.
     1.    1.    1.    1.    1.    1.    1.    1.   97.]
 [ 140.  114.  148.  130.  142.   94.  142.  128.  128.  141.   97.    1.
     1.    1.    1.    1.    1.    1.    1.    1.  112.]
 [ 114.  148.  130.  142.   94.  142.  128.  128.  141.  109.  112.   97.
     1.    1.    1.    1.    1.    1.    1.    1.  140.]]
First 30 class labels [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0]


# Convolution Neural Network

In [13]:
from keras.models import Model
from keras.layers import Input, Conv1D, Embedding, TimeDistributed, Flatten, Dense
from keras.optimizers import Adam

def get_cnn():
    input1 = Input(shape=(21,))
    x = Embedding(len(CHARS), 32)(input1)
    x = Conv1D(100, 5, strides=1, activation='relu', padding="same")(x)
    x = TimeDistributed(Dense(5))(x)
    x = Flatten()(x)
    x = Dense(100, activation='relu')(x)
    out = Dense(1, activation='sigmoid')(x)
    model = Model(inputs=input1, outputs=out)
    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['acc'])
    return model

In [14]:
from sklearn.metrics import f1_score,precision_score,recall_score

def evaluate(x_test, y_test, model):
    y_pred = model.predict(x_test)
    prob_to_class = lambda p: 1 if p[0]>=0.5 else 0
    y_pred = np.apply_along_axis(prob_to_class,1,y_pred)
    f1score = f1_score(y_test,y_pred)
    precision = precision_score(y_test,y_pred)
    recall = recall_score(y_test,y_pred)
    return f1score, precision, recall

In [15]:
from keras import backend as K
K.clear_session()

print('start training')
verbose = 1
model_cnn = get_cnn()

train_params = [(3, 512)]
for (epochs, batch_size) in train_params:
    print("train with {} epochs and {} batch size".format(epochs, batch_size))
    model_cnn.fit(x_train_char, y_train, epochs=epochs, batch_size=batch_size, verbose=verbose,
                           validation_data=(x_val_char, y_val))

start training
train with 3 epochs and 512 batch size
Train on 16461637 samples, validate on 2035694 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [16]:
evaluate(x_test_char, y_test, model_cnn)

(0.96674245864595254, 0.95680406724696521, 0.97688947863106101)

In [17]:
path_model_cnn='../data/model_tokenize_cnn.h5'
model_cnn.save(path_model_cnn)

In [6]:
# Load from saved model

from keras.models import load_model
path_model_cnn='../data/model_tokenize_cnn.h5'
model_cnn = load_model(path_model_cnn)

# Tokenize Wongnai Data

In [14]:
tr = pd.read_csv("../data/w_review_train.csv", sep=";", names = ["review", "label"])
ts = pd.read_csv("../data/test_file.csv", sep=";")

In [15]:
def prepare_feature(text, n_pad = 10):
    pad = [{'char': ' '}]
    df_pad = pd.DataFrame(pad * n_pad)
    
    df_list = []
    df_list.append(pd.DataFrame({'char': list(text)}))
    df_list = [df_pad] + df_list + [df_pad]
                   
    df = pd.concat(df_list)

    df['char'] = df['char'].map(lambda x: CHARS_MAP.get(x, CHARS_MAP['other']))
    
    df_with_context = create_df_with_context(df, n_pad)

    char_row = ['char' + str(i + 1) for i in range(n_pad)] + ['char-' + str(i + 1) for i in range(n_pad)] + ['char']

    x_char = df_with_context[char_row].as_matrix()

    return x_char

In [16]:
def char_to_word(text, y_pred):
    split_text=""
    for char, y in zip(text, y_pred):
        if y == 1:
            split_text+=" "
            split_text+=char
        else:
            split_text+=char
    return split_text.split(" ")

In [21]:
def tokenize(text, model):
    x_char = prepare_feature(text)
    y_pred = model.predict(x_char)
    prob_to_class = lambda p: 1 if p[0]>=0.5 else 0
    y_pred = np.apply_along_axis(prob_to_class,1,y_pred)
    tokens = char_to_word(text, y_pred)
    return tokens

In [22]:
# This take around an hour to finish
tr["tokenizedReview"] = tr["review"].apply(lambda x: tokenize(x,model_cnn))

In [31]:
tr.head()

Unnamed: 0,review,label,tokenizedReview
0,ร้านอาหารใหญ่มากกกกกกก \nเลี้ยวเข้ามาเจอห้องน้...,3,"[, ร้าน, อาหาร, ใหญ่, มา, กกกกกกก, \n, เลี้ยว,..."
1,อาหารที่นี่เป็นอาหารจีนแคะที่หากินยากในบ้านเรา...,4,"[, อาหาร, ที่, นี่, เป็น, อาหาร, จีน, แคะ, ที่..."
2,ปอเปี๊ยะสด ทุกวันนี้รู้สึกว่าหากินยาก (ร้านที่...,3,"[, ปอเปี๊ยะสด, , , ทุก, วัน, นี้, รู้สึก, ว่า,..."
3,รัานคัพเค้กในเมืองไทยมีไม่มาก หลายๆคนอาจจะสงสั...,5,"[, รัาน, คัพเค้ก, ใน, เมือง, ไทย, มี, ไม่, มาก..."
4,อร่อย!!! เดินผ่านDigital gatewayทุกวัน ไม่ยักร...,5,"[, อร่อย, !, !, !, , , เดิน, ผ่าน, Digital, , ..."


In [32]:
ts["tokenizedReview"] = ts["review"].apply(lambda x: tokenize(x,model_cnn))

In [33]:
ts.head()

Unnamed: 0,reviewID,review,tokenizedReview
0,1,ร้านนี้จะอยู่เส้นสันกำแพง-แม่ออน เลยแยกบ่...,"[, , , , , , , , , , , ร้าน, นี้, จะ, อยู่, เส..."
1,2,สั่งไป2 เมนู คือมัชฉะลาเต้ร้อน กับ ไอศครีมชาเข...,"[, สั่ง, ไป, 2, , , เมนู, , , คือ, มัชฉะลา, เต..."
2,3,ครัววงเดือน \n\nหิวดึกๆ ตระเวนหาร้านทาน มาเจอ...,"[, ครัววงเดือน, , , , \n\n, หิว, ดึก, ๆ, , , ต..."
3,4,จะว่าเป็นเจ้าประจำก็คงไม่ผิด แต่ก็ไม่กล้า...,"[, , , , , , , , , , , จะ, ว่า, เป็น, เจ้า, ปร..."
4,5,ถ้าคิดถึงสลัดผมคิดถึงร้านนี้เป็นร้านแรกๆเลยครั...,"[, ถ้า, คิด, ถึง, สลัด, ผม, คิด, ถึง, ร้าน, นี..."


In [29]:
tr.to_csv('../data/tokenized_train.csv', index=False)
ts.to_csv('../data/tokenized_test.csv', index=False)