Feedforward Neural Network 

In [None]:
# For Google Colab only: uncomment the next line to select TensorFlow 1.x
# %tensorflow_version 1.x

In [None]:
import numpy as np
import pandas as pd
from numpy.linalg import norm
from sklearn.metrics import classification_report, confusion_matrix
from IPython.display import Image, display_png
from gensim.models import word2vec, KeyedVectors
from keras.models import Sequential
from keras.layers import Input, Embedding, Dense, Dropout, Flatten, GlobalAveragePooling1D
from keras.utils import to_categorical, plot_model
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import RMSprop
import matplotlib.pyplot as plt
plt.style.use('ggplot')

1. Load up the data

In [ ]:
!gdown --id 14l7wuSNFg0KEberTf-LoniKle4k2bQMa
!upzip wongnai-data.zip

In [ ]:
# Load the training and development splits from the extracted CSV files.
train = pd.read_csv('wongnai-train.csv', encoding='utf-8')
dev = pd.read_csv('wongnai-dev.csv', encoding='utf-8')

# Preview the first rows of the training data.
train.head()

In [ ]:
# Add a 'length' column to both splits: the number of '|' separators in the
# 'tokenized' string (one less than the token count when there is no trailing '|').
for frame in (train, dev):
    frame['length'] = frame['tokenized'].map(lambda text: text.count('|'))

dev

ส่วนใหญ่มีแค่ 100 คำ ไม่ต้องใช้ทั้งหมด

In [ ]:
# Summary statistics of the 'length' column for the training split
train['length'].describe()

In [ ]:
# Summary statistics of the 'length' column for the development split
dev['length'].describe()

2. Load up the pre-trained word embeddings

In [ ]:
# Download the pre-trained word embedding file from Google Drive.
!gdown --id 14bv_aTSP-8rs_Bkudvpp8zcU3UpyRen6 #TNC_embeddings-100.bin

In [ ]:
# Load the pre-trained vectors from the binary word2vec file;
# bytes that cannot be decoded in the vocabulary are ignored.
w2v_model = KeyedVectors.load_word2vec_format(
    'TNC_embeddings-100.bin',
    unicode_errors='ignore',
    binary=True,
)

In [ ]:
# Vocabulary size of the pre-trained model.
# Access .vocab directly on the KeyedVectors object, as the later cells do;
# the .wv indirection on KeyedVectors is a deprecated alias for the object itself.
vocab_size = len(w2v_model.vocab)
print("vocab size:", vocab_size)

# Word vector dimension, read from the model rather than from a probe word
# that would have to exist in the vocabulary.
vector_dim = w2v_model.vector_size
print("vector dimension:", vector_dim)

In [ ]:
# Weight matrix for the embedding layer: one row per vocabulary word plus row 0,
# which stays all-zero and is shared by padding and unknown words.
embedding_matrix = np.zeros((vocab_size + 1, vector_dim), dtype='float32')

# Word -> row index dictionary; real words are numbered from 1 in vocabulary order
# and their vectors are copied into the matching row in the same pass.
word_to_index = {}
for row, word in enumerate(w2v_model.vocab, start=1):
    word_to_index[word] = row
    embedding_matrix[row] = w2v_model[word]

# Index 0 is reserved for padding and for out-of-vocabulary words.
# The 'UNKNOWN' entry is required by convert_words, which looks it up as the fallback index.
word_to_index['PADDING'] = 0
word_to_index['UNKNOWN'] = 0

In [ ]:
# example
# Look up the integer index assigned to the word 'ไป'
word_to_index['ไป']

3. Convert words to indices and pad + truncate sequences

In [ ]:
def convert_words(df, word_to_index, max_length):
    """Convert the '|'-separated 'tokenized' column into padded index sequences.

    Parameters
    ----------
    df : DataFrame with a 'tokenized' column of '|'-joined token strings.
    word_to_index : dict mapping a token to its integer index.
    max_length : length every sequence is padded or truncated to.

    Returns
    -------
    The result of ``pad_sequences``: one row per input row, padded with 0 at the
    end and truncated at the end to ``max_length``.
    """
    # Resolve the fallback index once. Looking up word_to_index['UNKNOWN'] inline as the
    # .get default would raise KeyError whenever the key is absent, even if every word is known.
    unknown_index = word_to_index.get('UNKNOWN', 0)
    list_of_list_of_indices = [
        [word_to_index.get(word, unknown_index) for word in text.split('|')]
        for text in df['tokenized']
    ]
    return pad_sequences(list_of_list_of_indices, maxlen=max_length, padding='post', truncating='post', value=0)

# Maximum sequence length, chosen manually.
# NOTE(review): the note in the data section says most texts are only ~100 words,
# so 500 may pad heavily — confirm the intended value.
max_len = 500
train_x, dev_x = (convert_words(frame, word_to_index, max_len) for frame in (train, dev))

In [ ]:
# Inspect the padded index sequence of the first training example
train_x[0]

4. Mapping labels

In [ ]:
def get_label(df):
    """Map the 'star' column to class ids and return them one-hot encoded.

    Side effect: stores the integer class ids in a new/overwritten ``df['label']``
    column. Returns the ``to_categorical`` encoding of those ids with 3 classes.
    """
    # NOTE(review): 4 stars maps to class 1 (same as 3 stars), leaving class 2 for
    # 5 stars only — confirm this grouping is intended.
    star_to_label = {1: 0, 2: 0, 3: 1, 4: 1, 5: 2}
    mapped_stars = df['star'].replace(star_to_label)
    class_ids = np.array(mapped_stars.tolist())
    df['label'] = class_ids
    return to_categorical(class_ids, num_classes=3)

# One-hot label matrices for both splits (get_label also adds a 'label' column to each frame)
train_y, dev_y = get_label(train), get_label(dev)

In [ ]:
# First ten one-hot label rows
train_y[:10]

In [ ]:
# Sanity-check the shapes of the model inputs and labels for both splits
for caption, array in (
    ('input train:', train_x),
    ('input dev:', dev_x),
    ("label train:", train_y),
    ("label dev:", dev_y),
):
    print(caption, array.shape)

5. Train the model

In [None]:
# Build the network as one layer list:
# frozen pre-trained embeddings -> average over time steps -> hidden layer -> 3-way softmax.
model = Sequential([
    # Embedding initialised from embedding_matrix and not trained;
    # mask_zero=True marks index 0 (padding) to be ignored downstream.
    Embedding(
        input_dim=vocab_size + 1,
        output_dim=vector_dim,
        input_length=max_len,
        weights=[embedding_matrix],
        trainable=False,
        mask_zero=True,
    ),
    # Average the word vectors of each sequence into a single vector.
    GlobalAveragePooling1D(),
    # Hidden layer - usually set equal to or less than the embedding dimension.
    # NOTE(review): 150 units looks larger than the embedding size suggested by the
    # embeddings file name (100) — confirm against the guideline above.
    Dense(150, activation='relu'),
    # Output layer: one probability per class.
    Dense(3, activation='softmax'),
])

# Compile with RMSprop and categorical cross-entropy, tracking accuracy.
model.compile(loss="categorical_crossentropy", optimizer="rmsprop", metrics=["accuracy"])

model.summary()

In [ ]:
# Render the model graph (with tensor shapes) to model.png, then show the image inline
plot_model(model, to_file='model.png', show_shapes=True)
display_png(Image('model.png'))

In [ ]:
# Train the model. One epoch is one full pass over the training set;
# the dev split is evaluated after every epoch.
history = model.fit(
    train_x,
    train_y,
    validation_data=(dev_x, dev_y),
    epochs=10,
    batch_size=128,
)

In [ ]:
# Plot the learning curves recorded by model.fit: loss on the left, accuracy on the right.
# Each panel compares the same metric on the training and dev splits, and the legend
# labels are attached to the lines themselves so they cannot be swapped.
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 4))

loss_ax.plot(history.history['loss'], label='train')
loss_ax.plot(history.history['val_loss'], label='dev')
loss_ax.set(title='Loss per epoch', xlabel='Epoch', ylabel='Loss')
loss_ax.legend(loc='best')

# Older Keras releases store the metric as 'acc' / 'val_acc' instead of
# 'accuracy' / 'val_accuracy'.
acc_key = 'accuracy' if 'accuracy' in history.history else 'acc'
acc_ax.plot(history.history[acc_key], label='train')
acc_ax.plot(history.history['val_' + acc_key], label='dev')
acc_ax.set(title='Accuracy per epoch', xlabel='Epoch', ylabel='Accuracy')
acc_ax.legend(loc='best')

fig.tight_layout()
plt.show()

6. Evaluate the model

In [ ]:
# Predicted class = index of the highest softmax probability in each row;
# compare against the integer labels stored in dev['label'].
dev_probabilities = model.predict(dev_x)
prediction = list(np.argmax(dev_probabilities, axis=1))
print(classification_report(dev['label'], prediction))