In [1]:
# Mount Google Drive at /content/drive; every data, model and result path used
# later in this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%%capture

# phoW2V: fetch the word-vector archive (300 dimensions, per the file name).
# %%capture hides this cell's output, including any wget error message.
!wget https://public.vinai.io/word2vec_vi_words_300dims.zip

# Unpack into ./word2vec_vi_words_300dims/, the folder load_embedding_matrix() opens.
import zipfile
with zipfile.ZipFile('word2vec_vi_words_300dims.zip', 'r') as zip_ref:
    zip_ref.extractall('word2vec_vi_words_300dims')

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import pickle

# Hyperparameters

In [3]:
# Sequence length: used as `maxlen` for pad_sequences and as the model input width.
MAX_LEN = 20
# Embedding dimension handed to the Embedding layers.
EMBED_SIZE = 300

# Mini-batch size passed to fit().
BATCH_SIZE = 64
# Upper bound on training epochs; the EarlyStopping callback may stop sooner.
EPOCHS = 20

In [4]:
# Processed train / validation / test CSVs on Google Drive, each read by load_data().
TRAIN_PATH = '/content/drive/MyDrive/Public/CS221 - Natural Language Processing/uit_vsmec_processed/train_processed.csv'
VAL_PATH = '/content/drive/MyDrive/Public/CS221 - Natural Language Processing/uit_vsmec_processed/val_processed.csv'
TEST_PATH = '/content/drive/MyDrive/Public/CS221 - Natural Language Processing/uit_vsmec_processed/test_processed.csv'

# Output folders: the pickled tokenizer and the .h5 models are written under
# MODELS_PATH, the evaluation tables under RESULTS_PATH.
MODELS_PATH = '/content/drive/MyDrive/Public/CS221 - Natural Language Processing/models/'
RESULTS_PATH = '/content/drive/MyDrive/Public/CS221 - Natural Language Processing/results/'

In [5]:
# Binary target of this notebook: transform_y() encodes every emotion listed in
# GROUP_1 as 0 and every other label as 1.
GROUP_1 = ['Anger', 'Disgust', 'Sadness', 'Fear'] # 0
# GROUP_2 only documents which labels are expected in class 1; no other visible
# cell reads it (class 1 is simply "not in GROUP_1").
GROUP_2 = ['Enjoyment', 'Surprise', 'Other'] # 1

# Load data


In [6]:
def transform_y(y):
    """Encode emotion labels as a binary target.

    Every element of ``y`` becomes 0 when it is a member of ``GROUP_1`` and 1
    otherwise (this includes values that belong to no group at all).
    """
    def to_binary(label):
        if label in GROUP_1:
            return 0
        return 1

    return y.map(to_binary)

def load_data(path):
    """Read one processed split and return its sentences and binary labels.

    Parameters
    ----------
    path : str
        CSV file containing at least a ``cleaned_sentence`` and an ``emotion``
        column.

    Returns
    -------
    X : pandas.Series
        The ``cleaned_sentence`` column with missing values replaced by ``''``.
    y : pandas.Series
        The ``emotion`` column mapped to 0/1 by ``transform_y``.
    """
    data = pd.read_csv(path)
    # Assign the filled column back instead of calling
    # `data.cleaned_sentence.fillna('', inplace=True)`: that in-place call acts
    # on an intermediate Series, which pandas 2.x warns about and which leaves
    # `data` unchanged once Copy-on-Write is active.
    data['cleaned_sentence'] = data['cleaned_sentence'].fillna('')
    X = data['cleaned_sentence']
    y = transform_y(data['emotion'])

    return X, y

In [7]:
# Load the three splits; each call returns (sentences, binary labels).
X_train, y_train = load_data(TRAIN_PATH)
X_val, y_val = load_data(VAL_PATH)
X_test, y_test = load_data(TEST_PATH)

# Prepare Data

In [8]:
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical, pad_sequences

In [10]:
# Learn the vocabulary from the training sentences only.
# '_' is not in `filters`, so underscores inside tokens are preserved.
# The backslash is spelled '\\' because a bare '\]' is an invalid escape
# sequence: the resulting string is identical, but Python warns about it.
tokenizer = Tokenizer(num_words=None, lower=True, filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~\t\n')
tokenizer.fit_on_texts(X_train)

# Save the fitted tokenizer alongside the models.
with open(MODELS_PATH + 'model_1/tokenizer.pkl', "wb") as file:
    pickle.dump(tokenizer, file)

In [None]:
def prepare_data(X, y):
    """Turn raw sentences into fixed-length id sequences.

    ``X`` is encoded with the notebook-level ``tokenizer`` and then passed to
    ``pad_sequences`` with ``maxlen=MAX_LEN`` (all other options left at their
    defaults). ``y`` is handed back unchanged.
    """
    token_ids = tokenizer.texts_to_sequences(X)
    padded = pad_sequences(token_ids, maxlen=MAX_LEN)
    return padded, y

In [None]:
# Replace the raw sentences of every split with their padded id matrices.
# NOTE(review): X_train / X_val / X_test are rebound here, so this cell is not
# idempotent — running it a second time would feed the already-encoded arrays
# back into the tokenizer.
X_train, y_train = prepare_data(X_train, y_train)
X_val, y_val = prepare_data(X_val, y_val)
X_test, y_test = prepare_data(X_test, y_test)

# Deep Learning

* Feature Extraction:
    - phoW2V
* Models:
    - LSTM
    - TextCNN

In [None]:
# Deep Learning
from keras.models import Sequential, Model, save_model, load_model
from keras.layers import (
    Input, Reshape, Concatenate, Flatten,
    Embedding,
    Conv2D, MaxPool2D, GlobalMaxPooling1D, GlobalAveragePooling1D,
    LSTM, Bidirectional,
    Dropout, SpatialDropout1D,
    Dense,
)
from keras.optimizers import Adam
from keras import backend as K

## phoW2V

In [None]:
# read phoW2V file
def load_embedding_matrix(path='word2vec_vi_words_300dims/word2vec_vi_words_300dims.txt',
                          embed_size=300, word_index=None):
    """Build the embedding matrix for a vocabulary from a word2vec text file.

    Each usable line of the file has the form ``<word> v1 v2 ... v<embed_size>``
    with single spaces as separators.

    Parameters
    ----------
    path : str, optional
        Location of the vector file. Defaults to the extracted phoW2V file.
    embed_size : int, optional
        Number of components per vector (300 by default).
    word_index : dict[str, int] or None, optional
        Mapping from word to row index. ``None`` (the default) uses
        ``tokenizer.word_index`` from the notebook.

    Returns
    -------
    embedding_matrix : numpy.ndarray
        Array of shape ``(len(word_index) + 1, embed_size)``. Rows of words that
        have no vector in the file, and row 0, stay all-zero.
    word_index : dict[str, int]
        The mapping that was used.
    num_words : int
        ``len(word_index) + 1``.
    max_features : int
        Same value as ``num_words`` (kept for the existing 4-tuple interface).
    """
    if word_index is None:
        word_index = tokenizer.word_index

    # One extra row because the matrix is indexed directly by the word ids and
    # row 0 is left as an all-zero vector.
    num_words = len(word_index) + 1
    max_features = num_words
    embedding_matrix = np.zeros((num_words, embed_size))

    with open(path, encoding='utf8') as f:
        for line in f:
            values = line.rstrip().split(' ')
            # Skip anything that is not "<word> + embed_size numbers", e.g. the
            # "<count> <dim>" header that word2vec text files typically start
            # with. Previously such a line was stored as a 1-element vector,
            # which NumPy would broadcast across a full matrix row if the
            # header token happened to be in the vocabulary.
            if len(values) != embed_size + 1:
                continue

            # Only parse vectors for words we actually need; this avoids keeping
            # the whole pretrained vocabulary in memory.
            row = word_index.get(values[0])
            if row is None or row >= max_features:
                continue

            embedding_matrix[row] = np.asarray(values[1:], dtype='float32')

    return embedding_matrix, word_index, num_words, max_features

In [None]:
# Notebook-level embedding artefacts. build_model_DL() reads EMBEDDING_MATRIX and
# MAX_FEATURES; WORD_INDEX and NUM_WORDS are not used by any other visible cell.
EMBEDDING_MATRIX, WORD_INDEX, NUM_WORDS, MAX_FEATURES = load_embedding_matrix()

## Define Models

In [None]:
def build_model_DL(model_name):
    """Build and compile one of the two binary sentence classifiers.

    Both variants take ``MAX_LEN`` token ids, embed them with an ``Embedding``
    layer whose weights start from ``EMBEDDING_MATRIX``, apply
    ``SpatialDropout1D(0.4)``, and end with ``Dropout(0.1)`` followed by a
    single sigmoid unit. They are compiled with binary cross-entropy, the
    ``'adam'`` optimizer and the accuracy metric.

    Parameters
    ----------
    model_name : str
        ``'textcnn'``: two parallel ``Conv2D`` branches (window heights 2 and 3,
        32 filters each, kernel as wide as the embedding), each followed by a
        ``MaxPool2D`` whose window is ``MAX_LEN - height + 1`` rows tall; the
        branches are concatenated and flattened.
        ``'lstm'``: two stacked 64-unit ``LSTM`` layers, the first with
        ``return_sequences=True``.

    Returns
    -------
    keras.Model or None
        The compiled model, or ``None`` (after printing a message) when
        ``model_name`` is neither of the supported values.
    """

    def embed_tokens():
        """Create the id input and its embedded, spatially-dropped-out sequence."""
        tokens = Input(shape=(MAX_LEN,))
        embedded = Embedding(MAX_FEATURES, EMBED_SIZE, weights=[EMBEDDING_MATRIX])(tokens)
        return tokens, SpatialDropout1D(0.4)(embedded)

    def finish(tokens, features):
        """Attach dropout and the sigmoid output, then compile the model."""
        regularised = Dropout(0.1)(features)
        probability = Dense(1, activation="sigmoid")(regularised)
        net = Model(inputs=tokens, outputs=probability)
        net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        return net

    # Reject unknown names before any layer is created.
    if model_name not in ('textcnn', 'lstm'):
        print('model_name error~!')
        return None

    tokens, embedded = embed_tokens()

    if model_name == 'lstm':
        lstm_units = 64
        sequence = LSTM(lstm_units, return_sequences=True)(embedded)
        return finish(tokens, LSTM(lstm_units)(sequence))

    # model_name == 'textcnn'
    window_heights = [2, 3]
    filters_per_window = 32

    # Add a trailing channel axis so the sequence can be convolved in 2D.
    image = Reshape((MAX_LEN, EMBED_SIZE, 1))(embedded)

    pooled_branches = []
    for height in window_heights:
        conv = Conv2D(
            filters_per_window,
            kernel_size=(height, EMBED_SIZE),
            kernel_initializer='normal',
            activation='elu',
        )(image)
        pooled_branches.append(MaxPool2D(pool_size=(MAX_LEN - height + 1, 1))(conv))

    merged = Concatenate(axis=1)(pooled_branches)
    return finish(tokens, Flatten()(merged))

## Training

In [None]:
from keras.callbacks import EarlyStopping

In [None]:
# Instantiate one compiled model per architecture; both are trained below.
text_cnn = build_model_DL('textcnn')
lstm = build_model_DL('lstm')

### TextCNN

In [None]:
# Stop when val_accuracy has not improved for 5 consecutive epochs and restore
# the weights of the best epoch.
es = EarlyStopping(
    monitor='val_accuracy',
    verbose=1,
    patience=5,
    mode='max',
    restore_best_weights=True
)

# Text CNN
text_cnn.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=2,
    # fit() documents `callbacks` as a list of callback instances.
    callbacks=[es]
)

# save model
save_model(text_cnn, MODELS_PATH + 'model_1/textcnn_model1.h5')

Epoch 1/20
87/87 - 25s - loss: 0.6493 - accuracy: 0.6370 - val_loss: 0.6053 - val_accuracy: 0.6866 - 25s/epoch - 284ms/step
Epoch 2/20
87/87 - 5s - loss: 0.4913 - accuracy: 0.7803 - val_loss: 0.5106 - val_accuracy: 0.7449 - 5s/epoch - 55ms/step
Epoch 3/20
87/87 - 3s - loss: 0.3765 - accuracy: 0.8398 - val_loss: 0.5201 - val_accuracy: 0.7493 - 3s/epoch - 32ms/step
Epoch 4/20
87/87 - 3s - loss: 0.2952 - accuracy: 0.8832 - val_loss: 0.5639 - val_accuracy: 0.7347 - 3s/epoch - 34ms/step
Epoch 5/20
87/87 - 2s - loss: 0.2302 - accuracy: 0.9147 - val_loss: 0.6304 - val_accuracy: 0.7230 - 2s/epoch - 27ms/step
Epoch 6/20
87/87 - 1s - loss: 0.1816 - accuracy: 0.9335 - val_loss: 0.6953 - val_accuracy: 0.7245 - 535ms/epoch - 6ms/step
Epoch 7/20
87/87 - 1s - loss: 0.1434 - accuracy: 0.9506 - val_loss: 0.7768 - val_accuracy: 0.7259 - 1s/epoch - 12ms/step
Epoch 8/20
Restoring model weights from the end of the best epoch: 3.
87/87 - 1s - loss: 0.1133 - accuracy: 0.9618 - val_loss: 0.8527 - val_accuracy

### LSTM

In [None]:
# Fresh EarlyStopping instance for this run: stop when val_accuracy has not
# improved for 5 consecutive epochs and restore the weights of the best epoch.
es = EarlyStopping(
    monitor='val_accuracy',
    verbose=1,
    patience=5,
    mode='max',
    restore_best_weights=True
)

# LSTM
lstm.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=2,
    # fit() documents `callbacks` as a list of callback instances.
    callbacks=[es]
)

# save model
save_model(lstm, MODELS_PATH + 'model_1/lstm_model1.h5')

Epoch 1/20
87/87 - 16s - loss: 0.6363 - accuracy: 0.6327 - val_loss: 0.5720 - val_accuracy: 0.7085 - 16s/epoch - 189ms/step
Epoch 2/20
87/87 - 5s - loss: 0.4725 - accuracy: 0.7841 - val_loss: 0.5296 - val_accuracy: 0.7478 - 5s/epoch - 55ms/step
Epoch 3/20
87/87 - 4s - loss: 0.3568 - accuracy: 0.8464 - val_loss: 0.5739 - val_accuracy: 0.7143 - 4s/epoch - 47ms/step
Epoch 4/20
87/87 - 1s - loss: 0.2867 - accuracy: 0.8807 - val_loss: 0.5727 - val_accuracy: 0.7274 - 1s/epoch - 12ms/step
Epoch 5/20
87/87 - 2s - loss: 0.2267 - accuracy: 0.9086 - val_loss: 0.6497 - val_accuracy: 0.7376 - 2s/epoch - 21ms/step
Epoch 6/20
87/87 - 2s - loss: 0.1842 - accuracy: 0.9272 - val_loss: 0.7178 - val_accuracy: 0.7391 - 2s/epoch - 18ms/step
Epoch 7/20
Restoring model weights from the end of the best epoch: 2.
87/87 - 1s - loss: 0.1366 - accuracy: 0.9472 - val_loss: 0.8859 - val_accuracy: 0.7216 - 1s/epoch - 15ms/step
Epoch 7: early stopping


# Evaluation

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Reload both models from the .h5 files written by the training cells; this
# rebinds text_cnn and lstm, which get_result() reads as globals.
text_cnn = load_model(MODELS_PATH + 'model_1/textcnn_model1.h5')
lstm = load_model(MODELS_PATH + 'model_1/lstm_model1.h5')

In [None]:
# NOTE(review): scratch cell — `a` (flattened validation predictions) is not
# read by any later cell in the visible notebook.
a = text_cnn.predict(X_val).flatten()



In [None]:
def evaluate(model, X, y):
    """Score ``model`` on ``(X, y)``.

    The model's predictions are flattened and each one is turned into a label
    with Python's built-in ``round``. The result is the list
    ``[accuracy, precision, recall, f1]``, every entry rounded to two decimals;
    precision, recall and F1 are computed with ``average='weighted'``.
    """
    labels = list(map(round, model.predict(X).flatten()))

    scores = [round(accuracy_score(y, labels), 2)]
    for weighted_metric in (precision_score, recall_score, f1_score):
        scores.append(round(weighted_metric(y, labels, average='weighted'), 2))

    return scores

In [None]:
def get_result(X, y):
    """Evaluate both notebook-level models on ``(X, y)``.

    Returns a DataFrame with one row per model (``text_cnn`` first, then
    ``lstm``) and the columns ``accuracy``, ``precision``, ``recall``, ``f1``,
    filled with the values produced by ``evaluate``.
    """
    scores_by_model = {
        'text_cnn': evaluate(text_cnn, X, y),
        'lstm': evaluate(lstm, X, y),
    }

    return pd.DataFrame(
        list(scores_by_model.values()),
        columns=['accuracy', 'precision', 'recall', 'f1'],
        index=list(scores_by_model.keys()),
    )

In [None]:
# Scores on the training split (displayed only; not written to RESULTS_PATH).
re_train = get_result(X_train, y_train)
re_train



Unnamed: 0,accuracy,precision,recall,f1
text_cnn,0.9,0.9,0.9,0.9
lstm,0.86,0.86,0.86,0.86


In [None]:
# Scores on the validation split, saved as a CSV under RESULTS_PATH and displayed.
re_val = get_result(X_val, y_val)
re_val.to_csv(RESULTS_PATH + 'model1_val_dl.csv')
re_val



Unnamed: 0,accuracy,precision,recall,f1
text_cnn,0.75,0.75,0.75,0.75
lstm,0.75,0.76,0.75,0.75


In [None]:
# Scores on the test split, saved as a CSV under RESULTS_PATH and displayed.
re_test = get_result(X_test, y_test)
re_test.to_csv(RESULTS_PATH + 'model1_test_dl.csv')
re_test



Unnamed: 0,accuracy,precision,recall,f1
text_cnn,0.75,0.75,0.75,0.75
lstm,0.74,0.75,0.74,0.74
