In [1]:
# Mount Google Drive at /content/drive; every dataset, model and result path
# used below lives under that mount point (Colab-only cell).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%%capture
# %%capture must stay on the first line of the cell; it hides the wget output.

# PhoW2V: download the word-level 300-dimension Vietnamese word2vec archive
# into the current working directory.
# NOTE(review): re-running this cell downloads the archive again; consider
# `wget -nc` if the cell is run repeatedly.
!wget https://public.vinai.io/word2vec_vi_words_300dims.zip

# Unpack into word2vec_vi_words_300dims/, which load_embedding_matrix() reads.
import zipfile
with zipfile.ZipFile('word2vec_vi_words_300dims.zip', 'r') as zip_ref:
    zip_ref.extractall('word2vec_vi_words_300dims')

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import pickle

# Hyperparameters

In [3]:
# Number of target classes: one-hot width and size of the softmax output layer.
NUM_CLASSES = 4
# Every tokenised sentence is padded/truncated to this many tokens.
MAX_LEN = 20
# Dimensionality of each word vector in the embedding layer.
EMBED_SIZE = 300

# Mini-batch size passed to model.fit.
BATCH_SIZE = 64
# Upper bound on training epochs (early stopping may end training sooner).
EPOCHS = 20

In [4]:
# Every project artefact lives under one shared Google Drive folder.
_PROJECT_DIR = '/content/drive/MyDrive/Public/CS221 - Natural Language Processing/'
_DATASET_DIR = _PROJECT_DIR + 'uit_vsmec_processed/'

# Pre-processed train / validation / test splits (CSV files).
TRAIN_PATH = _DATASET_DIR + 'train_processed.csv'
VAL_PATH = _DATASET_DIR + 'val_processed.csv'
TEST_PATH = _DATASET_DIR + 'test_processed.csv'

# Output folders for serialized models/encoders and for evaluation tables.
MODELS_PATH = _PROJECT_DIR + 'models/'
RESULTS_PATH = _PROJECT_DIR + 'results/'

In [5]:
# Emotion labels kept by load_data(); rows with any other label are dropped.
GROUP_1 = ['Anger', 'Disgust', 'Sadness', 'Fear'] # 0  NOTE(review): presumably this group's id — confirm

# Load data


In [6]:
def load_data(path, groups=None):
    """Load one pre-processed split and keep only the requested emotions.

    Args:
        path: CSV file containing at least the ``cleaned_sentence`` and
            ``emotion`` columns.
        groups: Iterable of emotion labels to keep. Defaults to the
            module-level ``GROUP_1`` (resolved at call time).

    Returns:
        Tuple ``(X, y)`` of pandas Series indexed from 0: ``X`` holds the
        cleaned sentences (missing values replaced by ``''``) and ``y`` the
        whitespace-stripped emotion labels.
    """
    if groups is None:
        groups = GROUP_1

    data = pd.read_csv(path)

    # Assign the result back: ``data.col.fillna(..., inplace=True)`` is chained
    # assignment, which pandas deprecates and which is a no-op under
    # Copy-on-Write.
    data['cleaned_sentence'] = data['cleaned_sentence'].fillna('')

    # Normalise the labels once so the filter below and the returned ``y``
    # agree; previously the filter stripped whitespace but ``y`` did not.
    data['emotion'] = data['emotion'].str.strip()

    # filter y (drop=True: do not keep the old index as an extra column)
    data = data[data['emotion'].isin(groups)].reset_index(drop=True)

    X = data['cleaned_sentence']
    y = data['emotion']

    return X, y

In [7]:
# Read each split; X_* are Series of cleaned sentences, y_* Series of emotion labels.
X_train, y_train = load_data(TRAIN_PATH)
X_val, y_val = load_data(VAL_PATH)
X_test, y_test = load_data(TEST_PATH)

# Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# load
# Restore the label encoder previously saved under model_2/ (it is used
# below without being re-fitted).
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
with open(MODELS_PATH + 'model_2/le.pkl', 'rb') as f:
    le = pickle.load(f)

In [None]:
# Replace the emotion labels with the encoder's transformed values.
# NOTE(review): y_* are overwritten, so re-running this cell without reloading
# the data feeds already-transformed values back into le.transform.
y_train = le.transform(y_train)
y_val = le.transform(y_val)
y_test = le.transform(y_test)

# Prepare Data

In [8]:
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical, pad_sequences

In [9]:
# Tokenizer fitted on the training sentences only, with no vocabulary cap
# (num_words=None) and lower-casing enabled.
# The filter list contains no '_' — presumably so underscore-joined tokens
# survive tokenisation (TODO confirm).
# The backslash is written as '\\' because a bare '\]' is an invalid escape
# sequence (DeprecationWarning, SyntaxWarning on Python 3.12+); the resulting
# string is unchanged.
tokenizer = Tokenizer(num_words=None, lower=True, filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~\t\n')
tokenizer.fit_on_texts(X_train)

# save: persist the fitted tokenizer next to the models
with open(MODELS_PATH + 'model_2/tokenizer.pkl', "wb") as file:
    pickle.dump(tokenizer, file)

In [None]:
def prepare_data(X, y, num_classes=NUM_CLASSES):
    """Turn raw sentences and integer labels into model-ready arrays.

    Args:
        X: Texts passed to the fitted global ``tokenizer``.
        y: Integer class ids.
        num_classes: Width of the one-hot label encoding.

    Returns:
        Tuple ``(padded, one_hot)``: the token-id sequences padded/truncated
        to ``MAX_LEN`` and the one-hot encoded labels.
    """
    sequences = tokenizer.texts_to_sequences(X)
    padded = pad_sequences(sequences, maxlen=MAX_LEN)
    one_hot = to_categorical(y, num_classes=num_classes)
    return padded, one_hot

In [None]:
# Convert every split to padded id sequences and one-hot labels.
# NOTE(review): the inputs are overwritten, so this cell is not safe to re-run
# without re-executing the loading and label-encoding cells first.
X_train, y_train = prepare_data(X_train, y_train)
X_val, y_val = prepare_data(X_val, y_val)
X_test, y_test = prepare_data(X_test, y_test)

# Deep Learning

* Feature Extraction:
    - PhoW2V (pre-trained 300-dimensional Vietnamese word embeddings)
* Models:
    - LSTM
    - TextCNN

In [None]:
# Deep Learning
from keras.models import Sequential, Model, save_model, load_model
from keras.layers import (
    Input, Reshape, Concatenate, Flatten,
    Embedding,
    Conv2D, MaxPool2D, GlobalMaxPooling1D, GlobalAveragePooling1D,
    LSTM, Bidirectional,
    Dropout, SpatialDropout1D,
    Dense,
)
from keras.optimizers import Adam
from keras import backend as K

## phoW2V

In [None]:
# read phoW2V file
def load_embedding_matrix(path='word2vec_vi_words_300dims/word2vec_vi_words_300dims.txt',
                          word_index=None, embed_size=None):
    """Build the embedding matrix for the tokenizer vocabulary.

    The vector file is streamed line by line and only vectors of words present
    in ``word_index`` are parsed, so the full pre-trained vocabulary is never
    held in memory.

    Args:
        path: Text file with one ``word v1 v2 ... vN`` entry per line.
        word_index: Mapping ``word -> row index``. Defaults to
            ``tokenizer.word_index`` (resolved at call time).
        embed_size: Number of components per vector. Defaults to the
            module-level ``EMBED_SIZE``.

    Returns:
        Tuple ``(embedding_matrix, word_index, num_words, max_features)``.
        ``embedding_matrix`` has shape ``(len(word_index) + 1, embed_size)``;
        rows of words missing from the file stay all-zero. ``max_features``
        equals ``num_words``.
    """
    if word_index is None:
        word_index = tokenizer.word_index
    if embed_size is None:
        embed_size = EMBED_SIZE

    num_words = len(word_index) + 1
    max_features = num_words
    embedding_matrix = np.zeros((num_words, embed_size))

    with open(path, encoding='utf8') as f:
        for line in f:
            # Split from the right so the last ``embed_size`` fields are the
            # vector and whatever precedes them is the word.
            parts = line.rstrip().rsplit(' ', embed_size)

            # Skip the "<count> <dim>" header and malformed lines. Previously
            # the header was stored as a word with a 1-element vector, which
            # would broadcast across a whole row if that token was in the
            # vocabulary.
            if len(parts) != embed_size + 1:
                continue

            row = word_index.get(parts[0])
            if row is None or not 0 <= row < num_words:
                continue

            embedding_matrix[row] = np.asarray(parts[1:], dtype='float32')

    return embedding_matrix, word_index, num_words, max_features

In [None]:
# Module-level embedding artefacts; build_model_DL() reads EMBEDDING_MATRIX and MAX_FEATURES.
EMBEDDING_MATRIX, WORD_INDEX, NUM_WORDS, MAX_FEATURES = load_embedding_matrix()

## Define Models

In [None]:
def build_model_DL(model_name):
    """Build and compile one of the two classifiers.

    Both variants share the same front-end (input of ``MAX_LEN`` token ids,
    an embedding layer initialised from ``EMBEDDING_MATRIX``, spatial dropout)
    and the same head (dropout, ``NUM_CLASSES``-way softmax).

    Args:
        model_name: ``'textcnn'`` or ``'lstm'``.

    Returns:
        The compiled Keras ``Model``, or ``None`` (after printing a message)
        when ``model_name`` is not recognised.
    """
    # Guard clause: reject unknown names before any layer is created.
    if model_name not in ('textcnn', 'lstm'):
        print('model_name error~!')
        return None

    # Shared front-end: input & embedding layer
    inp = Input(shape=(MAX_LEN,))
    embedded = Embedding(MAX_FEATURES, EMBED_SIZE, weights=[EMBEDDING_MATRIX])(inp)
    embedded = SpatialDropout1D(0.4)(embedded)

    if model_name == 'textcnn':
        filter_sizes = (2, 3)
        num_filters = 32

        # Add a channel axis so each n-gram filter spans the full embedding width.
        grid = Reshape((MAX_LEN, EMBED_SIZE, 1))(embedded)

        # One convolution per n-gram size
        conv_maps = [
            Conv2D(num_filters, kernel_size=(size, EMBED_SIZE), kernel_initializer='normal',
                   activation='elu')(grid)
            for size in filter_sizes
        ]

        # Max-over-time pooling: the pool window covers every position of the conv output.
        pooled = [
            MaxPool2D(pool_size=(MAX_LEN - size + 1, 1))(conv_map)
            for size, conv_map in zip(filter_sizes, conv_maps)
        ]

        features = Concatenate(axis=1)(pooled)
        features = Flatten()(features)
    else:
        num_units = 64

        # Two stacked LSTM layers; only the second collapses the sequence.
        features = LSTM(num_units, return_sequences=True)(embedded)
        features = LSTM(num_units)(features)

    # Shared head: dropout followed by the softmax classifier
    features = Dropout(0.1)(features)
    outp = Dense(NUM_CLASSES, activation="softmax")(features)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

## Training

In [None]:
from keras.callbacks import EarlyStopping

In [None]:
# Instantiate both compiled (untrained) classifiers.
text_cnn = build_model_DL('textcnn')
lstm = build_model_DL('lstm')

### TextCNN

In [None]:
# Stop once validation accuracy has not improved for 5 consecutive epochs and
# restore the weights of the best-scoring epoch.
es = EarlyStopping(
    monitor='val_accuracy',
    verbose=1,
    patience=5,
    mode='max',
    restore_best_weights=True
)

# Text CNN
text_cnn.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=2,
    callbacks=[es]  # Keras documents `callbacks` as a list of Callback instances
)

# save model
save_model(text_cnn, MODELS_PATH + 'model_2/textcnn_model2.h5')

Epoch 1/20
43/43 - 21s - loss: 1.2392 - accuracy: 0.4474 - val_loss: 1.1670 - val_accuracy: 0.5581 - 21s/epoch - 491ms/step
Epoch 2/20
43/43 - 4s - loss: 1.0479 - accuracy: 0.6036 - val_loss: 1.0109 - val_accuracy: 0.6113 - 4s/epoch - 94ms/step
Epoch 3/20
43/43 - 5s - loss: 0.8074 - accuracy: 0.7041 - val_loss: 0.8863 - val_accuracy: 0.6678 - 5s/epoch - 112ms/step
Epoch 4/20
43/43 - 3s - loss: 0.6262 - accuracy: 0.7697 - val_loss: 0.8717 - val_accuracy: 0.6645 - 3s/epoch - 63ms/step
Epoch 5/20
43/43 - 1s - loss: 0.5002 - accuracy: 0.8357 - val_loss: 0.8678 - val_accuracy: 0.6578 - 1s/epoch - 25ms/step
Epoch 6/20
43/43 - 1s - loss: 0.3808 - accuracy: 0.8757 - val_loss: 0.9146 - val_accuracy: 0.6678 - 1s/epoch - 25ms/step
Epoch 7/20
43/43 - 1s - loss: 0.2922 - accuracy: 0.9197 - val_loss: 0.9710 - val_accuracy: 0.6478 - 1s/epoch - 26ms/step
Epoch 8/20
Restoring model weights from the end of the best epoch: 3.
43/43 - 1s - loss: 0.2242 - accuracy: 0.9413 - val_loss: 1.0160 - val_accuracy:

### LSTM

In [None]:
# Fresh early-stopping callback for this run: stop once validation accuracy
# has not improved for 5 consecutive epochs and restore the best epoch's weights.
es = EarlyStopping(
    monitor='val_accuracy',
    verbose=1,
    patience=5,
    mode='max',
    restore_best_weights=True
)

# LSTM
lstm.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=2,
    callbacks=[es]  # Keras documents `callbacks` as a list of Callback instances
)

# save model
save_model(lstm, MODELS_PATH + 'model_2/lstm_model2.h5')

Epoch 1/20
43/43 - 12s - loss: 1.2497 - accuracy: 0.4184 - val_loss: 1.1896 - val_accuracy: 0.4385 - 12s/epoch - 274ms/step
Epoch 2/20
43/43 - 4s - loss: 0.9860 - accuracy: 0.5922 - val_loss: 0.8984 - val_accuracy: 0.6512 - 4s/epoch - 83ms/step
Epoch 3/20
43/43 - 4s - loss: 0.7063 - accuracy: 0.7356 - val_loss: 0.8972 - val_accuracy: 0.6877 - 4s/epoch - 94ms/step
Epoch 4/20
43/43 - 2s - loss: 0.5067 - accuracy: 0.8119 - val_loss: 0.8931 - val_accuracy: 0.6578 - 2s/epoch - 49ms/step
Epoch 5/20
43/43 - 2s - loss: 0.3778 - accuracy: 0.8673 - val_loss: 1.0392 - val_accuracy: 0.6611 - 2s/epoch - 38ms/step
Epoch 6/20
43/43 - 1s - loss: 0.2902 - accuracy: 0.8999 - val_loss: 1.1780 - val_accuracy: 0.6512 - 689ms/epoch - 16ms/step
Epoch 7/20
43/43 - 2s - loss: 0.2426 - accuracy: 0.9175 - val_loss: 1.2452 - val_accuracy: 0.6478 - 2s/epoch - 39ms/step
Epoch 8/20
Restoring model weights from the end of the best epoch: 3.
43/43 - 1s - loss: 0.1890 - accuracy: 0.9292 - val_loss: 1.2402 - val_accurac

# Evaluation

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Reload both models from the .h5 files written by the training cells, so
# evaluation runs against the persisted models.
text_cnn = load_model(MODELS_PATH + 'model_2/textcnn_model2.h5')
lstm = load_model(MODELS_PATH + 'model_2/lstm_model2.h5')

In [None]:
def transform_label(y):
    """Return, for each row of ``y``, the index of its largest entry as a list."""
    return list(map(np.argmax, y))

def evaluate(model, X, y):
    """Score ``model`` on ``(X, y)``.

    Args:
        model: Object whose ``predict(X)`` returns one score row per sample.
        X: Model inputs.
        y: One row per sample; the argmax of each row is used as the true class.

    Returns:
        ``[accuracy, precision, recall, f1]``, each rounded to 2 decimals;
        precision, recall and f1 are weighted averages over the classes.
    """
    predicted = transform_label(model.predict(X))
    expected = transform_label(y)

    scores = [accuracy_score(expected, predicted)]
    for metric in (precision_score, recall_score, f1_score):
        scores.append(metric(expected, predicted, average='weighted'))

    return [round(score, 2) for score in scores]

In [None]:
def get_result(X, y):
    """Evaluate the global ``text_cnn`` and ``lstm`` models on one split.

    Returns:
        DataFrame with one row per model (``text_cnn``, ``lstm``) and the
        columns ``accuracy``, ``precision``, ``recall``, ``f1``.
    """
    scores_by_model = {
        'text_cnn': evaluate(text_cnn, X, y),
        'lstm': evaluate(lstm, X, y),
    }

    return pd.DataFrame(
        list(scores_by_model.values()),
        columns=['accuracy', 'precision', 'recall', 'f1'],
        index=list(scores_by_model.keys())
    )

In [None]:
# Scores on the training split (displayed only, not written to disk).
re_train = get_result(X_train, y_train)
re_train



Unnamed: 0,accuracy,precision,recall,f1
text_cnn,0.78,0.79,0.78,0.76
lstm,0.83,0.83,0.83,0.82


In [None]:
# Scores on the validation split, also saved as CSV under RESULTS_PATH.
re_val = get_result(X_val, y_val)
re_val.to_csv(RESULTS_PATH + 'model2_val_dl.csv')
re_val



Unnamed: 0,accuracy,precision,recall,f1
text_cnn,0.67,0.67,0.67,0.64
lstm,0.69,0.68,0.69,0.66


In [None]:
# Scores on the test split, also saved as CSV under RESULTS_PATH.
re_test = get_result(X_test, y_test)
re_test.to_csv(RESULTS_PATH + 'model2_test_dl.csv')
re_test



Unnamed: 0,accuracy,precision,recall,f1
text_cnn,0.66,0.66,0.66,0.64
lstm,0.69,0.69,0.69,0.67
