In [2]:
# Mount Google Drive at /content/drive; the dataset, model and result paths
# used later in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%%capture

# phoW2V
# Download the word-level 300-dimension embedding archive into the working directory.
!wget https://public.vinai.io/word2vec_vi_words_300dims.zip

# Unpack the archive into ./word2vec_vi_words_300dims/, where
# load_embedding_matrix() later reads the .txt file from.
import zipfile
with zipfile.ZipFile('word2vec_vi_words_300dims.zip', 'r') as zip_ref:
    zip_ref.extractall('word2vec_vi_words_300dims')

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import pickle

# Hyperparameters

In [4]:
# Width of the one-hot label vectors and of the softmax output layer.
NUM_CLASSES = 3
# Length every tokenised sentence is padded/truncated to (also the model input length).
MAX_LEN = 20
# Output dimension of the Embedding layers.
EMBED_SIZE = 300

# Batch size handed to fit().
BATCH_SIZE = 64
# Upper bound on training epochs handed to fit(); early stopping may end training sooner.
EPOCHS = 20

In [5]:
# Pre-processed CSV splits read by load_data().
TRAIN_PATH = '/content/drive/MyDrive/Public/CS221 - Natural Language Processing/uit_vsmec_processed/train_processed.csv'
VAL_PATH = '/content/drive/MyDrive/Public/CS221 - Natural Language Processing/uit_vsmec_processed/val_processed.csv'
TEST_PATH = '/content/drive/MyDrive/Public/CS221 - Natural Language Processing/uit_vsmec_processed/test_processed.csv'

# Directory prefixes (note the trailing '/'): file names are appended by plain
# string concatenation, e.g. MODELS_PATH + 'model_3/tokenizer.pkl'.
MODELS_PATH = '/content/drive/MyDrive/Public/CS221 - Natural Language Processing/models/'
RESULTS_PATH = '/content/drive/MyDrive/Public/CS221 - Natural Language Processing/results/'

In [6]:
# Emotion labels kept by load_data(); rows carrying any other label are dropped.
GROUP_2 = ['Enjoyment', 'Surprise', 'Other'] # 1

# Load data


In [7]:
def load_data(path, groups=None):
    """Load one processed split and keep only the rows whose emotion is wanted.

    Parameters
    ----------
    path : str
        CSV file containing at least the columns ``cleaned_sentence`` and
        ``emotion``.
    groups : iterable of str, optional
        Emotion labels to keep. When omitted, the module-level ``GROUP_2``
        list is used.

    Returns
    -------
    X : pandas.Series
        The ``cleaned_sentence`` column with missing values replaced by ``''``.
    y : pandas.Series
        The ``emotion`` column with surrounding whitespace removed, so the
        returned labels are exactly the values the filter matched on.
    """
    if groups is None:
        groups = GROUP_2

    data = pd.read_csv(path)

    # Assign the result back instead of calling fillna(inplace=True) on the
    # attribute: the chained in-place form is deprecated and does not modify
    # `data` once pandas Copy-on-Write is active.
    data['cleaned_sentence'] = data['cleaned_sentence'].fillna('')

    # Normalise the labels once, so that a value such as ' Surprise ' both
    # passes the filter and is returned in its clean form.
    data['emotion'] = data['emotion'].str.strip()

    # filter y; drop=True avoids carrying the old index along as a column
    data = data[data['emotion'].isin(groups)].reset_index(drop=True)

    X = data['cleaned_sentence']
    y = data['emotion']

    return X, y

In [8]:
# Read the three splits; each call returns the sentence column and the
# emotion column of the rows whose label is in GROUP_2.
X_train, y_train = load_data(TRAIN_PATH)
X_val, y_val = load_data(VAL_PATH)
X_test, y_test = load_data(TEST_PATH)

# Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# load
with open(MODELS_PATH + 'model_3/le.pkl', 'rb') as f:
    le = pickle.load(f)

In [None]:
# Map the emotion labels through the loaded encoder.
# NOTE: the y_* names are overwritten, so this cell is not idempotent -
# re-run the load_data cell before running it a second time.
y_train = le.transform(y_train)
y_val = le.transform(y_val)
y_test = le.transform(y_test)

# Prepare Data

In [9]:
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical, pad_sequences

In [10]:
# Tokenizer fitted on the training sentences only; text is lower-cased.
# The filter list contains no '_', so tokens containing an underscore are
# kept in one piece.
# The backslash is spelled '\\' because '\]' is not a valid escape sequence
# in a normal string literal; the resulting filter string is unchanged.
tokenizer = Tokenizer(num_words=None, lower=True, filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~\t\n')
tokenizer.fit_on_texts(X_train)
# save
with open(MODELS_PATH + 'model_3/tokenizer.pkl', "wb") as file:
    pickle.dump(tokenizer, file)

In [None]:
def prepare_data(X, y, num_classes=NUM_CLASSES):
    """Turn raw sentences and integer labels into model-ready arrays.

    The sentences are converted to index sequences with the module-level
    ``tokenizer`` and passed through ``pad_sequences`` with ``maxlen=MAX_LEN``;
    the labels are one-hot encoded over ``num_classes`` columns.

    Returns the pair ``(padded_sequences, one_hot_labels)``.
    """
    token_ids = tokenizer.texts_to_sequences(X)
    padded = pad_sequences(token_ids, maxlen=MAX_LEN)
    one_hot = to_categorical(y, num_classes=num_classes)
    return padded, one_hot

In [None]:
# Tokenise/pad the sentences and one-hot encode the labels of every split.
# NOTE: the X_*/y_* names are overwritten, so this cell is not idempotent -
# re-run the loading and label-encoding cells before running it again.
X_train, y_train = prepare_data(X_train, y_train)
X_val, y_val = prepare_data(X_val, y_val)
X_test, y_test = prepare_data(X_test, y_test)

# Deep Learning

* Feature Extraction:
    - phoW2V
* Models:
    - LSTM
    - TextCNN

In [None]:
# Deep Learning
from keras.models import Sequential, Model, save_model, load_model
from keras.layers import (
    Input, Reshape, Concatenate, Flatten,
    Embedding,
    Conv2D, MaxPool2D, GlobalMaxPooling1D, GlobalAveragePooling1D,
    LSTM, Bidirectional,
    Dropout, SpatialDropout1D,
    Dense,
)
from keras.optimizers import Adam
from keras import backend as K

## phoW2V

In [None]:
# read phoW2V file
def load_embedding_matrix(path='word2vec_vi_words_300dims/word2vec_vi_words_300dims.txt',
                          word_index=None, embed_size=None):
    """Build the embedding matrix for the tokenizer vocabulary.

    Parameters
    ----------
    path : str, optional
        Text file with one ``word v1 v2 ... vN`` entry per line, fields
        separated by single spaces.
    word_index : dict, optional
        Mapping ``word -> row index``. Defaults to ``tokenizer.word_index``.
    embed_size : int, optional
        Number of vector components per word. Defaults to ``EMBED_SIZE``.

    Returns
    -------
    embedding_matrix : numpy.ndarray, shape ``(num_words, embed_size)``
        Row ``i`` holds the vector of the word whose index is ``i``; rows of
        words missing from the file (and row 0) stay all-zero.
    word_index : dict
        The mapping that was used.
    num_words : int
        ``len(word_index) + 1``.
    max_features : int
        Same value as ``num_words`` (kept for the callers that unpack it).
    """
    if word_index is None:
        word_index = tokenizer.word_index
    if embed_size is None:
        embed_size = EMBED_SIZE

    num_words = len(word_index) + 1
    max_features = num_words
    embedding_matrix = np.zeros((num_words, embed_size))

    with open(path, encoding='utf8') as f:
        for line in f:
            values = line.rstrip().split(' ')

            # Skip anything that is not "word + embed_size numbers". This drops
            # the "<vocab> <dim>" header line, which would otherwise be stored
            # as a one-element vector and broadcast over a whole matrix row if
            # a vocabulary token happened to equal the vocab count.
            if len(values) != embed_size + 1:
                continue

            # Only parse vectors of words we actually need; this avoids keeping
            # the full embedding vocabulary in memory.
            row = word_index.get(values[0])
            if row is None:
                continue

            embedding_matrix[row] = np.asarray(values[1:], dtype='float32')

    return embedding_matrix, word_index, num_words, max_features

In [None]:
# Build the embedding matrix once; MAX_FEATURES and EMBEDDING_MATRIX are read
# by the Embedding layers created in build_model_DL().
EMBEDDING_MATRIX, WORD_INDEX, NUM_WORDS, MAX_FEATURES = load_embedding_matrix()

## Define Models

In [None]:
def build_model_DL(model_name):
    """Build and compile one of the two classifiers used in this notebook.

    ``model_name`` selects the architecture:

    * ``'textcnn'`` - two Conv2D branches (window heights 2 and 3, 32 filters
      each) over the embedded sequence, max-pooled, concatenated and flattened.
    * ``'lstm'`` - two stacked LSTM layers with 64 units each.

    Both variants share the same front end (input of length ``MAX_LEN``, an
    Embedding layer initialised with ``EMBEDDING_MATRIX``, spatial dropout of
    0.4) and the same tail (dropout of 0.1, softmax over ``NUM_CLASSES``), and
    are compiled with categorical cross-entropy, the 'adam' optimizer and the
    accuracy metric.

    For any other name an error message is printed and ``None`` is returned.
    """
    if model_name not in ('textcnn', 'lstm'):
        print('model_name error~!')
        return None

    # Input & embedding layer (identical for both architectures)
    inp = Input(shape=(MAX_LEN,))
    embedded = Embedding(MAX_FEATURES, EMBED_SIZE, weights=[EMBEDDING_MATRIX])(inp)
    embedded = SpatialDropout1D(0.4)(embedded)

    if model_name == 'textcnn':
        FILTER_SIZES = [2, 3]
        NUM_FILTERS = 32

        # Add a channel axis so the sequence can be fed to 2-D convolutions.
        grid = Reshape((MAX_LEN, EMBED_SIZE, 1))(embedded)

        # One convolution per window height, each spanning the full embedding width.
        conv_maps = [
            Conv2D(NUM_FILTERS, kernel_size=(height, EMBED_SIZE),
                   kernel_initializer='normal', activation='elu')(grid)
            for height in FILTER_SIZES
        ]

        # Pool over every window position of the matching convolution output.
        pooled = [
            MaxPool2D(pool_size=(MAX_LEN - height + 1, 1))(fmap)
            for height, fmap in zip(FILTER_SIZES, conv_maps)
        ]

        merged = Concatenate(axis=1)(pooled)
        merged = Flatten()(merged)
        features = Dropout(0.1)(merged)
    else:
        NUM_UNITS = 64

        # The first LSTM returns the whole sequence so the second one can consume it.
        hidden = LSTM(NUM_UNITS, return_sequences=True)(embedded)
        hidden = LSTM(NUM_UNITS)(hidden)
        features = Dropout(0.1)(hidden)

    outp = Dense(NUM_CLASSES, activation="softmax")(features)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

## Training

In [None]:
from keras.callbacks import EarlyStopping

In [None]:
# Instantiate one freshly compiled model per architecture.
text_cnn = build_model_DL('textcnn')
lstm = build_model_DL('lstm')

### TextCNN

In [None]:
# Stop training once val_accuracy has not improved for 5 epochs and restore
# the weights of the best epoch.
es = EarlyStopping(
    monitor='val_accuracy',
    verbose=1,
    patience=5,
    mode='max',
    restore_best_weights=True
)

# Text CNN
# `callbacks` is documented as a list of callback instances, so the single
# EarlyStopping object is wrapped in a list.
text_cnn.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=2,
    callbacks=[es]
)

# save model
save_model(text_cnn, MODELS_PATH + 'model_3/textcnn_model3.h5')

Epoch 1/20
45/45 - 19s - loss: 0.8957 - accuracy: 0.5562 - val_loss: 0.8298 - val_accuracy: 0.6442 - 19s/epoch - 433ms/step
Epoch 2/20
45/45 - 5s - loss: 0.7570 - accuracy: 0.6735 - val_loss: 0.7286 - val_accuracy: 0.6701 - 5s/epoch - 119ms/step
Epoch 3/20
45/45 - 3s - loss: 0.6213 - accuracy: 0.7494 - val_loss: 0.6660 - val_accuracy: 0.6883 - 3s/epoch - 59ms/step
Epoch 4/20
45/45 - 2s - loss: 0.5026 - accuracy: 0.8033 - val_loss: 0.6303 - val_accuracy: 0.7117 - 2s/epoch - 37ms/step
Epoch 5/20
45/45 - 2s - loss: 0.3787 - accuracy: 0.8649 - val_loss: 0.6306 - val_accuracy: 0.7039 - 2s/epoch - 37ms/step
Epoch 6/20
45/45 - 2s - loss: 0.2968 - accuracy: 0.9046 - val_loss: 0.6684 - val_accuracy: 0.6909 - 2s/epoch - 34ms/step
Epoch 7/20
45/45 - 1s - loss: 0.2291 - accuracy: 0.9241 - val_loss: 0.7211 - val_accuracy: 0.6857 - 1s/epoch - 29ms/step
Epoch 8/20
45/45 - 1s - loss: 0.1714 - accuracy: 0.9472 - val_loss: 0.7766 - val_accuracy: 0.6831 - 1s/epoch - 28ms/step
Epoch 9/20
Restoring model w

### LSTM

In [None]:
# Fresh EarlyStopping instance for this run: stop once val_accuracy has not
# improved for 5 epochs and restore the weights of the best epoch.
es = EarlyStopping(
    monitor='val_accuracy',
    verbose=1,
    patience=5,
    mode='max',
    restore_best_weights=True
)

# LSTM
# `callbacks` is documented as a list of callback instances, so the single
# EarlyStopping object is wrapped in a list.
lstm.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=2,
    callbacks=[es]
)

# save model
save_model(lstm, MODELS_PATH + 'model_3/lstm_model3.h5')

Epoch 1/20
45/45 - 12s - loss: 0.9182 - accuracy: 0.5445 - val_loss: 0.8312 - val_accuracy: 0.6052 - 12s/epoch - 270ms/step
Epoch 2/20
45/45 - 3s - loss: 0.7510 - accuracy: 0.6639 - val_loss: 0.7644 - val_accuracy: 0.6442 - 3s/epoch - 71ms/step
Epoch 3/20
45/45 - 2s - loss: 0.5598 - accuracy: 0.7657 - val_loss: 0.7380 - val_accuracy: 0.6571 - 2s/epoch - 37ms/step
Epoch 4/20
45/45 - 2s - loss: 0.4266 - accuracy: 0.8334 - val_loss: 0.7493 - val_accuracy: 0.6779 - 2s/epoch - 52ms/step
Epoch 5/20
45/45 - 2s - loss: 0.3260 - accuracy: 0.8773 - val_loss: 0.7687 - val_accuracy: 0.6883 - 2s/epoch - 46ms/step
Epoch 6/20
45/45 - 1s - loss: 0.2464 - accuracy: 0.9100 - val_loss: 0.9671 - val_accuracy: 0.6494 - 1s/epoch - 33ms/step
Epoch 7/20
45/45 - 1s - loss: 0.1860 - accuracy: 0.9348 - val_loss: 1.0401 - val_accuracy: 0.6805 - 1s/epoch - 23ms/step
Epoch 8/20
45/45 - 1s - loss: 0.1592 - accuracy: 0.9383 - val_loss: 1.2661 - val_accuracy: 0.6701 - 1s/epoch - 25ms/step
Epoch 9/20
45/45 - 1s - loss:

# Evaluation

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Reload both models from the .h5 files written by the training cells, so the
# evaluation below runs against what was saved to disk.
text_cnn = load_model(MODELS_PATH + 'model_3/textcnn_model3.h5')
lstm = load_model(MODELS_PATH + 'model_3/lstm_model3.h5')

In [None]:
def transform_label(y):
    """Return a list holding, for each row of ``y``, the index of its largest entry."""
    labels = []
    for row in y:
        labels.append(np.argmax(row))
    return labels

def evaluate(model, X, y):
    """Score ``model`` on ``(X, y)``.

    ``model.predict(X)`` and the one-hot ``y`` are both reduced to class
    indices with ``transform_label``. Returns
    ``[accuracy, precision, recall, f1]``, each rounded to two decimals;
    precision, recall and F1 use ``average='weighted'``.
    """
    y_pred = transform_label(model.predict(X))
    y_true = transform_label(y)

    scores = [accuracy_score(y_true, y_pred)]
    for metric in (precision_score, recall_score, f1_score):
        scores.append(metric(y_true, y_pred, average='weighted'))

    return [round(score, 2) for score in scores]

In [None]:
def get_result(X, y):
    """Evaluate the module-level ``text_cnn`` and ``lstm`` models on ``(X, y)``.

    Returns a DataFrame with one row per model (index ``'text_cnn'``,
    ``'lstm'``) and the columns ``accuracy``, ``precision``, ``recall``, ``f1``.
    """
    metric_names = ['accuracy', 'precision', 'recall', 'f1']

    scores = {}
    scores['text_cnn'] = evaluate(text_cnn, X, y)
    scores['lstm'] = evaluate(lstm, X, y)

    table = pd.DataFrame(
        list(scores.values()),
        columns=metric_names,
        index=list(scores.keys())
    )
    return table

In [None]:
# Scores on the training split (displayed only, not written to disk).
re_train = get_result(X_train, y_train)
re_train



Unnamed: 0,accuracy,precision,recall,f1
text_cnn,0.88,0.88,0.88,0.87
lstm,0.93,0.93,0.93,0.93


In [None]:
# Scores on the validation split, also saved as CSV under RESULTS_PATH.
re_val = get_result(X_val, y_val)
re_val.to_csv(RESULTS_PATH + 'model3_val_dl.csv')
re_val



Unnamed: 0,accuracy,precision,recall,f1
text_cnn,0.71,0.72,0.71,0.7
lstm,0.69,0.69,0.69,0.69


In [None]:
# Scores on the test split, also saved as CSV under RESULTS_PATH.
re_test = get_result(X_test, y_test)
re_test.to_csv(RESULTS_PATH + 'model3_test_dl.csv')
re_test



Unnamed: 0,accuracy,precision,recall,f1
text_cnn,0.68,0.7,0.68,0.67
lstm,0.69,0.69,0.69,0.68
