In [2]:
# Mount Google Drive at /content/drive; the dataset, model and result paths
# configured later in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%%capture

# phoW2V
!wget https://public.vinai.io/word2vec_vi_words_300dims.zip

import zipfile
with zipfile.ZipFile('word2vec_vi_words_300dims.zip', 'r') as zip_ref:
    zip_ref.extractall('word2vec_vi_words_300dims')

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import pickle

# Hyperparameters

In [5]:
# Number of emotion classes: width of the one-hot targets and of the softmax output layer.
NUM_CLASSES = 7
# Every tokenized sentence is padded/truncated to this many tokens (also the model input length).
MAX_LEN = 20
# Dimension of the word embeddings fed to the Embedding layer.
EMBED_SIZE = 300

# Mini-batch size passed to fit().
BATCH_SIZE = 64
# Maximum number of training epochs passed to fit(); early stopping may end training sooner.
EPOCHS = 20

In [6]:
# All inputs and outputs sit under one Drive folder, so the individual
# locations are derived from that single prefix.
_PROJECT_DIR = '/content/drive/MyDrive/Public/CS221 - Natural Language Processing/'
_DATA_DIR = _PROJECT_DIR + 'uit_vsmec_processed/'

# Pre-processed dataset splits (CSV)
TRAIN_PATH = _DATA_DIR + 'train_processed.csv'
VAL_PATH = _DATA_DIR + 'val_processed.csv'
TEST_PATH = _DATA_DIR + 'test_processed.csv'

# Output folders for saved models and result tables
MODELS_PATH = _PROJECT_DIR + 'models/'
RESULTS_PATH = _PROJECT_DIR + 'results/'

# Load data


In [7]:
def load_data(path):
    """Read one processed split from CSV and return its sentences and labels.

    Args:
        path: Location of a CSV file that has `cleaned_sentence` and
            `emotion` columns.

    Returns:
        (X, y): two lists of equal length - the sentences, with missing
        values replaced by the empty string, and the matching emotion labels.
    """
    data = pd.read_csv(path)
    # Use the Series returned by fillna() rather than fillna(inplace=True) on
    # the attribute: the in-place form operates on an intermediate object, is
    # flagged by pandas as chained assignment, and does not update `data`
    # under copy-on-write - which would leave NaN values in X.
    sentences = data['cleaned_sentence'].fillna('')
    X = sentences.values.tolist()
    y = data['emotion'].values.tolist()
    return X, y

In [8]:
# Read the three splits in order; each load_data call yields (sentences, labels).
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    load_data(split_path) for split_path in (TRAIN_PATH, VAL_PATH, TEST_PATH)
)

# Label Encoder

In [9]:
from sklearn.preprocessing import LabelEncoder

In [10]:
# Fit the label encoder on the training labels only, then encode every split
# with that same mapping.
le = LabelEncoder()
le.fit(y_train)

y_train = le.transform(y_train)
y_val = le.transform(y_val)
y_test = le.transform(y_test)

# save
# pickle.dump() returns None, so its result must not be assigned back to `le`:
# doing so would replace the fitted encoder with None for the rest of the session.
with open(MODELS_PATH + 'baseline/le.pkl', 'wb') as f:
    pickle.dump(le, f)

# Prepare Data

In [11]:
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical, pad_sequences

In [12]:
# Word-level tokenizer whose vocabulary is built from the training sentences only.
# `filters` lists the characters stripped from the text; '_' and "'" are not in
# the list, so tokens containing them are kept intact.
# The backslash is spelled '\\' because '\]' is an invalid escape sequence in a
# normal string literal (it triggers a warning on current Python versions); the
# resulting set of filtered characters is exactly the same as before.
tokenizer = Tokenizer(num_words=None, lower=True, filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~\t\n')
tokenizer.fit_on_texts(X_train)

In [13]:
def prepare_data(X, y, num_classes=NUM_CLASSES):
    """Turn raw sentences and integer labels into model-ready arrays.

    Args:
        X: Sentences to encode with the module-level `tokenizer`.
        y: Integer class ids, one per sentence.
        num_classes: Width of the one-hot label vectors.

    Returns:
        (padded, one_hot): the token-id sequences padded/truncated to MAX_LEN,
        and the one-hot encoded labels.
    """
    token_ids = tokenizer.texts_to_sequences(X)
    padded = pad_sequences(token_ids, maxlen=MAX_LEN)
    one_hot = to_categorical(y, num_classes=num_classes)
    return padded, one_hot

In [14]:
# Encode every split. The names are rebound from raw text/labels to arrays, so
# this cell is meant to run once per kernel session: running it again would feed
# the already-encoded arrays back into prepare_data.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    prepare_data(sentences, labels)
    for sentences, labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

# Deep Learning

* Feature Extraction:
    - PhoW2V (pre-trained 300-dimensional Vietnamese word embeddings)
* Models:
    - LSTM
    - TextCNN

In [15]:
# Deep Learning
from keras.models import Sequential, Model, save_model, load_model
from keras.layers import (
    Input, Reshape, Concatenate, Flatten,
    Embedding,
    Conv2D, MaxPool2D, GlobalMaxPooling1D, GlobalAveragePooling1D,
    LSTM, Bidirectional,
    Dropout, SpatialDropout1D,
    Dense,
)
from keras.optimizers import Adam
from keras import backend as K

## PhoW2V

In [16]:
# read phoW2V file
def load_embedding_matrix(
    path='word2vec_vi_words_300dims/word2vec_vi_words_300dims.txt',
    embed_size=300,
    word_index=None,
):
    """Build the embedding matrix for a vocabulary from a word2vec text file.

    Each line of the file is expected to hold a word followed by `embed_size`
    space-separated numbers. Lines with fewer fields (such as a leading
    "<vocab_size> <dim>" header) are skipped, so they can never be mistaken
    for a word vector and broadcast across a matrix row.

    Only vectors of words present in `word_index` are parsed and kept, so the
    full embedding file is streamed instead of being held in memory.

    Args:
        path: Location of the word2vec text file.
        embed_size: Number of dimensions per vector; pass EMBED_SIZE if that
            constant is ever changed from 300.
        word_index: Mapping word -> row index. Defaults to the module-level
            `tokenizer.word_index`.

    Returns:
        (embedding_matrix, word_index, num_words, max_features) where
        `embedding_matrix` has shape (num_words, embed_size), rows of words
        missing from the file stay all-zero, `num_words` is
        len(word_index) + 1 and `max_features` equals `num_words`.
    """
    if word_index is None:
        word_index = tokenizer.word_index
    num_words = len(word_index) + 1
    max_features = num_words
    embedding_matrix = np.zeros((num_words, embed_size))

    with open(path, encoding='utf8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            # Header or malformed line: not a word plus embed_size numbers.
            if len(parts) <= embed_size:
                continue
            # The vector is the last embed_size fields; whatever precedes it is
            # the word (which keeps words containing spaces parseable).
            word = ' '.join(parts[:-embed_size])
            row = word_index.get(word)
            if row is None or row >= max_features:
                continue
            # Parse the numbers only for vocabulary words. A later duplicate
            # line overwrites an earlier one.
            embedding_matrix[row] = np.asarray(parts[-embed_size:], dtype='float32')

    return embedding_matrix, word_index, num_words, max_features

In [17]:
# Module-level results of the embedding load; build_model_DL reads MAX_FEATURES
# and EMBEDDING_MATRIX when it creates the Embedding layer.
EMBEDDING_MATRIX, WORD_INDEX, NUM_WORDS, MAX_FEATURES = load_embedding_matrix()

## Define Models

In [18]:
def build_model_DL(model_name):
    """Build and compile one of the two baseline classifiers.

    Both variants share the same front end - an input of MAX_LEN token ids, an
    Embedding layer initialised from EMBEDDING_MATRIX and SpatialDropout1D(0.4) -
    and the same head: Dropout(0.1) followed by a softmax over NUM_CLASSES.
    They are compiled with categorical cross-entropy, the 'adam' optimizer and
    the accuracy metric.

    Args:
        model_name: 'textcnn' or 'lstm'.

    Returns:
        The compiled Model, or None (after printing a message) when
        `model_name` is neither of the two supported names.
    """
    if model_name not in ('textcnn', 'lstm'):
        print('model_name error~!')
        return None

    # Shared front end: token ids -> embeddings -> spatial dropout
    tokens = Input(shape=(MAX_LEN,))
    embedded = Embedding(MAX_FEATURES, EMBED_SIZE, weights=[EMBEDDING_MATRIX])(tokens)
    embedded = SpatialDropout1D(0.4)(embedded)

    if model_name == 'textcnn':
        kernel_heights = [2, 3]
        n_filters = 32

        # Conv2D expects a trailing channel axis
        grid = Reshape((MAX_LEN, EMBED_SIZE, 1))(embedded)

        # One convolution per kernel height, each spanning the full embedding width
        conv_maps = [
            Conv2D(n_filters, kernel_size=(height, EMBED_SIZE), kernel_initializer='normal',
                   activation='elu')(grid)
            for height in kernel_heights
        ]

        # Pool window of MAX_LEN - height + 1 rows for each feature map
        pooled = [
            MaxPool2D(pool_size=(MAX_LEN - height + 1, 1))(feature_map)
            for height, feature_map in zip(kernel_heights, conv_maps)
        ]

        features = Concatenate(axis=1)(pooled)
        features = Flatten()(features)
    else:
        lstm_units = 64

        # Two stacked LSTMs; only the first one returns the full sequence
        features = LSTM(lstm_units, return_sequences=True)(embedded)
        features = LSTM(lstm_units)(features)

    # Shared classification head
    features = Dropout(0.1)(features)
    probabilities = Dense(NUM_CLASSES, activation="softmax")(features)

    classifier = Model(inputs=tokens, outputs=probabilities)
    classifier.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return classifier

## Training

In [19]:
from keras.callbacks import EarlyStopping

In [20]:
# One freshly initialised, compiled model per architecture (TextCNN first, then LSTM).
text_cnn, lstm = (build_model_DL(arch) for arch in ('textcnn', 'lstm'))

### TextCNN

In [21]:
# Stop once validation accuracy has not improved for 5 consecutive epochs and
# restore the weights of the best epoch.
es = EarlyStopping(
    monitor='val_accuracy',
    verbose=1,
    patience=5,
    mode='max',
    restore_best_weights=True
)

# Text CNN
text_cnn.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=2,
    # fit() documents `callbacks` as a list of callback instances
    callbacks=[es]
)

# save model
save_model(text_cnn, MODELS_PATH + 'baseline/textcnn_baseline.h5')

Epoch 1/20
87/87 - 7s - loss: 1.7542 - accuracy: 0.2979 - val_loss: 1.6464 - val_accuracy: 0.3819 - 7s/epoch - 82ms/step
Epoch 2/20
87/87 - 4s - loss: 1.4912 - accuracy: 0.4634 - val_loss: 1.4147 - val_accuracy: 0.4810 - 4s/epoch - 50ms/step
Epoch 3/20
87/87 - 6s - loss: 1.1981 - accuracy: 0.5865 - val_loss: 1.3005 - val_accuracy: 0.5102 - 6s/epoch - 72ms/step
Epoch 4/20
87/87 - 4s - loss: 0.9573 - accuracy: 0.6781 - val_loss: 1.2654 - val_accuracy: 0.5117 - 4s/epoch - 48ms/step
Epoch 5/20
87/87 - 4s - loss: 0.7584 - accuracy: 0.7484 - val_loss: 1.2958 - val_accuracy: 0.5408 - 4s/epoch - 48ms/step
Epoch 6/20
87/87 - 6s - loss: 0.6001 - accuracy: 0.8061 - val_loss: 1.3215 - val_accuracy: 0.5335 - 6s/epoch - 72ms/step
Epoch 7/20
87/87 - 5s - loss: 0.4650 - accuracy: 0.8612 - val_loss: 1.3837 - val_accuracy: 0.5248 - 5s/epoch - 54ms/step
Epoch 8/20
87/87 - 5s - loss: 0.3733 - accuracy: 0.8895 - val_loss: 1.4513 - val_accuracy: 0.5350 - 5s/epoch - 52ms/step
Epoch 9/20
87/87 - 7s - loss: 0.

### LSTM

In [22]:
# Fresh early-stopping callback for this run: stop once validation accuracy has
# not improved for 5 consecutive epochs and restore the weights of the best epoch.
es = EarlyStopping(
    monitor='val_accuracy',
    verbose=1,
    patience=5,
    mode='max',
    restore_best_weights=True
)

# LSTM
lstm.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=2,
    # fit() documents `callbacks` as a list of callback instances
    callbacks=[es]
)

# save model
save_model(lstm, MODELS_PATH + 'baseline/lstm_baseline.h5')

Epoch 1/20
87/87 - 14s - loss: 1.7364 - accuracy: 0.3079 - val_loss: 1.5329 - val_accuracy: 0.3761 - 14s/epoch - 161ms/step
Epoch 2/20
87/87 - 8s - loss: 1.3913 - accuracy: 0.4786 - val_loss: 1.3868 - val_accuracy: 0.4913 - 8s/epoch - 87ms/step
Epoch 3/20
87/87 - 7s - loss: 1.1050 - accuracy: 0.6063 - val_loss: 1.3449 - val_accuracy: 0.5029 - 7s/epoch - 84ms/step
Epoch 4/20
87/87 - 8s - loss: 0.8771 - accuracy: 0.6911 - val_loss: 1.4093 - val_accuracy: 0.5058 - 8s/epoch - 96ms/step
Epoch 5/20
87/87 - 7s - loss: 0.7125 - accuracy: 0.7511 - val_loss: 1.4220 - val_accuracy: 0.4927 - 7s/epoch - 78ms/step
Epoch 6/20
87/87 - 9s - loss: 0.5635 - accuracy: 0.8102 - val_loss: 1.5536 - val_accuracy: 0.5029 - 9s/epoch - 98ms/step
Epoch 7/20
87/87 - 7s - loss: 0.4626 - accuracy: 0.8408 - val_loss: 1.6406 - val_accuracy: 0.4956 - 7s/epoch - 76ms/step
Epoch 8/20
87/87 - 9s - loss: 0.3696 - accuracy: 0.8800 - val_loss: 1.7423 - val_accuracy: 0.4927 - 9s/epoch - 98ms/step
Epoch 9/20
Restoring model we

# Evaluation

In [23]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [24]:
# Reload both models from the files written by the training cells, so the
# evaluation that follows scores what was actually saved to disk.
text_cnn = load_model(MODELS_PATH + 'baseline/textcnn_baseline.h5')
lstm = load_model(MODELS_PATH + 'baseline/lstm_baseline.h5')

In [25]:
def transform_label(y):
    """Return a list with, for every row of `y`, the index of its largest entry.

    Used to turn one-hot targets or per-class scores back into class ids.
    """
    return list(map(np.argmax, y))

def evaluate(model, X, y):
    """Score `model` on (X, y).

    Args:
        model: Object whose predict(X) returns one row of class scores per sample.
        X: Model inputs.
        y: One-hot encoded ground-truth labels.

    Returns:
        [accuracy, precision, recall, f1], each rounded to two decimals;
        precision, recall and f1 use weighted averaging over the classes.
    """
    y_pred = transform_label(model.predict(X))
    y_true = transform_label(y)

    scores = [accuracy_score(y_true, y_pred)]
    scores.extend(
        metric(y_true, y_pred, average='weighted')
        for metric in (precision_score, recall_score, f1_score)
    )

    return [round(score, 2) for score in scores]

In [26]:
def get_result(X, y):
    """Evaluate both module-level models on (X, y) and tabulate the scores.

    Returns:
        A DataFrame with one row per model ('text_cnn', 'lstm') and the
        columns 'accuracy', 'precision', 'recall' and 'f1'.
    """
    metric_names = ['accuracy', 'precision', 'recall', 'f1']
    # TextCNN is scored first, then the LSTM
    scores_by_model = {
        'text_cnn': evaluate(text_cnn, X, y),
        'lstm': evaluate(lstm, X, y),
    }

    return pd.DataFrame(
        list(scores_by_model.values()),
        columns=metric_names,
        index=list(scores_by_model.keys()),
    )

In [27]:
# Scores on the training split (displayed only; this table is not written to CSV).
re_train = get_result(X_train, y_train)
re_train



Unnamed: 0,accuracy,precision,recall,f1
text_cnn,0.85,0.86,0.85,0.85
lstm,0.79,0.79,0.79,0.79


In [28]:
# Scores on the validation split: written to a CSV under RESULTS_PATH, then displayed.
re_val = get_result(X_val, y_val)
re_val.to_csv(RESULTS_PATH + 'baseline_val_dl.csv')
re_val



Unnamed: 0,accuracy,precision,recall,f1
text_cnn,0.54,0.54,0.54,0.53
lstm,0.51,0.51,0.51,0.5


In [29]:
# Scores on the test split: written to a CSV under RESULTS_PATH, then displayed.
re_test = get_result(X_test, y_test)
re_test.to_csv(RESULTS_PATH + 'baseline_test_dl.csv')
re_test



Unnamed: 0,accuracy,precision,recall,f1
text_cnn,0.54,0.54,0.54,0.54
lstm,0.54,0.56,0.54,0.54
