In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install joblib



In [3]:
import numpy as np
import pandas as pd

from tqdm.auto import tqdm

import joblib
import pickle
from keras import models

from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical, pad_sequences

In [4]:
TRAIN_PATH = '/content/drive/MyDrive/Public/CS221 - Natural Language Processing/uit_vsmec_processed/train_processed.csv'
VAL_PATH = '/content/drive/MyDrive/Public/CS221 - Natural Language Processing/uit_vsmec_processed/val_processed.csv'
TEST_PATH = '/content/drive/MyDrive/Public/CS221 - Natural Language Processing/uit_vsmec_processed/test_processed.csv'

MODELS_PATH = '/content/drive/MyDrive/Public/CS221 - Natural Language Processing/models/'
RESULTS_PATH = '/content/drive/MyDrive/Public/CS221 - Natural Language Processing/results/'

# Load data

In [5]:
def load_data(path):
    data = pd.read_csv(path)
    data.cleaned_sentence.fillna('', inplace=True)

    X = data.cleaned_sentence
    y = data.emotion

    return X, y

In [6]:
X_train, y_train = load_data(TRAIN_PATH)
X_val, y_val = load_data(VAL_PATH)
X_test, y_test = load_data(TEST_PATH)

# Choose the model

In [7]:
def get_score(m, on_set, sort_by):

    path = ''

    # Model
    if m == 'model1':
        path += 'model1_'
    elif m == 'model2':
        path += 'model2_'
    else:
        path += 'model3_'

    # Set
    if on_set == 'val':
        path += 'val_'
    else:
        path += 'test_'

    paths = [RESULTS_PATH + path + 'dl.csv', RESULTS_PATH + path + 'ml.csv']
    re = pd.concat([pd.read_csv(paths[0]), pd.read_csv(paths[1])]).reset_index(drop=True)
    re.columns = ['model', 'accuracy', 'precision', 'recall', 'f1']
    re = re.sort_values(by=sort_by, ascending=False).reset_index(drop=True)

    return re

## Model 1

In [8]:
get_score('model1', 'val', 'accuracy')

Unnamed: 0,model,accuracy,precision,recall,f1
0,svc,0.77,0.83,0.75,0.79
1,text_cnn,0.75,0.75,0.75,0.75
2,lstm,0.75,0.76,0.75,0.75
3,lr,0.75,0.81,0.73,0.77
4,nb,0.74,0.82,0.69,0.75
5,knn,0.72,0.8,0.68,0.73
6,rf,0.71,0.79,0.66,0.72
7,dt,0.65,0.71,0.63,0.67


1. SVM
2. LR, LSTM

In [9]:
get_score('model1', 'test', 'accuracy')

Unnamed: 0,model,accuracy,precision,recall,f1
0,lr,0.76,0.76,0.78,0.77
1,text_cnn,0.75,0.75,0.75,0.75
2,svc,0.75,0.75,0.77,0.76
3,lstm,0.74,0.75,0.74,0.74
4,rf,0.73,0.76,0.71,0.73
5,nb,0.73,0.75,0.72,0.73
6,knn,0.69,0.72,0.67,0.69
7,dt,0.61,0.64,0.57,0.61


1. LR
2. SVM
3. TextCNN

### Chọn SVM và LR

## Model 2

In [10]:
get_score('model2', 'val', 'accuracy')

Unnamed: 0,model,accuracy,precision,recall,f1
0,svc,0.72,0.74,0.72,0.7
1,lr,0.7,0.7,0.7,0.68
2,lstm,0.69,0.68,0.69,0.66
3,text_cnn,0.67,0.67,0.67,0.64
4,knn,0.63,0.65,0.63,0.62
5,rf,0.62,0.62,0.62,0.58
6,nb,0.61,0.61,0.61,0.53
7,dt,0.51,0.52,0.51,0.5


1. SVM
2. LR

In [11]:
get_score('model2', 'test', 'accuracy')

Unnamed: 0,model,accuracy,precision,recall,f1
0,lstm,0.69,0.69,0.69,0.67
1,svc,0.67,0.7,0.67,0.65
2,text_cnn,0.66,0.66,0.66,0.64
3,lr,0.66,0.67,0.66,0.64
4,rf,0.65,0.64,0.65,0.62
5,knn,0.63,0.62,0.63,0.62
6,nb,0.61,0.59,0.61,0.54
7,dt,0.56,0.57,0.56,0.57


1. LSTM
2. SVM

### Chọn SVM và LSTM

## Model 3

In [12]:
get_score('model3', 'val', 'accuracy')

Unnamed: 0,model,accuracy,precision,recall,f1
0,text_cnn,0.71,0.72,0.71,0.7
1,lstm,0.69,0.69,0.69,0.69
2,lr,0.69,0.69,0.69,0.67
3,svc,0.68,0.66,0.68,0.65
4,rf,0.67,0.67,0.67,0.65
5,knn,0.65,0.65,0.65,0.62
6,nb,0.65,0.6,0.65,0.6
7,dt,0.62,0.61,0.62,0.62


1. TextCNN
2. LSTM

In [13]:
get_score('model3', 'test', 'accuracy')

Unnamed: 0,model,accuracy,precision,recall,f1
0,lstm,0.69,0.69,0.69,0.68
1,text_cnn,0.68,0.7,0.68,0.67
2,knn,0.68,0.69,0.68,0.66
3,svc,0.67,0.68,0.67,0.65
4,rf,0.67,0.68,0.67,0.66
5,lr,0.65,0.65,0.65,0.63
6,nb,0.64,0.58,0.64,0.59
7,dt,0.57,0.57,0.57,0.57


1. TextCNN
2. LSTM

### Chọn TextCNN và LSTM

# Predicting

In [14]:
def load_model_(file_name):
    parts = file_name.split('_')
    method = parts[0]
    m = parts[1].split('.')[0]

    path = MODELS_PATH + 'model_' + m[-1] + '/' + file_name

    if method in ['textcnn', 'lstm']:
        return models.load_model(path)
    else:
        return joblib.load(path)

In [15]:
# stage 1 (binary classification)
svc_1 = load_model_('svc_model1.h5')
lr_1 = load_model_('lr_model1.h5')

# stage 2 (4-class classification)
svc_2 = load_model_('svc_model2.h5')
lstm_2 = load_model_('lstm_model2.h5')

# stage 3 (3-class classification)
textcnn_3 = load_model_('textcnn_model3.h5')
lstm_3 = load_model_('lstm_model3.h5')

In [66]:
models_1 = [svc_1, lr_1]        # TF-IDF_1
models_2 = [svc_2, lstm_2]      # TF-IDF_2, tokenizer
models_3 = [textcnn_3, lstm_3]  # tokenizer

model_name_1 = ['svc_1', 'lr_1']
model_name_2 = ['svc_2', 'lstm_2']
model_name_3 = ['textcnn_3', 'lstm_3']

In [46]:
class Predictor():

    def __init__(self, models_1, models_2, models_3):
        self.models_1 = models_1
        self.models_2 = models_2
        self.models_3 = models_3

        self.model_name_1 = ['svc_1', 'lr_1']
        self.model_name_2 = ['svc_2', 'lstm_2']
        self.model_name_3 = ['textcnn_3', 'lstm_3']

        self.tfidf1 = pickle.load(open(MODELS_PATH + 'baseline/tfidf.pkl', 'rb'))
        self.tfidf2 = pickle.load(open(MODELS_PATH + 'model_2/tfidf.pkl', 'rb'))

        self.tokenizer2 = pickle.load(open(MODELS_PATH + 'model_2/tokenizer.pkl', 'rb'))
        self.tokenizer3 = pickle.load(open(MODELS_PATH + 'model_3/tokenizer.pkl', 'rb'))

        self.le2 = pickle.load(open(MODELS_PATH + 'model_2/le.pkl', 'rb')) # for 0
        self.le3 = pickle.load(open(MODELS_PATH + 'model_3/le.pkl', 'rb')) # for 1

    def get_token(self, text, tokenizer):
        tokens = tokenizer.texts_to_sequences(text)
        tokens = pad_sequences(tokens, maxlen=20)
        return tokens

    def stage_1_predict(self, text, idx):
        assert idx == 0 or idx == 1
        features = self.tfidf1.transform(text)
        pred = self.models_1[idx].predict(features)
        return pred

    def stage_2_predict(self, text, idx):
        assert idx == 0 or idx == 1
        if idx == 0:
            features = self.tfidf2.transform(text)
            return self.models_2[idx].predict(features)
        elif idx == 1:
            features = self.get_token(text, self.tokenizer2)
            pred = self.models_2[idx].predict(features)
            return [np.argmax(pred)]

        return pred

    def stage_3_predict(self, text, idx):
        assert idx == 0 or idx == 1
        features = self.get_token(text, self.tokenizer3)
        pred = self.models_3[idx].predict(features)
        pred = [np.argmax(pred)]
        return pred

    def predict(self, text, model_indices):
        s1 = self.stage_1_predict(text, model_indices[0])
        s2 = self.stage_2_predict(text, model_indices[1])
        s3 = self.stage_3_predict(text, model_indices[2])

        if s1 == 0:
            label = self.le2.inverse_transform(s2)
        elif s1 == 1:
            label = self.le3.inverse_transform(s3)
        else:
            label = np.nan

        return (s1, s2, s3), label

    def get_all_predictions(self, X, model_indices):
        predictions = []
        for x in tqdm(X):
            _, pred = predictor.predict([x], model_indices)
            predictions.append(pred.tolist())

        return np.array(predictions).flatten().tolist()

In [47]:
predictor = Predictor(models_1, models_2, models_3)

In [52]:
def get_true_label_and_prediction(X, y, model_indices):
    preds = predictor.get_all_predictions(X, model_indices)
    y = y.values.tolist()
    return y, preds

# Evaluation

In [60]:
from sklearn.preprocessing import LabelEncoder
le4all = LabelEncoder().fit(y_train)

In [61]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [62]:
def evaluate(X, y, model_indices):
    y_true, y_preds = get_true_label_and_prediction(X, y, model_indices)

    acc = round(accuracy_score(y_true, y_preds), 2)
    pre = round(precision_score(y_true, y_preds, average='weighted'), 2)
    recall = round(recall_score(y_true, y_preds, average='weighted'), 2)
    f1 = round(f1_score(y_true, y_preds, average='weighted'), 2)

    return [acc, pre, recall, f1]

In [77]:
def get_results(X, y):
    re = {}
    for i in range(2):
        for j in range(2):
            for k in range(2):
                print([i, j, k])
                key = model_name_1[i] + ' + ' + model_name_2[j] + ' + ' + model_name_3[k]
                re[key] = evaluate(X_val, y_val, [i, j, k])

    df = pd.DataFrame.from_dict(re, orient='index')
    df.columns = ['accuracy', 'precision', 'recall', 'f1']

    return df

### Validation set

In [78]:
val_result = get_results(X_val, y_val)
val_result.to_csv(RESULTS_PATH + 'pipeline_val.csv')
val_result

[0, 0, 0]


  0%|          | 0/686 [00:00<?, ?it/s]

[0, 0, 1]


  0%|          | 0/686 [00:00<?, ?it/s]

[0, 1, 0]


  0%|          | 0/686 [00:00<?, ?it/s]

[0, 1, 1]


  0%|          | 0/686 [00:00<?, ?it/s]

[1, 0, 0]


  0%|          | 0/686 [00:00<?, ?it/s]

[1, 0, 1]


  0%|          | 0/686 [00:00<?, ?it/s]

[1, 1, 0]


  0%|          | 0/686 [00:00<?, ?it/s]

[1, 1, 1]


  0%|          | 0/686 [00:00<?, ?it/s]



Unnamed: 0,accuracy,precision,recall,f1
svc_1 + svc_2 + textcnn_3,0.55,0.56,0.55,0.54
svc_1 + svc_2 + lstm_3,0.53,0.54,0.53,0.53
svc_1 + lstm_2 + textcnn_3,0.54,0.54,0.54,0.53
svc_1 + lstm_2 + lstm_3,0.53,0.53,0.53,0.52
lr_1 + svc_2 + textcnn_3,0.53,0.55,0.53,0.53
lr_1 + svc_2 + lstm_3,0.52,0.53,0.52,0.51
lr_1 + lstm_2 + textcnn_3,0.53,0.53,0.53,0.52
lr_1 + lstm_2 + lstm_3,0.51,0.51,0.51,0.51


### Test set

In [79]:
test_result = get_results(X_test, y_test)
test_result.to_csv(RESULTS_PATH + 'pipeline_test.csv')
test_result

[0, 0, 0]


  0%|          | 0/686 [00:00<?, ?it/s]

[0, 0, 1]


  0%|          | 0/686 [00:00<?, ?it/s]

[0, 1, 0]


  0%|          | 0/686 [00:00<?, ?it/s]

[0, 1, 1]


  0%|          | 0/686 [00:00<?, ?it/s]

[1, 0, 0]


  0%|          | 0/686 [00:00<?, ?it/s]

[1, 0, 1]


  0%|          | 0/686 [00:00<?, ?it/s]

[1, 1, 0]


  0%|          | 0/686 [00:00<?, ?it/s]

[1, 1, 1]


  0%|          | 0/686 [00:00<?, ?it/s]



Unnamed: 0,accuracy,precision,recall,f1
svc_1 + svc_2 + textcnn_3,0.55,0.56,0.55,0.54
svc_1 + svc_2 + lstm_3,0.53,0.54,0.53,0.53
svc_1 + lstm_2 + textcnn_3,0.54,0.54,0.54,0.53
svc_1 + lstm_2 + lstm_3,0.53,0.53,0.53,0.52
lr_1 + svc_2 + textcnn_3,0.53,0.55,0.53,0.53
lr_1 + svc_2 + lstm_3,0.52,0.53,0.52,0.51
lr_1 + lstm_2 + textcnn_3,0.53,0.53,0.53,0.52
lr_1 + lstm_2 + lstm_3,0.51,0.51,0.51,0.51
