In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%%capture

# Joblib
!pip install joblib

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import joblib
import pickle

# Load data

In [4]:
TRAIN_PATH = '/content/drive/MyDrive/Public/CS221 - Natural Language Processing/uit_vsmec_processed/train_processed.csv'
VAL_PATH = '/content/drive/MyDrive/Public/CS221 - Natural Language Processing/uit_vsmec_processed/val_processed.csv'
TEST_PATH = '/content/drive/MyDrive/Public/CS221 - Natural Language Processing/uit_vsmec_processed/test_processed.csv'

MODELS_PATH = '/content/drive/MyDrive/Public/CS221 - Natural Language Processing/models/'
RESULTS_PATH = '/content/drive/MyDrive/Public/CS221 - Natural Language Processing/results/'

In [5]:
GROUP_1 = ['Anger', 'Disgust', 'Sadness', 'Fear'] # 0

In [6]:
def load_data(path):
    data = pd.read_csv(path)
    data.cleaned_sentence.fillna('', inplace=True)

    # filter y
    data = data[data.emotion.str.strip().isin(GROUP_1)].reset_index()

    X = data.cleaned_sentence
    y = data.emotion

    return X, y

In [7]:
X_train, y_train = load_data(TRAIN_PATH)
X_val, y_val = load_data(VAL_PATH)
X_test, y_test = load_data(TEST_PATH)

In [8]:
X_full = np.array(X_train.tolist() + X_val.tolist() + X_test.tolist())

# Label Encoding

In [9]:
from sklearn.preprocessing import LabelEncoder

In [10]:
le = LabelEncoder()
le.fit(y_train)

y_train = le.transform(y_train)
y_val = le.transform(y_val)
y_test = le.transform(y_test)

# save
with open(MODELS_PATH + 'model_2/le.pkl', 'wb') as f:
    le = pickle.dump(le, f)

# Machine Learning

* Feature Extraction: TF-IDF
* Models:
    - Logistic Regression
    - SVM
    - Decision Tree
    - Random Forest
    - kNN
    - Naive Bayes

In [11]:
# Machine Learning
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB

## TF-IDF

In [12]:
tfidf = TfidfVectorizer(analyzer='word', max_features=5000)
tfidf.fit(X_full)

# save
with open(MODELS_PATH + 'model_2/tfidf.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

In [13]:
X_train_tfidf = tfidf.transform(X_train)
X_val_tfidf = tfidf.transform(X_val)
X_test_tfidf = tfidf.transform(X_test)

## Training ML Models


In [14]:
def build_model_ML(model_name):
    if model_name == 'lr':
        clf = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=1000, verbose=True)
    elif model_name == 'svc':
        clf = SVC(verbose=True,)
    elif model_name == 'dt':
        clf = DecisionTreeClassifier()
    elif model_name == 'rf':
        clf = RandomForestClassifier(n_estimators=150, random_state=0)
    elif model_name == 'knn':
        clf = KNeighborsClassifier(n_neighbors=15)
    elif model_name == 'nb':
        clf = MultinomialNB()
    else:
        print('model_name error~!')
        return None

    return clf

In [15]:
# ML
lr = build_model_ML('lr')
svc = build_model_ML('svc')
dt = build_model_ML('dt')
rf = build_model_ML('rf')
knn = build_model_ML('knn')
nb = build_model_ML('nb')

In [16]:
# LR
lr.fit(X_train_tfidf, y_train)
joblib.dump(lr, MODELS_PATH + 'model_2/lr_model2.h5')

# SVC
svc.fit(X_train_tfidf, y_train)
joblib.dump(svc, MODELS_PATH + 'model_2/svc_model2.h5')

# DT
dt.fit(X_train_tfidf, y_train)
joblib.dump(dt, MODELS_PATH + 'model_2/dt_model2.h5')

# RF
rf.fit(X_train_tfidf, y_train)
joblib.dump(rf, MODELS_PATH + 'model_2/rf_model2.h5')

# kNN
knn.fit(X_train_tfidf, y_train)
joblib.dump(knn, MODELS_PATH + 'model_2/knn_model2.h5')

# NB
nb.fit(X_train_tfidf, y_train)
joblib.dump(nb, MODELS_PATH + 'model_2/nb_model2.h5')

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s finished


[LibSVM]

['/content/drive/MyDrive/Public/CS221 - Natural Language Processing/models/model_2/nb_model2.h5']

# Evaluation

In [17]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [18]:
lr = joblib.load(MODELS_PATH + 'model_2/lr_model2.h5')
svc = joblib.load(MODELS_PATH + 'model_2/svc_model2.h5')
dt = joblib.load(MODELS_PATH + 'model_2/dt_model2.h5')
rf = joblib.load(MODELS_PATH + 'model_2/rf_model2.h5')
knn = joblib.load(MODELS_PATH + 'model_2/knn_model2.h5')
nb = joblib.load(MODELS_PATH + 'model_2/nb_model2.h5')

In [19]:
def evaluate(model, X, y):
    preds = model.predict(X)

    acc = round(accuracy_score(y, preds), 2)
    pre = round(precision_score(y, preds, average='weighted'), 2)
    recall = round(recall_score(y, preds, average='weighted'), 2)
    f1 = round(f1_score(y, preds, average='weighted'), 2)

    return [acc, pre, recall, f1]

In [20]:
def get_result(X, y):
    lr_re = evaluate(lr, X, y)
    svc_re = evaluate(svc, X, y)
    dt_re = evaluate(dt, X, y)
    rf_re = evaluate(rf, X, y)
    knn_re = evaluate(knn, X, y)
    nb_re = evaluate(nb, X, y)

    re = pd.DataFrame(
        [lr_re] + [svc_re] + [dt_re] + [rf_re] + [knn_re] + [nb_re],
        columns=['accuracy', 'precision', 'recall', 'f1'],
        index=['lr', 'svc', 'dt', 'rf', 'knn', 'nb']
    )

    return re

In [21]:
re_train = get_result(X_train_tfidf, y_train)
re_train

Unnamed: 0,accuracy,precision,recall,f1
lr,0.84,0.85,0.84,0.83
svc,0.96,0.97,0.96,0.96
dt,1.0,1.0,1.0,1.0
rf,1.0,1.0,1.0,1.0
knn,0.71,0.72,0.71,0.7
nb,0.73,0.8,0.73,0.67


In [22]:
re_val = get_result(X_val_tfidf, y_val)
re_val.to_csv(RESULTS_PATH + 'model2_val_ml.csv')
re_val

Unnamed: 0,accuracy,precision,recall,f1
lr,0.7,0.7,0.7,0.68
svc,0.72,0.74,0.72,0.7
dt,0.51,0.52,0.51,0.5
rf,0.62,0.62,0.62,0.58
knn,0.63,0.65,0.63,0.62
nb,0.61,0.61,0.61,0.53


In [23]:
re_test = get_result(X_test_tfidf, y_test)
re_test.to_csv(RESULTS_PATH + 'model2_test_ml.csv')
re_test

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,accuracy,precision,recall,f1
lr,0.66,0.67,0.66,0.64
svc,0.67,0.7,0.67,0.65
dt,0.56,0.57,0.56,0.57
rf,0.65,0.64,0.65,0.62
knn,0.63,0.62,0.63,0.62
nb,0.61,0.59,0.61,0.54
