In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%%capture

# Joblib
!pip install joblib

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import joblib
import pickle

# Load data

In [4]:
TRAIN_PATH = '/content/drive/MyDrive/Public/CS221 - Natural Language Processing/uit_vsmec_processed/train_processed.csv'
VAL_PATH = '/content/drive/MyDrive/Public/CS221 - Natural Language Processing/uit_vsmec_processed/val_processed.csv'
TEST_PATH = '/content/drive/MyDrive/Public/CS221 - Natural Language Processing/uit_vsmec_processed/test_processed.csv'

MODEL_PATH = '/content/drive/MyDrive/Public/CS221 - Natural Language Processing/models/'
RESULTS_PATH = '/content/drive/MyDrive/Public/CS221 - Natural Language Processing/results/'

In [5]:
train_df = pd.read_csv(TRAIN_PATH)
val_df = pd.read_csv(VAL_PATH)
test_df = pd.read_csv(TEST_PATH)

In [6]:
train_df.cleaned_sentence.fillna('', inplace=True)
val_df.cleaned_sentence.fillna('', inplace=True)
test_df.cleaned_sentence.fillna('', inplace=True)

In [7]:
def prepare_data(data):
    X = data['cleaned_sentence']
    y = data['emotion']
    return list(X), list(y)

# Label Encoding

In [8]:
from sklearn.preprocessing import LabelEncoder

In [9]:
le = LabelEncoder()
le.fit(train_df.emotion)

train_df['emotion'] = le.transform(train_df['emotion'])
val_df['emotion'] = le.transform(val_df['emotion'])
test_df['emotion'] = le.transform(test_df['emotion'])

# save
with open(MODEL_PATH + 'baseline/le.pkl', 'wb') as f:
    le = pickle.dump(le, f)

In [10]:
X_train, y_train = prepare_data(train_df)
X_val, y_val = prepare_data(val_df)
X_test, y_test = prepare_data(test_df)

In [11]:
X_full = np.array(train_df['cleaned_sentence'].tolist() + val_df['cleaned_sentence'].tolist() + test_df['cleaned_sentence'].tolist())

# Machine Learning

* Feature Extraction: TF-IDF
* Models:
    - Logistic Regression
    - SVM
    - Decision Tree
    - Random Forest
    - kNN
    - Naive Bayes

In [12]:
# Machine Learning
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB

## TF-IDF

In [13]:
tfidf = TfidfVectorizer(analyzer='word', max_features=5000)
tfidf.fit(X_full)

# save
with open(MODEL_PATH + 'baseline/tfidf.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

In [14]:
X_train_tfidf = tfidf.transform(X_train)
X_val_tfidf = tfidf.transform(X_val)
X_test_tfidf = tfidf.transform(X_test)

## Training ML Models


In [15]:
def build_model_ML(model_name):
    if model_name == 'lr':
        clf = LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=1000, verbose=True)
    elif model_name == 'svc':
        clf = SVC(verbose=True,)
    elif model_name == 'dt':
        clf = DecisionTreeClassifier()
    elif model_name == 'rf':
        clf = RandomForestClassifier(n_estimators=150, random_state=0)
    elif model_name == 'knn':
        clf = KNeighborsClassifier(n_neighbors=15)
    elif model_name == 'nb':
        clf = MultinomialNB()
    else:
        print('model_name error~!')
        return None

    return clf

In [16]:
# ML
lr = build_model_ML('lr')
svc = build_model_ML('svc')
dt = build_model_ML('dt')
rf = build_model_ML('rf')
knn = build_model_ML('knn')
nb = build_model_ML('nb')

In [17]:
# LR
lr.fit(X_train_tfidf, y_train)
joblib.dump(lr, MODEL_PATH + 'baseline/lr_baseline.h5')

# SVC
svc.fit(X_train_tfidf, y_train)
joblib.dump(svc, MODEL_PATH + 'baseline/svc_baseline.h5')

# DT
dt.fit(X_train_tfidf, y_train)
joblib.dump(dt, MODEL_PATH + 'baseline/dt_baseline.h5')

# RF
rf.fit(X_train_tfidf, y_train)
joblib.dump(rf, MODEL_PATH + 'baseline/rf_baseline.h5')

# kNN
knn.fit(X_train_tfidf, y_train)
joblib.dump(knn, MODEL_PATH + 'baseline/knn_baseline.h5')

# NB
nb.fit(X_train_tfidf, y_train)
joblib.dump(nb, MODEL_PATH + 'baseline/nb_baseline.h5')

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.4s finished


[LibSVM]

['/content/drive/MyDrive/Public/CS221 - Natural Language Processing/models/baseline/nb_baseline.h5']

# Evaluation

In [18]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [19]:
lr = joblib.load(MODEL_PATH + 'baseline/lr_baseline.h5')
svc = joblib.load(MODEL_PATH + 'baseline/svc_baseline.h5')
dt = joblib.load(MODEL_PATH + 'baseline/dt_baseline.h5')
rf = joblib.load(MODEL_PATH + 'baseline/rf_baseline.h5')
knn = joblib.load(MODEL_PATH + 'baseline/knn_baseline.h5')
nb = joblib.load(MODEL_PATH + 'baseline/nb_baseline.h5')

In [20]:
def evaluate(model, X, y):
    preds = model.predict(X)

    acc = round(accuracy_score(y, preds), 2)
    pre = round(precision_score(y, preds, average='weighted'), 2)
    recall = round(recall_score(y, preds, average='weighted'), 2)
    f1 = round(f1_score(y, preds, average='weighted'), 2)

    return [acc, pre, recall, f1]

In [21]:
def get_result(X, y):
    lr_re = evaluate(lr, X, y)
    svc_re = evaluate(svc, X, y)
    dt_re = evaluate(dt, X, y)
    rf_re = evaluate(rf, X, y)
    knn_re = evaluate(knn, X, y)
    nb_re = evaluate(nb, X, y)

    re = pd.DataFrame(
        [lr_re] + [svc_re] + [dt_re] + [rf_re] + [knn_re] + [nb_re],
        columns=['accuracy', 'precision', 'recall', 'f1'],
        index=['lr', 'svc', 'dt', 'rf', 'knn', 'nb']
    )

    return re

In [22]:
re_train = get_result(X_train_tfidf, y_train)
re_train

Unnamed: 0,accuracy,precision,recall,f1
lr,0.74,0.75,0.74,0.73
svc,0.94,0.94,0.94,0.94
dt,1.0,1.0,1.0,1.0
rf,1.0,1.0,1.0,1.0
knn,0.58,0.59,0.58,0.57
nb,0.61,0.71,0.61,0.56


In [23]:
re_val = get_result(X_val_tfidf, y_val)
re_val.to_csv(RESULTS_PATH + 'baseline_val_ml.csv')
re_val

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,accuracy,precision,recall,f1
lr,0.54,0.54,0.54,0.52
svc,0.53,0.53,0.53,0.51
dt,0.37,0.37,0.37,0.37
rf,0.49,0.48,0.49,0.46
knn,0.49,0.5,0.49,0.47
nb,0.48,0.44,0.48,0.41


In [24]:
re_test = get_result(X_test_tfidf, y_test)
re_test.to_csv(RESULTS_PATH + 'baseline_test_ml.csv')
re_test

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,accuracy,precision,recall,f1
lr,0.53,0.56,0.53,0.52
svc,0.54,0.58,0.54,0.53
dt,0.38,0.38,0.38,0.38
rf,0.49,0.52,0.49,0.48
knn,0.48,0.49,0.48,0.47
nb,0.46,0.41,0.46,0.39
