## Import Library

In [1]:
!pip install pyvi

Collecting pyvi
  Downloading pyvi-0.1.1-py2.py3-none-any.whl (8.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m48.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sklearn-crfsuite
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Collecting python-crfsuite>=0.8.3
  Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m38.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-crfsuite, sklearn-crfsuite, pyvi
Successfully installed python-crfsuite-0.9.10 pyvi-0.1.1 sklearn-crfsuite-0.3.6
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
import pickle
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, Dense, Dropout, Bidirectional, LSTM, GRU, Input, GlobalMaxPooling1D, LayerNormalization, Conv1D, MaxPooling1D
from tensorflow.keras.optimizers import Adam,SGD
from tensorflow.keras import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from pyvi import ViTokenizer
from pyvi import ViUtils

## Load data

In [3]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiohttp
  Downloading aiohttp-3.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m42.6 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=8.0.0
  Downloading pyarrow-14.0.2-cp310-cp310-manylinux_2_28_x86_64.whl (38.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.0/38.0 MB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyarrow-hotfix
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting fsspec[http]<=2023.10.0,>=2023.1.0
  Downloading fsspec-2023.10.0-py3-none-any.whl (166 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m166.4/166.4 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting

In [4]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
dataset = load_dataset("uitnlp/vietnamese_students_feedback")
dataset

Downloading data: 100%|██████████| 475k/475k [00:00<00:00, 1.10MB/s]
Downloading data: 100%|██████████| 63.3k/63.3k [00:00<00:00, 240kB/s]
Downloading data: 100%|██████████| 134k/134k [00:00<00:00, 472kB/s]
Generating train split: 100%|██████████| 11426/11426 [00:00<00:00, 919513.37 examples/s]
Generating validation split: 100%|██████████| 1583/1583 [00:00<00:00, 660852.32 examples/s]
Generating test split: 100%|██████████| 3166/3166 [00:00<00:00, 769271.61 examples/s]


DatasetDict({
    train: Dataset({
        features: ['sentence', 'sentiment', 'topic'],
        num_rows: 11426
    })
    validation: Dataset({
        features: ['sentence', 'sentiment', 'topic'],
        num_rows: 1583
    })
    test: Dataset({
        features: ['sentence', 'sentiment', 'topic'],
        num_rows: 3166
    })
})

In [6]:
X_train, y_train = dataset['train']['sentence'], dataset['train']['sentiment']
X_valid, y_valid = dataset['validation']['sentence'], dataset['validation']['sentiment']
X_test, y_test = dataset['test']['sentence'], dataset['test']['sentiment']

In [7]:
X_train.extend(X_valid)
X_train.extend(X_test)
y_train.extend(y_valid)
y_train.extend(y_test)

## Preprocessing

In [8]:
input_pre = []
label_with_accent = []
for i in range(len(X_train)):
    input_text_pre = list(tf.keras.preprocessing.text.text_to_word_sequence(X_train[i]))
    input_text_pre = " ".join(input_text_pre)
    input_text_pre_no_accent = str(ViUtils.remove_accents(input_text_pre).decode("utf-8"))
    input_text_pre_accent = ViTokenizer.tokenize(input_text_pre)
    input_text_pre_no_accent = ViTokenizer.tokenize(input_text_pre_no_accent)
    input_pre.append(input_text_pre_accent)
    input_pre.append(input_text_pre_no_accent)
    label_with_accent.append(y_train[i])
    label_with_accent.append(y_train[i])
    
    

In [9]:
label_tf = tf.keras.utils.to_categorical(label_with_accent,num_classes=3)

tokenizer_data = Tokenizer(oov_token='<OOV>',filters = '',split = ' ')
tokenizer_data.fit_on_texts(input_pre)

tokenized_data_text = tokenizer_data.texts_to_sequences(input_pre)
vec_data = pad_sequences(tokenized_data_text,padding = 'post',maxlen = 512)

pickle.dump(tokenizer_data,open("tokenizer_data.pkl","wb"))

print("input data shape ",vec_data.shape)
data_vocab_size = len(tokenizer_data.word_index)+1
print("data vocab size ",data_vocab_size)

X_train, X_val, y_train, y_val = train_test_split(vec_data, label_tf,test_size=0.2, random_state = 42)
X_train, X_test, y_train, y_test = train_test_split(X_train,y_train, test_size=0.1,random_state = 42)

print("training sample: ",len(X_train))
print("validation sample: ",len(X_val))
print("test sample: ",len(X_test))

input data shape  (32350, 512)
data vocab size  5852
training sample:  23292
validation sample:  6470
test sample:  2588


In [10]:
def generate_model():
    dropout_threshold = 0.4
    input_dim = data_vocab_size
    output_dim = 32
    input_length = 512
    initializer = tf.keras.initializers.GlorotNormal()
    
    input_layer = Input(shape=(input_length,))
    feature = Embedding(input_dim = input_dim,output_dim = output_dim, input_length=input_length, embeddings_initializer = "GlorotNormal")(input_layer)
    
    cnn_feature = Conv1D(filters=32, kernel_size=3, padding='same', activation='relu')(feature)
    cnn_feature = MaxPooling1D()(cnn_feature)
    cnn_feature = Dropout(dropout_threshold)(cnn_feature)
    cnn_feature = Conv1D(filters=32, kernel_size=3,padding='same',activation='relu')(cnn_feature)
    cnn_feature = MaxPooling1D()(cnn_feature)
    cnn_feature = LayerNormalization()(cnn_feature)
    cnn_feature = Dropout(dropout_threshold)(cnn_feature)
    
    bi_lstm_feature = Bidirectional(LSTM(units=32, dropout=dropout_threshold, return_sequences=True, kernel_initializer=initializer))(feature)
    bi_lstm_feature = MaxPooling1D()(bi_lstm_feature)
    
    bi_lstm_feature = Bidirectional(GRU(units=32,dropout=dropout_threshold,return_sequences = True, kernel_initializer=initializer))(bi_lstm_feature)
    bi_lstm_feature = MaxPooling1D()(bi_lstm_feature)
    bi_lstm_feature = LayerNormalization()(bi_lstm_feature)
    
    combine_feature = tf.keras.layers.Concatenate()([cnn_feature,bi_lstm_feature])
    combine_feature = GlobalMaxPooling1D()(combine_feature)
    combine_feature = LayerNormalization()(combine_feature)
    
    classifier = Dense(90,activation='relu')(combine_feature)
    classifier = Dropout(0.2)(classifier)
    classifier = Dense(70,activation = 'relu')(classifier)
    classifier = Dropout(0.2)(classifier)
    classifier = Dense(50,activation = 'relu')(classifier)
    classifier = Dropout(0.2)(classifier)
    classifier = Dense(30,activation = 'relu')(classifier)
    classifier = Dropout(0.2)(classifier)
    classifier = Dense(3,activation = 'softmax')(classifier)
    
    model = tf.keras.Model(inputs = input_layer,outputs = classifier)
    return model

model = generate_model()
adam = Adam(learning_rate=0.001)
model.compile(optimizer=adam,loss='categorical_crossentropy',metrics=['accuracy'])
model.summary()
    

2024-01-09 19:16:46.706616: E ./tensorflow/compiler/xla/stream_executor/stream_executor_internal.h:124] SetPriority unimplemented for this stream.
2024-01-09 19:16:46.706748: E ./tensorflow/compiler/xla/stream_executor/stream_executor_internal.h:124] SetPriority unimplemented for this stream.
2024-01-09 19:16:46.706867: E ./tensorflow/compiler/xla/stream_executor/stream_executor_internal.h:124] SetPriority unimplemented for this stream.
2024-01-09 19:16:46.706949: E ./tensorflow/compiler/xla/stream_executor/stream_executor_internal.h:124] SetPriority unimplemented for this stream.
2024-01-09 19:16:46.707031: E ./tensorflow/compiler/xla/stream_executor/stream_executor_internal.h:124] SetPriority unimplemented for this stream.
2024-01-09 19:16:46.707231: E ./tensorflow/compiler/xla/stream_executor/stream_executor_internal.h:124] SetPriority unimplemented for this stream.
2024-01-09 19:16:46.707321: E ./tensorflow/compiler/xla/stream_executor/stream_executor_internal.h:124] SetPriority un

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 512)]                0         []                            
                                                                                                  
 embedding (Embedding)       (None, 512, 32)              187264    ['input_1[0][0]']             
                                                                                                  
 conv1d (Conv1D)             (None, 512, 32)              3104      ['embedding[0][0]']           
                                                                                                  
 max_pooling1d (MaxPooling1  (None, 256, 32)              0         ['conv1d[0][0]']              
 D)                                                                                           

## Visualize Model

In [11]:
!pip install pydot
!pip install graphviz 

Collecting pydot
  Downloading pydot-2.0.0-py3-none-any.whl (22 kB)
Installing collected packages: pydot
Successfully installed pydot-2.0.0
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Collecting graphviz
  Downloading graphviz-0.20.1-py3-none-any.whl (47 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.0/47.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: graphviz
Successfully installed graphviz-0.20.1
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [12]:
dot_img_file = "model_visualize.png"
tf.keras.utils.plot_model(model,to_file=dot_img_file,show_shapes=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


## Training 

In [13]:
X_train = np.array(X_train)
y_train = np.array(y_train)
X_val = np.array(X_val)
y_val = np.array(y_val)

In [14]:
callback_model = tf.keras.callbacks.ModelCheckpoint('model_cnn_bilstm.h5',monitor='val_loss')
history = model.fit(x=X_train,y = y_train, validation_data = (X_val,y_val),epochs = 17,batch_size = 120, callbacks=[callback_model])

Epoch 1/17
Epoch 2/17


  saving_api.save_model(


Epoch 3/17
Epoch 4/17
Epoch 5/17
Epoch 6/17
Epoch 7/17
Epoch 8/17
Epoch 9/17
Epoch 10/17
Epoch 11/17
Epoch 12/17
Epoch 13/17
Epoch 14/17
Epoch 15/17
Epoch 16/17
Epoch 17/17


## Evaluation

In [15]:
!pip install seaborn

Collecting seaborn
  Downloading seaborn-0.13.1-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/294.8 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: seaborn
Successfully installed seaborn-0.13.1
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [16]:
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sn

In [17]:
X_test = np.array(X_test)
y_test = np.array(y_test)

In [18]:
model.load_weights("model_cnn_bilstm.h5")
model.evaluate(X_test,y_test)



[0.37574851512908936, 0.9107418656349182]