<a href="https://colab.research.google.com/github/Akechi1412/Vietnamese-Review-Classification/blob/master/train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
!rm -rf Vietnamese-Review-Classification
!git clone https://github.com/Akechi1412/Vietnamese-Review-Classification
%cd Vietnamese-Review-Classification

Cloning into 'Vietnamese-Review-Classification'...
remote: Enumerating objects: 92, done.[K
remote: Counting objects: 100% (92/92), done.[K
remote: Compressing objects: 100% (73/73), done.[K
remote: Total 92 (delta 46), reused 55 (delta 15), pack-reused 0[K
Receiving objects: 100% (92/92), 18.31 MiB | 14.09 MiB/s, done.
Resolving deltas: 100% (46/46), done.
/content/Vietnamese-Review-Classification/Vietnamese-Review-Classification


In [12]:
!sudo apt-get install python3.10
!pip install pyvi https://gitlab.com/trungtv/vi_spacy/-/raw/master/packages/vi_core_news_lg-3.6.0/dist/vi_core_news_lg-3.6.0.tar.gz --upgrade

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
python3.10 is already the newest version (3.10.12-1~22.04.3).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.
Collecting https://gitlab.com/trungtv/vi_spacy/-/raw/master/packages/vi_core_news_lg-3.6.0/dist/vi_core_news_lg-3.6.0.tar.gz
  Using cached https://gitlab.com/trungtv/vi_spacy/-/raw/master/packages/vi_core_news_lg-3.6.0/dist/vi_core_news_lg-3.6.0.tar.gz (233.3 MB)
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [13]:
# Disabled on purpose: uncomment the line below to update the current
# checkout with the latest commits from the `master` branch of `origin`.
# !git pull origin master

In [14]:
from utils.data_preparing import prepare_data
from utils.plotter import plot_reviews_data

# Load the samples and their labels through the project helper.
x_data, y_data = prepare_data()

# Quick sanity check: dataset size, length of the longest sample, and the
# first sample together with its label.
print(f'Data size: {len(x_data)}')
# NOTE(review): this is len() of the longest sample; it is a word count only
# if each sample is a sequence of tokens rather than a plain string - confirm.
longest_len = max(len(sample) for sample in x_data)
print(f'Max number of words: {longest_len}')
print(x_data[0])
print(y_data[0])

# Visual overview of the samples (implemented in utils.plotter).
plot_reviews_data(x_data)

Preparing data...


KeyboardInterrupt: 

In [None]:
from sklearn.model_selection import train_test_split

# Two-stage split: hold out one third of the data, then cut that hold-out in
# half to obtain the validation and test sets (roughly 2/3 train, 1/6
# validation, 1/6 test). Both stages use random_state=42 so the partition is
# repeatable.
x_train, x_temp, y_train, y_temp = train_test_split(
    x_data, y_data, test_size=1/3, random_state=42
)
x_val, x_test, y_val, y_test = train_test_split(
    x_temp, y_temp, test_size=0.5, random_state=42
)

# Report sample/label counts for each split; the two numbers on a line
# should always match.
for features, labels in ((x_train, y_train), (x_val, y_val), (x_test, y_test)):
    print(len(features), len(labels))

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import pickle

# Every sequence is padded or truncated to this fixed length.
max_len = 200

# Fit the vocabulary on the training split only, then encode all three splits
# with that same vocabulary.
# NOTE(review): no oov_token is configured, so words unseen during fitting are
# presumably dropped from the validation/test sequences - confirm this is intended.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train)

x_train_seq = tokenizer.texts_to_sequences(x_train)
x_val_seq = tokenizer.texts_to_sequences(x_val)
x_test_seq = tokenizer.texts_to_sequences(x_test)

# Persist the fitted tokenizer so the exact same word indices can be reused
# outside this notebook.
with open("review_tokenizer.pkl", "wb") as handle:
    pickle.dump(tokenizer, handle)

# Bring all sequences to a uniform length of max_len.
x_train_pad = pad_sequences(x_train_seq, maxlen=max_len)
x_val_pad = pad_sequences(x_val_seq, maxlen=max_len)
x_test_pad = pad_sequences(x_test_seq, maxlen=max_len)

# Show (number of samples, sequence length) for each padded split.
for padded in (x_train_pad, x_val_pad, x_test_pad):
    print(f'({len(padded)}, {len(padded[0])})')

# One-hot encode the labels for the 3 target classes.
y_train_one_hot = to_categorical(y_train, num_classes=3)
y_val_one_hot = to_categorical(y_val, num_classes=3)
y_test_one_hot = to_categorical(y_test, num_classes=3)

In [None]:
import numpy as np

# Make sure the model inputs and targets are NumPy arrays.
# np.asarray returns the object unchanged when it already is an ndarray, so
# the datasets are not duplicated in memory; anything that is not yet an
# ndarray is still converted exactly as before.
x_train_pad = np.asarray(x_train_pad)
x_val_pad = np.asarray(x_val_pad)
x_test_pad = np.asarray(x_test_pad)

y_train_one_hot = np.asarray(y_train_one_hot)
y_val_one_hot = np.asarray(y_val_one_hot)
y_test_one_hot = np.asarray(y_test_one_hot)

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dropout, Dense
from keras import regularizers
from keras.callbacks import ModelCheckpoint

# Build the RNN classifier: Embedding -> LSTM -> three regularised Dense/Dropout
# stages -> 3-way softmax.
# input_dim is the vocabulary size plus one because the tokenizer's word
# indices start at 1 and index 0 is used for padding.
embedding_layer = Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=100, input_length=max_len)

model = Sequential()
model.add(embedding_layer)
model.add(LSTM(units=64))
# Progressively narrower hidden layers, each L2-regularised and followed by
# dropout.
for hidden_units in (32, 16, 8):
    model.add(Dense(units=hidden_units, activation='relu', kernel_regularizer=regularizers.L2(0.01)))
    model.add(Dropout(rate=0.5))
model.add(Dense(units=3, activation='softmax', kernel_regularizer=regularizers.L2(0.01)))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Train and save the model.
# The checkpoint rewrites `filepath` only when the validation loss improves,
# so the file on disk holds the best epoch seen so far.
filepath = 'review_model.keras'
checkpoint = ModelCheckpoint(
  filepath,
  monitor='val_loss',
  verbose=1,
  save_best_only=True,
  mode='min'
)
callbacks_list = [checkpoint]
history = model.fit(
  x_train_pad, y_train_one_hot,
  batch_size=50,
  epochs=10,
  validation_data=(x_val_pad, y_val_one_hot),
  callbacks=callbacks_list
)

In [None]:
from utils.plotter import plot_history_model

# Visualise the training run from the History object returned by model.fit
# (implemented in utils.plotter).
plot_history_model(history)

In [None]:
from sklearn.metrics import confusion_matrix

# Evaluate on the held-out test split.
# NOTE(review): this scores the in-memory model as it stands after the last
# epoch, not the best checkpoint written to review_model.keras - confirm that
# is the intended model to report.
score, acc = model.evaluate(x_test_pad, y_test_one_hot, batch_size=15, verbose=2)
print('Test score:', score)
print('Test accuracy:', acc)

# Collapse one-hot targets and softmax outputs to class indices.
y_pred = model.predict(x_test_pad, batch_size=15)
y_test_single = np.argmax(y_test_one_hot, axis=1)
y_pred_single = np.argmax(y_pred, axis=1)

# Pass every class index explicitly so the matrix always has one row/column
# per class, even if some class never occurs in the test labels or predictions.
class_labels = list(range(y_test_one_hot.shape[1]))
conf_matrix = confusion_matrix(y_test_single, y_pred_single, labels=class_labels)
print("Confusion Matrix:")
print(conf_matrix)