In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, SpatialDropout1D, Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.utils import to_categorical
from gensim.models import KeyedVectors
from sklearn.metrics import precision_score, recall_score, f1_score
import requests

In [3]:
# Download the NLTK resource requested below ('punkt'), which this notebook fetches
# before calling word_tokenize.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead of 'punkt' —
# confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\AN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Download the content of the stop_words.py file from GitHub.
url = 'https://raw.githubusercontent.com/trungtv/vivi_spacy/master/vi/vi/stop_words.py'
# A timeout keeps a full notebook re-run from hanging forever on a dead connection.
response = requests.get(url, timeout=30)
# Fail loudly on HTTP errors: otherwise the body of an error page would be
# handed to the parsing cell and silently turned into "stop words".
response.raise_for_status()
stop_words_content = response.text

In [5]:
# Parse the downloaded stop_words.py source to obtain the stop-word list.
# The words are taken to be the lines between the first and the last `"""`
# occurring in the file.
start_index = stop_words_content.find('"""')
end_index = stop_words_content.rfind('"""')
# find()/rfind() return -1 when the delimiter is absent, and the same position
# when it occurs only once. Without this check a malformed download would
# silently produce an empty or garbage stop-word set.
if start_index == -1 or end_index - start_index < 3:
    raise ValueError(
        "Could not locate the triple-quoted stop-word block in the downloaded stop_words.py"
    )
start_index += 3  # skip past the opening delimiter itself
stop_words_list = stop_words_content[start_index:end_index].split("\n")
# Build the stop-word set, ignoring blank lines and surrounding whitespace.
stop_words = {word.strip() for word in stop_words_list if word.strip()}

In [6]:
# Text preprocessing
def preprocess_text(text):
    """Normalize one document into a space-separated string of kept tokens.

    The text is lower-cased and split with ``word_tokenize``. A token is kept
    only if it is alphanumeric (``str.isalnum``) and is not in the module-level
    ``stop_words`` set.

    Args:
        text: The raw document as a string.

    Returns:
        The kept tokens joined by single spaces (an empty string if none remain).
    """
    kept_tokens = []
    for token in word_tokenize(text.lower()):
        if token.isalnum() and token not in stop_words:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [7]:
# Load the data
df = pd.read_excel('data_xuly.xlsx')
# Drop columns that are entirely empty (reassigned rather than mutated in place
# so the cell stays idempotent on re-run).
df = df.dropna(axis=1, how='all')
# Drop rows that lack either the text or the label: str(NaN) below would
# otherwise turn a missing text into the literal document "nan", and a missing
# label is not a valid training target.
df = df.dropna(subset=['content', 'label'])
contents = df['content'].tolist()
topics = df['label'].tolist()
# Coerce every cell to str (non-text cells may come back as numbers) and trim it.
contents = [str(content).strip() for content in contents]
processed_contents = [preprocess_text(content) for content in contents]

In [8]:
# Label preprocessing: map each topic to an integer id, then one-hot encode the
# ids so they match the softmax / categorical_crossentropy setup used later.
label_encoder = LabelEncoder().fit(topics)

# Every fitted class occurs in `topics`, so the number of fitted classes equals
# the number of distinct encoded ids.
num_classes = len(label_encoder.classes_)
y = to_categorical(label_encoder.transform(topics), num_classes=num_classes)

In [9]:
# Tokenization and padding: fit a tokenizer (num_words=20000) on the
# preprocessed documents, convert them to integer sequences, and bring every
# sequence to a fixed length of 500 (maxlen).
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(processed_contents)
X = pad_sequences(tokenizer.texts_to_sequences(processed_contents), maxlen=500)

In [11]:
# Cross-validation: 4 shuffled folds with a fixed seed for the split.
kf = KFold(n_splits=4, shuffle=True, random_state=42)

# Per-fold metric accumulators, filled by the training loop.
accuracy_scores, precision_scores, recall_scores, f1_scores = [], [], [], []

# Best fold seen so far (accuracy and the corresponding trained model).
best_accuracy, best_model = 0, None

In [12]:
def build_bilstm_model(vocab_size, n_classes):
    """Create and compile a fresh, untrained Bidirectional LSTM classifier.

    Args:
        vocab_size: Vocabulary size used as ``input_dim`` of the Embedding layer.
        n_classes: Number of target classes (width of the softmax output layer).

    Returns:
        A compiled Keras ``Sequential`` model.
    """
    model = Sequential()
    # `input_length` is deliberately not passed: recent Keras versions deprecate
    # the argument, and the sequence length is inferred from the data on fit.
    model.add(Embedding(input_dim=vocab_size, output_dim=128))
    model.add(SpatialDropout1D(0.3))
    model.add(Bidirectional(LSTM(128, dropout=0.3, recurrent_dropout=0.3)))
    model.add(Dense(256, activation='relu'))
    model.add(Dense(n_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


# Keep the embedding vocabulary in sync with the tokenizer instead of repeating
# its size as a literal. Fall back to the full fitted vocabulary (+1 for the
# reserved index 0) when the tokenizer was created without a `num_words` cap.
vocab_size = tokenizer.num_words or len(tokenizer.word_index) + 1

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Build a new Bidirectional LSTM model for every fold so that no weights
    # carry over from the previous fold.
    model = build_bilstm_model(vocab_size, num_classes)

    # Train the model. The held-out fold is passed as validation_data only to
    # log per-epoch metrics; no callback is registered that would act on them.
    model.fit(X_train, y_train, epochs=10, batch_size=128, validation_data=(X_test, y_test), verbose=2)

    # Evaluate the model on the held-out fold.
    loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
    accuracy_scores.append(accuracy)

    # Predict, then convert probabilities / one-hot rows back to class indices.
    y_pred = model.predict(X_test, verbose=0)
    y_pred_classes = np.argmax(y_pred, axis=1)
    y_test_classes = np.argmax(y_test, axis=1)

    # Compute the support-weighted metrics for this fold.
    precision = precision_score(y_test_classes, y_pred_classes, average='weighted')
    recall = recall_score(y_test_classes, y_pred_classes, average='weighted')
    f1 = f1_score(y_test_classes, y_pred_classes, average='weighted')

    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

    # Print the metrics for this fold.
    print(f"Fold accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")

    # Keep the model with the highest fold accuracy seen so far.
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model



Epoch 1/10
39/39 - 90s - 2s/step - accuracy: 0.3452 - loss: 1.8771 - val_accuracy: 0.4240 - val_loss: 1.5373
Epoch 2/10
39/39 - 94s - 2s/step - accuracy: 0.4695 - loss: 1.4650 - val_accuracy: 0.5114 - val_loss: 1.2824
Epoch 3/10
39/39 - 95s - 2s/step - accuracy: 0.6033 - loss: 1.1933 - val_accuracy: 0.7065 - val_loss: 0.9134
Epoch 4/10
39/39 - 95s - 2s/step - accuracy: 0.7288 - loss: 0.8211 - val_accuracy: 0.7477 - val_loss: 0.7759
Epoch 5/10
39/39 - 97s - 2s/step - accuracy: 0.7897 - loss: 0.6416 - val_accuracy: 0.7914 - val_loss: 0.6825
Epoch 6/10
39/39 - 105s - 3s/step - accuracy: 0.8295 - loss: 0.5098 - val_accuracy: 0.8074 - val_loss: 0.6463
Epoch 7/10
39/39 - 109s - 3s/step - accuracy: 0.8542 - loss: 0.4367 - val_accuracy: 0.8111 - val_loss: 0.5894
Epoch 8/10
39/39 - 102s - 3s/step - accuracy: 0.8794 - loss: 0.3461 - val_accuracy: 0.8271 - val_loss: 0.5982
Epoch 9/10
39/39 - 102s - 3s/step - accuracy: 0.9046 - loss: 0.2935 - val_accuracy: 0.8277 - val_loss: 0.6003
Epoch 10/10
39/



39/39 - 98s - 3s/step - accuracy: 0.3280 - loss: 1.9425 - val_accuracy: 0.4332 - val_loss: 1.5021
Epoch 2/10
39/39 - 98s - 3s/step - accuracy: 0.5147 - loss: 1.3901 - val_accuracy: 0.6055 - val_loss: 1.1811
Epoch 3/10
39/39 - 99s - 3s/step - accuracy: 0.6195 - loss: 1.0837 - val_accuracy: 0.6702 - val_loss: 1.0077
Epoch 4/10
39/39 - 96s - 2s/step - accuracy: 0.7323 - loss: 0.8047 - val_accuracy: 0.7428 - val_loss: 0.8303
Epoch 5/10
39/39 - 97s - 2s/step - accuracy: 0.8152 - loss: 0.5576 - val_accuracy: 0.7766 - val_loss: 0.7251
Epoch 6/10
39/39 - 98s - 3s/step - accuracy: 0.8544 - loss: 0.4498 - val_accuracy: 0.7791 - val_loss: 0.6950
Epoch 7/10
39/39 - 97s - 2s/step - accuracy: 0.8802 - loss: 0.3683 - val_accuracy: 0.7957 - val_loss: 0.7020
Epoch 8/10
39/39 - 98s - 3s/step - accuracy: 0.8857 - loss: 0.3368 - val_accuracy: 0.7871 - val_loss: 0.7457
Epoch 9/10
39/39 - 99s - 3s/step - accuracy: 0.9048 - loss: 0.2728 - val_accuracy: 0.7822 - val_loss: 0.7728
Epoch 10/10
39/39 - 100s - 3s/



39/39 - 101s - 3s/step - accuracy: 0.3405 - loss: 1.8854 - val_accuracy: 0.4585 - val_loss: 1.6045
Epoch 2/10
39/39 - 99s - 3s/step - accuracy: 0.4876 - loss: 1.4408 - val_accuracy: 0.5858 - val_loss: 1.2890
Epoch 3/10
39/39 - 99s - 3s/step - accuracy: 0.5953 - loss: 1.1566 - val_accuracy: 0.6677 - val_loss: 1.0157
Epoch 4/10
39/39 - 97s - 2s/step - accuracy: 0.7138 - loss: 0.8614 - val_accuracy: 0.7612 - val_loss: 0.7196
Epoch 5/10
39/39 - 100s - 3s/step - accuracy: 0.8086 - loss: 0.5823 - val_accuracy: 0.8197 - val_loss: 0.5731
Epoch 6/10
39/39 - 98s - 3s/step - accuracy: 0.8593 - loss: 0.4363 - val_accuracy: 0.8246 - val_loss: 0.5526
Epoch 7/10
39/39 - 105s - 3s/step - accuracy: 0.8837 - loss: 0.3337 - val_accuracy: 0.8585 - val_loss: 0.4528
Epoch 8/10
39/39 - 99s - 3s/step - accuracy: 0.8987 - loss: 0.3050 - val_accuracy: 0.8474 - val_loss: 0.5073
Epoch 9/10
39/39 - 105s - 3s/step - accuracy: 0.9194 - loss: 0.2395 - val_accuracy: 0.8511 - val_loss: 0.5236
Epoch 10/10
39/39 - 107s -



39/39 - 117s - 3s/step - accuracy: 0.3594 - loss: 1.8742 - val_accuracy: 0.4874 - val_loss: 1.5754
Epoch 2/10
39/39 - 110s - 3s/step - accuracy: 0.5198 - loss: 1.4090 - val_accuracy: 0.6129 - val_loss: 1.1156
Epoch 3/10
39/39 - 108s - 3s/step - accuracy: 0.6299 - loss: 1.0513 - val_accuracy: 0.6437 - val_loss: 1.0418
Epoch 4/10
39/39 - 107s - 3s/step - accuracy: 0.6964 - loss: 0.9027 - val_accuracy: 0.7348 - val_loss: 0.8013
Epoch 5/10
39/39 - 113s - 3s/step - accuracy: 0.7664 - loss: 0.6842 - val_accuracy: 0.7766 - val_loss: 0.6656
Epoch 6/10
39/39 - 107s - 3s/step - accuracy: 0.8027 - loss: 0.5547 - val_accuracy: 0.7920 - val_loss: 0.6345
Epoch 7/10
39/39 - 104s - 3s/step - accuracy: 0.8369 - loss: 0.4540 - val_accuracy: 0.8123 - val_loss: 0.5927
Epoch 8/10
39/39 - 107s - 3s/step - accuracy: 0.8722 - loss: 0.3693 - val_accuracy: 0.8363 - val_loss: 0.5541
Epoch 9/10
39/39 - 103s - 3s/step - accuracy: 0.8878 - loss: 0.3129 - val_accuracy: 0.8443 - val_loss: 0.5552
Epoch 10/10
39/39 - 1

In [13]:
# Print the mean of each metric over all cross-validation folds, one per line.
print(
    f"Mean accuracy: {np.mean(accuracy_scores)}",
    f"Mean Precision: {np.mean(precision_scores)}",
    f"Mean Recall: {np.mean(recall_scores)}",
    f"Mean F1 Score: {np.mean(f1_scores)}",
    sep="\n",
)

Mean accuracy: 0.8293846100568771
Mean Precision: 0.8357625643497231
Mean Recall: 0.8293846153846154
Mean F1 Score: 0.8283938268731028


In [14]:
# Report the highest per-fold accuracy recorded in `best_accuracy` by the
# cross-validation loop.
print(f"Best accuracy: {best_accuracy}")

Best accuracy: 0.8479999899864197


In [15]:
# Persist the model held in `best_model` to 'best_model.h5' (path relative to
# the working directory).
# NOTE(review): the fitted `tokenizer` and `label_encoder` are not saved anywhere
# in this notebook; they are presumably needed to reuse the model on new text —
# confirm and persist them alongside the model if so.
best_model.save('best_model.h5')

