In [18]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, GRU, Dense, SpatialDropout1D
from keras.utils import to_categorical
import requests

In [19]:
# Download the NLTK resources required by word_tokenize
nltk.download('punkt')

# Fetch the contents of stop_words.py from GitHub
url = 'https://raw.githubusercontent.com/trungtv/vivi_spacy/master/vi/vi/stop_words.py'
# A timeout keeps the cell from hanging indefinitely on a stalled connection, and
# raise_for_status() prevents an HTTP error page from being parsed as stop words.
response = requests.get(url, timeout=30)
response.raise_for_status()
stop_words_content = response.text

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\AN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [20]:
# Parse the downloaded stop_words.py source to extract the raw stop-word lines.
# The words are taken from between the first and the last triple-quote delimiter.
delimiter = '"""'
start_index = stop_words_content.find(delimiter)
end_index = stop_words_content.rfind(delimiter)
# find()/rfind() return -1 when the delimiter is absent, and both return the same
# position when it occurs only once; either way there is no enclosed block to read.
if start_index == -1 or end_index <= start_index:
    raise ValueError("Could not locate the triple-quoted stop-word block in the downloaded file")
start_index += len(delimiter)  # skip past the opening delimiter itself
stop_words_list = stop_words_content[start_index:end_index].split("\n")

In [21]:
# Build the stop-word set: trim every parsed line and keep only the non-empty ones
stop_words = {stripped for stripped in map(str.strip, stop_words_list) if stripped}

In [22]:
# Text preprocessing
def preprocess_text(text):
    """Lower-case, tokenize and filter ``text``.

    The text is lower-cased and split with ``word_tokenize``. A token is
    dropped when it is a member of the module-level ``stop_words`` set or
    when ``str.isalnum`` is false for it (e.g. punctuation).

    Args:
        text: The raw string to clean.

    Returns:
        The surviving tokens joined by single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text.lower()):
        if token in stop_words:
            continue
        if not token.isalnum():
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [23]:
# Load the data
# The cleaning steps are chained instead of mutating `df` in place, so re-running
# this cell always starts from the file on disk.
df = (
    pd.read_excel('data.xlsx')
    # Drop columns that are entirely empty
    .dropna(axis=1, how='all')
    # Drop rows without a title or a topic: str() would turn a missing title into
    # the literal text "nan", and a missing topic would reach the label encoder.
    .dropna(subset=['Title', 'Topics'])
)
contents = [str(content).strip() for content in df['Title'].tolist()]
topics = df['Topics'].tolist()
processed_contents = [preprocess_text(content) for content in contents]

In [24]:
# Label preprocessing: encode topic names as integer ids, then convert them to categorical vectors
label_encoder = LabelEncoder()
encoded_topics = label_encoder.fit_transform(topics)
num_classes = len(set(encoded_topics))
y = to_categorical(encoded_topics, num_classes=num_classes)

In [25]:
# Tokenization and padding
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(processed_contents)
# Turn each cleaned text into a sequence of word ids and pad it with maxlen=200
X = pad_sequences(tokenizer.texts_to_sequences(processed_contents), maxlen=200)

In [26]:
# Cross-validation setup: 5 shuffled folds with a fixed seed, plus per-fold bookkeeping
kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracy_scores = []
best_accuracy, best_model = 0, None

In [27]:
# Train and evaluate a freshly built GRU classifier on every cross-validation fold
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Build the GRU model.
    # `input_length` is intentionally not passed to Embedding: Keras 3 deprecates
    # the argument, and the sequence length is taken from the data at fit time.
    model = Sequential()
    model.add(Embedding(input_dim=10000, output_dim=128))
    model.add(SpatialDropout1D(0.2))
    model.add(GRU(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Train the model; the held-out fold is passed as validation data for per-epoch monitoring
    model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test), verbose=2)

    # Evaluate the model on the held-out fold
    loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
    accuracy_scores.append(accuracy)
    print(f"Fold accuracy: {accuracy}")
    # Keep the model with the highest accuracy. The `is None` check guarantees that
    # a model is retained even if no fold scores above the initial best_accuracy.
    if best_model is None or accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model

# Summarise the scores collected in accuracy_scores so the overall result is visible
if accuracy_scores:
    print(f"Mean accuracy over {len(accuracy_scores)} evaluated folds: "
          f"{np.mean(accuracy_scores):.4f} (std {np.std(accuracy_scores):.4f})")

Epoch 1/10




14/14 - 7s - 512ms/step - accuracy: 0.2351 - loss: 1.7755 - val_accuracy: 0.2877 - val_loss: 1.7460
Epoch 2/10
14/14 - 3s - 199ms/step - accuracy: 0.2810 - loss: 1.6653 - val_accuracy: 0.3699 - val_loss: 1.6110
Epoch 3/10
14/14 - 3s - 194ms/step - accuracy: 0.4989 - loss: 1.3546 - val_accuracy: 0.5708 - val_loss: 1.2747
Epoch 4/10
14/14 - 3s - 193ms/step - accuracy: 0.7041 - loss: 0.9288 - val_accuracy: 0.6895 - val_loss: 1.0068
Epoch 5/10
14/14 - 3s - 192ms/step - accuracy: 0.8188 - loss: 647742016.0000 - val_accuracy: 0.6895 - val_loss: 0.9803
Epoch 6/10
14/14 - 3s - 208ms/step - accuracy: 0.8567 - loss: 35165732.0000 - val_accuracy: 0.6986 - val_loss: 0.8957
Epoch 7/10
14/14 - 3s - 203ms/step - accuracy: 0.9151 - loss: 0.4367 - val_accuracy: 0.7260 - val_loss: 0.8367
Epoch 8/10
14/14 - 3s - 207ms/step - accuracy: 0.9392 - loss: 0.3292 - val_accuracy: 0.7534 - val_loss: 0.7964
Epoch 9/10
14/14 - 3s - 201ms/step - accuracy: 0.9576 - loss: 0.2476 - val_accuracy: 0.7580 - val_loss: 0.76

In [28]:
# Save the model that achieved the highest accuracy
best_model_path = 'best_model.h5'
best_model.save(best_model_path)



In [29]:
# Import TensorFlow in this cell: `tf` was otherwise only imported by a later cell,
# so this line raised NameError on a fresh kernel (Restart & Run All).
import tensorflow as tf

print("Phiên bản TensorFlow:", tf.__version__)

Phiên bản TensorFlow: 2.16.1


In [30]:
import tensorflow as tf

# Reload the saved model from disk so it can be used for prediction
saved_model_path = 'best_model.h5'
loaded_model = tf.keras.models.load_model(saved_model_path)



In [47]:
# Title prediction helper
def predict_title(title):
    """Predict the topic label for a single title.

    The title is cleaned with ``preprocess_text``, converted to a word-id
    sequence by the module-level ``tokenizer``, padded with ``maxlen=200``
    and scored by ``loaded_model``. The index of the highest score is mapped
    back to its label through ``label_encoder``.

    Args:
        title: Raw title text.

    Returns:
        The first (and only) element of the decoded label array.
    """
    cleaned = preprocess_text(title)
    token_ids = tokenizer.texts_to_sequences([cleaned])
    model_input = pad_sequences(token_ids, maxlen=200)
    class_scores = loaded_model.predict(model_input)
    best_class = np.argmax(class_scores, axis=1)
    return label_encoder.inverse_transform(best_class)[0]

In [48]:
# Use the prediction function on a sample headline and print the predicted topic
title = "Đâu là giới hạn tiếp theo khi phương Tây dần dỡ bỏ các lằn ranh đỏ với Ukraine?"
predicted_topic = predict_title(title)
print(f"Predicted topic: {predicted_topic}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
Predicted topic: Thế giới


In [49]:
from flask import Flask, request, jsonify

In [50]:
# Create the Flask application object on which the /predict route is registered
app = Flask(__name__)

In [51]:
@app.route('/predict', methods=['POST'])
def predict():
    """Classify the text sent in the JSON body of a POST request.

    Expects a JSON object of the form ``{"content": "<text>"}``.

    Returns:
        ``{"predicted_topic": <label>}`` on success, or a 400 response with
        ``{"error": <message>}`` when the body is not a JSON object holding a
        string ``content`` field.
    """
    # silent=True yields None instead of raising on a missing or invalid JSON body,
    # so every malformed request ends up in the explicit 400 branch below.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict) or not isinstance(payload.get('content'), str):
        return jsonify({'error': "Request body must be a JSON object with a string 'content' field"}), 400

    predicted_topic = predict_title(payload['content'])
    return jsonify({'predicted_topic': predicted_topic})

if __name__ == '__main__':
    # NOTE(review): debug=True turns on Flask's debug mode; keep it for local
    # experimentation only and disable it before exposing the service.
    # use_reloader=False avoids restarting the process from inside the notebook kernel.
    app.run(debug=True, use_reloader=False)

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step


INFO:werkzeug:127.0.0.1 - - [08/Jun/2024 22:59:10] "POST /predict HTTP/1.1" 200 -


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step


INFO:werkzeug:127.0.0.1 - - [08/Jun/2024 23:01:06] "POST /predict HTTP/1.1" 200 -
