In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, SpatialDropout1D, Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.utils import to_categorical
from gensim.models import KeyedVectors
import requests

In [2]:
# Download the NLTK resource needed by word_tokenize.
nltk.download('punkt')

# Fetch the contents of stop_words.py from GitHub.
url = 'https://raw.githubusercontent.com/trungtv/vivi_spacy/master/vi/vi/stop_words.py'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx answer instead of later parsing an error page
# as if it were the stopword file.
response.raise_for_status()
stop_words_content = response.text

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\AN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Parse the downloaded stop_words.py text to extract the stopword list.
# The words are taken from between the first and the last triple-quote
# delimiter, one entry per line.
start_index = stop_words_content.find('"""')
end_index = stop_words_content.rfind('"""')
# Without this check a missing delimiter (find() == -1) would silently slice
# an arbitrary part of the text and produce a garbage stopword set.
if start_index == -1 or end_index < start_index + 3:
    raise ValueError(
        'Could not locate a triple-quoted stopword block in the downloaded text'
    )
start_index += 3  # skip past the opening delimiter itself
stop_words_list = stop_words_content[start_index:end_index].split("\n")
# Build the stopword set: strip surrounding whitespace and drop blank lines.
stop_words = set(word.strip() for word in stop_words_list if word.strip())

In [6]:
# Build the stopword set from the parsed lines: trim whitespace and discard
# entries that are empty after trimming.
# NOTE(review): this repeats the assignment already made in the parsing cell.
stop_words = {entry for entry in map(str.strip, stop_words_list) if entry}

In [4]:
# Text preprocessing
def preprocess_text(text):
    """Normalise a piece of text before it is fed to the Keras tokenizer.

    The text is lower-cased and split with NLTK's ``word_tokenize``. Tokens
    found in the module-level ``stop_words`` set are dropped, as are tokens
    that are not purely alphanumeric (``str.isalnum``).

    Args:
        text: The raw input string.

    Returns:
        The surviving tokens joined by single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text.lower()):
        if token in stop_words:
            continue
        if not token.isalnum():
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [5]:
# Load the dataset; columns that are entirely empty are discarded.
df = pd.read_excel('data_xuly.xlsx').dropna(axis=1, how='all')
topics = df['label'].tolist()
# Coerce every cell to a trimmed string before preprocessing.
contents = [str(raw_content).strip() for raw_content in df['content'].tolist()]
processed_contents = list(map(preprocess_text, contents))

In [6]:
# Label preprocessing: map each topic to an integer id, then one-hot encode.
label_encoder = LabelEncoder().fit(topics)
# One output column per distinct label seen during fitting.
num_classes = len(label_encoder.classes_)
y = to_categorical(label_encoder.transform(topics), num_classes=num_classes)

In [7]:
# Tokenization and padding: fit a vocabulary capped at 20000 words on the
# preprocessed texts, then convert them to integer sequences of length 500.
# NOTE(review): predict_title pads to maxlen=200, not 500 - confirm which
# length the saved model actually expects.
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(processed_contents)
X = pad_sequences(tokenizer.texts_to_sequences(processed_contents), maxlen=500)

In [8]:
import tensorflow as tf

In [33]:
# Load the saved Keras model from 'best_model2.h5' (path is relative to the
# current working directory); predict_title uses it for inference.
loaded_model = tf.keras.models.load_model('best_model2.h5')



In [39]:
def predict_title(content, maxlen=200):
    """Predict the topic label for a piece of text.

    The text goes through ``preprocess_text``, is converted to an integer
    sequence with the module-level ``tokenizer``, padded, and passed to
    ``loaded_model``. The highest-scoring class index is mapped back to its
    original label through ``label_encoder``.

    Args:
        content: Raw input text (a string).
        maxlen: Length the token sequence is padded/truncated to. Defaults to
            200, the value previously hard-coded here.
            NOTE(review): the tokenization cell in this notebook pads the
            training data to 500 - confirm which length the saved model
            expects and pass it explicitly if it differs.

    Returns:
        The predicted label as a native Python value (e.g. ``str`` or
        ``int``) so that it can be serialised to JSON directly.
    """
    processed_title = preprocess_text(content)
    sequence = tokenizer.texts_to_sequences([processed_title])
    padded_sequence = pad_sequences(sequence, maxlen=maxlen)
    prediction = loaded_model.predict(padded_sequence)
    predicted_class = np.argmax(prediction, axis=1)
    predicted_label = label_encoder.inverse_transform(predicted_class)[0]
    # inverse_transform yields numpy scalars for non-object dtypes; unwrap
    # them so numeric labels do not break JSON serialisation downstream.
    if isinstance(predicted_label, np.generic):
        predicted_label = predicted_label.item()
    return predicted_label

In [40]:
from flask import Flask, request, jsonify

In [41]:
# Create the Flask application object on which the /predict route is registered.
app = Flask(__name__)

In [42]:
@app.route('/predict', methods=['POST'])
def predict():
    """Handle POST /predict: classify the text sent in the JSON body.

    Expects a JSON object of the form ``{"content": "<text>"}``.

    Returns:
        ``{"predicted_topic": <label>}`` on success, or a JSON error message
        with HTTP status 400 when the body is not a JSON object or has no
        string ``content`` field.
    """
    # silent=True yields None instead of raising on a missing or malformed
    # JSON body, so every bad request is answered with one explicit 400.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict) or not isinstance(payload.get('content'), str):
        return jsonify({'error': "Request body must be a JSON object with a string 'content' field"}), 400

    predicted_topic = predict_title(payload['content'])
    return jsonify({'predicted_topic': predicted_topic})

# Start Flask's built-in development server only when this code is executed
# as the main module (which includes running the cell in a notebook kernel).
# NOTE(review): debug=True turns on Flask's debug mode; keep this for local
# development only and never expose it on a public interface.
# use_reloader=False disables the auto-reloader - presumably because the
# reloader restarts the process, which does not work inside a notebook.
if __name__ == '__main__':
    app.run(debug=True, use_reloader=False)

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step


INFO:werkzeug:127.0.0.1 - - [11/Jun/2024 20:41:04] "POST /predict HTTP/1.1" 200 -


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step


INFO:werkzeug:127.0.0.1 - - [11/Jun/2024 20:41:22] "POST /predict HTTP/1.1" 200 -
