In [29]:
import numpy as np
import pandas as pd

In [30]:
# Locations of the Jigsaw toxic-comment CSV files (absolute paths on the
# author's machine).
TRAIN_CSV = "D:/Data Science/Machine Learning/Toxic Comments with LSTM-CNN/Toxic-Comment-with-LSTM-CNN/dataset/jigsaw-toxic-comment-classification-challenge/train.csv"
TEST_CSV = "D:/Data Science/Machine Learning/Toxic Comments with LSTM-CNN/Toxic-Comment-with-LSTM-CNN/dataset/jigsaw-toxic-comment-classification-challenge/test.csv"

# Load both splits; every missing cell is replaced by a single space.
toxicWordsTrain = pd.read_csv(TRAIN_CSV).fillna(' ')
toxicWordsTest = pd.read_csv(TEST_CSV).fillna(' ')

# The six label columns, in the column order used for y_train.
list_classes = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

# Raw comment text for each split, plus the training label matrix.
x_train = toxicWordsTrain["comment_text"]
x_test = toxicWordsTest["comment_text"]
y_train = toxicWordsTrain[list_classes].values

In [31]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Word-level tokenizer: lower-cases the text, strips the listed punctuation
# and whitespace characters, splits on spaces, and keeps the full vocabulary
# (num_words=None).
# NOTE(review): the vocabulary is rebuilt from train.csv on every run; it
# presumably has to match the one used when the saved model was trained —
# confirm.
tokenizer = Tokenizer(
    num_words=None,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=" ",
    char_level=False,
)

# The vocabulary is learned from the training comments only; the same
# mapping is then applied to both splits.
tokenizer.fit_on_texts(list(x_train))
word_index = tokenizer.word_index
tokenized_train = tokenizer.texts_to_sequences(x_train)
tokenized_test = tokenizer.texts_to_sequences(x_test)

# Vocabulary size and length statistics of the tokenized training comments.
vocab_size = len(word_index)
train_lengths = [len(seq) for seq in tokenized_train]
longest = max(train_lengths)
average = np.mean(train_lengths)
stdev = np.std(train_lengths)

# Fixed sequence length: mean plus three standard deviations, truncated to int.
max_len = int(average + 3 * stdev)


In [32]:
# Bring every tokenized comment to max_len ids; both padding and truncation
# are applied at the end of the sequence ('post').
pad_options = dict(maxlen=max_len, padding='post', truncating='post')
processed_X_train = pad_sequences(tokenized_train, **pad_options)
processed_X_test = pad_sequences(tokenized_test, **pad_options)

In [33]:
# Show the shapes of the padded training inputs and of the label matrix,
# one per line.
print(processed_X_train.shape, y_train.shape, sep="\n")

(159571, 371)
(159571, 6)


In [34]:
import tensorflow as tf
# Saved Keras model file (absolute path on the author's machine).
MODEL_PATH = "D:/Data Science/Machine Learning/Toxic Comments with LSTM-CNN/Toxic-Comment-with-LSTM-CNN/model.best.hdf5"
model = tf.keras.models.load_model(MODEL_PATH)



In [35]:
import gradio as gr

In [38]:
from flask import Flask, request, jsonify
import nest_asyncio

# NOTE(review): nest_asyncio.apply() presumably patches asyncio so that an
# already-running event loop (such as the notebook kernel's) can be
# re-entered — confirm it is still needed by whatever serves `app`.
nest_asyncio.apply()

# Flask application object; the /predict route is registered on it.
app = Flask(__name__)

@app.route('/predict', methods=['POST'])
def predict():
    """
    Score the comment sent in the JSON body of a POST request.

    The body must be a JSON object with a string "text" field.

    Returns:
        A JSON response holding the dictionary produced by
        ``toxicity_level``; or a JSON error object with HTTP status 400 when
        the body is not a JSON object, has no "text" field, or "text" is not
        a string.
    """
    # silent=True makes get_json() return None instead of raising on a wrong
    # content type or malformed JSON, so every bad request gets the same
    # JSON error format below.
    data = request.get_json(silent=True)

    # A body such as `null` or `[1, 2]` is valid JSON but not an object;
    # a membership test on None raises TypeError and would surface as a 500,
    # so reject anything that is not a dict explicitly.
    if not isinstance(data, dict) or 'text' not in data:
        return jsonify({'error': 'Missing "text" field'}), 400

    text = data['text']

    # The tokenizer expects text; a number or null here would fail inside it.
    if not isinstance(text, str):
        return jsonify({'error': '"text" must be a string'}), 400

    # Run the prediction and return the result as JSON.
    return jsonify(toxicity_level(text))

def toxicity_level(string):
    """
    Score one comment against the six toxicity categories.

    The text is tokenized with the module-level ``tokenizer``, padded or
    truncated at the end to ``max_len`` ids, and passed to ``model``.

    Args:
        string: The comment text to score.

    Returns:
        dict: Display label -> the model's output for that position,
        formatted as a whole-number percentage string (e.g. "87%").
    """
    # The tokenizer and the model both work on batches, so the single
    # comment is wrapped in a one-element list.
    token_ids = tokenizer.texts_to_sequences([string])
    model_input = pad_sequences(token_ids, maxlen=max_len, padding='post', truncating='post')

    # Keep only the first (and only) row of the batch output.
    scores = model.predict(model_input)[0]

    # Build the result: one formatted percentage per label, in label order.
    labels = ['Toxic', 'Severe Toxic', 'Obscene', 'Threat', 'Insult', 'Identity Hate']
    result = {}
    for label, prob in zip(labels, scores):
        result[label] = f"{prob:.0%}"
    return result


In [None]:
def _label_confidences(text):
    """
    Adapt ``toxicity_level`` output to what ``gr.Label`` consumes.

    ``toxicity_level`` returns percentage strings such as "87%", while
    ``gr.Label`` expects a mapping of label -> numeric confidence, so the
    strings are converted back to floats here.

    Args:
        text: Comment text entered in the UI.

    Returns:
        dict: label -> float (the percentage divided by 100; resolution is
        one percent because that is all the formatted strings carry).
    """
    confidences = {}
    for label, value in toxicity_level(text).items():
        if isinstance(value, str):
            value = float(value.rstrip('%')) / 100
        confidences[label] = float(value)
    return confidences


# Gradio UI: a two-line text box in, a label widget showing up to six
# categories out.
interface = gr.Interface(
    fn=_label_confidences,
    inputs=gr.Textbox(lines=2, placeholder="Nhập một chuỗi để kiểm tra mức độ độc hại..."),
    outputs=gr.Label(num_top_classes=6),
    title="Toxicity Comment Prediction",
    description="Nhập một bình luận và kiểm tra các mức độ độc hại theo các danh mục khác nhau."
)