In [1]:
import pandas as pd
from tensorflow.keras.layers import TextVectorization

# Load the comment corpus that the vectorizer's vocabulary is fitted on.
df = pd.read_csv('comments.csv')
X = df['comment_text']

MAX_FEATURES = 200000  # upper bound on the vocabulary size

# Turn raw text into integer token ids, with every output sequence
# padded or truncated to 1800 positions.
vectorizer = TextVectorization(
    max_tokens=MAX_FEATURES,
    output_sequence_length=1800,
    output_mode='int',
)

# Build the vocabulary from the comment texts.
vectorizer.adapt(X.values)

In [7]:
from flask import Flask, request, jsonify
import re
from flask_cors import CORS
import tensorflow as tf
import numpy as np

# Flask application that serves the /detect endpoint defined below.
app = Flask(__name__)
# Enable cross-origin requests using flask-cors' default settings.
CORS(app)

# Load the trained Keras model once at import time so every request reuses it.
model = tf.keras.models.load_model('toxicity.h5')

def preprocess_text(text):
    """Split one or more comments into a flat list of sentences.

    A sentence boundary is whitespace that follows a '.' or '?', except
    when the preceding characters look like an abbreviation such as
    "e.g." or "Mr." (see the negative look-behinds in the pattern).

    Args:
        text: A single comment string, or an iterable of comment strings.

    Returns:
        list[str]: The sentences of every comment, in input order.
    """
    # A bare string would otherwise be iterated character by character,
    # producing one "sentence" per character instead of per sentence.
    if isinstance(text, str):
        text = [text]

    boundary = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s'
    return [
        sentence
        for comment in text
        for sentence in re.split(boundary, comment)
    ]

@app.route('/detect', methods=['POST'])
def detect():
    """Return the comments that contain at least one sentence scored as toxic.

    Expects a JSON body shaped like
    ``{"comments": [{"ownerUsername": ..., "text": ...}, ...]}``.

    Returns:
        A JSON array of ``{"username": ..., "text": ...}`` objects, one per
        flagged comment. Responds with HTTP 400 and an ``error`` message when
        the body is not a JSON object or has no ``comments`` list.
    """
    # silent=True yields None instead of raising on a missing/invalid body,
    # so malformed requests get a clear 400 rather than a 500.
    data = request.get_json(silent=True)
    if not isinstance(data, dict) or not isinstance(data.get('comments'), list):
        return jsonify({"error": "Expected a JSON object with a 'comments' list."}), 400

    hate_comments = []

    for comment in data['comments']:
        text = comment.get('text') if isinstance(comment, dict) else None
        if not isinstance(text, str) or not text.strip():
            # Nothing to classify; also avoids predicting on an empty batch.
            continue

        # preprocess_text iterates over its argument, so the comment must be
        # wrapped in a list; a bare string would be split into characters.
        sentences = [s for s in preprocess_text([text]) if s.strip()]
        if not sentences:
            continue

        # Vectorize and score all sentences of the comment in a single batch.
        batch = vectorizer(sentences)
        predictions = model.predict(batch, verbose=0)

        # Only output column 0 is thresholded.
        # NOTE(review): assumes column 0 is the overall toxicity score and
        # that 0.5 is the intended cut-off; confirm against the model.
        is_hate_speech = bool(np.any(predictions[:, 0] > 0.5))

        if is_hate_speech:
            hate_comments.append({
                "username": comment.get('ownerUsername'),
                "text": text
            })

    return jsonify(hate_comments)

# Start Flask's built-in server when this code runs as the main module.
if __name__ == '__main__':
    # host='0.0.0.0' listens on all network interfaces; port 5050.
    app.run(host='0.0.0.0', port=5050)




 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5050
 * Running on http://192.168.29.96:5050
INFO:werkzeug:Press CTRL+C to quit
