In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from deep_translator import GoogleTranslator
import spacy
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Small English spaCy pipeline; preprocess_text uses it to tokenise and lemmatise text.
nlp = spacy.load("en_core_web_sm")

# Only the first 100,000 rows are used. Passing nrows lets pandas stop parsing at
# that point instead of loading the entire file and discarding the rest with head().
data = pd.read_csv("data/train_1M.csv", nrows=100000)
# Keep a single row per distinct comment text.
data = data.drop_duplicates(subset="comment_text")

def preprocess_text(text):
    """Reduce ``text`` to a space-separated string of lemmas.

    The text is run through the module-level spaCy pipeline ``nlp`` with the
    "parser", "tagger", "ner" and "textcat" components disabled. Tokens
    flagged as punctuation, stop words or digits are dropped; the lemma of
    every remaining token is kept, in document order.

    Args:
        text: The string to normalise.

    Returns:
        The surviving lemmas joined by single spaces (an empty string when
        no token survives the filter).
    """
    # NOTE(review): lemma quality may depend on the disabled "tagger" component
    # in some spaCy versions — confirm against the installed version.
    skipped_components = ["parser", "tagger", "ner", "textcat"]

    lemmas = []
    for tok in nlp(text, disable=skipped_components):
        # Skip anything that carries no lexical signal for the classifier.
        if tok.is_punct or tok.is_stop or tok.is_digit:
            continue
        lemmas.append(tok.lemma_)

    return " ".join(lemmas)

# Replace missing comments with "" and normalise every comment with
# preprocess_text, overwriting the raw text column.
data["comment_text"] = (
    data["comment_text"]
    .fillna("")
    .map(preprocess_text)
)

def classify_toxicity(toxicity_score, threshold=0.5):
    """Map a toxicity score to a sentiment label.

    Args:
        toxicity_score: Numeric score to classify.
        threshold: Cut-off value; only scores strictly above it count as
            negative.

    Returns:
        'negative' when ``toxicity_score > threshold``, otherwise 'positive'
        (so a score exactly equal to the threshold is 'positive').
    """
    return 'negative' if toxicity_score > threshold else 'positive'


# Derive the training label from each row's 'toxicity' score using
# classify_toxicity with its default threshold. Labels are kept as the strings
# 'negative' / 'positive'; no integer encoding is applied.
data['sentiment'] = data['toxicity'].map(classify_toxicity)

def _fitted_sentiment_model():
    """Return the ``(vectorizer, classifier)`` pair fitted on ``data``.

    The TF-IDF vectorizer and the Multinomial Naive Bayes classifier are
    fitted on ``data["comment_text"]`` / ``data["sentiment"]`` the first time
    this is called and then reused, so repeated analyses do not re-train on
    the whole corpus. The cached pair lives on the function object; if
    ``data`` is changed afterwards, re-run this definition (or set
    ``_fitted_sentiment_model._fitted = None``) to force a refit.
    """
    fitted = getattr(_fitted_sentiment_model, "_fitted", None)
    if fitted is None:
        vectorizer = TfidfVectorizer()
        classifier = MultinomialNB()
        train_vectors = vectorizer.fit_transform(data["comment_text"])
        classifier.fit(train_vectors, data["sentiment"])
        fitted = (vectorizer, classifier)
        _fitted_sentiment_model._fitted = fitted
    return fitted


def analyze_comment(comment):
    """Classify one comment and return a plain-text report.

    The comment is translated to English with GoogleTranslator (source
    language auto-detected), normalised with ``preprocess_text``, vectorised
    with the TF-IDF vectorizer fitted on ``data`` and classified by the
    Naive Bayes model trained on ``data["sentiment"]``.

    The "Percentages" section is NOT specific to the comment: each value is
    the mean of a column of ``data`` multiplied by 100, and "Total Percentage"
    is the mean of those values.

    Args:
        comment: The comment text to analyse.

    Returns:
        A multi-line string: a "Percentages:" header, one
        "<column>: <value>%" line per selected column, a "Total Percentage"
        line and a final "Sentiment: <label>" line.
    """
    comment = GoogleTranslator(source='auto', target='en').translate(comment)
    comment = preprocess_text(comment)

    # Training is independent of the comment, so reuse the fitted model
    # instead of re-fitting on the full corpus for every call.
    vectorizer, classifier = _fitted_sentiment_model()
    sentiment = classifier.predict(vectorizer.transform([comment]))[0]

    # NOTE(review): columns are selected by position (14..19); this silently
    # picks different columns if the CSV layout changes — confirm the layout.
    percentages = {
        column: data[column].mean() * 100
        for column in data.columns[14:20]
    }
    total_percentage = np.mean(list(percentages.values()))

    report_lines = ["Percentages:"]
    report_lines.extend(
        f"{column}: {percentage:.2f}%"
        for column, percentage in percentages.items()
    )
    report_lines.append(f"Total Percentage: {total_percentage:.2f}%")
    report_lines.append(f"Sentiment: {sentiment}")

    return "\n".join(report_lines)

# Run the pipeline once on a sample English insult and print the report.
# NOTE(review): the recorded output labels this comment "positive" — worth
# checking the label balance and threshold before trusting the classifier.
comment = "You are ugly"
result = analyze_comment(comment)
print(result)



Percentages:
severe_toxicity: 0.62%
obscene: 2.58%
sexual_explicit: 0.83%
identity_attack: 1.87%
insult: 11.58%
threat: 0.84%
Total Percentage: 3.05%
Sentiment: positive


In [2]:
# import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.naive_bayes import MultinomialNB
# from deep_translator import GoogleTranslator
# import spacy
# import numpy as np

# nlp = spacy.load("en_core_web_sm")

# # Load the training data
# data = pd.read_csv("data/train_1M.csv")
# data = data.head(5000)
# data = data.drop_duplicates(subset="comment_text")

# def preprocess_text(text):
#     doc = nlp(text)
#     simplified_text = " ".join([token.lemma_ for token in doc if not token.is_punct and not token.is_stop and not token.is_digit])

#     return simplified_text

# data["comment_text"] = data["comment_text"].fillna("").apply(preprocess_text)

# def classify_toxicity(toxicity_score, threshold=0.5):
#     if toxicity_score > threshold:
#         return 'negative'
#     else:
#         return 'positive'

# data['sentiment'] = data['toxicity'].apply(classify_toxicity)
# data = data.dropna(subset=["sentiment"])

# # Train the model and fit the TF-IDF vectorizer
# vectorizer = TfidfVectorizer()
# classifier = MultinomialNB()

# X_train_vectors = vectorizer.fit_transform(data["comment_text"])
# classifier.fit(X_train_vectors, data["sentiment"])

# # Save the model, the TF-IDF vectorizer and the percentages
# model_path = "sentiment_model.pkl"
# vectorizer_path = "tfidf_vectorizer.pkl"

# import joblib

# joblib.dump(classifier, model_path)
# joblib.dump(vectorizer, vectorizer_path)

# percentages = {}
# for column in data.columns[7:16]:
#     percentages[column] = data[column].mean() * 100

# joblib.dump(percentages, "percentages.pkl")

# # Load the model, the TF-IDF vectorizer and the percentages when analysing a new comment
# def analyze_comment(comment):
#     comment = GoogleTranslator(source='auto', target='en').translate(comment)
#     comment = preprocess_text(comment)

#     loaded_model = joblib.load(model_path)
#     loaded_vectorizer = joblib.load(vectorizer_path)

#     comment_vector = loaded_vectorizer.transform([comment])
#     prediction = loaded_model.predict(comment_vector)

#     loaded_percentages = joblib.load("percentages.pkl")
#     total_percentage = np.mean(list(loaded_percentages.values()))

#     sentiment = prediction[0]

#     result = f"Percentages:\n"
#     for column, percentage in loaded_percentages.items():
#         result += f"{column}: {percentage:.2f}%\n"
#     result += f"Total Percentage: {total_percentage:.2f}%\n"
#     result += f"Sentiment: {sentiment}"

#     return result
