In [46]:
import pandas as pd

df = pd.read_csv("../data/youtoxic_english_1000.csv")
df = df.drop(columns=["IsHomophobic", "IsRadicalism", "IsSexist"])

df.head(5)

Unnamed: 0,CommentId,VideoId,Text,IsToxic,IsAbusive,IsThreat,IsProvocative,IsObscene,IsHatespeech,IsRacist,IsNationalist,IsReligiousHate
0,Ugg2KwwX0V8-aXgCoAEC,04kJtp6pVXI,If only people would just take a step back and...,False,False,False,False,False,False,False,False,False
1,Ugg2s5AzSPioEXgCoAEC,04kJtp6pVXI,Law enforcement is not trained to shoot to app...,True,True,False,False,False,False,False,False,False
2,Ugg3dWTOxryFfHgCoAEC,04kJtp6pVXI,\nDont you reckon them 'black lives matter' ba...,True,True,False,False,True,False,False,False,False
3,Ugg7Gd006w1MPngCoAEC,04kJtp6pVXI,There are a very large number of people who do...,False,False,False,False,False,False,False,False,False
4,Ugg8FfTbbNF8IngCoAEC,04kJtp6pVXI,"The Arab dude is absolutely right, he should h...",False,False,False,False,False,False,False,False,False


In [47]:
# importamos las dependencias
import pandas as pd

import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

[nltk_data] Downloading package punkt to /Users/trabajo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/trabajo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/trabajo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [48]:
# cargamos los datos 
df = df[['Text', 'IsToxic']]  # solo nos interesa esta columna como target

# preprocesamiento del texto
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def limpiar_texto(texto):
    tokens = nltk.word_tokenize(str(texto).lower())
    tokens = [t for t in tokens if t.isalpha()]  # elimina puntuación y números
    tokens = [t for t in tokens if t not in stop_words]
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return " ".join(tokens)

df['texto_limpio'] = df['Text'].apply(limpiar_texto)

# vectorización
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['texto_limpio'])

# variable objetivo
y = df['IsToxic'].astype(int)

# división en train y test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# entrenamiento del modelo
modelo = MultinomialNB()
modelo.fit(X_train, y_train)

# evaluación
y_pred = modelo.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# función para pasar de 0 y 1 a True y False
def predict_true_false(prob):
    if prob >= 0.5:
        return True
    else:
        return False

# Función para predecir nuevos textos
def predecir_toxicidad(texto):
    texto_limpio = limpiar_texto(texto)
    texto_vect = vectorizer.transform([texto_limpio])
    prob = modelo.predict_proba(texto_vect) # prob nos devuelve las probabilidades de cada clase, por ejemplo: [[0.2, 0.8]] donde 0.2 es la probabilidad de que no sea tóxico y 0.8 es la probabilidad de que sí lo sea.
    print(f"Probabilidades de cada clase, [not-toxic, toxic]: {prob}")
    prob_toxico = prob[0][1]  # accedemos a la probabilidad de la clase tóxica mediante índices, donde 0 es la primera fila (la única en este caso) y 1 es la segunda columna (la probabilidad de ser tóxico). 
    print(f"Probabilidad de toxicidad: {prob_toxico}")
    prediccion_final = predict_true_false(prob_toxico)
    return {"prediccion": prediccion_final, "probabilidad_toxico": round(prob_toxico, 3)}


[[80 13]
 [50 57]]
              precision    recall  f1-score   support

           0       0.62      0.86      0.72        93
           1       0.81      0.53      0.64       107

    accuracy                           0.69       200
   macro avg       0.71      0.70      0.68       200
weighted avg       0.72      0.69      0.68       200



In [49]:
print(predecir_toxicidad("If only people would just take a step back and not make this case about them, because it wasn't about anyone except the two people in that situation.  To lump yourself into this mess and take matters into your own hands makes these kinds of protests selfish and without rational thought and investigation.  The guy in this video is heavily emotional and hyped up and wants to be heard, and when he gets heard he just presses more and more.  He was never out to have a reasonable discussion.  Kudos to the Smerconish for keeping level the whole time and letting Masri make himself out to be a fool.  How dare he and those that tore that city down in protest make this about themselves and to dishonor the entire incident with their own hate.  By the way, since when did police brutality become an epidemic?  I wish everyone would just stop pretending like they were there and they knew EXACTLY what was going on, because there's no measurable amount of people that honestly witnessed this incident, so none of us have a clue on which way this whole issue should have swung.  The grand jury were the most informed, we have to trust the majority rule was the right course of action and let it be.  Also, thank you to the 99.999% of police officers in America that actually serve & protect, even if you're a bit of a jerk when you pull me over, I respect your job and know that someone has to do it and that many people are going to pout about being held accountable to their actions.  People hate police until they need an officer or two around in an emergency."))

Probabilidades de cada clase, [not-toxic, toxic]: [[0.79547868 0.20452132]]
Probabilidad de toxicidad: 0.20452132424613173
{'prediccion': False, 'probabilidad_toxico': np.float64(0.205)}
