In [1]:
!pip install -q transformers

In [2]:
#Manipulacion de datos
import pandas as pd
import numpy as np

#Visualizacion
import matplotlib.pyplot as plt
import seaborn as sn

# Preprocesamiento
import spacy
from spacy.tokens import Doc
import re
from tqdm import tqdm
from typing import List
from transformers import AutoTokenizer
import torch

# Modelos
from transformers import AutoModelForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
import torch.nn as nn
from torch.optim import Adam
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, roc_auc_score
from torch.utils.data import TensorDataset, DataLoader


In [4]:
#Cargar datos
data= pd.read_csv('youtoxic_english_1000.csv')

In [5]:
data.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,987,988,989,990,991,992,993,994,995,996
CommentId,Ugg2KwwX0V8-aXgCoAEC,Ugg2s5AzSPioEXgCoAEC,Ugg3dWTOxryFfHgCoAEC,Ugg7Gd006w1MPngCoAEC,Ugg8FfTbbNF8IngCoAEC,Ugg9a6FtoXdxmXgCoAEC,Ugga9KzkNDGvlXgCoAEC,UggBlIXoph7p-3gCoAEC,UggD1aYSn7KOR3gCoAEC,UggGm8a1fu8brngCoAEC,...,Ugh07S5UNzXDWngCoAEC,Ughbo_MZ42mpJ3gCoAEC,Ughd_2zFh-O-93gCoAEC,UghGyVemEvDEqXgCoAEC,UghHMimNd9cYxngCoAEC,Ugi5ADt10EdDz3gCoAEC,Ugifh2DMhBbDkHgCoAEC,Ugj_plbGBjjzYXgCoAEC,Ugj0bah1De8xy3gCoAEC,UgjBJKQSoQMQ6ngCoAEC
VideoId,04kJtp6pVXI,04kJtp6pVXI,04kJtp6pVXI,04kJtp6pVXI,04kJtp6pVXI,04kJtp6pVXI,04kJtp6pVXI,04kJtp6pVXI,04kJtp6pVXI,04kJtp6pVXI,...,XRuCW80L9mA,XRuCW80L9mA,XRuCW80L9mA,XRuCW80L9mA,XRuCW80L9mA,XRuCW80L9mA,XRuCW80L9mA,XRuCW80L9mA,XRuCW80L9mA,XRuCW80L9mA
Text,If only people would just take a step back and...,Law enforcement is not trained to shoot to app...,\nDont you reckon them 'black lives matter' ba...,There are a very large number of people who do...,"The Arab dude is absolutely right, he should h...",here people his facebook is https://www.facebo...,"Check out this you tube post. ""Black man goes ...",I would LOVE to see this pussy go to Staten Is...,I agree with the protestor.,mike browns father was made to say that boooshit,...,"Remember..Michael Brown was just a young, teen...",What point are you trying to make? Is it that...,There are a lot of disgusting people in the co...,"Whites should move-out of Ferguson, let's see ...",I just found this channel and its one of my fa...,I remember that they sent in the national defe...,Stats don`t represent the problem. Race baitin...,The quote from the mother... Wow that hit hard...,this video is so racist,"God, the narrator has such an annoying lisp."
IsToxic,False,True,True,False,False,True,True,True,False,True,...,False,False,False,True,False,False,True,False,False,False
IsAbusive,False,True,True,False,False,False,False,True,False,True,...,False,False,False,False,False,False,False,False,False,False
IsThreat,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
IsProvocative,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
IsObscene,False,False,True,False,False,False,False,True,False,True,...,False,False,False,False,False,False,False,False,False,False
IsHatespeech,False,False,False,False,False,True,True,True,False,False,...,False,False,False,True,False,False,True,False,False,False
IsRacist,False,False,False,False,False,False,True,True,False,False,...,False,False,False,True,False,False,True,False,False,False


# **ETIQUETA**

In [5]:
# Consolidar las etiquetas en una sola columna binaria 'IsHate'
# Creación de una nueva columna 'IsHate'
hate_columns = ['IsToxic', 'IsAbusive', 'IsThreat', 'IsProvocative', 'IsObscene', 'IsHatespeech',
                'IsRacist', 'IsNationalist', 'IsSexist', 'IsHomophobic', 'IsReligiousHate', 'IsRadicalism']

# Un comentario es considerado de odio si cumple cualquiera de las condiciones anteriores
data['IsHate'] = data[hate_columns].any(axis=1)

# Ahora seleccionamos solo las columnas necesarias, es decir, 'Text' y 'IsHate'
data_simplified = data[['Text', 'IsHate']]

# Verificamos la distribución de las clases
distribution = data_simplified['IsHate'].value_counts()

data_simplified.head(), distribution

(                                                Text  IsHate
 0  If only people would just take a step back and...   False
 1  Law enforcement is not trained to shoot to app...    True
 2  \nDont you reckon them 'black lives matter' ba...    True
 3  There are a very large number of people who do...   False
 4  The Arab dude is absolutely right, he should h...   False,
 False    538
 True     462
 Name: IsHate, dtype: int64)

In [6]:
# Dividir los datos en conjuntos de entrenamiento y prueba
train_data, test_data = train_test_split(data_simplified, test_size=0.2, random_state=42)

# **PREPROCESAMIENTO**

In [7]:
# Tokenizador
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


In [8]:
# Codificación de los datos de entrenamiento
train_encoding = tokenizer(train_data['Text'].tolist(), padding=True, truncation=True, max_length=128, return_tensors="pt")
train_labels = torch.tensor(train_data['IsHate'].values, dtype=torch.long)

In [9]:
# Codificación de los datos de prueba
test_encoding = tokenizer(test_data['Text'].tolist(), padding=True, truncation=True, max_length=128, return_tensors="pt")
test_labels = torch.tensor(test_data['IsHate'].values, dtype=torch.long)

# **MODELO**

In [10]:
# Modelo y optimizador con regularización L2 (weight_decay)LO VAMOS AJUSTANDO
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)  # Ajusta el valor de weight_decay según sea necesario
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_data))

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Datasets y Dataloaders
train_dataset = TensorDataset(train_encoding.input_ids, train_encoding.attention_mask, train_labels)
test_dataset = TensorDataset(test_encoding.input_ids, test_encoding.attention_mask, test_labels)

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [12]:
# Definir la función de pérdida y el dispositivo (GPU o CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
loss_fn = nn.CrossEntropyLoss()


In [13]:
# Número de épocas y entrenamiento
num_epochs = 2

for epoch in range(num_epochs):
    # Entrenamiento
    model.train()
    total_loss = 0.0
    correct_predictions = 0
    total_samples = 0

    for batch in train_dataloader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = loss_fn(logits, labels)
        total_loss += loss.item()

        # Calcular la precisión en el conjunto de entrenamiento
        preds = torch.argmax(logits, dim=1)
        correct_predictions += torch.sum(preds == labels).item()
        total_samples += len(labels)

        loss.backward()
        optimizer.step()
        scheduler.step()

    # Calcular la pérdida promedio y la precisión en el conjunto de entrenamiento
    average_loss = total_loss / len(train_dataloader)
    train_accuracy = correct_predictions / total_samples

    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {average_loss}, Train Accuracy: {train_accuracy}")

Epoch 1/2, Loss: 0.5910302184522152, Train Accuracy: 0.6775
Epoch 2/2, Loss: 0.3781577431410551, Train Accuracy: 0.85375


# **EVALUACIÓN DEL MODELO**

In [14]:
# Evaluación en el conjunto de prueba
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())


In [16]:
# Calcular métricas de evaluación
precision = precision_score(all_labels, all_preds)
recall = recall_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds)
test_accuracy = accuracy_score(all_labels, all_preds)
conf_matrix = confusion_matrix(all_labels, all_preds)
roc_auc = roc_auc_score(all_labels, all_preds)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"Accuracy: {test_accuracy}")
print(f"Confusion Matrix:\n{conf_matrix}")
print(f"ROC AUC Score: {roc_auc}")


Precision: 0.8556701030927835
Recall: 0.7757009345794392
F1 Score: 0.8137254901960783
Accuracy: 0.81
Confusion Matrix:
[[79 14]
 [24 83]]
ROC AUC Score: 0.8125816500854185


In [17]:
overfitting = train_accuracy - test_accuracy
print(f'Overfitting: {overfitting}')

Overfitting: 0.043749999999999956


# **GUARDAR EL MODELO**

In [None]:
# Save the model
model.save("model")

# **CARGAR EL MODELO**

In [None]:
#Para cargar el modelo y el tokenizador posteriormente
loaded_model = DistilBertForSequenceClassification.from_pretrained("model_directory")
loaded_tokenizer = DistilBertTokenizer.from_pretrained("model_directory")