In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
import torch
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from tqdm import tqdm
import os
from dotenv import load_dotenv
from googleapiclient.discovery import build

# Load environment variables (python-dotenv reads them from a .env file, if one exists)
load_dotenv()

# Configure the YouTube Data API v3 client.
# API_KEY is read from the environment; os.getenv returns None when it is unset.
API_KEY = os.getenv('API_KEY')
youtube = build('youtube', 'v3', developerKey=API_KEY)

# Load the dataset (path is relative to the current working directory)
df = pd.read_csv('youtoxic_english_1000.csv')

# Prepare the data: comment text is the input, the IsToxic column cast to int is the target
# (presumably a boolean flag that becomes 0/1 -- confirm against the CSV)
X = df['Text']
y = df['IsToxic'].astype(int)

# Split the data into training (80%) and test (20%) sets; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Load the DistilBERT tokenizer and the pretrained model with a 2-label classification head
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# Tokenize the data
def tokenize_data(texts):
    """Encode ``texts`` with the module-level DistilBERT ``tokenizer``.

    Args:
        texts: Input handed straight to the tokenizer (the script passes
            lists of comment strings).

    Returns:
        The tokenizer output as PyTorch tensors, with padding enabled and
        sequences truncated to at most 128 tokens.
    """
    encode_options = {
        'padding': True,
        'truncation': True,
        'max_length': 128,
        'return_tensors': "pt",
    }
    return tokenizer(texts, **encode_options)

def _build_tensor_dataset(encodings, labels):
    """Bundle token ids, attention masks and integer labels into a TensorDataset."""
    label_tensor = torch.tensor(labels.values, dtype=torch.long)
    return TensorDataset(
        encodings['input_ids'],
        encodings['attention_mask'],
        label_tensor,
    )


# Tokenize both splits (the tokenizer is given plain Python lists of strings)
train_encodings = tokenize_data(X_train.tolist())
test_encodings = tokenize_data(X_test.tolist())

# Create the PyTorch datasets
train_dataset = _build_tensor_dataset(train_encodings, y_train)
test_dataset = _build_tensor_dataset(test_encodings, y_test)

# Create the dataloaders: only the training data is shuffled
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=16)

# Train the model
# Pick the GPU when available and move the model there *before* building the
# optimizer, as recommended by the PyTorch optimizer documentation.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Configure the optimizer.
# torch.optim.AdamW is used instead of transformers.AdamW, which is deprecated
# in favour of the PyTorch implementation. eps and weight_decay are set
# explicitly to the defaults of the deprecated class so the hyperparameters
# stay the same (NOTE(review): confirm against the transformers docs).
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

num_epochs = 3
for epoch in range(num_epochs):
    # Enable training-mode behaviour (e.g. dropout) at the start of every epoch
    model.train()
    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
        # Each batch is (input_ids, attention_mask, labels), in TensorDataset order
        input_ids, attention_mask, labels = (t.to(device) for t in batch)

        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        # Passing labels makes the model compute the classification loss itself
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Evaluate the model on the test split
model.eval()
predictions, true_labels = [], []
with torch.no_grad():  # no gradients are needed for inference
    for input_ids, attention_mask, labels in tqdm(test_loader, desc="Evaluating"):
        logits = model(
            input_ids=input_ids.to(device),
            attention_mask=attention_mask.to(device),
        ).logits
        # The predicted class is the index of the largest logit in each row
        predictions.extend(logits.argmax(dim=1).cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Report overall accuracy followed by the per-class metrics
accuracy = accuracy_score(true_labels, predictions)
report = classification_report(true_labels, predictions)
print("Accuracy:", accuracy)
print("\nClassification Report:")
print(report)

# Save the model and its tokenizer into the same directory
output_dir = 'models/distilbert_toxic_classifier'
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)

# Predict whether a comment is hateful
def predict_hate_speech(text):
    """Classify ``text`` with the module-level ``model`` and ``tokenizer``.

    Args:
        text: The comment to classify (a single string in this script).

    Returns:
        "Comentario odioso" when the model predicts class 1, otherwise
        "Comentario no odioso".
    """
    encoded = tokenizer(text, padding=True, truncation=True, max_length=128, return_tensors="pt")
    # Move every encoded tensor to the device the model lives on
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        logits = model(**on_device).logits

    predicted_class = torch.argmax(logits, dim=1).item()
    if predicted_class == 1:
        return "Comentario odioso"
    return "Comentario no odioso"

# Fetch the comments of a YouTube video
def get_video_comments(video_id):
    """Return the text of every top-level comment on a YouTube video.

    Uses the module-level ``youtube`` client and follows ``nextPageToken``
    until the API stops returning one. Only the top-level comment of each
    thread is read.

    Args:
        video_id: The YouTube video id to fetch comment threads for.

    Returns:
        A list with the ``textDisplay`` of each top-level comment, in the
        order the API returned them.
    """
    request_params = {
        'part': "snippet",
        'videoId': video_id,
        'textFormat': "plainText",
        'maxResults': 100,
    }
    collected = []

    while True:
        page = youtube.commentThreads().list(**request_params).execute()
        if not page:
            break

        collected.extend(
            thread['snippet']['topLevelComment']['snippet']['textDisplay']
            for thread in page['items']
        )

        if 'nextPageToken' not in page:
            break
        # Ask for the following page on the next iteration
        request_params['pageToken'] = page['nextPageToken']

    return collected

# Helper: pull the video id out of the common YouTube URL shapes
def extract_video_id(video_url):
    """Extract the video id from a YouTube URL.

    Supported shapes: ``watch?v=<id>`` (with any extra query parameters, in
    any order), ``youtu.be/<id>`` and ``/embed/<id>``, ``/shorts/<id>``,
    ``/live/<id>``, ``/v/<id>``. URLs without a scheme are accepted too.

    Args:
        video_url: The URL of the video.

    Returns:
        The video id as a string.

    Raises:
        ValueError: If no video id can be found in ``video_url``.
    """
    from urllib.parse import parse_qs, urlparse

    url = video_url.strip()
    if '//' not in url:
        # Without a scheme urlparse would treat the host as part of the path
        url = '//' + url
    parsed = urlparse(url)

    # parse_qs drops blank values, so "v=" alone does not count as an id
    ids_from_query = parse_qs(parsed.query).get('v')
    if ids_from_query:
        return ids_from_query[0]

    segments = [segment for segment in parsed.path.split('/') if segment]
    host = (parsed.hostname or '').lower()
    if host in ('youtu.be', 'www.youtu.be'):
        if segments:
            return segments[0]
    elif len(segments) >= 2 and segments[0] in ('embed', 'shorts', 'live', 'v'):
        return segments[1]

    raise ValueError(f"Could not extract a YouTube video id from URL: {video_url!r}")


# Main entry point: analyse the comments of a video
def analyze_video_comments(video_url):
    """Download a video's comments and classify each one.

    Args:
        video_url: URL of the YouTube video to analyse.

    Returns:
        A DataFrame with the columns ``comment`` and ``prediction``, one row
        per comment. The columns are present even when the video has no
        comments.

    Raises:
        ValueError: If ``video_url`` does not contain a video id.
    """
    video_id = extract_video_id(video_url)
    comments = get_video_comments(video_id)

    results = [
        {'comment': comment, 'prediction': predict_hate_speech(comment)}
        for comment in comments
    ]

    # Explicit columns keep the frame usable when there are no comments
    return pd.DataFrame(results, columns=['comment', 'prediction'])

# Example usage
video_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"  # Replace with the URL of the video you want to analyse
results_df = analyze_video_comments(video_url)
print(results_df)

# Show statistics
total_comments = len(results_df)
if total_comments:
    hate_comments = int((results_df['prediction'] == "Comentario odioso").sum())
    hate_percentage = (hate_comments / total_comments) * 100
else:
    # A video without comments would otherwise hit a missing 'prediction'
    # column and a division by zero
    hate_comments = 0
    hate_percentage = 0.0
print(f"Total de comentarios analizados: {total_comments}")
print(f"Comentarios odiosos detectados: {hate_comments}")
print(f"Porcentaje de comentarios odiosos: {hate_percentage:.2f}%")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
