In [13]:
import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib
from googleapiclient.discovery import build
import os
from dotenv import load_dotenv

# Load environment variables.
# NOTE(review): the ImportError recorded after this cell reports that
# `load_dotenv` cannot be imported from the installed `dotenv` package —
# presumably the `python-dotenv` distribution is the one required; confirm
# the environment before running.
load_dotenv()

# YouTube API configuration: the key is read from the API_KEY environment variable
API_KEY = os.getenv('API_KEY')
youtube = build('youtube', 'v3', developerKey=API_KEY)

# Load the dataset
df = pd.read_csv('youtoxic_english_1000.csv')

# Prepare the data: the 'Text' column is the model input, 'IsToxic' the label
X = df['Text']
y = df['IsToxic']

# Split the data into training and test sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Load the English spaCy model
nlp = spacy.load('en_core_web_sm')

# Text-processing helper built on spaCy
def process_text(text):
    """Return *text* as a space-joined string of lowercased lemmas.

    The text is run through the module-level ``nlp`` pipeline; tokens flagged
    as stop words or punctuation are dropped, and the lemma of every
    remaining token is lowercased.
    """
    kept_lemmas = []
    for tok in nlp(text):
        # Skip tokens that carry no content for the bag-of-words features
        if tok.is_stop or tok.is_punct:
            continue
        kept_lemmas.append(tok.lemma_.lower())
    return ' '.join(kept_lemmas)

# Apply the text processing to both splits
X_train_processed = X_train.apply(process_text)
X_test_processed = X_test.apply(process_text)

# Vectorize the text; the vocabulary is fitted on the training split only
# and then reused to transform the test split
vectorizer = CountVectorizer(max_features=5000)
X_train_vectorized = vectorizer.fit_transform(X_train_processed)
X_test_vectorized = vectorizer.transform(X_test_processed)

# Create and train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_vectorized, y_train)

# Evaluate the model on the held-out split
y_pred = rf_model.predict(X_test_vectorized)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Save the model and the vectorizer. The target directory is created first
# so that saving does not fail on a fresh checkout where 'models/' is missing.
os.makedirs('models', exist_ok=True)
joblib.dump(rf_model, 'models/random_forest_model.joblib')
joblib.dump(vectorizer, 'models/vectorizer.joblib')

# Classify a single comment as hateful or not
def predict_hate_speech(text):
    """Return the label string predicted for *text*.

    The text is cleaned with ``process_text``, vectorized with the fitted
    ``vectorizer`` and classified by ``rf_model``. The result is
    "Comentario odioso" when the predicted label is truthy and
    "Comentario no odioso" otherwise.
    """
    features = vectorizer.transform([process_text(text)])
    label = rf_model.predict(features)[0]
    if label:
        return "Comentario odioso"
    return "Comentario no odioso"

# Fetch the top-level comments of a YouTube video
def get_video_comments(video_id, max_comments=None):
    """Return the text of the top-level comments of a YouTube video.

    Pages through ``commentThreads().list`` on the module-level ``youtube``
    client, requesting 100 threads per call and following ``nextPageToken``
    until the API stops returning one or ``max_comments`` is reached.

    Args:
        video_id: ID of the video whose comment threads are listed.
        max_comments: Optional upper bound on the number of comments
            returned. ``None`` (the default) fetches every page, which is
            the previous behavior; a bound limits the number of API calls
            on videos with very many comments.

    Returns:
        A list with the ``textDisplay`` string of each top-level comment.
    """
    comments = []
    # Built once; only 'pageToken' changes between requests
    request_args = {
        'part': "snippet",
        'videoId': video_id,
        'textFormat': "plainText",
        'maxResults': 100,
    }

    while max_comments is None or len(comments) < max_comments:
        page = youtube.commentThreads().list(**request_args).execute()

        # A page without 'items' is treated as empty instead of raising KeyError
        comments.extend(
            item['snippet']['topLevelComment']['snippet']['textDisplay']
            for item in page.get('items', [])
        )

        next_token = page.get('nextPageToken')
        if not next_token:
            break
        request_args['pageToken'] = next_token

    # The last page may overshoot the requested bound
    return comments if max_comments is None else comments[:max_comments]

# Extract the video ID from a YouTube URL
def _extract_video_id(video_url):
    """Return the video ID contained in *video_url*.

    Handles watch URLs (``?v=<id>`` with or without further query
    parameters), ``youtu.be/<id>`` short links and ``/embed/<id>``,
    ``/shorts/<id>`` and ``/live/<id>`` paths.

    Raises:
        ValueError: If no video ID can be found in the URL.
    """
    from urllib.parse import parse_qs, urlparse

    url = video_url.strip()
    parsed = urlparse(url)

    # Standard watch URLs carry the ID in the "v" query parameter
    candidates = parse_qs(parsed.query).get('v', [])
    video_id = candidates[0] if candidates else ''

    if not video_id:
        segments = [part for part in parsed.path.split('/') if part]
        host = (parsed.hostname or '').lower()
        if host == 'youtu.be' or host.endswith('.youtu.be'):
            video_id = segments[0] if segments else ''
        elif len(segments) >= 2 and segments[0] in ('embed', 'shorts', 'live'):
            video_id = segments[1]

    if not video_id and 'v=' in url:
        # Fallback for strings that are not well-formed URLs but still
        # contain "v=<id>"; trailing parameters and fragments are cut off
        video_id = url.split('v=')[1].split('&')[0].split('#')[0]

    if not video_id:
        raise ValueError(f"Could not find a YouTube video ID in URL: {video_url!r}")
    return video_id


# Main entry point: analyze the comments of a video
def analyze_video_comments(video_url):
    """Classify every top-level comment of the video at *video_url*.

    Args:
        video_url: URL of the YouTube video to analyze.

    Returns:
        A ``pandas.DataFrame`` with one row per comment and the columns
        ``comment`` (the comment text) and ``prediction`` (the label string
        returned by ``predict_hate_speech``). The columns are present even
        when the video has no comments.

    Raises:
        ValueError: If no video ID can be extracted from *video_url*.
    """
    video_id = _extract_video_id(video_url)
    comments = get_video_comments(video_id)

    rows = [
        {'comment': comment, 'prediction': predict_hate_speech(comment)}
        for comment in comments
    ]

    # Explicit columns keep the schema stable for an empty result
    return pd.DataFrame(rows, columns=['comment', 'prediction'])

# Example usage
video_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"  # Replace with the URL of the video you want to analyze
results_df = analyze_video_comments(video_url)
print(results_df)

# Show statistics
total_comments = len(results_df)
# An empty result may carry no 'prediction' column, so only filter when there are rows
if total_comments:
    hate_comments = len(results_df[results_df['prediction'] == "Comentario odioso"])
else:
    hate_comments = 0
print(f"Total de comentarios analizados: {total_comments}")
print(f"Comentarios odiosos detectados: {hate_comments}")
if total_comments:
    print(f"Porcentaje de comentarios odiosos: {(hate_comments/total_comments)*100:.2f}%")
else:
    # Avoid ZeroDivisionError when the video has no comments
    print("Porcentaje de comentarios odiosos: N/A")

ImportError: cannot import name 'load_dotenv' from 'dotenv' (/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/dotenv/__init__.py)