In [1]:
import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib

# Read the labelled comments dataset from the working directory.
df = pd.read_csv('youtoxic_english_1000.csv')

# Model input is the `Text` column; the prediction target is `IsToxic`.
X, y = df['Text'], df['IsToxic']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Load spaCy's small English pipeline, used by process_text below.
nlp = spacy.load('en_core_web_sm')

# Text clean-up helper built on the spaCy pipeline
def process_text(text):
    """Lemmatize *text* and strip stop words and punctuation.

    Args:
        text: Raw text to clean. ``None`` and NaN (the value pandas uses
            for empty CSV cells) are treated as empty text; any other
            non-string value is converted with ``str`` before processing.

    Returns:
        str: The lower-cased lemmas of every token that is neither a stop
        word nor punctuation, joined by single spaces. Empty string for
        missing input.
    """
    # Missing values would otherwise be handed to spaCy, which expects text.
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    doc = nlp(str(text))
    return ' '.join(
        token.lemma_.lower()
        for token in doc
        if not (token.is_stop or token.is_punct)
    )

# Clean both splits with the same spaCy-based step (train first, then test).
X_train_processed, X_test_processed = (
    split.apply(process_text) for split in (X_train, X_test)
)

# Bag-of-words counts: the vocabulary is fitted on the training split only
# and then reused, unchanged, to encode the test split.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train_processed)
X_test_vectorized = vectorizer.transform(X_test_processed)

# Base classifiers, all seeded with the same value for reproducibility.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
lr_model = LogisticRegression(random_state=42)
# probability=True makes the SVM expose class probabilities, which the
# soft-voting ensemble below relies on.
svm_model = SVC(random_state=42, probability=True)

# Soft-voting ensemble over the three base classifiers.
ensemble_model = VotingClassifier(
    estimators=list(zip(('rf', 'lr', 'svm'), (rf_model, lr_model, svm_model))),
    voting='soft',
)

# Train the ensemble model on the vectorized training split
ensemble_model.fit(X_train_vectorized, y_train)

# Evaluate the model on the held-out test split: overall accuracy first,
# then the per-class classification report
y_pred = ensemble_model.predict(X_test_vectorized)
accuracy = accuracy_score(y_test, y_pred)
print(f"Precisión del modelo ensemble: {accuracy:.4f}")
print("\nInforme de clasificación:")
print(classification_report(y_test, y_pred))

# Save the ensemble model and the vectorizer; both are written so that the
# fitted vocabulary is stored alongside the trained model
joblib.dump(ensemble_model, 'ensemble_model.joblib')
joblib.dump(vectorizer, 'ensemble_vectorizer.joblib')

# Prediction helper
def predict_hate_speech(text):
    """Classify a single piece of text with the trained ensemble.

    The text goes through the same clean-up (``process_text``) and the same
    fitted ``vectorizer`` as the training data before being scored by
    ``ensemble_model``.

    Args:
        text: Raw text to classify.

    Returns:
        str: ``"Frase odiosa"`` when the predicted label is truthy,
        otherwise ``"Frase no odiosa"``.
    """
    # The vectorizer expects an iterable of documents, hence the 1-item list.
    features = vectorizer.transform([process_text(text)])
    label = ensemble_model.predict(features)[0]
    if label:
        return "Frase odiosa"
    return "Frase no odiosa"

# Example usage: classify one sample sentence and print the verdict
texto_ejemplo = "You are an idiot and I hate you"
resultado = predict_hate_speech(texto_ejemplo)
print(f"Predicción: {resultado}")

FileNotFoundError: [Errno 2] No such file or directory: 'datos_procesados.csv'