In [53]:
import pandas as pd
import numpy as np
import optuna
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

# Descargar recursos de NLTK
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

# Cargar los datos
df = pd.read_csv('datos_procesados.csv')



[nltk_data] Downloading package punkt to /Users/sunbay85/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sunbay85/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/sunbay85/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [54]:
# Función de preprocesamiento
def preprocess_text(text):
    # Convertir a minúsculas
    text = text.lower()

    # Eliminar caracteres especiales
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenización
    tokens = word_tokenize(text)

    # Eliminar stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    return ' '.join(tokens)

# Aplicar preprocesamiento
df['processed_text'] = df['Text'].apply(preprocess_text)

# Preparar características (X) y etiquetas (y)
X = df['processed_text']
y = df['IsToxic']

# Dividir en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [55]:

# Definir la función objetivo para Optuna
def objective(trial):
    # Definir el espacio de búsqueda para los hiperparámetros
    c = trial.suggest_loguniform('c', 1e-5, 1e2)
    tfidf_max_features = trial.suggest_int('tfidf_max_features', 1000, 10000)

    # Crear el pipeline con los hiperparámetros sugeridos
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=tfidf_max_features)),
        ('clf', LogisticRegression(C=c, random_state=42))
    ])

    # Realizar validación cruzada
    score = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='f1')

    # Retornar la media del f1-score
    return score.mean()

# Crear un estudio de Optuna
study = optuna.create_study(direction='maximize')

# Optimizar
study.optimize(objective, n_trials=100)

# Imprimir los mejores hiperparámetros y el mejor score
print('Mejores hiperparámetros:', study.best_params)
print('Mejor f1-score:', study.best_value)


[I 2024-11-13 11:44:44,388] A new study created in memory with name: no-name-53b631b2-3ad4-4387-9dc1-6ee1655caa91
  c = trial.suggest_loguniform('c', 1e-5, 1e2)
[I 2024-11-13 11:44:44,458] Trial 0 finished with value: 0.0 and parameters: {'c': 0.0032286529639440316, 'tfidf_max_features': 3395}. Best is trial 0 with value: 0.0.
  c = trial.suggest_loguniform('c', 1e-5, 1e2)
[I 2024-11-13 11:44:44,501] Trial 1 finished with value: 0.0 and parameters: {'c': 0.01284057552264191, 'tfidf_max_features': 9628}. Best is trial 0 with value: 0.0.
  c = trial.suggest_loguniform('c', 1e-5, 1e2)
[I 2024-11-13 11:44:44,545] Trial 2 finished with value: 0.0 and parameters: {'c': 1.0418217674862809e-05, 'tfidf_max_features': 7794}. Best is trial 0 with value: 0.0.
  c = trial.suggest_loguniform('c', 1e-5, 1e2)
[I 2024-11-13 11:44:44,597] Trial 3 finished with value: 0.616291426407799 and parameters: {'c': 93.47924057575557, 'tfidf_max_features': 7203}. Best is trial 3 with value: 0.616291426407799.
  c

Mejores hiperparámetros: {'c': 29.430283099091294, 'tfidf_max_features': 1011}
Mejor f1-score: 0.641847903976638


In [57]:

# Entrenar el modelo final con los mejores hiperparámetros
best_c = study.best_params['c']
best_max_features = study.best_params['tfidf_max_features']

final_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=best_max_features)),
    ('clf', LogisticRegression(C=best_c, random_state=42))
])

final_pipeline.fit(X_train, y_train)

# Evaluar el modelo final
y_pred = final_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

# Función para clasificar nuevos comentarios
def classify_comment(comment):
    processed = preprocess_text(comment)
    prediction = final_pipeline.predict([processed])
    return "Tóxico" if prediction[0] else "No tóxico"


              precision    recall  f1-score   support

       False       0.66      0.80      0.72        93
        True       0.78      0.64      0.71       107

    accuracy                           0.71       200
   macro avg       0.72      0.72      0.71       200
weighted avg       0.73      0.71      0.71       200

