In [7]:
import gzip
import requests
from io import BytesIO, StringIO

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
import mlflow
import mlflow.sklearn
import joblib

In [8]:
GITHUB_CLEAN_URL = "https://raw.githubusercontent.com/Bootcamp-IA-P4/Bootcamp-IA-P4-project-x-nlp-team-3/feature/eda/Data/comments_data_clean.csv"

def load_comments_data_from_github(url):
    """
    Downloading and processing comments data from GitHub repository.
    """
    print("🔗 Downloading data from GitHub...")

    try:
        response = requests.get(url)
        response.raise_for_status()

        print("📊 Reading CSV file...")

        df = pd.read_csv(StringIO(response.text))

        print("✅ Data downloaded successfully!")
        return df

    except Exception as e:
        print(f"❌ Error while downloading data: {e}")
        return None

# Creating dataframe from GitHub URL
df = load_comments_data_from_github(GITHUB_CLEAN_URL)

🔗 Downloading data from GitHub...
📊 Reading CSV file...
✅ Data downloaded successfully!


In [None]:
TEXT_COLUMN = 'Text'
TARGET_COLUMN = 'IsToxic'

X = df[TEXT_COLUMN]
y = df[TARGET_COLUMN]

# División de datos
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pipeline SVM
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('svc', SVC())
])

# Búsqueda de hiperparámetros
param_grid = {
    'svc__C': [0.1, 1, 10],
    'svc__kernel': ['linear', 'rbf']
}

grid = GridSearchCV(pipeline, param_grid, cv=3, scoring='f1_weighted')
grid.fit(X_train, y_train)

print(f"Mejores parámetros: {grid.best_params_}")
print(f"Mejor F1: {grid.best_score_:.4f}")

# Evaluación y logging con MLflow
with mlflow.start_run(run_name="SVM_Text_Classification"):
    best_model = grid.best_estimator_
    # Predicciones en test
    y_pred = best_model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    print(f"Accuracy: {acc:.4f} | F1: {f1:.4f} | Recall: {recall:.4f} | Precision: {precision:.4f}")

    # Predicciones en train para calcular overfitting
    y_pred_train = best_model.predict(X_train)
    f1_train = f1_score(y_train, y_pred_train, average='weighted')
    overfit_f1 = f1_train - f1
    print(f"F1 Entrenamiento: {f1_train:.4f} | F1 Prueba: {f1:.4f} | Overfitting F1: {overfit_f1:.4f}")

    mlflow.log_params(grid.best_params_)
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("f1_score", f1)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("train_f1_score", f1_train)
    mlflow.log_metric("overfit_f1", overfit_f1)

Mejores parámetros: {'svc__C': 150, 'svc__kernel': 'rbf'}
Mejor F1: 0.6549
Accuracy: 0.6700 | F1: 0.6663 | Recall: 0.6700 | Precision: 0.6898
