In [2]:
import os
import re
import unicodedata

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report,
    confusion_matrix
)

# -------------------------------------------------------------------
# 1. Data loading
# -------------------------------------------------------------------
DATA_PATH = "Dataset (1).csv"

df = pd.read_csv(DATA_PATH)

print("Primeras filas del dataset:")
print(df.head())
print("\nInfo del dataset:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() emits a stray "None" line after the report.
df.info()

# Validate the expected columns with an explicit exception instead of
# `assert`, which is removed entirely when Python runs with -O.
for columna_requerida in ("phrase", "prompt"):
    if columna_requerida not in df.columns:
        raise ValueError(f"No se encontró la columna '{columna_requerida}'")

# -------------------------------------------------------------------
# 2. Basic cleanup of labels and raw text
# -------------------------------------------------------------------

# Normalise labels: trim surrounding whitespace, then remove any trailing run
# of punctuation *and* whitespace in a single pass. Removing only the
# punctuation would turn "Heart hurts ," into "Heart hurts " (trailing
# space), which would then count as a different class from "Heart hurts".
# NOTE(review): astype(str) turns a missing label into the literal string
# "nan" — confirm the input has no null labels before relying on this.
df["prompt"] = (
    df["prompt"]
    .astype(str)
    .str.strip()
    .str.replace(r"[\s,.;:]+$", "", regex=True)
)

# Keep a plain-string copy of the original phrase alongside the cleaned one.
df["text_raw"] = df["phrase"].astype(str)

# -------------------------------------------------------------------
# 3. Preprocesamiento de texto
# -------------------------------------------------------------------

def preprocess_text(text: str) -> str:
    """Normalise a phrase before vectorisation.

    The text is lowercased and composed to Unicode NFC, every character
    outside ``a-z``, ``0-9``, ``áéíóúüñ`` and whitespace is replaced by a
    space, and runs of whitespace are collapsed to a single space.

    Args:
        text: Raw phrase.

    Returns:
        The cleaned phrase with no leading or trailing whitespace; empty if
        nothing in the input survives the filter.
    """
    # Compose to NFC first: the whitelist below only contains precomposed
    # accented letters, so a decomposed "n" + U+0303 would otherwise lose its
    # combining mark to a space and split the word ("nin o" instead of "niño").
    text = unicodedata.normalize("NFC", text.lower())
    # Keep letters, digits and whitespace; everything else becomes a space so
    # that punctuation still separates the tokens it sat between.
    text = re.sub(r"[^a-z0-9áéíóúüñ\s]", " ", text)
    # Collapse whitespace runs and trim the ends.
    return re.sub(r"\s+", " ", text).strip()

# Build the cleaned-text column by running every raw phrase through
# preprocess_text, then show a few raw/clean pairs side by side.
texto_limpio = df["text_raw"].apply(preprocess_text)
df["text_clean"] = texto_limpio

print("\nEjemplos de texto limpio:")
columnas_muestra = ["text_raw", "text_clean"]
print(df[columnas_muestra].head())

# -------------------------------------------------------------------
# 4. Feature/target definition and train/test split
# -------------------------------------------------------------------

# Features are the cleaned phrases; targets are the cleaned labels.
X, y = df["text_clean"].values, df["prompt"].values

# Hold out 20% of the rows for testing, stratifying on y with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("\nTamaños de los conjuntos:")
print("Train:", len(X_train), " - Test:", len(X_test))

# -------------------------------------------------------------------
# 5. Cross-validation setup
# -------------------------------------------------------------------

# 10 shuffled, stratified folds with a fixed seed.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# -------------------------------------------------------------------
# 6. Model definitions and search grids
# -------------------------------------------------------------------

# Model 1: word-level TF-IDF (unigrams + bigrams) + Logistic Regression
pipeline_logreg = Pipeline([
    ("tfidf", TfidfVectorizer(
        ngram_range=(1, 2),
        max_features=20000
    )),
    ("clf", LogisticRegression(max_iter=1000))
])

param_grid_logreg = {
    "clf__C": [0.5, 1.0, 2.0],
    "clf__class_weight": [None, "balanced"]
}

# Model 2: word-level TF-IDF (unigrams + bigrams) + LinearSVC
# random_state is set because LinearSVC's default dual solver shuffles the
# data with a pseudo-random generator; without a seed the scores can change
# between runs even though the split and the CV folds are seeded.
pipeline_svc = Pipeline([
    ("tfidf", TfidfVectorizer(
        ngram_range=(1, 2),
        max_features=20000
    )),
    ("clf", LinearSVC(random_state=42))
])

param_grid_svc = {
    "clf__C": [0.5, 1.0, 2.0]
}

# Model 3: character-level TF-IDF (3- to 5-grams) + LinearSVC
# Seeded for the same reproducibility reason as model 2.
pipeline_char_svc = Pipeline([
    ("tfidf", TfidfVectorizer(
        analyzer="char",
        ngram_range=(3, 5),
        max_features=30000
    )),
    ("clf", LinearSVC(random_state=42))
])

param_grid_char_svc = {
    "clf__C": [0.5, 1.0, 2.0]
}

# Registry consumed by the training loop: name -> (pipeline, parameter grid).
modelos = {
    "LogReg_TFIDF_palabras": (pipeline_logreg, param_grid_logreg),
    "LinearSVC_TFIDF_palabras": (pipeline_svc, param_grid_svc),
    "LinearSVC_TFIDF_caracteres": (pipeline_char_svc, param_grid_char_svc),
}

# -------------------------------------------------------------------
# 7. Grid-search training and evaluation on the test set
# -------------------------------------------------------------------

# One summary row per model is collected here.
resultados = []

os.makedirs("results", exist_ok=True)

for nombre_modelo in modelos:
    pipeline, param_grid = modelos[nombre_modelo]

    # Banner separating the output of each model.
    print("\n" + "=" * 80)
    print(f"Modelo: {nombre_modelo}")
    print("=" * 80)

    # Search the grid with the shared CV splitter, selecting on macro F1.
    grid = GridSearchCV(
        pipeline,
        param_grid,
        scoring="f1_macro",
        cv=cv,
        n_jobs=-1,
        verbose=1,
    )
    grid.fit(X_train, y_train)

    print(f"\nMejores hiperparámetros para {nombre_modelo}:")
    print(grid.best_params_)
    print(f"F1 macro promedio en CV: {grid.best_score_:.4f}")

    # Score the best estimator found by the search on the held-out test set.
    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test)

    acc_test = accuracy_score(y_test, y_pred)
    f1_macro_test, f1_weighted_test = (
        f1_score(y_test, y_pred, average=promedio)
        for promedio in ("macro", "weighted")
    )

    print("\nResultados en conjunto de prueba:")
    print(f"Accuracy:      {acc_test:.4f}")
    print(f"F1 macro:      {f1_macro_test:.4f}")
    print(f"F1 weighted:   {f1_weighted_test:.4f}")

    print("\nClassification report:")
    print(classification_report(y_test, y_pred))

    # Only the shape of the confusion matrix is reported.
    cm = confusion_matrix(y_test, y_pred)
    print("Matriz de confusión (shape):", cm.shape)

    # best_params is stored as a string so it fits in a single CSV cell.
    resultados.append(dict(
        modelo=nombre_modelo,
        best_params=str(grid.best_params_),
        f1_macro_cv=grid.best_score_,
        accuracy_test=acc_test,
        f1_macro_test=f1_macro_test,
        f1_weighted_test=f1_weighted_test,
    ))

# -------------------------------------------------------------------
# 8. Saving the results
# -------------------------------------------------------------------

# Resolve the destination first, then tabulate the per-model rows and write
# them out without the index column.
resultados_path = os.path.join("results", "resultados_modelos.csv")
df_resultados = pd.DataFrame(data=resultados)
df_resultados.to_csv(resultados_path, index=False)

print("\nResumen de resultados guardado en:", resultados_path)
print("\nTabla resumen:")
print(df_resultados)


Primeras filas del dataset:
                                              phrase            prompt
0                    When I remember her I feel down    Emotional pain
1  When I carry heavy things I feel like breaking...  Hair falling out
2          there is too much pain when i move my arm       Heart hurts
3  My son had his lip pierced and it is swollen a...    Infected wound
4             My muscles in my lower back are aching    Infected wound

Info del dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6661 entries, 0 to 6660
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   phrase  6661 non-null   object
 1   prompt  6661 non-null   object
dtypes: object(2)
memory usage: 104.2+ KB
None

Ejemplos de texto limpio:
                                            text_raw  \
0                    When I remember her I feel down   
1  When I carry heavy things I feel like breaking...   
2          there is too much pain w