# Modelo 2: Clustering de jugadoras por criterios

## Librerías necesarias para la ejecución

In [4]:
# Librerias de sistema
import os
from dotenv import load_dotenv
# Librerias de tratado de datos
import pandas as pd
import numpy as np
# Librerias para modelos
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
import mlflow
import mlflow.sklearn


### Carga y preprocesado de datos

In [5]:
def load_dataset(path: str, sep = ";") -> pd.DataFrame:
    return pd.read_csv(path, sep=sep)

In [6]:
def preprocess(df: pd.DataFrame, id_cols=('Nombre','Temporada','Equipo')):
    X = df.drop(columns=list(id_cols), errors="ignore")
    X = X.select_dtypes(include=[np.number])

    pipe = SimpleImputer(strategy="median")
    X = pipe.fit_transform(X)

    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    return X, df[list(id_cols)]

### Algoritmos de clustering

In [7]:
def kmeans_run(X, k, pca_n=None):
    X_fit = PCA(n_components=pca_n, random_state=42).fit_transform(X) if pca_n else X
    model = KMeans(n_clusters=k, n_init="auto", random_state=42)
    labels = model.fit_predict(X_fit)
    sil = silhouette_score(X_fit, labels)
    return model, labels, sil

In [8]:
def dbscan_run(X, eps, min_samples, pca_n=None):
    X_fit = PCA(n_components=pca_n, random_state=42).fit_transform(X) if pca_n else X
    model = DBSCAN(eps=eps, min_samples=min_samples)
    labels = model.fit_predict(X_fit)
    # silueta ignorando ruido (-1)
    valid = labels != -1
    sil = silhouette_score(X_fit[valid], labels[valid]) if valid.sum() >= 2 else -1
    return model, labels, sil

## MLFlow

In [None]:
def main(csv_path, tracking_uri, experiment):

    # MLflow
    mlflow.set_tracking_uri(tracking_uri)
    mlflow.set_experiment(experiment)

    # Datos
    df = load_dataset(csv_path, sep=";")
    X, meta = preprocess(df)

    results: dict[str, float] = {}

    # ----- K-Means -----
    for k in range(2, 11):
        with mlflow.start_run(run_name=f"KMeans_k={k}"):
            m, y, sil = kmeans_run(X, k)
            mlflow.log_params({"algorithm": "kmeans", "k": k})
            mlflow.log_metric("silhouette", sil)
            mlflow.sklearn.log_model(m, "model")
            results[f"kmeans_{k}"] = sil
            print(f"KMeans k={k}  →  silhouette={sil:.3f}")

    # ----- PCA + K-Means -----
    for k in range(2, 11):
        for n_comp in (2, 3, 5):
            with mlflow.start_run(run_name=f"PCA{k}_n{n_comp}"):
                m, y, sil = kmeans_run(X, k, pca_n=n_comp)
                mlflow.log_params({"algorithm": "pca+kmeans",
                                   "k": k,
                                   "pca_components": n_comp})
                mlflow.log_metric("silhouette", sil)
                mlflow.sklearn.log_model(m, "model")
                results[f"pca{k}_{n_comp}"] = sil
                print(f"PCA({n_comp}) + KMeans k={k}  →  silhouette={sil:.3f}")

    # ----- DBSCAN -----
    for eps in (0.5, 1.0, 1.5):
        for ms in (3, 5, 10):
            with mlflow.start_run(run_name=f"DBSCAN_eps{eps}_ms{ms}"):
                m, y, sil = dbscan_run(X, eps, ms, pca_n=3)
                mlflow.log_params({"algorithm": "dbscan",
                                   "eps": eps,
                                   "min_samples": ms})
                mlflow.log_metric("silhouette", sil)
                mlflow.sklearn.log_model(m, "model")
                results[f"dbscan_{eps}_{ms}"] = sil
                print(f"DBSCAN eps={eps} min_samples={ms}  →  silhouette={sil:.3f}")
                
    # Selección del mejor modelo 
    best_key = max(results, key=results.get)
    print(f"\nMejor experimento: {best_key}  |  silhouette = {results[best_key]:.3f}")

    # Recreamos el modelo ganador para etiquetar todas las jugadoras
    if best_key.startswith("kmeans_"):
        k = int(best_key.split("_")[1])
        best_model, labels, _ = kmeans_run(X, k)
    elif best_key.startswith("pca"):
        k = int(best_key[3:].split("_")[0])
        n_comp = int(best_key.split("_")[1])
        best_model, labels, _ = kmeans_run(X, k, pca_n=n_comp)
    else:  # DBSCAN
        _, eps, ms = best_key.split("_")
        best_model, labels, _ = dbscan_run(X, float(eps), int(ms), pca_n=3)

    # ---------- 5. Exportar resultados ----------
    resultado = meta.copy()
    resultado["cluster"] = labels
    salida = "jugadoras_clusterizadas.csv"
    resultado.to_csv(salida, index=False, sep=";")
    print(f"Etiquetas guardadas en {salida}")

In [10]:
if __name__ == "__main__":
    load_dotenv()
    main(csv_path=os.getenv("TRANSFORM_DATA_PATH"), tracking_uri="mlruns",experiment="Clustering_Jugadoras")

2025/06/15 18:26:30 INFO mlflow.tracking.fluent: Experiment with name 'Clustering_Jugadoras' does not exist. Creating a new experiment.


KMeans k=2  →  silhouette=0.315




KMeans k=3  →  silhouette=0.254




KMeans k=4  →  silhouette=0.220




KMeans k=5  →  silhouette=0.195




KMeans k=6  →  silhouette=0.197




KMeans k=7  →  silhouette=0.189




KMeans k=8  →  silhouette=0.176




KMeans k=9  →  silhouette=0.173




KMeans k=10  →  silhouette=0.176




PCA(2) + KMeans k=2  →  silhouette=0.471




PCA(3) + KMeans k=2  →  silhouette=0.435




PCA(5) + KMeans k=2  →  silhouette=0.405




PCA(2) + KMeans k=3  →  silhouette=0.440




PCA(3) + KMeans k=3  →  silhouette=0.390




PCA(5) + KMeans k=3  →  silhouette=0.353




PCA(2) + KMeans k=4  →  silhouette=0.414




PCA(3) + KMeans k=4  →  silhouette=0.373




PCA(5) + KMeans k=4  →  silhouette=0.333




PCA(2) + KMeans k=5  →  silhouette=0.414




PCA(3) + KMeans k=5  →  silhouette=0.375




PCA(5) + KMeans k=5  →  silhouette=0.333




PCA(2) + KMeans k=6  →  silhouette=0.402




PCA(3) + KMeans k=6  →  silhouette=0.368




PCA(5) + KMeans k=6  →  silhouette=0.324




PCA(2) + KMeans k=7  →  silhouette=0.388




PCA(3) + KMeans k=7  →  silhouette=0.347




PCA(5) + KMeans k=7  →  silhouette=0.326




PCA(2) + KMeans k=8  →  silhouette=0.387




PCA(3) + KMeans k=8  →  silhouette=0.335




PCA(5) + KMeans k=8  →  silhouette=0.283




PCA(2) + KMeans k=9  →  silhouette=0.379




PCA(3) + KMeans k=9  →  silhouette=0.338




PCA(5) + KMeans k=9  →  silhouette=0.277




PCA(2) + KMeans k=10  →  silhouette=0.382




PCA(3) + KMeans k=10  →  silhouette=0.332




PCA(5) + KMeans k=10  →  silhouette=0.283




DBSCAN eps=0.5 min_samples=3  →  silhouette=-0.464




DBSCAN eps=0.5 min_samples=5  →  silhouette=0.140




DBSCAN eps=0.5 min_samples=10  →  silhouette=0.199




DBSCAN eps=1.0 min_samples=3  →  silhouette=0.382


ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive)