In [1]:
# Clustering Notebook
# =====================================
# Importar librerías


import pandas as pd
import numpy as np
from pathlib import Path
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.neighbors import NearestNeighbors
import scipy.cluster.hierarchy as sch






# =====================================



In [2]:
# Load the Kedro IPython extension.
# NOTE(review): the `catalog` object used below is presumably injected by this
# extension -- confirm the notebook is started inside the Kedro project.
%load_ext kedro.ipython

# Read the "data_final" dataset from the catalog into `df`.
df = catalog.load("data_final")

In [3]:
# =====================================
# 2. Preprocesamiento
def preprocess_data(
    df: pd.DataFrame,
    n_rows=7000,
    columns=('mapname', 'operator', 'primaryweapon'),
):
    """One-hot encode and scale the categorical features used for clustering.

    Parameters
    ----------
    df : pd.DataFrame
        Input data; must contain every name listed in ``columns``.
    n_rows : int or None, default 7000
        Number of leading rows to keep (``df.head(n_rows)``). Pass ``None``
        to use every row of ``df``.
    columns : sequence of str, default ('mapname', 'operator', 'primaryweapon')
        Columns that are one-hot encoded.

    Returns
    -------
    tuple
        ``(X_scaled_df, encoder, scaler)``: the dense scaled matrix as a
        DataFrame with default integer column labels, the fitted
        ``OneHotEncoder`` and the fitted ``StandardScaler``.
    """
    # Only the first `n_rows` rows are used (no shuffling or sampling).
    df_limits = df if n_rows is None else df.head(n_rows)
    X = df_limits[list(columns)]

    encoder = OneHotEncoder(sparse_output=True)
    X_encoded = encoder.fit_transform(X)

    # with_mean=False: scale without centring so the sparse matrix is accepted.
    scaler = StandardScaler(with_mean=False)
    X_scaled = scaler.fit_transform(X_encoded)

    # The encoder output is sparse (sparse_output=True); densify it so the
    # downstream steps receive a regular DataFrame.
    X_scaled_df = pd.DataFrame(X_scaled.toarray())
    return X_scaled_df, encoder, scaler

# Run the preprocessing step; the fitted encoder and scaler are kept alongside
# the scaled feature matrix.
X_scaled, encoder, scaler = preprocess_data(df)
# Preview the first rows of the scaled one-hot matrix.
X_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,36,37,38,39,40,41,42,43,44,45
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5.399012,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.991758,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.991758,0.0,...,0.0,0.0,0.0,0.0,4.268343,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# =====================================
# 3. PCA (85% de varianza)
def apply_pca(X_scaled: pd.DataFrame, explained_var: float = 0.85):
    """Project the scaled features with PCA.

    ``explained_var`` is passed to ``PCA`` as ``n_components``; the default of
    0.85 corresponds to the 85% variance target of this notebook section.

    Returns
    -------
    tuple
        ``(X_pca, pca)``: the transformed data and the fitted ``PCA`` object.
    """
    reducer = PCA(n_components=explained_var)
    projected = reducer.fit_transform(X_scaled)
    return projected, reducer

# Fit PCA on the scaled matrix; keep both the projection and the fitted model.
X_pca, pca_model = apply_pca(X_scaled)


# Report the total fraction of variance explained by the retained components.
print("Varianza explicada total:", sum(pca_model.explained_variance_ratio_))


Varianza explicada total: 0.865573158729392


In [31]:
# =====================================
# 4. Clustering

# --- 4a. DBSCAN
from sklearn.cluster import DBSCAN

def apply_dbscan(X_pca: pd.DataFrame, eps=0.1, min_samples=10):
    """Cluster the PCA projection with DBSCAN.

    Returns
    -------
    tuple
        ``(labels, n_clusters, n_noise)`` where ``labels`` is the per-sample
        assignment, ``n_clusters`` counts the distinct labels other than -1,
        and ``n_noise`` counts the samples labelled -1 (noise).
    """
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(X_pca)
    noise_mask = labels == -1
    # Noise is not a cluster: count it separately and leave it out of the tally.
    n_noise = int(noise_mask.sum())
    n_clusters = len(set(labels[~noise_mask]))
    return labels, n_clusters, n_noise

# Run DBSCAN on the PCA projection and report cluster and noise counts.
labels_dbscan, n_clusters_dbscan, n_noise_dbscan = apply_dbscan(X_pca, eps=0.1, min_samples=10)
print(f"Número de clusters DBSCAN: {n_clusters_dbscan}")
print(f"Número de puntos considerados ruido: {n_noise_dbscan}")

# --- 4b. KMeans
from sklearn.cluster import KMeans

def apply_kmeans(X_pca: pd.DataFrame, n_clusters=5):
    """Cluster the PCA projection with KMeans and return the per-sample labels.

    The estimator is seeded with ``random_state=42``.
    """
    kmeans = KMeans(random_state=42, n_clusters=n_clusters)
    return kmeans.fit_predict(X_pca)

# Run KMeans with 5 clusters and report how many distinct labels were produced.
labels_kmeans = apply_kmeans(X_pca, n_clusters=5)
print("Número de clusters KMeans:", len(set(labels_kmeans)))


# --- 4c. Jerárquico (Agglomerative)
from sklearn.cluster import AgglomerativeClustering

def apply_hierarchical(X_pca: pd.DataFrame, n_clusters=5, linkage="ward"):
    """Cluster the PCA projection with agglomerative (hierarchical) clustering.

    ``n_clusters`` and ``linkage`` are forwarded to ``AgglomerativeClustering``;
    the per-sample labels are returned.
    """
    agglomerative = AgglomerativeClustering(linkage=linkage, n_clusters=n_clusters)
    return agglomerative.fit_predict(X_pca)

# Run agglomerative clustering with 5 clusters (default "ward" linkage) and
# report how many distinct labels were produced.
labels_hierarchical = apply_hierarchical(X_pca, n_clusters=5)
print("Número de clusters Jerárquico:", len(set(labels_hierarchical)))


Número de clusters DBSCAN: 266
Número de puntos considerados ruido: 847
Número de clusters KMeans: 5
Número de clusters Jerárquico: 5



* DBSCAN identificó 266 clusters, con 847 puntos clasificados como ruido. Por su parte, KMeans y el clustering jerárquico generaron 5 clusters cada uno, sin que quedaran puntos sin asignar a ningún cluster.

In [26]:
# 5. Métricas de clustering
def compute_clustering_metrics(X_pca, labels, exclude_noise=False):
    """Compute internal validation scores for one clustering result.

    Parameters
    ----------
    X_pca : array-like
        Feature matrix the labels refer to (one row per sample).
    labels : array-like
        Cluster label of every sample.
    exclude_noise : bool, default False
        When True, samples labelled -1 (DBSCAN noise) are dropped before
        scoring. With the default, -1 is passed through unchanged and is
        therefore scored as if it were one more cluster.

    Returns
    -------
    dict
        Keys ``"silhouette"``, ``"davies_bouldin"`` and
        ``"calinski_harabasz"``. All values are ``None`` when the scores are
        undefined: fewer than two distinct labels, or as many distinct labels
        as samples.
    """
    undefined = {
        "silhouette": None,
        "davies_bouldin": None,
        "calinski_harabasz": None,
    }

    labels = np.asarray(labels)
    if exclude_noise:
        keep = labels != -1
        X_pca = np.asarray(X_pca)[keep]
        labels = labels[keep]

    n_labels = np.unique(labels).size
    # The three scores need 2 <= n_labels <= n_samples - 1; outside that range
    # scikit-learn raises, so report the metrics as undefined instead.
    if n_labels < 2 or n_labels >= labels.size:
        return undefined

    return {
        "silhouette": silhouette_score(X_pca, labels),
        "davies_bouldin": davies_bouldin_score(X_pca, labels),
        "calinski_harabasz": calinski_harabasz_score(X_pca, labels),
    }

# Score every model's labels against the same PCA projection.
labels_by_model = {
    "dbscan": labels_dbscan,
    "kmeans": labels_kmeans,
    "hierarchical": labels_hierarchical,
}
metrics = {
    model_name: compute_clustering_metrics(X_pca, model_labels)
    for model_name, model_labels in labels_by_model.items()
}

# Show the metrics, one row per model.
metrics_df = pd.DataFrame(metrics).T
print(metrics_df)


              silhouette  davies_bouldin  calinski_harabasz
dbscan          0.818087        1.158192         114.944931
kmeans          0.058275        2.762187         323.893653
hierarchical    0.121725        2.498883         459.366565


* La métrica de Silhouette muestra que DBSCAN (0.818) tiene clusters muy bien separados, mientras que KMeans (0.058) y Jerárquico (0.122) presentan una separación mucho menor. El índice Davies-Bouldin indica que los clusters de DBSCAN (1.158) son más compactos que los de KMeans (2.762) y Jerárquico (2.499). Por su parte, Calinski-Harabasz refleja que los clusters grandes de Jerárquico (459.37) están mejor diferenciados que los de KMeans (323.89) y DBSCAN (114.94). Nota: las métricas de DBSCAN se calcularon con todas las etiquetas, de modo que los 847 puntos de ruido (etiqueta -1) cuentan como un cluster más; la comparación con KMeans y Jerárquico debe interpretarse con esa salvedad.

In [27]:

def cluster_variance(X_pca, labels):
    """Return the total variance of each cluster.

    For every distinct label except -1 (DBSCAN noise), the variance of the
    cluster's points is taken per component and summed over components.

    Returns
    -------
    dict
        Maps cluster label to its summed per-component variance.
    """
    return {
        cluster_id: np.var(X_pca[labels == cluster_id], axis=0).sum()
        for cluster_id in np.unique(labels)
        if cluster_id != -1
    }

# Per-cluster variance for each model (DBSCAN noise is skipped by the helper).
var_dbscan = cluster_variance(X_pca, labels_dbscan)
var_kmeans = cluster_variance(X_pca, labels_kmeans)
var_hierarchical = cluster_variance(X_pca, labels_hierarchical)

# Print the sum of the per-cluster variances for every model.
for model_label, variances in (
    ("DBSCAN", var_dbscan),
    ("KMeans", var_kmeans),
    ("Jerárquico", var_hierarchical),
):
    print(f"Varianza intra-cluster {model_label}:", sum(variances.values()))

Varianza intra-cluster DBSCAN: 1.3257877603141252e-27
Varianza intra-cluster KMeans: 133.47178186838335
Varianza intra-cluster Jerárquico: 174.73099231588895




* La varianza intra-cluster de DBSCAN (suma de las varianzas de cada cluster, sin contar el ruido) es prácticamente cero (1.33e-27), lo que indica que los clusters son extremadamente densos. En KMeans, la varianza es 133.47, mostrando clusters moderadamente dispersos, mientras que en el clustering jerárquico la varianza es aún mayor (174.73), lo que refleja que los clusters son más amplios y dispersos.

In [28]:
# 6. 2D PCA visualisation for the three models
from plotly.subplots import make_subplots

# First two principal components plus the labels assigned by each model.
pca_df = pd.DataFrame(X_pca[:, :2], columns=["PC1", "PC2"]).assign(
    dbscan=labels_dbscan,
    kmeans=labels_kmeans,
    hierarchical=labels_hierarchical,
)

models = [("dbscan", "DBSCAN"), ("kmeans", "KMeans"), ("hierarchical", "Jerárquico")]
fig = make_subplots(rows=1, cols=3, subplot_titles=[title for _, title in models])

for i, (col_name, title) in enumerate(models, start=1):
    if col_name == "dbscan":
        # Name noise 'ruido' so it stands apart from the real clusters.
        cluster_names = ["ruido" if lab == -1 else f"c{lab}" for lab in pca_df[col_name]]
    else:
        cluster_names = pca_df[col_name].astype(str)
    df_plot = pca_df.assign(cluster=cluster_names)

    scatter = px.scatter(
        df_plot,
        x="PC1",
        y="PC2",
        color="cluster",
        opacity=0.55,
        title=None,
        labels={"cluster": "Cluster"},
    )

    # Legend entries are shown only for the first subplot to avoid duplicates.
    show_legend = i == 1
    for tr in scatter.data:
        tr.showlegend = show_legend
        tr.marker.update(size=5)
        fig.add_trace(tr, row=1, col=i)

fig.update_layout(
    height=500,
    width=1200,
    title_text="PCA 2D - Comparación de Modelos de Clustering",
    legend_title_text="Cluster",
    margin=dict(l=40, r=40, t=60, b=40)
)

# Same axis titles on every subplot.
for subplot_col in range(1, len(models) + 1):
    fig.update_xaxes(title_text="PC1", row=1, col=subplot_col)
    fig.update_yaxes(title_text="PC2", row=1, col=subplot_col)

fig.show()

## Comparación de modelos no supervisados

| Modelo         | Número de clusters | Puntos ruido | Silhouette | Davies-Bouldin | Calinski-Harabasz | Varianza intra-cluster | Observación                                                                              |
| -------------- | ------------------ | ------------ | ---------- | -------------- | ----------------- | ---------------------- | ---------------------------------------------------------------------------------------- |
| **DBSCAN**     | 266                | 847          | 0.818      | 1.158          | 114.94            | 1.33e-27               | Clusters muy densos y bien separados, pero muchos puntos quedan clasificados como ruido. |
| **KMeans**     | 5                  | 0            | 0.058      | 2.762          | 323.89            | 133.47                 | Clusters moderadamente dispersos, todos los puntos están asignados, separación baja.     |
| **Jerárquico** | 5                  | 0            | 0.122      | 2.499          | 459.37            | 174.73                 | Clusters más grandes y dispersos, buena diferenciación de grupos grandes.                |


## Conclusión

DBSCAN genera clusters muy compactos y bien separados, pero deja fuera una cantidad significativa de puntos como ruido. Esto refleja que los datos están dispersos y no todas las regiones cumplen la densidad mínima para formar clusters.

KMeans y clustering jerárquico agrupan todos los puntos, aunque presentan clusters más dispersos y menor separación entre ellos.

Cada modelo representa un enfoque diferente de agrupamiento:

- DBSCAN → densidad y pureza de clusters.

- KMeans → cobertura total del dataset y clusters moderadamente compactos.

- Jerárquico → clusters grandes y estructurados, útil para visualizar relaciones jerárquicas.

En conjunto, los resultados muestran que la elección del modelo depende del objetivo: si priorizas clusters densos y puros, DBSCAN es adecuado; si quieres cubrir todos los datos, KMeans o Jerárquico son más apropiados.