In [None]:
# Clustering Notebook
# =====================================
# Importar librerías


import pandas as pd
import numpy as np
from pathlib import Path
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.neighbors import NearestNeighbors
import scipy.cluster.hierarchy as sch


In [None]:
# Load Kedro's IPython extension into this kernel.
# NOTE(review): `catalog` is not defined in any cell visible here; presumably
# the extension injects it into the notebook namespace — confirm.
%load_ext kedro.ipython

# Read the dataset registered under the name "data_final" into `df`.
df = catalog.load("data_final")

In [None]:
# =====================================
# 2. Preprocesamiento
def preprocess_data(df: pd.DataFrame, n_rows=7000,
                    columns=("mapname", "operator", "primaryweapon")):
    """One-hot encode and scale the categorical columns used for clustering.

    Parameters
    ----------
    df : pd.DataFrame
        Input data; must contain every name listed in ``columns``.
    n_rows : int or None, default 7000
        Only the first ``n_rows`` rows of ``df`` are used. Pass ``None`` to
        use every row.
    columns : sequence of str, default ("mapname", "operator", "primaryweapon")
        Names of the columns to one-hot encode.

    Returns
    -------
    X_scaled_df : pd.DataFrame
        Dense matrix of the scaled one-hot features, with default integer
        row and column labels.
    encoder : OneHotEncoder
        The encoder fitted on the selected columns.
    scaler : StandardScaler
        The scaler fitted on the encoded matrix.

    Raises
    ------
    KeyError
        If any name in ``columns`` is missing from ``df``.
    """
    df_limits = df if n_rows is None else df.head(n_rows)
    X = df_limits[list(columns)]

    encoder = OneHotEncoder(sparse_output=True)
    X_encoded = encoder.fit_transform(X)

    # with_mean=False: StandardScaler cannot center sparse input, so the
    # features are only divided by their standard deviation.
    scaler = StandardScaler(with_mean=False)
    X_scaled = scaler.fit_transform(X_encoded)

    # Densify before wrapping in a DataFrame; the hasattr check keeps this
    # working if the scaler ever hands back a dense array.
    X_scaled_df = pd.DataFrame(X_scaled.toarray() if hasattr(X_scaled, "toarray") else X_scaled)
    return X_scaled_df, encoder, scaler

# Run the preprocessing on the loaded data, keeping the fitted encoder and scaler.
X_scaled, encoder, scaler = preprocess_data(df)
# Preview the first rows of the scaled feature matrix (last expression of the cell).
X_scaled.head()

In [None]:
# --- 4c. Jerárquico (Agglomerative)
from sklearn.cluster import AgglomerativeClustering

def apply_hierarchical(X_pca: pd.DataFrame, n_clusters=5, linkage="ward"):
    """Assign each sample to a cluster with agglomerative clustering.

    Parameters
    ----------
    X_pca : pd.DataFrame
        Feature matrix handed directly to ``fit_predict``.
    n_clusters : int, default 5
        Number of clusters to form.
    linkage : str, default "ward"
        Linkage criterion forwarded to ``AgglomerativeClustering``.

    Returns
    -------
    The cluster label of every sample, as returned by ``fit_predict``.
    """
    clusterer = AgglomerativeClustering(linkage=linkage, n_clusters=n_clusters)
    return clusterer.fit_predict(X_pca)

# NOTE(review): `X_pca` is not defined in any cell visible here — presumably it
# is produced by a PCA step on `X_scaled` that must run before this cell; confirm.
labels_hierarchical = apply_hierarchical(X_pca, n_clusters=5)
# Report how many distinct cluster labels were assigned.
print("Número de clusters Jerárquico:", len(set(labels_hierarchical)))

In [None]:
# Compute internal validation metrics for a clustering result
def compute_clustering_metrics_hierarchical(X_pca, labels_hierarchical):
    """Score a clustering with silhouette, Davies-Bouldin and Calinski-Harabasz.

    Parameters
    ----------
    X_pca : array-like
        Feature matrix the labels were computed on, one row per sample.
    labels_hierarchical : array-like
        Cluster label of each sample, in the same order as the rows of
        ``X_pca``.

    Returns
    -------
    dict
        Keys ``"silhouette"``, ``"davies_bouldin"`` and
        ``"calinski_harabasz"``. All three values are ``None`` when the
        scores are undefined: fewer than two distinct labels, or as many
        distinct labels as samples (which includes empty input).
    """
    n_labels = len(set(labels_hierarchical))
    n_samples = len(labels_hierarchical)

    # scikit-learn only accepts 2 <= n_labels <= n_samples - 1 for these
    # scores and raises ValueError otherwise, so report "no score" instead.
    if n_labels < 2 or n_labels >= n_samples:
        return {
            "silhouette": None,
            "davies_bouldin": None,
            "calinski_harabasz": None
        }
    return {
        "silhouette": silhouette_score(X_pca, labels_hierarchical),
        "davies_bouldin": davies_bouldin_score(X_pca, labels_hierarchical),
        "calinski_harabasz": calinski_harabasz_score(X_pca, labels_hierarchical)
    }

# Compute the metrics for the hierarchical model
metrics_hierarchical = compute_clustering_metrics_hierarchical(X_pca, labels_hierarchical)

# Wrap the metrics in a one-row DataFrame (row label "Jerárquico") for display
metrics_df = pd.DataFrame([metrics_hierarchical], index=["Jerárquico"])
print(metrics_df)


In [None]:
# Compute the intra-cluster variance of each cluster
def cluster_variance_hierarchical(X_pca, labels_hierarchical):
    """Return the total variance of the points inside each cluster.

    For every distinct label, the rows of ``X_pca`` carrying that label are
    selected, ``np.var`` is taken along axis 0 (one variance per component),
    and those variances are summed.

    Parameters
    ----------
    X_pca : array-like
        Feature matrix, one row per sample; must support boolean-mask row
        selection.
    labels_hierarchical : array-like
        Cluster label of each sample.

    Returns
    -------
    dict
        Maps each distinct label (in sorted order) to its summed variance.
    """
    return {
        cluster_id: np.var(X_pca[labels_hierarchical == cluster_id], axis=0).sum()
        for cluster_id in np.unique(labels_hierarchical)
    }

# Compute the per-cluster variance for the hierarchical model
var_hierarchical = cluster_variance_hierarchical(X_pca, labels_hierarchical)
# The printed figure is the sum of the per-cluster variances.
print("Varianza intra-cluster Jerárquico:", sum(var_hierarchical.values()))


In [None]:


# Build a DataFrame with the first two components (PC1, PC2).
# np.asarray makes the positional slice work whether X_pca is a NumPy array
# or a DataFrame (a DataFrame cannot be indexed with `[:, :2]`).
pca_df = pd.DataFrame(np.asarray(X_pca)[:, :2], columns=["PC1", "PC2"])
pca_df["hierarchical"] = labels_hierarchical

# Cast the labels to strings for Plotly, so clusters get discrete colors
pca_df["cluster"] = pca_df["hierarchical"].astype(str)

# Scatter plot of the samples in the PC1/PC2 plane, colored by cluster
fig = px.scatter(
    pca_df,
    x="PC1",
    y="PC2",
    color="cluster",
    opacity=0.6,
    labels={"cluster": "Cluster"},
    title="PCA 2D - Clustering Jerárquico"
)

fig.update_traces(marker=dict(size=6))
fig.update_layout(height=600, width=800)

fig.show()
