In [None]:
# Clustering Notebook
# =====================================
# Importar librerías


import pandas as pd
import numpy as np
from pathlib import Path
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.neighbors import NearestNeighbors
import scipy.cluster.hierarchy as sch






# =====================================



In [None]:
# Load the Kedro IPython extension into this kernel.
%load_ext kedro.ipython

# `catalog` is not defined anywhere in this notebook; presumably it is injected
# by the Kedro extension loaded above -- TODO confirm.
# Loads the catalog entry named "data_final" into `df` for the cells below.
df = catalog.load("data_final")

In [None]:
# =====================================
# 2. Preprocesamiento
def preprocess_data(
    df: pd.DataFrame,
    n_rows=7000,
    columns=('mapname', 'operator', 'primaryweapon'),
):
    """One-hot encode and scale the categorical features used for clustering.

    Parameters
    ----------
    df : pd.DataFrame
        Input data; must contain every name listed in ``columns``.
    n_rows : int or None, default 7000
        Only the first ``n_rows`` rows of ``df`` are used. Pass ``None`` to
        use every row.
    columns : sequence of str, default ('mapname', 'operator', 'primaryweapon')
        Categorical columns to encode.

    Returns
    -------
    X_scaled_df : pd.DataFrame
        Dense, scaled one-hot matrix with a fresh integer index and integer
        column labels.
    encoder : OneHotEncoder
        The fitted encoder.
    scaler : StandardScaler
        The fitted scaler.

    Raises
    ------
    KeyError
        If any of ``columns`` is missing from ``df``.
    """
    df_limits = df if n_rows is None else df.head(n_rows)
    X = df_limits[list(columns)]

    # Keep the encoding sparse until the very end to limit memory use.
    encoder = OneHotEncoder(sparse_output=True)
    X_encoded = encoder.fit_transform(X)

    # with_mean=False: the data is only scaled, not centered, so the sparse
    # matrix does not have to be densified here.
    scaler = StandardScaler(with_mean=False)
    X_scaled = scaler.fit_transform(X_encoded)

    # Densify only if the scaler handed back a sparse matrix.
    dense = X_scaled.toarray() if hasattr(X_scaled, "toarray") else X_scaled
    X_scaled_df = pd.DataFrame(dense)
    return X_scaled_df, encoder, scaler

# Run the preprocessing step on the loaded data and preview the first rows of
# the scaled one-hot matrix.
X_scaled, encoder, scaler = preprocess_data(df)
X_scaled.head()

In [None]:
# =====================================
# 3. PCA (85% de varianza)
def apply_pca(X_scaled: pd.DataFrame, explained_var: float = 0.85):
    """Project the scaled features onto their principal components.

    Parameters
    ----------
    X_scaled : pd.DataFrame
        Scaled feature matrix.
    explained_var : float, default 0.85
        Passed to ``PCA`` as ``n_components``; as a fraction it asks PCA to
        keep enough components to reach that share of explained variance.

    Returns
    -------
    tuple
        ``(projected, reducer)``: the transformed data and the fitted PCA
        model.
    """
    reducer = PCA(n_components=explained_var)
    projected = reducer.fit_transform(X_scaled)
    return projected, reducer

# Reduce the scaled features with PCA using the function's default variance target.
X_pca, pca_model = apply_pca(X_scaled)


# Report the total explained variance actually retained by the kept components
# (the printed label is Spanish for "Total explained variance").
print("Varianza explicada total:", sum(pca_model.explained_variance_ratio_))


In [None]:
# ==== CLUSTERING SOLO DBSCAN ====

from sklearn.cluster import DBSCAN

def apply_dbscan(X_pca: np.ndarray, eps=0.1, min_samples=10):
    """Cluster the PCA-reduced data with DBSCAN.

    Parameters
    ----------
    X_pca : np.ndarray
        Points to cluster, one row per sample (this notebook passes the
        output of the PCA step).
    eps : float, default 0.1
        Forwarded to ``DBSCAN(eps=...)``.
    min_samples : int, default 10
        Forwarded to ``DBSCAN(min_samples=...)``.

    Returns
    -------
    labels : np.ndarray
        Cluster label per sample; DBSCAN marks noise with ``-1``.
    n_clusters : int
        Number of distinct clusters, noise excluded.
    n_noise : int
        Number of samples labelled as noise.
    """
    model = DBSCAN(eps=eps, min_samples=min_samples)
    labels = model.fit_predict(X_pca)

    # Count noise and real clusters with vectorized operations instead of
    # converting the whole label array to a Python list.
    noise_mask = labels == -1
    n_noise = int(np.count_nonzero(noise_mask))
    n_clusters = int(np.unique(labels[~noise_mask]).size)

    return labels, n_clusters, n_noise

# Apply DBSCAN to the PCA-reduced data
labels_dbscan, n_clusters_dbscan, n_noise_dbscan = apply_dbscan(X_pca, eps=0.1, min_samples=10)

# Report the number of clusters found and how many points were flagged as noise
# (the printed labels are in Spanish).
print(f"Número de clusters DBSCAN: {n_clusters_dbscan}")
print(f"Número de puntos considerados ruido: {n_noise_dbscan}")


In [None]:
# ==== MÉTRICAS DE CLUSTERING SOLO DBSCAN ====

from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

def compute_clustering_metrics_dbscan(X_pca, labels_dbscan):
    """Compute internal validation metrics for a DBSCAN labelling.

    Noise points (label ``-1``) are not a cluster, so they are removed before
    scoring; otherwise "noise" would be evaluated as if it were one more
    cluster and would distort every metric.

    Parameters
    ----------
    X_pca : array-like
        Points that were clustered, one row per sample.
    labels_dbscan : array-like of int
        DBSCAN label per sample, ``-1`` meaning noise.

    Returns
    -------
    dict
        Keys ``"silhouette"``, ``"davies_bouldin"`` and
        ``"calinski_harabasz"``. Every value is ``None`` when the metrics are
        undefined: fewer than two real clusters, or as many clusters as
        clustered points.
    """
    labels = np.asarray(labels_dbscan)
    clustered = labels != -1
    n_clustered = int(np.count_nonzero(clustered))
    n_clusters = int(np.unique(labels[clustered]).size)

    # The scores need at least 2 clusters and fewer clusters than points.
    if n_clusters < 2 or n_clusters >= n_clustered:
        return {
            "silhouette": None,
            "davies_bouldin": None,
            "calinski_harabasz": None
        }

    X_clustered = np.asarray(X_pca)[clustered]
    labels_clustered = labels[clustered]
    return {
        "silhouette": silhouette_score(X_clustered, labels_clustered),
        "davies_bouldin": davies_bouldin_score(X_clustered, labels_clustered),
        "calinski_harabasz": calinski_harabasz_score(X_clustered, labels_clustered)
    }

# Compute the metrics for DBSCAN
metrics_dbscan = compute_clustering_metrics_dbscan(X_pca, labels_dbscan)

# Show the metrics as a one-row table indexed by the algorithm name
metrics_df = pd.DataFrame([metrics_dbscan], index=["DBSCAN"])
print(metrics_df)


In [None]:
# ==== VARIANZA INTRA-CLUSTER SOLO DBSCAN ====

def cluster_variance_dbscan(X_pca, labels_dbscan):
    """Return the total variance of each DBSCAN cluster.

    For every label other than ``-1`` (noise), the per-dimension variance of
    the cluster's points is computed and summed over dimensions.

    Parameters
    ----------
    X_pca : np.ndarray
        Points, one row per sample.
    labels_dbscan : np.ndarray
        Label per sample; ``-1`` marks noise and is skipped.

    Returns
    -------
    dict
        Maps each cluster label to its summed variance, in ascending label
        order.
    """
    return {
        label: np.var(X_pca[labels_dbscan == label], axis=0).sum()
        for label in np.unique(labels_dbscan)
        if label != -1  # noise is ignored
    }

# Compute the per-cluster variance for DBSCAN only
var_dbscan = cluster_variance_dbscan(X_pca, labels_dbscan)

# Print the sum of the per-cluster variances (the printed label is Spanish for
# "DBSCAN intra-cluster variance").
print("Varianza intra-cluster DBSCAN:", sum(var_dbscan.values()))


In [None]:
# ==== VISUALIZACIÓN PCA 2D SOLO DBSCAN ====

# Build a DataFrame with the first two principal components and the DBSCAN labels.
# Requires that PCA kept at least two components; otherwise the two column
# names below do not match the data and pandas raises.
pca_df = pd.DataFrame(X_pca[:, :2], columns=["PC1", "PC2"])
# Noise points (label -1) are shown as "ruido"; real clusters as "c<label>".
pca_df["cluster"] = pd.Series(labels_dbscan).map(
    lambda x: "ruido" if x == -1 else f"c{x}"
)

# 2D PCA scatter plot, one colour per cluster
fig = px.scatter(
    pca_df,
    x="PC1",
    y="PC2",
    color="cluster",
    opacity=0.6,
    labels={"cluster": "Cluster"},
    title="PCA 2D - Clustering DBSCAN"
)

fig.update_traces(marker=dict(size=6))
fig.update_layout(height=600, width=800)

fig.show()
