In [1]:
# Clustering Notebook
# =====================================
# Importar librerías


import pandas as pd
import numpy as np
from pathlib import Path
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.neighbors import NearestNeighbors
import scipy.cluster.hierarchy as sch







# =====================================



In [2]:
%load_ext kedro.ipython

df = catalog.load("data_final")

In [3]:
# =====================================
# 2. Preprocesamiento
def preprocess_data(df: pd.DataFrame):
    df_limits = df.head(7000)
    X = df_limits[['mapname', 'operator', 'primaryweapon']]

    encoder = OneHotEncoder(sparse_output=True)
    X_encoded = encoder.fit_transform(X)

    scaler = StandardScaler(with_mean=False)
    X_scaled = scaler.fit_transform(X_encoded)

    X_scaled_df = pd.DataFrame(X_scaled.toarray() if hasattr(X_scaled, "toarray") else X_scaled)
    return X_scaled_df, encoder, scaler

X_scaled, encoder, scaler = preprocess_data(df)
X_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,36,37,38,39,40,41,42,43,44,45
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5.399012,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.991758,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.991758,0.0,...,0.0,0.0,0.0,0.0,4.268343,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# =====================================
# 3. PCA (85% de varianza)
def apply_pca(X_scaled: pd.DataFrame, explained_var: float = 0.85):
    pca = PCA(n_components=explained_var)
    X_pca = pca.fit_transform(X_scaled)
    return X_pca, pca

X_pca, pca_model = apply_pca(X_scaled)


print("Varianza explicada total:", sum(pca_model.explained_variance_ratio_))


Varianza explicada total: 0.865573158729392


In [5]:
# ==== CLUSTERING SOLO DBSCAN ====



def apply_dbscan(X_pca: pd.DataFrame, eps=0.1, min_samples=10):
    model = DBSCAN(eps=eps, min_samples=min_samples)
    labels = model.fit_predict(X_pca)

    # Número de clusters (excluyendo ruido)
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise = list(labels).count(-1)

    return labels, n_clusters, n_noise

# Aplicar DBSCAN
labels_dbscan, n_clusters_dbscan, n_noise_dbscan = apply_dbscan(X_pca, eps=0.1, min_samples=10)

print(f"Número de clusters DBSCAN: {n_clusters_dbscan}")
print(f"Número de puntos considerados ruido: {n_noise_dbscan}")


Número de clusters DBSCAN: 266
Número de puntos considerados ruido: 847


In [6]:
# ==== MÉTRICAS DE CLUSTERING SOLO DBSCAN ====


def compute_clustering_metrics_dbscan(X_pca, labels_dbscan):
    # Si DBSCAN encuentra 0 o 1 clusters (solo ruido), no se pueden calcular métricas
    if len(set(labels_dbscan)) <= 1:
        return {
            "silhouette": None,
            "davies_bouldin": None,
            "calinski_harabasz": None
        }
    return {
        "silhouette": silhouette_score(X_pca, labels_dbscan),
        "davies_bouldin": davies_bouldin_score(X_pca, labels_dbscan),
        "calinski_harabasz": calinski_harabasz_score(X_pca, labels_dbscan)
    }

# Calcular métricas para DBSCAN
metrics_dbscan = compute_clustering_metrics_dbscan(X_pca, labels_dbscan)

# Mostrar métricas en una tabla
metrics_df = pd.DataFrame([metrics_dbscan], index=["DBSCAN"])
print(metrics_df)


        silhouette  davies_bouldin  calinski_harabasz
DBSCAN    0.818087        1.158192         114.944931


In [7]:
# ==== VARIANZA INTRA-CLUSTER SOLO DBSCAN ====

def cluster_variance_dbscan(X_pca, labels_dbscan):
    clusters = np.unique(labels_dbscan)
    var_per_cluster = {}

    for c in clusters:
        if c == -1:  # ignoramos ruido
            continue
        points = X_pca[labels_dbscan == c]
        var_per_cluster[c] = np.var(points, axis=0).sum()

    return var_per_cluster

# Calcular varianza solo DBSCAN
var_dbscan = cluster_variance_dbscan(X_pca, labels_dbscan)

print("Varianza intra-cluster DBSCAN:", sum(var_dbscan.values()))


Varianza intra-cluster DBSCAN: 1.3257877603141252e-27


In [8]:


pca_df = pd.DataFrame(X_pca[:, :2], columns=["PC1", "PC2"])
pca_df["dbscan"] = labels_dbscan
pca_df["cluster"] = pca_df["dbscan"].astype(str)

fig = px.scatter(
    pca_df,
    x="PC1",
    y="PC2",
    color="cluster",
    opacity=0.6,
    labels={"cluster": "Cluster"},
    title="PCA 2D - Clusters DBSCAN"
)
fig.update_traces(marker=dict(size=6))
fig.update_layout(height=600, width=800)
fig.show()


In [9]:
# ANOMALÍAS PARA DBSCAN (ruido = -1)
anom_dbscan = (labels_dbscan == -1)


In [13]:
# === BARRA DE ANOMALÍAS PARA DBSCAN ===
fig.add_trace(
    go.Scatter(
        x=np.arange(len(pca_df)),
        y=[-0.25] * len(pca_df),
        mode="markers",
        marker=dict(
            size=6,
            color=["red" if a else "lightgray" for a in anom_dbscan],
            symbol="square"
        ),
        showlegend=False,
        hovertemplate="Punto %{x}<br>Anomalía (DBSCAN): %{marker.color}"
    ),
    row=1, col=1
)
