In [None]:
# Clustering Notebook
# =====================================
# Importar librerías


import pandas as pd
import numpy as np
from pathlib import Path
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from sklearn.neighbors import NearestNeighbors
import scipy.cluster.hierarchy as sch






# =====================================



In [None]:
# Load the Kedro IPython extension into this kernel.
# NOTE(review): `catalog` is not defined anywhere in this notebook; presumably
# the extension injects it into the namespace — confirm against the Kedro docs.
%load_ext kedro.ipython

# Load the dataset registered under the name "data_final" from the catalog.
df = catalog.load("data_final")

In [None]:
# =====================================
# 2. Preprocesamiento
def preprocess_data(
    df: pd.DataFrame,
    n_rows=7000,
    columns=("mapname", "operator", "primaryweapon"),
):
    """One-hot encode and scale the categorical columns used for clustering.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; must contain every name listed in ``columns``.
    n_rows : int or None, default 7000
        Number of leading rows to keep (``df.head(n_rows)``). ``None`` keeps
        every row of ``df``.
    columns : sequence of str, default ("mapname", "operator", "primaryweapon")
        Columns passed to the one-hot encoder.

    Returns
    -------
    tuple
        ``(X_scaled_df, encoder, scaler)``: the scaled features as a dense
        ``pd.DataFrame`` (fresh default index and integer column labels), the
        fitted ``OneHotEncoder`` and the fitted ``StandardScaler``.

    Raises
    ------
    KeyError
        If any name in ``columns`` is absent from ``df``.
    """
    selected = list(columns)

    # Fail early with a readable message instead of a deep pandas traceback.
    missing = [name for name in selected if name not in df.columns]
    if missing:
        raise KeyError(f"Columns missing from input frame: {missing}")

    df_limits = df if n_rows is None else df.head(n_rows)
    X = df_limits[selected]

    # The encoder is asked for sparse output, so the scaler below must not
    # centre the data (with_mean=False).
    encoder = OneHotEncoder(sparse_output=True)
    X_encoded = encoder.fit_transform(X)

    scaler = StandardScaler(with_mean=False)
    X_scaled = scaler.fit_transform(X_encoded)

    # Densify only at the very end, when building the DataFrame handed back
    # to the caller.
    X_scaled_df = pd.DataFrame(X_scaled.toarray() if hasattr(X_scaled, "toarray") else X_scaled)
    return X_scaled_df, encoder, scaler

# Run preprocessing on the loaded frame; the fitted encoder and scaler are
# kept alongside the scaled feature frame.
X_scaled, encoder, scaler = preprocess_data(df)
# Last expression of the cell: displays the first rows of the scaled features.
X_scaled.head()

In [None]:
# =====================================
# 3. PCA (85% de varianza)
def apply_pca(X_scaled: pd.DataFrame, explained_var: float = 0.85):
    """Fit a PCA on the scaled features and project them.

    ``explained_var`` is forwarded to ``PCA`` as ``n_components``; per the
    scikit-learn documentation a float in (0, 1) is read as the target
    fraction of explained variance.

    Returns a ``(projection, fitted_pca)`` tuple, where ``projection`` is the
    result of ``fit_transform`` on ``X_scaled``.
    """
    reducer = PCA(n_components=explained_var)
    return reducer.fit_transform(X_scaled), reducer

# Fit PCA on the scaled features; keep both the projection and the fitted model.
X_pca, pca_model = apply_pca(X_scaled)


# Report the summed explained-variance ratio of the components that were kept.
print("Varianza explicada total:", sum(pca_model.explained_variance_ratio_))


In [None]:
# ==== CLUSTERING KMEANS ====

from sklearn.cluster import KMeans

def apply_kmeans(X_pca: np.ndarray, n_clusters=5, random_state=42):
    """Cluster the PCA projection with KMeans and return the labels.

    Parameters
    ----------
    X_pca : np.ndarray
        Feature matrix to cluster (the notebook passes the PCA projection).
    n_clusters : int, default 5
        Number of clusters, forwarded to ``KMeans``.
    random_state : int, default 42
        Seed forwarded to ``KMeans``; exposed so runs with other seeds can be
        compared without editing the function.

    Returns
    -------
    The label array produced by ``KMeans.fit_predict`` on ``X_pca``.
    """
    model = KMeans(n_clusters=n_clusters, random_state=random_state)
    return model.fit_predict(X_pca)

# Apply KMeans to the PCA projection with 5 clusters.
labels_kmeans = apply_kmeans(X_pca, n_clusters=5)

# Count the distinct labels that were actually assigned.
print("Número de clusters KMeans:", len(set(labels_kmeans)))


In [None]:
# ==== MÉTRICAS DE CLUSTERING SOLO KMEANS ====

from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

def compute_clustering_metrics_kmeans(X_pca, labels_kmeans):
    """Compute silhouette, Davies-Bouldin and Calinski-Harabasz scores.

    Parameters
    ----------
    X_pca : array-like
        Feature matrix the labels refer to, one row per sample.
    labels_kmeans : sequence
        Cluster label of each sample.

    Returns
    -------
    dict
        Keys ``"silhouette"``, ``"davies_bouldin"`` and
        ``"calinski_harabasz"``. Every value is ``None`` when the labelling
        is degenerate: fewer than two distinct labels, or more distinct
        labels than ``n_samples - 1``.
    """
    n_samples = len(labels_kmeans)
    n_labels = len(set(labels_kmeans))

    # scikit-learn only defines these scores for 2 <= n_labels <= n_samples - 1
    # and raises ValueError otherwise; report "no score" instead of crashing.
    if n_labels < 2 or n_labels > n_samples - 1:
        return dict.fromkeys(("silhouette", "davies_bouldin", "calinski_harabasz"))

    return {
        "silhouette": silhouette_score(X_pca, labels_kmeans),
        "davies_bouldin": davies_bouldin_score(X_pca, labels_kmeans),
        "calinski_harabasz": calinski_harabasz_score(X_pca, labels_kmeans)
    }

# Compute the KMeans metrics on the PCA projection.
metrics_kmeans = compute_clustering_metrics_kmeans(X_pca, labels_kmeans)

# Show them as a one-row DataFrame indexed by the algorithm name.
metrics_df = pd.DataFrame([metrics_kmeans], index=["KMeans"])
print(metrics_df)


In [None]:
# ==== VARIANZA INTRA-CLUSTER SOLO KMEANS ====

def cluster_variance_kmeans(X_pca, labels_kmeans):
    """Return a dict mapping each cluster label to its summed variance.

    For every distinct label, the rows of ``X_pca`` carrying that label are
    selected with a boolean mask, the variance is taken along axis 0 and the
    resulting values are added up.
    """
    return {
        # sum of the per-component variances of this cluster's points
        label: np.var(X_pca[labels_kmeans == label], axis=0).sum()
        for label in np.unique(labels_kmeans)
    }

# Compute the per-cluster variance for the KMeans labelling.
var_kmeans = cluster_variance_kmeans(X_pca, labels_kmeans)

# NOTE(review): this adds the per-cluster values as-is, with no weighting by
# cluster size — confirm that an unweighted total is what is intended.
print("Varianza intra-cluster KMeans:", sum(var_kmeans.values()))


In [None]:
# ==== 2D PCA VISUALISATION, KMEANS ONLY ====

# First two principal components, plus the KMeans label as a string so that
# plotly colours the clusters as discrete categories.
pca_df = pd.DataFrame(X_pca[:, :2], columns=["PC1", "PC2"])
pca_df["cluster"] = labels_kmeans.astype(str)

# Build the scatter plot, then set marker size and canvas dimensions.
fig = (
    px.scatter(
        pca_df,
        x="PC1",
        y="PC2",
        color="cluster",
        title="PCA 2D - Clustering KMeans",
        labels={"cluster": "Cluster"},
        opacity=0.6,
    )
    .update_traces(marker={"size": 6})
    .update_layout(width=800, height=600)
)

fig.show()
