# 📊 Clustering sobre métricas agregadas de repositorios (Gold Layer)

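Este notebook evalúa tres algoritmos de clustering (KMeans, DBSCAN y clustering aglomerativo) sobre las métricas agregadas de repositorios ya procesadas en la capa Gold. Las métricas se estandarizan, pero no se aplica reducción de dimensionalidad, y las configuraciones se comparan mediante los índices de silueta, Calinski-Harabasz y Davies-Bouldin.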

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

import warnings
# Silence every warning category, process-wide, from this point on.
# NOTE(review): a blanket "ignore" would also hide any convergence or
# deprecation warnings emitted while the models are fitted — consider
# narrowing it to the specific categories that are known to be noise.
warnings.filterwarnings("ignore")


In [None]:

# Path of the parquet file generated by gold.py
gold_path = 'data/gold/repo_metrics/repo_activity_metrics.parquet'

try:
    df = pd.read_parquet(gold_path)
except Exception as e:
    # Report the problem in the notebook output, then re-raise: without `df`
    # every later cell would fail with a misleading NameError, or silently
    # reuse a stale `df` left in the kernel by an earlier run.
    print(f"❌ Error al cargar el archivo: {e}")
    raise
else:
    # Only reached when the read succeeded, so `df` is guaranteed to exist.
    print(f"✔ Datos cargados con éxito: {df.shape}")


In [None]:

# Keep only the numeric columns and leave out the `repo_id` identifier
# (errors='ignore' makes the drop a no-op when that column is absent).
df_numeric = (
    df.select_dtypes(include=[np.number])
      .drop(columns=['repo_id'], errors='ignore')
)


In [None]:

def evaluate_clustering(X, labels):
    """Score a clustering with three internal validity indices.

    Parameters
    ----------
    X : array-like
        Feature matrix that was clustered; passed unchanged to the
        scikit-learn metric functions.
    labels : sequence
        One cluster label per row of ``X``. The label ``-1`` is treated as a
        noise marker.

    Returns
    -------
    dict
        Keys ``'silhouette'``, ``'calinski'`` and ``'davies'``. When the
        labelling cannot be scored, the sentinel values ``-1``, ``-1`` and
        ``np.inf`` are returned instead, so such rows rank last when sorted
        by silhouette in descending order.

    Notes
    -----
    A labelling is considered not scorable when any of these holds:

    * it has fewer than two distinct labels;
    * it contains the noise label ``-1``;
    * every sample has its own label, because ``silhouette_score`` requires
      ``2 <= n_labels <= n_samples - 1`` and raises ``ValueError`` otherwise.
    """
    distinct_labels = set(labels)
    n_samples = len(labels)

    # The upper bound guards against the "one cluster per sample" case.
    scorable = 1 < len(distinct_labels) < n_samples and -1 not in distinct_labels

    if not scorable:
        return {'silhouette': -1, 'calinski': -1, 'davies': np.inf}

    return {
        'silhouette': silhouette_score(X, labels),
        'calinski': calinski_harabasz_score(X, labels),
        'davies': davies_bouldin_score(X, labels),
    }


In [None]:

# Learn the per-column scaling statistics from df_numeric, then apply them to
# that same frame; the fitted `scaler` stays available for later cells.
scaler = StandardScaler()
X_scaled = scaler.fit(df_numeric).transform(df_numeric)


In [None]:

# One record per (algorithm, hyper-parameter setting) together with its scores.
results = []

# KMeans: sweep the number of clusters from 2 to 10 with a fixed seed.
for k in range(2, 11):
    model = KMeans(random_state=42, n_clusters=k)
    labels = model.fit_predict(X_scaled)
    scores = evaluate_clustering(X_scaled, labels)
    record = {'model': 'KMeans', 'params': {'k': k}}
    record.update(scores)
    results.append(record)

# DBSCAN: full grid over eps and min_samples, eps varying slowest.
dbscan_grid = [
    (eps, min_samples)
    for eps in (0.5, 1.0, 1.5)
    for min_samples in (3, 5, 10)
]
for eps, min_samples in dbscan_grid:
    model = DBSCAN(min_samples=min_samples, eps=eps)
    labels = model.fit_predict(X_scaled)
    scores = evaluate_clustering(X_scaled, labels)
    record = {'model': 'DBSCAN', 'params': {'eps': eps, 'min_samples': min_samples}}
    record.update(scores)
    results.append(record)

# Agglomerative clustering: same 2..10 sweep over the number of clusters.
for k in range(2, 11):
    model = AgglomerativeClustering(n_clusters=k)
    labels = model.fit_predict(X_scaled)
    scores = evaluate_clustering(X_scaled, labels)
    record = {'model': 'Agglomerative', 'params': {'k': k}}
    record.update(scores)
    results.append(record)


In [None]:

# Rank every evaluated configuration by silhouette score, best first,
# and display the ten highest-ranked rows.
results_df = pd.DataFrame(results).sort_values(by='silhouette', ascending=False)
results_df.head(10)
