# Maestría en Ciencia de Datos e Inteligencia Artificial
#### 8. Machine Learning and Deep Learning
#### Docente: Msc. Renzo Claure Aracena.

### Kmeans

#### *Ejemplo artificial*

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples

In [None]:
# Generate example data: 500 samples around 4 centers with a fixed seed.
# The second return value (the generating labels) is discarded.
X, _ = make_blobs(n_samples=500, centers=4, cluster_std=1.0, random_state=42)

In [None]:
# Quick look at the raw points: first column on x, second column on y.
plt.figure()
plt.scatter(x=X[:,0], y=X[:,1], s=0.4)
plt.show()

In [None]:
# (min, max) of the first column of X.
X[:,0].min(), X[:,0].max()

In [None]:
# (min, max) of the second column of X.
X[:,1].min(), X[:,1].max()

In [None]:
# Fit K-Means with 4 clusters (the number of centers used to generate X).
# A fixed random_state makes the initialisation, and therefore the labels,
# centroids and plots derived from this model, reproducible between runs,
# matching the other KMeans calls in this notebook.
kmean = KMeans(n_clusters=4, random_state=42)
kmean.fit(X)

In [None]:
# Centroids of the fitted model.
kmean.cluster_centers_

In [None]:
# Plot the samples coloured by assigned cluster, with the centroids on top.
labels = kmean.labels_
fig, ax = plt.subplots(figsize=(10, 5))
ax.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', s=10, alpha=0.8)
ax.scatter(
    kmean.cluster_centers_[:, 0],
    kmean.cluster_centers_[:, 1],
    c='red',
    marker='X',
    s=200,
    label='Centroides',
)
ax.set_title('Clusters formados por K-Means')
ax.set_xlabel('Característica 1')
ax.set_ylabel('Característica 2')
ax.legend()
ax.grid()
plt.show()

In [None]:
# Effect of the number of clusters: elbow (scree) plot.
# For every candidate k, fit K-Means and record its inertia.
k_values = range(1, 11)  # candidate k from 1 to 10
wcss = []  # within-cluster sum of squares, one entry per k

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(X)
    wcss.append(kmeans.inertia_)  # inertia_ is the value plotted as WCSS

fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(k_values, wcss, marker='o', linestyle='--')
ax.set_title('Gráfico de Sedimentación (Elbow Method)')
ax.set_xlabel('Número de Clusters (k)')
ax.set_ylabel('WCSS (Within-Cluster-Sum-of-Squares)')
ax.set_xticks(k_values)
ax.grid()
plt.show()

In [None]:
# Cluster quality
# Silhouette analysis
from sklearn.metrics import silhouette_score, silhouette_samples

# Mean silhouette coefficient for the current `labels` on X.
silhouette_avg = silhouette_score(X, labels)
print(f"Coeficiente de silueta promedio: {silhouette_avg:.2f}")

# Per-sample silhouette values; nothing is plotted in this cell.
sample_silhouette_values = silhouette_samples(X, labels)


In [None]:
def plot_silhouette_analysis(X, labels, n_clusters=None):
    """
    Draw a silhouette plot to assess the quality of a clustering.

    Prints the mean silhouette coefficient, then draws one horizontal band
    per cluster made of that cluster's sorted per-sample silhouette values,
    plus a dashed vertical line at the mean.

    Parameters
    ----------
    X : array-like (n_samples, n_features)
        Data the clustering was computed on.

    labels : array-like (n_samples,)
        Cluster label assigned to each sample. Lists, NumPy arrays and
        pandas Series are accepted.

    n_clusters : int, optional
        When given, the bands for clusters ``0 .. n_clusters - 1`` are drawn.
        When omitted, one band is drawn for every distinct value found in
        ``labels``, which also covers non-contiguous or negative ids.
    """
    # Coerce to an array so `labels == cluster_id` is always an element-wise
    # mask (comparing a plain list with an int yields a single bool).
    labels = np.asarray(labels)
    cluster_ids = np.unique(labels) if n_clusters is None else range(n_clusters)

    silhouette_avg = silhouette_score(X, labels)
    print(f"Coeficiente de silueta promedio: {silhouette_avg:.2f}")

    sample_silhouette_values = silhouette_samples(X, labels)

    plt.figure(figsize=(10, 5))
    y_lower = 10  # bottom of the first band; bands are stacked upwards

    for cluster_id in cluster_ids:
        cluster_silhouette_values = np.sort(
            sample_silhouette_values[labels == cluster_id]
        )
        size_cluster_i = cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        plt.fill_betweenx(np.arange(y_lower, y_upper),
                          0, cluster_silhouette_values,
                          alpha=0.7)

        # Cluster id placed at the vertical middle of its band.
        plt.text(-0.05, y_lower + 0.5 * size_cluster_i, str(cluster_id))
        y_lower = y_upper + 10  # leave a 10-unit gap before the next band

    plt.title('Análisis de Siluetas')
    plt.xlabel('Coeficiente de Silueta')
    plt.ylabel('Cluster')
    plt.axvline(x=silhouette_avg, color="red", linestyle="--")
    plt.yticks([])
    plt.grid(True)
    plt.show()

In [None]:
# Silhouette plot for the 4-cluster K-Means model fitted above.
plot_silhouette_analysis(X, kmean.labels_, n_clusters=4)

#### *Ejemplo Real*

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the Iris dataset.
data = load_iris()

In [None]:
# Feature matrix and class targets; note that X is rebound here and no
# longer refers to the synthetic blobs.
X = data.data
y = data.target

In [None]:
# Class names, used later as legend labels.
target_name = data.target_names

In [None]:
# Standardise the features; the scaler is fitted and applied in one step.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Apply K-Means on the standardised features
kmeans_2 = KMeans(n_clusters=3, random_state=42)
kmeans_2.fit(X_scaled)
klabels = kmeans_2.labels_  # cluster label assigned to each sample

# Side-by-side figure: K-Means clusters (left) vs. original classes (right)
fig = plt.figure(figsize=(15, 5))

# Clusters found by K-Means, shown on the first two (scaled) features
ax0 = fig.add_subplot(1, 2, 1)
scatter0 = ax0.scatter(X_scaled[:, 0], X_scaled[:, 1], c=klabels, cmap='viridis', s=50, alpha=0.8)
centroids0 = ax0.scatter(kmeans_2.cluster_centers_[:, 0], kmeans_2.cluster_centers_[:, 1], c='red', marker='X', s=200, label='Centroides')
ax0.set_title('Clusters formados por K-Means')
ax0.set_xlabel(data.feature_names[0])
ax0.set_ylabel(data.feature_names[1])
# Build the legend from the per-cluster colour handles plus the centroid
# marker; passing only legend_elements() would drop the 'Centroides' entry.
cluster_handles, cluster_names = scatter0.legend_elements()
ax0.legend(cluster_handles + [centroids0], cluster_names + ['Centroides'], title="Clusters")
ax0.grid()

# Original classes on the same two features
ax1 = fig.add_subplot(1, 2, 2)
scatter1 = ax1.scatter(X_scaled[:, 0], X_scaled[:, 1], c=y, cmap='viridis', s=50, alpha=0.8)
ax1.set_title('Clases originales')
ax1.set_xlabel(data.feature_names[0])
ax1.set_ylabel(data.feature_names[1])

# Hand-made legend entries for the original classes: one marker per class,
# coloured at viridis positions 0, 0.5 and 1.
handles = [plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=plt.cm.viridis(i / 2), markersize=10) for i in range(3)]
ax1.legend(handles, target_name, title="Clases")
ax1.grid()

plt.show()

In [None]:
# Evaluate the clustering in the same standardised feature space that
# kmeans_2 was fitted on; using the unscaled X would measure distances in a
# different space than the one the labels were derived from.
plot_silhouette_analysis(X_scaled, klabels, n_clusters=3)

### DBSCAN

In [None]:
import matplotlib.pyplot as plt
from sklearn.datasets import make_circles
from sklearn.cluster import DBSCAN
import numpy as np
import pandas as pd

#### *Datos artificiales*

In [None]:
# Generate 500 points with make_circles (fixed seed); the generating labels
# are discarded. X is rebound again here.
X, _ = make_circles(n_samples=500, factor=.5, noise=.03, random_state=4)

# Scatter plot of the raw points
plt.scatter(X[:, 0], X[:, 1], marker='o')
plt.title("Circulos Concentricos")
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")
plt.show()

In [None]:
# Apply DBSCAN with eps=0.1 and min_samples=5; `clusters` holds the label
# returned for each row of X.
dbscan = DBSCAN(eps=0.1, min_samples=5)
clusters = dbscan.fit_predict(X)

In [None]:
# Scatter plot of the points coloured by their DBSCAN label
plt.scatter(X[:, 0], X[:, 1], c=clusters, cmap='viridis', marker='o')
plt.title("DBSCAN Clustering ")
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")
plt.show()

#### *Datos reales*

In [None]:
# Fetch the California housing dataset.
from sklearn.datasets import fetch_california_housing
california_housing = fetch_california_housing()

In [None]:
# Wrap the feature matrix in a DataFrame with the dataset's column names.
df = pd.DataFrame(california_housing.data, columns=california_housing.feature_names)

In [None]:
# Preview the first rows.
df.head()

In [None]:
# Add the dataset's target values as an extra column.
df['target'] = california_housing.target

In [None]:
# Scatter plot of the locations.
# NOTE(review): latitude is on the x-axis and longitude on the y-axis here,
# so the picture is transposed relative to the usual longitude-on-x map view.
plt.figure()
plt.scatter(x=df['Latitude'], y=df['Longitude'], s=0.5)
plt.show()

In [None]:
# Cluster the (Latitude, Longitude) pairs with DBSCAN and store the labels.
# No plot is produced in this cell.
X = df[['Latitude', 'Longitude']].to_numpy()
dbscan = DBSCAN()  # library defaults; an earlier variant used eps=0.1, min_samples=5
labels = dbscan.fit_predict(X)
df['labels'] = labels

In [None]:
def graf_dbscan(x, y, labels, xlabel="Latitude", ylabel="Longitude"):
    """
    Scatter-plot points coloured by their DBSCAN cluster label.

    A legend entry is created for every distinct label, using the same
    colour the scatter plot assigned to it. The label -1, which DBSCAN uses
    for noise points, is shown as noise instead of as a regular cluster.

    Parameters
    ----------
    x, y : array-like (n_samples,)
        Coordinates of the points on the horizontal and vertical axes.
    labels : array-like (n_samples,)
        Cluster label of each point.
    xlabel, ylabel : str, optional
        Axis labels. They default to "Latitude" and "Longitude", the values
        this function always used before the parameters were added.
    """
    plt.figure(figsize=(8, 6))

    # Points coloured by label; the returned collection carries the colormap
    # and normalisation reused for the legend markers below.
    scatter = plt.scatter(x, y, s=0.5, c=labels, cmap='viridis')

    # One legend marker per distinct label, coloured like its points.
    legend_handles = [
        plt.Line2D([0], [0], marker='o', color='w',
                   label='Ruido (-1)' if label == -1 else f'Cluster {label}',
                   markerfacecolor=scatter.cmap(scatter.norm(label)),
                   markersize=6)
        for label in np.unique(labels)
    ]

    plt.legend(handles=legend_handles, title="Clusters", loc='best')
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title("Clusters con leyenda de colores")
    plt.grid(True)
    plt.show()

In [None]:
# Plot the clusters from the first DBSCAN run (column 'labels').
graf_dbscan(df['Latitude'], df['Longitude'], df['labels'])

In [None]:
# Number of points per label; the author notes the quality is very poor.
df['labels'].value_counts()

In [None]:
# Mean silhouette coefficient of the first DBSCAN run, noise excluded.
from sklearn.metrics import silhouette_score, silhouette_samples
mask = df['labels'] != -1  # exclude noise (-1)
x_valid = df.loc[mask, ['Latitude', 'Longitude']]
labels_valid = df.loc[mask, 'labels']
# NOTE(review): confirm that at least two distinct labels remain after
# masking; silhouette_score is expected to reject a single-label input.
silhouette_avg = silhouette_score(x_valid, labels_valid)
print(f"Coeficiente de silueta promedio: {silhouette_avg:.2f}")

In [None]:
# Re-run DBSCAN with different hyperparameters on the same X and keep the
# result in a separate column so both runs can be compared.
dbscan = DBSCAN(eps=0.32, min_samples=18) 
labels = dbscan.fit_predict(X)
df['labels2'] = labels

In [None]:
# Number of points per label in the second DBSCAN run.
df['labels2'].value_counts()

In [None]:
# Mean silhouette coefficient of the second DBSCAN run, noise excluded.
mask = df['labels2'] != -1  # exclude noise (-1)
x_valid = df.loc[mask, ['Latitude', 'Longitude']]
labels_valid = df.loc[mask, 'labels2']
silhouette_avg = silhouette_score(x_valid, labels_valid)
print(f"Coeficiente de silueta promedio: {silhouette_avg:.2f}")

# Contingency table of the two DBSCAN runs: rows are the labels of the first
# run, columns the labels of the second. This replaces the incomplete
# `pd.cross`, which raises AttributeError.
pd.crosstab(df['labels'], df['labels2'])

In [None]:

# Plot the clusters obtained with the new hyperparameters (column 'labels2').
graf_dbscan(df['Latitude'], df['Longitude'], df['labels2'])
