In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Función para graficar dispersión
def scatter_plots(df, centers=None, palette=None):
    """Draw two side-by-side scatter plots: predicted labels vs. true labels.

    Parameters
    ----------
    df : DataFrame
        Must provide the columns 'x', 'y', 'pred_label' and 'true_label'.
    centers : DataFrame, optional
        If given, its 'x'/'y' columns are overlaid on both panels as red
        "X" markers.
    palette : optional
        Forwarded unchanged to seaborn as the hue palette of both panels.
    """
    # Use the axes returned by plt.subplots directly instead of asking the
    # pyplot state machine for them again with plt.subplot(1, 2, k).
    fig, (ax_pred, ax_true) = plt.subplots(nrows=1, ncols=2, sharex=True, figsize=(20, 8))
    fig.subplots_adjust(hspace=0.5, wspace=0.2)

    # (axes, hue column, panel title) for each of the two panels.
    panels = (
        (ax_pred, 'pred_label', 'Etiquetas de predicción'),
        (ax_true, 'true_label', 'Etiquetas reales'),
    )
    for axis, hue_column, title in panels:
        sns.scatterplot(ax=axis, data=df, x='x', y='y', hue=hue_column, palette=palette)
        if centers is not None:
            sns.scatterplot(ax=axis, data=centers, x='x', y='y', color='r', marker="X")
        axis.set_title(title)

# Data generation: 400 synthetic 2-D points drawn around 3 centres.
X, y = make_blobs(n_samples=400, centers=3, cluster_std=2.7, random_state=42)

# Train/test split is done BEFORE scaling so that the held-out rows do not
# contribute to the scaler's mean/variance (avoids leaking test information
# into preprocessing).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=0
)

# Preprocessing: fit the scaler on the training rows only, then apply the same
# transformation to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Whole data set in the same scaled space; kept defined in case a later cell
# reads X_scaled.
X_scaled = scaler.transform(X)

# Model training
kmeans = KMeans(init="random", n_clusters=3, n_init=10, max_iter=300, random_state=42)
kmeans.fit(X_train)
print(f'Converge después de {kmeans.n_iter_} iteraciones')

# Visualisation of the training-set clustering and evaluation on the test set.
#
# KMeans numbers its clusters arbitrarily, so cluster id k does not have to
# correspond to make_blobs class k. Before comparing (or colouring) predictions
# against the true labels, map every cluster id to the true class that is most
# frequent among that cluster's training points.
# Note: this is a majority vote, so with a poor clustering two clusters could
# end up mapped to the same class.
cluster_to_class = pd.crosstab(kmeans.labels_, y_train).idxmax(axis=1).to_dict()

# Cluster centres and training points are converted back to the original
# (unscaled) coordinates for plotting.
cluster_centers = scaler.inverse_transform(kmeans.cluster_centers_)
df_centers = pd.DataFrame(cluster_centers, columns=['x', 'y'])

df_train = pd.DataFrame(scaler.inverse_transform(X_train), columns=['x', 'y'])
df_train['pred_label'] = pd.Series(kmeans.labels_).map(cluster_to_class)
df_train['true_label'] = y_train

palette = {1: "tab:cyan", 0: "tab:orange", 2: "tab:purple"}
scatter_plots(df_train, centers=df_centers, palette=palette)

# Model evaluation on the test set: predictions go through the same
# cluster -> class mapping, then both columns get readable class names.
class_names = {0: "A", 1: "B", 2: "C"}

df_test = pd.DataFrame(scaler.inverse_transform(X_test), columns=['x', 'y'])
df_test['true_label'] = pd.Series(y_test).map(class_names)
df_test['pred_label'] = (
    pd.Series(kmeans.predict(X_test)).map(cluster_to_class).map(class_names)
)

print(df_test.head(15))


Converge después de 7 iteraciones
           x          y true_label pred_label
0   5.537127  -0.047744          B          A
1  -6.501207   7.070707          A          C
2  -4.078269  -8.257154          C          B
3  -7.306249  -8.032688          C          B
4   1.382240   5.137458          A          A
5   7.497625  -2.775427          B          A
6  -1.614786  11.648258          A          C
7   2.419884   2.631125          B          A
8   2.883144   0.657931          B          A
9   7.206023   0.415530          B          A
10  4.330621   5.315274          B          A
11  2.750620   2.550915          B          A
12 -1.310885  11.105798          A          C
13 -0.276097  -6.531212          C          B
14 -1.798161  -3.247475          C          B
