### K-medias

In [1]:
import numpy as np
from matplotlib import pyplot as plt
plt.rcParams['figure.figsize'] = (8,4)
plt.style.use('ggplot')

In [2]:
# Toy data set: ten two-dimensional observations, one per row.
data = np.array([
    [8, 10],
    [3, 10.5],
    [7, 13.5],
    [5, 18],
    [5, 13],
    [6, 9],
    [9, 11],
    [3, 18],
    [8.5, 12],
    [8, 16],
])
# The initial centroids are the first two observations. They are copied (not
# viewed) so that updating a centroid later never modifies the data.
C = data[:2].copy()

In [3]:
# Euclidean distance between the two centroids, written out coordinate by coordinate.
np.sqrt((C[0,0] - C[1,0])**2 + (C[0,1] - C[1,1])**2)

In [4]:
# The same centroid-to-centroid distance, obtained as the vector norm of the difference.
np.linalg.norm(C[0] - C[1])

In [5]:
# Plot: centroids as large black stars, then the observations.
plt.scatter(*C.T, marker='*', s=200, c='k')
plt.scatter(*data.T)

In [6]:
# Accumulators filled in by the assignment loop below.
clusters = []
distances = []


def dist(a, b, ax=1):
    """Return the Euclidean norm of ``a - b`` reduced along axis ``ax``."""
    difference = a - b
    return np.linalg.norm(difference, axis=ax)

# Groups: attach every observation to its nearest centroid.
for point in data:
    to_centroids = dist(point, C)  # one distance per centroid
    distances.append(to_centroids)
    clusters.append(np.argmin(to_centroids))  # index of the closest centroid

print(clusters, '\n', np.array(distances))

In [7]:
# Recompute each centroid as the mean of the observations assigned to it.
labels = np.asarray(clusters)
for i in range(C.shape[0]):
    members = data[labels == i]  # boolean mask instead of a Python-level filter
    if members.shape[0] == 0:
        # An empty cluster has no mean; keep its previous centroid instead of
        # overwriting it with NaN.
        continue
    C[i] = members.mean(axis=0)

C

In [8]:
# Plot: updated centroids as large black stars, then the observations.
plt.scatter(*C.T, marker='*', s=200, c='k')
plt.scatter(*data.T)

### Sklearn

### Exploratory data analysis

In [9]:
import pandas as pd

In [10]:
plt.rcParams['figure.figsize'] = (14,8)
plt.style.use('ggplot')

In [11]:
# Read the xclara CSV (Rdatasets repository on GitHub) straight from its URL,
# then report its dimensions and preview the first rows.
url = 'https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/refs/heads/master/csv/cluster/xclara.csv'
data = pd.read_csv(url)
print(data.shape)
data.head()

In [12]:
# Extract the two feature columns as NumPy arrays and draw them as small black dots.
v1 = data['V1'].values
v2 = data['V2'].values
plt.scatter(v1, v2, s=7, c='k')
plt.show()

In [13]:
from sklearn.cluster import KMeans
# Stack the two feature vectors as the columns of a two-column matrix
# (direct NumPy call instead of building a Python list of tuples first).
X = np.column_stack((v1, v2))
# A fixed seed keeps the cluster numbering and centres repeatable between runs,
# like the seeded K-means fits used for the elbow plots.
km = KMeans(n_clusters=3, n_init='auto', random_state=0)
km.fit(X)
y_hat = km.predict(X)        # cluster label of every sample
C = km.cluster_centers_      # fitted centres, one row per cluster

In [14]:
# Samples coloured by predicted cluster, with the centres overlaid as black stars.
plt.scatter(*X.T, c=y_hat, s=7)
plt.scatter(*C.T, marker='*', s=200, c='k')
plt.show()

#### Determinar número de clusters con gráfica del codo (alternativa a la gráfica de dispersión)


In [15]:
import seaborn as sns
sns.set_theme()

In [16]:
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=1000, n_features=12, centers=8, random_state=42)

# Search for the 'optimal' number of clusters: fit K-means once per candidate
# count and record the within-cluster sum of squares (WCSS) of each fit.
candidate_ks = range(4, 12)
wcss = []
for n_groups in candidate_ks:
    km = KMeans(n_clusters=n_groups, init='k-means++', n_init='auto', random_state=0)
    km.fit(X)
    wcss.append(km.inertia_)

# Plot
plt.plot(candidate_ks, wcss)
plt.title('Gráfica de codo')
plt.xlabel('Número de clusters')
plt.ylabel('WCSS')
plt.show()

In [17]:
### Bootcamp
from yellowbrick.cluster import KElbowVisualizer

X, _ = make_blobs(n_samples=1000, n_features=12, centers=8, random_state=42)

# Plot: let yellowbrick fit the estimator over the requested k values and draw the elbow curve.
estimator = KMeans(n_init='auto', random_state=0)
vis = KElbowVisualizer(estimator, k=(4,12), timings=False)
vis.fit(X)
plt.ylabel('Distorsión')
plt.xlabel('Número de clusters')
plt.show()

In [18]:
# 3D example: 800 samples with 3 features drawn around 4 centres.
# random_state pins the sample so the clustering and plot below are repeatable,
# as in the other make_blobs calls in this notebook.
X, y = make_blobs(n_samples=800, n_features=3, centers=4, random_state=42)

In [19]:
# Fit K-means with four clusters, label every sample and display the centres.
km = KMeans(n_clusters=4, n_init='auto').fit(X)
y_hat = km.predict(X)
C = km.cluster_centers_
C

In [20]:

# 3-D scatter: faint samples with the fitted centres drawn as black stars.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.scatter(*X.T, alpha=0.1)
ax.scatter(*C.T, marker='*', s=100, c='k')
plt.show()

### K-modas

In [21]:
# Data: eight observations, each with three categorical attributes.
data = np.array([list(row) for row in (
    'xyz',
    'yzx',
    'zxx',
    'yzz',
    'xzy',
    'zyx',
    'xxy',
    'zyx',
)])

# Initial modes: observations 0 and 5, copied via fancy indexing so that
# updating a mode never alters the data.
modes = data[[0, 5]]
data.shape

In [22]:
# Assign clusters: each observation goes to the mode it disagrees with in the
# fewest attributes (number of mismatching positions).
clusters = np.zeros(data.shape[0], dtype=int)  # integer labels, not floats
distances = []
for i, row in enumerate(data):  # `row`, so the builtin `object` is not shadowed
    # `mismatches` rather than `dist`, which would overwrite the `dist`
    # function defined in the K-means section.
    mismatches = [int(np.count_nonzero(row != mode)) for mode in modes]
    distances.append(mismatches)
    clusters[i] = np.argmin(mismatches)
print(clusters)
distances

In [23]:
# Update modes: for every cluster, set each attribute of its mode to the most
# frequent value among the observations assigned to that cluster.
labels = np.asarray(clusters)
for i in range(modes.shape[0]):
    points = data[labels == i]
    print(points)
    if points.shape[0] == 0:
        # No members: there is no most-frequent value to take, so the current
        # mode is kept (indexing/argmax on an empty selection would raise).
        continue
    for h in range(modes.shape[1]):
        vals, counts = np.unique(points[:, h], return_counts=True)
        modes[i, h] = vals[np.argmax(counts)]
modes

In [24]:
from kmodes.kmodes import KModes

In [25]:
# Data: the same eight three-attribute categorical observations, rebuilt from
# one compact string per row.
rows = ('xyz', 'yzx', 'zxx', 'yzz', 'xzy', 'zyx', 'xxy', 'zyx')
data = np.array([list(row) for row in rows])

In [26]:
# Model with 2 groups: fit K-modes on the categorical data, then display the
# label assigned to each row together with the fitted cluster centroids.
km = KModes(n_clusters=2, init='random', n_init=5, verbose=True)
grupos = km.fit_predict(data)
grupos, km.cluster_centroids_

In [27]:
# Another example: eight people described by three categorical traits.
personas = ['P1','P2','P3','P4','P5','P6','P7','P8']
col_cabello = np.array(['rubio', 'castaño', 'pelirrojo', 'negro', 'castaño', 'negro', 'pelirrojo', 'rubio'])
col_ojos = np.array(['azul', 'gris', 'verde', 'café', 'azul', 'gris', 'azul', 'café'])
tipo_cabello = np.array(['lacio', 'chino', 'ondulado', 'ondulado', 'chino', 'chino', 'ondulado', 'lacio'])

# Build the frame already indexed by person instead of adding the column and
# promoting it to the index afterwards.
data = pd.DataFrame(
    {'col_cabello': col_cabello, 'col_ojos': col_ojos, 'tipo_cabello': tipo_cabello},
    index=pd.Index(personas, name='person'),
)
data

In [28]:
# Elbow plot: fit K-modes for 1 to 5 clusters and record the cost of each fit.
k = range(1, 6)
cost = []
for nc in k:
    km = KModes(n_clusters=nc, init='random', n_init=5, verbose=False)
    km.fit_predict(data)
    cost.append(km.cost_)

plt.plot(k, cost, 'b*-')
plt.xlabel('# de grupos')
plt.ylabel('Costo')
plt.title('Gráfica de codo')
plt.show()

In [29]:
# 3 groups: fit K-modes on the people table, then display the label of each
# person together with the fitted cluster centroids.
km = KModes(n_clusters=3, init='random', n_init=5, verbose=False)
grupos = km.fit_predict(data)
grupos, km.cluster_centroids_

In [30]:
# Attach the cluster label as the first column. DataFrame.insert raises
# ValueError if the column already exists, so on a re-run of this cell the
# existing column is overwritten in place instead.
if 'grupo' in data.columns:
    data['grupo'] = grupos
else:
    data.insert(0, 'grupo', grupos)
data

### Agrupamiento aglomerativo

In [31]:
# Libraries this section relies on (presumably imported in earlier cells):
# pandas, numpy, matplotlib.pyplot
# Read the CSV behind the shortened URL and preview its first rows.
url = 'https://bit.ly/2COHM14'
data = pd.read_csv(url)
data.head()

In [32]:
# Preprocessing. NOTE: sklearn's `normalize` rescales each ROW (sample) to unit
# norm (L2 by default); it is not a per-column min-max scaling to [0, 1].
from sklearn.preprocessing import normalize
data_scaled = normalize(data)
# Wrap the result in a DataFrame again, reusing the original column labels.
data_scaled = pd.DataFrame(data_scaled, columns=data.columns)
data_scaled.head()

In [33]:
# Dendrogram
import scipy.cluster.hierarchy as shc
plt.figure(figsize=(16, 9))
plt.title('Dendrograma')
# Ward linkage: merges are chosen to minimise the variance.
ward_links = shc.linkage(data_scaled, method='ward')
dend = shc.dendrogram(ward_links)

In [34]:
# 2 groups
from sklearn.cluster import AgglomerativeClustering
# Ward linkage was used for the dendrogram, so it must be used here as well.
ac = AgglomerativeClustering(n_clusters=2, metric='euclidean', linkage='ward')
ac.fit(data_scaled)
y_hat = ac.labels_  # the labels fit_predict would return
y_hat

In [39]:
# Milk against Grocery, coloured by cluster label.
plt.figure(figsize=(10, 7))
plt.scatter(data_scaled['Milk'], data_scaled['Grocery'], s=50, c=y_hat, cmap='viridis')
plt.show()

In [40]:
# Fresh against Frozen, coloured by cluster label.
plt.figure(figsize=(10, 7))
plt.scatter(data_scaled['Fresh'], data_scaled['Frozen'], s=50, c=y_hat, cmap='viridis')
plt.show()

#### Otro ejemplo

In [41]:
# Libraries this section relies on (presumably imported in earlier cells):
# matplotlib.pyplot, numpy, pandas

# Data: read the CSV behind the shortened URL and preview its first rows.
dataset = pd.read_csv('https://bit.ly/3TUeh37')
dataset.head()

In [44]:
# Columns of interest: positions 3 and 4 of the table, as a NumPy array.
X = dataset.iloc[:, [3, 4]].values

# Dendrogram (scipy.cluster.hierarchy is already available as `shc`).
plt.figure(figsize=(16, 9))
ward_links = shc.linkage(X, method='ward')
dend = shc.dendrogram(ward_links)
plt.title('Dendrograma')
plt.show()

In [46]:
# 3 groups (AgglomerativeClustering was imported in an earlier cell).
ac = AgglomerativeClustering(n_clusters=3, metric='euclidean', linkage='ward')
ac.fit(X)
y_hat = ac.labels_  # the labels fit_predict would return
y_hat

In [48]:
# Visualisation: the two selected columns, coloured by cluster label.
plt.figure(figsize=(10, 7))
plt.scatter(*X.T, c=y_hat, s=50, cmap='viridis')

In [51]:
# 5 groups
ac = AgglomerativeClustering(n_clusters=5, metric='euclidean', linkage='ward')
ac.fit(X)
y_hc = ac.labels_  # the labels fit_predict would return
y_hc

In [52]:
# Visualisation: one scatter call per cluster so each gets its own colour and
# legend entry.
plt.figure(figsize=(10, 7))
palette = ['red', 'blue', 'green', 'cyan', 'magenta']
for cluster_id, colour in enumerate(palette):
    members = X[y_hc == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], s=100, c=colour, label=f'Cluster {cluster_id + 1}')
plt.title('Grupos de clientes')
plt.xlabel('Ingreso anual (k$)')
plt.ylabel('Nivel de gastos (1-100)')
plt.legend()
plt.show()

### Agrupamiento por modelos de mezclas Gaussianas

In [None]:
# Libraries this section relies on (presumably imported in earlier cells):
# pandas, matplotlib.pyplot, numpy, seaborn
# Apply seaborn's default theme to the plots that follow.
sns.set_theme()

In [54]:
# Data: 300 synthetic samples around 4 centres, plotted as a scatter.
from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=300, centers=4, cluster_std=0.6, random_state=0)
plt.scatter(*X.T, s=50)
plt.show()

In [57]:
# Choose the number of groups: fit one mixture per candidate size and compare
# their information criteria.
from sklearn.mixture import GaussianMixture
n_clusters = range(1, 16)
models = [GaussianMixture(n, covariance_type='full', random_state=0).fit(X) for n in n_clusters]
bic_scores = [m.bic(X) for m in models]  # Bayesian information criterion
aic_scores = [m.aic(X) for m in models]  # Akaike information criterion
plt.plot(n_clusters, bic_scores, label='BIC')
plt.plot(n_clusters, aic_scores, label='AIC')
plt.legend(loc='best')
plt.show()

In [58]:
# Fit a 4-component Gaussian mixture and label every sample.
# random_state=0 makes the component numbering repeatable between runs and
# matches the seed used by the model-selection fits.
gmm = GaussianMixture(n_components=4, random_state=0)
gmm.fit(X)
y_hat = gmm.predict(X)
y_hat

In [60]:
# Samples coloured by the mixture component they were assigned to.
plt.scatter(*X.T, c=y_hat, s=50, cmap='rainbow')
plt.show()

In [1]:
# Importar las bibliotecas necesarias
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as shc

# Load and preprocess the data.
url = 'https://bit.ly/2COHM14'
data = pd.read_csv(url)
# Normalise the table and wrap the result in a DataFrame that keeps the
# original column labels.
data_scaled = pd.DataFrame(normalize(data), columns=data.columns)

# scikit-learn metric names that SciPy's linkage spells differently.
_SCIPY_METRIC_NAMES = {'manhattan': 'cityblock', 'l1': 'cityblock', 'l2': 'euclidean'}


# Helper that draws a dendrogram
def plot_dendrogram(data, linkage_method, metric='euclidean'):
    """Draw the hierarchical-clustering dendrogram of ``data``.

    Args:
        data: Observations to cluster, one per row.
        linkage_method: Linkage method passed to ``scipy.cluster.hierarchy.linkage``
            (e.g. ``'ward'``, ``'single'``, ``'complete'``, ``'average'``).
        metric: Distance metric used to build the linkage. scikit-learn names
            such as ``'manhattan'`` are translated to their SciPy equivalent.
            Defaults to ``'euclidean'``, which was previously the only
            (implicit) choice.
    """
    scipy_metric = _SCIPY_METRIC_NAMES.get(metric, metric)
    plt.figure(figsize=(16, 9))
    plt.title(f'Dendrograma ({linkage_method})')
    shc.dendrogram(shc.linkage(data, method=linkage_method, metric=scipy_metric))
    plt.show()


# Helper that runs the clustering and visualises the result
def perform_clustering(data, n_clusters, metric, linkage, x_col='Milk', y_col='Grocery'):
    """Fit agglomerative clustering on ``data`` and scatter two of its columns.

    Args:
        data: DataFrame of observations; must contain ``x_col`` and ``y_col``.
        n_clusters: Number of clusters to form.
        metric: Distance metric given to ``AgglomerativeClustering``.
        linkage: Linkage criterion given to ``AgglomerativeClustering``.
        x_col: Column drawn on the horizontal axis (default ``'Milk'``).
        y_col: Column drawn on the vertical axis (default ``'Grocery'``).
    """
    ac = AgglomerativeClustering(n_clusters=n_clusters, metric=metric, linkage=linkage)
    y_hat = ac.fit_predict(data)

    plt.figure(figsize=(10, 7))
    plt.scatter(data[x_col], data[y_col], c=y_hat, s=50, cmap='viridis')
    plt.title(f'Clustering (metric: {metric}, linkage: {linkage})')
    plt.show()


# Each configuration draws its dendrogram and its clustering with the SAME
# metric and linkage, so the tree shown is the one the clustering actually cuts.
configurations = [
    ('euclidean', 'ward'),      # Euclidean distance (original setup)
    ('manhattan', 'single'),    # Manhattan distance
    ('cosine', 'complete'),     # Cosine distance
    ('euclidean', 'average'),   # Average linkage
]
for metric_name, linkage_name in configurations:
    plot_dendrogram(data_scaled, linkage_name, metric=metric_name)
    perform_clustering(data_scaled, 2, metric_name, linkage_name)