<a href="https://colab.research.google.com/github/DeaAnalytics/univ-workshop/blob/main/notebooks/Python/Sesion6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip uninstall -y -q plotly; pip install -q plotly
# !pip uninstall -y -q pandas; pip install -q pandas

In [2]:
import pandas as pd 
import plotly
import plotly.express as px
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [3]:
# Make DataFrame.plot() render through plotly.
pd.options.plotting.backend = "plotly"

In [4]:
# Read the workshop CSV straight from its GitHub URL into the DataFrame `df`.
DATA_URL = "https://github.com/DeaAnalytics/univ-workshop/raw/main/data/K_Means_U.csv"
df = pd.read_csv(DATA_URL)
# head() previews the first rows of the table together with its column headers.
df.head()

Unnamed: 0,Temperatura,Velocidad,Elongacion,Diametro
0,500,1600,3.4,0.4
1,520,1500,3.5,0.2
2,520,1400,3.4,0.2
3,470,1600,3.2,0.2
4,480,1600,3.1,0.2


In [15]:
# Scatter-plot matrix of the raw data, used to spot patterns and candidate groups by eye
fig = px.scatter_matrix(df, dimensions=df.columns)
fig.show()

In [5]:
# Summary statistics per column (count, mean, std, min, quartiles, max)
df.describe()

Unnamed: 0,Temperatura,Velocidad,Elongacion,Diametro
count,150.0,150.0,150.0,150.0
mean,584.333333,3758.666667,3.054,1.198667
std,82.806613,1764.42042,0.433594,0.763161
min,430.0,1000.0,2.0,0.1
25%,510.0,1600.0,2.8,0.3
50%,580.0,4350.0,3.0,1.3
75%,640.0,5100.0,3.3,1.8
max,790.0,6900.0,4.4,2.5


In [6]:
# The variables differ widely in magnitude (see the describe() summary), so they are
# standardised before clustering.
# A later cell appends the cluster label column "Grupo" to `df` in place; it is excluded here
# so that re-running this cell afterwards still scales only the measured variables.
feature_cols = [col for col in df.columns if col != "Grupo"]
scaler = StandardScaler()
# Keep the original column names and row index on the scaled copy.
df_scale = pd.DataFrame(scaler.fit_transform(df[feature_cols]), columns=feature_cols, index=df.index)
df_scale.head()

Unnamed: 0,Temperatura,Velocidad,Elongacion,Diametro
0,-1.021849,-1.227541,0.800654,-1.050031
1,-0.779513,-1.284407,1.032057,-1.312977
2,-0.779513,-1.341272,0.800654,-1.312977
3,-1.385353,-1.227541,0.337848,-1.312977
4,-1.264185,-1.227541,0.106445,-1.312977


In [7]:
def Elbow(df, min_clusters=1, max_clusters=50):
  """Compute the K-Means inertia (WCSS) for a range of cluster counts.

  Inertia is the sum of squared distances from every sample to the centroid
  of its cluster; plotting it against k gives the "elbow" curve.

  Args:
    df: samples to cluster, one row per observation (must support len() and
      be accepted by KMeans.fit).
    min_clusters: smallest k to try (inclusive).
    max_clusters: largest k to try (inclusive). Capped at the number of rows,
      because KMeans rejects n_clusters greater than n_samples.

  Returns:
    DataFrame indexed by "n_clusters" (integers) with a single "wcss" column;
    empty when the range contains no k.
  """
  # Without this cap the default max_clusters=50 raises on any dataset under 50 rows.
  upper = min(max_clusters, len(df))
  ks = list(range(min_clusters, upper + 1))
  wcss = []
  for k in ks:
    # Fixed seed and n_init keep the curve reproducible between runs.
    kmeans = KMeans(n_clusters=k, init="k-means++", max_iter=300, n_init=10, random_state=0)
    kmeans.fit(df)
    wcss.append(kmeans.inertia_)
  # Building from a dict keeps k as an integer index; transposing a mixed
  # list of lists would coerce it to float.
  return pd.DataFrame({"n_clusters": ks, "wcss": wcss}).set_index("n_clusters")

In [8]:
# Elbow curve: inertia for k = 1..15 on the scaled data, plotted against k
clusters = Elbow(df_scale, max_clusters=15)
clusters.plot()

In [9]:
# Fit the final K-Means model with k=3 clusters on the scaled data.
# random_state fixes the seed so the result does not change between executions.
# n_init is pinned to 10 (the value shown in this cell's recorded output and used by the
# Elbow/Silhouette helpers) because the library default differs between scikit-learn releases.
k = 3
kMeans = KMeans(n_clusters=k, n_init=10, random_state=123)
kMeans.fit(X=df_scale)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=123, tol=0.0001, verbose=0)

In [10]:
# Inertia (within-cluster sum of squared distances) of the fitted k=3 model
kMeans.inertia_

140.965816630747

In [12]:
# Cluster index assigned to each row of df_scale, in row order
kMeans.labels_

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 2, 2, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0,
       2, 0, 0, 0, 0, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0,
       0, 2, 2, 2, 2, 0, 2, 0, 2, 0, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0, 0, 2,
       2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

In [13]:
# Centroid coordinates, one row per cluster, expressed in the scaled feature space
kMeans.cluster_centers_

array([[-0.05021989,  0.34753171, -0.88029181,  0.28206327],
       [-1.01457897, -1.30487835,  0.84230679, -1.25512862],
       [ 1.13597027,  0.996271  ,  0.09659843,  1.01717187]])

In [16]:
# Scatter matrix of the scaled variables, coloured by cluster label.
# Integer labels yield a continuous colour scale: NOT RECOMMENDED for visualising clusters.
fig = px.scatter_matrix(df_scale, dimensions=df_scale.columns, color=kMeans.labels_)
fig.show()

In [17]:
# Scatter matrix of the scaled variables, coloured by cluster label.
# Casting the labels to str yields one discrete colour per cluster: RECOMMENDED for visualising clusters.
fig = px.scatter_matrix(df_scale, dimensions=df_scale.columns, color=kMeans.labels_.astype(str))
fig.show()

In [18]:
from sklearn.metrics import silhouette_score
from sklearn.metrics import silhouette_samples

In [19]:
# Overall silhouette score of the k=3 clustering, computed on the scaled data
silhouette_score(X=df_scale, labels=kMeans.labels_)

0.45897178668557237

In [20]:
# Silhouette value of every individual sample of df_scale under the k=3 clustering
silhouette_samples(X=df_scale, labels=kMeans.labels_)

array([ 0.70261401,  0.71745103,  0.71262009,  0.67084552,  0.6310475 ,
        0.64619953,  0.58358858,  0.53526543,  0.63932991,  0.67919109,
        0.66245827,  0.63932991,  0.55676622,  0.72268674,  0.72887695,
        0.08121394,  0.63927004,  0.66019986,  0.64227539,  0.56480207,
        0.67210029,  0.66871307,  0.68320211,  0.71137344,  0.34258316,
        0.16825018,  0.35921488,  0.54341995,  0.13314978,  0.53825996,
        0.23385527,  0.40753533,  0.03720138,  0.54373834,  0.42226153,
        0.33939124,  0.46926967,  0.33099504,  0.49436241,  0.19035341,
        0.39110564,  0.54899662,  0.4037049 ,  0.58241483,  0.03883921,
        0.45814823,  0.36708449,  0.42571909,  0.21844191,  0.05222353,
        0.03996145,  0.34239177,  0.37364238,  0.55581837,  0.56646216,
        0.55681822,  0.56830957,  0.39052244,  0.40374251,  0.11226518,
        0.28053667,  0.41853205,  0.43906479,  0.5861081 ,  0.58398743,
        0.23958608,  0.57988808,  0.43673834,  0.58366904,  0.42

In [23]:
def Silhouette(df, min_clusters=2, max_clusters=50):
  """Compute the silhouette score of K-Means for a range of cluster counts.

  The silhouette compares how tightly samples sit inside their own cluster
  with how well separated they are from the other clusters.

  Args:
    df: samples to cluster, one row per observation (must support len() and
      be accepted by KMeans.fit and silhouette_score).
    min_clusters: smallest k to try (inclusive).
    max_clusters: largest k to try (inclusive). Capped at n_samples - 1,
      the largest label count silhouette_score accepts.

  Returns:
    DataFrame indexed by "n_clusters" (integers) with a single
    "silhouette_scores" column; empty when the range contains no k.
  """
  # Without this cap the default max_clusters=50 raises on small datasets.
  upper = min(max_clusters, len(df) - 1)
  ks = list(range(min_clusters, upper + 1))
  silhouette_scores = []
  for k in ks:
    # Fixed seed and n_init keep the curve reproducible between runs.
    kmeans = KMeans(n_clusters=k, init="k-means++", max_iter=300, n_init=10, random_state=0)
    kmeans.fit(df)
    silhouette_scores.append(silhouette_score(X=df, labels=kmeans.labels_))
  # Building from a dict keeps k as an integer index; transposing a mixed
  # list of lists would coerce it to float.
  return pd.DataFrame({"n_clusters": ks, "silhouette_scores": silhouette_scores}).set_index("n_clusters")

In [24]:
# Silhouette score for k = 2..15 on the scaled data, plotted against k
clusters = Silhouette(df_scale, max_clusters=15)
clusters.plot()

In [25]:
# Attach each row's cluster label to the original (unscaled) table as column "Grupo".
# NOTE: this mutates `df` in place, so earlier cells that use every column of `df`
# will also pick up "Grupo" if they are re-run after this one.
df["Grupo"] = kMeans.labels_
df

Unnamed: 0,Temperatura,Velocidad,Elongacion,Diametro,Grupo
0,500,1600,3.4,0.4,1
1,520,1500,3.5,0.2,1
2,520,1400,3.4,0.2,1
3,470,1600,3.2,0.2,1
4,480,1600,3.1,0.2,1
...,...,...,...,...,...
145,510,1500,3.7,0.4,1
146,460,1000,3.6,0.2,1
147,510,1700,3.3,0.5,1
148,480,1900,3.4,0.2,1


In [34]:
# Scatter matrix of the variables in their original scale, coloured by cluster.
# The label column is addressed by name instead of by position, so the plot stays correct
# even if further columns are appended to df after "Grupo".
# Casting the labels to str yields one discrete colour per cluster: RECOMMENDED for visualising clusters.
variables = [col for col in df.columns if col != "Grupo"]
fig = px.scatter_matrix(df, dimensions=variables, color=df["Grupo"].astype(str))
fig.show()