Importamos las librerías necesarias

In [1]:
from sklearn.cluster import KMeans
import pandas as pd
from sklearn.metrics import silhouette_score

Cargamos los datos a un dataframe con pandas

In [2]:
# Load the crime dataset from a CSV file located relative to the working directory.
df_crime = pd.read_csv('crime_data.csv')

In [3]:
# Preview the first rows to confirm the columns were parsed as expected.
df_crime.head()

Unnamed: 0,State,Murder,Assault,UrbanPop,Rape
0,Alabama,13.2,236,58,21.2
1,Alaska,10.0,263,48,44.5
2,Arizona,8.1,294,80,31.0
3,Arkansas,8.8,190,50,19.5
4,California,9.0,276,91,40.6


Estadísticas básicas del dataframe

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df_crime.describe()

Unnamed: 0,Murder,Assault,UrbanPop,Rape
count,50.0,50.0,50.0,50.0
mean,7.788,170.76,65.54,21.232
std,4.35551,83.337661,14.474763,9.366385
min,0.8,45.0,32.0,7.3
25%,4.075,109.0,54.5,15.075
50%,7.25,159.0,66.0,20.1
75%,11.25,249.0,77.75,26.175
max,17.4,337.0,91.0,46.0


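Se separa el dataframe en las features numéricas (X) y los nombres de los estados (y). KMeans es un método no supervisado, por lo que `y` no se usa como target para entrenar: solo sirve para identificar a qué estado corresponde cada fila.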

In [5]:
# Numeric columns fed to the clustering model; the state name is kept apart
# so each row can be identified by name once it has a cluster assigned.
feature_columns = ['Murder', 'Assault', 'UrbanPop', 'Rape']
X = df_crime[feature_columns]
y = df_crime['State']

Se crea el modelo de KMeans con 3 clusters y se ajusta con los datos

In [6]:
# Fit K-Means with 3 clusters on the four numeric features.
# random_state seeds the centroid initialisation. n_init is set explicitly because
# scikit-learn changed its default from 10 to 'auto' in version 1.4 (emitting a
# FutureWarning in 1.2-1.3); leaving it unset makes the number of restarts depend
# on the installed version.
# NOTE(review): X is not scaled. Assault (std ~83) has a much larger spread than
# Murder (std ~4.4), so it dominates the Euclidean distances - consider
# standardising the features before clustering.
kmean = KMeans(n_clusters=3, n_init=10, random_state=0).fit(X)

Se obtiene el conjunto de etiquetas de cluster distintas que el modelo asignó a las observaciones

In [7]:
# Collapse the fitted model's label array into the set of distinct cluster ids.
targets = set(kmean.labels_)

In [8]:
# Display the distinct cluster ids; the model was built with n_clusters=3.
targets

{0, 1, 2}

Se evalúa el modelo con el coeficiente de silhouette

In [9]:
# Silhouette score of the clustering, computed on the same features (X) the
# model was fitted on, using the labels it assigned.
silhouette_score(X, kmean.labels_)

0.5319024108339817

Obtenemos los centroides de cada cluster

In [10]:
# One row per cluster; columns follow the feature order used to build X
# (Murder, Assault, UrbanPop, Rape).
kmean.cluster_centers_

array([[  4.27      ,  87.55      ,  59.75      ,  14.39      ],
       [ 11.8125    , 272.5625    ,  68.3125    ,  28.375     ],
       [  8.21428571, 173.28571429,  70.64285714,  22.84285714]])

Guardamos los nombres de los estados que pertenecen a cada cluster

In [11]:
# Group the state names by the cluster id assigned to each row.
# Labels and names are paired positionally with zip(): the original y[i] lookup
# was label-based, so it only matched row position while the dataframe kept its
# default 0..n-1 index and would break (KeyError or wrong state) after any
# filtering or re-indexing.
clusters = {label: [] for label in targets}
for label, state in zip(kmean.labels_, y):
    clusters[label].append(state)

Mostramos los clusters junto a su centroide

In [12]:
# Report each cluster: its id, the centroid coordinates, and the member states.
for cluster_id, members in clusters.items():
    centroid = kmean.cluster_centers_[cluster_id]
    print(f'Cluster {cluster_id}')
    print(f'Centroide: {centroid}')
    print(members)

Cluster 0
Centroide: [ 4.27 87.55 59.75 14.39]
['Connecticut', 'Hawaii', 'Idaho', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Maine', 'Minnesota', 'Montana', 'Nebraska', 'New Hampshire', 'North Dakota', 'Ohio', 'Pennsylvania', 'South Dakota', 'Utah', 'Vermont', 'West Virginia', 'Wisconsin']
Cluster 1
Centroide: [ 11.8125 272.5625  68.3125  28.375 ]
['Alabama', 'Alaska', 'Arizona', 'California', 'Delaware', 'Florida', 'Illinois', 'Louisiana', 'Maryland', 'Michigan', 'Mississippi', 'Nevada', 'New Mexico', 'New York', 'North Carolina', 'South Carolina']
Cluster 2
Centroide: [  8.21428571 173.28571429  70.64285714  22.84285714]
['Arkansas', 'Colorado', 'Georgia', 'Massachusetts', 'Missouri', 'New Jersey', 'Oklahoma', 'Oregon', 'Rhode Island', 'Tennessee', 'Texas', 'Virginia', 'Washington', 'Wyoming']
