In [2]:
import pandas as pd
import numpy as np
import plotly.express as px #criacao de graficos dinamicos
import plotly.graph_objects as go #para criacao e concatenacao de graficos
from sklearn.preprocessing import StandardScaler # para utilizar a padronização de dados
from sklearn.cluster import KMeans

In [4]:
# File location
file_location = "D://caiof//Documents//GIT_Repos//Arquiteto_BigData-IGTI//Bases_Dados//datasets//base_iris.csv"

In [5]:
base_iris = pd.read_csv(file_location, sep= ';')

base_iris.head()

Unnamed: 0,sepal length,sepal width,petal length,petal width,Class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [6]:
base_iris.shape

(150, 5)

In [8]:
#verifica as classes das flores

base_iris['Class'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

Realizando agrupamento por petalas

In [9]:
#criando variavel x com as colunas petal length e petal width
x_petalas = base_iris.iloc[: , [2,3]].values
x_petalas[:10]

array([[1.4, 0.2],
       [1.4, 0.2],
       [1.3, 0.2],
       [1.5, 0.2],
       [1.4, 0.2],
       [1.7, 0.4],
       [1.4, 0.3],
       [1.5, 0.2],
       [1.4, 0.2],
       [1.5, 0.1]])

Normalizando os dados

Como os dados estao em uma escala muito diferente,
precisamos normalizar os dados colocando no mesmo padrao e na mesma escala.
como o Kmeans realiza os calculos baseados na distancia, devemos padronizar os dados afim de que um atributo
nao seja considerado mais importante que o outro.

In [10]:
normalizar_dados = StandardScaler()
x_petalas = normalizar_dados.fit_transform(x_petalas)
x_petalas[ :10]

array([[-1.3412724 , -1.31297673],
       [-1.3412724 , -1.31297673],
       [-1.39813811, -1.31297673],
       [-1.2844067 , -1.31297673],
       [-1.3412724 , -1.31297673],
       [-1.17067529, -1.05003079],
       [-1.3412724 , -1.18150376],
       [-1.2844067 , -1.31297673],
       [-1.3412724 , -1.31297673],
       [-1.2844067 , -1.4444497 ]])

Calculando o numero do cluster

Para calcular o numero do cluster vamos utilizar o metodo cotovelo
WCSS é a soma da distancia quadrada entre cada ponto e o centroide é um cluster

In [12]:
wcss_petala = [] #cria uma lista vazia
for i in range(1,11):
    """
    #executa o kmeans para todos os clusters e random_state = 0 
    para fixar e obter os mesmos resultados
    """
    kmeans_petala = KMeans(n_clusters=i, random_state=0) 
    kmeans_petala.fit(x_petalas) #realiza o treinamento
    wcss_petala.append(kmeans_petala.inertia_) # adiciona na lista os valores de wcss



In [13]:
#visualizando os valores wcss
for i in range(len(wcss_petala)):
    print(f'cluster: {i} valor wcss: {wcss_petala[i]}')

cluster: 0 valor wcss: 300.0
cluster: 1 valor wcss: 54.14584701344989
cluster: 2 valor wcss: 18.04698389190627
cluster: 3 valor wcss: 12.307440251261845
cluster: 4 valor wcss: 9.181131495513897
cluster: 5 valor wcss: 7.215096212730808
cluster: 6 valor wcss: 6.026593155951447
cluster: 7 valor wcss: 5.173315218915379
cluster: 8 valor wcss: 4.419290104197288
cluster: 9 valor wcss: 3.9139417830543195


Criando grafico para melhor visualizacao

In [14]:
grafico_cotovelo_petala = px.line(x= range(1,11), y=wcss_petala)
grafico_cotovelo_petala.show()

In [15]:
kmeans_petala = KMeans(n_clusters=3, random_state=0)
label_cluster_petala = kmeans_petala.fit_predict(x_petalas)




KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.



In [16]:
#verifica a classificacao dos clusters

label_cluster_petala

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [17]:
centroides_petala = kmeans_petala.cluster_centers_
centroides_petala

array([[-1.30487835, -1.25512862],
       [ 1.02813193,  1.12749028],
       [ 0.30564587,  0.16609419]])

Grafico de agrupamento das caracteristicas do tamanho e comprimento das petalas

In [23]:
grafico_petala = px.scatter(x= x_petalas[: ,0], y = x_petalas[:,1], color= label_cluster_petala)
grafico_centroide_petala = px.scatter(x= centroides_petala[:,0], y= centroides_petala[:,1], size= [7,7,7])
grafico_final_petala = go.Figure(data= grafico_petala.data + grafico_centroide_petala.data)
grafico_final_petala.show()

Realizando o agrupamento com dados da sepala

In [26]:
# realizando agrupamento sepala
x_sepala = base_iris.iloc[:,[0,1]].values
x_sepala[:10]

array([[5.1, 3.5],
       [4.9, 3. ],
       [4.7, 3.2],
       [4.6, 3.1],
       [5. , 3.6],
       [5.4, 3.9],
       [4.6, 3.4],
       [5. , 3.4],
       [4.4, 2.9],
       [4.9, 3.1]])

In [27]:
#calcular o wcss
wcss_sepala = [] #lista vazia
for i in range(1,11):
    """
    executa o kmeans para todos os clusters 
    e random_state = 0 para fizar e obter os mesmos resultados
    """
    kmeans_sepala = KMeans(n_clusters=i , random_state=0)
    kmeans_sepala.fit(x_sepala) # realiza o treinamento
    wcss_sepala.append(kmeans_sepala.inertia_) #adiciona ba lista de valores wcss




KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.




KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.




KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.




KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.




KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.




KMeans is known to have a memory leak on Windows with MKL, when the

In [28]:
#visualizando os valores wcss
for i in range(len(wcss_sepala)):
    print(f'cluster: {i} valor wcss: {wcss_petala[i]}')

cluster: 0 valor wcss: 300.0
cluster: 1 valor wcss: 54.14584701344989
cluster: 2 valor wcss: 18.04698389190627
cluster: 3 valor wcss: 12.307440251261845
cluster: 4 valor wcss: 9.181131495513897
cluster: 5 valor wcss: 7.215096212730808
cluster: 6 valor wcss: 6.026593155951447
cluster: 7 valor wcss: 5.173315218915379
cluster: 8 valor wcss: 4.419290104197288
cluster: 9 valor wcss: 3.9139417830543195


In [29]:
grafico_cotovelo_petala = px.line(x= range(1,11), y=wcss_sepala)
grafico_cotovelo_petala.show()

In [30]:
kmeans_sepala = KMeans(n_clusters=3, random_state=0)
label_cluster_sepala = kmeans_sepala.fit_predict(x_sepala)




KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.



In [31]:
#verifica a classificacao dos clusters

label_cluster_sepala

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 2, 2, 2, 0, 2, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 2,
       0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2,
       2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 0, 2, 0, 2, 2, 0, 0, 2, 2, 2, 2,
       2, 0, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 0])

In [32]:
centroides_sepala = kmeans_sepala.cluster_centers_
centroides_sepala

array([[5.77358491, 2.69245283],
       [5.006     , 3.418     ],
       [6.81276596, 3.07446809]])

In [34]:
grafico_sepala = px.scatter(x= x_sepala[: ,0], y = x_sepala[:,1], color= label_cluster_sepala)
grafico_centroide_sepala = px.scatter(x= centroides_sepala[:,0], y= centroides_sepala[:,1], size= [7,7,7])
grafico_final_sepala = go.Figure(data= grafico_sepala.data + grafico_centroide_sepala.data)
grafico_final_sepala.show()