In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, pairwise_distances
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
import optuna

### Carregando Dados

In [2]:
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data         # Atributos das flores
y = iris.target       # Classes reais (usadas apenas para avaliação)

In [3]:
iris.data

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [4]:
sepal_length = []
sepal_width = []
petal_length = []
petal_width = []
for i in range(0,len(iris.data)):
    sepal_length.append(iris.data[i][0])

for j in range(0,len(iris.data)):
    sepal_width.append(iris.data[j][1])
    
for k in range(0,len(iris.data)):
    petal_length.append(iris.data[k][2])
    
for l in range(0,len(iris.data  )):
    petal_width.append(iris.data[l][3])

In [5]:
target = iris.target

In [6]:
df_iris = pd.DataFrame({'sepal_length': sepal_length, 'sepal_width': sepal_width, 'petal_length': petal_length, 'petal_width': petal_width})

In [7]:
df_iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
dtypes: float64(4)
memory usage: 4.8 KB


In [8]:
df_iris.head(10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
5,5.4,3.9,1.7,0.4
6,4.6,3.4,1.4,0.3
7,5.0,3.4,1.5,0.2
8,4.4,2.9,1.4,0.2
9,4.9,3.1,1.5,0.1


In [9]:
df_iris.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [10]:
px.histogram(df_iris.sepal_length, title="Distribuição da variável Sepal Length")

In [11]:
px.histogram(df_iris.sepal_width, title="Distribuição da variável Sepal Width")

In [12]:
px.histogram(df_iris.petal_length,title="Distribuição da variável Petal Length")

In [13]:
px.histogram(df_iris.petal_width, title="Distribuição da variável Petal Width")

In [14]:
corr = df_iris.corr()

fig = px.imshow(
    corr, 
    title='Mapa de correlação do DF',
    text_auto=".2f",   
    color_continuous_scale="RdBu_r",
    zmin=-1,
    zmax=1,
)

fig.show()

### Treinar Modelo

In [15]:
X = df_iris.copy()

numeric_features = df_iris.select_dtypes(include=['number']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features)
    ]
)

X_transformed = preprocessor.fit_transform(X)

In [16]:
# Criar modelo
modelo_kmeans = KMeans(n_clusters= 3, random_state=51)

# Treinar modelo
modelo_kmeans.fit(X_transformed)

# Calculando Silhouette Score
silhouette_avg = silhouette_score(X_transformed, modelo_kmeans.labels_, metric='euclidean')

In [17]:
silhouette_avg

0.4630420362927048

Com base nesse valor temos que a clusterizacao é razoalveamente bom com cluster definidos, porem com alguma separacao.

In [18]:
def kmeans_objective(trial):
    n_clusters = trial.suggest_int('n_clusters', 2,10)
    distance_metric = trial.suggest_categorical('distance_metric' , ['euclidean', 'minkowski'])
    # Criar modelo
    modelo_kmeans = KMeans(n_clusters= n_clusters, random_state=51)
    # Treinar modelo
    modelo_kmeans.fit(X_transformed)
    
    silhouette_avg = silhouette_score(X_transformed, modelo_kmeans.labels_, metric=distance_metric)
    
    return silhouette_avg

In [19]:
search_space = {'n_clusters': [2,3,4,5,6,7,8,9,10], 'distance_metric' : ['euclidean', 'minkowski']}
sampler = optuna.samplers.GridSampler(search_space=search_space)
study = optuna.create_study(direction='maximize', sampler = sampler)
study.optimize(kmeans_objective, n_trials=50)

[I 2025-11-07 10:13:14,541] A new study created in memory with name: no-name-b253c76b-38a1-4d6b-b820-674b6fefd264
[I 2025-11-07 10:13:14,557] Trial 0 finished with value: 0.4630420362927048 and parameters: {'n_clusters': 3, 'distance_metric': 'euclidean'}. Best is trial 0 with value: 0.4630420362927048.
[I 2025-11-07 10:13:14,570] Trial 1 finished with value: 0.33295384714167575 and parameters: {'n_clusters': 8, 'distance_metric': 'euclidean'}. Best is trial 0 with value: 0.4630420362927048.
[I 2025-11-07 10:13:14,587] Trial 2 finished with value: 0.3567677588394416 and parameters: {'n_clusters': 10, 'distance_metric': 'euclidean'}. Best is trial 0 with value: 0.4630420362927048.
[I 2025-11-07 10:13:14,598] Trial 3 finished with value: 0.4630420362927048 and parameters: {'n_clusters': 3, 'distance_metric': 'minkowski'}. Best is trial 0 with value: 0.4630420362927048.
[I 2025-11-07 10:13:14,613] Trial 4 finished with value: 0.3378323046563087 and parameters: {'n_clusters': 7, 'distance_

In [20]:
# Melhor confuguração encontrada pelo optuna
best_params = study.best_params

# Instanciando o modelo K-Means com melhores parametros
best_kmeans = KMeans(n_clusters=best_params['n_clusters'], random_state=51)
best_kmeans.fit(X_transformed)

# Calculando Silhouette Score
distances = pairwise_distances(X_transformed, metric=best_params['distance_metric'])
best_silhouette = silhouette_score(distances, best_kmeans.labels_)

print(f"K (Numero DE clusters): {best_params['n_clusters']}")
print(f"Métrica de distância selecionada: {best_params['distance_metric']}")
print(f'Silhouette Score: {best_silhouette}')

K (Numero DE clusters): 2
Métrica de distância selecionada: minkowski
Silhouette Score: 0.6469714400222696


Silhouette Score: 0.6469714400222696  -> Indica clusterizacao boa com clusters bem definidos 

Com isso, observamos que o algoritmo K-Means obteve melhor desempenho ao agrupar os dados em dois clusters, e não em três, como sugere a variável target. Isso ocorre porque, de acordo com as características das espécies Setosa, Versicolor e Virginica, a Setosa apresenta-se bem distinta das demais, enquanto Versicolor e Virginica possuem grande semelhança entre si. Assim, o algoritmo identificou de forma natural duas regiões bem definidas nos dados: uma correspondente à Setosa e outra que engloba Versicolor e Virginica.

In [21]:
df_iris_cluster = df_iris.copy()
df_iris_cluster['cluster'] = best_kmeans.labels_

In [22]:
df_iris.to_csv('.\datasets\iris.csv', index=False)
df_iris_cluster.to_csv('.\datasets\iris_cluster.csv', index=False)

In [23]:
df_iris_cluster.head(10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,cluster
0,5.1,3.5,1.4,0.2,1
1,4.9,3.0,1.4,0.2,1
2,4.7,3.2,1.3,0.2,1
3,4.6,3.1,1.5,0.2,1
4,5.0,3.6,1.4,0.2,1
5,5.4,3.9,1.7,0.4,1
6,4.6,3.4,1.4,0.3,1
7,5.0,3.4,1.5,0.2,1
8,4.4,2.9,1.4,0.2,1
9,4.9,3.1,1.5,0.1,1


In [25]:
df_iris_cluster.tail(10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,cluster
140,6.7,3.1,5.6,2.4,0
141,6.9,3.1,5.1,2.3,0
142,5.8,2.7,5.1,1.9,0
143,6.8,3.2,5.9,2.3,0
144,6.7,3.3,5.7,2.5,0
145,6.7,3.0,5.2,2.3,0
146,6.3,2.5,5.0,1.9,0
147,6.5,3.0,5.2,2.0,0
148,6.2,3.4,5.4,2.3,0
149,5.9,3.0,5.1,1.8,0


In [35]:
zero = 0
um = 0
dois = 0
for t in target:
    if (t == 0):
        zero += 1
    elif(t==1):
        um +=1
    else:
        dois +=1
        
print("cluster")
print(f"0     {zero}")
print(f"1     {um}")
print(f"2     {dois}")

cluster
0     50
1     50
2     50


In [None]:
df_iris_cluster['cluster'].value_counts()

cluster
0    100
1     50
Name: count, dtype: int64

In [37]:
px.scatter(df_iris_cluster, x='sepal_length', y='petal_length', color='cluster')

In [38]:
px.scatter(df_iris_cluster, x='sepal_length', y='sepal_width', color='cluster')

In [39]:
px.scatter(df_iris_cluster, x='sepal_length', y='petal_width', color='cluster')

In [40]:
px.scatter(df_iris_cluster, x='sepal_width', y='petal_length', color='cluster')

In [41]:
px.scatter(df_iris_cluster, x='sepal_width', y='petal_width', color='cluster')

In [42]:
px.scatter(df_iris_cluster, x='petal_length', y='petal_width', color='cluster')

A partir dos gráficos, é possível observar que o algoritmo K-Means realizou a divisão dos dados com maior ênfase nas variáveis petal_length e petal_width. Isso indica que essas variáveis tiveram maior influência na formação dos clusters, sendo, portanto, as mais relevantes para a separação das espécies no conjunto de dados.