## 📚 1. Importação das Bibliotecas

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.cluster import KMeans


## 🔄 2. Carregamento dos Dados

In [None]:
# Load the dataset from the CSV file into a DataFrame.
# NOTE(review): the file name suggests the features were normalised upstream — confirm.
data_df = pd.read_csv('diabetes_normalizado.csv')

In [None]:
# Count how many rows carry each distinct value of the 'Diabetes_binary' column.
data_df['Diabetes_binary'].value_counts()

## 🗂️ 3. Segmentação

In [None]:
# Feature columns fed to the clustering model, and the outcome column that is
# kept aside for interpreting the resulting clusters.
predictors = [
    'HighBP',
    'HighChol',
    'BMI',
    'Smoker',
    'Stroke',
    'HeartDiseaseorAttack',
    'PhysActivity',
    'HvyAlcoholConsump',
    'GenHlth',
    'MentHlth',
    'PhysHlth',
    'DiffWalk',
    'Age',
    'Income',
]
target = 'Diabetes_binary'

In [None]:
# Preview the first rows of the dataset.
data_df.head()

In [None]:
# Choose the number of clusters by inspecting the K-Means inertia curve.
# The models are fitted on the predictor columns only, so the search uses the
# same feature space as the final clustering model. Fitting on the whole frame
# would let the target column (and, when this cell is re-run, the 'Cluster'
# column) influence the inertia values.
k_values = range(1, 10)
inertia = []
for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(data_df[predictors])
    inertia.append(kmeans.inertia_)

# Plot inertia against the number of clusters.
plt.figure(figsize=(10, 10))
plt.plot(k_values, inertia, marker='o')
plt.title('Inertia Values by Number of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.show()

In [None]:
# Build the final K-Means model with three clusters, fit it on the predictor
# columns, and store each row's assigned cluster label in a new 'Cluster' column.
kmeans = KMeans(n_clusters=3, random_state=1234)
kmeans.fit(data_df[predictors])
data_df['Cluster'] = kmeans.labels_

In [None]:
# Preview the first rows again, now that the 'Cluster' column has been added.
data_df.head()

In [None]:
# Per-cluster summary: mean of every predictor and of the target, plus the
# number of rows assigned to each cluster.
summary_columns = predictors + [target]
cluster_analysis = data_df.groupby('Cluster')[summary_columns].mean()

# The counts are aligned to the summary table by cluster label on assignment.
cluster_sizes = data_df['Cluster'].value_counts()
cluster_analysis['Nº de Indivíduos'] = cluster_sizes

# Show the mean characteristics per cluster.
print("Características médias por cluster:", cluster_analysis, sep="\n")

In [None]:
# cluster 0 - risco alto de diabetes
# cluster 1 - risco baixo de diabetes
# cluster 2 - risco moderado de diabetes

In [None]:
import pandas as pd
import joblib

# Columns passed through the scaler, and columns removed afterwards.
# 'Education' is scaled together with the others and then dropped.
# NOTE(review): presumably the scaler was fitted on exactly these columns in
# this order — confirm against the preprocessing step.
continuous_features = ['BMI', 'GenHlth', 'MentHlth', 'PhysHlth', 'Age', 'Education', 'Income']
unwanted_features = ['Education']

# Load the previously saved scaler from disk.
# NOTE(review): joblib.load unpickles arbitrary objects; only load trusted files.
scaler = joblib.load('scaler.pkl')

# Raw (unscaled) values for one example patient.
patient_record = {
    "HighBP": 1.0,
    "HighChol": 1.0,
    "BMI": 40.0,
    "Smoker": 1.0,
    "Stroke": 1.0,
    "HeartDiseaseorAttack": 1.0,
    "PhysActivity": 1.0,
    "HvyAlcoholConsump": 1.0,
    "GenHlth": 5.0,
    "MentHlth": 0.0,
    "PhysHlth": 4.0,
    "DiffWalk": 1.0,
    "Age": 11.0,
    "Education": 1.0,
    "Income": 2.0,
}
novo_paciente = pd.DataFrame([patient_record])

# Replace the continuous columns with their scaled values.
scaled_values = scaler.transform(novo_paciente[continuous_features])
novo_paciente[continuous_features] = scaled_values

# Remove the columns that should not be kept.
novo_paciente = novo_paciente.drop(columns=unwanted_features)

# Show the resulting single-row frame.
print(novo_paciente)


In [None]:
# Show the scaled values of the new patient.
print("Novo paciente (normalizado):")
print(novo_paciente)

# Predict the cluster of the new patient.
# The predictor columns are selected explicitly so the feature set and column
# order always match what the model was fitted on, instead of depending on
# the order in which the patient record happened to be written.
predicted_cluster = kmeans.predict(novo_paciente[predictors])

print(f"O novo paciente pertence ao cluster: {predicted_cluster[0]}")

In [None]:
# Draw one scatter plot per dataframe column against 'Diabetes_binary',
# coloured by cluster. The target column itself is skipped: plotting it
# against itself carries no information, and the original intent was to
# iterate over every column except 'Diabetes_binary'.
for column in data_df.columns:
    if column == target:
        continue
    plt.figure(figsize=(8, 6))
    sns.scatterplot(data=data_df, x=column, y=target, hue='Cluster', palette='viridis', alpha=0.7)
    plt.title(f"Scatterplot de {column} vs Diabetes_binary")
    plt.xlabel(column)
    plt.ylabel("Diabetes_binary")
    plt.legend(title="Cluster", loc="best")
    plt.show()

In [None]:
# Scatter plot of BMI against GenHlth, coloured by cluster.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=data_df,
    x='BMI',
    y='GenHlth',
    hue='Cluster',
    palette='viridis',
    alpha=0.7,
    ax=ax,
)
ax.set_title("Scatterplot de BMI vs GenHlth")
ax.set_xlabel("BMI")
ax.set_ylabel("GenHlth")
ax.legend(title="Cluster", loc="best")
plt.show()



In [None]:
# Scatter plot of BMI against MentHlth, coloured by cluster.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=data_df,
    x='BMI',
    y='MentHlth',
    hue='Cluster',
    palette='viridis',
    alpha=0.7,
    ax=ax,
)
ax.set_title("Scatterplot de BMI vs MentHlth")
ax.set_xlabel("BMI")
ax.set_ylabel("MentHlth")
ax.legend(title="Cluster", loc="best")
plt.show()



In [None]:
# Scatter plot of BMI against PhysHlth, coloured by cluster.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=data_df,
    x='BMI',
    y='PhysHlth',
    hue='Cluster',
    palette='viridis',
    alpha=0.7,
    ax=ax,
)
ax.set_title("Scatterplot de BMI vs PhysHlth")
ax.set_xlabel("BMI")
ax.set_ylabel("PhysHlth")
ax.legend(title="Cluster", loc="best")
plt.show()

In [None]:
# Scatter plot of Income against PhysHlth, coloured by cluster.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=data_df,
    x='Income',
    y='PhysHlth',
    hue='Cluster',
    palette='viridis',
    alpha=0.7,
    ax=ax,
)
ax.set_title("Scatterplot de Income vs PhysHlth")
ax.set_xlabel("Income")
ax.set_ylabel("PhysHlth")
ax.legend(title="Cluster", loc="best")
plt.show()

In [None]:
# Scatter plot of Income against HeartDiseaseorAttack, coloured by cluster.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=data_df,
    x='Income',
    y='HeartDiseaseorAttack',
    hue='Cluster',
    palette='viridis',
    alpha=0.7,
    ax=ax,
)
ax.set_title("Scatterplot de Income vs HeartDiseaseorAttack")
ax.set_xlabel("Income")
ax.set_ylabel("HeartDiseaseorAttack")
ax.legend(title="Cluster", loc="best")
plt.show()