In [1]:
import pandas as pd
from sklearn.preprocessing import PowerTransformer, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import pickle

In [2]:
data = pd.read_csv('updated_medical_lifestyle_dataset.csv')

In [3]:

numerical_features = ['serum_creatinine', 'gfr', 'bun', 'serum_calcium', 'oxalate_levels', 'urine_ph', 'blood_pressure']
data = data[numerical_features]


In [4]:
scaler = StandardScaler()
power_transformer = PowerTransformer(method='yeo-johnson', standardize=True)

numerical_transformer = Pipeline(steps=[
        ('power_transform', power_transformer), 
        ('scaler', scaler)
    ])

    # Combine preprocessors in a column transformer
preprocessor = ColumnTransformer(
    transformers=[
            ('num', numerical_transformer, numerical_features),
            
    ]
)

In [5]:
processed_data = preprocessor.fit_transform(data)
    # Save the preprocessor pipeline

In [6]:
# dbscan = DBSCAN(eps=1.5, min_samples=5)
# labels = dbscan.fit_predict(processed_data)
    


kmeans = KMeans(n_clusters=7, random_state=42)
labels = kmeans.fit_predict(processed_data)

data['cluster'] = labels

# with open('cluster_model.pkl', 'wb') as file:  # Use 'wb' mode for binary writing
#     pickle.dump(kmeans, file)

num_clusters = len(set(labels)) - (1 if -1 in labels else 0)
num_noise_points = list(labels).count(-1)

metrics = {}

if num_clusters > 1:
    non_noise_data = data[labels != -1]
    non_noise_labels = labels[labels != -1]

metrics['silhouette_score'] = silhouette_score(non_noise_data, non_noise_labels)
metrics['calinski_harabasz_score'] = calinski_harabasz_score(non_noise_data, non_noise_labels)
metrics['davies_bouldin_score'] = davies_bouldin_score(non_noise_data, non_noise_labels)

In [7]:

print(f"Silhouette Score: {metrics['silhouette_score']}")
print(f"Calinski-Harabasz Score: {metrics['calinski_harabasz_score']}")
print(f"Davies-Bouldin Score: {metrics['davies_bouldin_score']}")

Silhouette Score: 0.1081572152831103
Calinski-Harabasz Score: 1900.5003479170741
Davies-Bouldin Score: 5.869431781240045


In [8]:

cluster_data = pd.DataFrame(data, columns=numerical_features)
cluster_data['Cluster'] = labels


In [9]:
cluster_summary = cluster_data.groupby('Cluster').mean()
print(cluster_summary)

         serum_creatinine        gfr        bun  serum_calcium  \
Cluster                                                          
0                0.853225  76.026179  13.997609       9.338160   
1                0.843384  80.308409  13.478645       9.264033   
2                3.188500  51.767160  99.695123       6.660409   
3                0.834908  16.073113  13.512632       9.335233   
4                0.836568  70.306562  12.876114       9.357155   
5                0.871029  74.041041  13.636616       9.419180   
6                0.867094  17.874469  13.344756       9.357261   

         oxalate_levels  urine_ph  blood_pressure  
Cluster                                            
0              2.351360  6.459579      107.853724  
1              1.324310  6.898451      105.623077  
2              4.046387  4.948810      154.655016  
3              2.454943  7.093243      104.959020  
4              2.236652  7.167527       94.274696  
5              2.237813  7.597534      11

In [10]:
print(data.head())

   serum_creatinine         gfr         bun  serum_calcium  oxalate_levels  \
0          0.683683   32.946784    7.553739      10.039896        2.878164   
1          3.809044   32.685035  141.347494       8.330543        4.767639   
2          1.143827    2.079805   15.979104       9.419229        1.818613   
3          4.804657  109.871407   53.307333       7.556631        4.051686   
4          4.920235   42.214590  134.182157       7.289379        3.240920   

   urine_ph  blood_pressure  cluster  
0  7.864308      115.224217        5  
1  4.920015      130.143900        2  
2  6.188115       98.026072        6  
3  5.278607      142.166650        2  
4  4.862923      151.962572        2  


In [11]:
# Save the dataset with the added 'cluster' column to a CSV file
data.to_csv('updated_dataset_with_clusters.csv', index=False)

print("Dataset saved as 'updated_dataset_with_clusters.csv'")


Dataset saved as 'updated_dataset_with_clusters.csv'
