In [None]:

# Import necessary libraries
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, accuracy_score
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings("ignore")
    

In [None]:

# Load the dataset.
# NOTE(review): this is an absolute Colab-style path; change DATA_PATH when
# running the notebook in a different environment.
DATA_PATH = "/content/sample_data/heart-disease.csv"
df = pd.read_csv(DATA_PATH)

# Data preprocessing: report (but do not remove) duplicated rows and count
# the missing cells across the whole frame.
print('Checking for duplicated rows:', df.duplicated().sum())
print('Checking for missing values:', df.isna().sum().sum(), 'values missing')

# Fill missing values, if any, with the mean of their column.
# numeric_only=True limits the means to numeric columns: on pandas >= 2.0 a
# plain df.mean() raises as soon as one column is non-numeric. Non-numeric
# columns are therefore left unfilled instead of aborting the cell.
# The result is reassigned rather than using inplace=True so the cell stays a
# plain "inputs -> new value" step.
df = df.fillna(df.mean(numeric_only=True))
    

In [None]:

# Define features (X): every column of df except the ones that are not inputs.
# 'Cluster' is written into df by the K-Means cell further down; excluding it
# here keeps this cell idempotent, so re-running it after clustering does not
# feed the previous cluster labels back in as a feature.
# errors='ignore' lets the cell run when a listed column is absent (e.g. on a
# fresh kernel before clustering, or on data without a 'target' column).
NON_FEATURE_COLUMNS = ['target', 'Cluster']
X = df.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')

# Standardize the dataset so each feature has zero mean and unit variance;
# K-Means is distance-based, so unscaled features would dominate by magnitude.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
    

In [None]:

# Apply K-Means clustering
n_clusters = 2  # Assuming we want 2 clusters

# n_init is set explicitly: the library default for the number of random
# restarts changed between scikit-learn releases, and warnings are silenced in
# this notebook, so relying on the default would make the clustering depend on
# the installed version without any visible notice. random_state fixes the
# seed so repeated runs give the same labels.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(X_scaled)

# Add cluster labels to the dataset (one label per row, same order as df).
df['Cluster'] = cluster_labels
    

In [None]:

# Evaluate clustering performance (silhouette score): uses only the scaled
# features and the assigned labels, no ground truth involved.
silhouette_avg = silhouette_score(X_scaled, cluster_labels)
print(f"Silhouette Score: {silhouette_avg:.2f}")

# If you want to compare clusters with the actual target
if 'target' in df.columns:
    # Cluster ids are arbitrary: K-Means may call the "target == 1" group
    # cluster 0. Comparing raw ids with the target would then report an
    # inverted score, so each cluster is first mapped to the target class that
    # is most frequent among its members.
    contingency = pd.crosstab(cluster_labels, df['target'].to_numpy())
    cluster_to_target = contingency.idxmax(axis=1)
    aligned_labels = pd.Series(cluster_labels).map(cluster_to_target).to_numpy()

    # Accuracy of the majority-class mapping (several clusters may map to the
    # same class, so this is cluster purity rather than a one-to-one matching).
    target_accuracy = accuracy_score(df['target'], aligned_labels)
    print(f"Accuracy with respect to the target: {target_accuracy:.2f}")
    

In [None]:

# Visualize clusters in 2D.
# The scaled feature matrix is projected onto its first two principal
# components so the scatter reflects all features, not just the first two
# columns of X_scaled.
pca = PCA(n_components=2, random_state=42)
X_2d = pca.fit_transform(X_scaled)
explained = pca.explained_variance_ratio_

# Explicit figure/axes so labels and title are attached to this plot only.
fig, ax = plt.subplots()
sns.scatterplot(
    x=X_2d[:, 0],
    y=X_2d[:, 1],
    hue=cluster_labels,
    palette='viridis',
    ax=ax,
)
ax.set_title("K-Means Clustering Results")
# Axis labels report how much of the total variance each component carries.
ax.set_xlabel(f"PC1 ({explained[0]:.0%} of variance)")
ax.set_ylabel(f"PC2 ({explained[1]:.0%} of variance)")
plt.show()
    