<a href="https://colab.research.google.com/github/Aabhas2/DataMining_Practicals/blob/main/KMeans.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_openml

# Fetch version 1 of the OpenML "titanic" dataset with pandas-backed data.
titanic = fetch_openml(name='titanic', version=1, as_frame=True)

# Work on a private copy so the frame stored on `titanic` is never modified.
df = titanic.frame.copy()

In [2]:
# Build the feature table from the untouched source frame instead of from `df`
# itself: `df = df.drop(...)` consumed its own input, so running this cell a
# second time raised KeyError on the columns that were already gone.
# `survived` is excluded from the features; the clusters are compared against
# it later in the notebook.
df = titanic.frame.drop(
    columns=['name','ticket','cabin','boat','body','home.dest','survived']
)
X = df  # same object as `df`; `X` is the name the modelling cells use

In [3]:
# Build preprocessing pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Split the feature columns by dtype so each group gets its own preprocessing.
numeric_features = X.select_dtypes(include=['int64','float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object','category']).columns.tolist()

# Numeric columns: fill missing values with the column median, then scale.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode. Each step has to be a (name, estimator) tuple; the imputer
# step was previously written as two bare list items, which left the pipeline
# unusable at fit time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Apply each transformer to its own column group and concatenate the results.
preprocessor = ColumnTransformer([
    ('num',numeric_transformer, numeric_features),
    ('cat',categorical_transformer, categorical_features)
])

In [5]:
# Fit K-Means
from sklearn.cluster import KMeans
# Importing necessary modules for preprocessing, as we're redefining preprocessor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Relies on X, numeric_features and categorical_features from the earlier cells.

# Numeric branch: median imputation followed by standard scaling.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: most-frequent imputation followed by one-hot encoding.
# Every step is a (name, estimator) tuple.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Chain preprocessing and a two-cluster K-Means into a single estimator.
kmeans = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('cluster', KMeans(n_clusters=2, random_state=42, n_init="auto")),
])

kmeans.fit(X)

# Cluster index assigned to each row of X by the fitted K-Means step.
labels = kmeans.named_steps['cluster'].labels_

In [6]:
# Echo the cluster assignments currently bound to `labels` as the cell output.
labels

array([0, 0, 0, ..., 1, 1, 1], dtype=int32)

In [7]:
# Evaluate Clusters vs Actual Survival (Interpretation)
# Take the assignments directly from the fitted two-cluster pipeline rather
# than from the mutable notebook-level `labels`, so this table always reflects
# the k=2 model regardless of the order in which cells were executed.
# `assign` returns a new frame, leaving `titanic.frame` untouched.
df_results = titanic.frame.assign(cluster=kmeans['cluster'].labels_)

# Rows: K-Means cluster id; columns: actual `survived` value.
pd.crosstab(df_results['cluster'],df_results['survived'])

survived,0,1
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,149,213
1,660,287


## Clustering Validation (Inertia + Silhouette)

In [8]:
from sklearn.metrics import silhouette_score

# The preprocessed feature matrix does not depend on k, so build it once
# instead of re-fitting the preprocessor on every iteration.
X_prepared = preprocessor.fit_transform(X)

for k in range(2,7):
    model = Pipeline([
        ('preprocess', preprocessor),
        ('cluster', KMeans(n_clusters=k, random_state=42, n_init='auto'))
    ])
    model.fit(X)

    # Loop-local name: rebinding the notebook-level `labels` here would
    # silently replace the k=2 assignments with those of the last k tried.
    k_labels = model['cluster'].labels_

    inertia = model['cluster'].inertia_
    sil = silhouette_score(X_prepared, k_labels)

    print(f"K={k}, Inertia={inertia:.1f}, Silhouette={sil:.3f}")

K=2, Inertia=5995.6, Silhouette=0.312
K=3, Inertia=5341.8, Silhouette=0.270
K=4, Inertia=4217.4, Silhouette=0.307
K=5, Inertia=3918.3, Silhouette=0.302
K=6, Inertia=3738.4, Silhouette=0.222
