# Clustering the Dataset

In [12]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

## 1. Load the Dataset

In [14]:
ilpd_indian_liver_patient = fetch_ucirepo(id=225)

X = ilpd_indian_liver_patient.data.features
y = ilpd_indian_liver_patient.data.targets

In [15]:
X

Unnamed: 0,Age,Gender,TB,DB,Alkphos,Sgpt,Sgot,TP,ALB,A/G Ratio
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.90
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.00
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.40
...,...,...,...,...,...,...,...,...,...,...
578,60,Male,0.5,0.1,500,20,34,5.9,1.6,0.37
579,40,Male,0.6,0.1,98,35,31,6.0,3.2,1.10
580,52,Male,0.8,0.2,245,48,49,6.4,3.2,1.00
581,31,Male,1.3,0.5,184,29,32,6.8,3.4,1.00


## 2. Preprocessing

In [None]:
# Age: StandardScaler
# Gender: OneHotEncoding
# TB: StandardScaler
# DB: StandardScaler
# Alkphos: StandardScaler
# Sgpt: StandardScaler
# Sgot: StandardScaler
# TP: StandardScaler
# ALB: StandardScaler
# A/G Ratio: StandardScaler

# Consider the possibility to use RobustScaler

In [16]:
X.isna().sum() # Check for missing values (there's none)

Age          0
Gender       0
TB           0
DB           0
Alkphos      0
Sgpt         0
Sgot         0
TP           0
ALB          0
A/G Ratio    4
dtype: int64

In [20]:
# Build preprocessing pipeline

numeric_features = ['Age', 'TB', 'DB', 'Alkphos', 'Sgpt', 'Sgot', 'TP', 'ALB', 'A/G Ratio']
categorical_features = ['Gender']
all_columns = numeric_features + categorical_features

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)
    ]
)

pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

In [23]:
X_processed = pipeline.fit_transform(X)
X_processed = pd.DataFrame(X_processed, columns=all_columns, index=X.index)

X_processed

Unnamed: 0,Age,TB,DB,Alkphos,Sgpt,Sgot,TP,ALB,A/G Ratio,Gender
0,1.252098,-0.418878,-0.493964,-0.426715,-0.354665,-0.318393,0.292120,0.198969,-0.147390,0.0
1,1.066637,1.225171,1.430423,1.682629,-0.091599,-0.034333,0.937566,0.073157,-0.648461,1.0
2,1.066637,0.644919,0.931508,0.821588,-0.113522,-0.145186,0.476533,0.198969,-0.178707,1.0
3,0.819356,-0.370523,-0.387054,-0.447314,-0.365626,-0.311465,0.292120,0.324781,0.165780,1.0
4,1.684839,0.096902,0.183135,-0.393756,-0.294379,-0.176363,0.753153,-0.933340,-1.713237,1.0
...,...,...,...,...,...,...,...,...,...,...
578,0.942997,-0.451114,-0.493964,0.862786,-0.332743,-0.262967,-0.537740,-1.939837,-1.807188,1.0
579,-0.293407,-0.434996,-0.493964,-0.793378,-0.250535,-0.273359,-0.445534,0.073157,0.478949,1.0
580,0.448435,-0.402760,-0.458327,-0.187766,-0.179288,-0.211005,-0.076707,0.073157,0.165780,1.0
581,-0.849789,-0.322169,-0.351417,-0.439074,-0.283418,-0.269895,0.292120,0.324781,0.165780,1.0
