## Libraries

In [3]:
# Core Libraries
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from scipy.stats import zscore

# Dimensionality Reduction
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# UMAP (optional - requires `pip install umap-learn`)
import umap

# Clustering
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import linkage, dendrogram

# Anomaly Detection
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor


# Utility / Settings
import warnings

## 1. Data Wrangling

**Objective:** Convert the raw biochemical patient data into a structured feature matrix on which we can perform geometric and statistical modeling.

* Removed patient ID to avoid artificial structure.
* Audited and corrected variable types to prevent parsing errors and preserve numerical precision.
* Assessed missingness to reduce bias in structure estimation:

  * Excluded records with >30% missing values.
  * Applied **median imputation (numerical)** and **most-frequent imputation (categorical)** to preserve distributional robustness.
* Enforced biological plausibility constraints (valid age range and non-negative lab values) to prevent clustering artifacts from erroneous entries.
* Encoded categorical variables for compatibility with numerical algorithms.
* Separated the diagnostic labels (`Category`) from the feature matrix.





In [33]:
# -----------------------------
# Read the raw dataset
# -----------------------------
# The path is relative, so it is resolved against the kernel's working directory.
df = pd.read_csv(filepath_or_buffer="HepatitisCdata.csv")

In [34]:
# -----------------------------
# Remove non-informative columns
# -----------------------------
# "Unnamed: 0" carries no signal for the analysis (the section notes describe
# it as the patient ID), so it is removed before any modeling.
# errors="ignore" keeps the cell re-runnable: without it, executing the cell a
# second time (or loading a CSV without that column) raises a KeyError.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [37]:
# -----------------------------
# Verify and enforce data types
# -----------------------------
# "Sex" and "Category" are handled as categorical; every other column is
# treated as numeric.
categorical_cols = ["Sex", "Category"]
numeric_cols = [name for name in df.columns if name not in categorical_cols]

# Re-parse all numeric columns in one pass. With errors="coerce", entries that
# cannot be parsed as numbers become NaN instead of raising.
df = df.assign(
    **{name: pd.to_numeric(df[name], errors="coerce") for name in numeric_cols}
)


In [40]:
# -----------------------------
# Handle Missing Values
# -----------------------------

# Step 1: keep only the rows whose share of missing cells is at most 30%
row_missing_fraction = df.isna().mean(axis=1)
df = df[row_missing_fraction.le(0.30)].copy()

# Step 2: fill whatever gaps remain
# Numeric columns -> per-column median
num_imputer = SimpleImputer(strategy="median")
df[numeric_cols] = num_imputer.fit_transform(df[numeric_cols])

# "Sex" -> most frequent value ("Category" is not imputed here)
cat_imputer = SimpleImputer(strategy="most_frequent")
df[["Sex"]] = cat_imputer.fit_transform(df[["Sex"]])


In [41]:
# -----------------------------
# Drop biologically impossible records
# -----------------------------

# Age must lie strictly between 0 and 120
if "Age" in df.columns:
    df = df[df["Age"].gt(0) & df["Age"].lt(120)].copy()

# Lab markers present in this frame; each of them must be non-negative
lab_markers = [
    marker
    for marker in ["ALB", "ALP", "ALT", "AST", "BIL", "CHE", "CHOL", "CREA", "GGT", "PROT"]
    if marker in df.columns
]

# One row mask across all markers instead of re-filtering the frame per column:
# a row survives only if every lab marker is >= 0.
df = df[df[lab_markers].ge(0).all(axis=1)].copy()


In [47]:
# -----------------------------
# Encode categorical variables
# -----------------------------
# "Sex" is encoded numerically: m -> 0, f -> 1.
sex_codes = {"m": 0, "f": 1}
if "Sex" in df.columns:
    # Re-run guard: if the column already holds only the numeric codes, mapping
    # it through the dict again would silently turn every value into NaN.
    already_encoded = df["Sex"].dropna().isin(list(sex_codes.values())).all()
    if not already_encoded:
        sex_encoded = df["Sex"].map(sex_codes)
        # Series.map yields NaN for labels missing from the dict; surface those
        # instead of letting them flow into the feature matrix unnoticed.
        unmapped = df.loc[sex_encoded.isna() & df["Sex"].notna(), "Sex"].unique()
        if len(unmapped) > 0:
            raise ValueError(f"Unexpected values in 'Sex': {unmapped.tolist()}")
        df["Sex"] = sex_encoded

# -----------------------------
# Separate features and labels
# -----------------------------
# X: every column except the diagnostic label; y: the label itself.
X = df.drop(columns=["Category"])
y = df["Category"]   # Not used in unsupervised training


In [44]:
# -----------------------------
# Standardize features
# -----------------------------
scaler = StandardScaler()

# fit_transform returns a bare array, so wrap it back into a DataFrame.
# index=X.index is required: earlier cells filter rows without resetting the
# index, and a fresh RangeIndex would no longer line up with `y` / `df` when
# results are later joined or masked by label.
X_scaled = pd.DataFrame(
    scaler.fit_transform(X),
    columns=X.columns,
    index=X.index,
)

# -----------------------------
# Final check
# -----------------------------
print("Final dataset shape:", X_scaled.shape)
print("Remaining missing values:", X_scaled.isna().sum().sum())
df.head()

Final dataset shape: (615, 12)
Remaining missing values: 0


Unnamed: 0,Category,Age,Sex,ALB,ALP,ALT,AST,BIL,CHE,CHOL,CREA,GGT,PROT
0,0=Blood Donor,32.0,0,38.5,52.5,7.7,22.1,7.5,6.93,3.23,106.0,12.1,69.0
1,0=Blood Donor,32.0,0,38.5,70.3,18.0,24.7,3.9,11.17,4.8,74.0,15.6,76.5
2,0=Blood Donor,32.0,0,46.9,74.7,36.2,52.6,6.1,8.84,5.2,86.0,33.2,79.3
3,0=Blood Donor,32.0,0,43.2,52.0,30.6,22.6,18.9,7.33,4.74,80.0,33.8,75.7
4,0=Blood Donor,32.0,0,39.2,74.1,32.6,24.8,9.6,9.15,4.32,76.0,29.9,68.7


## 2. Exploratory Data Analysis (EDA)

