In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("indian_liver_patient.csv")
df[df["Dataset"]==1].head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [3]:
df[df["Dataset"] == 2].head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
8,17,Male,0.9,0.3,202,22,19,7.4,4.1,1.2,2
12,64,Male,0.9,0.3,310,61,58,7.0,3.4,0.9,2
15,25,Male,0.6,0.1,183,91,53,5.5,2.3,0.7,2
17,33,Male,1.6,0.5,165,15,23,7.3,3.5,0.92,2
24,63,Male,0.9,0.2,194,52,45,6.0,3.9,1.85,2


In [4]:
df["Dataset"] = df["Dataset"].map({1:1, 2:0})

In [5]:
df.drop_duplicates(inplace=True)

In [6]:
df["Dataset"].value_counts()

Dataset
1    406
0    164
Name: count, dtype: int64

# Data Understanding #

The normal range of Total Bilirubin in adults is typically:
 * Total Bilirubin: 0.3 to 1.2 mg/dL (milligrams per deciliter)

In [7]:
total_bilirubin = df[(df["Total_Bilirubin"]<0.3) | (df["Total_Bilirubin"]>1.2)]
total_bilirubin

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.40,1
5,46,Male,1.8,0.7,208,19,14,7.6,4.4,1.30,1
11,72,Male,2.7,1.3,260,31,56,7.4,3.0,0.60,1
...,...,...,...,...,...,...,...,...,...,...,...
574,32,Male,12.1,6.0,515,48,92,6.6,2.4,0.50,1
575,32,Male,25.0,13.7,560,41,88,7.9,2.5,2.50,1
576,32,Male,15.0,8.2,289,58,80,5.3,2.2,0.70,1
577,32,Male,12.7,8.4,190,28,47,5.4,2.6,0.90,1


<p>
    Above 1.2 mg/dL is considered elevated. [Mostly an Alarm]
</p>

In [8]:
total_bilirubin["Dataset"].value_counts()

Dataset
1    212
0     33
Name: count, dtype: int64

In [9]:
tb_negative = total_bilirubin[total_bilirubin["Dataset"]==0]

One cannot say these 35 rows are mislabeled just because Total_Bilirubin is abnormal.

Elevated bilirubin ≠ guaranteed liver disease.

Labels may be correct due to clinical context or other causes of elevation.

In [10]:
direct_bilirubin = df[(df["Direct_Bilirubin"] < 0.1) | (df["Direct_Bilirubin"] > 0.3)]
direct_bilirubin["Dataset"].value_counts()

Dataset
1    228
0     40
Name: count, dtype: int64

In [11]:
db_negative = direct_bilirubin[direct_bilirubin["Dataset"]==0]

In [12]:
intersection_tb_db = pd.merge(tb_negative, db_negative, how="inner")
intersection_tb_db

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,33,Male,1.6,0.5,165,15,23,7.3,3.5,0.92,0
1,38,Female,2.6,1.2,410,59,57,5.6,3.0,0.8,0
2,42,Male,6.8,3.2,630,25,47,6.1,2.3,0.6,0
3,35,Male,1.8,0.6,275,48,178,6.5,3.2,0.9,0
4,70,Male,1.4,0.6,146,12,24,6.2,3.8,1.58,0
5,36,Male,5.3,2.3,145,32,92,5.1,2.6,1.0,0
6,50,Male,5.8,3.0,661,181,285,5.7,2.3,0.67,0
7,50,Male,7.3,3.6,1580,88,64,5.6,2.3,0.6,0
8,58,Male,1.7,0.8,188,60,84,5.9,3.5,1.4,0
9,60,Male,1.8,0.5,201,45,25,3.9,1.7,0.7,0


In [13]:
intersection_tb_db.shape

(33, 11)

These 33 records are consistently abnormal in both features but still labeled 0 → likely borderline or subclinical cases, not mislabels.

Since both indicators flag the same cases, they're likely highly correlated (as expected physiologically).

In [14]:
alp_abnormal = df[(df["Alkaline_Phosphotase"] < 40) | (df["Alkaline_Phosphotase"] > 130)]
alp_abnormal["Dataset"].value_counts()

Dataset
1    387
0    157
Name: count, dtype: int64

In [15]:
alp_negative = alp_abnormal[alp_abnormal["Dataset"]==0]

In [16]:
intersection_tb_db_apl = pd.merge(alp_negative, intersection_tb_db, how="inner")

In [17]:
intersection_tb_db_apl

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,33,Male,1.6,0.5,165,15,23,7.3,3.5,0.92,0
1,38,Female,2.6,1.2,410,59,57,5.6,3.0,0.8,0
2,42,Male,6.8,3.2,630,25,47,6.1,2.3,0.6,0
3,35,Male,1.8,0.6,275,48,178,6.5,3.2,0.9,0
4,70,Male,1.4,0.6,146,12,24,6.2,3.8,1.58,0
5,36,Male,5.3,2.3,145,32,92,5.1,2.6,1.0,0
6,50,Male,5.8,3.0,661,181,285,5.7,2.3,0.67,0
7,50,Male,7.3,3.6,1580,88,64,5.6,2.3,0.6,0
8,58,Male,1.7,0.8,188,60,84,5.9,3.5,1.4,0
9,60,Male,1.8,0.5,201,45,25,3.9,1.7,0.7,0


There are 31 non-diseased records with abnormal Total_Bilirubin, Direct_Bilirubin, and Alkaline_Phosphotase.

These three features together suggest significant liver-related abnormality, yet they're still labeled as 0 (non-diseased).

This strengthens the case that:

These 30 rows are either edge cases, label noise, or reflect undiagnosed/subclinical liver issues.

Statistically, these are outliers among the “healthy” class and will likely confuse a classifier unless handled carefully.

In [18]:
alt_abnormal = df[(df["Alamine_Aminotransferase"] < 7) | (df["Alamine_Aminotransferase"] > 56)]
alt_abnormal["Dataset"].value_counts()

Dataset
1    139
0     18
Name: count, dtype: int64

In [19]:
alt_abnormal_negative = alt_abnormal[alt_abnormal["Dataset"]==0]

In [25]:
intersection_tb_db_apl_alt = pd.merge(alp_negative, intersection_tb_db_apl, how="inner")

In [26]:
intersection_tb_db_apl_alt.shape

(31, 11)

In [27]:
intersection_tb_db_apl_alt

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,33,Male,1.6,0.5,165,15,23,7.3,3.5,0.92,0
1,38,Female,2.6,1.2,410,59,57,5.6,3.0,0.8,0
2,42,Male,6.8,3.2,630,25,47,6.1,2.3,0.6,0
3,35,Male,1.8,0.6,275,48,178,6.5,3.2,0.9,0
4,70,Male,1.4,0.6,146,12,24,6.2,3.8,1.58,0
5,36,Male,5.3,2.3,145,32,92,5.1,2.6,1.0,0
6,50,Male,5.8,3.0,661,181,285,5.7,2.3,0.67,0
7,50,Male,7.3,3.6,1580,88,64,5.6,2.3,0.6,0
8,58,Male,1.7,0.8,188,60,84,5.9,3.5,1.4,0
9,60,Male,1.8,0.5,201,45,25,3.9,1.7,0.7,0


In [28]:
pd.merge(alt_abnormal_negative, intersection_tb_db_apl, how="left")

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,64,Male,0.9,0.3,310,61,58,7.0,3.4,0.9,0
1,25,Male,0.6,0.1,183,91,53,5.5,2.3,0.7,0
2,38,Female,2.6,1.2,410,59,57,5.6,3.0,0.8,0
3,27,Male,1.2,0.4,179,63,39,6.1,3.3,1.1,0
4,50,Male,5.8,3.0,661,181,285,5.7,2.3,0.67,0
5,50,Male,7.3,3.6,1580,88,64,5.6,2.3,0.6,0
6,58,Male,1.7,0.8,188,60,84,5.9,3.5,1.4,0
7,38,Male,1.5,0.4,298,60,103,6.0,3.0,1.0,0
8,22,Male,2.7,1.0,160,82,127,5.5,3.1,1.2,0
9,61,Male,1.5,0.6,196,61,85,6.7,3.8,1.3,0


In [66]:
pd.merge(alt_abnormal_negative, intersection_tb_db_apl, how="right")

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,33,Male,1.6,0.5,165,15,23,7.3,3.5,0.92,0
1,38,Female,2.6,1.2,410,59,57,5.6,3.0,0.8,0
2,42,Male,6.8,3.2,630,25,47,6.1,2.3,0.6,0
3,35,Male,1.8,0.6,275,48,178,6.5,3.2,0.9,0
4,70,Male,1.4,0.6,146,12,24,6.2,3.8,1.58,0
5,36,Male,5.3,2.3,145,32,92,5.1,2.6,1.0,0
6,50,Male,5.8,3.0,661,181,285,5.7,2.3,0.67,0
7,50,Male,7.3,3.6,1580,88,64,5.6,2.3,0.6,0
8,58,Male,1.7,0.8,188,60,84,5.9,3.5,1.4,0
9,60,Male,1.8,0.5,201,45,25,3.9,1.7,0.7,0


In [29]:
pd.merge(alt_abnormal_negative, intersection_tb_db_apl, how="inner")

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,38,Female,2.6,1.2,410,59,57,5.6,3.0,0.8,0
1,50,Male,5.8,3.0,661,181,285,5.7,2.3,0.67,0
2,50,Male,7.3,3.6,1580,88,64,5.6,2.3,0.6,0
3,58,Male,1.7,0.8,188,60,84,5.9,3.5,1.4,0
4,38,Male,1.5,0.4,298,60,103,6.0,3.0,1.0,0
5,22,Male,2.7,1.0,160,82,127,5.5,3.1,1.2,0
6,61,Male,1.5,0.6,196,61,85,6.7,3.8,1.3,0
7,38,Male,2.2,1.0,310,119,42,7.9,4.1,1.0,0
8,39,Male,1.6,0.8,230,88,74,8.0,4.0,1.0,0


In [68]:
pd.merge(alt_abnormal_negative, intersection_tb_db_apl, how="outer")

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,4,Male,0.8,0.2,460,152,231,6.5,3.2,0.9,0
1,18,Male,1.3,0.7,316,10,21,6.0,2.1,0.5,0
2,19,Male,1.4,0.8,178,13,26,8.0,4.6,1.3,0
3,22,Male,0.8,0.2,300,57,40,7.9,3.8,0.9,0
4,22,Male,2.7,1.0,160,82,127,5.5,3.1,1.2,0
5,23,Female,2.3,0.8,509,28,44,6.9,2.9,0.7,0
6,24,Male,3.3,1.6,174,11,33,7.6,3.9,1.0,0
7,25,Male,0.6,0.1,183,91,53,5.5,2.3,0.7,0
8,26,Male,1.9,0.8,180,22,19,8.2,4.1,1.0,0
9,27,Male,1.2,0.4,179,63,39,6.1,3.3,1.1,0


Interpretation:
You now have a solid cohort of non-diseased records that violate multiple independent liver function indicators.

This isn't explainable by natural variation alone — they're either:

Subclinical cases (not labeled yet)

Mislabels

A limitation of the diagnostic label itself

Even if medically correct, these rows are statistical anomalies in the Dataset = 0 class — and they will likely reduce your classifier’s confidence.

In [31]:
# Use all columns to match, assuming columns match exactly
df_clean = df.merge(intersection_tb_db_apl, how='outer', indicator=True)
df_clean = df_clean[df_clean['_merge'] == 'left_only'].drop(columns=['_merge'])

In [33]:
df_clean.shape

(539, 11)

In [34]:
df_clean["Dataset"].value_counts()

Dataset
1    406
0    133
Name: count, dtype: int64

In [36]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# 1. Split

df_clean["Gender"] = df_clean["Gender"].map({"Male":1, "Female":0})

X = df_clean.drop(columns=["Dataset"])
y = df_clean["Dataset"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

# 2. Train with class weights
clf = RandomForestClassifier(class_weight='balanced', random_state=42)
clf.fit(X_train, y_train)

# 3. Predict & Evaluate
y_pred = clf.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=3))


[[ 8 19]
 [ 9 72]]
              precision    recall  f1-score   support

           0      0.471     0.296     0.364        27
           1      0.791     0.889     0.837        81

    accuracy                          0.741       108
   macro avg      0.631     0.593     0.600       108
weighted avg      0.711     0.741     0.719       108



In [38]:
df_clean.isnull().sum()

Age                           0
Gender                        0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    4
Dataset                       0
dtype: int64

In [49]:
df_clean[df_clean["Albumin_and_Globulin_Ratio"].isnull()]

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
88,27,1,1.3,0.6,106,25,54,8.5,4.8,,0
162,35,0,0.6,0.2,180,12,15,5.2,2.7,,0
270,45,0,0.9,0.3,189,23,33,6.6,3.9,,1
369,51,1,0.8,0.2,230,24,46,6.5,3.1,,1


In [50]:
# Fill NaN in Albumin_and_Globulin_Ratio with class-wise median
df_clean["Albumin_and_Globulin_Ratio"] = df_clean.groupby("Dataset")["Albumin_and_Globulin_Ratio"]\
    .transform(lambda x: x.fillna(x.median()))

In [51]:
df_clean.isnull().sum()

Age                           0
Gender                        0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    0
Dataset                       0
dtype: int64

In [52]:
from sklearn.model_selection import train_test_split

X = df_clean.drop(columns=["Dataset"])
y = df_clean["Dataset"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

In [53]:
from sklearn.model_selection import train_test_split

X = df_clean.drop(columns=["Dataset"])
y = df_clean["Dataset"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("Before SMOTE:", X_train.shape, y_train.value_counts().to_dict())
print("After SMOTE:", X_train_resampled.shape, y_train_resampled.value_counts().to_dict())

Before SMOTE: (431, 10) {1: 325, 0: 106}
After SMOTE: (650, 10) {1: 325, 0: 325}


In [54]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_resampled, y_train_resampled)

y_pred = clf.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=3))

[[18  9]
 [16 65]]
              precision    recall  f1-score   support

           0      0.529     0.667     0.590        27
           1      0.878     0.802     0.839        81

    accuracy                          0.769       108
   macro avg      0.704     0.735     0.714       108
weighted avg      0.791     0.769     0.777       108



In [55]:
from imblearn.combine import SMOTETomek

smote_tomek = SMOTETomek(random_state=42)
X_resampled, y_resampled = smote_tomek.fit_resample(X_train, y_train)

In [56]:
clf = RandomForestClassifier(random_state=42)
clf.fit(X_resampled, y_resampled)

y_pred = clf.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=3))

[[15 12]
 [19 62]]
              precision    recall  f1-score   support

           0      0.441     0.556     0.492        27
           1      0.838     0.765     0.800        81

    accuracy                          0.713       108
   macro avg      0.640     0.660     0.646       108
weighted avg      0.739     0.713     0.723       108



In [57]:
from xgboost import XGBClassifier

clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
clf.fit(X_train_resampled, y_train_resampled)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [58]:
y_pred = clf.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=3))

[[13 14]
 [16 65]]
              precision    recall  f1-score   support

           0      0.448     0.481     0.464        27
           1      0.823     0.802     0.812        81

    accuracy                          0.722       108
   macro avg      0.636     0.642     0.638       108
weighted avg      0.729     0.722     0.725       108



In [59]:
from xgboost import XGBClassifier

clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
clf.fit(X_resampled, y_resampled)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [60]:
y_pred = clf.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=3))

[[14 13]
 [17 64]]
              precision    recall  f1-score   support

           0      0.452     0.519     0.483        27
           1      0.831     0.790     0.810        81

    accuracy                          0.722       108
   macro avg      0.641     0.654     0.646       108
weighted avg      0.736     0.722     0.728       108



In [61]:
from imblearn.combine import SMOTEENN

In [62]:
# Apply SMOTEENN to training data
from collections import Counter
print("Before SMOTEENN:", Counter(y_train))

smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)

print("After SMOTEENN:", Counter(y_resampled))

Before SMOTEENN: Counter({1: 325, 0: 106})
After SMOTEENN: Counter({0: 235, 1: 189})


In [63]:
# Train model
clf = RandomForestClassifier(random_state=42)  # or XGBClassifier
clf.fit(X_resampled, y_resampled)

# Predict on test set
y_pred = clf.predict(X_test)

# Evaluate
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=3))

[[22  5]
 [28 53]]
              precision    recall  f1-score   support

           0      0.440     0.815     0.571        27
           1      0.914     0.654     0.763        81

    accuracy                          0.694       108
   macro avg      0.677     0.735     0.667       108
weighted avg      0.795     0.694     0.715       108



# Modeling #

In [64]:
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("Before SMOTE:", X_train.shape, y_train.value_counts().to_dict())
print("After SMOTE:", X_train_resampled.shape, y_train_resampled.value_counts().to_dict())

Before SMOTE: (431, 10) {1: 325, 0: 106}
After SMOTE: (650, 10) {1: 325, 0: 325}


In [65]:
from sklearn.model_selection import GridSearchCV

In [66]:
estimator = RandomForestClassifier()

In [72]:
param_grid = {
    "n_estimators": [10, 50, 100],
    "max_depth": [None] + list(range(1, 10)),  
    "max_leaf_nodes": [None, 5, 10, 20, 30, 40],  
    "min_samples_split": [2, 3, 4, 5, 6, 7],     
    "min_samples_leaf": [1, 2, 3, 4, 5],         
    "criterion": ["gini", "entropy"],
    "class_weight": [None, "balanced"]
    
}

In [94]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

param_grid = {
    "n_estimators": [25,50,100,200],
    "max_depth": [None,50],
    "min_samples_split": [1,2,3,4],
    "min_samples_leaf": [1,2,3,4],
    "criterion": ["gini", "entropy"],
    "class_weight": ["balanced"]  
}

clf = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    cv=5,
    scoring="recall",  
    n_jobs=-1,
    verbose=2
)

grid_search.fit(X_train_resampled, y_train_resampled)

Fitting 5 folds for each of 256 candidates, totalling 1280 fits


320 fits failed out of a total of 1280.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
320 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\DELL\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\DELL\anaconda3\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "C:\Users\DELL\anaconda3\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\DELL\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameter

In [95]:
grid_search.best_params_

{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': None,
 'min_samples_leaf': 1,
 'min_samples_split': 4,
 'n_estimators': 200}

In [99]:
rf_model = RandomForestClassifier(
#     class_weight="balanced",
#     criterion="gini",
#     max_depth=None,
#     min_samples_leaf=1,
#     min_samples_split=4,
#     n_estimators=200
# 
)

In [100]:
rf_model.fit(X_train_resampled, y_train_resampled)

In [101]:
# Predict on test set
y_pred = rf_model.predict(X_test)

# Evaluate
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=3))

[[18  9]
 [19 62]]
              precision    recall  f1-score   support

           0      0.486     0.667     0.562        27
           1      0.873     0.765     0.816        81

    accuracy                          0.741       108
   macro avg      0.680     0.716     0.689       108
weighted avg      0.777     0.741     0.752       108



In [102]:
from sklearn.ensemble import AdaBoostClassifier

In [103]:
ada_model = AdaBoostClassifier()

In [104]:
ada_model.fit(X_train_resampled, y_train_resampled)



In [105]:
# Predict on test set
y_pred = ada_model.predict(X_test)

# Evaluate
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=3))

[[20  7]
 [22 59]]
              precision    recall  f1-score   support

           0      0.476     0.741     0.580        27
           1      0.894     0.728     0.803        81

    accuracy                          0.731       108
   macro avg      0.685     0.735     0.691       108
weighted avg      0.790     0.731     0.747       108



In [118]:
import warnings

warnings.filterwarnings("ignore")

estimator = AdaBoostClassifier()

In [138]:
param_grid = {
    "n_estimators": [25,50,100,150,200,250,300,350,400],
    "learning_rate": [1.0,0.9, 0.5, 0.3, 0.1, 0.01, 0.001],
    "algorithm": ['SAMME', 'SAMME.R']
}

In [139]:
grid_ada = GridSearchCV(
    estimator,
    param_grid,
    cv=5,
    scoring="recall"
)
grid_ada.fit(X_train_resampled, y_train_resampled)

In [140]:
grid_ada.fit(X_train_resampled, y_train_resampled)

In [141]:
grid_ada.best_params_

{'algorithm': 'SAMME.R', 'learning_rate': 1.0, 'n_estimators': 350}

In [129]:
ada_model = AdaBoostClassifier(
    learning_rate=0.5,
    n_estimators=100
)

ada_model.fit(X_train_resampled, y_train_resampled)
# Predict on test set
y_pred = ada_model.predict(X_test)

# Evaluate
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=3))

[[19  8]
 [20 61]]
              precision    recall  f1-score   support

           0      0.487     0.704     0.576        27
           1      0.884     0.753     0.813        81

    accuracy                          0.741       108
   macro avg      0.686     0.728     0.695       108
weighted avg      0.785     0.741     0.754       108



In [137]:
ada_model = AdaBoostClassifier(
    learning_rate=1.0,
    n_estimators=200
)

ada_model.fit(X_train_resampled, y_train_resampled)
# Predict on test set
y_pred = ada_model.predict(X_test)

# Evaluate
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=3))

[[17 10]
 [16 65]]
              precision    recall  f1-score   support

           0      0.515     0.630     0.567        27
           1      0.867     0.802     0.833        81

    accuracy                          0.759       108
   macro avg      0.691     0.716     0.700       108
weighted avg      0.779     0.759     0.767       108



In [185]:
ada_model = AdaBoostClassifier(
    learning_rate=1.0,
    n_estimators=250
)

ada_model.fit(X_train_resampled, y_train_resampled)
# Predict on test set
y_pred = ada_model.predict(X_test)

# Evaluate
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=3))

[[17 10]
 [15 66]]
              precision    recall  f1-score   support

           0      0.531     0.630     0.576        27
           1      0.868     0.815     0.841        81

    accuracy                          0.769       108
   macro avg      0.700     0.722     0.709       108
weighted avg      0.784     0.769     0.775       108



In [198]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix

gb_model = GradientBoostingClassifier(
    n_estimators=300,
    learning_rate=0.0001,
    max_depth=10,
    subsample=1.0,        # Try 0.8–1.0
    random_state=42
)

gb_model.fit(X_train_resampled, y_train_resampled)
y_pred = gb_model.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=3))

[[16 11]
 [23 58]]
              precision    recall  f1-score   support

           0      0.410     0.593     0.485        27
           1      0.841     0.716     0.773        81

    accuracy                          0.685       108
   macro avg      0.625     0.654     0.629       108
weighted avg      0.733     0.685     0.701       108



In [199]:
from lightgbm import LGBMClassifier
from sklearn.metrics import confusion_matrix, classification_report

lgbm_model = LGBMClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=-1,
    class_weight='balanced',  # handles your class imbalance
    random_state=42
)

lgbm_model.fit(X_train_resampled, y_train_resampled)
y_pred = lgbm_model.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=3))

[LightGBM] [Info] Number of positive: 325, number of negative: 325
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001390 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1239
[LightGBM] [Info] Number of data points in the train set: 650, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[[14 13]
 [17 64]]
              precision    recall  f1-score   support

           0      0.452     0.519     0.483        27
           1      0.831     0.790     0.810        81

    accuracy                          0.722       108
   macro avg      0.641     0.654     0.646       108
weighted avg      0.736     0.722     0.728       108



In [200]:
from sklearn.model_selection import RandomizedSearchCV
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, confusion_matrix

lgbm = LGBMClassifier(class_weight='balanced', random_state=42)

param_dist = {
    'n_estimators': [100, 200, 300, 400],
    'learning_rate': [0.01, 0.03, 0.05, 0.1],
    'max_depth': [-1, 5, 10, 15],
    'num_leaves': [15, 31, 50, 70],
    'min_child_samples': [10, 20, 30],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'reg_alpha': [0, 0.1, 1],
    'reg_lambda': [0, 0.1, 1]
}

search = RandomizedSearchCV(
    estimator=lgbm,
    param_distributions=param_dist,
    n_iter=50,              # try 50 combinations
    cv=5,
    scoring='f1',
    verbose=2,
    random_state=42,
    n_jobs=-1
)

search.fit(X_train_resampled, y_train_resampled)

# Evaluate
y_pred = search.best_estimator_.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=3))


Fitting 5 folds for each of 50 candidates, totalling 250 fits
[LightGBM] [Info] Number of positive: 325, number of negative: 325
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000453 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1239
[LightGBM] [Info] Number of data points in the train set: 650, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[[16 11]
 [16 65]]
              precision    recall  f1-score   support

           0      0.500     0.593     0.542        27
           1      0.855     0.802     0.828        81

    accuracy                          0.750       108
   macro avg      0.678     0.698     0.685       108
weighted avg      0.766     0.750     0.757       108



In [205]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix, classification_report

# Base estimator
base_tree = DecisionTreeClassifier(random_state=42)

# AdaBoost setup
ada = AdaBoostClassifier(
    estimator=base_tree,
    random_state=42
)

# Param grid
param_dist = {
    "n_estimators": [100, 200, 300, 350, 400],
    "learning_rate": [0.01, 0.05, 0.1, 0.5, 1.0],
    "estimator__max_depth": [1, 2, 3, 4],
    "estimator__min_samples_split": [2, 3, 4],
    "estimator__min_samples_leaf": [1, 2, 3]
}

# Randomized search
random_search = RandomizedSearchCV(
    estimator=ada,
    param_distributions=param_dist,
    n_iter=40,
    scoring='f1',
    cv=5,
    random_state=42,
    n_jobs=-1,
    verbose=2
)

# Fit
random_search.fit(X_train_resampled, y_train_resampled)

# Predict
y_pred = random_search.best_estimator_.predict(X_test)

# Evaluate
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=3))

Fitting 5 folds for each of 40 candidates, totalling 200 fits
[[13 14]
 [19 62]]
              precision    recall  f1-score   support

           0      0.406     0.481     0.441        27
           1      0.816     0.765     0.790        81

    accuracy                          0.694       108
   macro avg      0.611     0.623     0.615       108
weighted avg      0.713     0.694     0.703       108



In [206]:
! pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-win_amd64.whl.metadata (1.5 kB)
Downloading catboost-1.2.8-cp312-cp312-win_amd64.whl (102.4 MB)
   ---------------------------------------- 0.0/102.4 MB ? eta -:--:--
    --------------------------------------- 1.6/102.4 MB 7.0 MB/s eta 0:00:15
   - -------------------------------------- 3.1/102.4 MB 7.1 MB/s eta 0:00:14
   - -------------------------------------- 4.7/102.4 MB 7.3 MB/s eta 0:00:14
   -- ------------------------------------- 6.3/102.4 MB 7.4 MB/s eta 0:00:13
   --- ------------------------------------ 8.1/102.4 MB 7.6 MB/s eta 0:00:13
   --- ------------------------------------ 9.7/102.4 MB 7.6 MB/s eta 0:00:13
   ---- ----------------------------------- 11.3/102.4 MB 7.7 MB/s eta 0:00:12
   ----- ---------------------------------- 13.1/102.4 MB 7.7 MB/s eta 0:00:12
   ----- ---------------------------------- 14.9/102.4 MB 7.8 MB/s eta 0:00:12
   ------ --------------------------------- 16.8/102.4 MB 7.8 MB/s 

In [207]:
from catboost import CatBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix

# Define base model
cat_model = CatBoostClassifier(verbose=0, random_state=42, auto_class_weights='Balanced')

# Hyperparameter space
param_dist = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'depth': [3, 4, 5, 6, 7, 8, 9, 10],
    'l2_leaf_reg': [1, 3, 5, 7, 9],
    'iterations': [100, 200, 300, 400, 500],
    'border_count': [32, 64, 128]
}

# RandomizedSearchCV
cat_random_search = RandomizedSearchCV(
    estimator=cat_model,
    param_distributions=param_dist,
    n_iter=40,
    cv=5,
    scoring='f1_macro',
    verbose=1,
    n_jobs=-1,
    random_state=42
)

# Fit on resampled data
cat_random_search.fit(X_train_resampled, y_train_resampled)

# Predict
y_pred = cat_random_search.predict(X_test)

# Evaluate
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=3))


Fitting 5 folds for each of 40 candidates, totalling 200 fits
[[15 12]
 [14 67]]
              precision    recall  f1-score   support

           0      0.517     0.556     0.536        27
           1      0.848     0.827     0.838        81

    accuracy                          0.759       108
   macro avg      0.683     0.691     0.687       108
weighted avg      0.765     0.759     0.762       108



In [212]:
df_clean.to_csv("cleaned_indian_liver_patient.csv")

In [213]:
df_clean.corr()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
Age,1.0,0.061681,0.012122,0.007463,0.086371,-0.087092,-0.017076,-0.183756,-0.270956,-0.231292,0.146078
Gender,0.061681,1.0,0.093465,0.10523,-0.022854,0.087854,0.084128,-0.086178,-0.095699,-0.011447,0.113361
Total_Bilirubin,0.012122,0.093465,1.0,0.873536,0.202092,0.215751,0.237226,-0.007908,-0.226818,-0.207601,0.228117
Direct_Bilirubin,0.007463,0.10523,0.873536,1.0,0.230041,0.235719,0.257064,-0.000575,-0.233935,-0.200977,0.256257
Alkaline_Phosphotase,0.086371,-0.022854,0.202092,0.230041,1.0,0.126678,0.170731,-0.024231,-0.159671,-0.225237,0.210833
Alamine_Aminotransferase,-0.087092,0.087854,0.215751,0.235719,0.126678,1.0,0.791672,-0.037945,-0.028795,-0.005609,0.157107
Aspartate_Aminotransferase,-0.017076,0.084128,0.237226,0.257064,0.170731,0.791672,1.0,-0.023033,-0.086764,-0.073642,0.147399
Total_Protiens,-0.183756,-0.086178,-0.007908,-0.000575,-0.024231,-0.037945,-0.023033,1.0,0.780198,0.230509,-0.063078
Albumin,-0.270956,-0.095699,-0.226818,-0.233935,-0.159671,-0.028795,-0.086764,0.780198,1.0,0.683139,-0.192817
Albumin_and_Globulin_Ratio,-0.231292,-0.011447,-0.207601,-0.200977,-0.225237,-0.005609,-0.073642,0.230509,0.683139,1.0,-0.187862


In [215]:
df_clean.skew()

Age                           -0.046989
Gender                        -1.143669
Total_Bilirubin                4.745263
Direct_Bilirubin               3.101217
Alkaline_Phosphotase           3.729986
Alamine_Aminotransferase       6.521044
Aspartate_Aminotransferase    10.298085
Total_Protiens                -0.327613
Albumin                       -0.057613
Albumin_and_Globulin_Ratio     1.019298
Dataset                       -1.178108
dtype: float64

In [227]:
df_clean.pivot_table(columns=df["Gender"], index=df["Dataset"])

Unnamed: 0_level_0,Age,Age,Alamine_Aminotransferase,Alamine_Aminotransferase,Albumin,Albumin,Albumin_and_Globulin_Ratio,Albumin_and_Globulin_Ratio,Alkaline_Phosphotase,Alkaline_Phosphotase,...,Dataset,Dataset,Direct_Bilirubin,Direct_Bilirubin,Gender,Gender,Total_Bilirubin,Total_Bilirubin,Total_Protiens,Total_Protiens
Gender,Female,Male,Female,Male,Female,Male,Female,Male,Female,Male,...,Female,Male,Female,Male,Female,Male,Female,Male,Female,Male
Dataset,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0.0,49.304348,45.149533,84.956522,67.130841,3.15,3.183178,0.948478,0.93972,321.326087,286.570093,...,0.826087,0.738318,2.08913,0.947664,0.847826,0.738318,4.221739,2.88785,6.513043,6.605607
1.0,45.647059,44.577855,62.094118,90.692042,3.118824,3.13045,0.910588,0.954256,315.023529,284.723183,...,0.8,0.733564,1.223529,1.708304,0.823529,0.712803,2.587059,3.620069,6.46,6.469896


In [228]:
df_clean.pivot_table(
    aggfunc="count",
    values="Gender",
    index=df["Dataset"]
)

Unnamed: 0_level_0,Gender
Dataset,Unnamed: 1_level_1
0.0,153
1.0,374


In [236]:
df_clean.pivot_table(
    aggfunc="skew",
    values=["Total_Bilirubin", "Direct_Bilirubin", "Albumin", "Albumin_and_Globulin_Ratio", "Direct_Bilirubin", "Total_Protiens"],
    index=["Dataset"]
)

Unnamed: 0_level_0,Albumin,Albumin_and_Globulin_Ratio,Direct_Bilirubin,Direct_Bilirubin,Total_Bilirubin,Total_Protiens
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,-0.390814,0.559261,5.359686,5.359686,4.376309,-0.29352
1,0.042592,1.250416,2.608527,2.608527,4.131408,-0.326922


In [237]:
df_clean.pivot_table(
    aggfunc="skew",
    values=df_clean.columns,
    index=["Dataset"]
)

Unnamed: 0_level_0,Age,Alamine_Aminotransferase,Albumin,Albumin_and_Globulin_Ratio,Alkaline_Phosphotase,Aspartate_Aminotransferase,Direct_Bilirubin,Gender,Total_Bilirubin,Total_Protiens
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0.094366,3.75657,-0.390814,0.559261,2.451766,3.814981,5.359686,-0.691134,4.376309,-0.29352
1,-0.0397,5.676256,0.042592,1.250416,3.302568,9.056589,2.608527,-1.327948,4.131408,-0.326922


In [238]:
df_clean["AST_ALT_ratio"] = df_clean["Aspartate_Aminotransferase"] / df_clean["Alamine_Aminotransferase"]
df_clean["Bilirubin_ratio"] = df_clean["Direct_Bilirubin"] / df_clean["Total_Bilirubin"]
df_clean["Protein_gap"] = df_clean["Total_Protiens"] - df_clean["Albumin"]

In [239]:
df_clean.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset,AST_ALT_ratio,Bilirubin_ratio,Protein_gap
0,4,1,0.8,0.2,460,152,231,6.5,3.2,0.9,0,1.519737,0.25,3.3
1,4,1,0.9,0.2,348,30,34,8.0,4.0,1.0,0,1.133333,0.222222,4.0
2,6,1,0.6,0.1,289,38,30,4.8,2.0,0.7,0,0.789474,0.166667,2.8
3,7,0,27.2,11.8,1420,790,1050,6.1,2.0,0.4,1,1.329114,0.433824,4.1
4,7,1,0.5,0.1,352,28,51,7.9,4.2,1.1,0,1.821429,0.2,3.7


In [240]:
df_clean[["AST_ALT_ratio", "Bilirubin_ratio", "Protein_gap", "Dataset"]].corr()

Unnamed: 0,AST_ALT_ratio,Bilirubin_ratio,Protein_gap,Dataset
AST_ALT_ratio,1.0,0.128556,0.164271,0.09842
Bilirubin_ratio,0.128556,1.0,0.113201,0.188462
Protein_gap,0.164271,0.113201,1.0,0.125855
Dataset,0.09842,0.188462,0.125855,1.0


In [241]:
X = df_clean.drop("Dataset", axis=1)
y = df_clean["Dataset"]

In [242]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


In [243]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)


In [244]:
ada_model = AdaBoostClassifier(
    learning_rate=1.0,
    n_estimators=250
)

ada_model.fit(X_train_res, y_train_res)
# Predict on test set
y_pred = ada_model.predict(X_test)

# Evaluate
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=3))

[[15 12]
 [19 62]]
              precision    recall  f1-score   support

           0      0.441     0.556     0.492        27
           1      0.838     0.765     0.800        81

    accuracy                          0.713       108
   macro avg      0.640     0.660     0.646       108
weighted avg      0.739     0.713     0.723       108



In [245]:
model = LGBMClassifier(random_state=42)
model.fit(X_train_res, y_train_res)
y_pred = model.predict(X_test)


[LightGBM] [Info] Number of positive: 325, number of negative: 325
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000770 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1761
[LightGBM] [Info] Number of data points in the train set: 650, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [246]:
# Evaluate
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=3))

[[14 13]
 [16 65]]
              precision    recall  f1-score   support

           0      0.467     0.519     0.491        27
           1      0.833     0.802     0.818        81

    accuracy                          0.731       108
   macro avg      0.650     0.660     0.654       108
weighted avg      0.742     0.731     0.736       108



In [247]:
from sklearn.feature_selection import SelectFromModel
from lightgbm import LGBMClassifier

lgbm = LGBMClassifier(random_state=42)
lgbm.fit(X_train_res, y_train_res)

sfm = SelectFromModel(lgbm, threshold='mean')  # or use threshold='median'
X_train_sel = sfm.transform(X_train_res)
X_test_sel = sfm.transform(X_test)

# Retrain
lgbm.fit(X_train_sel, y_train_res)
y_pred = lgbm.predict(X_test_sel)

[LightGBM] [Info] Number of positive: 325, number of negative: 325
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000456 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1761
[LightGBM] [Info] Number of data points in the train set: 650, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


ValueError: X has 6 features, but LGBMClassifier is expecting 13 features as input.

In [248]:
# Evaluate
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=3))

[[14 13]
 [16 65]]
              precision    recall  f1-score   support

           0      0.467     0.519     0.491        27
           1      0.833     0.802     0.818        81

    accuracy                          0.731       108
   macro avg      0.650     0.660     0.654       108
weighted avg      0.742     0.731     0.736       108



In [249]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Step 1: Drop engineered features
df_model = df_clean.drop(columns=['AST_ALT_ratio', 'Bilirubin_ratio', 'Protein_gap'], errors='ignore')

# Step 2: Split features and target
X = df_model.drop('Dataset', axis=1)
y = df_model['Dataset']

# Step 3: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Step 4: Apply SMOTE to balance training data
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

# Step 5: Train AdaBoost
model = AdaBoostClassifier(learning_rate=1, n_estimators=250, random_state=42)
model.fit(X_train_sm, y_train_sm)

# Step 6: Evaluate
y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=3))


[[17 10]
 [15 66]]
              precision    recall  f1-score   support

           0      0.531     0.630     0.576        27
           1      0.868     0.815     0.841        81

    accuracy                          0.769       108
   macro avg      0.700     0.722     0.709       108
weighted avg      0.784     0.769     0.775       108



In [251]:
# 2. Extract importances into a DataFrame
importances = pd.Series(model.feature_importances_, index=X_train_sm.columns)
importances.sort_values(ascending=False)

Total_Bilirubin               0.164
Albumin_and_Globulin_Ratio    0.148
Alkaline_Phosphotase          0.124
Age                           0.120
Albumin                       0.108
Alamine_Aminotransferase      0.096
Direct_Bilirubin              0.080
Aspartate_Aminotransferase    0.080
Total_Protiens                0.080
Gender                        0.000
dtype: float64

In [252]:
import numpy as np

importances = {
    "Total_Bilirubin":               0.164,
    "Albumin_and_Globulin_Ratio":    0.148,
    "Alkaline_Phosphotase":          0.124,
    "Age":                           0.120,
    "Albumin":                       0.108,
    "Alamine_Aminotransferase":      0.096,
    "Direct_Bilirubin":              0.080,
    "Aspartate_Aminotransferase":    0.080,
    "Total_Protiens":                0.080,
    "Gender":                        0.000
}

# 1. Compute mean importance
threshold = np.mean(list(importances.values()))
print(f"Mean importance threshold = {threshold:.3f}")


Mean importance threshold = 0.100


In [253]:
selected = [
    "Total_Bilirubin",
    "Albumin_and_Globulin_Ratio",
    "Alkaline_Phosphotase",
    "Age",
    "Albumin"
]

X_train_sel = X_train_sm[selected]
X_test_sel  = X_test[selected]

In [254]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Initialize and train
ada_sel = AdaBoostClassifier(learning_rate=1, n_estimators=250, random_state=42)
ada_sel.fit(X_train_sel, y_train_sm)

# Predict and evaluate
y_pred_sel = ada_sel.predict(X_test_sel)
print(confusion_matrix(y_test, y_pred_sel))
print(classification_report(y_test, y_pred_sel, digits=3))


[[17 10]
 [16 65]]
              precision    recall  f1-score   support

           0      0.515     0.630     0.567        27
           1      0.867     0.802     0.833        81

    accuracy                          0.759       108
   macro avg      0.691     0.716     0.700       108
weighted avg      0.779     0.759     0.767       108

