<a href="https://www.kaggle.com/code/emmanuelniyioriolowo/2-decision-tree-rldc?scriptVersionId=285432144" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Imports

In [1]:
!pip install scikit-learn==1.4.2 imbalanced-learn==0.12.2
!pip install --upgrade ctgan

Collecting scikit-learn==1.4.2
  Downloading scikit_learn-1.4.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting imbalanced-learn==0.12.2
  Downloading imbalanced_learn-0.12.2-py3-none-any.whl.metadata (8.2 kB)
Downloading scikit_learn-1.4.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m93.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m:01[0m
[?25hDownloading imbalanced_learn-0.12.2-py3-none-any.whl (257 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.0/258.0 kB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn, imbalanced-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
  Attempting uninstall: imbalanced-learn
    Found existing installation: imbalanc

In [2]:
# Imports and Environment Setup

import os
import torch
import random
import numpy as np
import pandas as pd

from collections import Counter
from ctgan import CTGAN
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import matthews_corrcoef, make_scorer

# Display available files in the Kaggle input directory
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/indian-liver-patient-ilp-dataset/Indian Liver Patient Dataset (ILPD).csv


# Data Loading and Cleaning

In [3]:
# Load ILPD dataset
path = "/kaggle/input/indian-liver-patient-ilp-dataset/Indian Liver Patient Dataset (ILPD).csv"
data = pd.read_csv(path)

# Rename columns for consistency and readability
data.columns = [
    'Age',
    'Gender',
    'TB_total_bilirubin',
    'DB_Direct_Bilirubin',
    'Alkphos_Alkaline_Phosphotase',
    'Sgpt_Alamine_Aminotransferase',
    'Sgot_Aspartate_Aminotransferase',
    'TP_Total_Protiens',
    'ALB_Albumin',
    'A/G_Ratio',
    'Selector'
]

data

Unnamed: 0,Age,Gender,TB_total_bilirubin,DB_Direct_Bilirubin,Alkphos_Alkaline_Phosphotase,Sgpt_Alamine_Aminotransferase,Sgot_Aspartate_Aminotransferase,TP_Total_Protiens,ALB_Albumin,A/G_Ratio,Selector
0,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
1,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
2,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.00,1
3,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.40,1
4,46,Male,1.8,0.7,208,19,14,7.6,4.4,1.30,1
...,...,...,...,...,...,...,...,...,...,...,...
577,60,Male,0.5,0.1,500,20,34,5.9,1.6,0.37,2
578,40,Male,0.6,0.1,98,35,31,6.0,3.2,1.10,1
579,52,Male,0.8,0.2,245,48,49,6.4,3.2,1.00,1
580,31,Male,1.3,0.5,184,29,32,6.8,3.4,1.00,1


In [4]:
# one hot encode categorical data
data = pd.get_dummies(data)
data.head()

Unnamed: 0,Age,TB_total_bilirubin,DB_Direct_Bilirubin,Alkphos_Alkaline_Phosphotase,Sgpt_Alamine_Aminotransferase,Sgot_Aspartate_Aminotransferase,TP_Total_Protiens,ALB_Albumin,A/G_Ratio,Selector,Gender_Female,Gender_Male
0,62,10.9,5.5,699,64,100,7.5,3.2,0.74,1,False,True
1,62,7.3,4.1,490,60,68,7.0,3.3,0.89,1,False,True
2,58,1.0,0.4,182,14,20,6.8,3.4,1.0,1,False,True
3,72,3.9,2.0,195,27,59,7.3,2.4,0.4,1,False,True
4,46,1.8,0.7,208,19,14,7.6,4.4,1.3,1,False,True


In [5]:
# replace NaN values with mean
data['A/G_Ratio'] = data['A/G_Ratio'].fillna(data['A/G_Ratio'].mean())
data['A/G_Ratio'].isna().sum()

0

In [6]:
# set y
y = data.Selector

# Update the columns list post one hot encoding 
cols = data.columns.tolist()
cols.remove("Selector")

# set X
features = cols
X = data[features]

# Model Definition 

In [7]:
# Model and evaluation setup
skf = StratifiedKFold(n_splits=10)

ilp_model = DecisionTreeRegressor()
mcc_scorer = make_scorer(matthews_corrcoef)

scoring = {
    'accuracy': 'accuracy',
    'recall': 'recall',
    'precision': 'precision',
    'f1': 'f1',
    'mcc': mcc_scorer
}

# Cross-validation
cv_results = cross_validate(
    ilp_model,
    X, y,
    cv=skf,
    scoring=scoring
)

# Display averaged metrics
for metric in scoring:
    print(f"Average {metric}: {cv_results[f'test_{metric}'].mean():.3f}")

Average accuracy: 0.636
Average recall: 0.752
Average precision: 0.739
Average f1: 0.744
Average mcc: 0.108


In [8]:
# Reporting table for SMOTE experiments
smote_results = pd.DataFrame(columns=["case", "accuracy", "recall", "precision", "f1", "mcc"])

In [9]:
# Store baseline (unbalanced) metrics
metric_list = ["UNBALANCED"]

for metric in scoring:
    metric_list.append(round(cv_results[f'test_{metric}'].mean(), 4))

smote_results.loc[len(smote_results)] = metric_list
smote_results.head()

Unnamed: 0,case,accuracy,recall,precision,f1,mcc
0,UNBALANCED,0.6357,0.7517,0.7394,0.7442,0.1075


# Augmentation with SMOTE (Synthetic Minority Oversampling Technique - SMOTE)

In [10]:
print("Before SMOTE:", Counter(y))

smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

print("After SMOTE:", Counter(y_smote))

Before SMOTE: Counter({1: 415, 2: 167})
After SMOTE: Counter({1: 415, 2: 415})


In [11]:
# Evaluate model on SMOTE-balanced data
cv_results = cross_validate(
    ilp_model,
    X_smote, y_smote,
    cv=skf,
    scoring=scoring
)

# Store SMOTE-balanced metrics
metric_list = ["BAL-AUG"]

for metric in scoring:
    metric_list.append(round(cv_results[f'test_{metric}'].mean(), 4))

smote_results.loc[len(smote_results)] = metric_list
smote_results

Unnamed: 0,case,accuracy,recall,precision,f1,mcc
0,UNBALANCED,0.6357,0.7517,0.7394,0.7442,0.1075
1,BAL-AUG,0.7373,0.687,0.7849,0.722,0.4876


In [12]:
# get current number of rows
current_count = Counter(y_smote)[1]

double = current_count * 2
triple = current_count * 3
quadrupule = current_count * 4

In [13]:
# Double the minority class using SMOTE
smote_double = SMOTE(sampling_strategy={1: double}, random_state=42)
X_double, y_double = smote_double.fit_resample(X_smote, y_smote)
print("After targeted doubling:", Counter(y_double))

# Normalize class balance with standard SMOTE
smote_double = SMOTE(random_state=42)
X_double, y_double = smote_double.fit_resample(X_double, y_double)
print("After normalization:", Counter(y_double))

After targeted doubling: Counter({1: 830, 2: 415})
After normalization: Counter({1: 830, 2: 830})


In [14]:
# calculate with augmented data
cv_results = cross_validate(ilp_model, X_double, y_double, cv=skf, scoring=scoring)

# retrieve the metrics for the dataset augmented 2 fold and add them to the df
metric_list = ["DD-AUG"]
for metric in scoring:
    metric_list.append(round(cv_results['test_' + metric].mean(), 4))

smote_results.loc[len(smote_results)] = metric_list
smote_results

Unnamed: 0,case,accuracy,recall,precision,f1,mcc
0,UNBALANCED,0.6357,0.7517,0.7394,0.7442,0.1075
1,BAL-AUG,0.7373,0.687,0.7849,0.722,0.4876
2,DD-AUG,0.8554,0.8265,0.8833,0.8521,0.7151


In [15]:
# TRIPLE THE DATA WITH SMOTE

# Triple the data 
smote_triple = SMOTE(sampling_strategy={1: triple}, random_state=42)  
X_triple, y_triple = smote_triple.fit_resample(X_smote, y_smote)

# normalize index 1 and 2, effectively doubling the data
smote_triple = SMOTE(random_state=42)
X_triple, y_triple = smote_double.fit_resample(X_triple, y_triple)
print("Double SMOTE:", Counter(y_triple))

Double SMOTE: Counter({1: 1245, 2: 1245})


In [16]:
# calculate with augmented data
cv_results = cross_validate(ilp_model, X_triple, y_triple, cv=skf, scoring=scoring, return_train_score=True)
print("Training mcc:", cv_results['train_mcc'].mean())
print("Validation mcc:", cv_results['test_mcc'].mean())

# retrieve the metrics for the dataset augmented 3 fold and add them to the df
metric_list = ["TD-AUG"]
for metric in scoring:
    metric_list.append(round(cv_results['test_' + metric].mean(), 4))

smote_results.loc[len(smote_results)] = metric_list

Training mcc: 1.0
Validation mcc: 0.7837429085170416


In [17]:
# QUADRUPULE THE DATA WITH SMOTE

# Quadrupul data 
smote_quad = SMOTE(sampling_strategy={1: quadrupule}, random_state=42)  
X_quad, y_quad = smote_quad.fit_resample(X_smote, y_smote)

# normalize index 1 and 2, effectively Quadrupling the data
smote_quad = SMOTE(random_state=42)
X_quad, y_quad = smote_quad.fit_resample(X_quad, y_quad)

# calculate with augmented data
cv_results = cross_validate(ilp_model, X_quad, y_quad, cv=skf, scoring=scoring, return_train_score=True)

# retrieve the metrics for the dataset augmented 4x and add them to the df
metric_list = ["QD-AUG"]
for metric in scoring:
    metric_list.append(round(cv_results['test_' + metric].mean(), 4))

smote_results.loc[len(smote_results)] = metric_list


# Compare traning and validation mcc to check for overfitting 
print("Training mcc:", cv_results['train_mcc'].mean())
print("Validation mcc:", cv_results['test_mcc'].mean())

smote_results.head()

Training mcc: 1.0
Validation mcc: 0.831253972557143


Unnamed: 0,case,accuracy,recall,precision,f1,mcc
0,UNBALANCED,0.6357,0.7517,0.7394,0.7442,0.1075
1,BAL-AUG,0.7373,0.687,0.7849,0.722,0.4876
2,DD-AUG,0.8554,0.8265,0.8833,0.8521,0.7151
3,TD-AUG,0.8908,0.8804,0.9031,0.8904,0.7837
4,QD-AUG,0.9142,0.9157,0.9182,0.9154,0.8313


# Augmentation with CTGAN (Conditional Tabular Generative Adversarial Network)

In [18]:
# Seed configuration for reproducibility
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

# Train CTGAN
ctgan = CTGAN(epochs=400)
ctgan.fit(data)

# Generate minority samples to correct the initial imbalance
synthetic_minority = ctgan.sample(248, {'Selector': 2})

# Combine original and synthetic minority rows
balanced_data = pd.concat([data, synthetic_minority])

# Create a 4× dataset in a single generation pass
len_balanced = len(balanced_data)
number_to_generate = (len_balanced * 4) - len_balanced
generated_difference = ctgan.sample(number_to_generate)

# Assemble expanded datasets
quad_data = pd.concat([balanced_data, generated_difference])
double_data = quad_data.head(len_balanced * 2)
triple_data = quad_data.head(len_balanced * 3)

  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


In [19]:
y = quad_data["Selector"]
print("After GAN:", Counter(y))

After GAN: Counter({1: 2810, 2: 510})


In [20]:
# Balanced CTGAN data evaluation
y = balanced_data["Selector"]

cols = balanced_data.columns.tolist()
cols.remove("Selector")
features = cols

X = balanced_data[features]

gan_results = pd.DataFrame(columns=["case", "accuracy", "recall", "precision", "f1", "mcc"])

cv_results = cross_validate(
    ilp_model,
    X, y,
    cv=skf,
    scoring=scoring,
    return_train_score=True
)

metric_list = ["BAL-AUG"]
for metric in scoring:
    metric_list.append(round(cv_results[f'test_{metric}'].mean(), 4))

gan_results.loc[len(gan_results)] = metric_list

gan_results

Unnamed: 0,case,accuracy,recall,precision,f1,mcc
0,BAL-AUG,0.6759,0.7746,0.7962,0.7816,0.1354


In [21]:
# 2× CTGAN data evaluation

# Target labels from the doubled dataset
y = double_data["Selector"]

# Feature matrix using the same feature list defined earlier
X = double_data[features]

# Cross-validation on the 2× GAN-augmented data
cv_results = cross_validate(
    ilp_model,
    X, y,
    cv=skf,
    scoring=scoring,
    return_train_score=True
)

# Store metrics for comparison across augmentation levels
metric_list = ["DD-AUG"]
for metric in scoring:
    metric_list.append(round(cv_results[f'test_{metric}'].mean(), 4))

gan_results.loc[len(gan_results)] = metric_list

# Quick overfitting check
print("Training mcc:", cv_results["train_mcc"].mean())
print("Validation mcc:", cv_results["test_mcc"].mean())

gan_results

Training mcc: 1.0
Validation mcc: 0.07470560428437893


Unnamed: 0,case,accuracy,recall,precision,f1,mcc
0,BAL-AUG,0.6759,0.7746,0.7962,0.7816,0.1354
1,DD-AUG,0.694,0.7903,0.8257,0.8044,0.0747


In [22]:
# 3x DATA 

# set y
y = triple_data.Selector

# set z 
features = cols
X = triple_data[features]

# calculate with augmented data
cv_results = cross_validate(ilp_model, X, y, cv=skf, scoring=scoring, return_train_score=True)

# retrieve the metrics for the dataset and add them to the results df
metric_list = ["TD-AUG"]
for metric in scoring:
    metric_list.append(round(cv_results['test_' + metric].mean(), 4))

gan_results.loc[len(gan_results)] = metric_list


# Compare traning and validation mcc to check for overfitting 
print("Training mcc:", cv_results['train_mcc'].mean())
print("Validation mcc:", cv_results['test_mcc'].mean())

gan_results

Training mcc: 1.0
Validation mcc: 0.062097958793429706


Unnamed: 0,case,accuracy,recall,precision,f1,mcc
0,BAL-AUG,0.6759,0.7746,0.7962,0.7816,0.1354
1,DD-AUG,0.694,0.7903,0.8257,0.8044,0.0747
2,TD-AUG,0.7269,0.8223,0.8478,0.8322,0.0621


In [23]:
# 4x DATA 

# set y
y = quad_data.Selector

# set z 
features = cols
X = quad_data[features]

# calculate with augmented data
cv_results = cross_validate(ilp_model, X, y, cv=skf, scoring=scoring, return_train_score=True)

# retrieve the metrics for the dataset and add them to the results df
metric_list = ["QD-AUG"]
for metric in scoring:
    metric_list.append(round(cv_results['test_' + metric].mean(), 4))

gan_results.loc[len(gan_results)] = metric_list


# Compare traning and validation mcc to check for overfitting 
print("Training mcc:", cv_results['train_mcc'].mean())
print("Validation mcc:", cv_results['test_mcc'].mean())

Training mcc: 1.0
Validation mcc: 0.04731071333662612


In [24]:
gan_results.head()

Unnamed: 0,case,accuracy,recall,precision,f1,mcc
0,BAL-AUG,0.6759,0.7746,0.7962,0.7816,0.1354
1,DD-AUG,0.694,0.7903,0.8257,0.8044,0.0747
2,TD-AUG,0.7269,0.8223,0.8478,0.8322,0.0621
3,QD-AUG,0.7364,0.8288,0.857,0.8404,0.0473


In [25]:
smote_results.head()

Unnamed: 0,case,accuracy,recall,precision,f1,mcc
0,UNBALANCED,0.6357,0.7517,0.7394,0.7442,0.1075
1,BAL-AUG,0.7373,0.687,0.7849,0.722,0.4876
2,DD-AUG,0.8554,0.8265,0.8833,0.8521,0.7151
3,TD-AUG,0.8908,0.8804,0.9031,0.8904,0.7837
4,QD-AUG,0.9142,0.9157,0.9182,0.9154,0.8313


In [26]:
# Save Both Files 
smote_results.to_csv("dt_gan_results.csv", index=False) 
smote_results.to_csv("dt_smote_results.csv", index=False) 

# Dev Notes
### 19-11-25, 11:80
When applying my model, I tried running the notebook but replaced KNN with a Decision Tree. However, I ran into an error:

"Recall is ill-defined and being set to 0.0 due to no true samples. Use zero_division parameter to control this behavior."

This happened because some of the cross-validation splits didn’t contain any true values for one of the classes. As a result, recall couldn’t be calculated for that fold, which triggered the warning.

A quick workaround was to reduce the number of validation splits. In my case, the only value that didn’t throw the error was 2—but this defeats the purpose of cross-validation (although this approach is sometimes used for extremely imbalanced datasets).

A far better solution is to use StratifiedKFold, which ensures that each fold preserves the same class proportions as the full dataset. This prevents folds from having zero instances of the minority class, meaning recall and other class-wise metrics can be computed properly in every fold.

Using StratifiedKFold not only fixed the error but also improved the overall metrics, as shown below.

**Before SKF**

|   | case       | accuracy | recall | precision | f1     | mcc    |
| - | ---------- | -------- | ------ | --------- | ------ | ------ |
| 0 | UNBALANCED | 0.6547   | 0.7663 | 0.7490    | 0.7569 | 0.1400 |
| 1 | BAL-AUG    | 0.7373   | 0.4787 | 0.5808    | 0.5231 | 0.2301 |
| 2 | DD-AUG     | 0.8512   | 0.5780 | 0.6348    | 0.6045 | 0.2961 |
| 3 | TD-AUG     | 0.8876   | 0.6150 | 0.6529    | 0.6323 | 0.3619 |
| 4 | QD-AUG     | 0.9160   | 0.6428 | 0.6654    | 0.6530 | 0.3304 |

**After SKF**

|   | case       | accuracy | recall | precision | f1     | mcc    |
| - | ---------- | -------- | ------ | --------- | ------ | ------ |
| 0 | UNBALANCED | 0.6255   | 0.7469 | 0.7319    | 0.7379 | 0.0751 |
| 1 | BAL-AUG    | 0.7470   | 0.6941 | 0.7929    | 0.7325 | 0.5049 |
| 2 | DD-AUG     | 0.8566   | 0.8301 | 0.8816    | 0.8532 | 0.7172 |
| 3 | TD-AUG     | 0.8948   | 0.8876 | 0.9046    | 0.8949 | 0.7915 |
| 4 | QD-AUG     | 0.9148   | 0.9145 | 0.9199    | 0.9155 | 0.8327 |



### FINAL RESULTS:

**DT WITH SMOTE**

|   | case       | accuracy | recall | precision | f1     | mcc    |
| - | ---------- | -------- | ------ | --------- | ------ | ------ |
| 0 | UNBALANCED | 0.6255   | 0.7469 | 0.7319    | 0.7379 | 0.0751 |
| 1 | BAL-AUG    | 0.7470   | 0.6941 | 0.7929    | 0.7325 | 0.5049 |
| 2 | DD-AUG     | 0.8566   | 0.8301 | 0.8816    | 0.8532 | 0.7172 |
| 3 | TD-AUG     | 0.8948   | 0.8876 | 0.9046    | 0.8949 | 0.7915 |
| 4 | QD-AUG     | 0.9148   | 0.9145 | 0.9199    | 0.9155 | 0.8327 |



**DT WITH CTGAN**

|   | case    | accuracy | recall | precision | f1     | mcc    |
| - | ------- | -------- | ------ | --------- | ------ | ------ |
| 0 | BAL-AUG | 0.6747   | 0.7784 | 0.7873    | 0.7790 | 0.1422 |
| 1 | DD-AUG  | 0.7169   | 0.8160 | 0.8356    | 0.8234 | 0.0786 |
| 2 | TD-AUG  | 0.7217   | 0.8100 | 0.8527    | 0.8273 | 0.0602 |
| 3 | QD-AUG  | 0.7446   | 0.8366 | 0.8591    | 0.8454 | 0.0761 |




### SUMMARY:
In summary, using a Decision Tree (DT) augmented with SMOTE consistently yielded high metrics (all above 91%, except MCC at 83%). However, these results still fell short of what was achieved with K-Nearest Neighbors (KNN), consistent with the reference paper.

Similarly, while CTGAN produced higher accuracy, recall, precision, and F1 scores, it still underperformed compared to SMOTE.