<a href="https://www.kaggle.com/code/emmanuelniyioriolowo/3-logistic-regression-classifier-rildc?scriptVersionId=285441785" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Imports

In [1]:
!pip install scikit-learn==1.4.2 imbalanced-learn==0.12.2
!pip install --upgrade ctgan

Collecting scikit-learn==1.4.2
  Downloading scikit_learn-1.4.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting imbalanced-learn==0.12.2
  Downloading imbalanced_learn-0.12.2-py3-none-any.whl.metadata (8.2 kB)
Downloading scikit_learn-1.4.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading imbalanced_learn-0.12.2-py3-none-any.whl (257 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.0/258.0 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn, imbalanced-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
  Attempting uninstall: imbalanced-learn
    Found existing installation: imbalanced-

In [2]:
# Imports and environment setup

import os
import torch
import random
import numpy as np
import pandas as pd

from ctgan import CTGAN
from collections import Counter
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import matthews_corrcoef, make_scorer

# List available files in the Kaggle input directory
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/Indian Liver Patient Dataset (ILPD).csv


# Data Loading and Cleaning

In [3]:
# Load ILPD dataset
path = "/kaggle/input/Indian Liver Patient Dataset (ILPD).csv"
data = pd.read_csv(path)

# Rename columns for consistency and readability
data.columns = [
    'Age',
    'Gender',
    'TB_total_bilirubin',
    'DB_Direct_Bilirubin',
    'Alkphos_Alkaline_Phosphotase',
    'Sgpt_Alamine_Aminotransferase',
    'Sgot_Aspartate_Aminotransferase',
    'TP_Total_Protiens',
    'ALB_Albumin',
    'A/G_Ratio',
    'Selector'
]

data

Unnamed: 0,Age,Gender,TB_total_bilirubin,DB_Direct_Bilirubin,Alkphos_Alkaline_Phosphotase,Sgpt_Alamine_Aminotransferase,Sgot_Aspartate_Aminotransferase,TP_Total_Protiens,ALB_Albumin,A/G_Ratio,Selector
0,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
1,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
2,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.00,1
3,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.40,1
4,46,Male,1.8,0.7,208,19,14,7.6,4.4,1.30,1
...,...,...,...,...,...,...,...,...,...,...,...
577,60,Male,0.5,0.1,500,20,34,5.9,1.6,0.37,2
578,40,Male,0.6,0.1,98,35,31,6.0,3.2,1.10,1
579,52,Male,0.8,0.2,245,48,49,6.4,3.2,1.00,1
580,31,Male,1.3,0.5,184,29,32,6.8,3.4,1.00,1


In [4]:
# one hot encode categorical data
data = pd.get_dummies(data)
data.head()

Unnamed: 0,Age,TB_total_bilirubin,DB_Direct_Bilirubin,Alkphos_Alkaline_Phosphotase,Sgpt_Alamine_Aminotransferase,Sgot_Aspartate_Aminotransferase,TP_Total_Protiens,ALB_Albumin,A/G_Ratio,Selector,Gender_Female,Gender_Male
0,62,10.9,5.5,699,64,100,7.5,3.2,0.74,1,False,True
1,62,7.3,4.1,490,60,68,7.0,3.3,0.89,1,False,True
2,58,1.0,0.4,182,14,20,6.8,3.4,1.0,1,False,True
3,72,3.9,2.0,195,27,59,7.3,2.4,0.4,1,False,True
4,46,1.8,0.7,208,19,14,7.6,4.4,1.3,1,False,True


In [5]:
# replace NaN values with mean
data['A/G_Ratio'] = data['A/G_Ratio'].fillna(data['A/G_Ratio'].mean())
data['A/G_Ratio'].isna().sum()

0

In [6]:
# set y
y = data.Selector

# Update the columns list post one hot encoding 
cols = data.columns.tolist()
cols.remove("Selector")

# set X
features = cols
X = data[features]

# Model Definition

In [7]:
# Performance metrics for evaluation
mcc_scorer = make_scorer(matthews_corrcoef)

scoring = {
    'accuracy': 'accuracy',
    'recall': 'recall',
    'precision': 'precision',
    'f1': 'f1',
    'mcc': mcc_scorer
}

# Logistic regression baseline
ilp_model = LogisticRegression()

# Standardize feature matrix
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Cross-validation using stratified folds
skf = StratifiedKFold(n_splits=10)
cv_results = cross_validate(
    ilp_model,
    X_scaled, y,
    cv=skf,
    scoring=scoring
)

# Display averaged metrics
for metric in scoring:
    print(f"Average {metric}: {cv_results[f'test_{metric}'].mean():.3f}")


Average accuracy: 0.718
Average recall: 0.928
Average precision: 0.744
Average f1: 0.824
Average mcc: 0.173


In [8]:
# Reporting table for SMOTE experiments
smote_results = pd.DataFrame(columns=["case", "accuracy", "recall", "precision", "f1", "mcc"])

In [9]:
# Store baseline (unbalanced) metrics
metric_list = ["UNBALANCED"]

for metric in scoring:
    metric_list.append(round(cv_results[f'test_{metric}'].mean(), 4))

smote_results.loc[len(smote_results)] = metric_list
smote_results.head()

Unnamed: 0,case,accuracy,recall,precision,f1,mcc
0,UNBALANCED,0.7184,0.9278,0.7436,0.8243,0.1731


# Augmentation with SMOTE (Synthetic Minority Oversampling Technique - SMOTE)

In [10]:
print("Before SMOTE:", Counter(y))

smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

print("After SMOTE:", Counter(y_smote))

Before SMOTE: Counter({1: 415, 2: 167})
After SMOTE: Counter({1: 415, 2: 415})


In [11]:
# calculate with augmented data
cv_results = cross_validate(ilp_model, X_smote, y_smote, cv=skf, scoring=scoring)

# Store SMOTE-balanced metrics
metric_list = ["BAL-AUG"]
for metric in scoring:
    metric_list.append(round(cv_results['test_' + metric].mean(), 4))

smote_results.loc[len(smote_results)] = metric_list
smote_results

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Unnamed: 0,case,accuracy,recall,precision,f1,mcc
0,UNBALANCED,0.7184,0.9278,0.7436,0.8243,0.1731
1,BAL-AUG,0.694,0.6359,0.7299,0.6704,0.3967


In [12]:
# get current number of rows
current_count = Counter(y_smote)[1]
double = current_count * 2
triple = current_count * 3
quadruple = current_count * 4

In [13]:
# SMOTE-based doubling of the minority class
smote_double = SMOTE(sampling_strategy={1: double}, random_state=42)
X_double, y_double = smote_double.fit_resample(X_smote, y_smote)
print("After targeted doubling:", Counter(y_double))

# Second pass to rebalance both classes
smote_double = SMOTE(random_state=42)
X_double, y_double = smote_double.fit_resample(X_double, y_double)
print("After normalization:", Counter(y_double))

After targeted doubling: Counter({1: 830, 2: 415})
After normalization: Counter({1: 830, 2: 830})


In [14]:
# calculate with augmented data
cv_results = cross_validate(ilp_model, X_double, y_double, cv=skf, scoring=scoring)

# retrieve the metrics for the dataset augmented 2 fold and add them to the df
metric_list = ["DD-AUG"]
for metric in scoring:
    metric_list.append(round(cv_results['test_' + metric].mean(), 4))

smote_results.loc[len(smote_results)] = metric_list
smote_results

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Unnamed: 0,case,accuracy,recall,precision,f1,mcc
0,UNBALANCED,0.7184,0.9278,0.7436,0.8243,0.1731
1,BAL-AUG,0.694,0.6359,0.7299,0.6704,0.3967
2,DD-AUG,0.7181,0.6241,0.7816,0.6843,0.4537


In [15]:
# SMOTE-based tripling of the minority class
smote_triple = SMOTE(sampling_strategy={1: triple}, random_state=42)
X_triple, y_triple = smote_triple.fit_resample(X_smote, y_smote)

# Second pass to rebalance both classes
smote_triple = SMOTE(random_state=42)
X_triple, y_triple = smote_triple.fit_resample(X_triple, y_triple)

print("After triple-level SMOTE:", Counter(y_triple))

# calculate with augmented data
cv_results = cross_validate(ilp_model, X_triple, y_triple, cv=skf, scoring=scoring, return_train_score=True)
print("Training mcc:", cv_results['train_mcc'].mean())
print("Validation mcc:", cv_results['test_mcc'].mean())

# retrieve the metrics for the dataset augmented 3 fold and add them to the df
metric_list = ["TD-AUG"]
for metric in scoring:
    metric_list.append(round(cv_results['test_' + metric].mean(), 4))

smote_results.loc[len(smote_results)] = metric_list

After triple-level SMOTE: Counter({1: 1245, 2: 1245})


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Training mcc: 0.4927677524083781
Validation mcc: 0.4857876307680259


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [16]:
# SMOTE-based quadrupling of the minority class
smote_quad = SMOTE(sampling_strategy={1: quadruple}, random_state=42)
X_quad, y_quad = smote_quad.fit_resample(X_smote, y_smote)

# Second pass to rebalance both classes
smote_quad = SMOTE(random_state=42)
X_quad, y_quad = smote_quad.fit_resample(X_quad, y_quad)

# Cross-validation on the 4× SMOTE-augmented dataset
cv_results = cross_validate(
    ilp_model,
    X_quad, y_quad,
    cv=skf,
    scoring=scoring,
    return_train_score=True
)

# Store performance metrics for comparison
metric_list = ["QD-AUG"]
for metric in scoring:
    metric_list.append(round(cv_results[f'test_{metric}'].mean(), 4))

smote_results.loc[len(smote_results)] = metric_list

# Basic overfitting check
print("Training mcc:", cv_results["train_mcc"].mean())
print("Validation mcc:", cv_results["test_mcc"].mean())

smote_results


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Training mcc: 0.48687056947019103
Validation mcc: 0.4726475716251152


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,case,accuracy,recall,precision,f1,mcc
0,UNBALANCED,0.7184,0.9278,0.7436,0.8243,0.1731
1,BAL-AUG,0.694,0.6359,0.7299,0.6704,0.3967
2,DD-AUG,0.7181,0.6241,0.7816,0.6843,0.4537
3,TD-AUG,0.7369,0.6466,0.7949,0.7096,0.4858
4,QD-AUG,0.7298,0.6373,0.7906,0.7014,0.4726


# Augmentation with CTGAN (Conditional Tabular Generative Adversarial Network)

In [17]:
# Seed configuration for reproducibility
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

# Train CTGAN on the full dataset
ctgan = CTGAN(epochs=400)
ctgan.fit(data)

# Generate minority samples to correct the original imbalance
synthetic_minority = ctgan.sample(248, {'Selector': 2})

# Merge original data with synthetic minority samples
balanced_data = pd.concat([data, synthetic_minority])

# Create a 4× expanded dataset in a single sampling pass
len_balanced = len(balanced_data)
number_to_generate = (len_balanced * 4) - len_balanced
generated_difference = ctgan.sample(number_to_generate)

# Assemble expanded datasets at different scales
quad_data = pd.concat([balanced_data, generated_difference])
double_data = quad_data.head(len_balanced * 2)
triple_data = quad_data.head(len_balanced * 3)

In [18]:
y = quad_data.Selector
print("After GAN:", Counter(y))

After GAN: Counter({1: 2781, 2: 539})


In [19]:
# Balanced CTGAN data evaluation

# Labels from the balanced dataset
y = balanced_data["Selector"]

# Feature matrix
cols = balanced_data.columns.tolist()
cols.remove("Selector")
features = cols

X = balanced_data[features]
X = scaler.fit_transform(X)

# Results table for CTGAN experiments
gan_results = pd.DataFrame(
    columns=["case", "accuracy", "recall", "precision", "f1", "mcc"]
)

# Cross-validation on balanced GAN data
cv_results = cross_validate(
    ilp_model,
    X, y,
    cv=skf,
    scoring=scoring,
    return_train_score=True
)

# Record test metrics
metric_list = ["BAL-AUG"]
for metric in scoring:
    metric_list.append(round(cv_results[f'test_{metric}'].mean(), 4))

gan_results.loc[len(gan_results)] = metric_list

gan_results

Unnamed: 0,case,accuracy,recall,precision,f1,mcc
0,BAL-AUG,0.7482,0.9871,0.7531,0.8541,0.0745


In [20]:
# 2× CTGAN data evaluation

# Labels from the doubled dataset
y = double_data["Selector"]

# Feature matrix
features = cols
X = double_data[features]
X = scaler.fit_transform(X)

# Cross-validation on 2× GAN-augmented data
cv_results = cross_validate(
    ilp_model,
    X, y,
    cv=skf,
    scoring=scoring,
    return_train_score=True
)

# Store metrics
metric_list = ["DD-AUG"]
for metric in scoring:
    metric_list.append(round(cv_results[f'test_{metric}'].mean(), 4))

gan_results.loc[len(gan_results)] = metric_list

# Check for possible overfitting
print("Training mcc:", cv_results["train_mcc"].mean())
print("Validation mcc:", cv_results["test_mcc"].mean())

gan_results

Training mcc: 0.07134971125513814
Validation mcc: 0.02617648164150737


Unnamed: 0,case,accuracy,recall,precision,f1,mcc
0,BAL-AUG,0.7482,0.9871,0.7531,0.8541,0.0745
1,DD-AUG,0.8042,0.9838,0.8144,0.8906,0.0262


In [21]:
# 3× CTGAN data evaluation

# Labels from the tripled dataset
y = triple_data["Selector"]

# Feature matrix
features = cols
X = triple_data[features]
X = scaler.fit_transform(X)

# Cross-validation on 3× GAN-augmented data
cv_results = cross_validate(
    ilp_model,
    X, y,
    cv=skf,
    scoring=scoring,
    return_train_score=True
)

# Store metrics
metric_list = ["TD-AUG"]
for metric in scoring:
    metric_list.append(round(cv_results[f'test_{metric}'].mean(), 4))

gan_results.loc[len(gan_results)] = metric_list

# Overfitting check
print("Training mcc:", cv_results["train_mcc"].mean())
print("Validation mcc:", cv_results["test_mcc"].mean())

gan_results

Training mcc: 0.026056047685505408
Validation mcc: 0.019802231833817234


Unnamed: 0,case,accuracy,recall,precision,f1,mcc
0,BAL-AUG,0.7482,0.9871,0.7531,0.8541,0.0745
1,DD-AUG,0.8042,0.9838,0.8144,0.8906,0.0262
2,TD-AUG,0.8229,0.9837,0.8338,0.9018,0.0198


In [22]:
# 4× CTGAN data evaluation

# Labels from the quadrupled dataset
y = quad_data["Selector"]

# Feature matrix
features = cols
X = quad_data[features]
X = scaler.fit_transform(X)

# Cross-validation on 4× GAN-augmented data
cv_results = cross_validate(
    ilp_model,
    X, y,
    cv=skf,
    scoring=scoring,
    return_train_score=True
)

# Store metrics
metric_list = ["QD-AUG"]
for metric in scoring:
    metric_list.append(round(cv_results[f'test_{metric}'].mean(), 4))

gan_results.loc[len(gan_results)] = metric_list

# Overfitting check
print("Training mcc:", cv_results["train_mcc"].mean())
print("Validation mcc:", cv_results["test_mcc"].mean())

Training mcc: 0.0187809057841039
Validation mcc: 0.004022934471691769


In [23]:
gan_results.head()

Unnamed: 0,case,accuracy,recall,precision,f1,mcc
0,BAL-AUG,0.7482,0.9871,0.7531,0.8541,0.0745
1,DD-AUG,0.8042,0.9838,0.8144,0.8906,0.0262
2,TD-AUG,0.8229,0.9837,0.8338,0.9018,0.0198
3,QD-AUG,0.8247,0.9799,0.8384,0.9026,0.004


In [24]:
smote_results.head()

Unnamed: 0,case,accuracy,recall,precision,f1,mcc
0,UNBALANCED,0.7184,0.9278,0.7436,0.8243,0.1731
1,BAL-AUG,0.694,0.6359,0.7299,0.6704,0.3967
2,DD-AUG,0.7181,0.6241,0.7816,0.6843,0.4537
3,TD-AUG,0.7369,0.6466,0.7949,0.7096,0.4858
4,QD-AUG,0.7298,0.6373,0.7906,0.7014,0.4726


In [25]:
# Save Both Files 
smote_results.to_csv("lr_gan_results.csv", index=False) 
smote_results.to_csv("lr_smote_results.csv", index=False) 

# Dev Notes
### 19-11-25, 13:18

### FINAL RESULTS:

**LR WITH SMOTE**

|   | case       | accuracy | recall | precision | f1     | mcc    |
| - | ---------- | -------- | ------ | --------- | ------ | ------ |
| 0 | UNBALANCED | 0.7184   | 0.9278 | 0.7436    | 0.8243 | 0.1731 |
| 1 | BAL-AUG    | 0.7253   | 0.5709 | 0.8274    | 0.6641 | 0.4776 |
| 2 | DD-AUG     | 0.7319   | 0.5928 | 0.8226    | 0.6826 | 0.4860 |
| 3 | TD-AUG     | 0.7325   | 0.5968 | 0.8206    | 0.6894 | 0.4844 |
| 4 | QD-AUG     | 0.7304   | 0.5946 | 0.8167    | 0.6867 | 0.4795 |



**LR WITH CTGAN**

|   | case    | accuracy | recall | precision | f1     | mcc    |
| - | ------- | -------- | ------ | --------- | ------ | ------ |
| 0 | BAL-AUG | 0.7482   | 0.9871 | 0.7531    | 0.8541 | 0.0745 |
| 1 | DD-AUG  | 0.8042   | 0.9838 | 0.8144    | 0.8906 | 0.0262 |
| 2 | TD-AUG  | 0.8229   | 0.9837 | 0.8338    | 0.9018 | 0.0198 |
| 3 | QD-AUG  | 0.8247   | 0.9799 | 0.8384    | 0.9026 | 0.0040 |



### SUMMARY:
Summary:
Logistic Regression reacts very differently from KNN to augmentation.

SMOTE results:

- Accuracy stays around 0.72–0.73 regardless of oversampling level.

- Precision improves with balancing, but recall drops sharply (from ~0.93 to ~0.59).

- MCC improves from 0.17 → ~0.48, showing better class balance handling even though overall predictive power doesn’t meaningfully rise.

CTGAN results:

- Accuracy climbs as high as 0.82, and recall becomes extremely high (~0.98), but

- MCC collapses to nearly zero, meaning the classifier is essentially predicting one class most of the time.

- Strong recall + very low MCC indicates severe class imbalance or mode collapse in the GAN-generated data.

Overall:

- SMOTE yields stable, balanced behavior for Logistic Regression, though only modest gains.

- CTGAN boosts accuracy/recall artificially but produces unreliable, low-quality datasets for LR, making its outputs unusable.