<a href="https://www.kaggle.com/code/emmanuelniyioriolowo/4-support-vector-machines-svm-rildc?scriptVersionId=285441628" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Imports

In [1]:
!pip install scikit-learn==1.4.2 imbalanced-learn==0.12.2
!pip install --upgrade ctgan

Collecting scikit-learn==1.4.2
  Downloading scikit_learn-1.4.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting imbalanced-learn==0.12.2
  Downloading imbalanced_learn-0.12.2-py3-none-any.whl.metadata (8.2 kB)
Downloading scikit_learn-1.4.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m87.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading imbalanced_learn-0.12.2-py3-none-any.whl (257 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.0/258.0 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn, imbalanced-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
  Attempting uninstall: imbalanced-learn
    Found existing installation: imbalanced-

In [2]:
# Imports and environment setup

import os
import torch
import random
import numpy as np
import pandas as pd

from ctgan import CTGAN
from collections import Counter
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import matthews_corrcoef, make_scorer
from sklearn.svm import SVC

# List available files in the Kaggle input directory
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/Indian Liver Patient Dataset (ILPD).csv


# Data Loading and Cleaning

In [3]:
# Load ILPD dataset
path = "/kaggle/input/Indian Liver Patient Dataset (ILPD).csv"
data = pd.read_csv(path)

# Rename columns for consistency and readability
data.columns = [
    'Age',
    'Gender',
    'TB_total_bilirubin',
    'DB_Direct_Bilirubin',
    'Alkphos_Alkaline_Phosphotase',
    'Sgpt_Alamine_Aminotransferase',
    'Sgot_Aspartate_Aminotransferase',
    'TP_Total_Protiens',
    'ALB_Albumin',
    'A/G_Ratio',
    'Selector'
]

data

Unnamed: 0,Age,Gender,TB_total_bilirubin,DB_Direct_Bilirubin,Alkphos_Alkaline_Phosphotase,Sgpt_Alamine_Aminotransferase,Sgot_Aspartate_Aminotransferase,TP_Total_Protiens,ALB_Albumin,A/G_Ratio,Selector
0,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
1,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
2,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.00,1
3,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.40,1
4,46,Male,1.8,0.7,208,19,14,7.6,4.4,1.30,1
...,...,...,...,...,...,...,...,...,...,...,...
577,60,Male,0.5,0.1,500,20,34,5.9,1.6,0.37,2
578,40,Male,0.6,0.1,98,35,31,6.0,3.2,1.10,1
579,52,Male,0.8,0.2,245,48,49,6.4,3.2,1.00,1
580,31,Male,1.3,0.5,184,29,32,6.8,3.4,1.00,1


In [4]:
# one hot encode categorical data
data = pd.get_dummies(data)
data.head()

Unnamed: 0,Age,TB_total_bilirubin,DB_Direct_Bilirubin,Alkphos_Alkaline_Phosphotase,Sgpt_Alamine_Aminotransferase,Sgot_Aspartate_Aminotransferase,TP_Total_Protiens,ALB_Albumin,A/G_Ratio,Selector,Gender_Female,Gender_Male
0,62,10.9,5.5,699,64,100,7.5,3.2,0.74,1,False,True
1,62,7.3,4.1,490,60,68,7.0,3.3,0.89,1,False,True
2,58,1.0,0.4,182,14,20,6.8,3.4,1.0,1,False,True
3,72,3.9,2.0,195,27,59,7.3,2.4,0.4,1,False,True
4,46,1.8,0.7,208,19,14,7.6,4.4,1.3,1,False,True


In [5]:
# replace NaN values with mean
data['A/G_Ratio'] = data['A/G_Ratio'].fillna(data['A/G_Ratio'].mean())
data['A/G_Ratio'].isna().sum()

0

In [6]:
# set y
y = data.Selector

# Update the columns list post one hot encoding 
cols = data.columns.tolist()
cols.remove("Selector")

# set X
features = cols
X = data[features]

# Model Definition

In [7]:
# Performance metrics for evaluation
mcc_scorer = make_scorer(matthews_corrcoef)

scoring = {
    'accuracy': 'accuracy',
    'recall': 'recall',
    'precision': 'precision',
    'f1': 'f1',
    'mcc': mcc_scorer
}

# SVM with RBF kernel
ilp_model = SVC(kernel="rbf", C=1.0, gamma="scale")

# Standardize feature matrix
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Stratified 10-fold cross-validation
skf = StratifiedKFold(n_splits=10)
cv_results = cross_validate(
    ilp_model,
    X_scaled, y,
    cv=skf,
    scoring=scoring
)

# Display averaged metrics
for metric in scoring:
    print(f"Average {metric}: {cv_results[f'test_{metric}'].mean():.3f}")

Average accuracy: 0.706
Average recall: 0.988
Average precision: 0.712
Average f1: 0.827
Average mcc: -0.013


In [8]:
# Reporting table for SMOTE experiments
smote_results = pd.DataFrame(columns=["case", "accuracy", "recall", "precision", "f1", "mcc"])

In [9]:
# retrieve the for the initial dataset metrics and add them to the df
metric_list = ["UNBALANCED"]
for metric in scoring:
    metric_list.append(round(cv_results['test_' + metric].mean(), 4))

smote_results.loc[len(smote_results)] = metric_list
smote_results.head()

Unnamed: 0,case,accuracy,recall,precision,f1,mcc
0,UNBALANCED,0.7062,0.9879,0.7117,0.8273,-0.0127


# Augmentation with SMOTE (Synthetic Minority Oversampling Technique - SMOTE)

In [10]:
print("Before SMOTE:", Counter(y))

# Apply SMOTE
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_scaled, y)

print("After SMOTE:", Counter(y_smote))

Before SMOTE: Counter({1: 415, 2: 167})
After SMOTE: Counter({1: 415, 2: 415})


In [11]:
# calculate with augmented data
cv_results = cross_validate(ilp_model, X_smote, y_smote, cv=skf, scoring=scoring)

# retrieve the for the initial dataset metrics and add them to the df
metric_list = ["BAL-AUG"]
for metric in scoring:
    metric_list.append(round(cv_results['test_' + metric].mean(), 4))

smote_results.loc[len(smote_results)] = metric_list
smote_results.head()

Unnamed: 0,case,accuracy,recall,precision,f1,mcc
0,UNBALANCED,0.7062,0.9879,0.7117,0.8273,-0.0127
1,BAL-AUG,0.7193,0.5298,0.8557,0.6441,0.4764


In [12]:
# get current number of rows
current_count = Counter(y_smote)[1]
double = current_count * 2
triple = current_count * 3
quadruple = current_count * 4

In [13]:
# SMOTE-based doubling of the minority class
smote_double = SMOTE(sampling_strategy={1: double}, random_state=42)  
X_double, y_double = smote_double.fit_resample(X_smote, y_smote)
print("Double SMOTE:", Counter(y_double))

# Second pass to rebalance both classes
smote_double = SMOTE(random_state=42)
X_double, y_double = smote_double.fit_resample(X_double, y_double)
print("Double SMOTE:", Counter(y_double))

Double SMOTE: Counter({1: 830, 2: 415})
Double SMOTE: Counter({1: 830, 2: 830})


In [14]:
# calculate with augmented data
cv_results = cross_validate(ilp_model, X_double, y_double, cv=skf, scoring=scoring)

# retrieve the metrics for the dataset augmented 2 fold and add them to the df
metric_list = ["DD-AUG"]
for metric in scoring:
    metric_list.append(round(cv_results['test_' + metric].mean(), 4))

smote_results.loc[len(smote_results)] = metric_list
smote_results

Unnamed: 0,case,accuracy,recall,precision,f1,mcc
0,UNBALANCED,0.7062,0.9879,0.7117,0.8273,-0.0127
1,BAL-AUG,0.7193,0.5298,0.8557,0.6441,0.4764
2,DD-AUG,0.7464,0.553,0.9064,0.6786,0.5377


In [15]:
# SMOTE-based tripling of the minority class
smote_triple = SMOTE(sampling_strategy={1: triple}, random_state=42)
X_triple, y_triple = smote_triple.fit_resample(X_smote, y_smote)

# Second pass to rebalance both classes
smote_triple = SMOTE(random_state=42)
X_triple, y_triple = smote_triple.fit_resample(X_triple, y_triple)
print("After triple SMOTE:", Counter(y_triple))

# Cross-validation on the 3× SMOTE-augmented data
cv_results = cross_validate(
    ilp_model,
    X_triple, y_triple,
    cv=skf,
    scoring=scoring,
    return_train_score=True
)

print("Training mcc:", cv_results["train_mcc"].mean())
print("Validation mcc:", cv_results["test_mcc"].mean())

# Record metrics
metric_list = ["TD-AUG"]
for metric in scoring:
    metric_list.append(round(cv_results[f'test_{metric}'].mean(), 4))

smote_results.loc[len(smote_results)] = metric_list

After triple SMOTE: Counter({1: 1245, 2: 1245})
Training mcc: 0.5828472262748866
Validation mcc: 0.564604832056821


In [16]:
# SMOTE-based quadrupling of the minority class
smote_quad = SMOTE(sampling_strategy={1: quadruple}, random_state=42)
X_quad, y_quad = smote_quad.fit_resample(X_smote, y_smote)

# Second pass to rebalance both classes
smote_quad = SMOTE(random_state=42)
X_quad, y_quad = smote_quad.fit_resample(X_quad, y_quad)

# Cross-validation on the 4× SMOTE-augmented data
cv_results = cross_validate(
    ilp_model,
    X_quad, y_quad,
    cv=skf,
    scoring=scoring,
    return_train_score=True
)

# Store test metrics
metric_list = ["QD-AUG"]
for metric in scoring:
    metric_list.append(round(cv_results[f'test_{metric}'].mean(), 4))

smote_results.loc[len(smote_results)] = metric_list

# Overfitting check
print("Training mcc:", cv_results["train_mcc"].mean())
print("Validation mcc:", cv_results["test_mcc"].mean())

smote_results

Training mcc: 0.5904309500293248
Validation mcc: 0.5646596286386867


Unnamed: 0,case,accuracy,recall,precision,f1,mcc
0,UNBALANCED,0.7062,0.9879,0.7117,0.8273,-0.0127
1,BAL-AUG,0.7193,0.5298,0.8557,0.6441,0.4764
2,DD-AUG,0.7464,0.553,0.9064,0.6786,0.5377
3,TD-AUG,0.7647,0.5936,0.9045,0.7153,0.5646
4,QD-AUG,0.7642,0.5904,0.9068,0.7129,0.5647


# Augmentation with CTGAN (Conditional Tabular Generative Adversarial Network)

In [17]:
# Seed configuration for reproducibility
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

# Train CTGAN on the dataset
ctgan = CTGAN(epochs=400)
ctgan.fit(data)

# Generate minority samples to correct the original imbalance
synthetic_minority = ctgan.sample(248, {'Selector': 2})

# Combine original data with synthetic minority samples
balanced_data = pd.concat([data, synthetic_minority])

# Create a 4× expanded dataset in one sampling pass
len_balanced = len(balanced_data)
number_to_generate = (len_balanced * 4) - len_balanced
generated_difference = ctgan.sample(number_to_generate)

# Build expanded datasets at different scales
quad_data = pd.concat([balanced_data, generated_difference])
double_data = quad_data.head(len_balanced * 2)
triple_data = quad_data.head(len_balanced * 3)

In [18]:
y = quad_data.Selector
print("After GAN:", Counter(y))

After GAN: Counter({1: 2781, 2: 539})


In [19]:
# Balanced CTGAN data evaluation

# Target labels
y = balanced_data["Selector"]

# Feature matrix
cols = balanced_data.columns.tolist()
cols.remove("Selector")
features = cols

X = balanced_data[features]
X = scaler.fit_transform(X)

# Table for CTGAN performance metrics
gan_results = pd.DataFrame(
    columns=["case", "accuracy", "recall", "precision", "f1", "mcc"]
)

# Cross-validation on balanced GAN data
cv_results = cross_validate(
    ilp_model,
    X, y,
    cv=skf,
    scoring=scoring,
    return_train_score=True
)

# Store test metrics
metric_list = ["BAL-AUG"]
for metric in scoring:
    metric_list.append(round(cv_results[f'test_{metric}'].mean(), 4))

gan_results.loc[len(gan_results)] = metric_list

gan_results

Unnamed: 0,case,accuracy,recall,precision,f1,mcc
0,BAL-AUG,0.747,1.0,0.747,0.8552,0.0


In [20]:
# 2× CTGAN data evaluation

# Target labels
y = double_data["Selector"]

# Feature matrix
features = cols
X = double_data[features]
X = scaler.fit_transform(X)

# Cross-validation on the 2× GAN-augmented data
cv_results = cross_validate(
    ilp_model,
    X, y,
    cv=skf,
    scoring=scoring,
    return_train_score=True
)

# Store test metrics
metric_list = ["DD-AUG"]
for metric in scoring:
    metric_list.append(round(cv_results[f'test_{metric}'].mean(), 4))

gan_results.loc[len(gan_results)] = metric_list

# Overfitting check
print("Training mcc:", cv_results["train_mcc"].mean())
print("Validation mcc:", cv_results["test_mcc"].mean())

gan_results

Training mcc: 0.07095609001355818
Validation mcc: -0.018262764073136383


Unnamed: 0,case,accuracy,recall,precision,f1,mcc
0,BAL-AUG,0.747,1.0,0.747,0.8552,0.0
1,DD-AUG,0.7789,0.95,0.8103,0.8722,-0.0183


In [21]:
# 3× CTGAN data evaluation

# Target labels
y = triple_data["Selector"]

# Feature matrix
features = cols
X = triple_data[features]
X = scaler.fit_transform(X)

# Cross-validation on the 3× GAN-augmented data
cv_results = cross_validate(
    ilp_model,
    X, y,
    cv=skf,
    scoring=scoring,
    return_train_score=True
)

# Store test metrics
metric_list = ["TD-AUG"]
for metric in scoring:
    metric_list.append(round(cv_results[f'test_{metric}'].mean(), 4))

gan_results.loc[len(gan_results)] = metric_list

# Overfitting check
print("Training mcc:", cv_results["train_mcc"].mean())
print("Validation mcc:", cv_results["test_mcc"].mean())

gan_results

Training mcc: 0.05682177159601328
Validation mcc: 0.01622765921940205


Unnamed: 0,case,accuracy,recall,precision,f1,mcc
0,BAL-AUG,0.747,1.0,0.747,0.8552,0.0
1,DD-AUG,0.7789,0.95,0.8103,0.8722,-0.0183
2,TD-AUG,0.8012,0.9486,0.8362,0.8852,0.0162


In [22]:
# 4× CTGAN data evaluation

# Target labels
y = quad_data["Selector"]

# Feature matrix
features = cols
X = quad_data[features]
X = scaler.fit_transform(X)

# Cross-validation on the 4× GAN-augmented data
cv_results = cross_validate(
    ilp_model,
    X, y,
    cv=skf,
    scoring=scoring,
    return_train_score=True
)

# Store test metrics
metric_list = ["QD-AUG"]
for metric in scoring:
    metric_list.append(round(cv_results[f'test_{metric}'].mean(), 4))

gan_results.loc[len(gan_results)] = metric_list

# Overfitting check
print("Training mcc:", cv_results["train_mcc"].mean())
print("Validation mcc:", cv_results["test_mcc"].mean())

Training mcc: 0.040936572658310376
Validation mcc: 0.01651692128392805


# Results

In [23]:
gan_results.head()

Unnamed: 0,case,accuracy,recall,precision,f1,mcc
0,BAL-AUG,0.747,1.0,0.747,0.8552,0.0
1,DD-AUG,0.7789,0.95,0.8103,0.8722,-0.0183
2,TD-AUG,0.8012,0.9486,0.8362,0.8852,0.0162
3,QD-AUG,0.8136,0.9613,0.8407,0.8924,0.0165


In [24]:
smote_results.head()

Unnamed: 0,case,accuracy,recall,precision,f1,mcc
0,UNBALANCED,0.7062,0.9879,0.7117,0.8273,-0.0127
1,BAL-AUG,0.7193,0.5298,0.8557,0.6441,0.4764
2,DD-AUG,0.7464,0.553,0.9064,0.6786,0.5377
3,TD-AUG,0.7647,0.5936,0.9045,0.7153,0.5646
4,QD-AUG,0.7642,0.5904,0.9068,0.7129,0.5647


In [25]:
smote_results.to_csv("svm_gan_results.csv", index=False) 
smote_results.to_csv("svm_smote_results.csv", index=False) 

# Dev Notes
### 19-11-25, 20:42

### FINAL RESULTS:

**SVM WITH SMOTE**

| case       | accuracy | recall | precision | f1     | mcc     |
| ---------- | -------- | ------ | --------- | ------ | ------- |
| UNBALANCED | 0.7062   | 0.9879 | 0.7117    | 0.8273 | -0.0127 |
| BAL-AUG    | 0.7193   | 0.5298 | 0.8557    | 0.6441 | 0.4764  |
| DD-AUG     | 0.7464   | 0.5530 | 0.9064    | 0.6786 | 0.5377  |
| TD-AUG     | 0.7647   | 0.5936 | 0.9045    | 0.7153 | 0.5646  |
| QD-AUG     | 0.7642   | 0.5904 | 0.9068    | 0.7129 | 0.5647  |



**SVM WITH CTGAN**

|   | case    | accuracy | recall | precision | f1     | mcc     |
| - | ------- | -------- | ------ | --------- | ------ | ------- |
| 0 | BAL-AUG | 0.7470   | 1.0000 | 0.7470    | 0.8552 | 0.0000  |
| 1 | DD-AUG  | 0.7789   | 0.9500 | 0.8103    | 0.8722 | -0.0183 |
| 2 | TD-AUG  | 0.8012   | 0.9486 | 0.8362    | 0.8852 | 0.0162  |
| 3 | QD-AUG  | 0.8136   | 0.9613 | 0.8407    | 0.8924 | 0.0165  |


### SUMMARY:
SVM shows the same pattern as Logistic Regression: SMOTE helps create balanced, reliable performance, while CTGAN inflates recall but destroys class balance.

SMOTE results:
- Accuracy gradually improves from 0.71 → 0.76 as oversampling increases.
- Recall drops from an overfitted 0.99 (unbalanced) to a more realistic ~0.59 once classes are balanced.
- Precision becomes very high (0.90+) with heavier SMOTE.
- MCC rises from negative (bad) to solid values around 0.56, indicating genuinely better-balanced decision boundaries.

CTGAN results:
- Accuracy increases to ~0.81, and recall remains extremely high (0.95–1.00).
- However, MCC is essentially 0, even dipping slightly negative at times — meaning the classifier isn’t learning a real boundary and is dominated by GAN-induced imbalance.
- High recall + near-zero MCC again confirms GAN-generated data is distorted/unbalanced for SVM.

Overall:
- SMOTE produces meaningful, stable improvements for SVM, with MCC showing strong gains.
- CTGAN results are misleadingly good on surface metrics but fundamentally unreliable due to class imbalance and poor separability in generated data.