<a href="https://colab.research.google.com/github/AdityaB2007/GAN-Generated-Synthetic-Malignant-Samples-for-Breast-Cancer-Detection/blob/main/model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sdv seaborn

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata

# =========================
# Function to plot confusion matrix as image
# =========================
def plot_confusion_matrix(cm, classes, title, filename):
    """Render a confusion matrix as an annotated heatmap and save it as an image.

    Args:
        cm: 2-D array of integer counts (cells are formatted with ``'d'``),
            e.g. the result of ``sklearn.metrics.confusion_matrix``.
        classes: Tick labels, used for both the predicted (x) and true (y) axes.
        title: Title drawn above the heatmap.
        filename: Output path handed to ``savefig``; the image is written at 300 dpi.
    """
    # Work on explicit figure/axes handles instead of pyplot's implicit
    # "current figure" so nothing else open in the session can be affected.
    fig, ax = plt.subplots(figsize=(4, 3))
    try:
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=classes, yticklabels=classes, ax=ax)
        ax.set_ylabel('True label')
        ax.set_xlabel('Predicted label')
        ax.set_title(title)
        fig.tight_layout()
        fig.savefig(filename, dpi=300)
    finally:
        # Always release the figure, even if drawing or saving raised;
        # this function is called repeatedly and pyplot keeps every
        # unclosed figure alive.
        plt.close(fig)

# =========================
# 1. Load dataset
# =========================
# The WDBC file has no header row, so column names are supplied here:
# an id, the diagnosis letter, then 30 numeric features.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"
columns = ['id', 'diagnosis'] + [f'feature_{i}' for i in range(1, 31)]
df = pd.read_csv(url, header=None, names=columns).drop(columns='id')

# LabelEncoder numbers the labels in sorted order, hence B=0, M=1.
df['diagnosis'] = LabelEncoder().fit_transform(df['diagnosis'])
y = df['diagnosis']
X = df.drop(columns='diagnosis')

# Standardize every feature. NOTE: this scaler is fitted on all rows,
# including the rows that are later held out as validation folds.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

# Scaled features with the label appended as the last column.
data = pd.concat([X_scaled_df, y.reset_index(drop=True)], axis=1)

# =========================
# 2. Metadata for CTGAN
# =========================
# Column metadata inferred from the full scaled table (features + label).
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data)

# =========================
# 3. Stratified 5-fold CV
# =========================
# Shuffled, seeded splitter so the fold assignment is repeatable.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold accumulators: metric tuples and confusion matrices.
train_metrics, val_metrics = [], []
train_conf_mats, val_conf_mats = [], []

# Tick labels shared by every confusion-matrix plot.
class_labels = ['Benign(0)', 'Malignant(1)']

# For each fold: standardize with training-fold statistics, balance the
# training set with CTGAN-generated malignant rows, fit the soft-voting
# ensemble, then record metrics and confusion matrices for both the
# (augmented) training set and the untouched validation fold.
for fold, (train_idx, test_idx) in enumerate(kf.split(X_scaled_df, y), start=1):
    # Split
    X_train_in, y_train = X_scaled_df.iloc[train_idx], y.iloc[train_idx]
    X_val_in, y_val = X_scaled_df.iloc[test_idx], y.iloc[test_idx]

    # Re-standardize using statistics from the training fold only, so the
    # validation rows cannot influence the features the models are trained on.
    # Standardization is affine per feature, so applying it to the globally
    # scaled values gives the same result as scaling the raw features with
    # the training fold alone.
    fold_scaler = StandardScaler().fit(X_train_in)
    X_train = pd.DataFrame(fold_scaler.transform(X_train_in),
                           columns=X_train_in.columns, index=X_train_in.index)
    X_val = pd.DataFrame(fold_scaler.transform(X_val_in),
                         columns=X_val_in.columns, index=X_val_in.index)

    # Training table (features + label) with a clean 0..n-1 index
    train_data = pd.concat([X_train.reset_index(drop=True), y_train.reset_index(drop=True)], axis=1)

    # Number of synthetic malignant rows needed to match the benign count
    n_seq = int((y_train == 0).sum() - (y_train == 1).sum())
    if n_seq > 0:
        # Train CTGAN on the malignant rows only. The label column is constant
        # within this subset and is re-attached after sampling, so it is left
        # out of the generative model.
        minority_train = train_data[train_data['diagnosis'] == 1].reset_index(drop=True)
        minority_features = minority_train.drop(columns='diagnosis')
        synth_metadata = SingleTableMetadata()
        synth_metadata.detect_from_dataframe(minority_features)

        synthesizer = CTGANSynthesizer(metadata=synth_metadata, epochs=300, enforce_rounding=True, verbose=False)
        synthesizer.fit(minority_features)

        synth = synthesizer.sample(num_rows=n_seq)
        synth['diagnosis'] = 1
        aug_train = pd.concat([train_data, synth], axis=0, ignore_index=True)
    else:
        # Already balanced (or malignant-heavy): skip the GAN entirely.
        aug_train = train_data.copy()

    # Shuffle augmented data so synthetic rows are not all at the end
    aug_train = aug_train.sample(frac=1.0, random_state=42).reset_index(drop=True)
    X_aug = aug_train.drop('diagnosis', axis=1)
    y_aug = aug_train['diagnosis']

    # Train ensemble classifier (soft voting averages predicted probabilities,
    # which is why the SVC is built with probability=True)
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    mlp = MLPClassifier(hidden_layer_sizes=(32,), max_iter=300, random_state=42)
    svm = SVC(kernel='rbf', probability=True, random_state=42)

    ensemble = VotingClassifier(estimators=[
        ('rf', rf),
        ('mlp', mlp),
        ('svm', svm)
    ], voting='soft')

    ensemble.fit(X_aug, y_aug)
    train_preds = ensemble.predict(X_aug)
    val_preds = ensemble.predict(X_val)

    # Training Metrics (computed on the augmented set, synthetic rows included)
    train_acc = accuracy_score(y_aug, train_preds)
    train_prec = precision_score(y_aug, train_preds)
    train_rec = recall_score(y_aug, train_preds)
    train_f1 = f1_score(y_aug, train_preds)
    train_conf = confusion_matrix(y_aug, train_preds)

    # Validation Metrics (real held-out rows only)
    val_acc = accuracy_score(y_val, val_preds)
    val_prec = precision_score(y_val, val_preds)
    val_rec = recall_score(y_val, val_preds)
    val_f1 = f1_score(y_val, val_preds)
    val_conf = confusion_matrix(y_val, val_preds)

    # Append for aggregate stats
    train_metrics.append((train_acc, train_prec, train_rec, train_f1))
    val_metrics.append((val_acc, val_prec, val_rec, val_f1))
    train_conf_mats.append(train_conf)
    val_conf_mats.append(val_conf)

    # Save confusion matrices as images
    plot_confusion_matrix(train_conf, class_labels,
                          f'Fold {fold} - Training CM', f'cm_train_fold{fold}.png')
    plot_confusion_matrix(val_conf, class_labels,
                          f'Fold {fold} - Validation CM', f'cm_val_fold{fold}.png')

    # Print fold metrics
    print(f"\nFold {fold} Training: Accuracy={train_acc:.4f}, Precision={train_prec:.4f}, Recall={train_rec:.4f}, F1={train_f1:.4f}")
    print(f"Fold {fold} Validation: Accuracy={val_acc:.4f}, Precision={val_prec:.4f}, Recall={val_rec:.4f}, F1={val_f1:.4f}")
    print(f"Confusion Matrices saved: cm_train_fold{fold}.png & cm_val_fold{fold}.png")

# =========================
# Aggregate Mean ± Std
# =========================
# One row per fold; columns are (accuracy, precision, recall, F1),
# matching the tuples appended inside the CV loop.
train_arr = np.array(train_metrics)
val_arr = np.array(val_metrics)

# Labels are left-padded to a common width so the numbers line up.
metric_labels = ("Accuracy:", "Precision:", "Recall:", "F1:")

for banner, fold_scores in (
    ("\n=== Mean Training Results ====================", train_arr),
    ("\n=== Mean Validation Results ==================", val_arr),
):
    print(banner)
    for col, label in enumerate(metric_labels):
        scores = fold_scores[:, col]
        # ndarray.std() uses NumPy's default (population, ddof=0) estimator.
        print(f"{label:<11}{scores.mean():.4f} ± {scores.std():.4f}")


Collecting sdv
  Downloading sdv-1.25.0-py3-none-any.whl.metadata (14 kB)
Collecting boto3<2.0.0,>=1.28 (from sdv)
  Downloading boto3-1.40.8-py3-none-any.whl.metadata (6.7 kB)
Collecting botocore<2.0.0,>=1.31 (from sdv)
  Downloading botocore-1.40.8-py3-none-any.whl.metadata (5.7 kB)
Collecting copulas>=0.12.1 (from sdv)
  Downloading copulas-0.12.3-py3-none-any.whl.metadata (9.5 kB)
Collecting ctgan>=0.11.0 (from sdv)
  Downloading ctgan-0.11.0-py3-none-any.whl.metadata (10 kB)
Collecting deepecho>=0.7.0 (from sdv)
  Downloading deepecho-0.7.0-py3-none-any.whl.metadata (10 kB)
Collecting rdt>=1.17.0 (from sdv)
  Downloading rdt-1.17.1-py3-none-any.whl.metadata (10 kB)
Collecting sdmetrics>=0.21.0 (from sdv)
  Downloading sdmetrics-0.22.0-py3-none-any.whl.metadata (9.4 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3<2.0.0,>=1.28->sdv)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.14.0,>=0.13.0 (from boto3<2.0.0,>=1.28->sdv)
  Downloading s3t




Fold 1 Training: Accuracy=0.9930, Precision=1.0000, Recall=0.9860, F1=0.9930
Fold 1 Validation: Accuracy=0.9825, Precision=0.9556, Recall=1.0000, F1=0.9773
Confusion Matrices saved: cm_train_fold1.png & cm_val_fold1.png





Fold 2 Training: Accuracy=0.9948, Precision=1.0000, Recall=0.9895, F1=0.9947
Fold 2 Validation: Accuracy=0.9649, Precision=0.9756, Recall=0.9302, F1=0.9524
Confusion Matrices saved: cm_train_fold2.png & cm_val_fold2.png





Fold 3 Training: Accuracy=0.9965, Precision=1.0000, Recall=0.9930, F1=0.9965
Fold 3 Validation: Accuracy=0.9825, Precision=1.0000, Recall=0.9524, F1=0.9756
Confusion Matrices saved: cm_train_fold3.png & cm_val_fold3.png





Fold 4 Training: Accuracy=0.9930, Precision=1.0000, Recall=0.9860, F1=0.9929
Fold 4 Validation: Accuracy=0.9825, Precision=0.9545, Recall=1.0000, F1=0.9767
Confusion Matrices saved: cm_train_fold4.png & cm_val_fold4.png





Fold 5 Training: Accuracy=0.9948, Precision=1.0000, Recall=0.9895, F1=0.9947
Fold 5 Validation: Accuracy=0.9912, Precision=1.0000, Recall=0.9762, F1=0.9880
Confusion Matrices saved: cm_train_fold5.png & cm_val_fold5.png

Accuracy:  0.9944 ± 0.0013
Precision: 1.0000 ± 0.0000
Recall:    0.9888 ± 0.0026
F1:        0.9944 ± 0.0013

Accuracy:  0.9807 ± 0.0086
Precision: 0.9771 ± 0.0201
Recall:    0.9718 ± 0.0273
F1:        0.9740 ± 0.0117
