In [1]:
!git clone https://github.com/SamsungSAILMontreal/ForestDiffusion
!pip install ForestDiffusion


Cloning into 'ForestDiffusion'...
remote: Enumerating objects: 447, done.[K
remote: Counting objects: 100% (148/148), done.[K
remote: Compressing objects: 100% (63/63), done.[K
remote: Total 447 (delta 82), reused 129 (delta 72), pack-reused 299 (from 1)[K
Receiving objects: 100% (447/447), 901.47 KiB | 26.51 MiB/s, done.
Resolving deltas: 100% (215/215), done.
Collecting ForestDiffusion
  Downloading ForestDiffusion-1.0.6-py3-none-any.whl.metadata (1.7 kB)
Downloading ForestDiffusion-1.0.6-py3-none-any.whl (14 kB)
Installing collected packages: ForestDiffusion
Successfully installed ForestDiffusion-1.0.6


In [2]:
!cp /kaggle/input/oil-spill-detection/oil_spill.csv /kaggle/working/

# ##Equal Opportunity

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, recall_score, f1_score, confusion_matrix
from ForestDiffusion import ForestDiffusionModel

# Load data
data = pd.read_csv('/kaggle/working/oil_spill.csv')

# Extract features and target
X = data.iloc[:, 1:-1].values
y = data.iloc[:, -1].values

# Check initial class distribution
unique, counts = np.unique(y, return_counts=True)
class_dist_before = dict(zip(unique, counts))
print(f"Class distribution before augmentation: {class_dist_before}")

# Separate minority class
X_minority = X[y == 1]

# Split data into training and testing sets
X_train_orig, X_test, y_train_orig, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Train model on original data
clf_orig = RandomForestClassifier(random_state=42)
clf_orig.fit(X_train_orig, y_train_orig)

# Predict on test set (original data)
y_pred_orig = clf_orig.predict(X_test)
recall_orig = recall_score(y_test, y_pred_orig, pos_label=1)
f1_orig = f1_score(y_test, y_pred_orig, pos_label=1)
print(f"Recall score (original data): {recall_orig:.4f}")
print(f"F1 score (original data): {f1_orig:.4f}")

# Define TPR calculation function
def calculate_tpr(y_true, y_pred, target_class):
    cm = confusion_matrix(y_true, y_pred, labels=[-1, 1])  # Specify class order
    index = 1 if target_class == 1 else 0  # Map 1 -> minority, -1 -> majority
    tp = cm[index, index]  # True Positives
    fn = cm[index, :].sum() - tp  # False Negatives
    tpr = tp / (tp + fn) if (tp + fn) > 0 else 0
    return tpr

# Calculate TPR before augmentation
tpr_orig_minority = calculate_tpr(y_test, y_pred_orig, target_class=1)
tpr_orig_majority = calculate_tpr(y_test, y_pred_orig, target_class=-1)
print(f"TPR (minority class) before augmentation: {tpr_orig_minority:.4f}")
print(f"TPR (majority class) before augmentation: {tpr_orig_majority:.4f}")

# Augment data using Forest Diffusion
int_indexes = [i for i in range(X.shape[1]) if np.issubdtype(X[:, i].dtype, np.integer)]
forest_model = ForestDiffusionModel(X_minority, label_y=None, n_t=50, duplicate_K=100,
                                    bin_indexes=[], cat_indexes=[], int_indexes=int_indexes,
                                    diffusion_type='flow', n_jobs=-1)

X_minority_fake = forest_model.generate(batch_size=len(X) // 5)

# Combine original and synthetic data
X_balanced = np.concatenate((X, X_minority_fake), axis=0)
y_balanced = np.concatenate((y, np.ones(X_minority_fake.shape[0])), axis=0)

# Check class distribution after augmentation
unique, counts = np.unique(y_balanced, return_counts=True)
class_dist_after = dict(zip(unique, counts))
print(f"Class distribution after augmentation: {class_dist_after}")

# Split augmented data into training and testing sets
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42, stratify=y_balanced)

# Train model on augmented data
clf_bal = RandomForestClassifier(random_state=42)
clf_bal.fit(X_train_bal, y_train_bal)

# Predict on test set (augmented data)
y_pred_bal = clf_bal.predict(X_test)
recall_bal = recall_score(y_test, y_pred_bal, pos_label=1)
f1_bal = f1_score(y_test, y_pred_bal, pos_label=1)
print(f"Recall score (generated data): {recall_bal:.4f}")
print(f"F1 score (generated data): {f1_bal:.4f}")

# Calculate TPR after augmentation
tpr_bal_minority = calculate_tpr(y_test, y_pred_bal, target_class=1)
tpr_bal_majority = calculate_tpr(y_test, y_pred_bal, target_class=-1)
print(f"TPR (minority class) after augmentation: {tpr_bal_minority:.4f}")
print(f"TPR (majority class) after augmentation: {tpr_bal_majority:.4f}")

# Classification reports
print("Classification Report (original data):\n", classification_report(y_test, y_pred_orig))
print("Classification Report (generated data):\n", classification_report(y_test, y_pred_bal))


Class distribution before augmentation: {0: 896, 1: 41}
Recall score (original data): 0.2500
F1 score (original data): 0.3750
TPR (minority class) before augmentation: 1.0000
TPR (majority class) before augmentation: 0.0000
Class distribution after augmentation: {0.0: 896, 1.0: 228}
Recall score (generated data): 0.9167
F1 score (generated data): 0.9167
TPR (minority class) after augmentation: 1.0000
TPR (majority class) after augmentation: 0.0000
Classification Report (original data):
               precision    recall  f1-score   support

           0       0.97      1.00      0.98       270
           1       0.75      0.25      0.38        12

    accuracy                           0.96       282
   macro avg       0.86      0.62      0.68       282
weighted avg       0.96      0.96      0.96       282

Classification Report (generated data):
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       270
           1       0.92      0.

# RandomForest

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, recall_score, f1_score, confusion_matrix
from ForestDiffusion import ForestDiffusionModel

# Load data
data = pd.read_csv('/kaggle/working/oil_spill.csv')

# Extract features and target
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

# Check initial class distribution
unique, counts = np.unique(y, return_counts=True)
class_dist_before = dict(zip(unique, counts))
print(f"Class distribution before augmentation: {class_dist_before}")

# Separate minority class
X_minority = X[y == 1]

# Split data into training and testing sets
X_train_orig, X_test, y_train_orig, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Train model on original data
clf_orig = RandomForestClassifier(random_state=42)
clf_orig.fit(X_train_orig, y_train_orig)

# Predict on test set (original data)
y_pred_orig = clf_orig.predict(X_test)
recall_orig = recall_score(y_test, y_pred_orig, pos_label=1)
f1_orig = f1_score(y_test, y_pred_orig, pos_label=1)
print(f"Recall score (original data): {recall_orig:.4f}")
print(f"F1 score (original data): {f1_orig:.4f}")

# Define TPR calculation function
def calculate_tpr(y_true, y_pred, target_class):
    """
    Calculate the True Positive Rate (TPR) for the specified class.

    Parameters:
    - y_true: Ground truth labels
    - y_pred: Predicted labels
    - target_class: The class for which to calculate TPR

    Returns:
    - TPR value
    """
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])  # Use [0, 1] since 0 is majority and 1 is minority
    index = 1 if target_class == 1 else 0  # If target_class is 1 (minority), index is 1
    tp = cm[index, index]  # True Positives
    fn = cm[index, :].sum() - tp  # False Negatives
    tpr = tp / (tp + fn) if (tp + fn) > 0 else 0  # Avoid division by zero
    return tpr

# Calculate TPR before augmentation
tpr_orig_minority = calculate_tpr(y_test, y_pred_orig, target_class=1)
tpr_orig_majority = calculate_tpr(y_test, y_pred_orig, target_class=0)
print(f"TPR (minority class) before augmentation: {tpr_orig_minority:.4f}")
print(f"TPR (majority class) before augmentation: {tpr_orig_majority:.4f}")

# Augment data using Forest Diffusion
int_indexes = [i for i in range(X.shape[1]) if np.issubdtype(X[:, i].dtype, np.integer)]
forest_model = ForestDiffusionModel(X_minority, label_y=None, n_t=50, duplicate_K=100,
                                    bin_indexes=[], cat_indexes=[], int_indexes=int_indexes,
                                    diffusion_type='flow', n_jobs=-1)

X_minority_fake = forest_model.generate(batch_size=len(X) // 5)

# Combine original and synthetic data
X_balanced = np.concatenate((X, X_minority_fake), axis=0)
y_balanced = np.concatenate((y, np.ones(X_minority_fake.shape[0], dtype=int)), axis=0)

# Check class distribution after augmentation
unique, counts = np.unique(y_balanced, return_counts=True)
class_dist_after = dict(zip(unique, counts))
print(f"Class distribution after augmentation: {class_dist_after}")

# Split augmented data into training and testing sets
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42, stratify=y_balanced)

# Train model on augmented data
clf_bal = RandomForestClassifier(random_state=42)
clf_bal.fit(X_train_bal, y_train_bal)

# Predict on test set (augmented data)
y_pred_bal = clf_bal.predict(X_test)
recall_bal = recall_score(y_test, y_pred_bal, pos_label=1)
f1_bal = f1_score(y_test, y_pred_bal, pos_label=1)
print(f"Recall score (generated data): {recall_bal:.4f}")
print(f"F1 score (generated data): {f1_bal:.4f}")

# Calculate TPR after augmentation
tpr_bal_minority = calculate_tpr(y_test, y_pred_bal, target_class=1)
tpr_bal_majority = calculate_tpr(y_test, y_pred_bal, target_class=0)
print(f"TPR (minority class) after augmentation: {tpr_bal_minority:.4f}")
print(f"TPR (majority class) after augmentation: {tpr_bal_majority:.4f}")

# Classification reports
print("Classification Report (original data):\n", classification_report(y_test, y_pred_orig))
print("Classification Report (generated data):\n", classification_report(y_test, y_pred_bal))


Class distribution before augmentation: {0: 896, 1: 41}
Recall score (original data): 0.3333
F1 score (original data): 0.4706
TPR (minority class) before augmentation: 0.3333
TPR (majority class) before augmentation: 0.9963
Class distribution after augmentation: {0: 896, 1: 228}
Recall score (generated data): 0.8333
F1 score (generated data): 0.8333
TPR (minority class) after augmentation: 0.8333
TPR (majority class) after augmentation: 0.9926
Classification Report (original data):
               precision    recall  f1-score   support

           0       0.97      1.00      0.98       270
           1       0.80      0.33      0.47        12

    accuracy                           0.97       282
   macro avg       0.89      0.66      0.73       282
weighted avg       0.96      0.97      0.96       282

Classification Report (generated data):
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       270
           1       0.83      0.83  

# XGBoost

In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, recall_score, f1_score, confusion_matrix
import xgboost as xgb
from ForestDiffusion import ForestDiffusionModel

# Load data
data = pd.read_csv('/kaggle/working/oil_spill.csv')

# Extract features and target
X = data.iloc[:, 1:-1].values
y = data.iloc[:, -1].values

# Check initial class distribution
unique, counts = np.unique(y, return_counts=True)
class_dist_before = dict(zip(unique, counts))
print(f"Class distribution before augmentation: {class_dist_before}")

# Separate minority class
X_minority = X[y == 1]

# Split data into training and testing sets
X_train_orig, X_test, y_train_orig, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Train model on original data using XGBoost
clf_orig = xgb.XGBClassifier(random_state=42)
clf_orig.fit(X_train_orig, y_train_orig)

# Predict on test set (original data)
y_pred_orig = clf_orig.predict(X_test)
recall_orig = recall_score(y_test, y_pred_orig, pos_label=1)
f1_orig = f1_score(y_test, y_pred_orig, pos_label=1)
print(f"Recall score (original data): {recall_orig:.4f}")
print(f"F1 score (original data): {f1_orig:.4f}")

# Define TPR calculation function
def calculate_tpr(y_true, y_pred, target_class):
    """
    Calculate the True Positive Rate (TPR) for the specified class.

    Parameters:
    - y_true: Ground truth labels
    - y_pred: Predicted labels
    - target_class: The class for which to calculate TPR

    Returns:
    - TPR value
    """
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])  # Use [0, 1] since 0 is majority and 1 is minority
    index = 1 if target_class == 1 else 0  # If target_class is 1 (minority), index is 1
    tp = cm[index, index]  # True Positives
    fn = cm[index, :].sum() - tp  # False Negatives
    tpr = tp / (tp + fn) if (tp + fn) > 0 else 0  # Avoid division by zero
    return tpr

# Calculate TPR before augmentation
tpr_orig_minority = calculate_tpr(y_test, y_pred_orig, target_class=1)
tpr_orig_majority = calculate_tpr(y_test, y_pred_orig, target_class=0)
print(f"TPR (minority class) before augmentation: {tpr_orig_minority:.4f}")
print(f"TPR (majority class) before augmentation: {tpr_orig_majority:.4f}")

# Augment data using Forest Diffusion
int_indexes = [i for i in range(X.shape[1]) if np.issubdtype(X[:, i].dtype, np.integer)]
forest_model = ForestDiffusionModel(X_minority, label_y=None, n_t=50, duplicate_K=100,
                                    bin_indexes=[], cat_indexes=[], int_indexes=int_indexes,
                                    diffusion_type='flow', n_jobs=-1)

X_minority_fake = forest_model.generate(batch_size=len(X) // 5)

# Combine original and synthetic data
X_balanced = np.concatenate((X, X_minority_fake), axis=0)
y_balanced = np.concatenate((y, np.ones(X_minority_fake.shape[0], dtype=int)), axis=0)

# Check class distribution after augmentation
unique, counts = np.unique(y_balanced, return_counts=True)
class_dist_after = dict(zip(unique, counts))
print(f"Class distribution after augmentation: {class_dist_after}")

# Split augmented data into training and testing sets
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42, stratify=y_balanced)

# Train model on augmented data using XGBoost
clf_bal = xgb.XGBClassifier(random_state=42)
clf_bal.fit(X_train_bal, y_train_bal)

# Predict on test set (augmented data)
y_pred_bal = clf_bal.predict(X_test)
recall_bal = recall_score(y_test, y_pred_bal, pos_label=1)
f1_bal = f1_score(y_test, y_pred_bal, pos_label=1)
print(f"Recall score (generated data): {recall_bal:.4f}")
print(f"F1 score (generated data): {f1_bal:.4f}")

# Calculate TPR after augmentation
tpr_bal_minority = calculate_tpr(y_test, y_pred_bal, target_class=1)
tpr_bal_majority = calculate_tpr(y_test, y_pred_bal, target_class=0)
print(f"TPR (minority class) after augmentation: {tpr_bal_minority:.4f}")
print(f"TPR (majority class) after augmentation: {tpr_bal_majority:.4f}")

# Classification reports
print("Classification Report (original data):\n", classification_report(y_test, y_pred_orig))
print("Classification Report (generated data):\n", classification_report(y_test, y_pred_bal))


Class distribution before augmentation: {0: 896, 1: 41}
Recall score (original data): 0.3333
F1 score (original data): 0.4706
TPR (minority class) before augmentation: 0.3333
TPR (majority class) before augmentation: 0.9963
Class distribution after augmentation: {0: 896, 1: 228}
Recall score (generated data): 0.7500
F1 score (generated data): 0.7500
TPR (minority class) after augmentation: 0.7500
TPR (majority class) after augmentation: 0.9889
Classification Report (original data):
               precision    recall  f1-score   support

           0       0.97      1.00      0.98       270
           1       0.80      0.33      0.47        12

    accuracy                           0.97       282
   macro avg       0.89      0.66      0.73       282
weighted avg       0.96      0.97      0.96       282

Classification Report (generated data):
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       270
           1       0.75      0.75  

# Decision Tree

In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, recall_score, f1_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from ForestDiffusion import ForestDiffusionModel

# Load data
data = pd.read_csv('/kaggle/working/oil_spill.csv')

# Extract features and target
X = data.iloc[:, 1:-1].values
y = data.iloc[:, -1].values

# Check initial class distribution
unique, counts = np.unique(y, return_counts=True)
class_dist_before = dict(zip(unique, counts))
print(f"Class distribution before augmentation: {class_dist_before}")

# Separate minority class
X_minority = X[y == 1]

# Split data into training and testing sets
X_train_orig, X_test, y_train_orig, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Train model on original data using DecisionTreeClassifier
clf_orig = DecisionTreeClassifier(random_state=42)
clf_orig.fit(X_train_orig, y_train_orig)

# Predict on test set (original data)
y_pred_orig = clf_orig.predict(X_test)
recall_orig = recall_score(y_test, y_pred_orig, pos_label=1)
f1_orig = f1_score(y_test, y_pred_orig, pos_label=1)
print(f"Recall score (original data): {recall_orig:.4f}")
print(f"F1 score (original data): {f1_orig:.4f}")

# Define TPR calculation function
def calculate_tpr(y_true, y_pred, target_class):
    """
    Calculate the True Positive Rate (TPR) for the specified class.

    Parameters:
    - y_true: Ground truth labels
    - y_pred: Predicted labels
    - target_class: The class for which to calculate TPR

    Returns:
    - TPR value
    """
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])  # Use [0, 1] since 0 is majority and 1 is minority
    index = 1 if target_class == 1 else 0  # If target_class is 1 (minority), index is 1
    tp = cm[index, index]  # True Positives
    fn = cm[index, :].sum() - tp  # False Negatives
    tpr = tp / (tp + fn) if (tp + fn) > 0 else 0  # Avoid division by zero
    return tpr

# Calculate TPR before augmentation
tpr_orig_minority = calculate_tpr(y_test, y_pred_orig, target_class=1)
tpr_orig_majority = calculate_tpr(y_test, y_pred_orig, target_class=0)
print(f"TPR (minority class) before augmentation: {tpr_orig_minority:.4f}")
print(f"TPR (majority class) before augmentation: {tpr_orig_majority:.4f}")

# Augment data using Forest Diffusion
int_indexes = [i for i in range(X.shape[1]) if np.issubdtype(X[:, i].dtype, np.integer)]
forest_model = ForestDiffusionModel(X_minority, label_y=None, n_t=50, duplicate_K=100,
                                    bin_indexes=[], cat_indexes=[], int_indexes=int_indexes,
                                    diffusion_type='flow', n_jobs=-1)

X_minority_fake = forest_model.generate(batch_size=len(X) // 5)

# Combine original and synthetic data
X_balanced = np.concatenate((X, X_minority_fake), axis=0)
y_balanced = np.concatenate((y, np.ones(X_minority_fake.shape[0], dtype=int)), axis=0)

# Check class distribution after augmentation
unique, counts = np.unique(y_balanced, return_counts=True)
class_dist_after = dict(zip(unique, counts))
print(f"Class distribution after augmentation: {class_dist_after}")

# Split augmented data into training and testing sets
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42, stratify=y_balanced)

# Train model on augmented data using DecisionTreeClassifier
clf_bal = DecisionTreeClassifier(random_state=42)
clf_bal.fit(X_train_bal, y_train_bal)

# Predict on test set (augmented data)
y_pred_bal = clf_bal.predict(X_test)
recall_bal = recall_score(y_test, y_pred_bal, pos_label=1)
f1_bal = f1_score(y_test, y_pred_bal, pos_label=1)
print(f"Recall score (generated data): {recall_bal:.4f}")
print(f"F1 score (generated data): {f1_bal:.4f}")

# Calculate TPR after augmentation
tpr_bal_minority = calculate_tpr(y_test, y_pred_bal, target_class=1)
tpr_bal_majority = calculate_tpr(y_test, y_pred_bal, target_class=0)
print(f"TPR (minority class) after augmentation: {tpr_bal_minority:.4f}")
print(f"TPR (majority class) after augmentation: {tpr_bal_majority:.4f}")

# Classification reports
print("Classification Report (original data):\n", classification_report(y_test, y_pred_orig))
print("Classification Report (generated data):\n", classification_report(y_test, y_pred_bal))


Class distribution before augmentation: {0: 896, 1: 41}
Recall score (original data): 0.3333
F1 score (original data): 0.3810
TPR (minority class) before augmentation: 0.3333
TPR (majority class) before augmentation: 0.9815
Class distribution after augmentation: {0: 896, 1: 228}
Recall score (generated data): 0.7500
F1 score (generated data): 0.5455
TPR (minority class) after augmentation: 0.7500
TPR (majority class) after augmentation: 0.9556
Classification Report (original data):
               precision    recall  f1-score   support

           0       0.97      0.98      0.98       270
           1       0.44      0.33      0.38        12

    accuracy                           0.95       282
   macro avg       0.71      0.66      0.68       282
weighted avg       0.95      0.95      0.95       282

Classification Report (generated data):
               precision    recall  f1-score   support

           0       0.99      0.96      0.97       270
           1       0.43      0.75  

# #Mammography Dataset

In [12]:
from collections import Counter
from imblearn.datasets import fetch_datasets
mammography = fetch_datasets()['mammography']
mammography.data.shape
print(sorted(Counter(mammography.target).items()))

[(-1, 10923), (1, 260)]


# RandomForest

In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, recall_score, f1_score, confusion_matrix
from ForestDiffusion import ForestDiffusionModel
from imblearn.datasets import fetch_datasets

# Load data
data = fetch_datasets()['mammography']

# Extract features and target
X = data['data']
y = data['target']


# Check initial class distribution
unique, counts = np.unique(y, return_counts=True)
class_dist_before = dict(zip(unique, counts))
print(f"Class distribution before augmentation: {class_dist_before}")

# Separate minority class
X_minority = X[y == 1]

# Split data into training and testing sets
X_train_orig, X_test, y_train_orig, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Train model on original data
clf_orig = RandomForestClassifier(random_state=42)
clf_orig.fit(X_train_orig, y_train_orig)

# Predict on test set (original data)
y_pred_orig = clf_orig.predict(X_test)
recall_orig = recall_score(y_test, y_pred_orig, pos_label=1)
f1_orig = f1_score(y_test, y_pred_orig, pos_label=1)
print(f"Recall score (original data): {recall_orig:.4f}")
print(f"F1 score (original data): {f1_orig:.4f}")

# Define TPR calculation function
def calculate_tpr(y_true, y_pred, target_class):
    """
    Calculate the True Positive Rate (TPR) for the specified class.

    Parameters:
    - y_true: Ground truth labels
    - y_pred: Predicted labels
    - target_class: The class for which to calculate TPR

    Returns:
    - TPR value
    """
    cm = confusion_matrix(y_true, y_pred, labels=[-1, 1])  # Use [-1, 1] since -1 is majority and 1 is minority
    index = 1 if target_class == 1 else 0  # If target_class is 1 (minority), index is 1
    tp = cm[index, index]  # True Positives
    fn = cm[index, :].sum() - tp  # False Negatives
    tpr = tp / (tp + fn) if (tp + fn) > 0 else 0  # Avoid division by zero
    return tpr

# Calculate TPR before augmentation
tpr_orig_minority = calculate_tpr(y_test, y_pred_orig, target_class=1)
tpr_orig_majority = calculate_tpr(y_test, y_pred_orig, target_class=-1)
print(f"TPR (minority class) before augmentation: {tpr_orig_minority:.4f}")
print(f"TPR (majority class) before augmentation: {tpr_orig_majority:.4f}")

# Augment data using Forest Diffusion
#int_indexes = [i for i in range(X.shape[1]) if np.issubdtype(X[:, i].dtype, np.integer)]
forest_model = ForestDiffusionModel(X_minority, label_y=None, n_t=50, duplicate_K=100,
                                    bin_indexes=[], cat_indexes=[], int_indexes=[],
                                    diffusion_type='flow', n_jobs=-1)

X_minority_fake = forest_model.generate(batch_size=len(X) // 5)

# Combine original and synthetic data
X_balanced = np.concatenate((X, X_minority_fake), axis=0)
y_balanced = np.concatenate((y, np.ones(X_minority_fake.shape[0], dtype=int)), axis=0)

# Check class distribution after augmentation
unique, counts = np.unique(y_balanced, return_counts=True)
class_dist_after = dict(zip(unique, counts))
print(f"Class distribution after augmentation: {class_dist_after}")

print(f"Number of synthetic samples generated: {X_minority_fake.shape[0]}")


# Split augmented data into training and testing sets
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42, stratify=y_balanced)

# Train model on augmented data
clf_bal = RandomForestClassifier(random_state=42)
clf_bal.fit(X_train_bal, y_train_bal)

# Predict on test set (augmented data)
y_pred_bal = clf_bal.predict(X_test)
recall_bal = recall_score(y_test, y_pred_bal, pos_label=1)
f1_bal = f1_score(y_test, y_pred_bal, pos_label=1)
print(f"Recall score (generated data): {recall_bal:.4f}")
print(f"F1 score (generated data): {f1_bal:.4f}")

# Calculate TPR after augmentation
tpr_bal_minority = calculate_tpr(y_test, y_pred_bal, target_class=1)
tpr_bal_majority = calculate_tpr(y_test, y_pred_bal, target_class=-1)
print(f"TPR (minority class) after augmentation: {tpr_bal_minority:.4f}")
print(f"TPR (majority class) after augmentation: {tpr_bal_majority:.4f}")

# Classification reports
print("Classification Report (original data):\n", classification_report(y_test, y_pred_orig))
print("Classification Report (generated data):\n", classification_report(y_test, y_pred_bal))


Class distribution before augmentation: {-1: 10923, 1: 260}
Recall score (original data): 0.5000
F1 score (original data): 0.6240
TPR (minority class) before augmentation: 0.5000
TPR (majority class) before augmentation: 0.9976
Class distribution after augmentation: {-1: 10923, 1: 2496}
Number of synthetic samples generated: 2236
Recall score (generated data): 0.9103
F1 score (generated data): 0.7889
TPR (minority class) after augmentation: 0.9103
TPR (majority class) after augmentation: 0.9905
Classification Report (original data):
               precision    recall  f1-score   support

          -1       0.99      1.00      0.99      3277
           1       0.83      0.50      0.62        78

    accuracy                           0.99      3355
   macro avg       0.91      0.75      0.81      3355
weighted avg       0.98      0.99      0.98      3355

Classification Report (generated data):
               precision    recall  f1-score   support

          -1       1.00      0.99    

# XGBoost

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, recall_score, f1_score, confusion_matrix
from xgboost import XGBClassifier
from ForestDiffusion import ForestDiffusionModel
from imblearn.datasets import fetch_datasets

# Load data
data = fetch_datasets()['mammography']

# Extract features and target
X = data['data']
y = data['target']

# Remap target labels from [-1, 1] to [0, 1]
y = np.where(y == -1, 0, 1)

# Check initial class distribution
unique, counts = np.unique(y, return_counts=True)
class_dist_before = dict(zip(unique, counts))
print(f"Class distribution before augmentation: {class_dist_before}")

# Separate minority class
X_minority = X[y == 1]

# Split data into training and testing sets
X_train_orig, X_test, y_train_orig, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Train model on original data
clf_orig = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric="logloss")
clf_orig.fit(X_train_orig, y_train_orig)

# Predict on test set (original data)
y_pred_orig = clf_orig.predict(X_test)
recall_orig = recall_score(y_test, y_pred_orig, pos_label=1)
f1_orig = f1_score(y_test, y_pred_orig, pos_label=1)
print(f"Recall score (original data): {recall_orig:.4f}")
print(f"F1 score (original data): {f1_orig:.4f}")

# Define TPR calculation function
def calculate_tpr(y_true, y_pred, target_class):
    """
    Calculate the True Positive Rate (TPR) for the specified class.

    Parameters:
    - y_true: Ground truth labels
    - y_pred: Predicted labels
    - target_class: The class for which to calculate TPR

    Returns:
    - TPR value
    """
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])  # Use [0, 1] since labels were remapped
    index = 1 if target_class == 1 else 0  # If target_class is 1 (minority), index is 1
    tp = cm[index, index]  # True Positives
    fn = cm[index, :].sum() - tp  # False Negatives
    tpr = tp / (tp + fn) if (tp + fn) > 0 else 0  # Avoid division by zero
    return tpr

# Calculate TPR before augmentation
tpr_orig_minority = calculate_tpr(y_test, y_pred_orig, target_class=1)
tpr_orig_majority = calculate_tpr(y_test, y_pred_orig, target_class=0)
print(f"TPR (minority class) before augmentation: {tpr_orig_minority:.4f}")
print(f"TPR (majority class) before augmentation: {tpr_orig_majority:.4f}")

# Augment data using Forest Diffusion
#int_indexes = [i for i in range(X.shape[1]) if np.issubdtype(X[:, i].dtype, np.integer)]
forest_model = ForestDiffusionModel(X_minority, label_y=None, n_t=50, duplicate_K=100,
                                    bin_indexes=[], cat_indexes=[], int_indexes=[],
                                    diffusion_type='flow', n_jobs=-1)

X_minority_fake = forest_model.generate(batch_size=len(X) // 5)

# Print the number of generated samples
print(f"Number of synthetic samples generated: {X_minority_fake.shape[0]}")

# Combine original and synthetic data
X_balanced = np.concatenate((X, X_minority_fake), axis=0)
y_balanced = np.concatenate((y, np.ones(X_minority_fake.shape[0], dtype=int)), axis=0)

# Check class distribution after augmentation
unique, counts = np.unique(y_balanced, return_counts=True)
class_dist_after = dict(zip(unique, counts))
print(f"Class distribution after augmentation: {class_dist_after}")

# Split augmented data into training and testing sets
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42, stratify=y_balanced)

# Train model on augmented data
clf_bal = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric="logloss")
clf_bal.fit(X_train_bal, y_train_bal)

# Predict on test set (augmented data)
y_pred_bal = clf_bal.predict(X_test)
recall_bal = recall_score(y_test, y_pred_bal, pos_label=1)
f1_bal = f1_score(y_test, y_pred_bal, pos_label=1)
print(f"Recall score (generated data): {recall_bal:.4f}")
print(f"F1 score (generated data): {f1_bal:.4f}")

# Calculate TPR after augmentation
tpr_bal_minority = calculate_tpr(y_test, y_pred_bal, target_class=1)
tpr_bal_majority = calculate_tpr(y_test, y_pred_bal, target_class=0)
print(f"TPR (minority class) after augmentation: {tpr_bal_minority:.4f}")
print(f"TPR (majority class) after augmentation: {tpr_bal_majority:.4f}")

# Classification reports
print("Classification Report (original data):\n", classification_report(y_test, y_pred_orig))
print("Classification Report (generated data):\n", classification_report(y_test, y_pred_bal))


Class distribution before augmentation: {0: 10923, 1: 260}
Recall score (original data): 0.6410
F1 score (original data): 0.7042
TPR (minority class) before augmentation: 0.6410
TPR (majority class) before augmentation: 0.9957
Number of synthetic samples generated: 2236
Class distribution after augmentation: {0: 10923, 1: 2496}
Recall score (generated data): 0.9103
F1 score (generated data): 0.8161
TPR (minority class) after augmentation: 0.9103
TPR (majority class) after augmentation: 0.9924
Classification Report (original data):
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      3277
           1       0.78      0.64      0.70        78

    accuracy                           0.99      3355
   macro avg       0.89      0.82      0.85      3355
weighted avg       0.99      0.99      0.99      3355

Classification Report (generated data):
               precision    recall  f1-score   support

           0       1.00      0.99      

# ##Equalized Odds

# Mammography Dataset

# RandomForest

In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, recall_score, f1_score
from ForestDiffusion import ForestDiffusionModel
from imblearn.datasets import fetch_datasets

# Load data
data = fetch_datasets()['mammography']

# Extract features and target
X = data['data']
y = data['target']

# Check initial class distribution
unique, counts = np.unique(y, return_counts=True)
class_dist_before = dict(zip(unique, counts))
print(f"Class distribution before augmentation: {class_dist_before}")

# Separate minority class
X_minority = X[y == 1]

# Split data into training and testing sets
X_train_orig, X_test, y_train_orig, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Train model on original data
clf_orig = RandomForestClassifier(random_state=42)
clf_orig.fit(X_train_orig, y_train_orig)

# Predict on test set (original data)
y_pred_orig = clf_orig.predict(X_test)

# Define function to calculate TPR and FPR
def calculate_tpr_fpr(y_true, y_pred, target_class):
    """
    Calculate TPR and FPR for the specified class.

    Parameters:
    - y_true: Ground truth labels
    - y_pred: Predicted labels
    - target_class: The class for which to calculate TPR and FPR

    Returns:
    - TPR and FPR values
    """
    cm = confusion_matrix(y_true, y_pred, labels=[-1, 1])  # Majority is -1, Minority is 1
    index = 1 if target_class == 1 else 0  # If target_class is 1 (minority), index is 1
    tp = cm[index, index]  # True Positives
    fn = cm[index, :].sum() - tp  # False Negatives
    fp = cm[:, index].sum() - tp  # False Positives
    tn = cm.sum() - (tp + fn + fp)  # True Negatives
    
    tpr = tp / (tp + fn) if (tp + fn) > 0 else 0
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    return tpr, fpr

# Calculate TPR and FPR before augmentation
tpr_orig_minority, fpr_orig_minority = calculate_tpr_fpr(y_test, y_pred_orig, target_class=1)
tpr_orig_majority, fpr_orig_majority = calculate_tpr_fpr(y_test, y_pred_orig, target_class=-1)
print(f"Before augmentation: TPR (minority) = {tpr_orig_minority:.4f}, FPR (minority) = {fpr_orig_minority:.4f}")
print(f"Before augmentation: TPR (majority) = {tpr_orig_majority:.4f}, FPR (majority) = {fpr_orig_majority:.4f}")

# Augment data using Forest Diffusion
#int_indexes = [i for i in range(X.shape[1]) if np.issubdtype(X[:, i].dtype, np.integer)]
forest_model = ForestDiffusionModel(X_minority, label_y=None, n_t=50, duplicate_K=100,
                                    bin_indexes=[], cat_indexes=[], int_indexes=[],
                                    diffusion_type='flow', n_jobs=-1)

X_minority_fake = forest_model.generate(batch_size=len(X) // 5)

# Combine original and synthetic data
X_balanced = np.concatenate((X, X_minority_fake), axis=0)
y_balanced = np.concatenate((y, np.ones(X_minority_fake.shape[0], dtype=int)), axis=0)

# Split augmented data into training and testing sets
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42, stratify=y_balanced)

# Train model on augmented data
clf_bal = RandomForestClassifier(random_state=42)
clf_bal.fit(X_train_bal, y_train_bal)

# Predict on test set (augmented data)
y_pred_bal = clf_bal.predict(X_test)

# Calculate TPR and FPR after augmentation
tpr_bal_minority, fpr_bal_minority = calculate_tpr_fpr(y_test, y_pred_bal, target_class=1)
tpr_bal_majority, fpr_bal_majority = calculate_tpr_fpr(y_test, y_pred_bal, target_class=-1)
print(f"After augmentation: TPR (minority) = {tpr_bal_minority:.4f}, FPR (minority) = {fpr_bal_minority:.4f}")
print(f"After augmentation: TPR (majority) = {tpr_bal_majority:.4f}, FPR (majority) = {fpr_bal_majority:.4f}")

# Classification reports
print("Classification Report (original data):\n", classification_report(y_test, y_pred_orig))
print("Classification Report (generated data):\n", classification_report(y_test, y_pred_bal))

# Evaluate Equalized Odds
def evaluate_equalized_odds(tpr1, fpr1, tpr2, fpr2, tolerance=0.05):
    """
    Evaluate if Equalized Odds condition is met.

    Parameters:
    - tpr1, fpr1: TPR and FPR for group 1 (minority)
    - tpr2, fpr2: TPR and FPR for group 2 (majority)
    - tolerance: Allowed difference between groups

    Returns:
    - Boolean indicating if Equalized Odds is satisfied
    """
    tpr_diff = abs(tpr1 - tpr2)
    fpr_diff = abs(fpr1 - fpr2)
    return tpr_diff <= tolerance and fpr_diff <= tolerance

equalized_odds_before = evaluate_equalized_odds(tpr_orig_minority, fpr_orig_minority, tpr_orig_majority, fpr_orig_majority)
equalized_odds_after = evaluate_equalized_odds(tpr_bal_minority, fpr_bal_minority, tpr_bal_majority, fpr_bal_majority)

print(f"Equalized Odds satisfied before augmentation: {equalized_odds_before}")
print(f"Equalized Odds satisfied after augmentation: {equalized_odds_after}")


Class distribution before augmentation: {-1: 10923, 1: 260}
Before augmentation: TPR (minority) = 0.5000, FPR (minority) = 0.0024
Before augmentation: TPR (majority) = 0.9976, FPR (majority) = 0.5000
After augmentation: TPR (minority) = 0.9103, FPR (minority) = 0.0095
After augmentation: TPR (majority) = 0.9905, FPR (majority) = 0.0897
Classification Report (original data):
               precision    recall  f1-score   support

          -1       0.99      1.00      0.99      3277
           1       0.83      0.50      0.62        78

    accuracy                           0.99      3355
   macro avg       0.91      0.75      0.81      3355
weighted avg       0.98      0.99      0.98      3355

Classification Report (generated data):
               precision    recall  f1-score   support

          -1       1.00      0.99      0.99      3277
           1       0.70      0.91      0.79        78

    accuracy                           0.99      3355
   macro avg       0.85      0.95   

# ##Demographic Parity

In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, recall_score, f1_score
from ForestDiffusion import ForestDiffusionModel
from imblearn.datasets import fetch_datasets

# Load data
data = fetch_datasets()['mammography']

# Extract features and target
X = data['data']
y = data['target']

# Check initial class distribution
unique, counts = np.unique(y, return_counts=True)
class_dist_before = dict(zip(unique, counts))
print(f"Class distribution before augmentation: {class_dist_before}")

# Separate minority class
X_minority = X[y == 1]

# Split data into training and testing sets
X_train_orig, X_test, y_train_orig, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Train model on original data
clf_orig = RandomForestClassifier(random_state=42)
clf_orig.fit(X_train_orig, y_train_orig)

# Predict on test set (original data)
y_pred_orig = clf_orig.predict(X_test)

# Define function to calculate TPR and FPR
def calculate_tpr_fpr(y_true, y_pred, target_class):
    """
    Calculate TPR and FPR for the specified class.

    Parameters:
    - y_true: Ground truth labels
    - y_pred: Predicted labels
    - target_class: The class for which to calculate TPR and FPR

    Returns:
    - TPR and FPR values
    """
    cm = confusion_matrix(y_true, y_pred, labels=[-1, 1])  # Majority is -1, Minority is 1
    index = 1 if target_class == 1 else 0  # If target_class is 1 (minority), index is 1
    tp = cm[index, index]  # True Positives
    fn = cm[index, :].sum() - tp  # False Negatives
    fp = cm[:, index].sum() - tp  # False Positives
    tn = cm.sum() - (tp + fn + fp)  # True Negatives
    
    tpr = tp / (tp + fn) if (tp + fn) > 0 else 0
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
    return tpr, fpr

# Calculate TPR and FPR before augmentation
tpr_orig_minority, fpr_orig_minority = calculate_tpr_fpr(y_test, y_pred_orig, target_class=1)
tpr_orig_majority, fpr_orig_majority = calculate_tpr_fpr(y_test, y_pred_orig, target_class=-1)
print(f"Before augmentation: TPR (minority) = {tpr_orig_minority:.4f}, FPR (minority) = {fpr_orig_minority:.4f}")
print(f"Before augmentation: TPR (majority) = {tpr_orig_majority:.4f}, FPR (majority) = {fpr_orig_majority:.4f}")

# Augment data using Forest Diffusion
#int_indexes = [i for i in range(X.shape[1]) if np.issubdtype(X[:, i].dtype, np.integer)]
forest_model = ForestDiffusionModel(X_minority, label_y=None, n_t=50, duplicate_K=100,
                                    bin_indexes=[], cat_indexes=[], int_indexes=[],
                                    diffusion_type='flow', n_jobs=-1)

X_minority_fake = forest_model.generate(batch_size=len(X) // 5)

# Combine original and synthetic data
X_balanced = np.concatenate((X, X_minority_fake), axis=0)
y_balanced = np.concatenate((y, np.ones(X_minority_fake.shape[0], dtype=int)), axis=0)

# Split augmented data into training and testing sets
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42, stratify=y_balanced)

# Train model on augmented data
clf_bal = RandomForestClassifier(random_state=42)
clf_bal.fit(X_train_bal, y_train_bal)

# Predict on test set (augmented data)
y_pred_bal = clf_bal.predict(X_test)

# Calculate TPR and FPR after augmentation
tpr_bal_minority, fpr_bal_minority = calculate_tpr_fpr(y_test, y_pred_bal, target_class=1)
tpr_bal_majority, fpr_bal_majority = calculate_tpr_fpr(y_test, y_pred_bal, target_class=-1)
print(f"After augmentation: TPR (minority) = {tpr_bal_minority:.4f}, FPR (minority) = {fpr_bal_minority:.4f}")
print(f"After augmentation: TPR (majority) = {tpr_bal_majority:.4f}, FPR (majority) = {fpr_bal_majority:.4f}")

# Classification reports
print("Classification Report (original data):\n", classification_report(y_test, y_pred_orig))
print("Classification Report (generated data):\n", classification_report(y_test, y_pred_bal))

# Demographic Parity Evaluation
def calculate_demographic_parity(y_true, y_pred):
    """
    Calculate the demographic parity (positive prediction rate) for each class.
    
    Parameters:
    - y_true: Ground truth labels
    - y_pred: Predicted labels
    
    Returns:
    - Demographic parity gap
    """
    # Positive prediction rate for majority and minority classes
    pos_rate_majority = np.mean(y_pred[y_true == -1] == 1)
    pos_rate_minority = np.mean(y_pred[y_true == 1] == 1)
    
    print(f"Proportion of positive predictions (majority class): {pos_rate_majority:.4f}")
    print(f"Proportion of positive predictions (minority class): {pos_rate_minority:.4f}")
    
    # Demographic Parity gap
    demographic_parity_gap = abs(pos_rate_majority - pos_rate_minority)
    print(f"Demographic Parity Gap: {demographic_parity_gap:.4f}")
    
    return demographic_parity_gap

# Evaluate Demographic Parity before and after augmentation
demographic_parity_gap_before = calculate_demographic_parity(y_test, y_pred_orig)
demographic_parity_gap_after = calculate_demographic_parity(y_test, y_pred_bal)

# Check if Demographic Parity is satisfied (gap <= tolerance)
tolerance = 0.05
demographic_parity_satisfied_before = demographic_parity_gap_before <= tolerance
demographic_parity_satisfied_after = demographic_parity_gap_after <= tolerance

print(f"Demographic Parity satisfied before augmentation: {demographic_parity_satisfied_before}")
print(f"Demographic Parity satisfied after augmentation: {demographic_parity_satisfied_after}")


Class distribution before augmentation: {-1: 10923, 1: 260}
Before augmentation: TPR (minority) = 0.5000, FPR (minority) = 0.0024
Before augmentation: TPR (majority) = 0.9976, FPR (majority) = 0.5000
After augmentation: TPR (minority) = 0.9103, FPR (minority) = 0.0095
After augmentation: TPR (majority) = 0.9905, FPR (majority) = 0.0897
Classification Report (original data):
               precision    recall  f1-score   support

          -1       0.99      1.00      0.99      3277
           1       0.83      0.50      0.62        78

    accuracy                           0.99      3355
   macro avg       0.91      0.75      0.81      3355
weighted avg       0.98      0.99      0.98      3355

Classification Report (generated data):
               precision    recall  f1-score   support

          -1       1.00      0.99      0.99      3277
           1       0.70      0.91      0.79        78

    accuracy                           0.99      3355
   macro avg       0.85      0.95   

In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, recall_score, f1_score
from ForestDiffusion import ForestDiffusionModel
from imblearn.datasets import fetch_datasets

# Load data
data = fetch_datasets()['mammography']

# Extract features and target
X = data['data']
y = data['target']

# Check initial class distribution
unique, counts = np.unique(y, return_counts=True)
class_dist_before = dict(zip(unique, counts))
print(f"Class distribution before augmentation: {class_dist_before}")

# Separate minority class
X_minority = X[y == 1]

# Split data into training and testing sets
X_train_orig, X_test, y_train_orig, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Train model on original data
clf_orig = RandomForestClassifier(random_state=42)
clf_orig.fit(X_train_orig, y_train_orig)

# Predict on test set (original data)
y_pred_orig = clf_orig.predict(X_test)

# Function to calculate positive prediction proportions (Statistical Parity)
def calculate_statistical_parity(y_true, y_pred, group_label):
    """
    Calculate the proportion of positive predictions for a specific group.

    Parameters:
    - y_true: Ground truth labels
    - y_pred: Predicted labels
    - group_label: The class label (majority or minority group) to calculate for

    Returns:
    - Proportion of positive predictions for the specified group
    """
    positive_predictions = (y_pred == 1)  # Positive predictions are labeled as 1
    group_mask = (y_true == group_label)
    return np.mean(positive_predictions[group_mask])

# Calculate Statistical Parity before augmentation
prop_pos_majority_before = calculate_statistical_parity(y_test, y_pred_orig, group_label=-1)
prop_pos_minority_before = calculate_statistical_parity(y_test, y_pred_orig, group_label=1)

statistical_parity_gap_before = abs(prop_pos_majority_before - prop_pos_minority_before)

print(f"Proportion of positive predictions (majority class) before augmentation: {prop_pos_majority_before:.4f}")
print(f"Proportion of positive predictions (minority class) before augmentation: {prop_pos_minority_before:.4f}")
print(f"Statistical Parity Gap before augmentation: {statistical_parity_gap_before:.4f}")

# Augment data using Forest Diffusion
#int_indexes = [i for i in range(X.shape[1]) if np.issubdtype(X[:, i].dtype, np.integer)]
forest_model = ForestDiffusionModel(X_minority, label_y=None, n_t=50, duplicate_K=100,
                                    bin_indexes=[], cat_indexes=[], int_indexes=[],
                                    diffusion_type='flow', n_jobs=-1)

X_minority_fake = forest_model.generate(batch_size=len(X) // 5)

# Combine original and synthetic data
X_balanced = np.concatenate((X, X_minority_fake), axis=0)
y_balanced = np.concatenate((y, np.ones(X_minority_fake.shape[0], dtype=int)), axis=0)

# Split augmented data into training and testing sets
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42, stratify=y_balanced)

# Train model on augmented data
clf_bal = RandomForestClassifier(random_state=42)
clf_bal.fit(X_train_bal, y_train_bal)

# Predict on test set (augmented data)
y_pred_bal = clf_bal.predict(X_test)

# Calculate Statistical Parity after augmentation
prop_pos_majority_after = calculate_statistical_parity(y_test, y_pred_bal, group_label=-1)
prop_pos_minority_after = calculate_statistical_parity(y_test, y_pred_bal, group_label=1)

statistical_parity_gap_after = abs(prop_pos_majority_after - prop_pos_minority_after)

print(f"Proportion of positive predictions (majority class) after augmentation: {prop_pos_majority_after:.4f}")
print(f"Proportion of positive predictions (minority class) after augmentation: {prop_pos_minority_after:.4f}")
print(f"Statistical Parity Gap after augmentation: {statistical_parity_gap_after:.4f}")

# Classification reports
print("Classification Report (original data):\n", classification_report(y_test, y_pred_orig))
print("Classification Report (generated data):\n", classification_report(y_test, y_pred_bal))

# Evaluate Statistical Parity
def evaluate_statistical_parity(gap, tolerance=0.05):
    """
    Evaluate if Statistical Parity condition is met.

    Parameters:
    - gap: The difference in positive prediction proportions between the majority and minority classes
    - tolerance: The acceptable difference for statistical parity

    Returns:
    - Boolean indicating if Statistical Parity is satisfied
    """
    return gap <= tolerance

statistical_parity_before = evaluate_statistical_parity(statistical_parity_gap_before)
statistical_parity_after = evaluate_statistical_parity(statistical_parity_gap_after)

print(f"Statistical Parity satisfied before augmentation: {statistical_parity_before}")
print(f"Statistical Parity satisfied after augmentation: {statistical_parity_after}")


Class distribution before augmentation: {-1: 10923, 1: 260}
Proportion of positive predictions (majority class) before augmentation: 0.0024
Proportion of positive predictions (minority class) before augmentation: 0.5000
Statistical Parity Gap before augmentation: 0.4976
Proportion of positive predictions (majority class) after augmentation: 0.0095
Proportion of positive predictions (minority class) after augmentation: 0.9103
Statistical Parity Gap after augmentation: 0.9008
Classification Report (original data):
               precision    recall  f1-score   support

          -1       0.99      1.00      0.99      3277
           1       0.83      0.50      0.62        78

    accuracy                           0.99      3355
   macro avg       0.91      0.75      0.81      3355
weighted avg       0.98      0.99      0.98      3355

Classification Report (generated data):
               precision    recall  f1-score   support

          -1       1.00      0.99      0.99      3277
    

In [12]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, recall_score, f1_score
from ForestDiffusion import ForestDiffusionModel
from imblearn.datasets import fetch_datasets
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors

# Load data
data = fetch_datasets()['mammography']

# Extract features and target
X = data['data']
y = data['target']

# Check initial class distribution
unique, counts = np.unique(y, return_counts=True)
class_dist_before = dict(zip(unique, counts))
print(f"Class distribution before augmentation: {class_dist_before}")

# Separate minority class
X_minority = X[y == 1]

# Split data into training and testing sets
X_train_orig, X_test, y_train_orig, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Train model on original data
clf_orig = RandomForestClassifier(random_state=42)
clf_orig.fit(X_train_orig, y_train_orig)

# Predict on test set (original data)
y_pred_orig = clf_orig.predict(X_test)

# Classification reports
print("Classification Report (original data):\n", classification_report(y_test, y_pred_orig))

# Augment data using Forest Diffusion
int_indexes = [i for i in range(X.shape[1]) if np.issubdtype(X[:, i].dtype, np.integer)]
forest_model = ForestDiffusionModel(X_minority, label_y=None, n_t=50, duplicate_K=100,
                                    bin_indexes=[], cat_indexes=[], int_indexes=int_indexes,
                                    diffusion_type='flow', n_jobs=-1)

X_minority_fake = forest_model.generate(batch_size=len(X) // 5)

# Combine original and synthetic data
X_balanced = np.concatenate((X, X_minority_fake), axis=0)
y_balanced = np.concatenate((y, np.ones(X_minority_fake.shape[0], dtype=int)), axis=0)

# Split augmented data into training and testing sets
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42, stratify=y_balanced)

# Train model on augmented data
clf_bal = RandomForestClassifier(random_state=42)
clf_bal.fit(X_train_bal, y_train_bal)

# Predict on test set (augmented data)
y_pred_bal = clf_bal.predict(X_test)

# Classification reports
print("Classification Report (generated data):\n", classification_report(y_test, y_pred_bal))

# --- Causal Inference Tests ---
# Causal Inference: Estimate Propensity Scores

# Estimate propensity scores using logistic regression (for treatment effect estimation)
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train_orig, y_train_orig)

# Predict the propensity scores (probability of being treated)
propensity_scores = log_reg.predict_proba(X_train_orig)[:, 1]

# Nearest Neighbors Matching (for estimating treatment effects)
neighbors = NearestNeighbors(n_neighbors=1)
neighbors.fit(propensity_scores.reshape(-1, 1))

# Match treatment groups with the closest neighbors (propensity score matching)
matched_indices = neighbors.kneighbors(propensity_scores.reshape(-1, 1), return_distance=False).flatten()

# Treatment effect estimation (difference in outcomes between treated and matched control)
treated_outcome = clf_bal.predict(X_train_bal)[matched_indices]  # Outcome after treatment (augmentation)
control_outcome = clf_orig.predict(X_train_orig)[matched_indices]  # Outcome before treatment (original)

# Calculate treatment effect
treatment_effect = np.mean(treated_outcome != control_outcome)  # Difference in outcomes
print(f"Estimated Treatment Effect (difference in prediction outcomes): {treatment_effect:.4f}")


Class distribution before augmentation: {-1: 10923, 1: 260}
Classification Report (original data):
               precision    recall  f1-score   support

          -1       0.99      1.00      0.99      3277
           1       0.83      0.50      0.62        78

    accuracy                           0.99      3355
   macro avg       0.91      0.75      0.81      3355
weighted avg       0.98      0.99      0.98      3355

Classification Report (generated data):
               precision    recall  f1-score   support

          -1       1.00      0.99      0.99      3277
           1       0.70      0.91      0.79        78

    accuracy                           0.99      3355
   macro avg       0.85      0.95      0.89      3355
weighted avg       0.99      0.99      0.99      3355

Estimated Treatment Effect (difference in prediction outcomes): 0.1483


In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, recall_score, f1_score
from ForestDiffusion import ForestDiffusionModel
from imblearn.datasets import fetch_datasets
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors

# Load data
data = fetch_datasets()['mammography']

# Extract features and target
X = data['data']
y = data['target']

# Check initial class distribution
unique, counts = np.unique(y, return_counts=True)
class_dist_before = dict(zip(unique, counts))
print(f"Class distribution before augmentation: {class_dist_before}")

# Separate minority class
X_minority = X[y == 1]

# Split data into training and testing sets
X_train_orig, X_test, y_train_orig, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Train model on original data
clf_orig = RandomForestClassifier(random_state=42)
clf_orig.fit(X_train_orig, y_train_orig)

# Predict on test set (original data)
y_pred_orig = clf_orig.predict(X_test)

# Augment data using Forest Diffusion
#int_indexes = [i for i in range(X.shape[1]) if np.issubdtype(X[:, i].dtype, np.integer)]
forest_model = ForestDiffusionModel(X_minority, label_y=None, n_t=50, duplicate_K=100,
                                    bin_indexes=[], cat_indexes=[], int_indexes=[],
                                    diffusion_type='flow', n_jobs=-1)

X_minority_fake = forest_model.generate(batch_size=len(X) // 5)

# Combine original and synthetic data
X_balanced = np.concatenate((X, X_minority_fake), axis=0)
y_balanced = np.concatenate((y, np.ones(X_minority_fake.shape[0], dtype=int)), axis=0)

# Split augmented data into training and testing sets
X_train_bal, X_test_bal, y_train_bal, y_test_bal = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42, stratify=y_balanced)

# Train model on augmented data
clf_bal = RandomForestClassifier(random_state=42)
clf_bal.fit(X_train_bal, y_train_bal)

# Predict on test set (augmented data)
y_pred_bal = clf_bal.predict(X_test)

# --- Causal Inference: Estimating Treatment Effect by Group ---
# For minority class (y == 1) and majority class (y == -1), compare treatment effects
def calculate_treatment_effect_by_group(y_true, y_pred_orig, y_pred_bal, target_class):
    """
    Calculate treatment effect (difference in predictions) for the specified class.

    Parameters:
    - y_true: Ground truth labels
    - y_pred_orig: Predictions from the model trained on original data
    - y_pred_bal: Predictions from the model trained on augmented data
    - target_class: The class for which to calculate the treatment effect (1 for minority, -1 for majority)

    Returns:
    - Treatment effect for the target class
    """
    # Get the indices for the target class
    class_indices = np.where(y_true == target_class)[0]

    # Calculate the treatment effect: difference in predictions for the target class
    treatment_effect = np.mean(y_pred_bal[class_indices] != y_pred_orig[class_indices])  # Difference in outcomes

    return treatment_effect

# Treatment effect for minority class (1)
treatment_effect_minority = calculate_treatment_effect_by_group(y_test, y_pred_orig, y_pred_bal, target_class=1)
# Treatment effect for majority class (-1)
treatment_effect_majority = calculate_treatment_effect_by_group(y_test, y_pred_orig, y_pred_bal, target_class=-1)

print(f"Estimated Treatment Effect for Minority Class: {treatment_effect_minority:.4f}")
print(f"Estimated Treatment Effect for Majority Class: {treatment_effect_majority:.4f}")


Class distribution before augmentation: {-1: 10923, 1: 260}
Estimated Treatment Effect for Minority Class: 0.4103
Estimated Treatment Effect for Majority Class: 0.0070
