In [1]:
import pandas as pd
import numpy as np
import json
import torch

# Direct imports from the same directory
from tabddpm_adapter import TabDDPMAdapter
from tabddpm_benchmark import evaluate_tabddpm, print_evaluation_results
from tabddpm_utils import preprocess_data, get_tstr_results

In [2]:
# Load configuration
# Reads the run settings from config.json into `config`; a later cell reads
# config["tabddpm_params"] to build the model.
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale encoding.
print("# Load configuration")
with open("config.json", "r", encoding="utf-8") as f:
    config = json.load(f)

# Load configuration


In [3]:
# 1. Load and prepare the dataset
# Read the raw table from magic.csv, then report its column names, its
# (rows, columns) shape and its first rows.
print("# 1. Load and prepare the dataset")
data_raw = pd.read_csv("magic.csv")
column_names = data_raw.columns.tolist()
dataset_shape = data_raw.shape
print(f"Columns in dataset: {column_names}")
print(f"Dataset shape: {dataset_shape}")
raw_preview = data_raw.head()
print(raw_preview)

# 1. Load and prepare the dataset
Columns in dataset: ['fLength', 'fWidth', 'fSize', 'fConc', 'fConc1', 'fAsym', 'fM3Long', 'fM3Trans', 'fAlpha', 'fDist', 'class']
Dataset shape: (19020, 11)
    fLength    fWidth   fSize   fConc  fConc1     fAsym  fM3Long  fM3Trans  \
0   28.7967   16.0021  2.6449  0.3918  0.1982   27.7004  22.0110   -8.2027   
1   31.6036   11.7235  2.5185  0.5303  0.3773   26.2722  23.8238   -9.9574   
2  162.0520  136.0310  4.0612  0.0374  0.0187  116.7410 -64.8580  -45.2160   
3   23.8172    9.5728  2.3385  0.6147  0.3922   27.2107  -6.4633   -7.1513   
4   75.1362   30.9205  3.1611  0.3168  0.1832   -5.5277  28.5525   21.8393   

    fAlpha     fDist class  
0  40.0920   81.8828     g  
1   6.3609  205.2610     g  
2  76.9600  256.7880     g  
3  10.4490  116.7370     g  
4   4.6480  356.4620     g  


In [4]:
# 2. Preprocess data and detect categorical columns
# preprocess_data is a project helper; its result is unpacked into the
# processed dataset and the list of columns it reports as categorical.
print("\n# 2. Preprocess data and detect categorical columns")
preprocessed = preprocess_data(data_raw)
data, categorical_columns = preprocessed
print(f"Detected categorical columns: {categorical_columns}")

2025-04-27 12:14:35,974 - INFO - Converted class to category type (has 2 unique values)



# 2. Preprocess data and detect categorical columns
Detected categorical columns: ['class']


In [5]:
# 3. Define the target column for this dataset
# `target_column` names the label column that later cells drop from the
# features and pass to the evaluation.
print("\n# 3. Define the target column for this dataset")
target_column = "class"
# Fail fast with a readable message if the label column is missing from the
# preprocessed data, instead of surfacing a bare KeyError in a later cell.
if target_column not in data.columns:
    raise KeyError(
        f"Target column {target_column!r} not found in dataset columns: "
        f"{list(data.columns)}"
    )
print(f"Target column: {target_column}")


# 3. Define the target column for this dataset
Target column: class


In [6]:
# 4. Split the data into features and target
# y is the label column; X is every remaining column of `data`.
print("\n# 4. Split the data into features and target")
y = data[target_column]
X = data.drop(target_column, axis=1)
class_counts = y.value_counts()
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Target distribution:\n{class_counts}")


# 4. Split the data into features and target
Features shape: (19020, 10)
Target shape: (19020,)
Target distribution:
class
g    12332
h     6688
Name: count, dtype: int64


In [7]:
# 5. Initialize and train TabDDPM
# Builds the adapter from config["tabddpm_params"] on the GPU when one is
# available (CPU otherwise) and fits it on the feature/target split.
print("\n# 5. Initialize and train TabDDPM")

# Seed the NumPy and PyTorch generators before the model is constructed so
# that a fresh "Restart & Run All" starts training from the same random state.
# NOTE(review): this does not cover Python's `random` module or any
# non-deterministic CUDA kernels; whether TabDDPMAdapter uses either is not
# visible from this notebook - confirm.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tabddpm = TabDDPMAdapter(**config["tabddpm_params"], device=device)
print("Training TabDDPM model...")
tabddpm.fit(X, y)
print("Training completed")


# 5. Initialize and train TabDDPM
Training TabDDPM model...
Original data shape: (19020, 11), Target column: class
Added StandardScaler for 10 numerical columns
Target 'class' identified as categorical with 2 classes
Class mapping: {'g': 0, 'h': 1}

Preprocessing Summary:
- Number of numerical features: 10
- Number of categorical features: 0
- Categorical columns: []
- Target column: class
- Target type: Categorical
- Number of target classes: 2
X shape: (19020, 10), y shape: (19020,)
X_tensor shape: torch.Size([19020, 10]), y_tensor shape: torch.Size([19020])
X_tensor shape: torch.Size([19020, 10])
y_tensor shape: torch.Size([19020])


Epoch 299, Loss: 0.3395: 100%|██████████| 300/300 [09:02<00:00,  1.81s/it]

Training completed





In [8]:
# 6. Generate synthetic data
# Ask the fitted model for n_samples rows, then report how many came back
# and show the first few.
print("\n# 6. Generate synthetic data")
n_samples = 1_000
print(f"Generating {n_samples} synthetic samples...")
synthetic_data = tabddpm.generate(n_samples)
n_generated = len(synthetic_data)
print(f"Generated {n_generated} synthetic samples")
print("Synthetic data head:")
synthetic_preview = synthetic_data.head()
print(synthetic_preview)


# 6. Generate synthetic data
Generating 1000 synthetic samples...
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Debug - out_dict type: <class 'torch.Tensor'>
Added target column 'class' with 2 unique values
Final columns in synthetic data: ['fLength', 'fWidth', 'fSize', 'fConc', 'fConc1', 'fAsym', 'fM3Long', 'fM3Trans', 'fAlpha', 'fDist', 'class']
Generated 1000 synthetic samples
Synthetic data head:
      fLength      fWidth     fSize     fConc    fConc1       fAsym  \
0   23.132451   16.044517  2.416267  0.431629  0.219371   18.395814   
1   28.297065   14.584512  2.587033  0.470330  0.252192   -0.097241   
2  367.166350  112.722260  1.603100  0.980990  0.742690 -561.231780   
3   84.187362   35.502132  3.477488  0.090964  0.047783   16.061834   
4   25.330818   13.923141  2.481471  0.574524  0.346881  -19.612429   

      fM3Long   fM3Trans     fAlpha       fDist class  
0   15.157432  -6.794261  32.805569  122.770295     g  
1    6.830813   7.

In [9]:
# 7. Evaluate quality using TSTR and other metrics
# Passes the preprocessed real data and the generated data to the project's
# evaluation helper, then prints the results with the matching helper.
# NOTE(review): `data` is the same frame the model was fitted on; whether
# evaluate_tabddpm holds out a test split internally is not visible here -
# confirm before interpreting the scores.
print("\n# 7. Evaluate quality using TSTR and other metrics")
print("Running evaluation...")
evaluation_results = evaluate_tabddpm(
    data,
    synthetic_data,
    target_column=target_column,
)
print_evaluation_results(evaluation_results)


2025-04-27 12:46:13,636 - INFO - Encoded categorical target with mapping: {'g': 0, 'h': 1}



# 7. Evaluate quality using TSTR and other metrics
Running evaluation...


Parameters: { "use_label_encoder" } are not used.

2025-04-27 12:46:28,979 - INFO - Encoded categorical targets with mapping: {'g': 0, 'h': 1}
Parameters: { "use_label_encoder" } are not used.

2025-04-27 12:46:30,831 - INFO - TabDDPM Evaluation Results:
2025-04-27 12:46:30,832 - INFO - 
Likelihood Fitness Metrics:
2025-04-27 12:46:30,833 - INFO -   - Lsyn (Synthetic Data Log-Likelihood): -28.4363
2025-04-27 12:46:30,833 - INFO -   - Ltest (Real Data Log-Likelihood under Synthetic Model): -32.2831
2025-04-27 12:46:30,834 - INFO - 
Statistical Similarity Metrics:
2025-04-27 12:46:30,834 - INFO -   - Wasserstein Distance Mean (Numerical): 88.2596
2025-04-27 12:46:30,835 - INFO - 
Machine Learning Efficacy Metrics on Real Data:
2025-04-27 12:46:30,836 - INFO -   LogisticRegression:
2025-04-27 12:46:30,836 - INFO -     - Accuracy: 0.7931
2025-04-27 12:46:30,837 - INFO -     - F1: 0.7867
2025-04-27 12:46:30,837 - INFO -   RandomForest:
2025-04-27 12:46:30,839 - INFO -     - Accuracy: 0.8780

In [10]:
# 8. Extract and display TSTR results specifically
# get_tstr_results pulls the TSTR table out of the evaluation output; a
# return value of None is treated as "no TSTR results".
# NOTE(review): in the run recorded with this notebook, LogisticRegression
# TSTR accuracy was 0.397, below the majority-class share of the real data
# (12332 / 19020, about 0.65). Worth checking the synthetic labels and
# feature scales before trusting these scores.
print("\n# 8. TSTR Performance Results")
tstr_results = get_tstr_results(evaluation_results)
if tstr_results is None:
    # Say so explicitly instead of leaving an empty section under the header.
    print("No TSTR results available in the evaluation output.")
else:
    print(tstr_results)


# 8. TSTR Performance Results
                    Accuracy        F1
LogisticRegression  0.397266  0.404562
RandomForest        0.529706  0.540244
MLP                 0.566614  0.572535
XGBoost             0.497371  0.508637


In [11]:
# 9. Save the synthetic data
# Write the generated rows to a CSV file in the working directory, leaving
# out the index column.
print("\n# 9. Save synthetic data")
output_path = "magic_synthetic.csv"
synthetic_data.to_csv(path_or_buf=output_path, index=False)
print(f"Synthetic data saved to {output_path}")

# Final marker so the end of a complete run is visible in the output.
print("\nTest completed successfully!")


# 9. Save synthetic data
Synthetic data saved to magic_synthetic.csv

Test completed successfully!
