In [1]:
import pandas as pd
import numpy as np
import json
import torch

# Direct imports from the same directory
from tabddpm_adapter import TabDDPMAdapter
from tabddpm_benchmark import evaluate_tabddpm, print_evaluation_results
from tabddpm_utils import preprocess_data, get_tstr_results

In [2]:
# Load configuration
print("# Load configuration")
with open("config.json", "r") as f:
    config = json.load(f)

# Load configuration


In [3]:
# 1. Load and prepare the dataset
print("# 1. Load and prepare the dataset")
data_raw = pd.read_csv("shuttle.csv")
print(f"Columns in dataset: {data_raw.columns.tolist()}")
print(f"Dataset shape: {data_raw.shape}")
print(data_raw.head())

# 1. Load and prepare the dataset
Columns in dataset: ['55', '0', '81', '0.1', '-6', '11', '25', '88', '64', '4']
Dataset shape: (14499, 10)
   55  0  81  0.1  -6  11  25  88  64  4
0  56  0  96    0  52  -4  40  44   4  4
1  50 -1  89   -7  50   0  39  40   2  1
2  53  9  79    0  42  -2  25  37  12  4
3  55  2  82    0  54  -6  26  28   2  1
4  41  0  84    3  38  -4  43  45   2  1


In [4]:
# 2. Preprocess data and detect categorical columns
print("\n# 2. Preprocess data and detect categorical columns")
data, categorical_columns = preprocess_data(data_raw)
print(f"Detected categorical columns: {categorical_columns}")

2025-04-27 23:20:34,037 - INFO - Converted 4 to category type (has 7 unique values)



# 2. Preprocess data and detect categorical columns
Detected categorical columns: ['4']


In [6]:
# 3. Define the target column for this dataset
print("\n# 3. Define the target column for this dataset")
target_column = "4"
print(f"Target column: {target_column}")


# 3. Define the target column for this dataset
Target column: 4


In [7]:
# 4. Split the data into features and target
print("\n# 4. Split the data into features and target")
X = data.drop(columns=[target_column])
y = data[target_column]
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Target distribution:\n{y.value_counts()}")


# 4. Split the data into features and target
Features shape: (14499, 9)
Target shape: (14499,)
Target distribution:
4
1    11478
4     2154
5      809
3       39
2       13
6        4
7        2
Name: count, dtype: int64


In [8]:
# 5. Initialize and train TabDDPM
print("\n# 5. Initialize and train TabDDPM")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tabddpm = TabDDPMAdapter(**config["tabddpm_params"], device=device)
print("Training TabDDPM model...")
tabddpm.fit(X, y)
print("Training completed")


# 5. Initialize and train TabDDPM
Training TabDDPM model...
Original data shape: (14499, 10), Target column: 4
Added StandardScaler for 9 numerical columns
Target '4' identified as categorical with 7 classes
Class mapping: {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6}

Preprocessing Summary:
- Number of numerical features: 9
- Number of categorical features: 0
- Categorical columns: []
- Target column: 4
- Target type: Categorical
- Number of target classes: 7
X shape: (14499, 9), y shape: (14499,)
X_tensor shape: torch.Size([14499, 9]), y_tensor shape: torch.Size([14499])
X_tensor shape: torch.Size([14499, 9])
y_tensor shape: torch.Size([14499])


Epoch 299, Loss: 0.1604: 100%|██████████| 300/300 [12:14<00:00,  2.45s/it]

Training completed





In [9]:
# 6. Generate synthetic data
print("\n# 6. Generate synthetic data")
n_samples = 1000  
print(f"Generating {n_samples} synthetic samples...")
synthetic_data = tabddpm.generate(n_samples)
print(f"Generated {len(synthetic_data)} synthetic samples")
print("Synthetic data head:")
print(synthetic_data.head())


# 6. Generate synthetic data
Generating 1000 synthetic samples...
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Debug - out_dict type: <class 'torch.Tensor'>
Added target column '4' with 7 unique values
Final columns in synthetic data: ['55', '0', '81', '0.1', '-6', '11', '25', '88', '64', '4']
Generated 1000 synthetic samples
Synthetic data head:
          55            0          81         0.1          -6           11  \
0  17.400000 -3927.200000   62.889020  160.361097  345.600000  2188.490990   
1  55.529991     0.410733  111.000895    0.314406   55.076492    -0.516448   
2  52.195830     2.069632   78.355207   -0.718292   50.948600   -16.694845   
3  49.269261    -2.050622  103.713311    1.202297   48.647823   -10.599105   
4  54.351364    -5.598778  111.068194    1.407014   52.174194   -10.813418   

          25          88          64  4  
0  82.100000  163.200000  165.000000  7  
1  54.556080   54.347188   -0.526818  4  
2  25.872805   2

In [10]:
# 7. Evaluate quality using TSTR and other metrics
print("\n# 7. Evaluate quality using TSTR and other metrics")
print("Running evaluation...")
evaluation_results = evaluate_tabddpm(data, synthetic_data, target_column=target_column)
print_evaluation_results(evaluation_results)


# 7. Evaluate quality using TSTR and other metrics
Running evaluation...


2025-04-27 23:34:26,928 - INFO - Encoded categorical target with mapping: {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6}
Parameters: { "use_label_encoder" } are not used.

2025-04-27 23:34:32,087 - INFO - Encoded categorical targets with mapping: {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6}
Parameters: { "use_label_encoder" } are not used.

2025-04-27 23:34:36,013 - INFO - TabDDPM Evaluation Results:
2025-04-27 23:34:36,015 - INFO - 
Likelihood Fitness Metrics:
2025-04-27 23:34:36,015 - INFO -   - Lsyn (Synthetic Data Log-Likelihood): -31.1196
2025-04-27 23:34:36,017 - INFO -   - Ltest (Real Data Log-Likelihood under Synthetic Model): -24.5958
2025-04-27 23:34:36,017 - INFO - 
Statistical Similarity Metrics:
2025-04-27 23:34:36,019 - INFO -   - Wasserstein Distance Mean (Numerical): 564.2534
2025-04-27 23:34:36,019 - INFO - 
Machine Learning Efficacy Metrics on Real Data:
2025-04-27 23:34:36,021 - INFO -   LogisticRegression:
2025-04-27 23:34:36,021 - INFO -     - Accuracy: 0.9659
2025-04-27

In [11]:
# 8. Extract and display TSTR results specifically
print("\n# 8. TSTR Performance Results")
tstr_results = get_tstr_results(evaluation_results)
if tstr_results is not None:
    print(tstr_results)


# 8. TSTR Performance Results
                    Accuracy        F1
LogisticRegression  0.665425  0.682181
RandomForest        0.169667  0.262627
MLP                 0.188082  0.274447
XGBoost             0.131319  0.204750


In [12]:
# 9. Save the synthetic data
print("\n# 9. Save synthetic data")
output_path = "shuttle_synthetic.csv"
synthetic_data.to_csv(output_path, index=False)
print(f"Synthetic data saved to {output_path}")

print("\nTest completed successfully!")


# 9. Save synthetic data
Synthetic data saved to shuttle_synthetic.csv

Test completed successfully!
