In [1]:
import pandas as pd
import numpy as np
from ctgan_adapter import CtganAdapter
from ctgan_benchmark import evaluate_ctgan, print_evaluation_results
from ctgan_utils import preprocess_data, get_tstr_results
import json

In [2]:
# Load configuration
# Reads config.json from the working directory into `config`; the training
# cell later looks up config["ctgan_params"] from this dict.
print("# Load configuration")
# JSON text is UTF-8 by specification, so pin the encoding explicitly instead
# of relying on the platform/locale default used by open() in text mode.
with open("config.json", "r", encoding="utf-8") as f:
    config = json.load(f)

# Load configuration


In [3]:
# 1. Load and prepare the dataset
print("# 1. Load and prepare the dataset")

# Read the raw table from the CSV in the working directory; the
# preprocessing step consumes `data_raw`.
data_raw = pd.read_csv("magic.csv")

# Sanity summary: column names, (rows, cols), and the first rows.
# One print with a newline separator emits the same three outputs, in the
# same order, as three consecutive print calls.
print(
    f"Columns in dataset: {data_raw.columns.tolist()}",
    f"Dataset shape: {data_raw.shape}",
    data_raw.head(),
    sep="\n",
)

# 1. Load and prepare the dataset
Columns in dataset: ['fLength', 'fWidth', 'fSize', 'fConc', 'fConc1', 'fAsym', 'fM3Long', 'fM3Trans', 'fAlpha', 'fDist', 'class']
Dataset shape: (19020, 11)
    fLength    fWidth   fSize   fConc  fConc1     fAsym  fM3Long  fM3Trans  \
0   28.7967   16.0021  2.6449  0.3918  0.1982   27.7004  22.0110   -8.2027   
1   31.6036   11.7235  2.5185  0.5303  0.3773   26.2722  23.8238   -9.9574   
2  162.0520  136.0310  4.0612  0.0374  0.0187  116.7410 -64.8580  -45.2160   
3   23.8172    9.5728  2.3385  0.6147  0.3922   27.2107  -6.4633   -7.1513   
4   75.1362   30.9205  3.1611  0.3168  0.1832   -5.5277  28.5525   21.8393   

    fAlpha     fDist class  
0  40.0920   81.8828     g  
1   6.3609  205.2610     g  
2  76.9600  256.7880     g  
3  10.4490  116.7370     g  
4   4.6480  356.4620     g  


In [4]:
# 2. Preprocess data and detect categorical columns
print("\n# 2. Preprocess data and detect categorical columns")
# preprocess_data (from ctgan_utils) returns a pair that is unpacked into the
# processed table `data` and the list `categorical_columns`.
# NOTE(review): the recorded log suggests low-cardinality columns such as
# 'class' are converted to category dtype - confirm in ctgan_utils.
data, categorical_columns = preprocess_data(data_raw)
print(f"Detected categorical columns: {categorical_columns}")

2025-03-31 08:53:00,016 - INFO - Converted class to category type (has 2 unique values)



# 2. Preprocess data and detect categorical columns
Detected categorical columns: ['class']


In [5]:
# 3. Define the target column for this dataset
# "class" is the label column of the loaded table; the split and evaluation
# steps both refer to it through `target_column`.
target_column = "class"

print(
    "\n# 3. Define the target column for this dataset",
    f"Target column: {target_column}",
    sep="\n",
)


# 3. Define the target column for this dataset
Target column: class


In [6]:
# 4. Split the data into features and target
print("\n# 4. Split the data into features and target")

# The target is the single `target_column`; the features are every other
# column of the preprocessed table.
y = data[target_column]
X = data.drop(columns=target_column)

# Shapes first, then the per-class row counts of the target.
print(f"Features shape: {X.shape}", f"Target shape: {y.shape}", sep="\n")
print(f"Target distribution:\n{y.value_counts()}")


# 4. Split the data into features and target
Features shape: (19020, 10)
Target shape: (19020,)
Target distribution:
class
g    12332
h     6688
Name: count, dtype: int64


In [7]:
# 5. Initialize and train CTGAN
print("\n# 5. Initialize and train CTGAN")
# Constructor keyword arguments come straight from the "ctgan_params" entry
# of the loaded config, so hyperparameters are tuned in config.json.
# NOTE(review): no random seed is set in this cell; unless ctgan_params
# carries one, a re-run will not reproduce the same model - confirm.
ctgan = CtganAdapter(**config["ctgan_params"])
print("Training CTGAN model...")
# Long-running step: the recorded run shows 300 epochs taking about 1h45m.
# Features and target are passed separately to the adapter's fit().
ctgan.fit(X, y)
print("Training completed")


# 5. Initialize and train CTGAN
Training CTGAN model...


Training Epochs:   0%|          | 1/300 [00:18<1:33:14, 18.71s/it]

Epoch 0, Loss D: 2.7329, Loss G: 0.8742


Training Epochs:  10%|█         | 31/300 [08:36<1:14:13, 16.56s/it]

Epoch 30, Loss D: -0.8772, Loss G: 1.4377


Training Epochs:  20%|██        | 61/300 [18:19<1:20:40, 20.25s/it]

Epoch 60, Loss D: -2.8761, Loss G: 1.0217


Training Epochs:  30%|███       | 91/300 [28:59<1:09:58, 20.09s/it]

Epoch 90, Loss D: -3.0163, Loss G: 1.2229


Training Epochs:  40%|████      | 121/300 [38:59<1:01:48, 20.72s/it]

Epoch 120, Loss D: -2.7886, Loss G: 1.4049


Training Epochs:  50%|█████     | 151/300 [49:58<41:17, 16.63s/it]  

Epoch 150, Loss D: -2.6046, Loss G: 1.5601


Training Epochs:  60%|██████    | 181/300 [1:00:38<45:05, 22.73s/it]

Epoch 180, Loss D: -2.4813, Loss G: 1.6029


Training Epochs:  70%|███████   | 211/300 [1:12:14<36:22, 24.52s/it]

Epoch 210, Loss D: -2.3016, Loss G: 1.7680


Training Epochs:  80%|████████  | 241/300 [1:23:39<21:52, 22.25s/it]

Epoch 240, Loss D: -2.2559, Loss G: 1.7652


Training Epochs:  90%|█████████ | 271/300 [1:34:10<11:09, 23.09s/it]

Epoch 270, Loss D: -2.1590, Loss G: 1.8030


Training Epochs: 100%|██████████| 300/300 [1:44:48<00:00, 20.96s/it]

Training completed





In [8]:
# 6. Generate synthetic data
print("\n# 6. Generate synthetic data")

# Number of synthetic rows requested from the trained model.
n_samples = 1_000
print(f"Generating {n_samples} synthetic samples...")
synthetic_data = ctgan.generate(n_samples)

# Report how many rows came back, then preview the first few.
print(f"Generated {len(synthetic_data)} synthetic samples")
print("Synthetic data head:", synthetic_data.head(), sep="\n")


# 6. Generate synthetic data
Generating 1000 synthetic samples...
Generated 1000 synthetic samples
Synthetic data head:
      fLength     fWidth     fSize     fConc    fConc1       fAsym  \
0   22.828516   9.245157  2.204933  0.651088  0.321686   28.587789   
1  164.201571  29.299912  2.828017  0.204912  0.083796  130.753018   
2   32.792143   9.612016  2.536615  0.484315  0.363209   96.370634   
3   20.490402   8.896585  2.252200  0.579687  0.428748    7.704064   
4   17.735438  15.773135  2.938515  0.407805  0.225042   26.758642   

      fM3Long   fM3Trans     fAlpha       fDist class  
0   16.608428 -12.710108  48.747535  179.957367     g  
1  141.899047 -10.561327  32.054131  130.454906     h  
2  -16.474628 -14.305284  58.264342   57.352998     h  
3  -12.910390   5.160824  51.561391  166.110352     g  
4   24.706263   3.727664   0.169784  182.270104     g  


In [9]:
# 7. Evaluate quality using TSTR and other metrics
print("\n# 7. Evaluate quality using TSTR and other metrics")
print("Running evaluation...")
# Compares the preprocessed real table `data` (target column included) with
# the generated `synthetic_data`; `target_column` names the label column.
evaluation_results = evaluate_ctgan(data, synthetic_data, target_column=target_column)
# NOTE(review): per the recorded output this helper reports through logging
# (INFO lines) rather than plain stdout - confirm in ctgan_benchmark.
print_evaluation_results(evaluation_results)



# 7. Evaluate quality using TSTR and other metrics
Running evaluation...


2025-03-31 10:49:29,238 - INFO - Encoded categorical target with mapping: {'g': 0, 'h': 1}
Parameters: { "use_label_encoder" } are not used.

2025-03-31 10:49:48,579 - INFO - Encoded categorical targets with mapping: {'g': 0, 'h': 1}
Parameters: { "use_label_encoder" } are not used.

2025-03-31 10:49:53,292 - INFO - CTGAN Evaluation Results:
2025-03-31 10:49:53,293 - INFO - 
Likelihood Fitness Metrics:
2025-03-31 10:49:53,295 - INFO -   - Lsyn (Synthetic Data Log-Likelihood): -27.5534
2025-03-31 10:49:53,295 - INFO -   - Ltest (Real Data Log-Likelihood under Synthetic Model): -31.6121
2025-03-31 10:49:53,295 - INFO - 
Statistical Similarity Metrics:
2025-03-31 10:49:53,295 - INFO -   - Wasserstein Distance Mean (Numerical): 3.9625
2025-03-31 10:49:53,297 - INFO - 
Machine Learning Efficacy Metrics on Real Data:
2025-03-31 10:49:53,297 - INFO -   LogisticRegression:
2025-03-31 10:49:53,297 - INFO -     - Accuracy: 0.7931
2025-03-31 10:49:53,299 - INFO -     - F1: 0.7867
2025-03-31 10:49

In [10]:
# 8. Extract and display TSTR results specifically
print("\n# 8. TSTR Performance Results")
# Pull the TSTR table out of the full evaluation results. The None check
# below implies the helper may return None when there is nothing to show.
tstr_results = get_tstr_results(evaluation_results)
if tstr_results is not None:
    print(tstr_results)
else:
    # Say so explicitly; otherwise the section header is followed by nothing
    # and the reader cannot tell a missing result from a display problem.
    print("No TSTR results available in the evaluation results.")


# 8. TSTR Performance Results
                    Accuracy        F1
LogisticRegression  0.778391  0.766582
RandomForest        0.808097  0.802876
MLP                 0.795531  0.795315
XGBoost             0.785331  0.784029


In [12]:
# 9. Save the synthetic data
output_path = "magic_synthetic.csv"
print("\n# 9. Save synthetic data")

# Write the generated rows to CSV without the index column.
synthetic_data.to_csv(output_path, index=False)

# Confirm the written path, then emit the closing message (its leading
# newline keeps the blank line between the two outputs).
print(
    f"Synthetic data saved to {output_path}",
    "\nTest completed successfully!",
    sep="\n",
)


# 9. Save synthetic data
Synthetic data saved to magic_synthetic.csv

Test completed successfully!
