In [1]:
import pandas as pd
import numpy as np
from ctgan_adapter import CtganAdapter
from ctgan_benchmark import evaluate_ctgan, print_evaluation_results
from ctgan_utils import preprocess_data, get_tstr_results
import json

In [2]:
# Load configuration
# config.json holds the run settings; a later cell reads config["ctgan_params"]
# to build the CTGAN adapter.
print("# Load configuration")
# JSON text is UTF-8 by specification, so the encoding is passed explicitly
# instead of relying on the platform's locale default.
with open("config.json", "r", encoding="utf-8") as f:
    config = json.load(f)

# Load configuration


In [3]:
# 1. Load and prepare the dataset
# Reads SIGN.csv from the working directory and prints a short summary
# (column names, shape, first rows) as a sanity check.
# NOTE(review): the recorded preview shows numeric values under 'class' and a
# constant 9 under 'region-centroid-row', so the CSV header may be shifted by
# one column relative to the data - confirm against the source file.
print("# 1. Load and prepare the dataset")
data_raw = pd.read_csv("SIGN.csv")
print(
    f"Columns in dataset: {data_raw.columns.tolist()}",
    f"Dataset shape: {data_raw.shape}",
    data_raw.head(),
    sep="\n",
)

# 1. Load and prepare the dataset
Columns in dataset: ['class', 'region-centroid-col', 'region-centroid-row', 'region-pixel-count', 'short-line-density-5', 'short-line-density-2', 'vedge-mean', 'vedge-sd', 'hedge-mean', 'hedge-sd', 'intensity-mean', 'rawred-mean', 'rawblue-mean', 'rawgreen-mean', 'exred-mean', 'exblue-mean', 'exgreen-mean', 'value-mean', 'saturatoin-mean', 'hue-mean']
Dataset shape: (2310, 20)
   class  region-centroid-col  region-centroid-row  region-pixel-count  \
0  218.0                178.0                    9            0.111111   
1  113.0                130.0                    9            0.000000   
2  202.0                 41.0                    9            0.000000   
3   32.0                173.0                    9            0.000000   
4   61.0                197.0                    9            0.000000   

   short-line-density-5  short-line-density-2  vedge-mean  vedge-sd  \
0                   0.0              0.833333    0.547722  1.111109   

In [4]:
# 2. Preprocess data and detect categorical columns
# preprocess_data (imported from ctgan_utils) yields two values that are
# unpacked below: the prepared frame and the list it reports as categorical.
print("\n# 2. Preprocess data and detect categorical columns")
preprocess_output = preprocess_data(data_raw)
data, categorical_columns = preprocess_output
print(f"Detected categorical columns: {categorical_columns}")

2025-03-31 13:44:40,253 - INFO - Converted region-centroid-row to category type (has 1 unique values)
2025-03-31 13:44:40,253 - INFO - Converted hue-mean to category type (has 7 unique values)



# 2. Preprocess data and detect categorical columns
Detected categorical columns: ['region-centroid-row', 'hue-mean']


In [14]:
# 3. Define the target column for this dataset
# The label column is selected by name; later cells split on it and pass it to
# the evaluation.
# NOTE(review): in the recorded run 'hue-mean' holds seven balanced integer
# values (330 rows each), i.e. it behaves like a class label here - confirm
# that the CSV header really matches the data.
target_column = "hue-mean"
print(
    "\n# 3. Define the target column for this dataset",
    f"Target column: {target_column}",
    sep="\n",
)


# 3. Define the target column for this dataset
Target column: hue-mean


In [15]:
# 4. Split the data into features and target
# y is the single target column; X is every remaining column of the
# preprocessed frame.
print("\n# 4. Split the data into features and target")
y = data[target_column]
X = data.drop(columns=[target_column])
# One print call, one line per item: shapes first, then the class balance.
print(
    f"Features shape: {X.shape}",
    f"Target shape: {y.shape}",
    f"Target distribution:\n{y.value_counts()}",
    sep="\n",
)


# 4. Split the data into features and target
Features shape: (2310, 19)
Target shape: (2310,)
Target distribution:
hue-mean
1    330
2    330
3    330
4    330
5    330
6    330
7    330
Name: count, dtype: int64


In [16]:
# 5. Initialize and train CTGAN
# The adapter's keyword arguments come from the "ctgan_params" section of the
# loaded config. The recorded run of this cell took roughly 15 minutes.
# NOTE(review): no RNG seed is set in the visible cells; unless ctgan_params
# carries one, results may differ between runs - confirm.
print("\n# 5. Initialize and train CTGAN")
ctgan_params = config["ctgan_params"]
ctgan = CtganAdapter(**ctgan_params)
print("Training CTGAN model...")
ctgan.fit(X, y)
print("Training completed")


# 5. Initialize and train CTGAN
Training CTGAN model...


Training Epochs:   0%|          | 1/300 [00:02<13:22,  2.68s/it]

Epoch 0, Loss D: 22.6849, Loss G: 1.3231


Training Epochs:  10%|█         | 31/300 [01:30<13:32,  3.02s/it]

Epoch 30, Loss D: -1.5852, Loss G: 0.3105


Training Epochs:  20%|██        | 61/300 [03:01<12:55,  3.24s/it]

Epoch 60, Loss D: -1.9030, Loss G: 0.1972


Training Epochs:  30%|███       | 91/300 [04:34<10:15,  2.95s/it]

Epoch 90, Loss D: -1.7810, Loss G: 0.1217


Training Epochs:  40%|████      | 121/300 [06:03<08:49,  2.96s/it]

Epoch 120, Loss D: -1.7480, Loss G: 0.0051


Training Epochs:  50%|█████     | 151/300 [07:33<07:30,  3.03s/it]

Epoch 150, Loss D: -1.7598, Loss G: -0.0168


Training Epochs:  60%|██████    | 181/300 [09:02<06:00,  3.03s/it]

Epoch 180, Loss D: -1.9509, Loss G: 0.0924


Training Epochs:  70%|███████   | 211/300 [10:33<04:37,  3.12s/it]

Epoch 210, Loss D: -1.9713, Loss G: 0.1367


Training Epochs:  80%|████████  | 241/300 [12:02<02:59,  3.04s/it]

Epoch 240, Loss D: -2.0635, Loss G: 0.0676


Training Epochs:  90%|█████████ | 271/300 [13:31<01:24,  2.93s/it]

Epoch 270, Loss D: -2.1284, Loss G: 0.0880


Training Epochs: 100%|██████████| 300/300 [14:57<00:00,  2.99s/it]

Training completed





In [17]:
# 6. Generate synthetic data
# Draws a fixed number of rows from the fitted adapter and previews them.
print("\n# 6. Generate synthetic data")
n_samples = 1000
print(f"Generating {n_samples} synthetic samples...")
synthetic_data = ctgan.generate(n_samples)
# Report the actual row count returned, then show the first rows.
print(
    f"Generated {len(synthetic_data)} synthetic samples",
    "Synthetic data head:",
    synthetic_data.head(),
    sep="\n",
)


# 6. Generate synthetic data
Generating 1000 synthetic samples...
Generated 1000 synthetic samples
Synthetic data head:
        class  region-centroid-col  region-centroid-row  region-pixel-count  \
0  108.776090            54.207259                    9            0.000000   
1   55.795674            26.721259                    9            0.000000   
2  200.759431           172.013136                    9            0.000108   
3   75.637295           102.567914                    9            0.000000   
4    9.006460           119.666484                    9            0.000000   

   short-line-density-5  short-line-density-2  vedge-mean  vedge-sd  \
0              0.000000              0.910067    1.135094  0.830491   
1              0.000000              0.414667    0.358131  1.103272   
2              0.000000              0.638769    0.000000  2.155316   
3              0.000179              0.407835    1.607726  1.057736   
4              0.000111              0.384901    

In [18]:
# 7. Evaluate quality using TSTR and other metrics
# Compares the preprocessed real frame with the generated rows and prints the
# metrics via the benchmark helper.
# NOTE(review): `data` is the same frame the generator was fitted on (X and y
# were derived from it); no held-out split is visible in this notebook -
# confirm whether evaluate_ctgan splits internally.
print(
    "\n# 7. Evaluate quality using TSTR and other metrics",
    "Running evaluation...",
    sep="\n",
)
evaluation_results = evaluate_ctgan(
    data,
    synthetic_data,
    target_column=target_column,
)
print_evaluation_results(evaluation_results)



# 7. Evaluate quality using TSTR and other metrics
Running evaluation...


2025-03-31 15:30:12,091 - INFO - Encoded categorical target with mapping: {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6}
Parameters: { "use_label_encoder" } are not used.

2025-03-31 15:30:20,720 - INFO - Encoded categorical targets with mapping: {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6}
Parameters: { "use_label_encoder" } are not used.

2025-03-31 15:30:29,666 - INFO - CTGAN Evaluation Results:
2025-03-31 15:30:29,667 - INFO - 
Likelihood Fitness Metrics:
2025-03-31 15:30:29,667 - INFO -   - Lsyn (Synthetic Data Log-Likelihood): -33.8806
2025-03-31 15:30:29,668 - INFO -   - Ltest (Real Data Log-Likelihood under Synthetic Model): -267.3284
2025-03-31 15:30:29,668 - INFO - 
Statistical Similarity Metrics:
2025-03-31 15:30:29,668 - INFO -   - Jensen-Shannon Divergence Mean (Categorical): 0.0000
2025-03-31 15:30:29,668 - INFO -   - Wasserstein Distance Mean (Numerical): 2.4631
2025-03-31 15:30:29,670 - INFO - 
Machine Learning Efficacy Metrics on Real Data:
2025-03-31 15:30:29,670 - INFO -  

In [19]:
# 8. Extract and display TSTR results specifically
# get_tstr_results (from ctgan_utils) pulls the TSTR table out of the
# evaluation output; the None check below guards against it being absent.
print("\n# 8. TSTR Performance Results")
tstr_results = get_tstr_results(evaluation_results)
if tstr_results is not None:
    print(tstr_results)
else:
    # Say so explicitly; otherwise the section header is followed by nothing
    # and a missing result is easy to overlook.
    print("No TSTR results available in the evaluation output")


# 8. TSTR Performance Results
                    Accuracy        F1
LogisticRegression  0.844589  0.839476
RandomForest        0.867965  0.864580
MLP                 0.806061  0.804005
XGBoost             0.874026  0.873199


In [20]:
# 9. Save the synthetic data
# Writes the generated rows to a CSV in the working directory (no index
# column) and prints a closing status message.
print("\n# 9. Save synthetic data")
output_path = "SIGN_synthetic.csv"
synthetic_data.to_csv(output_path, index=False)
print(
    f"Synthetic data saved to {output_path}",
    "\nTest completed successfully!",
    sep="\n",
)


# 9. Save synthetic data
Synthetic data saved to SIGN_synthetic.csv

Test completed successfully!
