In [5]:
import pandas as pd
import numpy as np
import json
import torch

# Direct imports from the same directory
from tabddpm_adapter import TabDDPMAdapter
from tabddpm_benchmark import evaluate_tabddpm, print_evaluation_results
from tabddpm_utils import preprocess_data, get_tstr_results

In [6]:
# Load configuration
# Reads config.json from the current working directory into `config`;
# the training cell later unpacks config["tabddpm_params"] into the model.
print("# Load configuration")
# Pin the encoding: JSON files are conventionally UTF-8, and without an
# explicit encoding open() falls back to the platform's locale default,
# which can mis-decode non-ASCII content on some systems.
with open("config.json", "r", encoding="utf-8") as f:
    config = json.load(f)

# Load configuration


In [7]:
# 1. Load and prepare the dataset
print("# 1. Load and prepare the dataset")
data_raw = pd.read_csv("Adult.csv")

# Quick sanity report on the raw frame: column names, dimensions, first rows.
column_names = data_raw.columns.tolist()
print(f"Columns in dataset: {column_names}")

raw_shape = data_raw.shape
print(f"Dataset shape: {raw_shape}")

raw_preview = data_raw.head()
print(raw_preview)

# 1. Load and prepare the dataset
Columns in dataset: ['age', 'workclass', 'fnlwgt', 'education', 'education.num', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'capital.gain', 'capital.loss', 'hours.per.week', 'native.country', 'income']
Dataset shape: (32561, 15)
   age          workclass  fnlwgt   education  education.num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital.status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-

In [8]:
# 2. Preprocess data and detect categorical columns
print("\n# 2. Preprocess data and detect categorical columns")

# preprocess_data returns a pair; it is unpacked into the processed frame
# (`data`) and the list reported below as the detected categorical columns.
preprocess_result = preprocess_data(data_raw)
data, categorical_columns = preprocess_result

print(f"Detected categorical columns: {categorical_columns}")

2025-04-28 00:05:31,264 - INFO - Converted workclass to category type (has 9 unique values)
2025-04-28 00:05:31,269 - INFO - Converted education to category type (has 16 unique values)
2025-04-28 00:05:31,277 - INFO - Converted marital.status to category type (has 7 unique values)
2025-04-28 00:05:31,285 - INFO - Converted occupation to category type (has 15 unique values)
2025-04-28 00:05:31,297 - INFO - Converted relationship to category type (has 6 unique values)
2025-04-28 00:05:31,297 - INFO - Converted race to category type (has 5 unique values)
2025-04-28 00:05:31,297 - INFO - Converted sex to category type (has 2 unique values)
2025-04-28 00:05:31,309 - INFO - Converted native.country to category type (has 42 unique values)
2025-04-28 00:05:31,311 - INFO - Converted income to category type (has 2 unique values)



# 2. Preprocess data and detect categorical columns
Detected categorical columns: ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country', 'income']


In [9]:
# 3. Define the target column for this dataset
# Name of the label column; the split cell drops it from the features and
# uses it as the prediction target.
target_column = "income"

print("\n# 3. Define the target column for this dataset")
print(f"Target column: {target_column}")


# 3. Define the target column for this dataset
Target column: income


In [10]:
# 4. Split the data into features and target
print("\n# 4. Split the data into features and target")

# Pull the label series out first, then build the feature frame as
# "everything except the label column".
y = data[target_column]
X = data.drop(columns=target_column)

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Per-class row counts, to make any class imbalance visible.
class_counts = y.value_counts()
print(f"Target distribution:\n{class_counts}")


# 4. Split the data into features and target
Features shape: (32561, 14)
Target shape: (32561,)
Target distribution:
income
<=50K    24720
>50K      7841
Name: count, dtype: int64


In [None]:
# 5. Initialize and train TabDDPM
print("\n# 5. Initialize and train TabDDPM")

# Seed the random number generators before the model is built so that a
# fresh "Restart Kernel -> Run All" reproduces the same training run.
# torch.manual_seed covers the CPU and CUDA generators; numpy is seeded as
# well in case the adapter draws from it.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Model hyperparameters come from the "tabddpm_params" section of the loaded
# config; the device is passed separately as a keyword argument.
tabddpm = TabDDPMAdapter(**config["tabddpm_params"], device=device)

print("Training TabDDPM model...")
tabddpm.fit(X, y)
print("Training completed")


# 5. Initialize and train TabDDPM
Training TabDDPM model...
Original data shape: (32561, 15), Target column: income
Added StandardScaler for 6 numerical columns
Added OneHotEncoder for 8 categorical columns
Target 'income' identified as categorical with 2 classes
Class mapping: {' <=50K': 0, ' >50K': 1}

Preprocessing Summary:
- Number of numerical features: 6
- Number of categorical features: 8
- Categorical columns: ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']
- Target column: income
- Target type: Categorical
- Number of target classes: 2
X shape: (32561, 14), y shape: (32561,)
X_tensor shape: torch.Size([32561, 108]), y_tensor shape: torch.Size([32561])
X_tensor shape: torch.Size([32561, 108])
y_tensor shape: torch.Size([32561])


  2%|▏         | 7/300 [00:41<29:13,  5.98s/it]

In [None]:
# 6. Generate synthetic data
print("\n# 6. Generate synthetic data")

# Number of synthetic rows requested from the trained model.
n_samples = 1000
print(f"Generating {n_samples} synthetic samples...")
synthetic_data = tabddpm.generate(n_samples)

# Report how many rows actually came back, then show the first few.
n_generated = len(synthetic_data)
print(f"Generated {n_generated} synthetic samples")

print("Synthetic data head:")
synthetic_preview = synthetic_data.head()
print(synthetic_preview)

In [None]:
# 7. Evaluate quality using TSTR and other metrics
print("\n# 7. Evaluate quality using TSTR and other metrics")
print("Running evaluation...")

# Compare the preprocessed real data against the generated samples, using
# the same target column that was chosen for training.
evaluation_results = evaluate_tabddpm(
    data,
    synthetic_data,
    target_column=target_column,
)

print_evaluation_results(evaluation_results)


In [None]:
# 8. Extract and display TSTR results specifically
print("\n# 8. TSTR Performance Results")

# get_tstr_results may return None (the check below exists for that case).
tstr_results = get_tstr_results(evaluation_results)
if tstr_results is not None:
    print(tstr_results)
else:
    # Say so explicitly instead of leaving the section header with no
    # output, which is indistinguishable from a cell that did not finish.
    print("No TSTR results available in the evaluation output.")

In [None]:
# 9. Save the synthetic data
print("\n# 9. Save synthetic data")

output_path = "Adult_synthetic.csv"
# Write without the index column so the file holds only the data columns.
csv_options = {"index": False}
synthetic_data.to_csv(output_path, **csv_options)
print(f"Synthetic data saved to {output_path}")

# Final marker: reached only if every step above ran without raising.
print("\nTest completed successfully!")