# This is a sample Jupyter Notebook

Below is an example of a code cell. 
Put your cursor into the cell and press Shift+Enter to execute it and select the next one, or click 'Run Cell' button.

Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.

To learn more about Jupyter Notebooks in PyCharm, see [help](https://www.jetbrains.com/help/pycharm/ipython-notebook-support.html).
For an overview of PyCharm, go to Help -> Learn IDE features or refer to [our documentation](https://www.jetbrains.com/help/pycharm/getting-started.html).

In [3]:
pip install faker pandas numpy

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
import pandas as pd
import numpy as np
from faker import Faker
import random
import os

# Initialize Faker
fake = Faker('en_IN')

# Parameters
n_customers = 2000
job_roles = ["Software Engineer", "Consultant", "Banker", "Designer", "Sales", "Data Scientist", "Product Manager"]
employment_types = ["Salaried", "Contract", "Freelance"]
cities = ["Mumbai", "Bengaluru", "Delhi", "Hyderabad", "Chennai", "Pune", "Gurugram", "Kolkata"]

# === Save directly to Downloads folder ===
downloads_dir = os.path.join(os.path.expanduser("~"), "Downloads", "synthetic_data")
os.makedirs(downloads_dir, exist_ok=True)

# Generate customer profiles
customer_data = []
for i in range(1, n_customers + 1):
    age = random.randint(23, 40)
    gender = random.choice(["Male", "Female", "Non-binary"])
    city = random.choice(cities)
    job = random.choice(job_roles)
    employment_type = random.choice(employment_types)
    income = random.randint(300000, 2000000)
    credit_score = random.randint(300, 850)
    savings_rate = round(random.uniform(0, 0.6), 2)
    existing_loan_balance = random.choice([0, random.randint(50000, 1000000)])
    account_balance = random.randint(1000, 500000)
    joined_date = fake.date_between(start_date="-5y", end_date="today")

    customer_data.append([
        i, age, gender, city, job, employment_type, income,
        credit_score, savings_rate, existing_loan_balance,
        account_balance, joined_date
    ])

customers_df = pd.DataFrame(customer_data, columns=[
    "customer_id", "age", "gender", "city", "job_role", "employment_type",
    "income", "credit_score", "savings_rate", "existing_loan_balance",
    "account_balance", "joined_date"
])

# Transactions
transaction_categories = ["Rent", "Groceries", "Entertainment", "Travel", "Investment", "Dining"]
transactions = []
for cust_id in customers_df['customer_id']:
    num_txns = random.randint(10, 50)
    for _ in range(num_txns):
        txn_date = fake.date_between(start_date="-1y", end_date="today")
        category = random.choice(transaction_categories)
        amount = round(random.uniform(500, 50000), 2)
        transactions.append([cust_id, txn_date, category, amount])

transactions_df = pd.DataFrame(transactions, columns=[
    "customer_id", "transaction_date", "category", "amount"
])

# Campaign Events
campaign_responses = ["Opened", "Clicked", "Converted", "Ignored"]
campaigns = []
for cust_id in customers_df['customer_id']:
    num_events = random.randint(1, 5)
    for _ in range(num_events):
        event_date = fake.date_between(start_date="-6m", end_date="today")
        response = random.choices(campaign_responses, weights=[0.4, 0.3, 0.2, 0.1])[0]
        campaigns.append([cust_id, event_date, response])

campaigns_df = pd.DataFrame(campaigns, columns=[
    "customer_id", "event_date", "response"
])

# Save files in Downloads
customers_file = os.path.join(downloads_dir, "customers.csv")
transactions_file = os.path.join(downloads_dir, "transactions.csv")
campaigns_file = os.path.join(downloads_dir, "campaign_events.csv")

customers_df.to_csv(customers_file, index=False)
transactions_df.to_csv(transactions_file, index=False)
campaigns_df.to_csv(campaigns_file, index=False)

print(f"✅ Files saved in your Downloads folder: {downloads_dir}")
print(f"- {customers_file}")
print(f"- {transactions_file}")
print(f"- {campaigns_file}")

✅ Files saved in your Downloads folder: C:\Users\user\Downloads\synthetic_data
- C:\Users\user\Downloads\synthetic_data\customers.csv
- C:\Users\user\Downloads\synthetic_data\transactions.csv
- C:\Users\user\Downloads\synthetic_data\campaign_events.csv


In [10]:
pip install sdv

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [15]:
import sdv
print(sdv.__version__)

1.27.0


In [16]:
from sdv.single_table import CTGANSynthesizer
from sdv.multi_table import HMASynthesizer

print("CTGAN and HMA Synthesizer are available ✅")

CTGAN and HMA Synthesizer are available ✅


In [18]:
# ctgan_full.py
import os
import pandas as pd
import numpy as np
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata
import argparse
import time

# ---------- Config ----------
# Set DEBUG=True for quick runs (faster, fewer rows/epochs)
DEBUG = False

DOWNLOADS_DIR = os.path.join(os.path.expanduser("~"), "Downloads", "synthetic_data")
os.makedirs(DOWNLOADS_DIR, exist_ok=True)
SEED_CUST_PATH = os.path.join(DOWNLOADS_DIR, "customers.csv")  # from Phase 1

# CTGAN params
if DEBUG:
    N_SYNTH = 500
    EPOCHS = 50
else:
    N_SYNTH = 10000
    EPOCHS = 300

# ---------- Load seed ----------
print("Loading seed customers from:", SEED_CUST_PATH)
customers_df = pd.read_csv(SEED_CUST_PATH, parse_dates=['joined_date'])

# ---------- Metadata ----------
print("Detecting metadata...")
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=customers_df)

# Ensure joined_date is recognized as datetime if present
if 'joined_date' in customers_df.columns:
    try:
        metadata.update_column('joined_date', sdtype='datetime')
    except Exception:
        pass

# ---------- Train CTGAN ----------
print(f"Initializing CTGAN (epochs={EPOCHS}) — training may take time...")
start = time.time()
synth = CTGANSynthesizer(metadata=metadata, epochs=EPOCHS)
synth.fit(customers_df)
print("Training finished in %.1f seconds" % (time.time() - start))

# ---------- Sample ----------
print(f"Sampling {N_SYNTH} synthetic customers...")
synthetic_customers = synth.sample(N_SYNTH)

# Postprocess: ensure customer_id unique numeric incremental
synthetic_customers = synthetic_customers.reset_index(drop=True)
synthetic_customers['customer_id'] = np.arange(1, len(synthetic_customers) + 1)

# Save
out_file = os.path.join(DOWNLOADS_DIR, f"customers_ctgan_{N_SYNTH}.csv")
synthetic_customers.to_csv(out_file, index=False)
print("Saved:", out_file)

Loading seed customers from: C:\Users\user\Downloads\synthetic_data\customers.csv
Detecting metadata...
Initializing CTGAN (epochs=300) — training may take time...



The 'SingleTableMetadata' is deprecated. Please use the new 'Metadata' class for synthesizers.


We strongly recommend saving the metadata using 'save_to_json' for replicability in future SDV versions.



Training finished in 65.6 seconds
Sampling 10000 synthetic customers...
Saved: C:\Users\user\Downloads\synthetic_data\customers_ctgan_10000.csv


In [21]:
# hma_full.py
import os
import pandas as pd
from sdv.multi_table import HMASynthesizer
from sdv.metadata import Metadata
import time

# ---------- Config ----------
DEBUG = False  # set True for faster debug

DOWNLOADS_DIR = os.path.join(os.path.expanduser("~"), "Downloads", "synthetic_data")
os.makedirs(DOWNLOADS_DIR, exist_ok=True)

# Seed file paths (Phase 1)
CUSTOMERS_PATH = os.path.join(DOWNLOADS_DIR, "customers.csv")
TRANSACTIONS_PATH = os.path.join(DOWNLOADS_DIR, "transactions.csv")
CAMPAIGNS_PATH = os.path.join(DOWNLOADS_DIR, "campaign_events.csv")

# Output sizes
if DEBUG:
    N_CUSTOMERS = 500
else:
    N_CUSTOMERS = 10000

# ---------- Load seed ----------
print("Loading seed tables...")
customers = pd.read_csv(CUSTOMERS_PATH, parse_dates=['joined_date'])
transactions = pd.read_csv(TRANSACTIONS_PATH, parse_dates=['transaction_date'])
campaigns = pd.read_csv(CAMPAIGNS_PATH, parse_dates=['event_date'])

# ---------- Metadata ----------
print("Building multi-table metadata...")
metadata = Metadata()
# Detect tables automatically
metadata.detect_table_from_dataframe(table_name='customers', data=customers)
metadata.detect_table_from_dataframe(table_name='transactions', data=transactions)
metadata.detect_table_from_dataframe(table_name='campaigns', data=campaigns)

# Define relationships (parent -> child)
# Make sure column names match your CSVs
metadata.add_relationship(
    parent_table_name='customers',
    parent_primary_key='customer_id',
    child_table_name='transactions',
    child_foreign_key='customer_id'
)
metadata.add_relationship(
    parent_table_name='customers',
    parent_primary_key='customer_id',
    child_table_name='campaigns',
    child_foreign_key='customer_id'
)

# ---------- Initialize HMA ----------
print("Initializing HMASynthesizer (this may take time)...")
hma = HMASynthesizer(metadata)

# Optionally: set specific synthesizers per table for speed/quality
# Example: use CTGAN for customers and Gaussian for big transactions
# hma.set_table_parameters('customers', synthesizer_cls='CTGANSynthesizer')
# hma.set_table_parameters('transactions', synthesizer_cls='GaussianCopulaSynthesizer')

# ---------- Train ----------
print("Training HMA on multi-table seed...")
start = time.time()
hma.fit({
    'customers': customers,
    'transactions': transactions,
    'campaigns': campaigns
})
print("HMA training finished in %.1f seconds" % (time.time() - start))

# ---------- Sample ----------
print(f"Sampling multi-table dataset with {N_CUSTOMERS} customers...")
synthetic_multi = hma.sample(scale=5)

# Extract tables
syn_customers = synthetic_multi['customers']
syn_transactions = synthetic_multi['transactions']
syn_campaigns = synthetic_multi['campaigns']

# Save outputs
out_customers = os.path.join(DOWNLOADS_DIR, f"customers_hma_{N_CUSTOMERS}.csv")
out_transactions = os.path.join(DOWNLOADS_DIR, f"transactions_hma_{N_CUSTOMERS}.csv")
out_campaigns = os.path.join(DOWNLOADS_DIR, f"campaigns_hma_{N_CUSTOMERS}.csv")

syn_customers.to_csv(out_customers, index=False)
syn_transactions.to_csv(out_transactions, index=False)
syn_campaigns.to_csv(out_campaigns, index=False)

print("Saved HMA outputs to:", DOWNLOADS_DIR)
print("-", out_customers)
print("-", out_transactions)
print("-", out_campaigns)

Loading seed tables...
Building multi-table metadata...
Initializing HMASynthesizer (this may take time)...
Training HMA on multi-table seed...



We strongly recommend saving the metadata using 'save_to_json' for replicability in future SDV versions.

Preprocess Tables: 100%|██████████| 3/3 [00:03<00:00,  1.10s/it]



Learning relationships:


(1/2) Tables 'customers' and 'transactions' ('customer_id'): 100%|██████████| 2000/2000 [04:46<00:00,  6.99it/s]
(2/2) Tables 'customers' and 'campaigns' ('customer_id'): 100%|██████████| 2000/2000 [01:03<00:00, 31.47it/s]





Modeling Tables: 100%|██████████| 3/3 [00:10<00:00,  3.40s/it]


HMA training finished in 366.6 seconds
Sampling multi-table dataset with 10000 customers...
Saved HMA outputs to: C:\Users\user\Downloads\synthetic_data
- C:\Users\user\Downloads\synthetic_data\customers_hma_10000.csv
- C:\Users\user\Downloads\synthetic_data\transactions_hma_10000.csv
- C:\Users\user\Downloads\synthetic_data\campaigns_hma_10000.csv


In [22]:
# reality_checks.py
import os
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

DOWNLOADS_DIR = os.path.join(os.path.expanduser("~"), "Downloads", "synthetic_data")
os.makedirs(DOWNLOADS_DIR, exist_ok=True)

# Choose seed and synthetic files
seed_file = os.path.join(DOWNLOADS_DIR, "customers.csv")

# Prefer CTGAN file if exists, else HMA file
ctgan_file = os.path.join(DOWNLOADS_DIR, "customers_ctgan_10000.csv")
hma_file = os.path.join(DOWNLOADS_DIR, "customers_hma_10000.csv")
debug_ctgan = os.path.join(DOWNLOADS_DIR, "customers_ctgan_debug.csv")
debug_hma = os.path.join(DOWNLOADS_DIR, "customers_hma_debug.csv")

if os.path.exists(ctgan_file):
    syn_file = ctgan_file
elif os.path.exists(hma_file):
    syn_file = hma_file
elif os.path.exists(debug_ctgan):
    syn_file = debug_ctgan
elif os.path.exists(debug_hma):
    syn_file = debug_hma
else:
    raise FileNotFoundError("No synthetic customer file found in Downloads/synthetic_data/. Run CTGAN/HMA first.")

print("Seed file:", seed_file)
print("Synthetic file:", syn_file)

seed = pd.read_csv(seed_file)
syn = pd.read_csv(syn_file)

# Numeric columns to compare
num_cols = ['income', 'credit_score', 'account_balance', 'age']

# 1) Histograms and save plots
for col in num_cols:
    if col in seed.columns and col in syn.columns:
        plt.figure(figsize=(8,4))
        plt.hist(seed[col].dropna(), bins=50, density=True, alpha=0.5, label='seed')
        plt.hist(syn[col].dropna(), bins=50, density=True, alpha=0.5, label='synthetic')
        plt.title(f'{col} distribution: seed vs synthetic')
        plt.legend()
        out_png = os.path.join(DOWNLOADS_DIR, f'{col}_dist_compare.png')
        plt.savefig(out_png)
        plt.close()
        print("Saved plot:", out_png)

# 2) KS tests
for col in ['income', 'credit_score', 'account_balance']:
    if col in seed.columns and col in syn.columns:
        ks = stats.ks_2samp(seed[col].dropna().astype(float), syn[col].dropna().astype(float))
        print(f"KS test for {col}: statistic={ks.statistic:.4f}, pvalue={ks.pvalue:.4f}")

# 3) Correlations: income vs credit_score
if set(['income','credit_score']).issubset(seed.columns) and set(['income','credit_score']).issubset(syn.columns):
    seed_corr = seed[['income','credit_score']].dropna().astype(float).corr().iloc[0,1]
    syn_corr = syn[['income','credit_score']].dropna().astype(float).corr().iloc[0,1]
    print(f"Income vs Credit Score correlation -> seed: {seed_corr:.3f}, synthetic: {syn_corr:.3f}")

    # Scatter sample for visual
    sample_seed = seed[['income','credit_score']].dropna().sample(min(1000, len(seed)), random_state=1)
    sample_syn = syn[['income','credit_score']].dropna().sample(min(1000, len(syn)), random_state=1)
    plt.figure(figsize=(6,5))
    plt.scatter(sample_seed['income'], sample_seed['credit_score'], alpha=0.35, s=10, label='seed')
    plt.scatter(sample_syn['income'], sample_syn['credit_score'], alpha=0.35, s=10, label='synthetic')
    plt.xlabel('income'); plt.ylabel('credit_score'); plt.legend()
    scatter_out = os.path.join(DOWNLOADS_DIR, 'income_vs_credit_scatter.png')
    plt.savefig(scatter_out)
    plt.close()
    print("Saved scatter:", scatter_out)

# 4) Basic aggregates
print("Seed averages:")
print(" - mean income:", seed['income'].mean(), "mean account_balance:", seed['account_balance'].mean())
print("Synthetic averages:")
print(" - mean income:", syn['income'].mean(), "mean account_balance:", syn['account_balance'].mean())

print("\nReality checks complete. Inspect PNGs in:", DOWNLOADS_DIR)

Seed file: C:\Users\user\Downloads\synthetic_data\customers.csv
Synthetic file: C:\Users\user\Downloads\synthetic_data\customers_ctgan_10000.csv
Saved plot: C:\Users\user\Downloads\synthetic_data\income_dist_compare.png
Saved plot: C:\Users\user\Downloads\synthetic_data\credit_score_dist_compare.png
Saved plot: C:\Users\user\Downloads\synthetic_data\account_balance_dist_compare.png
Saved plot: C:\Users\user\Downloads\synthetic_data\age_dist_compare.png
KS test for income: statistic=0.1270, pvalue=0.0000
KS test for credit_score: statistic=0.2647, pvalue=0.0000
KS test for account_balance: statistic=0.1403, pvalue=0.0000
Income vs Credit Score correlation -> seed: 0.044, synthetic: -0.010
Saved scatter: C:\Users\user\Downloads\synthetic_data\income_vs_credit_scatter.png
Seed averages:
 - mean income: 1142873.5225 mean account_balance: 252992.649
Synthetic averages:
 - mean income: 1285824.2098 mean account_balance: 295007.5841

Reality checks complete. Inspect PNGs in: C:\Users\user\Dow

In [23]:
import os
import pandas as pd
import matplotlib.pyplot as plt

# Paths
DOWNLOADS_DIR = os.path.join(os.path.expanduser("~"), "Downloads", "synthetic_data")
seed_file = os.path.join(DOWNLOADS_DIR, "customers.csv")
synthetic_file = os.path.join(DOWNLOADS_DIR, "customers_ctgan_10000.csv")  # or customers_hma_10000.csv

# Load data
seed = pd.read_csv(seed_file)
syn = pd.read_csv(synthetic_file)

# Ensure numeric types
for col in ['income', 'credit_score', 'account_balance', 'age']:
    if col in seed.columns:
        seed[col] = pd.to_numeric(seed[col], errors='coerce')
    if col in syn.columns:
        syn[col] = pd.to_numeric(syn[col], errors='coerce')

# Plot distributions
cols = ['income', 'credit_score', 'account_balance', 'age']
for col in cols:
    if col in seed.columns and col in syn.columns:
        plt.figure(figsize=(8,5))
        plt.hist(seed[col].dropna(), bins=50, density=True, alpha=0.5, label='Seed')
        plt.hist(syn[col].dropna(), bins=50, density=True, alpha=0.5, label='Synthetic')
        plt.legend()
        plt.title(f"{col.capitalize()} Distribution (Seed vs Synthetic)")
        out_path = os.path.join(DOWNLOADS_DIR, f"{col}_distribution.png")
        plt.savefig(out_path)
        plt.close()
        print("Saved:", out_path)

# Scatterplot (Income vs Credit Score)
if 'income' in seed.columns and 'credit_score' in seed.columns:
    sample_seed = seed[['income','credit_score']].dropna().sample(min(1000, len(seed)), random_state=42)
    sample_syn = syn[['income','credit_score']].dropna().sample(min(1000, len(syn)), random_state=42)

    plt.figure(figsize=(8,6))
    plt.scatter(sample_seed['income'], sample_seed['credit_score'], alpha=0.4, label='Seed', s=10)
    plt.scatter(sample_syn['income'], sample_syn['credit_score'], alpha=0.4, label='Synthetic', s=10)
    plt.xlabel("Income")
    plt.ylabel("Credit Score")
    plt.title("Income vs Credit Score (Seed vs Synthetic)")
    plt.legend()
    scatter_path = os.path.join(DOWNLOADS_DIR, "income_vs_credit_scatter.png")
    plt.savefig(scatter_path)
    plt.close()
    print("Saved:", scatter_path)


Saved: C:\Users\user\Downloads\synthetic_data\income_distribution.png
Saved: C:\Users\user\Downloads\synthetic_data\credit_score_distribution.png
Saved: C:\Users\user\Downloads\synthetic_data\account_balance_distribution.png
Saved: C:\Users\user\Downloads\synthetic_data\age_distribution.png
Saved: C:\Users\user\Downloads\synthetic_data\income_vs_credit_scatter.png


In [4]:
# evaluate_ctgan.py
import os
import pandas as pd
from sdv.metadata import SingleTableMetadata
from sdv.evaluation.single_table import evaluate_quality

# ---------- File paths ----------
DOWNLOADS_DIR = os.path.join(os.path.expanduser("~"), "Downloads", "synthetic_data")
REAL_PATH = os.path.join(DOWNLOADS_DIR, "customers.csv")                 # seed (Phase 1)
SYNTH_PATH = os.path.join(DOWNLOADS_DIR, "customers_ctgan_10000.csv")   # CTGAN output

# ---------- Load data ----------
real_data = pd.read_csv(REAL_PATH)
synthetic_data = pd.read_csv(SYNTH_PATH)

# ---------- Metadata ----------
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=real_data)

# ---------- Evaluate ----------
score = evaluate_quality(
    real_data=real_data,
    synthetic_data=synthetic_data,
    metadata=metadata
)

print("Overall Quality Score:", score.get_score())  # 0.0–1.0
print("\nDetailed Breakdown:")
# To see details for "Column Shapes"
print("\nColumn Shapes Details:")
print(score.get_details(property_name="Column Shapes"))

# To see details for "Column Pair Trends"
print("\nColumn Pair Trends Details:")
print(score.get_details(property_name="Column Pair Trends"))

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 12/12 [00:00<00:00, 23.62it/s]|
Column Shapes Score: 87.5%

(2/2) Evaluating Column Pair Trends: |██████████| 66/66 [00:00<00:00, 125.20it/s]|
Column Pair Trends Score: 88.94%

Overall Score (Average): 88.22%

Overall Quality Score: 0.8822061548467146

Detailed Breakdown:

Column Shapes Details:
                  Column        Metric   Score
0                    age  KSComplement  0.8621
1                 gender  TVComplement  0.8760
2               job_role  TVComplement  0.9443
3        employment_type  TVComplement  0.9247
4                 income  KSComplement  0.8730
5           credit_score  KSComplement  0.7353
6           savings_rate  KSComplement  0.9212
7  existing_loan_balance  KSComplement  0.8636
8        account_balance  KSComplement  0.8597
9            joined_date  KSComplement  0.8902

Column Pair Trends Details:
                 Column 1               Column 2                 Metric  \
0             

In [6]:
# evaluate_hma.py
import os
import pandas as pd
from sdv.metadata import Metadata
from sdv.evaluation.multi_table import evaluate_quality

# ---------- File paths ----------
DOWNLOADS_DIR = os.path.join(os.path.expanduser("~"), "Downloads", "synthetic_data")

# Real (seed) tables
real_data = {
    'customers': pd.read_csv(os.path.join(DOWNLOADS_DIR, "customers.csv")),
    'transactions': pd.read_csv(os.path.join(DOWNLOADS_DIR, "transactions.csv")),
    'campaigns': pd.read_csv(os.path.join(DOWNLOADS_DIR, "campaign_events.csv"))
}

# Synthetic tables
synthetic_data = {
    'customers': pd.read_csv(os.path.join(DOWNLOADS_DIR, "customers_hma_10000.csv")),
    'transactions': pd.read_csv(os.path.join(DOWNLOADS_DIR, "transactions_hma_10000.csv")),
    'campaigns': pd.read_csv(os.path.join(DOWNLOADS_DIR, "campaigns_hma_10000.csv"))
}

# ---------- Build Metadata ----------
metadata = Metadata()
metadata.detect_table_from_dataframe('customers', real_data['customers'])
metadata.detect_table_from_dataframe('transactions', real_data['transactions'])
metadata.detect_table_from_dataframe('campaigns', real_data['campaigns'])

metadata.add_relationship(
    parent_table_name='customers',
    parent_primary_key='customer_id',
    child_table_name='transactions',
    child_foreign_key='customer_id'
)
metadata.add_relationship(
    parent_table_name='customers',
    parent_primary_key='customer_id',
    child_table_name='campaigns',
    child_foreign_key='customer_id'
)

# ---------- Evaluate ----------
score = evaluate_quality(
    real_data=real_data,
    synthetic_data=synthetic_data,
    metadata=metadata
)

print("Overall Multi-Table Quality Score:", score.get_score())
print("\nDetailed Breakdown:")
print("\nDetailed Breakdown:")

# Column Shapes (how realistic each column is)
print("\n📊 Column Shapes Details:")
print(score.get_details(property_name="Column Shapes"))

# Column Pair Trends (relationships within each table)
print("\n🔗 Column Pair Trends Details:")
print(score.get_details(property_name="Column Pair Trends"))

# Cardinality (how well parent/child row ratios are preserved)
print("\n🧩 Cardinality Details:")
print(score.get_details(property_name="Cardinality"))

# Intertable Trends (relationships between tables)
print("\n🔄 Intertable Trends Details:")
print(score.get_details(property_name="Intertable Trends"))

Generating report ...

(1/4) Evaluating Column Shapes: |██████████| 19/19 [00:00<00:00, 59.77it/s]|
Column Shapes Score: 87.13%

(2/4) Evaluating Column Pair Trends: |██████████| 75/75 [00:00<00:00, 175.81it/s]|
Column Pair Trends Score: 92.33%

(3/4) Evaluating Cardinality: |██████████| 2/2 [00:00<00:00, 34.13it/s]|
Cardinality Score: 93.3%

(4/4) Evaluating Intertable Trends: |██████████| 84/84 [00:01<00:00, 58.50it/s]|
Intertable Trends Score: 79.96%

Overall Score (Average): 88.18%

Overall Multi-Table Quality Score: 0.8817918022321913

Detailed Breakdown:

Detailed Breakdown:

📊 Column Shapes Details:
           Table                 Column        Metric     Score
0      customers                    age  KSComplement  0.919900
1      customers                 gender  TVComplement  0.994300
2      customers               job_role  TVComplement  0.987100
3      customers        employment_type  TVComplement  0.991900
4      customers                 income  KSComplement  0.972000
5 

In [None]:
S