## Attempt 1: composit score function

In [13]:
from pathlib import Path
from typing import Tuple
import pandas as pd
import numpy as np
import torch
from tqdm.auto import tqdm                 # NEW
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import Metadata
from sdv.evaluation.single_table import evaluate_quality
from sdmetrics.reports.single_table import QualityReport
from sdmetrics.single_table import LogisticDetection, RangeCoverage


from sdmetrics.reports.single_table import QualityReport
from sdmetrics.single_table import LogisticDetection, RangeCoverage

def score(real_df, synth_df, metadata_obj) -> float:
    """Return composite quality/utility score (↑ better)."""
    meta_full = metadata_obj.to_dict()
    meta_single = meta_full["tables"]["table"]          # <─ unwrap here

    report = QualityReport()
    report.generate(real_df, synth_df, meta_single, verbose=False)
    base = report.get_score()

    auc = LogisticDetection.compute(real_df, synth_df, metadata=meta_single)
    cov = RangeCoverage.compute(real_df, synth_df, metadata=meta_single)

    return base - 0.5 * auc + 0.2 * cov

def train_until_convergence(
    df_slice: pd.DataFrame,
    *,
    service_code: str,
    max_epochs: int = 2_000,
    step: int = 50,
    patience: int = 4,
    outdir: Path = Path("checkpoints/models")
) -> Tuple[CTGANSynthesizer, int, float]:
    """
    CTGAN fit with early-stopping based on SDV quality metrics.
    Shows a live progress bar (tqdm) for the outer training loop.
    """
    outdir.mkdir(parents=True, exist_ok=True)

    metadata = Metadata.detect_from_dataframes(data={"table": df_slice})

    best_s, best_ep, bad = -np.inf, 0, 0
    cur = 0

    # ─── progress bar ────────────────────────────────────────────────────────────
    bar = tqdm(
        total=max_epochs,
        desc=f"CTGAN [{service_code}]",
        unit="epoch",
        leave=True,
        dynamic_ncols=True,
    )
    # ─────────────────────────────────────────────────────────────────────────────

    while cur < max_epochs and bad < patience:
        synth = CTGANSynthesizer(metadata=metadata, epochs=step, verbose=False)
        synth.fit(df_slice)
        cur += step
        bar.update(step)                   # update progress bar

        synthetic_sample = synth.sample(num_rows=len(df_slice))
        s = score(df_slice, synthetic_sample, metadata)

        if s > best_s + 1e-3:
            model_path = outdir / f"{service_code}_ep{cur}.pkl"
            synth.save(str(model_path))
            best_s, best_ep, bad = s, cur, 0
        else:
            bad += 1

    bar.close()                            # tidy up the bar

    best_model_path = outdir / f"{service_code}_ep{best_ep}.pkl"
    best_synth = CTGANSynthesizer.load(str(best_model_path))
    return best_synth, best_ep, best_s


In [14]:
import pandas as pd

# Load your filtered dataset from pickle
DATA_PATH = "filtered_data/df_v2_filtered.pkl"
TOP_K     = 5   # Train this many CTGANs

df = pd.read_pickle(DATA_PATH)

# Select top-K most frequent SERVICE_CODEs
codes = df["service_code_description"].value_counts().head(TOP_K).index.tolist()
codes

['ASSIST MEMBER OF THE PUBLIC',
 'CONCERN FOR PERSON',
 'DISTURBANCE',
 'THEFT',
 'DOMESTIC INCIDENT']

In [15]:
results = []

for code in codes:
    print(f"\n🟢  [{code}] starting …")
    slice_df = df[df["service_code_description"] == code].copy()
    
    best_model, best_epoch, best_score = train_until_convergence(
        slice_df,
        service_code=code,
        outdir=Path("checkpoints/models")
    )

    results.append({"code": code, "epoch": best_epoch, "score": best_score})
    print(f"✅  [{code}] finished at epoch {best_epoch}  (score={best_score:.4f})")


🟢  [ASSIST MEMBER OF THE PUBLIC] starting …


CTGAN [ASSIST MEMBER OF THE PUBLIC]:   0%|          | 0/2000 [00:00<?, ?epoch/s]


We strongly recommend saving the metadata using 'save_to_json' for replicability in future SDV versions.


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(cop

✅  [ASSIST MEMBER OF THE PUBLIC] finished at epoch 50  (score=0.7195)

🟢  [CONCERN FOR PERSON] starting …


CTGAN [CONCERN FOR PERSON]:   0%|          | 0/2000 [00:00<?, ?epoch/s]


We strongly recommend saving the metadata using 'save_to_json' for replicability in future SDV versions.


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(cop

✅  [CONCERN FOR PERSON] finished at epoch 150  (score=0.7219)

🟢  [DISTURBANCE] starting …


CTGAN [DISTURBANCE]:   0%|          | 0/2000 [00:00<?, ?epoch/s]


We strongly recommend saving the metadata using 'save_to_json' for replicability in future SDV versions.


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(cop

✅  [DISTURBANCE] finished at epoch 250  (score=0.7753)

🟢  [THEFT] starting …


CTGAN [THEFT]:   0%|          | 0/2000 [00:00<?, ?epoch/s]


We strongly recommend saving the metadata using 'save_to_json' for replicability in future SDV versions.


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(cop

✅  [THEFT] finished at epoch 50  (score=0.7562)

🟢  [DOMESTIC INCIDENT] starting …


CTGAN [DOMESTIC INCIDENT]:   0%|          | 0/2000 [00:00<?, ?epoch/s]


We strongly recommend saving the metadata using 'save_to_json' for replicability in future SDV versions.


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(cop

✅  [DOMESTIC INCIDENT] finished at epoch 150  (score=0.7549)


In [16]:
ledger = pd.DataFrame(results)

In [17]:
ledger

Unnamed: 0,code,epoch,score
0,ASSIST MEMBER OF THE PUBLIC,50,0.719502
1,CONCERN FOR PERSON,150,0.721933
2,DISTURBANCE,250,0.77532
3,THEFT,50,0.756218
4,DOMESTIC INCIDENT,150,0.754918


## Attempt 2: Checking convergence measurement on THEFT

In [1]:
from pathlib import Path
import numpy as np, pandas as pd
from tqdm.auto import tqdm

from sdv.single_table import CTGANSynthesizer
from sdv.metadata import Metadata
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ╔══════════ 2 · LOSS-BASED CONVERGENCE ══╗
def losses_converged(
    g_hist: list[float],
    d_hist: list[float],
    *,
    window: int  = 10,      # look at the last 10 logged points
    epsilon: float = 0.1,  # G & D must be this close *on average*
    drift: float   = 0.25,  # reject if either series moves >±0.10
) -> bool:
    """
    True when, over the last `window` points:
      – mean(|G − D|) ≤ ε
      – max drift within each series ≤ `drift`
    """
    if len(g_hist) < window + 1:
        return False

    g_win, d_win = np.array(g_hist[-window:]), np.array(d_hist[-window:])
    balanced = np.abs(g_win - d_win).mean() <= epsilon
    stable   = (g_win.ptp() <= drift) and (d_win.ptp() <= drift)
    return balanced and stable


# ╔══════════ 3 · TRAIN UNTIL CONVERGENCE ══╗
def train_until_convergence(
    df_slice: pd.DataFrame,
    *,
    service_code: str,
    max_epochs: int = 2_000,
    step: int = 50,
    outdir: Path = Path("checkpoints/models"),
):
    outdir.mkdir(parents=True, exist_ok=True)
    metadata = Metadata.detect_from_dataframes({"table": df_slice})

    g_hist, d_hist, cur = [], [], 0
    bar = tqdm(total=max_epochs, desc=f"CTGAN [{service_code}]",
               unit="epoch", dynamic_ncols=True)

    while cur < max_epochs:
        synth = CTGANSynthesizer(metadata=metadata, epochs=step, verbose=False, cuda=True)
        synth.fit(df_slice)

        cur += step
        bar.update(step)

        # — official helper (SDV ≥1.6, incl. 1.23.0) —
        losses = synth.get_loss_values()          # 
        g_hist.extend(losses["Generator Loss"].tolist())
        d_hist.extend(losses["Discriminator Loss"].tolist())

        if losses_converged(g_hist, d_hist):
            bar.set_postfix_str("loss-converged")
            break

    bar.close()
    model_path = outdir / f"{service_code}_final.pkl"
    synth.save(model_path)
    return synth, cur, model_path

# ╔══════════ 4 · RUN FOR “THEFT” ONLY ═════╗
DATA_PATH   = "filtered_data/df_v2_filtered.pkl"
TARGET_CODE = "THEFT"

df = pd.read_pickle(DATA_PATH)
if TARGET_CODE not in df["service_code_description"].unique():
    raise ValueError(f'"{TARGET_CODE}" not found in column.')

slice_df = df[df["service_code_description"] == TARGET_CODE].copy()
print(f"\n👟  Training CTGAN for service code: {TARGET_CODE} "
      f"(n={len(slice_df):,} rows)")

model, final_epoch, path = train_until_convergence(
    slice_df, service_code=TARGET_CODE
)

print(f"\n✅  Converged after {final_epoch} epochs — model saved to {path}")


👟  Training CTGAN for service code: THEFT (n=87,517 rows)


CTGAN [THEFT]:   0%|          | 0/2000 [00:00<?, ?epoch/s]


We strongly recommend saving the metadata using 'save_to_json' for replicability in future SDV versions.


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(cop

KeyboardInterrupt: 

In [3]:
# Visualising the loss functions
fig = model.get_loss_values_plot()
fig.show()

NameError: name 'model' is not defined

## Attempt 3: Moving average approach at detecting convergence for CTGANS

In [None]:
from pathlib import Path
from typing import Tuple
import pandas as pd
import numpy as np
import torch
from tqdm.auto import tqdm

from sdv.single_table import CTGANSynthesizer
from sdv.metadata import Metadata

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# ─────────────── 1 · EMA-Based Convergence Detector ───────────────
def find_convergence_epoch(
    loss_df: pd.DataFrame,
    *,
    smooth_span: int = 25,
    window: int = 10,
    epsilon: float = 0.05,
    drift: float = 0.10,
) -> int | None:
    """Detect first epoch where G/D smoothed losses are both close and stable."""
    g_s = loss_df["Generator Loss"].ewm(span=smooth_span, adjust=False).mean().to_numpy()
    d_s = loss_df["Discriminator Loss"].ewm(span=smooth_span, adjust=False).mean().to_numpy()
    epochs = loss_df["Epoch"].to_numpy()

    for i in range(window, len(loss_df)):
        g_win, d_win = g_s[i - window:i], d_s[i - window:i]
        gap_ok = np.abs(g_win - d_win).mean() <= epsilon
        drift_ok = (g_win.ptp() <= drift) and (d_win.ptp() <= drift)

        if gap_ok and drift_ok:
            return int(epochs[i])
    return None


# ─────────────── 2 · Training Function ───────────────
def train_until_convergence(
    df_slice: pd.DataFrame,
    *,
    service_code: str,
    max_epochs: int = 2000,
    step: int = 50,
    outdir: Path = Path("checkpoints_3/models")
) -> Tuple[CTGANSynthesizer, int, Path]:
    """Train CTGAN with early stopping based on smoothed loss convergence."""
    outdir.mkdir(parents=True, exist_ok=True)
    metadata = Metadata.detect_from_dataframes({"table": df_slice})

    full_loss_df = pd.DataFrame(columns=["Epoch", "Generator Loss", "Discriminator Loss"])
    cur_epoch = 0
    bar = tqdm(total=max_epochs, desc=f"CTGAN [{service_code}]", unit="epoch", dynamic_ncols=True)

    while cur_epoch < max_epochs:
        synth = ctgan = CTGANSynthesizer(
            metadata,
            epochs=800,
            batch_size=1024,
            pac=8,
            embedding_dim=128,
            generator_dim=(256, 256),
            discriminator_dim=(256, 256),
            generator_lr=2e-4,
            discriminator_lr=2e-4,
            cuda=True,
            verbose=True
        )
        synth.fit(df_slice)
        bar.update(step)
        cur_epoch += step

        # Extract step loss history and adjust epoch numbers
        step_losses = synth.get_loss_values().copy()
        step_losses["Epoch"] += (cur_epoch - step)
        full_loss_df = pd.concat([full_loss_df, step_losses], ignore_index=True)

        # Check convergence after enough points
        if len(full_loss_df) >= 35:  # 25 for EMA + 10 for window
            conv_epoch = find_convergence_epoch(full_loss_df)
            if conv_epoch is not None and conv_epoch <= cur_epoch:
                bar.set_postfix_str(f"converged @ {conv_epoch}")
                break

    bar.close()

    model_path = outdir / f"{service_code}_final.pkl"
    synth.save(str(model_path))
    return synth, cur_epoch, model_path


# ─────────────── 3 · Run for Top-5 Codes ───────────────
DATA_PATH = "filtered_data/df_v2_filtered.pkl"
TOP_K = 5

df = pd.read_pickle(DATA_PATH)
codes = df["service_code_description"].value_counts().head(TOP_K).index.tolist()

print(f"\nTop-{TOP_K} service codes: {codes}\n")

for code in codes:
    slice_df = df[df["service_code_description"] == code].copy()
    print(f"\nTraining CTGAN for '{code}' ({len(slice_df):,} rows)")

    model, final_epoch, path = train_until_convergence(
        slice_df,
        service_code=code,
        max_epochs=2000,
        step=50,
        outdir=Path("checkpoints_3/models")
    )

    print(f"'{code}' converged after {final_epoch} epochs — model saved to {path}")


Top-5 service codes: ['ASSIST MEMBER OF THE PUBLIC', 'CONCERN FOR PERSON', 'DISTURBANCE', 'THEFT', 'DOMESTIC INCIDENT']


Training CTGAN for 'ASSIST MEMBER OF THE PUBLIC' (169,406 rows)


CTGAN [ASSIST MEMBER OF THE PUBLIC]:   0%|          | 0/2000 [00:00<?, ?epoch/s]


We strongly recommend saving the metadata using 'save_to_json' for replicability in future SDV versions.


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`




  0%|          | 0/800 [00:00<?, ?it/s][A[A[A


Gen. (0.00) | Discrim. (0.00):   0%|          | 0/800 [00:00<?, ?it/s][A[A[A


Gen. (1.02) | Discrim. (0.05):   0%|          | 0/800 [00:04<?, ?it/s][A[A[A


Gen. (1.02) | Discrim. (0.05):   0%|          | 1/800 [00:04<1:00:15,  4.53s/it][A[A[A


Gen. (0.94) | Discrim. (-0.02):   0%|          | 1/800 [00:09<1:00:15,  4.53s/it][A[A[A


Gen. (0.94) | Discrim. (-0.02):   0%|          | 2/800 [00:09<1:00:10,  4.52s/it][A[A[A


Gen. (0.42) | Discrim. (-0.01):   0%|          | 2/800 [00:13<1:00:10,  4.52s/it][A[A[A


Gen. (0.42) | Discrim. (-0.01):   0%|          | 3/80

## Attempt 4: Give up and run for 200 epochs

In [1]:
from pathlib import Path
from typing import Tuple

import pandas as pd
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import Metadata


def train_ctgan(
    df_slice: pd.DataFrame,
    *,
    service_code: str,
    epochs: int = 200,
    outdir: Path = Path("checkpoints_3/models"),
) -> Tuple[CTGANSynthesizer, Path]:
    """Train a CTGAN for a fixed number of epochs and save the model."""

    outdir.mkdir(parents=True, exist_ok=True)

    # Build table metadata for the slice
    metadata = Metadata.detect_from_dataframes({"table": df_slice})

    # Configure and fit CTGAN
    synth = CTGANSynthesizer(
        metadata,
        epochs=epochs,
        batch_size=1024,
        pac=8,
        embedding_dim=128,
        generator_dim=(256, 256),
        discriminator_dim=(256, 256),
        generator_lr=2e-4,
        discriminator_lr=2e-4,
        cuda=True,
        verbose=True,
    )
    synth.fit(df_slice)

    # Save the trained model
    model_path = outdir / f"{service_code}_final.pkl"
    synth.save(str(model_path))

    return synth, model_path


if __name__ == "__main__":
    DATA_PATH = "filtered_data/df_v2_filtered.pkl"
    TOP_K = 5

    df = pd.read_pickle(DATA_PATH)
    codes = df["service_code_description"].value_counts().head(TOP_K).index.tolist()

    print(f"\nTop-{TOP_K} service codes: {codes}\n")

    for code in codes:
        slice_df = df[df["service_code_description"] == code].copy()
        print(f"\nTraining CTGAN for '{code}' ({len(slice_df):,} rows, 200 epochs)")

        _, model_path = train_ctgan(
            slice_df,
            service_code=code,
            epochs=200,
            outdir=Path("checkpoints_3/models"),
        )

        print(f"Model saved to {model_path}")



Top-5 service codes: ['ASSIST MEMBER OF THE PUBLIC', 'CONCERN FOR PERSON', 'DISTURBANCE', 'THEFT', 'DOMESTIC INCIDENT']


Training CTGAN for 'ASSIST MEMBER OF THE PUBLIC' (169,406 rows, 200 epochs)



We strongly recommend saving the metadata using 'save_to_json' for replicability in future SDV versions.


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`

Gen. (-0.08) | Discrim. (-0.20): 100%|██████████| 200/200 [15:10<00:00,  4.55s/it]


Model saved to checkpoints_3/models/ASSIST MEMBER OF THE PUBLIC_final.pkl

Training CTGAN for 'CONCERN FOR PERSON' (121,876 rows, 200 epochs)



We strongly recommend saving the metadata using 'save_to_json' for replicability in future SDV versions.


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`

Gen. (-0.57) | Discrim. (-0.05): 100%|██████████| 200/200 [10:47<00:00,  3.24s/it]


Model saved to checkpoints_3/models/CONCERN FOR PERSON_final.pkl

Training CTGAN for 'DISTURBANCE' (90,395 rows, 200 epochs)



We strongly recommend saving the metadata using 'save_to_json' for replicability in future SDV versions.


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`

Gen. (-0.63) | Discrim. (-0.07): 100%|██████████| 200/200 [08:00<00:00,  2.40s/it]


Model saved to checkpoints_3/models/DISTURBANCE_final.pkl

Training CTGAN for 'THEFT' (87,517 rows, 200 epochs)



We strongly recommend saving the metadata using 'save_to_json' for replicability in future SDV versions.


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`

Gen. (-0.21) | Discrim. (-0.04): 100%|██████████| 200/200 [07:47<00:00,  2.34s/it]


Model saved to checkpoints_3/models/THEFT_final.pkl

Training CTGAN for 'DOMESTIC INCIDENT' (79,291 rows, 200 epochs)



We strongly recommend saving the metadata using 'save_to_json' for replicability in future SDV versions.


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`

Gen. (-0.34) | Discrim. (0.00): 100%|██████████| 200/200 [07:01<00:00,  2.11s/it] 

Model saved to checkpoints_3/models/DOMESTIC INCIDENT_final.pkl





## Making more synth data with 200 epoch models

In [1]:
import pandas as pd

pd.options.display.max_rows = 4000

# Load your filtered dataset from pickle
DATA_PATH = "filtered_data/df_v2_filtered.pkl"

df = pd.read_pickle(DATA_PATH)

In [2]:
import pandas as pd

# Example: assume df is your full incidents DataFrame
cutoff = 5000

# Identify categories below the cutoff
counts = df['service_code_description'].value_counts()
low_categories = counts[counts < cutoff].index

# Replace low-frequency categories with "OTHER"
df['service_code_description'] = df['service_code_description'].replace(low_categories, 'OTHER')

# Optional: check new distribution
print(df['service_code_description'].value_counts().sort_values(ascending=False))

service_code_description
ASSIST MEMBER OF THE PUBLIC    169406
CONCERN FOR PERSON             121876
DISTURBANCE                     90395
THEFT                           87517
DOMESTIC INCIDENT               79291
ROAD TRAFFIC MATTER             75033
PUBLIC NUISANCE                 59037
ROAD TRAFFIC COLLISION          58259
OTHER                           44717
ABANDONED/SILENT 999 CALL       41102
EXTERNAL AGENCY REQUEST         40557
ASSAULT                         39608
NOISE                           36849
DAMAGE                          33240
NEIGHBOUR DISPUTE               30507
FALSE CALL                      30413
COMMUNICATIONS                  29706
MISSING PERSON/ABSCONDER        27140
SUSPECT PERSONS                 25141
POLICE INFORMATION              20941
FRAUD INCIDENT                  20360
ROAD TRAFFIC OFFENCE            18563
CHILD PROTECTION                17091
DRUGS/SUBSTANCE MISUSE          16948
VEHICLE CRIME                   15139
DOMESTIC BAIL CHECK      

In [3]:
from pathlib import Path
from typing import Tuple
import pandas as pd
import ray
import torch
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata


def train_ctgan(df_slice: pd.DataFrame, service_code: str, epochs: int, outdir: Path) -> Path:
    """Train a CTGAN on one slice (GPU assumed available) and save the model."""
    outdir.mkdir(parents=True, exist_ok=True)

    meta = SingleTableMetadata()
    meta.detect_from_dataframe(df_slice)
    meta.validate()

    synth = CTGANSynthesizer(
        meta,
        epochs=epochs,
        batch_size=1024,
        pac=8,
        embedding_dim=128,
        generator_dim=(256, 256),
        discriminator_dim=(256, 256),
        generator_lr=2e-4,
        discriminator_lr=2e-4,
        cuda=True,
        verbose=False,
    )
    synth.fit(df_slice)

    model_path = outdir / f"{service_code}_final.pkl"
    synth.save(str(model_path))
    return model_path


@ray.remote(num_gpus=1)  # One GPU per slice
def train_actor(df_slice: pd.DataFrame, service_code: str, epochs: int, outdir: str) -> str:
    torch.cuda.set_device(0)  # Ray handles GPU assignment
    return str(train_ctgan(df_slice, service_code, epochs, Path(outdir)))


if __name__ == "__main__":
    ray.init()
    CATEGORY_COL = "service_code_description"
    EPOCHS = 200
    OUTDIR = "checkpoints_4/models"

    categories = df[CATEGORY_COL].value_counts().index.tolist()

    print(f"Launching CTGAN training for {len(categories)} slices on multiple GPUs...")

    futures = []
    for code in categories:
        slice_df = df[df[CATEGORY_COL] == code].copy()
        futures.append(train_actor.remote(slice_df, code, EPOCHS, OUTDIR))

    results = ray.get(futures)

    for r in results:
        print(f"✅ Saved: {r}")

    ray.shutdown()

2025-08-09 08:19:21,013	INFO worker.py:1927 -- Started a local Ray instance.


Launching CTGAN training for 36 slices on multiple GPUs...


[36m(train_actor pid=294117)[0m 
[36m(train_actor pid=294117)[0m The 'SingleTableMetadata' is deprecated. Please use the new 'Metadata' class for synthesizers.
[36m(train_actor pid=294117)[0m 
[36m(train_actor pid=294117)[0m 
[36m(train_actor pid=294117)[0m We strongly recommend saving the metadata using 'save_to_json' for replicability in future SDV versions.
[36m(train_actor pid=294117)[0m 
[36m(train_actor pid=294117)[0m 
[36m(train_actor pid=294117)[0m Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
[36m(train_actor pid=294117)[0m 
[36m(train_actor pid=294117)[0m 
[36m(train_actor pid=294117)[0m resource_tracker: process died unexpectedly, relaunching.  Some folders/sempahores might leak.
[36m(train_actor pid=294117)[0m 
[36m(train_actor pid=294117)[0m Traceb

RayTaskError(FileNotFoundError): [36mray::train_actor()[39m (pid=299455, ip=10.131.34.65)
  File "/tmp/dcs-tmp.u2164966/ipykernel_293942/693837314.py", line 41, in train_actor
  File "/tmp/dcs-tmp.u2164966/ipykernel_293942/693837314.py", line 34, in train_ctgan
  File "/dcs/large/u2164966/MSc_Dissertation/myenv/lib64/python3.12/site-packages/sdv/single_table/base.py", line 695, in save
    with open(filepath, 'wb') as output:
         ^^^^^^^^^^^^^^^^^^^^
FileNotFoundError: [Errno 2] No such file or directory: 'checkpoints_4/models/ABANDONED/SILENT 999 CALL_final.pkl'

[36m(train_actor pid=299709)[0m 
[36m(train_actor pid=299709)[0m The 'SingleTableMetadata' is deprecated. Please use the new 'Metadata' class for synthesizers.
[36m(train_actor pid=299709)[0m 
[36m(train_actor pid=299709)[0m 
[36m(train_actor pid=299709)[0m We strongly recommend saving the metadata using 'save_to_json' for replicability in future SDV versions.
[36m(train_actor pid=299709)[0m 
[36m(train_actor pid=299709)[0m 
[36m(train_actor pid=299709)[0m Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
[36m(train_actor pid=299709)[0m 


In [3]:
from pathlib import Path
import pandas as pd
import ray
import torch
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata


def sanitize_filename(name: str) -> str:
    """Replace illegal filename characters with underscores."""
    return name.replace("/", "_").replace("\\", "_")


def train_ctgan(df_slice: pd.DataFrame, service_code: str, epochs: int, outdir: Path) -> Path:
    """Train a CTGAN on one slice (GPU assumed available) and save the model."""
    outdir.mkdir(parents=True, exist_ok=True)

    meta = SingleTableMetadata()
    meta.detect_from_dataframe(df_slice)
    meta.validate()

    synth = CTGANSynthesizer(
        meta,
        epochs=epochs,
        batch_size=1024,
        pac=8,
        embedding_dim=128,
        generator_dim=(256, 256),
        discriminator_dim=(256, 256),
        generator_lr=2e-4,
        discriminator_lr=2e-4,
        cuda=True,
        verbose=False,
    )
    synth.fit(df_slice)

    safe_name = sanitize_filename(service_code)
    model_path = outdir / f"{safe_name}_final.pkl"
    synth.save(str(model_path))
    return model_path


@ray.remote(num_gpus=1)
def train_actor(df_slice: pd.DataFrame, service_code: str, epochs: int, outdir: str) -> str:
    torch.cuda.set_device(0)
    return str(train_ctgan(df_slice, service_code, epochs, Path(outdir)))


if __name__ == "__main__":
    ray.init()
    CATEGORY_COL = "service_code_description"
    EPOCHS = 200
    OUTDIR = "checkpoints_4/models"

    target_codes = [
        "ABANDONED/SILENT 999 CALL",
        "MISSING PERSON/ABSCONDER",
        "DRUGS/SUBSTANCE MISUSE",
        "BAIL/CURFEW/ADDRESS CHECKS"
    ]

    print(f"Launching CTGAN training for {len(target_codes)} missing slices on multiple GPUs...")

    futures = []
    for code in target_codes:
        slice_df = df[df[CATEGORY_COL] == code].copy()
        if slice_df.empty:
            print(f"⚠️ Skipping {code}: no data found.")
            continue
        futures.append(train_actor.remote(slice_df, code, EPOCHS, OUTDIR))

    results = ray.get(futures)

    for r in results:
        print(f"✅ Saved: {r}")

    ray.shutdown()

2025-08-11 15:41:03,131	INFO worker.py:1927 -- Started a local Ray instance.


Launching CTGAN training for 4 missing slices on multiple GPUs...


[36m(train_actor pid=56711)[0m 
[36m(train_actor pid=56711)[0m The 'SingleTableMetadata' is deprecated. Please use the new 'Metadata' class for synthesizers.
[36m(train_actor pid=56711)[0m 
[36m(train_actor pid=56711)[0m 
[36m(train_actor pid=56711)[0m We strongly recommend saving the metadata using 'save_to_json' for replicability in future SDV versions.
[36m(train_actor pid=56711)[0m 
[36m(train_actor pid=56711)[0m 
[36m(train_actor pid=56711)[0m Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
[36m(train_actor pid=56711)[0m 
[36m(train_actor pid=56711)[0m 
[36m(train_actor pid=56711)[0m resource_tracker: process died unexpectedly, relaunching.  Some folders/sempahores might leak.
[36m(train_actor pid=56711)[0m 
[36m(train_actor pid=56711)[0m Traceback (most rec

✅ Saved: checkpoints_4/models/ABANDONED_SILENT 999 CALL_final.pkl
✅ Saved: checkpoints_4/models/MISSING PERSON_ABSCONDER_final.pkl
✅ Saved: checkpoints_4/models/DRUGS_SUBSTANCE MISUSE_final.pkl
✅ Saved: checkpoints_4/models/BAIL_CURFEW_ADDRESS CHECKS_final.pkl


[36m(train_actor pid=56710)[0m 
[36m(train_actor pid=56710)[0m 
[36m(train_actor pid=56710)[0m resource_tracker: process died unexpectedly, relaunching.  Some folders/sempahores might leak.
