# Toy Dataset Generation

This notebook generates two synthetic datasets, a training set and a test set,
that match the expected input schema of the `ct_residual_disease` pipeline.
Both are written as CSV files to `../data/`.

#### Creation of the training set

In [2]:
# Imports
from pathlib import Path

import numpy as np
import pandas as pd

# Reproducibility: a fixed seed makes every run produce the same CSV.
# The order of the rng calls below determines the generated values, so do not
# reorder them unless a different dataset is acceptable.
rng = np.random.default_rng(seed=42)

N_SAMPLES = 50
TRAIN_PATH = Path("../data/toy_dataset_training.csv")

# IDs are "ID_01" ... "ID_50"; StudyID is the numeric part of the ID.
ids = [f"ID_{i:02d}" for i in range(1, N_SAMPLES + 1)]
study_ids = [int(i.split("_")[1]) for i in ids]

# TRG is drawn uniformly from {1, 2, 3, 4} (upper bound is exclusive).
trg = rng.integers(1, 5, size=N_SAMPLES)

# Residual_Disease is 0 only when TRG == 1, and 1 for every other TRG value.
residual_disease = np.where(trg == 1, 0, 1)

# If Residual_Disease = 0, volume between 0 and 35
# If Residual_Disease = 1, volume between 15 and 70
# Both candidate arrays are always drawn in full (low range first), and
# np.where then picks one value per row according to the label.
volume_component = np.where(
    residual_disease == 0,
    rng.uniform(0, 35, size=N_SAMPLES),
    rng.uniform(15, 70, size=N_SAMPLES),
)

# Either 0 or 1 (upper bound is exclusive).
num_component = rng.integers(0, 2, size=N_SAMPLES)

fpc = rng.choice([True, False], size=N_SAMPLES)

histology = rng.choice(
    ["Adenocarcinoma", "Squamous cell carcinoma"],
    size=N_SAMPLES,
)

# Age between 45 and 85 (inclusive)
age = rng.integers(45, 86, size=N_SAMPLES)

gender = rng.choice(["Male", "Female"], size=N_SAMPLES)

# No T1, out of scope
cT = rng.choice(["T2", "T3", "T4"], size=N_SAMPLES)

cN = rng.choice(["N0", "N1", "N2", "N3"], size=N_SAMPLES)

toy_df = pd.DataFrame(
    {
        "ID": ids,
        "StudyID": study_ids,
        "Num_Component": num_component,
        "Volume_Component": volume_component,
        "FPC": fpc,
        "Histology": histology,
        "TRG": trg,
        "Residual_Disease": residual_disease,
        "Age": age,
        "Gender": gender,
        "cT": cT,
        "cN": cN,
    }
)

# Create the output directory if needed so the cell also runs on a fresh
# checkout where ../data does not exist yet.
TRAIN_PATH.parent.mkdir(parents=True, exist_ok=True)
toy_df.to_csv(TRAIN_PATH, index=False)

# Keep the preview as the last expression: a notebook only displays the value
# of the final expression in a cell.
toy_df.head()


#### Creation of the test set

In [3]:
# Imports
from pathlib import Path

import numpy as np
import pandas as pd

# Reproducibility. The seed must differ from the one used for the training
# set (42): with the same seed and the same sequence of draws, every column
# except Volume_Component would be identical to the training data, and the
# volume would be the same underlying draw merely rescaled.
TEST_SEED = 43
rng = np.random.default_rng(seed=TEST_SEED)

N_SAMPLES = 50
TEST_PATH = Path("../data/toy_dataset_test.csv")

# IDs are "ID_01" ... "ID_50"; StudyID is the numeric part of the ID.
# NOTE(review): these IDs are the same strings as in the training CSV;
# confirm the pipeline does not require IDs to be unique across both files.
ids = [f"ID_{i:02d}" for i in range(1, N_SAMPLES + 1)]
study_ids = [int(i.split("_")[1]) for i in ids]

# TRG is drawn uniformly from {1, 2, 3, 4} (upper bound is exclusive).
trg = rng.integers(1, 5, size=N_SAMPLES)

# Residual_Disease is 0 only when TRG == 1, and 1 for every other TRG value.
residual_disease = np.where(trg == 1, 0, 1)

# If Residual_Disease = 0, volume between 0 and 40
# If Residual_Disease = 1, volume between 10 and 70
# NOTE(review): these ranges are wider than the training ones (0-35 / 15-70);
# confirm that the extra overlap between classes is intentional.
volume_component = np.where(
    residual_disease == 0,
    rng.uniform(0, 40, size=N_SAMPLES),
    rng.uniform(10, 70, size=N_SAMPLES),
)

# Either 0 or 1 (upper bound is exclusive).
num_component = rng.integers(0, 2, size=N_SAMPLES)

fpc = rng.choice([True, False], size=N_SAMPLES)

histology = rng.choice(
    ["Adenocarcinoma", "Squamous cell carcinoma"],
    size=N_SAMPLES,
)

# Age between 45 and 85 (inclusive)
age = rng.integers(45, 86, size=N_SAMPLES)

gender = rng.choice(["Male", "Female"], size=N_SAMPLES)

# No T1, out of scope
cT = rng.choice(["T2", "T3", "T4"], size=N_SAMPLES)

cN = rng.choice(["N0", "N1", "N2", "N3"], size=N_SAMPLES)

toy_df = pd.DataFrame(
    {
        "ID": ids,
        "StudyID": study_ids,
        "Num_Component": num_component,
        "Volume_Component": volume_component,
        "FPC": fpc,
        "Histology": histology,
        "TRG": trg,
        "Residual_Disease": residual_disease,
        "Age": age,
        "Gender": gender,
        "cT": cT,
        "cN": cN,
    }
)

# Create the output directory if needed so the cell also runs on a fresh
# checkout where ../data does not exist yet.
TEST_PATH.parent.mkdir(parents=True, exist_ok=True)
toy_df.to_csv(TEST_PATH, index=False)

# Keep the preview as the last expression: a notebook only displays the value
# of the final expression in a cell.
toy_df.head()
