# Mock datasets
Ben Pennell, MPIA

February 20th, 2026
_____


In [None]:
import numpy as np
from tqdm.notebook import tqdm
from astropy.table import Table
import pickle
import SyntheticData as sd
from datetime import datetime, date
from utils.utils import *
import pandas as pd
import json

In [None]:
# Load the notebook settings from config.json. Only the "output_path" entry is
# read here; generate_dataset prepends it to the filename of each pickled dataset.
with open("config.json") as f:
    d = json.load(f)
output_dir = d["output_path"]

Open the M-dwarf catalogue and build a lower-memory copy (`all_mdwarfs`) that keeps only the relevant columns.

In [28]:
mdwarfs = Table.read('./data/200pc_mdwarfs_reduced.fits', format='fits')
relevant_list = ["parallax", "mass", "phot_g_mean_mag", "solution_type"]
# Keep one small dict of floats per catalogue row. The "mass" key is read from
# the "mass_single" column; every other key shares its name with the column.
all_mdwarfs = [
    {key: float(entry["mass_single" if key == "mass" else key]) for key in relevant_list}
    for entry in tqdm(mdwarfs)
]

  0%|          | 0/225536 [00:00<?, ?it/s]

In [27]:
def generate_dataset(name, object_count=250000, binary_fraction=0.3, binarity_model=None, period_model=(4,1.3), mass_model=0.5, ecc_type="turnover", save=True):
    """Create a synthetic dataset and either return it or pickle it to disk.

    The model arguments are recorded under ``"meta"`` and forwarded unchanged to
    ``sd.create_synthetic_data`` together with the notebook-level ``mdwarfs``
    catalogue; the generated objects are stored under ``"data"``.

    Parameters
    ----------
    name : str
        Label used in the output filename ``<output_dir><today>-<name>.pkl``.
    object_count, binary_fraction, binarity_model, period_model, mass_model, ecc_type
        Passed straight through to ``sd.create_synthetic_data``; their exact
        meaning is defined there.
    save : bool
        If False, return the dataset instead of writing it.

    Returns
    -------
    dict or None
        ``{"meta": ..., "data": ...}`` when ``save`` is False; otherwise None,
        after the dictionary has been pickled.
    """
    outdata = {
        "meta": {
            "timestamp": datetime.now(),
            "object_count": object_count,
            "binary_fraction": binary_fraction,
            # Stored as text so the metadata never holds the callable itself.
            "binarity_model": f"{binarity_model}",
            "period_model": period_model,
            "mass_model": mass_model,
            "ecc_type": ecc_type,
        }
    }
    outdata["data"] = sd.create_synthetic_data(object_count=object_count, catalogue=mdwarfs, binary_fraction=binary_fraction, binarity_model=binarity_model,
                                               period_model=period_model, mass_model=mass_model, ecc_type=ecc_type)

    if not save:
        return outdata

    # NOTE(review): plain string concatenation, so output_dir must already end
    # with a path separator -- confirm against config.json.
    # The context manager closes the file even if pickling raises.
    with open(output_dir+f'{date.today()}-{name}.pkl', "wb") as outfile:
        pickle.dump(outdata, outfile)

In [None]:
# Small in-memory test set; save=False returns the dataset instead of pickling it.
dset = generate_dataset(
    name="test",
    object_count=25000,
    save=False,
)

In [20]:
# Summarise how often each solution type occurs in the synthetic test dataset.
# The solution_type codes 0, 5, 7, 9, 12 pair up, in order, with col_labels below.
t_loaded = np.array(dset["data"])
# Count the matches directly. Recovering counts from the float percentages with
# int(rate/100*N) can truncate to one below the true value (e.g. 29 of 100 -> 28).
counts = [sum(1 for t in t_loaded if t["solution_type"] == soltype) for soltype in [0,5,7,9,12]]
rates = [count/len(t_loaded)*100 for count in counts]
grid = [
    rates,
    counts
]
row_labels = ["Rate (%)", "Counts"]
col_labels = ["low RUWE", "high RUWE", "Acceleration", "Jerk", "Full Orbit"]
df = pd.DataFrame(grid, index=row_labels, columns=col_labels)
# Counts are shown as whole numbers, rates with two decimals.
df.style.format(
    "{:.0f}",
    subset=pd.IndexSlice["Counts", :]
).format(
    "{:.2f}",
    subset=pd.IndexSlice["Rate (%)", :]
)

Unnamed: 0,low RUWE,high RUWE,Acceleration,Jerk,Full Orbit
Rate (%),95.38,3.74,0.52,0.13,0.23
Counts,18537.0,726.0,102.0,26.0,44.0


In [11]:
# Same summary for the input catalogue, for comparison with the synthetic data.
# The solution_type codes 0, 5, 7, 9, 12 pair up, in order, with col_labels below.
t_loaded = mdwarfs
# Count the matching rows directly. Recovering counts from the float percentages
# with int(rate/100*N) can truncate to one below the true value.
counts = [len(t_loaded[t_loaded["solution_type"] == soltype]) for soltype in [0,5,7,9,12]]
rates = [count/len(t_loaded)*100 for count in counts]
grid = [
    rates,
    counts
]
row_labels = ["Rate (%)", "Counts"]
col_labels = ["low RUWE", "high RUWE", "Acceleration", "Jerk", "Full Orbit"]
df = pd.DataFrame(grid, index=row_labels, columns=col_labels)
# Counts are shown as whole numbers, rates with two decimals.
df.style.format(
    "{:.0f}",
    subset=pd.IndexSlice["Counts", :]
).format(
    "{:.2f}",
    subset=pd.IndexSlice["Rate (%)", :]
)

Unnamed: 0,low RUWE,high RUWE,Acceleration,Jerk,Full Orbit
Rate (%),96.5,3.27,0.14,0.03,0.06
Counts,217641.0,7369.0,306.0,74.0,146.0


## A 'fiducial' dataset

The main dataset to make will be one of "conventional wisdom", and all other datasets will just be slight modifications to this one

In [29]:
# Fiducial run: every model option is left at its generate_dataset preset.
generate_dataset(
    name="reference_vol",
    object_count=250000,
)

Computing Binaries:   0%|          | 0/58560 [00:00<?, ?it/s]

## Variable binary fraction

Supply a function that returns the binary fraction as a function of mass.

In [5]:
# Binary fractions at the two ends of MASS_RANGE (first entry pairs with first entry).
REF_FRACTIONS = (0.25, 0.5)
# Masses at which the reference fractions apply -- presumably solar masses; TODO confirm.
MASS_RANGE = (0.2, 0.4)
def bin_frac(m):
    """Return the binary fraction for mass ``m`` on a straight line.

    The line passes through (MASS_RANGE[0], REF_FRACTIONS[0]) and
    (MASS_RANGE[1], REF_FRACTIONS[1]). Nothing is clamped, so masses outside
    MASS_RANGE are extrapolated along the same line.
    """
    mass_lo, mass_hi = MASS_RANGE
    frac_lo, frac_hi = REF_FRACTIONS
    # Position of m within the mass range: 0 at the lower end, 1 at the upper end.
    position = (m - mass_lo) / (mass_hi - mass_lo)
    return position * (frac_hi - frac_lo) + frac_lo
# Pass the mass-dependent model; the constant binary_fraction is explicitly set to None.
generate_dataset(
    name="variable_binarity",
    binary_fraction=None,
    binarity_model=bin_frac,
)

Computing Binaries:   0%|          | 0/99992 [00:00<?, ?it/s]

## Period test

Compare different period distributions by varying `period_model`.

In [7]:
# One 50000-object dataset per alternative period_model setting.
for dataset_name, period_params in (("periods_42", (4, 2)), ("periods_513", (5, 1.3))):
    generate_dataset(name=dataset_name, object_count=50000, period_model=period_params)

Computing Binaries:   0%|          | 0/15140 [00:00<?, ?it/s]

Computing Binaries:   0%|          | 0/15096 [00:00<?, ?it/s]

## Mass ratio test

uniform vs. steep power law mass ratio distribution

In [None]:
# One 50000-object dataset per alternative mass_model setting.
for dataset_name, q_model in (("flat_q", None), ("gamma1", 1)):
    generate_dataset(name=dataset_name, object_count=50000, mass_model=q_model)

Computing Binaries:   0%|          | 0/15092 [00:00<?, ?it/s]

## Eccentricity test

circular vs thermal eccentricities :)

In [9]:
# Each dataset is named after the eccentricity model it uses.
for ecc_choice in ("circular", "thermal"):
    generate_dataset(name=ecc_choice, object_count=50000, ecc_type=ecc_choice)

Computing Binaries:   0%|          | 0/14879 [00:00<?, ?it/s]

Computing Binaries:   0%|          | 0/15129 [00:00<?, ?it/s]