In [45]:
import itertools
import json
import logging
import os
import pickle
import random
import sys

from generator import generate_synthetic_data

### Setup Logging

In [14]:
# Configure root logging: plain "LEVEL: message" records written to stdout.
# The effective threshold is WARNING, so the logging.info(...) progress
# messages in later cells stay silent unless this level is lowered.
root_logger = logging.getLogger()
root_logger.handlers.clear()
logging.basicConfig(
    stream=sys.stdout,
    format='%(levelname)s: %(message)s',
    level=logging.WARNING,
    force=True,  # replace any handlers left over from a previous run of this cell
)

### Load JSON and Additional Required Data

In [27]:
# Inputs for the generator: the source JSON and the pickled TOC noise that is
# later passed to generate_synthetic_data as `toc_noise`.
data_path = "../../synthetic_data/json/all_full.json"
noise_path = "../../synthetic_data/noise_toc.pkl"

with open(data_path, mode="r", encoding="utf-8") as json_file:
    json_data = json.load(json_file)

# NOTE(review): pickle.load can execute arbitrary code while unpickling —
# only load this file if it comes from a trusted source.
with open(noise_path, mode="rb") as noise_file:
    toc_noise = pickle.load(noise_file)

### Generate

In [28]:
# Value grids swept during generation. Each list supplies the candidates for
# one keyword argument of generate_synthetic_data (see the generation cell).
subchapters_random_noise_param = [0.1, 0.2, 0.3]  # -> subchapter_random_noise
chapters_random_noise_param = [0.1, 0.3]          # -> chapter_random_noise
# NOTE(review): 0.1 is listed twice, so that setting is generated twice as
# often as 0.3 — confirm the weighting is intended and not a typo.
chapters_systemic_noise_param = [0.1, 0.1, 0.3]   # -> chapter_systemic_noise
add_subchapters_param = [0.2, 0.3]                # -> add_subchapter

In [None]:
# Sweep every combination of the noise parameter grids defined in the
# previous cell (the *_param lists) and call the generator n_iterations times
# for each combination, collecting everything into synthetic_data.

# Number of generator calls per noise configuration
n_iterations = 4
synthetic_data = []

# toc_noise is identical for every configuration, so it is reported once
# here instead of once per configuration. Lazy %-style arguments avoid
# formatting the object when INFO records are filtered out.
logging.info("Using TOC noise: %s", toc_noise)

# itertools.product varies the right-most grid fastest, which is the same
# visiting order as four nested for-loops with chapters_random_noise_param
# outermost and add_subchapters_param innermost.
noise_grid = itertools.product(
    chapters_random_noise_param,
    subchapters_random_noise_param,
    chapters_systemic_noise_param,
    add_subchapters_param,
)

for (
    chapters_random_noise,
    subchapters_random_noise,
    chapters_systemic_noise,
    add_subchapters,
) in noise_grid:
    # Keyword arguments forwarded unchanged to generate_synthetic_data
    randomness_parameters = {
        "subchapter_random_noise": subchapters_random_noise,
        "chapter_random_noise": chapters_random_noise,
        "chapter_systemic_noise": chapters_systemic_noise,
        "add_subchapter": add_subchapters,
    }
    logging.info("Noise configuration: %s", randomness_parameters)

    # Repeat the generator call for the current noise configuration
    for iteration in range(n_iterations):
        logging.info("Iteration %d of %d", iteration + 1, n_iterations)
        synthetic_data.extend(
            generate_synthetic_data(
                json_data=json_data,
                toc_noise=toc_noise,
                **randomness_parameters,
            )
        )
                    

### Final Processing and Storage

In [None]:
# Merge the generated samples and keep a random subset for training.
#
# The generator was run independently for the "full" data and the "chapters"
# data: after the "full" run the result was stored with
# `data_full = synthetic_data`, and the current run is treated as the
# "chapters" data. `data_full` therefore only exists if that earlier run
# happened in this kernel session.
data_chapters = synthetic_data.copy()

try:
    data_full
except NameError:
    # Fresh kernel / Run All: fall back to the current run only instead of
    # failing with a NameError, but make the missing half visible.
    logging.warning(
        "data_full is not defined in this kernel; "
        "sampling from the current run only"
    )
    data_full = []

all_data = data_full + data_chapters
n_data_to_keep = 10000
SAMPLING_SEED = 42  # fixed seed: the same all_data always yields the same subset

# random.sample raises ValueError when asked for more items than exist,
# so clamp the request to what was actually generated.
sample_size = min(n_data_to_keep, len(all_data))
if sample_size < n_data_to_keep:
    logging.warning(
        "Only %d samples available, fewer than the %d requested; keeping all of them",
        len(all_data),
        n_data_to_keep,
    )

# A private Random instance keeps the seed from affecting the global
# `random` state used elsewhere.
final_data = random.Random(SAMPLING_SEED).sample(all_data, sample_size)

In [None]:
# Persist the sampled dataset as a pickle file.
output_path = "../../synthetic_data/training_dataset/synthetic_toc.pkl"

# Create the target directory if it is missing so that a fresh checkout does
# not fail with FileNotFoundError after the (expensive) generation step.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, "wb") as f:
    pickle.dump(final_data, f)

logging.info("Saved synthetic data to %s", output_path)
