In [1]:
from generator import generate_synthetic_data
import logging
import sys
import json 
import pickle

### Setup Logging

In [2]:
# set logging
logging.getLogger().handlers.clear()
logging.basicConfig(
    level=logging.DEBUG,  # Set to DEBUG
    format='%(levelname)s: %(message)s',
    stream=sys.stdout,
    force=True
)
logging.getLogger().setLevel(logging.INFO)

### Load JSON and Additional Required Data

In [17]:
data_path = "../../synthetic_data/json/all_chapters.json"
with open(data_path, "r", encoding="utf-8") as f:
    json_data = json.load(f)

with open("../../synthetic_data/noise_toc.pkl", "rb") as f:
    toc_noise = pickle.load(f)

### Generate

In [None]:
# Randomness parameters for synthetic data generation
subchapters_random_noise_param = [.1, .3]
chapters_random_noise_param = [.1, .3]
chapters_systemic_noise_param = [.1, .3]
add_subchapters_param = [.2, .4]

In [None]:
n_iterations = 10
synthetic_data = []

for iteration in range(n_iterations):
    for chapters_random_noise in chapters_random_noise_param:
        for subchapters_random_noise in subchapters_random_noise_param:
            for chapters_systemic_noise in chapters_systemic_noise_param:
                for add_subchapters in add_subchapters_param:
                    randomness_parameters = {
                        "subchapter_random_noise": subchapters_random_noise,
                        "chapter_random_noise": chapters_random_noise,
                        "chapter_systemic_noise": chapters_systemic_noise,
                        "add_subchapter": add_subchapters
                    }
                    logging.info(f"Using TOC noise: {toc_noise}")
                    # Generate synthetic data for the current noise configuration
                    for iteration in range(n_iterations):
                        logging.info(f"Iteration {iteration + 1} of {n_iterations}")
                        # Generate synthetic data
                        synthetic_data.extend(generate_synthetic_data(
                            json_data=json_data,
                            toc_noise=toc_noise,
                            **randomness_parameters
                        ))
                        

INFO: Using TOC noise: ['Understanding the fundamental principles of modern economics', 'Historical development of computational methods in science', 'Key concepts and definitions for advanced practitioners', 'Theoretical framework for analyzing complex systems', 'Research methods and approaches in contemporary studies', 'Data collection techniques for large-scale surveys', 'Statistical analysis methods for experimental research', 'Results and findings from the longitudinal study', 'Interpretation of results using advanced modeling techniques', 'Practical applications in real-world business scenarios', 'Case study analysis of successful implementation strategies', 'Implementation strategies for organizational change management', 'Best practices and guidelines for sustainable development', 'Common challenges and solutions in project management', 'Future directions and emerging trends in technology', 'Comparative analysis of different methodological approaches', 'Literature review and co

In [18]:
len(json_data)

40

In [None]:
# Save the generated data
output_path = f"../synthetic_data/json/synthetic_data_{iteration + 1}.json"
with open(output_path, "w", encoding="utf-8") as f:
    pickle.dump(synthetic_data, f, indent=4, ensure_ascii=False)

logging.info(f"Saved synthetic data to {output_path}")
