In [1]:
import json
import random
import numpy as np
import logging
import sys
import string
import warnings
import pickle 


In [2]:
# Load the two inputs used by the rest of the notebook.
# `json_data` is the parsed book collection: it is iterated book by book below,
# and each book is read through its "name", "chapters_counts" and "chapters" keys.
data_path = "../synthetic_data/json/all_full.json"
with open(data_path, "r", encoding="utf-8") as f:
    json_data = json.load(f)

# `toc_noise` is the pool of noise text that the noise generators sample from
# with np.random.choice.
# SECURITY NOTE: pickle.load can execute arbitrary code while deserialising;
# only load this file if it comes from a trusted source.
with open("../synthetic_data/noise_toc.pkl", "rb") as f:
    toc_noise = pickle.load(f)

In [172]:
# Inspection only: prints every top-level entry of json_data in full.
# NOTE(review): this dumps the whole dataset into the cell output; consider
# previewing a single entry instead before sharing the notebook.
for i in json_data:
    print(i)

{'id': 1, 'name': 'Fine Art Techniques and Materials', 'chapters_counts': 7, 'chapters': [{'number': 1, 'title': 'Oil Painting Mastery', 'subchapter_count': 10, 'subchapters': [{'number': '1.1', 'title': 'Canvas Preparation and Priming Techniques'}, {'number': '1.2', 'title': 'Color Mixing and Palette Management'}, {'number': '1.3', 'title': 'Brushwork and Paint Application Methods'}, {'number': '1.4', 'title': 'Glazing and Scumbling Techniques'}, {'number': '1.5', 'title': 'Portrait Painting and Flesh Tones'}, {'number': '1.6', 'title': 'Landscape Composition and Atmosphere'}, {'number': '1.7', 'title': 'Still Life Arrangement and Lighting'}, {'number': '1.8', 'title': 'Impasto and Texture Creation'}, {'number': '1.9', 'title': 'Varnishing and Preservation Methods'}, {'number': '1.10', 'title': 'Common Mistakes and Troubleshooting'}]}, {'number': 2, 'title': 'Watercolor and Wet Media', 'subchapter_count': 10, 'subchapters': [{'number': '2.1', 'title': 'Paper Selection and Stretching'}

In [None]:
def generate_symbol_noise(min_len=1, max_len=3):
    """Return a short random string of punctuation/symbol characters.

    The length is drawn uniformly from [min_len, max_len] (both inclusive) and
    every character is sampled with replacement from a fixed symbol alphabet.
    """
    alphabet = "!@#$%^&*()[]{}|;:,.<>?/~`_+-="
    n_chars = random.randint(min_len, max_len)
    picked = random.choices(alphabet, k=n_chars)
    return "".join(picked)


def generate_number_noise(min_digits=1, max_digits=4):
    """Return a random string of decimal digits.

    The number of digits is drawn uniformly from [min_digits, max_digits]
    (both inclusive); leading zeros are possible because each digit is sampled
    independently.
    """
    n_digits = random.randint(min_digits, max_digits)
    digits = random.choices(string.digits, k=n_digits)
    return "".join(digits)


def generate_page_number_noise():
    """Generate a realistic-looking but random page-number string.

    One of six layouts is chosen uniformly ("Page N", "p. N", "pp. A-B", "N",
    "[N]", "(N)") and filled with page numbers drawn from 1..999.

    Returns:
        str: The formatted page-number noise. For the "pp. A-B" layout the two
        numbers are ordered so that A <= B (a page range never runs backwards).
    """
    templates = [
        "Page {}",
        "p. {}",
        "pp. {}-{}",
        "{}",
        "[{}]",
        "({})",
    ]
    template = random.choice(templates)

    if template == "pp. {}-{}":
        # Draw both ends, then order them so the range is plausible.
        first_page, last_page = sorted(random.randint(1, 999) for _ in range(2))
        return template.format(first_page, last_page)

    return template.format(random.randint(1, 999))


def generate_formatting_noise():
    """Return one randomly chosen filler line (leaders, rules, whitespace, bullets)."""
    fillers = (
        # leader / rule characters
        "...............",
        "_______________",
        "---------------",
        "===============",
        "***************",
        "###############",
        "|||||||||||||||",
        # pure whitespace
        "               ",
        "\t\t\t\t",
        # spaced decorative glyphs
        "• • • • • • • •",
        "→ → → → → → →",
        "※ ※ ※ ※ ※ ※",
    )
    picked = random.choice(fillers)
    return picked


def generate_text_noise(toc_noise):
    """Sample a random, shuffled subset of words from one noise entry.

    One entry is picked from ``toc_noise``, split on whitespace, and between one
    and all of its words are returned in random order, joined by single spaces.

    Args:
        toc_noise: Non-empty collection of noise entries; each entry is
            converted with ``str`` before splitting.

    Returns:
        str: The sampled words, or "" when the picked entry contains no words
        (empty or whitespace-only), which previously raised ValueError.
    """
    sampled_entry = str(np.random.choice(toc_noise, 1, replace=False)[0])
    words = sampled_entry.split()
    if not words:
        # random.randint(1, 0) would raise; an empty entry yields no noise text.
        return ""

    n_words_to_use = random.randint(1, len(words))
    # Sampling indices without replacement both selects and shuffles the words.
    word_ids = np.random.choice(len(words), n_words_to_use, replace=False)
    return ' '.join(words[idx] for idx in word_ids)


def generate_random_noise(chunk_type="symbols", toc_noise=toc_noise):
    """
    Produce one chunk of noise text of the requested kind.

    Args:
        chunk_type: Kind of noise to generate. Concrete kinds are "symbols",
                   "numbers", "page_numbers", "formatting" and "text".
                   "random" picks uniformly among all five concrete kinds;
                   "subchapters" picks uniformly among "symbols", "numbers"
                   and "text". Any other value raises KeyError.
        toc_noise: Pool of noise entries used by the "text" kind. The default
                   is the module-level ``toc_noise`` as it was when this
                   function was defined.

    Returns:
        str: The generated noise chunk.
    """
    # Resolve the two meta-kinds to a concrete kind first.
    if chunk_type == "random":
        chunk_type = random.choice(
            ["symbols", "numbers", "page_numbers", "formatting", "text"]
        )
    elif chunk_type == "subchapters":
        chunk_type = random.choice(["symbols", "numbers", "text"])

    dispatch = {
        "symbols": generate_symbol_noise,
        "numbers": generate_number_noise,
        "page_numbers": generate_page_number_noise,
        "formatting": generate_formatting_noise,
        # only the text generator needs the noise pool
        "text": lambda: generate_text_noise(toc_noise),
    }
    make_noise = dispatch[chunk_type]
    return str(make_noise())

In [None]:
def create_weighted_generator(max_number, max_ceiling=None, start=4, weight_range=(7, 11), weight_multiplier=3):
    """
    Draw one integer from [start, max_number] with a boosted sub-range.

    Every candidate has weight 1, except candidates ``n`` with
    ``weight_min < n <= weight_max`` (lower bound exclusive, upper bound
    inclusive), which get ``weight_multiplier``. Weights are normalised before
    sampling with ``np.random.choice``.

    Args:
        max_number: Largest candidate (inclusive) before the ceiling is applied.
        max_ceiling: Optional cap on ``max_number``; a falsy value (None or 0)
            disables the cap.
        start: Smallest candidate (inclusive).
        weight_range: ``(weight_min, weight_max)`` bounds of the boosted range.
        weight_multiplier: Weight given to candidates inside the boosted range.

    Returns:
        int: The sampled number.

    Raises:
        ValueError: If ``max_number`` or the applied ``max_ceiling`` is below
            ``start`` (the candidate range would be empty).

    Warns:
        UserWarning: When ``weight_max`` is greater than or equal to the
            effective ``max_number`` and is therefore clamped to it.
    """
    if max_number < start:
        raise ValueError(f"max_number ({max_number}) must be >= start ({start})")

    # Set a max range
    if max_ceiling and max_number > max_ceiling:
        max_number = max_ceiling
        # The ceiling can push the upper bound below `start`; fail with a clear
        # message instead of letting numpy choke on an empty candidate array.
        if max_number < start:
            raise ValueError(f"max_ceiling ({max_ceiling}) must be >= start ({start})")

    # Candidates and their baseline weights
    numbers = np.arange(start, max_number + 1)
    weights = np.ones(len(numbers))

    # Boost the candidates that fall inside the (clamped) weight range
    weight_min, weight_max = weight_range
    if weight_max >= max_number:
        weight_max = max_number
        warnings.warn("max range set equal to book's chapters number ", category=UserWarning)
    mask = (numbers > weight_min) & (numbers <= weight_max)
    weights[mask] = weight_multiplier

    # Normalize weights so they form a probability vector
    weights = weights / weights.sum()

    return int(np.random.choice(numbers, p=weights))


def add_random_spacing(range=range(1,5), weights=[.6,.2,.1,.1]):
    """Return a run of spaces whose length is a weighted random pick.

    Args:
        range: Candidate space counts (note: the parameter name shadows the
            builtin inside this function only).
        weights: Relative weight of each candidate, same length as ``range``.

    Returns:
        str: ``n`` space characters, where ``n`` is the selected candidate.
    """
    (n_spaces,) = random.choices(range, weights=weights, k=1)
    return " " * n_spaces

In [5]:
def generate_chapter_layout():
    """Pick the chapter-line layout that a whole book will share.

    Returns:
        dict: ``"number_layout"`` is a format string with one ``{}`` slot for
        the chapter number; ``"nextline_page_number"`` is 1 when the page
        number goes on its own line (15% weight) and 0 otherwise.
    """
    # layout template -> sampling weight (insertion order is the sampling order)
    weighted_layouts = {
        "Chapter {}: ": 0.1,   # "Chapter 1: "
        "{}. ": 0.4,           # "1. "
        "{} ": 0.4,            # "1 "
        "chapter {} ": 0.05,   # "chapter 1 "
        "Chapter {} ": 0.05,   # "Chapter 1 "
    }
    (number_layout,) = random.choices(
        list(weighted_layouts),
        weights=list(weighted_layouts.values()),
        k=1,
    )

    (nextline_page_number,) = random.choices([1, 0], weights=[0.15, 0.85], k=1)

    layout = {}
    layout["number_layout"] = number_layout
    layout["nextline_page_number"] = nextline_page_number
    return layout


def format_chapter(layout, title, number, page_start):
    """Render one chapter line from the book-wide layout.

    Args:
        layout: Dict with "number_layout" (format string with one slot) and
            "nextline_page_number" (truthy: page number on its own line).
        title: Chapter title.
        number: Chapter number inserted into the number layout.
        page_start: First page of the chapter.

    Returns:
        str: "<number prefix><title><gap><page>", where the gap is either a
        newline plus 0-5 spaces or 1-6 spaces on the same line.
    """
    gap_weights = [.5,.2,.1,.1,.05,.05]

    if layout["nextline_page_number"]:
        # page number wraps to the next line, optionally indented
        gap = add_random_spacing(range=range(0,6), weights=gap_weights)
        page_part = f"\n{gap}{page_start}"
    else:
        # page number stays on the title line, separated by at least one space
        gap = add_random_spacing(range=range(1,7), weights=gap_weights)
        page_part = f"{gap}{page_start}"

    prefix = layout['number_layout'].format(number)
    return f"{prefix}{title}{page_part}"


def generate_systemic_noise_layout():
    """Choose the recurring end-of-chapter section names for a book.

    Returns:
        dict: ``"text_section"`` holds 1-3 distinct section names (numpy array,
        sampled without replacement by weight); ``"add_numbers"`` is a list of
        0/1 flags of the same length saying whether a number is attached to
        each section (1 has weight 0.3).
    """
    # section name -> sampling probability
    section_probs = {
        "Exercises": 0.3,
        "References": 0.2,
        "Bibliography": 0.2,
        "Notes": 0.1,
        "Further Reading": 0.1,
        "Contents": 0.05,
        "Tables": .05,
    }

    # how many sections this book repeats after each chapter
    (number_sections,) = random.choices([1,2,3], weights=[.5,.35,.15], k=1)

    text_section = np.random.choice(
        list(section_probs),
        size=number_sections,
        replace=False,
        p=list(section_probs.values()),
    )

    # per-section flag: attach a number to the section line or not
    add_numbers = random.choices([0,1], weights=[.7,.3], k=number_sections)

    return dict(text_section=text_section, add_numbers=add_numbers)
    
    
def format_noise(noise_layout, start_page, end_page, toc_noise):
    """Render the systemic noise lines that follow a chapter.

    One line is written per section in ``noise_layout['text_section']``,
    optionally followed by a page-like number. With weight 0.3, one extra line
    of text sampled from ``toc_noise`` is inserted at a random position:
    before any section line, or after the last one.

    Args:
        noise_layout: Dict with "text_section" (section names) and
            "add_numbers" (0/1 flag per section).
        start_page: Lower bound (inclusive) for the sampled numbers.
        end_page: Upper bound (exclusive) for the sampled numbers; the first
            number of a run is drawn from the range shortened by two.
        toc_noise: Pool of noise entries for the optional extra line.

    Returns:
        str: The noise lines, each terminated by a newline.
    """
    sections = noise_layout['text_section']

    add_random_noise = random.choices([0,1], weights=[.7,.3], k=1)[0]
    random_noise = None
    random_noise_position = None  # stays None when no extra line is added
    if add_random_noise:
        random_noise = str(np.random.choice(toc_noise, 1, replace=False)[0])
        random_noise_position = random.randint(0, len(sections))

    noise_output = ""
    current_number = ""

    for idx, section in enumerate(sections):
        # extra noise line goes in front of the section at the chosen position
        if idx == random_noise_position:
            noise_output += add_random_spacing() + random_noise + "\n"

        # get numbers to add
        if noise_layout['add_numbers'][idx]:
            # a run's first number leaves two pages of headroom before end_page
            upper_bound = end_page - 2 if not current_number else end_page
            try:
                current_number = random.sample(range(start_page, upper_bound), 1)[0]
            except ValueError:
                # empty page range: emit the section without a number
                current_number = ""
        else:
            current_number = ""

        # define noise output
        noise_output += section + add_random_spacing() + str(current_number) + "\n"

    # Position == len(sections) means "after the last section"; the loop never
    # reaches that index, so emit the extra line here instead of dropping it.
    if random_noise_position == len(sections):
        noise_output += add_random_spacing() + random_noise + "\n"

    return noise_output
    

def format_subchapter(text, sub_number, sub_page, use_numbers, use_pages):
    """Render one sub-chapter line, terminated by a newline.

    Args:
        text: Sub-chapter title.
        sub_number: Sub-chapter number, written before the title when
            ``use_numbers`` is truthy.
        sub_page: Page number, written after the title when ``use_pages`` is
            truthy.
        use_numbers: Whether to include ``sub_number``.
        use_pages: Whether to include ``sub_page``.

    Returns:
        str: The formatted line. Lines without a page, and numbered lines with
        a page, start with 0-2 random spaces; a page is separated from the
        title by 1-3 random spaces.
    """
    if not use_numbers and not use_pages:
        return f"{add_random_spacing(range=range(0,3), weights=[.7,.2,.1])}{text}\n"
    elif use_numbers and not use_pages:
        return f"{add_random_spacing(range=range(0,3), weights=[.7,.2,.1])}{str(sub_number)} {text}\n"
    elif not use_numbers and use_pages:
        # weights were [.6,.2,.6], which made a 3-space gap as likely as a
        # 1-space gap; every other weight list here sums to 1 and decays.
        return f"{text}{add_random_spacing(range=range(1,4), weights=[.6,.2,.2])}{str(sub_page)}\n"
    else:
        return f"{add_random_spacing(range=range(0,3), weights=[.85,.1,.05])}{str(sub_number)} {text}{add_random_spacing(range=range(1,4), weights=[.7,.2,.1])}{str(sub_page)}\n"


In [57]:
# Scratch arithmetic check: evaluates 0.2 - 1.
.2-1

-0.8

In [None]:
# Noise / sub-chapter probabilities for experiments.
# The original cell left every right-hand side blank (a SyntaxError that breaks
# Restart & Run All). The values below mirror the defaults of the matching
# generate_synthetic_data parameters — TODO confirm the intended values.
subchapters_random_noise = .2
chapters_random_noise = .3
chapters_systemic_noise = .2
add_subchapters = .3

0.8

In [None]:
# Configure root logging for the notebook: records are written to stdout (so
# they appear as cell output) and only INFO and above are emitted.
# force=True removes and closes handlers left by a previous run of this cell,
# so re-running it does not duplicate output. The earlier version passed
# level=DEBUG and then immediately reset the root logger to INFO; the effective
# level (INFO) is now stated once. Use logging.DEBUG here to see the
# per-chapter debug traces.
logging.basicConfig(
    level=logging.INFO,
    format='%(levelname)s: %(message)s',
    stream=sys.stdout,
    force=True
)


def generate_synthetic_data(json_data,
                            toc_noise,
                            subchapter_random_noise=.2,
                            chapter_random_noise=.3,
                            chapter_systemic_noise=.2,
                            add_subchapter=.3
                            ):
    """Build a noisy table-of-contents prompt and its JSON label for every book.

    For each book a random subset of chapters is picked, renumbered 1..k in
    the sampled order, and given consecutive page spans. Each chapter line is
    rendered with a book-wide layout and may be followed by systemic section
    noise, a random noise line, and (optionally) a sample of its sub-chapters.

    Args:
        json_data: Iterable of book dicts. Each book is read through
            "chapters_counts" and "chapters"; each chapter through "number",
            "title", "subchapter_count" and "subchapters" (dicts with "number"
            and "title").
        toc_noise: Pool of noise text entries used by the noise generators.
        subchapter_random_noise: Probability of a noise line after a
            sub-chapter line.
        chapter_random_noise: Probability of a noise line after a chapter.
        chapter_systemic_noise: Per-book probability of repeating systemic
            section noise after every chapter.
        add_subchapter: Per-book probability of listing sub-chapters.

    Returns:
        list: One ``(prompt, label)`` tuple per book. ``label`` is a fenced
        json block listing chapter number, title, start page and end page for
        each rendered chapter.
    """
    # randomness variables
    first_page_numbers = [1, 2, 3, 4, 5]
    first_page_weights = [6, 3, 3, 1, 1]  # page 1 most likely; 4 and 5 are rare
    page_range_numbers = np.arange(5, 60)
    # chapter lengths of 15-35 pages are three times as likely as the rest
    page_range_weights = np.where((page_range_numbers >= 15) & (page_range_numbers <= 35), 3, 1)

    # output variables
    output_data = []

    for book in json_data:

        logging.debug("\nNew book:")
        number_of_chapters = book["chapters_counts"]
        current_chapter = {}
        prompt = ""
        label_entries = []

        # set book layout variables
        book_layout = generate_chapter_layout()

        # set systemic noise layout if needed
        add_systemic_noise = random.choices([0, 1], weights=[1-chapter_systemic_noise, chapter_systemic_noise], k=1)[0]
        if add_systemic_noise:
            noise_layout = generate_systemic_noise_layout()

        # define chapters numbers to use
        number_of_chapters_subset = create_weighted_generator(number_of_chapters)
        chapters_ids = np.random.choice(range(1, number_of_chapters+1),
                                        size=number_of_chapters_subset,
                                        replace=False)

        # define whether to add subchapters (and how they are rendered)
        add_subchapters = random.choices([0, 1], weights=[1-add_subchapter, add_subchapter], k=1)[0]
        if add_subchapters:
            use_numbers = random.choices([0,1], weights=[.85,.15], k=1)[0]
            add_page = random.choices([0,1], weights=[.75,.25], k=1)[0]

        for idx, chapter_id in enumerate(chapters_ids):
            for chapter in book['chapters']:
                if chapter["number"] != chapter_id:
                    continue

                # get chapter information
                chapter_title = chapter["title"]
                chapter_number = idx + 1  # renumber in sampled order
                number_of_subchapter = chapter["subchapter_count"]

                # get chapters pages: the first chapter gets a random first
                # page, later chapters start right after the previous one
                if not current_chapter:
                    logging.debug("new chapter")
                    start_page = int(random.choices(first_page_numbers, weights=first_page_weights, k=1)[0])
                else:
                    logging.debug("continue chapter")
                    start_page = current_chapter["end_page"] + 1
                end_page = int(start_page + random.choices(page_range_numbers, weights=page_range_weights, k=1)[0])

                # add label
                current_chapter = {
                    "chapter_number": chapter_number,
                    "chapter_title": chapter_title,
                    "start_page": start_page,
                    "end_page": end_page
                }
                label_entries.append(format_labels(chapter_number, chapter_title, start_page, end_page))
                logging.debug(f"{current_chapter}")

                # define chapter layout
                formatted_chapter = format_chapter(
                    book_layout,
                    chapter_title,
                    chapter_number,
                    start_page
                )

                # add chapter to prompt
                prompt += f"{formatted_chapter}\n"

                # add systemic noise
                if add_systemic_noise:
                    prompt += format_noise(noise_layout, start_page, end_page, toc_noise)

                # add random noise
                add_random_noise = random.choices([0, 1], weights=[1-chapter_random_noise, chapter_random_noise], k=1)[0]
                if add_random_noise:
                    prompt += generate_random_noise('random', toc_noise) + "\n"

                # add subchapters
                if number_of_subchapter and add_subchapters:
                    # A weighted draw via create_weighted_generator used to be
                    # computed here and was immediately overwritten by this
                    # uniform draw; the dead call (which could also raise for
                    # chapters with a single sub-chapter) has been dropped.
                    number_of_subchapters_to_use = random.randint(1, number_of_subchapter)
                    actual_chapter_number = chapter["number"]
                    page_range = end_page - start_page

                    # adjust number of subchapters based on page range
                    if page_range <= number_of_subchapters_to_use:
                        number_of_subchapters_to_use = page_range - 2

                    # sample which sub-chapters to show
                    all_subchapters_ids = [f"{actual_chapter_number}.{i}" for i in range(1, number_of_subchapter + 1)]
                    subchapters_ids = np.random.choice(all_subchapters_ids,
                                                       size=number_of_subchapters_to_use,
                                                       replace=False)

                    # one distinct, ascending page per sampled sub-chapter
                    subchapters_pages = np.sort(np.random.choice(range(start_page, end_page),
                                                                 size=number_of_subchapters_to_use,
                                                                 replace=False))

                    # Integer counter: the former float counter rounded
                    # "1.9 + 0.1" to "2.0" instead of producing "1.10".
                    sub_counter = 1
                    for idx_sub, subchapter_id in enumerate(subchapters_ids):
                        for subchapter in chapter['subchapters']:
                            if subchapter["number"] != subchapter_id:
                                continue
                            prompt += format_subchapter(subchapter["title"],
                                                        f"{chapter_number}.{sub_counter}",
                                                        subchapters_pages[idx_sub],
                                                        use_numbers,
                                                        add_page)
                            sub_counter += 1
                            add_random_noise_to_subchapters = random.choices([0, 1], weights=[1-subchapter_random_noise, subchapter_random_noise], k=1)[0]
                            if add_random_noise_to_subchapters:
                                prompt += generate_random_noise('subchapters', toc_noise) + "\n"

                logging.debug(f"Chapter layout: {formatted_chapter}")

        # finish label format: drop the trailing ",\n" of the last entry.
        # Slicing only the joined entries keeps the header intact even when no
        # chapter matched.
        book_label = '```json\n[\n' + "".join(label_entries)[:-2] + '\n]\n```'
        output_data.append((prompt, book_label))
        logging.info(f"{prompt}")

    # Return only after every book has been processed (the return used to sit
    # inside the loop, so only the first book was ever generated).
    return output_data



INFO: 1. Oil Painting Mastery
 1
(936)
Portrait Painting and Flesh Tones
Common Mistakes and Troubleshooting
  Impasto and Texture Creation
2. Art History and Cultural Context
17
Non-Western Art Traditions
  Impressionist Movement and Innovation
 Modern Art Breaking Traditions
Contemporary Art Movements
Museum and Gallery Systems
  Renaissance Masters and Techniques
3. Sculpture and Three-Dimensional Art
61
Stone Carving and Tool Usage
4. Professional Art Practice
81
23
Grant Writing and Funding
  Art Fair Participation Strategies
experience improvement Service and customer enhancement
5. Watercolor and Wet Media
  112
  Plein Air Watercolor Strategies
teams protocols distributed for Communication
Masking and Resist Techniques
Abstract Expressionist Approaches
 Wet-on-Dry Precision Control
Mixed Media Integration
?

INFO: 1 Jazz Guitar and String Instruments 2
2 Improvisation Techniques and Scales  23
3 Jazz Ensemble and Big Band Performance 60
4 Jazz Piano Performance    78



INFO: chapter 1 Food Science and Molecular Gastronomy 1
Heat Transfer and Cooking Physics
Enzyme Applications in Cooking
 Protein Denaturation and Coagulation
,)[
Texture Modification Techniques
 Emulsification Science and Stability
chapter 2 Restaurant Operations and Management  19
Inventory Management and Purchasing
Customer Service Integration
  Menu Engineering and Cost Control
Service Flow and Timing Coordination
Food Safety and HACCP Systems
Staff Training and Kitchen Leadership
 Sustainable Restaurant Practices
Financial Management and Profitability
chapter 3 Farm-to-Table and Sustainable Cooking  43
 Preservation and Pickling Techniques
 Community Supported Agriculture
Urban Farming and Kitchen Gardens
Local Sourcing and Seasonal Menus
 Ethical Meat and Seafood Sourcing
 Organic and Biodynamic Ingredients
Composting and Food Scrap Utilization
  Zero-Waste Kitchen Practices
Packaging and Waste Reduction
Energy-Efficient Cooking Methods
chapter 4 Wine and Beverage Pairing   60
  



In [22]:
# Print the first and second element of the seventh output_data entry.
# NOTE(review): `output_data` is never assigned at notebook level in the visible
# cells (generate_synthetic_data only returns a local list of that name), so this
# cell depends on hidden kernel state — presumably an earlier
# `output_data = generate_synthetic_data(json_data, toc_noise)`; confirm and add it.
print(output_data[6][0], output_data[6][1])

Chapter 1 Fermentation Management and Control     1
Bibliography 6
 Forensic analysis and investigation methodologies
Further Reading  11
Exercises 
planning support Institutional
  Yeast Health and Viability Assessment
 Troubleshooting Fermentation Problems
Carbonation Methods and Control
 Packaging and Shelf Stability
Diacetyl Rest and Conditioning
3245
Off-Flavor Prevention and Correction
Cold Conditioning and Lagering
Chapter 2 Brewing Science and Fundamentals  30
Bibliography 30
Further Reading   46
Exercises 
|||||||||||||||
pH Control and Acid Management
:]
  Sugar Extraction and Efficiency Optimization
Yeast Biology and Fermentation Science
 Water Chemistry and Mineral Adjustment
 Hop Varieties and Utilization Techniques
Enzyme Activity and Mashing Process
Chapter 3 Traditional Beer Styles and Recipes 50
Bibliography 65
Further Reading 67
Exercises  
Belgian Specialty Ales and Fermentation
German Lager Traditions and Techniques
`{
Seasonal and Holiday Beer Specialties
American 

In [46]:
# Scratch check of the page sampling used for sub-chapters in
# generate_synthetic_data: draw `number_of_subchapters_to_use` distinct pages
# from [start_page, end_page) and sort them in ascending order.
start_page = 10
end_page = 15
number_of_subchapters_to_use = 4
np.sort(np.random.choice(range(start_page, end_page), size=number_of_subchapters_to_use, replace=False))

array([10, 11, 13, 14])

In [None]:
# Scratch check: draws a single 0/1 flag with weights 0.65 (for 0) and 0.35 (for 1).
random.choices([0,1], weights=[.65,.35], k=1)[0]


0

In [None]:
'```json\n[\n  {"chapter_number": "1", "chapter_title": "Financial Machine Learning as a Distinct Subject", "start_page": 3, "end_page": 19},\n  {"chapter_number": "3", "chapter_title": "Labeling", "start_page": 43, "end_page": 55},\n  {"chapter_number": "4", "chapter_title": "Sample Weights", "start_page": 59, "end_page": 72},\n  {"chapter_number": "5", "chapter_title": "Fractionally Differentiated Features", "start_page": 75, "end_page": 88},\n  {"chapter_number": "6", "chapter_title": "Ensemble Methods", "start_page": 93, "end_page": 101},\n  {"chapter_number": "7", "chapter_title": "Cross-Validation in Finance", "start_page": 103, "end_page": 110},\n  {"chapter_number": "8", "chapter_title": "Feature Importance", "start_page": 113, "end_page": 127},\n  {"chapter_number": "9", "chapter_title": "Hyper-Parameter Tuning with Cross-Validation", "start_page": 129, "end_page": 135},\n  {"chapter_number": "10", "chapter_title": "Bet Sizing", "start_page": 141, "end_page": 148},\n  {"chapter_number": "11", "chapter_title": "The Dangers of Backtesting", "start_page": 151, "end_page": 158},\n  {"chapter_number": "12", "chapter_title": "Backtesting through Cross-Validation", "start_page": 161, "end_page": 167},\n  {"chapter_number": "13", "chapter_title": "Backtesting on Synthetic Data", "start_page": 169, "end_page": 192},\n  {"chapter_number": "14", "chapter_title": "Backtest Statistics", "start_page": 195, "end_page": 208},\n  {"chapter_number": "15", "chapter_title": "Understanding Strategy Risk", "start_page": 211, "end_page": 219},\n  {"chapter_number": "16", "chapter_title": "Machine Learning Asset Allocation", "start_page": 221, "end_page": 244},\n  {"chapter_number": "18", "chapter_title": "Entropy Features", "start_page": 263, "end_page": 277},\n  {"chapter_number": "19", "chapter_title": "Microstructural Features", "start_page": 281, "end_page": 296},\n  {"chapter_number": "20", "chapter_title": "Multiprocessing and Vectorization", "start_page": 303, "end_page": 
317},\n  {"chapter_number": "21", "chapter_title": "Brute Force and Quantum Computers", "start_page": 319, "end_page": 327},\n  {"chapter_number": "22", "chapter_title": "High-Performance Computational Intelligence and Forecasting Technologies", "start_page": 329, "end_page": null}\n]\n```'


In [11]:
def format_labels(chapter_number, chapter_title, start_page, end_page):
    """Format one chapter as a JSON object line for the book label.

    Args:
        chapter_number: Chapter number; emitted as a JSON string.
        chapter_title: Chapter title; JSON-escaped so quotes, backslashes and
            control characters in a title cannot break the label.
        start_page: First page, emitted verbatim (``None`` becomes ``null``).
        end_page: Last page, emitted verbatim (``None`` becomes ``null``).

    Returns:
        str: ``{"chapter_number": "...", "chapter_title": "...",
        "start_page": N, "end_page": M},`` followed by a newline. The trailing
        ",\\n" lets callers concatenate entries and strip it from the last one.
    """
    # ensure_ascii=False keeps non-ASCII titles readable instead of \\uXXXX.
    title_json = json.dumps(str(chapter_title), ensure_ascii=False)
    start_json = "null" if start_page is None else start_page
    end_json = "null" if end_page is None else end_page
    return (
        f'{{"chapter_number": "{chapter_number}", '
        f'"chapter_title": {title_json}, '
        f'"start_page": {start_json}, '
        f'"end_page": {end_json}}},\n'
    )

In [13]:
# Demo: build one label entry with format_labels for manual inspection.
lab = format_labels(2, "the rise of rome", 3, 45)

In [15]:
# Drop the last two characters of the entry (the trailing "," and newline
# that format_labels appends).
lab[:-2]

'{"chapter_number": "2", "chapter_title": "the rise of rome", "start_page": 3, "end_page": 45}'

In [47]:
# Inspection: for every book, print chapters 3, 2 and 8 (in that order, when
# present) followed by each of their sub-chapter entries.
# The loop variable was previously named `id`, which overwrote the builtin
# id() in the kernel namespace for all later cells.
ids = [3, 2, 8]
for book in json_data:
    for chapter_id in ids:
        for chapter in book['chapters']:
            if chapter["number"] == chapter_id:
                number_of_subchapter = chapter["subchapter_count"]
                print(f"Book: {book['name']}, Chapter: {chapter['title']}, Number: {chapter['number']}")
                for subchapters in chapter['subchapters']:
                    print(subchapters)


Book: Fine Art Techniques and Materials, Chapter: Drawing and Sketching Fundamentals, Number: 3
{'number': '3.1', 'title': 'Pencil Grades and Mark Making'}
{'number': '3.2', 'title': 'Perspective and Spatial Relationships'}
{'number': '3.3', 'title': 'Figure Drawing and Human Anatomy'}
{'number': '3.4', 'title': 'Value Studies and Shading Techniques'}
{'number': '3.5', 'title': 'Charcoal and Graphite Blending'}
{'number': '3.6', 'title': 'Ink and Pen Drawing Methods'}
{'number': '3.7', 'title': 'Gesture Drawing and Quick Studies'}
{'number': '3.8', 'title': 'Observational Drawing Skills'}
{'number': '3.9', 'title': 'Digital Drawing Tablet Techniques'}
{'number': '3.10', 'title': 'Portfolio Development Strategies'}
Book: Fine Art Techniques and Materials, Chapter: Watercolor and Wet Media, Number: 2
{'number': '2.1', 'title': 'Paper Selection and Stretching'}
{'number': '2.2', 'title': 'Wet-on-Wet Blending Techniques'}
{'number': '2.3', 'title': 'Wet-on-Dry Precision Control'}
{'number'