In [None]:
import json
import re
import os
import random
import numpy as np
import logging
import sys
import string
import warnings
import pickle 


In [34]:
# Load the two input files used by the rest of the notebook:
#   * the book/chapter catalogue (JSON)
#   * the pickled pool of TOC noise entries (presumably strings; verify against the pickle)
data_path = "../synthetic_data/json/all_full.json"
with open(data_path, "r", encoding="utf-8") as json_file:
    json_data = json.load(json_file)

# NOTE: pickle.load can execute arbitrary code; only load this file from a trusted source.
with open("../synthetic_data/noise_toc.pkl", "rb") as noise_file:
    toc_noise = pickle.load(noise_file)

In [172]:
# Print every top-level record of the loaded JSON, one record per line.
for record in json_data:
    print(record)

{'id': 1, 'name': 'Fine Art Techniques and Materials', 'chapters_counts': 7, 'chapters': [{'number': 1, 'title': 'Oil Painting Mastery', 'subchapter_count': 10, 'subchapters': [{'number': '1.1', 'title': 'Canvas Preparation and Priming Techniques'}, {'number': '1.2', 'title': 'Color Mixing and Palette Management'}, {'number': '1.3', 'title': 'Brushwork and Paint Application Methods'}, {'number': '1.4', 'title': 'Glazing and Scumbling Techniques'}, {'number': '1.5', 'title': 'Portrait Painting and Flesh Tones'}, {'number': '1.6', 'title': 'Landscape Composition and Atmosphere'}, {'number': '1.7', 'title': 'Still Life Arrangement and Lighting'}, {'number': '1.8', 'title': 'Impasto and Texture Creation'}, {'number': '1.9', 'title': 'Varnishing and Preservation Methods'}, {'number': '1.10', 'title': 'Common Mistakes and Troubleshooting'}]}, {'number': 2, 'title': 'Watercolor and Wet Media', 'subchapter_count': 10, 'subchapters': [{'number': '2.1', 'title': 'Paper Selection and Stretching'}

In [17]:
def create_weighted_generator(max_number, start=4, weight_range=(7, 11), weight_multiplier=3):
    """
    Draw one random integer from [start, max_number] with a favoured sub-range.

    Every candidate starts with weight 1. Candidates ``n`` that satisfy
    ``weight_min < n <= weight_max`` (lower bound exclusive, upper bound
    inclusive) get ``weight_multiplier`` instead. The weights are then
    normalised and a single value is sampled with ``np.random.choice``.

    Args:
        max_number: Largest value that can be returned (inclusive).
        start: Smallest value that can be returned (inclusive).
        weight_range: ``(weight_min, weight_max)`` bounds of the favoured range.
        weight_multiplier: Relative weight given to values in the favoured range.

    Returns:
        int: The sampled value.

    Raises:
        ValueError: If ``max_number`` is smaller than ``start``.

    Warns:
        UserWarning: If ``weight_max`` had to be lowered to ``max_number``.
    """
    if max_number < start:
        raise ValueError(f"max_number ({max_number}) must be >= start ({start})")

    # Candidate values and their baseline weights (all equal).
    numbers = np.arange(start, max_number + 1)
    weights = np.ones(len(numbers))

    weight_min, weight_max = weight_range
    # Only clamp -- and only warn -- when the favoured range really exceeds the
    # candidates. When weight_max == max_number nothing changes, so stay quiet.
    if weight_max > max_number:
        weight_max = max_number
        warnings.warn(
            "max range set equal to book's chapters number ",
            category=UserWarning,
            stacklevel=2,  # attribute the warning to the caller's line
        )

    # NOTE(review): the lower bound is exclusive (weight_min itself is not boosted);
    # confirm this is intended.
    mask = (numbers > weight_min) & (numbers <= weight_max)
    weights[mask] = weight_multiplier

    # Normalise so the weights form a probability vector.
    weights = weights / weights.sum()

    return int(np.random.choice(numbers, p=weights))


def add_random_spacing(range=range(1,5), weights=[.6,.2,.1,.1]):
    """Return a run of spaces whose length is drawn from ``range`` using ``weights``.

    Args:
        range: Candidate space counts (note: the parameter shadows the builtin).
        weights: Relative weight of each candidate, in the same order.

    Returns:
        str: A string made only of spaces.
    """
    (space_count,) = random.choices(range, weights=weights, k=1)
    return " " * space_count

In [None]:
def generate_chapter_layout():
    """Draw the layout settings shared by all chapter entries of one book.

    Returns:
        dict: ``number_layout`` is a ``str.format`` template for the chapter
        number prefix; ``nextline_page_number`` is 1 when the page number goes
        on its own line and 0 otherwise.
    """
    # Prefix template -> relative weight (insertion order is the sampling order).
    weighted_prefixes = {
        "Chapter {}: ": 0.1,   # "Chapter 1: "
        "{}. ": 0.4,           # "1. "
        "{} ": 0.4,            # "1 "
        "chapter {} ": 0.05,   # "chapter 1 "
        "Chapter {} ": 0.05,   # "Chapter 1 "
    }
    (prefix_template,) = random.choices(
        list(weighted_prefixes),
        weights=list(weighted_prefixes.values()),
        k=1,
    )

    # 15% of books put the page number on the line after the title.
    (page_on_next_line,) = random.choices([1, 0], weights=[0.15, 0.85], k=1)

    return {
        "number_layout": prefix_template,
        "nextline_page_number": page_on_next_line,
    }


def format_chapter(layout, title, number, page_start):
    """Render one chapter entry: number prefix, title, then the start page.

    Args:
        layout: Dict with ``number_layout`` (format template for the number)
            and ``nextline_page_number`` (truthy -> page number on a new line).
        title: Chapter title.
        number: Chapter number inserted into the prefix template.
        page_start: Start page appended after the title.

    Returns:
        str: The formatted entry (without a trailing newline).
    """
    spacing_weights = [.5,.2,.1,.1,.05,.05]

    if layout["nextline_page_number"]:
        # Page number on its own line, indented by 0-5 spaces.
        gap = "\n" + add_random_spacing(range=range(0,6), weights=spacing_weights)
    else:
        # Page number on the same line, separated by 1-6 spaces.
        gap = add_random_spacing(range=range(1,7), weights=spacing_weights)
    page_suffix = f"{gap}{page_start}"

    prefix = layout['number_layout'].format(number)
    return f"{prefix}{title}{page_suffix}"


def generate_systemic_noise_layout():
    """Pick which recurring "noise" section labels a book repeats.

    Between one and three distinct labels are sampled, and for each label a
    0/1 flag says whether a number should be printed next to it.

    Returns:
        dict: ``text_section`` is a NumPy array of the chosen labels;
        ``add_numbers`` is a list of 0/1 flags of the same length.
    """
    section_pool = ["Exercises", "References",  "Bibliography", "Notes", "Further Reading", "Contents", "Tables"]
    section_probs = [0.3, 0.2, 0.2, 0.1, 0.1, 0.05, .05]

    # How many labels to use: 1, 2 or 3.
    (section_count,) = random.choices([1,2,3], weights=[.5,.35,.15], k=1)

    # Sample the labels without repetition.
    chosen_sections = np.random.choice(
        section_pool,
        size=section_count,
        replace=False,
        p=section_probs,
    )

    # One flag per label: 1 means "print a number after the label".
    number_flags = random.choices([0,1], weights=[.7,.3], k=section_count)

    return {"text_section": chosen_sections, "add_numbers": number_flags}
    
    
def format_noise(noise_layout, start_page, end_page, toc_noise):
    """Render the systemic-noise lines that follow a chapter entry.

    One line is produced per label in ``noise_layout['text_section']``: the
    label, random spacing and, when the matching ``noise_layout['add_numbers']``
    flag is truthy, a random page number. With probability 0.3 one extra line
    drawn from ``toc_noise`` is inserted in a random slot: before any label
    line or after the last one.

    Args:
        noise_layout: Dict with ``text_section`` (labels) and ``add_numbers``
            (one flag per label), as built by ``generate_systemic_noise_layout``.
        start_page: Lower bound (inclusive) for the random page numbers.
        end_page: Upper bound (exclusive) for the random page numbers.
        toc_noise: Pool of entries the extra noise line is drawn from.

    Returns:
        str: The noise lines, each terminated by a newline.
    """
    sections = noise_layout['text_section']
    number_flags = noise_layout['add_numbers']

    # Decide whether to inject one extra line of TOC noise, and where.
    add_random_noise = random.choices([0,1], weights=[.7,.3], k=1)[0]
    random_noise = ""
    random_noise_position = None
    if add_random_noise:
        random_noise = str(np.random.choice(toc_noise, 1, replace=False)[0])
        # randint includes both ends: slot len(sections) means
        # "after the last label line" and is handled after the loop.
        random_noise_position = random.randint(0, len(sections))

    noise_lines = []
    previous_has_number = False

    for idx, section in enumerate(sections):
        # Extra noise line placed before this label.
        if idx == random_noise_position:
            noise_lines.append(add_random_spacing() + random_noise)

        current_number = ""
        if number_flags[idx]:
            # The first number of a run is drawn from a range that stops two
            # pages earlier; a number directly following another one may use
            # the full [start_page, end_page) range.
            upper_bound = end_page if previous_has_number else end_page - 2
            try:
                current_number = random.sample(range(start_page, upper_bound), 1)[0]
            except ValueError:
                # Empty page range: emit the label without a number.
                current_number = ""
        previous_has_number = current_number != ""

        noise_lines.append(section + add_random_spacing() + str(current_number))

    # Previously this slot was silently dropped because the loop index never
    # reaches len(sections).
    if random_noise_position == len(sections):
        noise_lines.append(add_random_spacing() + random_noise)

    return "".join(line + "\n" for line in noise_lines)
    
    

In [None]:
def generate_symbol_noise(min_len=1, max_len=3):
    """Return a random string of symbol/punctuation characters.

    The length is drawn uniformly from ``min_len`` to ``max_len`` (both
    inclusive); characters may repeat.
    """
    alphabet = "!@#$%^&*()[]{}|;:,.<>?/~`_+-="
    symbol_count = random.randint(min_len, max_len)
    picked = random.choices(alphabet, k=symbol_count)
    return ''.join(picked)


def generate_number_noise(min_digits=1, max_digits=4):
    """Return a random string of decimal digits.

    The number of digits is drawn uniformly from ``min_digits`` to
    ``max_digits`` (both inclusive); leading zeros are possible.
    """
    digit_count = random.randint(min_digits, max_digits)
    digits = random.choices(string.digits, k=digit_count)
    return ''.join(digits)


def generate_page_number_noise():
    """Return a random page-reference string such as "Page 12", "pp. 3-40" or "[7]".

    All six candidate strings are built first (each with its own random
    numbers between 1 and 999), then one of them is picked at random.
    """
    def draw_page():
        return random.randint(1, 999)

    candidates = [
        "Page " + str(draw_page()),
        "p. " + str(draw_page()),
        "pp. {}-{}".format(draw_page(), draw_page()),
        str(draw_page()),
        "[" + str(draw_page()) + "]",
        "(" + str(draw_page()) + ")",
    ]
    return random.choice(candidates)


def generate_formatting_noise():
    """Return one randomly chosen filler string (rulers, whitespace or glyph rows)."""
    # Ruler-like rows made of one repeated ASCII character.
    ruler_rows = (
        "...............",
        "_______________",
        "---------------",
        "===============",
        "***************",
        "###############",
        "|||||||||||||||",
    )
    # Whitespace-only rows.
    blank_rows = (
        "               ",  # spaces
        "\t\t\t\t",       # tabs
    )
    # Rows of a repeated non-ASCII glyph separated by single spaces.
    glyph_rows = (
        "• • • • • • • •",
        "→ → → → → → →",
        "※ ※ ※ ※ ※ ※",
    )
    return random.choice(ruler_rows + blank_rows + glyph_rows)


def generate_text_noise(toc_noise):
    """Return a random, reordered subset of the words of one ``toc_noise`` entry.

    One entry is drawn from ``toc_noise`` and split on whitespace. Between one
    and all of its words are then sampled without repetition, in random order,
    and joined with single spaces.

    Args:
        toc_noise: Non-empty pool of entries; the drawn entry is converted
            with ``str`` before splitting.

    Returns:
        str: The sampled words, or ``""`` when the drawn entry has no words.
    """
    random_toc = str(np.random.choice(toc_noise, 1, replace=False)[0])
    words = random_toc.split()
    n_words = len(words)

    # An empty or whitespace-only entry has no words; randint(1, 0) would
    # raise ValueError, so return an empty noise string instead.
    if n_words == 0:
        return ""

    n_words_to_use = random.randint(1, n_words)
    # Sampling indices without replacement both selects and shuffles the words.
    words_ids = np.random.choice(n_words, n_words_to_use, replace=False)
    return ' '.join(words[idx] for idx in words_ids)


def generate_random_noise(chunk_type="symbols", toc_noise=toc_noise):
    """
    Produce one chunk of noise text of the requested kind.

    Args:
        chunk_type: "symbols", "numbers", "page_numbers", "formatting" or
                   "text" select a generator directly. "random" picks one of
                   those five at random; "subchapters" picks at random among
                   all of them except "formatting".
        toc_noise: Pool passed to generate_text_noise when "text" is selected.
                   The default is the module-level toc_noise as it was when
                   this function was defined.

    Returns:
        str: The generated noise.

    Raises:
        KeyError: If chunk_type is none of the names above.
    """
    # Resolve the two "meta" kinds to a concrete generator name first.
    if chunk_type == "random":
        chunk_type = random.choice(
            ["symbols", "numbers", "page_numbers", "formatting", "text"]
        )

    if chunk_type == "subchapters":
        chunk_type = random.choice(
            ["symbols", "numbers", "page_numbers", "text"]
        )

    # Generator name -> (callable, positional arguments).
    dispatch = {
        "symbols": (generate_symbol_noise, ()),
        "numbers": (generate_number_noise, ()),
        "page_numbers": (generate_page_number_noise, ()),
        "formatting": (generate_formatting_noise, ()),
        "text": (generate_text_noise, (toc_noise,)),
    }
    producer, producer_args = dispatch[chunk_type]

    return str(producer(*producer_args))

In [None]:
# set logging
# Reset the root logger and route "LEVEL: message" lines to stdout.
logging.getLogger().handlers.clear()
logging.basicConfig(
    level=logging.DEBUG,  # Set to DEBUG
    format='%(levelname)s: %(message)s',
    stream=sys.stdout,
    force=True
)
# The root level is then raised to INFO, so the logging.debug calls below are
# suppressed; lower it to DEBUG here to see them.
logging.getLogger().setLevel(logging.INFO)


# randomness variables
# Start page of a book's first chapter and its sampling weights.
first_page_numbers = [1, 2, 3, 4, 5]
first_page_weights = [6, 3, 3, 1, 1]  # page 1 has weight 6, pages 2-3 weight 3, pages 4-5 weight 1
# Chapter length in pages (5..59); lengths from 15 to 35 get weight 3, the rest weight 1.
page_range_numbers = np.arange(5, 60)  
page_range_weights = np.where((page_range_numbers >= 15) & (page_range_numbers <= 35), 3, 1)

# output variables
book_name = None  # NOTE(review): assigned per book below but never read in this cell
output_data = []  # one generated TOC prompt (string) per book

for book in json_data:
    
    # set new book name for each book and reset the per-book state
    logging.debug("\nNew book:")
    book_name = book["name"]
    number_of_chapters = book["chapters_counts"]
    current_chapter = {}  # empty until the first chapter of this book is processed
    prompt = ""

    # set book layout variables (one layout shared by all chapters of the book)
    book_layout = generate_chapter_layout()

    # set systemic noise layout if needed (20% of the books)
    add_systemic_noise = random.choices([0, 1], weights=[0.8, 0.2], k=1)[0]
    if add_systemic_noise:
        noise_layout = generate_systemic_noise_layout()
    
    # define chapters numbers to use: how many chapters, then which ones (no repeats)
    # NOTE(review): create_weighted_generator raises ValueError when the book has
    # fewer chapters than its default start of 4.
    # NOTE(review): assumes chapter "number" values run from 1 to chapters_counts;
    # verify against the data.
    number_of_chapters_subset = create_weighted_generator(number_of_chapters)
    chapters_ids = np.random.choice(range(1,number_of_chapters+1),
                                    size=number_of_chapters_subset,
                                    replace=False)

    # idx is the position in the generated TOC; chapter_id is the sampled source chapter.
    for idx,chapter_id in enumerate(chapters_ids):
        # Linear scan for the chapter whose "number" matches the sampled id.
        for chapter in book['chapters']:
            if chapter["number"] == chapter_id:

                # get chapter information
                chapter_title = chapter["title"]
                chapter_number = idx + 1  # renumbered 1..n in output order, not the source number
                number_of_subchapter = chapter["subchapter_count"]

                # get chapters pages
                if not bool(current_chapter):
                    # First chapter of the book: draw a random start page.
                    logging.debug("new chapter")
                    start_page = int(random.choices(first_page_numbers, weights=first_page_weights, k=1)[0])
                    end_page = int(start_page + random.choices(page_range_numbers, weights=page_range_weights, k=1)[0])
                else:
                    # Later chapters start on the page after the previous chapter's end page.
                    logging.debug("continue chapter")
                    start_page = current_chapter["end_page"] + 1
                    end_page = int(start_page + random.choices(page_range_numbers, weights=page_range_weights, k=1)[0])

                # add label (also remembered so the next chapter can continue the pagination)
                current_chapter = {
                    "chapter_number": chapter_number,
                    "chapter_title": chapter_title,
                    "start_page": start_page,
                    "end_page": end_page
                }
                logging.debug(f"{current_chapter}")

                # define chapter layout
                formatted_chapter = format_chapter(
                    book_layout,
                    chapter_title,
                    chapter_number,
                    start_page
                )

                # add chapter to prompt
                prompt += f"{formatted_chapter}\n"
                    
                # set noise variables: append the book's systemic noise lines after the chapter
                if add_systemic_noise:
                    prompt += format_noise(noise_layout, start_page, end_page, toc_noise)

                # add random noise (30% of the chapters get one extra noise line)
                add_random_noise = random.choices([0, 1], weights=[0.7, 0.3], k=1)[0]
                if add_random_noise:
                    prompt += generate_random_noise('random', toc_noise) + "\n"
                
                # add subchapters
                # NOTE(review): unfinished -- the ids are sampled but nothing is added
                # to the prompt yet.
                if number_of_subchapter: 
                    number_of_subchapters_to_use = random.randint(1, number_of_subchapter)
                    actual_chapter_number = chapter["number"]

                    # get random subchapters
                    all_subchapters_ids = [f"{actual_chapter_number}.{i}" for i in range(1,number_of_subchapter+1)]
                    subchapters_ids = np.random.choice(all_subchapters_ids,
                                size=number_of_subchapters_to_use,
                                replace=False)
                    
                    
                    


                logging.debug(f"Chapter layout: {formatted_chapter}")
    
    # Store and log the finished prompt for this book.
    output_data.append(prompt)
    logging.info(f"{prompt}")

    book_name = None



INFO: 1. Art History and Cultural Context     3
2. Oil Painting Mastery 63
3. Printmaking and Reproduction Techniques      70
p. 941
4. Sculpture and Three-Dimensional Art 103
5. Watercolor and Wet Media   138
Page 483
6. Drawing and Sketching Fundamentals   169

INFO: 1. Jazz Guitar and String Instruments 2
2. Improvisation Techniques and Scales  40
3. Jazz Piano Performance 64
-
4. Jazz Vocal Performance and Scat Singing  97
5. Jazz Drumming and Rhythm Section 126
6. Jazz Ensemble and Big Band Performance     150
※ ※ ※ ※ ※ ※
7. Jazz Harmony and Chord Progressions  172

INFO: 1 Food Science and Molecular Gastronomy 2
Bibliography  3
Further Reading 
sensitive information for protecting Security
2 Restaurant Operations and Management  12
  Production planning and capacity utilization strategies
Bibliography 58
Further Reading   
3 Wine and Beverage Pairing 62
Bibliography 86
 Environmental considerations for sustainable manufacturing processes
Further Reading  
4 Baking and Pastry Arts



In [40]:
# Scratch check: sample 5 of the 10 subchapter ids of chapter 5, without repeats.
actual_chapter_number = 5
number_of_subchapter = 10
number_of_subchapters_subset = 5

# Ids look like "5.1", "5.2", ..., "5.10".
all_subchapters_ids = [
    "{}.{}".format(actual_chapter_number, sub_index)
    for sub_index in range(1, number_of_subchapter + 1)
]
subchapters_ids = np.random.choice(
    all_subchapters_ids,
    size=number_of_subchapters_subset,
    replace=False,
)


In [41]:
# Display the sampled subchapter ids.
subchapters_ids

array(['5.8', '5.6', '5.4', '5.2', '5.1'], dtype='<U4')

In [None]:
def generate_subchapter_layout():
    """Draw a number-prefix template and a page-number placement flag.

    The candidate templates and weights are currently the same as the ones in
    generate_chapter_layout.

    Returns:
        dict: ``number_layout`` is a ``str.format`` template for the number
        prefix; ``nextline_page_number`` is 1 when the page number goes on its
        own line and 0 otherwise.
    """
    templates = (
        "Chapter {}: ",  # "Chapter 1: "
        "{}. ",          # "1. "
        "{} ",           # "1 "
        "chapter {} ",   # "chapter 1 "
        "Chapter {} ",   # "Chapter 1 "
    )
    template_weights = (0.1, 0.4, 0.4, 0.05, 0.05)

    layout = {}
    layout["number_layout"] = random.choices(templates, weights=template_weights, k=1)[0]
    # 15% chance that the page number is placed on the following line.
    layout["nextline_page_number"] = random.choices((1, 0), weights=(0.15, 0.85), k=1)[0]
    return layout


def format_chapter(layout, title, number, page_start):
    """Build the text of one chapter entry from the book layout.

    Note: this re-defines format_chapter with the same logic as the earlier
    definition in this notebook.

    Args:
        layout: Dict providing ``number_layout`` (format template) and
            ``nextline_page_number`` (truthy -> page number on the next line).
        title: Chapter title.
        number: Value substituted into the number template.
        page_start: Page number placed after the title.

    Returns:
        str: Prefix, title and spaced page number, without a trailing newline.
    """
    pad_weights = [.5,.2,.1,.1,.05,.05]
    on_next_line = layout["nextline_page_number"]

    if on_next_line:
        # New line, then 0-5 spaces of indentation before the page number.
        padding = add_random_spacing(range=range(0,6), weights=pad_weights)
        page_part = f"\n{padding}{page_start}"
    else:
        # Same line: 1-6 spaces between the title and the page number.
        padding = add_random_spacing(range=range(1,7), weights=pad_weights)
        page_part = f"{padding}{page_start}"

    number_part = layout['number_layout'].format(number)
    return f"{number_part}{title}{page_part}"

In [33]:
# Report, for a few chapter numbers in every book, whether the chapter has subchapters.
# The list is defined once (it never changes between books), and the loop
# variable is no longer called `id`, which shadowed the builtin id() for the
# rest of the kernel session.
ids = [3,2,8]

for book in json_data:
    for wanted_number in ids:
        for chapter in book['chapters']:
            if chapter["number"] != wanted_number:
                continue
            number_of_subchapter = chapter["subchapter_count"]
            print(f"Book: {book['name']}, Chapter: {chapter['title']}, Number: {chapter['number']}")
            if number_of_subchapter:
                print(f"has subchapters: {number_of_subchapter}")
            else:
                print(f"no subchapters: {number_of_subchapter}")


Book: Artificial Intelligence and Machine Learning, Chapter: Neural Networks and Deep Learning, Number: 3
no subchapters: 0
Book: Artificial Intelligence and Machine Learning, Chapter: Machine Learning Fundamentals, Number: 2
no subchapters: 0
Book: Artificial Intelligence and Machine Learning, Chapter: Automated Decision Making, Number: 8
no subchapters: 0
Book: Renaissance Art and Architecture, Chapter: Michelangelo and the Sistine Chapel, Number: 3
no subchapters: 0
Book: Renaissance Art and Architecture, Chapter: Leonardo da Vinci: Genius of Art and Science, Number: 2
no subchapters: 0
Book: Renaissance Art and Architecture, Chapter: Perspective and Mathematical Principles, Number: 8
no subchapters: 0
Book: Jazz History and Evolution, Chapter: Jazz Age and the Roaring Twenties, Number: 3
no subchapters: 0
Book: Jazz History and Evolution, Chapter: Ragtime and Scott Joplin's Influence, Number: 2
no subchapters: 0
Book: Jazz History and Evolution, Chapter: Dizzy Gillespie and Afro-Cu