In [45]:
import json
import re
import os
import random
import numpy as np
import logging
import sys

In [82]:

# Read the synthetic book definitions (one record per book) from disk.
data_path = "../synthetic_data/json/all_full.json"
with open(data_path, mode="r", encoding="utf-8") as source_file:
    json_data = json.loads(source_file.read())


In [83]:
# Print every loaded record on its own line for a quick visual inspection.
for book_record in json_data:
    print(book_record)

{'id': 1, 'name': 'Fine Art Techniques and Materials', 'chapters_counts': 7, 'chapters': [{'number': 1, 'title': 'Oil Painting Mastery', 'subchapter_count': 10, 'subchapters': [{'number': '1.1', 'title': 'Canvas Preparation and Priming Techniques'}, {'number': '1.2', 'title': 'Color Mixing and Palette Management'}, {'number': '1.3', 'title': 'Brushwork and Paint Application Methods'}, {'number': '1.4', 'title': 'Glazing and Scumbling Techniques'}, {'number': '1.5', 'title': 'Portrait Painting and Flesh Tones'}, {'number': '1.6', 'title': 'Landscape Composition and Atmosphere'}, {'number': '1.7', 'title': 'Still Life Arrangement and Lighting'}, {'number': '1.8', 'title': 'Impasto and Texture Creation'}, {'number': '1.9', 'title': 'Varnishing and Preservation Methods'}, {'number': '1.10', 'title': 'Common Mistakes and Troubleshooting'}]}, {'number': 2, 'title': 'Watercolor and Wet Media', 'subchapter_count': 10, 'subchapters': [{'number': '2.1', 'title': 'Paper Selection and Stretching'}

In [None]:
def generate_book_layout(rng=None):
    """Generates a consistent layout for all chapters in a book.

    Three independent weighted draws decide how every chapter line of one
    book is rendered by ``format_chapter``.

    Args:
        rng: Optional object exposing ``choices`` with the same signature as
            ``random.choices`` (for example a ``random.Random`` instance).
            When omitted, the module-level ``random`` generator is used, so
            existing callers behave exactly as before. Passing a seeded
            generator makes the layout reproducible.

    Returns:
        dict with three keys:
            "number_layout": format string with one ``{}`` placeholder for
                the chapter number, e.g. ``"Chapter {}: "``.
            "nextline_page_number": 1 if the page number goes on its own
                line, otherwise 0.
            "space_before_pagenumber": number of spaces (1-14) placed in
                front of the page number.
    """
    if rng is None:
        rng = random

    layout_types = [
        "Chapter {}: ",  # "Chapter 1: "
        "{}. ",          # "1. "
        "{} ",           # "1 "
        "chapter {} ",   # "chapter 1 "
        "Chapter {} "    # "Chapter 1 "
    ]
    layout_weights = [0.1, 0.4, 0.4, 0.05, 0.05]
    number_layout = rng.choices(layout_types, weights=layout_weights, k=1)[0]

    # 15% of books put the page number on the line below the title.
    nextline_page_number = rng.choices([1, 0], weights=[0.15, 0.85], k=1)[0]

    # Short gaps dominate: 1-4 spaces carry 75% of the mass, 5-14 share the rest.
    space_counts = list(range(1, 15))
    space_weights = [0.35, 0.2, 0.1, 0.1] + [0.025] * 10
    space_before_pagenumber = rng.choices(space_counts, weights=space_weights, k=1)[0]

    return {
        "number_layout": number_layout,
        "nextline_page_number": nextline_page_number,
        "space_before_pagenumber": space_before_pagenumber
    }

def format_chapter(layout, title, number, page_start):
    """Render one chapter line according to a book layout.

    Args:
        layout: dict providing "number_layout" (format string with one
            ``{}`` placeholder), "nextline_page_number" (truthy when the
            page number goes on its own line) and "space_before_pagenumber"
            (count of spaces in front of the page number).
        title: chapter title placed right after the formatted number.
        number: value substituted into the number layout.
        page_start: page number appended at the end of the line.

    Returns:
        The formatted chapter line as a string.
    """
    heading = layout["number_layout"].format(number)
    line_break = "\n" if layout["nextline_page_number"] else ""
    padding = " " * layout["space_before_pagenumber"]
    return f"{heading}{title}{line_break}{padding}{page_start}"


In [114]:
# set logging
logging.basicConfig(
    level=logging.DEBUG,
    format='%(levelname)s: %(message)s',
    stream=sys.stdout,
    force=True  # replace handlers already attached to the root logger (needed on cell re-runs)
)


# randomness variables
first_page_numbers = [1, 2, 3, 4, 5]
first_page_weights = [6, 3, 3, 1, 1]  # page 1 is 6x, pages 2 and 3 are 3x as likely as 4 or 5
page_range_numbers = np.arange(5, 60)  # candidate chapter lengths: 5..59 pages
# lengths between 15 and 35 pages are 3x as likely as the others
page_range_weights = np.where((page_range_numbers >= 15) & (page_range_numbers <= 35), 3, 1)
MIN_CHAPTERS_TO_USE = 5  # preferred lower bound for the number of chapters sampled

# output variables
book_name = None
output_data = []  # NOTE(review): nothing is appended to this list yet

for book in json_data:
    # Per-book state: every book starts with fresh page numbering and its own layout.
    logging.info("\nNew book:")
    book_name = book["name"]
    number_of_chapters = book["chapters_counts"]
    current_chapter = {}
    prompt = ""
    book_layout = generate_book_layout()

    # random.randint(a, b) raises ValueError when a > b, so books with fewer
    # chapters than the preferred minimum fall back to their own chapter count.
    # NOTE(review): this trusts "chapters_counts" to match len(book["chapters"]) - confirm.
    min_chapters_to_use = min(MIN_CHAPTERS_TO_USE, number_of_chapters)

    for chapter in book['chapters']:
        # get chapter information
        chapter_title = chapter["title"]
        chapter_number = chapter["number"]
        number_of_subchapter = chapter["subchapter_count"]

        if current_chapter:
            # Later chapters start on the page after the previous chapter ended.
            logging.info("continue chapter")
            start_page = current_chapter["end_page"] + 1
        else:
            # First chapter of the book: draw the starting page.
            logging.info("new chapter")
            start_page = int(random.choices(first_page_numbers, weights=first_page_weights, k=1)[0])
        # int() converts the numpy integer drawn from page_range_numbers to a plain int.
        end_end_page = int(start_page + random.choices(page_range_numbers, weights=page_range_weights, k=1)[0])

        # add label
        current_chapter = {
            "chapter_number": chapter_number,
            "chapter_title": chapter_title,
            "start_page": start_page,
            "end_page": end_end_page
        }
        logging.info("%s", current_chapter)

        # set chapter layout variables
        number_of_chapters_to_use = random.randint(min_chapters_to_use, number_of_chapters)
        chapters_ids = random.sample(range(1, number_of_chapters + 1), number_of_chapters_to_use)
        use_systemic_noise = random.choices([0,1], weights=[.8,.2], k=1)[0]

        # define chapter layout
        formatted_chapter = format_chapter(
            book_layout,
            chapter_title,
            chapter_number,
            start_page
        )
        logging.debug("Chapter layout: %s", formatted_chapter)



INFO: 
New book:
INFO: new chapter
INFO: {'chapter_number': 1, 'chapter_title': 'Oil Painting Mastery', 'start_page': 1, 'end_page': 28}
DEBUG: Chapter layout: 1 Oil Painting Mastery 1
INFO: continue chapter
INFO: {'chapter_number': 2, 'chapter_title': 'Watercolor and Wet Media', 'start_page': 29, 'end_page': 80}
DEBUG: Chapter layout: 2 Watercolor and Wet Media 29
INFO: continue chapter
INFO: {'chapter_number': 3, 'chapter_title': 'Drawing and Sketching Fundamentals', 'start_page': 81, 'end_page': 110}
DEBUG: Chapter layout: 3 Drawing and Sketching Fundamentals 81
INFO: continue chapter
INFO: {'chapter_number': 4, 'chapter_title': 'Sculpture and Three-Dimensional Art', 'start_page': 111, 'end_page': 132}
DEBUG: Chapter layout: 4 Sculpture and Three-Dimensional Art 111
INFO: continue chapter
INFO: {'chapter_number': 5, 'chapter_title': 'Printmaking and Reproduction Techniques', 'start_page': 133, 'end_page': 151}
DEBUG: Chapter layout: 5 Printmaking and Reproduction Techniques 133
INFO

In [76]:
# Scratch check: repeating a string zero times gives the empty string.
" " * 0

''

In [107]:
# Draw one random book layout and show it as the cell result
# (the bare name on the last line is what the notebook displays).
layout = generate_book_layout()
layout

{'number_layout': '{}. ',
 'nextline_page_number': 0,
 'space_before_pagenumber': 1}

In [62]:
# Spot check: print the chapters numbered 2, 3 or 8 of every book.
# The selection is defined once instead of being rebuilt for each book.
ids = [2, 3, 8]
for book in json_data:
    for chapter in book['chapters']:
        # A membership test avoids a loop variable named `id`, which would
        # rebind the builtin id() for the rest of the kernel session.
        if chapter["number"] in ids:
            print(f"Book: {book['name']}, Chapter: {chapter['title']}, Number: {chapter['number']}")



Book: Fine Art Techniques and Materials, Chapter: Watercolor and Wet Media, Number: 2
Book: Fine Art Techniques and Materials, Chapter: Drawing and Sketching Fundamentals, Number: 3
Book: Fine Art Techniques and Materials, Chapter: Digital Art and New Media, Number: 8
Book: Jazz Music Theory and Performance, Chapter: Improvisation Techniques and Scales, Number: 2
Book: Jazz Music Theory and Performance, Chapter: Jazz Piano Performance, Number: 3
Book: Jazz Music Theory and Performance, Chapter: Jazz History and Cultural Impact, Number: 8
Book: Culinary Arts and Gastronomy, Chapter: International Cuisine and Regional Specialties, Number: 2
Book: Culinary Arts and Gastronomy, Chapter: Baking and Pastry Arts, Number: 3
Book: Culinary Arts and Gastronomy, Chapter: Culinary Entrepreneurship and Innovation, Number: 8
Book: Professional Sports Training and Performance, Chapter: Sports Nutrition and Hydration, Number: 2
Book: Professional Sports Training and Performance, Chapter: Sports Psycho