list all unicode characters

In [1]:
import json
import re

# Path to your JSON file
json_path = 'data/Legends/Pathfinder Legends_1_50.json'

# Load the JSON data
with open(json_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Function to extract all non-ASCII (unicode) characters from a string
def extract_unicode(text):
    return re.findall(r'[^\x00-\x7F]', text)

# Collect all unicode characters from all text fields
all_unicode_chars = set()
for entry in data:
    text = entry.get('text', '')
    unicode_chars = extract_unicode(text)
    all_unicode_chars.update(unicode_chars)

# Print the unique unicode characters found
print("Unique unicode characters found:")
for char in sorted(all_unicode_chars):
    print(f"U+{ord(char):04X} : {repr(char)}")

Unique unicode characters found:
U+00E7 : 'ç'
U+00FC : 'ü'
U+014C : 'Ō'
U+2003 : '\u2003'
U+2013 : '–'
U+2014 : '—'
U+2019 : '’'
U+201C : '“'
U+201D : '”'
U+2022 : '•'
U+25C6 : '◆'
U+25C7 : '◇'


Find all unique chunk types

In [2]:
import json

# Path to your JSON file
json_path = 'data/Legends/Pathfinder Legends_1_50.json'

# Load the JSON data
with open(json_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Collect all unique chunk types
chunk_types = set()
for entry in data:
    chunk_type = entry.get('chunk_type')
    if chunk_type is not None:
        chunk_types.add(chunk_type)

# Print the unique chunk types
print("Unique chunk types found:")
for ct in sorted(chunk_types):
    print(ct)

Unique chunk types found:
figure
marginalia
table
text


# Findings

- can remove all of marginalia
- need to parse and remove any mention of Jarvin
- need to convert unicode
- need to parse figure chunk type and only keep scene overview
- need to stitch back into text or markdown for processing

- need to decide where to split document

In [8]:
import json
import re
from unidecode import unidecode

# Path to your JSON file
json_path = 'data/Legends/Pathfinder Legends_1_50.json'

# Load the JSON data
with open(json_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

processed_entries = []

for entry in data:
    # 1. Remove all marginalia
    if entry.get('chunk_type') == 'marginalia':
        continue

    text = entry.get('text', '')

    # 2. Remove any mention of Jarvin (case-insensitive)
    text = re.sub(r'Jarvin', '', text, flags=re.IGNORECASE)

    # 3. Convert unicode to closest ASCII
    text = unidecode(text)

    # 4. For figure, only keep Scene Overview section
    if entry.get('chunk_type') == 'figure':
        # Extract "Scene Overview" section
        match = re.search(r'Scene Overview\s*:(.*?)(?:\n[A-Z][a-zA-Z ]+ ?:|\Z)', text, re.DOTALL)
        if match:
            text = match.group(1).strip()
        else:
            text = ''  # If no scene overview, skip text

    # 5. Get the first page number for ordering
    grounding = entry.get('grounding', [])
    if grounding and isinstance(grounding, list) and 'page' in grounding[0]:
        page = grounding[0]['page']
    else:
        page = float('inf')  # Put entries without page at the end

    processed_entries.append((page, text))

# 6. Sort by page number
processed_entries.sort(key=lambda x: x[0])

# 7. Write to markdown file
with open('test.md', 'w', encoding='utf-8') as f:
    for _, text in processed_entries:
        if text.strip():
            f.write(text.strip() + '\n\n')

print("Done! Output written to test.md")

Done! Output written to test.md


In [10]:
import json
import re
from unidecode import unidecode
from collections import defaultdict

# Path to your JSON file
json_path = 'data/Legends/Pathfinder Legends_1_50.json'

# Load the JSON data
with open(json_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

processed_entries = []

for entry in data:
    # 1. Remove all marginalia
    if entry.get('chunk_type') == 'marginalia':
        continue

    text = entry.get('text', '')

    # 2. Remove any mention of Jarvin (case-insensitive)
    text = re.sub(r'Jarvin', '', text, flags=re.IGNORECASE)

    # 3. Convert unicode to closest ASCII
    text = unidecode(text)

    # 4. For figure, only keep Scene Overview section and add a tag
    is_figure = entry.get('chunk_type') == 'figure'
    if is_figure:
        match = re.search(r'Scene Overview\s*:(.*?)(?:\n[A-Z][a-zA-Z ]+ ?:|\Z)', text, re.DOTALL)
        if match:
            text = match.group(1).strip()
            # Add markdown-friendly tag for figure
            text = '<!-- FIGURE -->\n**[Figure]**\n\n' + text
        else:
            text = ''

    # 5. Get the first page number for ordering
    grounding = entry.get('grounding', [])
    if grounding and isinstance(grounding, list) and 'page' in grounding[0]:
        page = grounding[0]['page']
    else:
        page = float('inf')

    processed_entries.append((page, text))

# 6. Sort by page number
processed_entries.sort(key=lambda x: x[0])

# 7. Group entries by page
page_texts = defaultdict(list)
for page, text in processed_entries:
    if text.strip() and page != float('inf'):
        page_texts[page].append(text.strip())

# 8. Write to markdown file with markdown-compatible page and figure tags
with open('test.md', 'w', encoding='utf-8') as f:
    for page in sorted(page_texts):
        f.write(f'<!-- PAGE {page} -->\n')
        f.write(f'## Page {page}\n')
        f.write('---\n\n')
        for text in page_texts[page]:
            f.write(text + '\n\n')

print("Done! Output written to test.md")

Done! Output written to test.md


In [11]:
import json
import re
import os
from unidecode import unidecode
from collections import defaultdict

# Directory containing JSON files
legends_dir = 'data/Legends'

# List to hold all entries
all_entries = []

# Read all JSON files in the directory
for filename in os.listdir(legends_dir):
    if filename.endswith('.json'):
        # Parse the page range from the filename
        match = re.search(r'Pathfinder Legends_(\d+)_(\d+)\.json', filename)
        if match:
            start_page = int(match.group(1))
            json_path = os.path.join(legends_dir, filename)
            with open(json_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
                for entry in data:
                    # Adjust page numbers to start at 0
                    grounding = entry.get('grounding', [])
                    if grounding and isinstance(grounding, list) and 'page' in grounding[0]:
                        entry['grounding'][0]['page'] -= start_page
                    all_entries.append(entry)

# Write combined JSON to test.json
with open('test.json', 'w', encoding='utf-8') as f:
    json.dump(all_entries, f, indent=2)

# Process combined JSON for Markdown output
processed_entries = []

for entry in all_entries:
    # 1. Remove all marginalia
    if entry.get('chunk_type') == 'marginalia':
        continue

    text = entry.get('text', '')

    # 2. Remove any mention of Jarvin (case-insensitive)
    text = re.sub(r'Jarvin', '', text, flags=re.IGNORECASE)

    # 3. Convert unicode to closest ASCII
    text = unidecode(text)

    # 4. For figure, only keep Scene Overview section and add a tag
    is_figure = entry.get('chunk_type') == 'figure'
    if is_figure:
        match = re.search(r'Scene Overview\s*:(.*?)(?:\n[A-Z][a-zA-Z ]+ ?:|\Z)', text, re.DOTALL)
        if match:
            text = match.group(1).strip()
            text = '<!-- FIGURE -->\n**[Figure]**\n\n' + text
        else:
            text = ''

    # 5. Get the first page number for ordering
    grounding = entry.get('grounding', [])
    if grounding and isinstance(grounding, list) and 'page' in grounding[0]:
        page = grounding[0]['page']
    else:
        page = float('inf')

    processed_entries.append((page, text))

# 6. Sort by page number
processed_entries.sort(key=lambda x: x[0])

# 7. Group entries by page
page_texts = defaultdict(list)
for page, text in processed_entries:
    if text.strip() and page != float('inf'):
        page_texts[page].append(text.strip())

# 8. Write to markdown file with markdown-compatible page and figure tags
with open('test.md', 'w', encoding='utf-8') as f:
    for page in sorted(page_texts):
        f.write(f'<!-- PAGE {page} -->\n')
        f.write(f'## Page {page}\n')
        f.write('---\n\n')
        for text in page_texts[page]:
            f.write(text + '\n\n')

print("Done! Combined JSON written to test.json and Markdown written to test.md")

Done! Combined JSON written to test.json and Markdown written to test.md


In [14]:
import json
import re
import unicodedata
from pathlib import Path

directory_path = 'data/Legends'
output_file = 'Lengends_combined.json'

directory = Path(directory_path)

if not directory.exists():
   print(f"Error: Directory '{directory_path}' does not exist")

# Find all JSON files matching the pattern
json_files = list(directory.glob("Pathfinder Legends_*.json"))

if not json_files:
   print(f"No JSON files found matching pattern 'Pathfinder Legends_*.json' in {directory_path}")

# Sort files by the starting page number
def extract_start_page(file_path):
   match = re.search(r'_(\d+)_\d+\.json$', file_path.name)
   return int(match.group(1)) if match else 0

json_files.sort(key=extract_start_page)

combined_data = []
current_page_offset = 1  # Start at 1 instead of 0

print(f"Found {len(json_files)} JSON files in {directory_path}")

for file_path in json_files:
   print(f"Processing {file_path.name}...")
   
   try:
       with open(file_path, 'r', encoding='utf-8') as f:
           data = json.load(f)
   except Exception as e:
       print(f"Error reading {file_path.name}: {e}")
       continue
   
   # Extract page range from filename
   match = re.search(r'_(\d+)_(\d+)\.json$', file_path.name)
   if match:
       start_page = int(match.group(1))
       end_page = int(match.group(2))
       original_page_count = end_page - start_page + 1
   else:
       # Fallback: count unique pages in the data
       pages_in_file = set()
       for item in data:
           if 'grounding' in item:
               for ground in item['grounding']:
                   if 'page' in ground:
                       pages_in_file.add(ground['page'])
       original_page_count = len(pages_in_file) if pages_in_file else 1
   
   # Update page numbers and clean unicode in grounding data
   for item in data:
       # Clean unicode in text field
       if 'text' in item and isinstance(item['text'], str):
           # Normalize unicode and replace problematic characters
           item['text'] = unicodedata.normalize('NFKD', item['text'])
           item['text'] = item['text'].replace('\u2022', '•').replace('\u2013', '–').replace('\u2014', '—')
           item['text'] = item['text'].replace('\u2018', "'").replace('\u2019', "'")
           item['text'] = item['text'].replace('\u201c', '"').replace('\u201d', '"')
           item['text'] = item['text'].replace('\u2026', '...').replace('\u00a0', ' ')
       
       if 'grounding' in item:
           for grounding in item['grounding']:
               if 'page' in grounding:
                   grounding['page'] = grounding['page'] + current_page_offset
   
   # Add to combined data
   combined_data.extend(data)
   
   # Update offset for next file
   current_page_offset += original_page_count
   print(f"  Added {len(data)} items, pages now offset by {current_page_offset}")

# Write combined file to the same directory
output_path = directory / output_file
try:
   with open(output_path, 'w', encoding='utf-8') as f:
       json.dump(combined_data, f, indent=2, ensure_ascii=False)
   
   print(f"\nSuccessfully combined {len(json_files)} files into {output_path}")
   print(f"Total items: {len(combined_data)}")
   print(f"Total pages: {current_page_offset}")
   
except Exception as e:
   print(f"Error writing output file: {e}")

Found 3 JSON files in data/Legends
Processing Pathfinder Legends_1_50.json...
  Added 451 items, pages now offset by 51
Processing Pathfinder Legends_51_100.json...
  Added 527 items, pages now offset by 101
Processing Pathfinder Legends_101_130.json...
  Added 305 items, pages now offset by 131

Successfully combined 3 files into data/Legends/test.json
Total items: 1283
Total pages: 131
