list all unicode characters

In [1]:
import json
import re

# Path to your JSON file
json_path = 'data/Legends/Pathfinder Legends_1_50.json'

# Load the JSON data
with open(json_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Function to extract all non-ASCII (unicode) characters from a string
def extract_unicode(text):
    return re.findall(r'[^\x00-\x7F]', text)

# Collect all unicode characters from all text fields
all_unicode_chars = set()
for entry in data:
    text = entry.get('text', '')
    unicode_chars = extract_unicode(text)
    all_unicode_chars.update(unicode_chars)

# Print the unique unicode characters found
print("Unique unicode characters found:")
for char in sorted(all_unicode_chars):
    print(f"U+{ord(char):04X} : {repr(char)}")

Unique unicode characters found:
U+00E7 : 'ç'
U+00FC : 'ü'
U+014C : 'Ō'
U+2003 : '\u2003'
U+2013 : '–'
U+2014 : '—'
U+2019 : '’'
U+201C : '“'
U+201D : '”'
U+2022 : '•'
U+25C6 : '◆'
U+25C7 : '◇'


Find all unique chunk types

In [2]:
import json

# Path to your JSON file
json_path = 'data/Legends/Pathfinder Legends_1_50.json'

# Load the JSON data
with open(json_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Collect all unique chunk types
chunk_types = set()
for entry in data:
    chunk_type = entry.get('chunk_type')
    if chunk_type is not None:
        chunk_types.add(chunk_type)

# Print the unique chunk types
print("Unique chunk types found:")
for ct in sorted(chunk_types):
    print(ct)

Unique chunk types found:
figure
marginalia
table
text


# Findings

- can remove all of marginalia
- need to parse and remove any mention of Jarvin
- need to convert unicode
- need to parse figure chunk type and only keep scene overview
- need to stitch back into text or markdown for processing

- need to decide where to split document

## Combined json and fix page numbers

In [15]:
import json
import re
import unicodedata
from pathlib import Path

directory_path = 'data/Legends'
output_file = 'Lengends_combined.json'

directory = Path(directory_path)

if not directory.exists():
   print(f"Error: Directory '{directory_path}' does not exist")

# Find all JSON files matching the pattern
json_files = list(directory.glob("Pathfinder Legends_*.json"))

if not json_files:
   print(f"No JSON files found matching pattern 'Pathfinder Legends_*.json' in {directory_path}")

# Sort files by the starting page number
def extract_start_page(file_path):
   match = re.search(r'_(\d+)_\d+\.json$', file_path.name)
   return int(match.group(1)) if match else 0

json_files.sort(key=extract_start_page)

combined_data = []
current_page_offset = 1  # Start at 1 instead of 0

print(f"Found {len(json_files)} JSON files in {directory_path}")

for file_path in json_files:
   print(f"Processing {file_path.name}...")
   
   try:
       with open(file_path, 'r', encoding='utf-8') as f:
           data = json.load(f)
   except Exception as e:
       print(f"Error reading {file_path.name}: {e}")
       continue
   
   # Extract page range from filename
   match = re.search(r'_(\d+)_(\d+)\.json$', file_path.name)
   if match:
       start_page = int(match.group(1))
       end_page = int(match.group(2))
       original_page_count = end_page - start_page + 1
   else:
       # Fallback: count unique pages in the data
       pages_in_file = set()
       for item in data:
           if 'grounding' in item:
               for ground in item['grounding']:
                   if 'page' in ground:
                       pages_in_file.add(ground['page'])
       original_page_count = len(pages_in_file) if pages_in_file else 1
   
   # Update page numbers and clean unicode in grounding data
   for item in data:
       # Clean unicode in text field
       if 'text' in item and isinstance(item['text'], str):
           # Normalize unicode and replace problematic characters
           item['text'] = unicodedata.normalize('NFKD', item['text'])
           item['text'] = item['text'].replace('\u2022', '•').replace('\u2013', '–').replace('\u2014', '—')
           item['text'] = item['text'].replace('\u2018', "'").replace('\u2019', "'")
           item['text'] = item['text'].replace('\u201c', '"').replace('\u201d', '"')
           item['text'] = item['text'].replace('\u2026', '...').replace('\u00a0', ' ')
       
       if 'grounding' in item:
           for grounding in item['grounding']:
               if 'page' in grounding:
                   grounding['page'] = grounding['page'] + current_page_offset
   
   # Add to combined data
   combined_data.extend(data)
   
   # Update offset for next file
   current_page_offset += original_page_count
   print(f"  Added {len(data)} items, pages now offset by {current_page_offset}")

# Write combined file to the same directory
output_path = directory / output_file
try:
   with open(output_path, 'w', encoding='utf-8') as f:
       json.dump(combined_data, f, indent=2, ensure_ascii=False)
   
   print(f"\nSuccessfully combined {len(json_files)} files into {output_path}")
   print(f"Total items: {len(combined_data)}")
   print(f"Total pages: {current_page_offset}")
   
except Exception as e:
   print(f"Error writing output file: {e}")

Found 3 JSON files in data/Legends
Processing Pathfinder Legends_1_50.json...
  Added 451 items, pages now offset by 51
Processing Pathfinder Legends_51_100.json...
  Added 527 items, pages now offset by 101
Processing Pathfinder Legends_101_130.json...
  Added 305 items, pages now offset by 131

Successfully combined 3 files into data/Legends/Lengends_combined.json
Total items: 1283
Total pages: 131


## Cleaning pdf

In [20]:
import json
import re
from pathlib import Path

input_file = 'data/Legends/Lengends_combined.json'
output_file = 'data/Legends/Lengends_combined_cleaned.json'

# Load the JSON data
with open(input_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

print(f"Original data has {len(data)} items")

cleaned_data = []
ogl_found = False

for item in data:
    # Check if this item contains "open game license" - if so, stop processing
    if 'text' in item and isinstance(item['text'], str):
        if 'open game license' in item['text'].lower():
            print(f"Found 'open game license' in item - stopping processing here")
            ogl_found = True
            break
    
    # Skip marginalia and figure chunk types
    if item.get('chunk_type') in ['marginalia', 'figure']:
        continue
    
    # Clean the text field
    if 'text' in item and isinstance(item['text'], str):
        text = item['text']
        
        # Remove "Jarvin B" followed by anything (using regex to handle variations)
        text = re.sub(r'Jarvin B.*?(?=\n|$)', '', text, flags=re.IGNORECASE)
        
        # Remove "Paizo Inc"
        text = re.sub(r'Paizo Inc[.,;]*', '', text, flags=re.IGNORECASE)
        
        # Remove "Wizards of the Coast"
        text = re.sub(r'Wizards of the Coast[.,;]*', '', text, flags=re.IGNORECASE)
        
        # Clean up extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        
        # Update the item's text
        item['text'] = text
    
    # Only keep items that still have meaningful text after cleaning
    if item.get('text', '').strip():
        cleaned_data.append(item)

print(f"Cleaned data has {len(cleaned_data)} items")
print(f"Processed until Open Game License: {'Yes' if ogl_found else 'No (not found)'}")

# Save the cleaned data
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(cleaned_data, f, indent=2, ensure_ascii=False)

print(f"Cleaned data saved to {output_file}")

Original data has 1283 items
Found 'open game license' in item - stopping processing here
Cleaned data has 1113 items
Processed until Open Game License: Yes
Cleaned data saved to data/Legends/Lengends_combined_cleaned.json


Extract table of contents...by hand