List all non-ASCII (Unicode) characters

In [1]:
import json
import re

# Chunk file to inspect
json_path = 'data/Legends/Pathfinder Legends_1_50.json'

# Read the file as UTF-8 text and decode the JSON payload
with open(json_path, mode='r', encoding='utf-8') as f:
    data = json.loads(f.read())

def extract_unicode(text):
    """Return every non-ASCII character of ``text`` as a list.

    A character is included when its code point is above U+007F. Characters
    are returned in order of appearance, one per list element, and repeated
    characters are repeated in the result.
    """
    return [char for char in text if ord(char) > 0x7F]

# Union of the non-ASCII characters found in each entry's 'text' field
# (entries without a 'text' key contribute nothing)
all_unicode_chars = set().union(
    *(extract_unicode(entry.get('text', '')) for entry in data)
)

# Report each distinct character as "U+XXXX : repr", ordered by code point
print("Unique unicode characters found:")
report_lines = [
    f"U+{ord(char):04X} : {repr(char)}" for char in sorted(all_unicode_chars)
]
for line in report_lines:
    print(line)

Unique unicode characters found:
U+00E7 : 'ç'
U+00FC : 'ü'
U+014C : 'Ō'
U+2003 : '\u2003'
U+2013 : '–'
U+2014 : '—'
U+2019 : '’'
U+201C : '“'
U+201D : '”'
U+2022 : '•'
U+25C6 : '◆'
U+25C7 : '◇'


Find all unique chunk types

In [2]:
import json

# Chunk file to inspect
json_path = 'data/Legends/Pathfinder Legends_1_50.json'

# Read the file as UTF-8 text and decode the JSON payload
with open(json_path, mode='r', encoding='utf-8') as f:
    data = json.loads(f.read())

# Distinct 'chunk_type' values; entries whose value is missing or None are ignored
chunk_types = {
    entry['chunk_type']
    for entry in data
    if entry.get('chunk_type') is not None
}

# List the values in sorted order, one per line
print("Unique chunk types found:")
for ct in sorted(chunk_types):
    print(ct)

Unique chunk types found:
figure
marginalia
table
text


# Findings

- can remove all of marginalia
- need to parse and remove any mention of Jarvin
- need to convert unicode
- need to parse figure chunk type and only keep scene overview
- need to stitch back into text or markdown for processing

- need to decide where to split document

## Combine JSON files and fix page numbers

In [1]:
import json
import re
import unicodedata
from pathlib import Path

# Folder holding the per-page-range chunk files, and the name of the merged
# file that is written back into that same folder.
directory_path = 'data/GM Core'
output_file = 'GM_Core_combined.json'

directory = Path(directory_path)

# Abort the cell instead of only printing: the statements that follow cannot
# do anything useful without an input directory.
if not directory.is_dir():
    raise FileNotFoundError(f"Error: Directory '{directory_path}' does not exist")

# Find all JSON files matching the pattern
json_files = list(directory.glob("Pathfinder GM Core Condensed*.json"))

# Abort here as well: with nothing to combine, carrying on would write an
# empty list to the output file and overwrite any earlier combined result.
if not json_files:
    raise FileNotFoundError(
        f"No JSON files found matching pattern 'Pathfinder GM Core Condensed*.json' in {directory_path}"
    )

# Sort files by the starting page number
def extract_start_page(file_path):
    """Return the first page number encoded in a chunk file's name.

    The name is expected to end in ``_<start>_<end>.json``; the ``<start>``
    digits are returned as an int. Names that do not follow that pattern
    yield 0.

    Args:
        file_path: Object with a ``name`` attribute (e.g. ``pathlib.Path``).
    """
    page_range = re.search(r'_(\d+)_\d+\.json$', file_path.name)
    if page_range is None:
        return 0
    return int(page_range.group(1))

json_files.sort(key=extract_start_page)

combined_data = []
# Number of pages covered by the files handled so far. It is added to each
# file's own page indices so numbering continues from one file to the next;
# the first file is therefore left unshifted.
current_page_offset = 0

# Curly quotes are the only characters that still need an explicit mapping
# after NFKD: that normalisation already turns U+2026 into '...' and U+00A0
# into a plain space. Bullets and en/em dashes are kept as they are.
ascii_quotes = str.maketrans({
    '\u2018': "'", '\u2019': "'",
    '\u201c': '"', '\u201d': '"',
})

print(f"Found {len(json_files)} JSON files in {directory_path}")

for file_path in json_files:
    print(f"Processing {file_path.name}...")

    # Read the page range from the file name *before* loading the file, so
    # that an unreadable file can still advance the offset (see below).
    match = re.search(r'_(\d+)_(\d+)\.json$', file_path.name)
    filename_page_count = None
    if match:
        start_page = int(match.group(1))
        end_page = int(match.group(2))
        filename_page_count = end_page - start_page + 1

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except Exception as e:
        print(f"Error reading {file_path.name}: {e}")
        # Keep the numbering of the remaining files correct: the skipped
        # file's pages still occupy their place in the page sequence.
        if filename_page_count is not None:
            current_page_offset += filename_page_count
        continue

    if filename_page_count is not None:
        original_page_count = filename_page_count
    else:
        # Fallback: count unique pages in the data
        pages_in_file = {
            ground['page']
            for item in data
            if 'grounding' in item
            for ground in item['grounding']
            if 'page' in ground
        }
        original_page_count = len(pages_in_file) if pages_in_file else 1

    for item in data:
        # Normalise the text and straighten curly quotes.
        # NOTE(review): NFKD also splits accented letters into base letter +
        # combining mark (e.g. 'ç' -> 'c' + U+0327) - confirm that is wanted.
        if 'text' in item and isinstance(item['text'], str):
            item['text'] = unicodedata.normalize('NFKD', item['text']).translate(ascii_quotes)

        # Shift this file's page indices by the pages that precede it
        if 'grounding' in item:
            for grounding in item['grounding']:
                if 'page' in grounding:
                    grounding['page'] = grounding['page'] + current_page_offset

    # Add to combined data
    combined_data.extend(data)

    # Update offset for next file
    current_page_offset += original_page_count
    print(f"  Added {len(data)} items, pages now offset by {current_page_offset}")

# The merged file goes into the same folder as the source chunk files
output_path = directory / output_file
try:
    with output_path.open('w', encoding='utf-8') as f:
        json.dump(combined_data, f, indent=2, ensure_ascii=False)

    # Summary of what was merged
    summary = (
        f"\nSuccessfully combined {len(json_files)} files into {output_path}",
        f"Total items: {len(combined_data)}",
        f"Total pages: {current_page_offset}",
    )
    for line in summary:
        print(line)

except Exception as e:
    # Any failure in this step is reported rather than raised
    print(f"Error writing output file: {e}")

Found 7 JSON files in data/GM Core
Processing Pathfinder GM Core Condensed_1_50.json...
  Added 626 items, pages now offset by 50
Processing Pathfinder GM Core Condensed_51_100.json...
  Added 710 items, pages now offset by 100
Processing Pathfinder GM Core Condensed_101_150.json...
  Added 799 items, pages now offset by 150
Processing Pathfinder GM Core Condensed_151_200.json...
  Added 750 items, pages now offset by 200
Processing Pathfinder GM Core Condensed_201_250.json...
  Added 768 items, pages now offset by 250
Processing Pathfinder GM Core Condensed_251_300.json...
  Added 858 items, pages now offset by 300
Processing Pathfinder GM Core Condensed_301_333.json...
  Added 431 items, pages now offset by 333

Successfully combined 7 files into data/GM Core/GM_Core_combined.json
Total items: 4942
Total pages: 333


Clean Unicode in a single JSON file (page numbers unchanged)

In [2]:
import json
import unicodedata
from pathlib import Path

# Configuration - modify these as needed.
# The path is relative, like the paths in the other cells, so the notebook
# does not depend on one user's home directory.
# NOTE(review): assumes the notebook is run from the project root - confirm.
input_file = 'data/GM Core/GM_Core_combined.json'  # Change to your specific file
output_file = 'GM_Core_processed.json'

input_path = Path(input_file)

# Raise to stop the cell. exit() is intended for the interactive interpreter
# and is not a reliable way to abort a notebook cell.
if not input_path.exists():
    raise FileNotFoundError(f"Error: File '{input_file}' does not exist")

print(f"Processing {input_path.name}...")

try:
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
except Exception as e:
    print(f"Error reading {input_path.name}: {e}")
    raise

# Curly quotes are the only characters that still need an explicit mapping
# after NFKD: that normalisation already turns U+2026 into '...' and U+00A0
# into a plain space. Bullets and en/em dashes are kept as they are.
ascii_quotes = str.maketrans({
    '\u2018': "'", '\u2019': "'",
    '\u201c': '"', '\u201d': '"',
})

# Clean unicode in the text fields; page numbers in grounding are kept as-is
# (no offset applied).
# NOTE(review): NFKD also splits accented letters into base letter +
# combining mark (e.g. 'ç' -> 'c' + U+0327) - confirm that is wanted.
for item in data:
    if 'text' in item and isinstance(item['text'], str):
        item['text'] = unicodedata.normalize('NFKD', item['text']).translate(ascii_quotes)

# Write processed file to the same directory as input
output_path = input_path.parent / output_file
try:
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
except Exception as e:
    print(f"Error writing output file: {e}")
else:
    # Reporting lives outside the try block so that a problem while
    # summarising is not mislabelled as a write error.
    print(f"\nSuccessfully processed {input_path.name} into {output_path}")
    print(f"Total items: {len(data)}")

    # Count unique pages for reference
    pages_in_file = {
        ground['page']
        for item in data
        if 'grounding' in item
        for ground in item['grounding']
        if 'page' in ground
    }

    if pages_in_file:
        print(f"Page range: {min(pages_in_file)} - {max(pages_in_file)}")
        print(f"Total unique pages: {len(pages_in_file)}")

Processing GM_Core_combined.json...

Successfully processed GM_Core_combined.json into /Users/animaznman/Library/CloudStorage/OneDrive-Personal/Documents/Coding/USF Masters/Generative AI/PF_GM_aid/data/GM Core/GM_Core_processed.json
Total items: 4942
Page range: 0 - 332
Total unique pages: 333


## Cleaning pdf

In [None]:
import json
import re
from pathlib import Path

input_file = 'data/GM Core/GM_Core_combined.json'
output_file = 'data/GM Core/GM_Core_processed.json'

# Define all the text patterns to check for removal.
# They are matched against lower-cased text, so keep them lower-case.
removal_patterns = [
    r'jarvin b',
    r'paizo',
    r'wizards of the coast'
]


def clean_chunks(items, patterns):
    """Filter extracted chunks down to the ones worth keeping.

    Items are read in order. Reading stops at the first item whose text
    mentions "open game license" (case-insensitive); that item and everything
    after it are discarded. Before that point an item is dropped when:

    * its ``chunk_type`` is ``'marginalia'`` or ``'figure'``,
    * its ``text`` is missing, not a string, or blank, or
    * its lower-cased text matches any regex in ``patterns``.

    Args:
        items: Iterable of chunk dicts.
        patterns: Regex patterns tried in order against the lower-cased text.

    Returns:
        Tuple ``(kept_items, ogl_found)``; ``ogl_found`` is True when the
        "open game license" marker was reached.
    """
    kept = []
    for item in items:
        raw_text = item.get('text')
        # Non-string text (e.g. None) is treated as "no text" instead of
        # raising AttributeError on .lower() / .strip().
        text_lower = raw_text.lower() if isinstance(raw_text, str) else None

        # Check if this item contains "open game license" - if so, stop processing
        if text_lower is not None and 'open game license' in text_lower:
            print("Found 'open game license' in item - stopping processing here")
            return kept, True

        # Skip marginalia and figure chunk types
        if item.get('chunk_type') in ['marginalia', 'figure']:
            continue

        if text_lower is None:
            continue

        # First pattern (in list order) that matches, if any
        matched = next((p for p in patterns if re.search(p, text_lower)), None)
        if matched is not None:
            print(f"Removing item containing: {matched}")
            continue

        # Only keep items with meaningful text
        if raw_text.strip():
            kept.append(item)

    return kept, False


# Load the JSON data
with open(input_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

print(f"Original data has {len(data)} items")

cleaned_data, ogl_found = clean_chunks(data, removal_patterns)

print(f"Cleaned data has {len(cleaned_data)} items")
# This count also includes everything from the licence marker onwards
print(f"Removed {len(data) - len(cleaned_data)} items")
print(f"Processed until Open Game License: {'Yes' if ogl_found else 'No (not found)'}")

# Save the cleaned data
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(cleaned_data, f, indent=2, ensure_ascii=False)

print(f"Cleaned data saved to {output_file}")

Original data has 878 items
Removing item containing: jarvin b
Removing item containing: paizo
Removing item containing: jarvin b
Found 'open game license' in item - stopping processing here
Cleaned data has 644 items
Removed 234 items
Processed until Open Game License: Yes
Cleaned data saved to data/Plague/Plague_processed_cleaned.json


## Extract table of contents...by hand

In [10]:
# Chapter title -> page on which that chapter starts, typed in by hand from
# the table of contents (the titles are those of "The Fall of Plaguestone").
# NOTE(review): another cell in this notebook assigns a different table to the
# same name `contents_dict`; later cells see whichever of the two ran last.
# NOTE(review): confirm these page numbers use the same numbering as the
# `page` values in the JSON grounding data before slicing with them.
contents_dict = {
    "The Fall of Plaguestone": 4,
    "Part 1: A Mysterious Murder": 5,
    "Part 2: The Ranger's Request": 25,
    "Part 3: Into Vilree's Lair": 37,
    "Adventure Toolbox": 51,
    "Etran's Folly Gazetteer": 52,
    "NPCs Around Town": 53,
    "Bort's Caravan and Crew": 54,
    "Character Creation and Backgrounds": 56,
    "Rules and Rewards": 57,
    "Background Side Quests": 60,
    "Deadly Flora": 63,
    "Ooze, Blood": 64
}

In [4]:
# Section title -> page on which that section starts, typed in by hand.
# Presumably the contents of the "Pathfinder Legends" book whose JSON is
# loaded in the first cells - TODO confirm.
# NOTE(review): another cell in this notebook assigns a different table to the
# same name `contents_dict`; later cells see whichever of the two ran last.
contents_dict = {
    "INTRODUCTION": 6,
    "ABROGAIL THRUNE II": 8,
    "ANDIRA MARUSEK": 12,
    "ANONG ARUNAK": 14,
    "ARDAX THE WHITE-HAIR": 16,
    "ARTOKUS KIRRAN": 18,
    "AVARNEUS": 22,
    "AZAERSI": 26,
    "BABA YAGA": 30,
    "BELIMARIUS AND SORSHEN": 34,
    "CAMILIA DRANNOCH": 38,
    "CHORAL THE CONQUEROR": 40,
    "EUTROPIA STAVIAN": 42,
    "GEB": 44,
    "HAO JIN": 46,
    "HASHIM IBN SAYYID": 50,
    "IRABETH TIRABADE": 52,
    "IRAHAI": 54,
    "JAKALYN": 56,
    "JANATIMO": 60,
    "OLD-MAGE JATEMBE": 62,
    "KALABRYNNE AND CLARETHE IOMEDAR": 66,
    "KASSI AZIRIL": 70,
    "KEVOTH-KUL": 74,
    "KHISMAR CROOKCHAR": 78,
    "LICKTOAD GOBLINS": 82,
    "MAGDELENA AND MARTUM FALLOWS": 84,
    "NANKOU": 88,
    "NEX": 90,
    "RAZMIR": 92,
    "SAPPHIRE BUTTERFLY": 94,
    "SHIMALI MANUX": 96,
    "SIHAR": 98,
    "TAARGICK": 102,
    "TAR-BAPHON": 104,
    "TELANDIA EDASSERIL": 106,
    "TESSA FAIRWIND": 108,
    "Thira Ash-Eyes": 110,
    "Toulon Vidoc": 112,
    "Ulthun II": 114,
    "White Estrid": 116,
    "Wynsal Starborn": 118,
    "Xerbystes II, Hebizid Vraj, and Deena al-Parishat": 120,
    "Entwined Destinies": 124,
    "Glossary & Index": 126
}

## Separate the full JSON document by chapter, then convert to Markdown

In [11]:
import json
import os
from pathlib import Path

# Create the output directory if it doesn't exist
output_dir = Path("data/distil/Plague")
output_dir.mkdir(parents=True, exist_ok=True)

# Load the JSON data.
# The encoding is given explicitly: the cleaning cells write their JSON as
# UTF-8 with ensure_ascii=False, while open()'s default encoding depends on
# the platform.
with open("data/Plague/Plague_processed_cleaned.json", "r", encoding="utf-8") as f:
    json_data = json.load(f)

# Function to get text entries for a specific page range
def get_text_for_pages(start_page, end_page, entries=None):
    """Join the text of all entries whose page lies in ``[start_page, end_page)``.

    An entry's page is the ``page`` value of its first ``grounding`` element.
    Entries without grounding, without a page, or without string text are
    skipped. Page 0 is a valid page.

    Args:
        start_page: First page to include.
        end_page: Page to stop before (exclusive).
        entries: Chunk dicts to search; defaults to the module-level
            ``json_data``.

    Returns:
        The matching texts, in input order, separated by blank lines.
    """
    if entries is None:
        entries = json_data

    text_entries = []
    for entry in entries:
        grounding = entry.get("grounding")
        if not grounding:
            continue
        page = grounding[0].get("page")
        # Compare against None explicitly: a truthiness test would silently
        # drop page 0.
        if page is None or not (start_page <= page < end_page):
            continue
        text = entry.get("text")
        if isinstance(text, str):
            text_entries.append(text)
    return "\n\n".join(text_entries)

# Create markdown files for each chapter
for chapter, page in contents_dict.items():
    # A chapter runs up to the nearest later chapter start. The last chapter
    # has no later start, so it is left unbounded rather than cut off at an
    # arbitrary page number.
    next_page = min(
        (start for start in contents_dict.values() if start > page),
        default=float("inf"),
    )

    # Get the text content for this chapter
    content = get_text_for_pages(page, next_page)

    # Build the file name from the chapter title. Only spaces and '&' are
    # replaced, so characters such as ':' and "'" are kept; the split cell
    # refers to the resulting names verbatim.
    # NOTE(review): ':' is not allowed in Windows file names.
    safe_filename = chapter.lower().replace(" ", "_").replace("&", "and")
    filename = f"{safe_filename}.md"

    # Write the markdown file: a title line followed by the chapter text
    with open(output_dir / filename, "w", encoding="utf-8") as f:
        f.write(f"# {chapter}\n\n")
        f.write(content)

## Split large md files

In [12]:
from pathlib import Path

def split_markdown_file(input_file, lines_per_chunk=100):
    """
    Split a markdown file into numbered part files of limited length.

    Parts are written next to the input file as ``<stem>_part_<n><suffix>``,
    with ``n`` starting at 1. Every part holds ``lines_per_chunk`` lines
    except possibly the last. Problems (missing file, invalid chunk size,
    read/write errors) are reported with print() instead of being raised.

    Args:
        input_file (str): Path to the input markdown file
        lines_per_chunk (int): Number of lines per output file (default: 100);
            must be at least 1

    Returns:
        None
    """
    input_path = Path(input_file)

    # Check if file exists
    if not input_path.exists():
        print(f"Error: File '{input_file}' does not exist")
        return

    # A chunk size below 1 cannot be used: 0 would divide by zero below and a
    # negative size would give a meaningless part count.
    if lines_per_chunk < 1:
        print(f"Error: lines_per_chunk must be at least 1 (got {lines_per_chunk})")
        return

    # Read all lines from the input file
    try:
        with open(input_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
    except Exception as e:
        print(f"Error reading file: {e}")
        return

    total_lines = len(lines)
    total_parts = (total_lines + lines_per_chunk - 1) // lines_per_chunk  # Ceiling division

    print(f"Splitting {input_path.name} ({total_lines} lines) into {total_parts} parts...")

    # Output names reuse the input's stem and extension, in the same folder
    base_name = input_path.stem
    extension = input_path.suffix
    output_dir = input_path.parent

    # Walk the file in steps of lines_per_chunk; parts are numbered from 1
    chunk_starts = range(0, total_lines, lines_per_chunk)
    for part_num, start_line in enumerate(chunk_starts, start=1):
        chunk = lines[start_line:start_line + lines_per_chunk]

        output_filename = f"{base_name}_part_{part_num}{extension}"
        output_path = output_dir / output_filename

        # A failed part is reported and the remaining parts are still written
        try:
            with open(output_path, 'w', encoding='utf-8') as f:
                f.writelines(chunk)

            print(f"Created {output_filename} with {len(chunk)} lines")

        except Exception as e:
            print(f"Error writing {output_filename}: {e}")
    

# Chapter files to break into parts, using the default chunk size
for input_file in (
    "data/distil/Plague/part_1:_a_mysterious_murder.md",
    "data/distil/Plague/part_2:_the_ranger's_request.md",
    "data/distil/Plague/part_3:_into_vilree's_lair.md",
):
    split_markdown_file(input_file)

Splitting part_1:_a_mysterious_murder.md (945 lines) into 10 parts...
Created part_1:_a_mysterious_murder_part_1.md with 100 lines
Created part_1:_a_mysterious_murder_part_2.md with 100 lines
Created part_1:_a_mysterious_murder_part_3.md with 100 lines
Created part_1:_a_mysterious_murder_part_4.md with 100 lines
Created part_1:_a_mysterious_murder_part_5.md with 100 lines
Created part_1:_a_mysterious_murder_part_6.md with 100 lines
Created part_1:_a_mysterious_murder_part_7.md with 100 lines
Created part_1:_a_mysterious_murder_part_8.md with 100 lines
Created part_1:_a_mysterious_murder_part_9.md with 100 lines
Created part_1:_a_mysterious_murder_part_10.md with 45 lines
Splitting part_2:_the_ranger's_request.md (599 lines) into 6 parts...
Created part_2:_the_ranger's_request_part_1.md with 100 lines
Created part_2:_the_ranger's_request_part_2.md with 100 lines
Created part_2:_the_ranger's_request_part_3.md with 100 lines
Created part_2:_the_ranger's_request_part_4.md with 100 lines
Cr