# Configuration

In [None]:
!pip install python-docx

from google.colab import drive
from collections import Counter
from docx import Document

import requests
import json
import sys
import re
import os
import collections


Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.2.0-py3-none-any.whl (252 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/253.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m245.8/253.0 kB[0m [31m12.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.2.0


## API
The output below demonstrates the constraints of using the open-source SRD ruleset:
1.  **Backgrounds:** Restricted to a single entry ("Acolyte").
2.  **Subclasses:** Only one subclass per class (12 in total).
3.  **Races:** Limited to the 9 core races (plus 4 subraces).

Any dataset entry whose race, class, or background does not match these values is rejected by the strict validator.

In [None]:
import requests
import textwrap

API_BASE = "https://www.dnd5eapi.co/api"

def fetch_all(endpoint):
    """Return the ``results`` list of an API index endpoint.

    ``endpoint`` is the path segment appended to ``API_BASE`` (for example
    ``"classes"``). Any failure (network, HTTP status, unexpected payload)
    is reported on stdout and an empty list is returned, so the calling
    cell keeps running.
    """
    try:
        reply = requests.get(f"{API_BASE}/{endpoint}", timeout=10)
        reply.raise_for_status()
        entries = reply.json()['results']
    except Exception as err:
        print(f"Error fetching {endpoint}: {err}")
        entries = []
    return entries

# Prints an overview of the reference data exposed by the SRD API
# (classes, subclasses, races/subraces, backgrounds, feats, skills,
# alignments, spell count and weapons).
from urllib.parse import urljoin

print("Fetching SRD reference data:")

# Classes
classes = fetch_all("classes")
print(f"\n1. Classes ({len(classes)} found)")
print(f"   Values: {', '.join([c['name'] for c in classes])}")

# Subclasses
subclasses = fetch_all("subclasses")
print(f"\n2. Subclasses ({len(subclasses)} found)")
# Display only the first 8 names
print(f"   Sample: {', '.join([s['name'] for s in subclasses[:8]])}...")

# Races and Subraces
races = fetch_all("races")
subraces = fetch_all("subraces")
print(f"\n3. Races ({len(races)}) & Subraces ({len(subraces)})")

# Map subraces to parents for display
hierarchy = {r['name']: [] for r in races}
for sub in subraces:
    try:
        # The 'url' field returned by the API is root-relative (it already
        # carries the "/api" prefix), so it must be resolved against the host
        # rather than appended to API_BASE; appending doubles the prefix and
        # every lookup fails.
        det_resp = requests.get(urljoin(API_BASE, sub['url']), timeout=5)
        det_resp.raise_for_status()
        det = det_resp.json()
    except (requests.RequestException, ValueError, KeyError, TypeError) as e:
        # Report the failure instead of hiding it; the overview stays best-effort.
        print(f"   (could not resolve parent race of {sub.get('name')}: {e})")
        continue
    parent = (det.get('race') or {}).get('name')
    if parent in hierarchy:
        hierarchy[parent].append(sub['name'])

for race, subs in hierarchy.items():
    if subs:
        print(f"   - {race}: {', '.join(subs)}")
    else:
        print(f"   - {race}")

# Backgrounds
backgrounds = fetch_all("backgrounds")
print(f"\n4. Backgrounds ({len(backgrounds)} found)")
for b in backgrounds:
    print(f"   - Name: {b['name']} (Index: {b['index']})")

# Feats
feats = fetch_all("feats")
print(f"\n5. Feats ({len(feats)} found)")
print(f"   Sample: {', '.join([f['name'] for f in feats[:8]])}...")

# Skills
skills = fetch_all("skills")
print(f"\n6. Skills ({len(skills)} found)")
print(f"   Values: {', '.join([s['name'] for s in skills])}")

# Alignments
alignments = fetch_all("alignments")
print(f"\n7. Alignments ({len(alignments)} found)")
print(f"   Values: {', '.join([a['name'] for a in alignments])}")

# Spells (only the total count is needed here)
try:
    spells_resp = requests.get(f"{API_BASE}/spells?count=1", timeout=10)
    spells_resp.raise_for_status()
    spell_count = spells_resp.json()['count']
    print(f"\n8. Spells (Total SRD Spells: {spell_count})")
except (requests.RequestException, ValueError, KeyError, TypeError):
    print("\n8. Spells (Error fetching count)")

# Weapon Categories
try:
    weapons_resp = requests.get(f"{API_BASE}/equipment-categories/weapon", timeout=10)
    weapons_resp.raise_for_status()
    weapons = weapons_resp.json()['equipment']
    print(f"\n9. Standard Weapons ({len(weapons)} found)")
    print(f"   Sample: {', '.join([w['name'] for w in weapons[:8]])}...")
except (requests.RequestException, ValueError, KeyError, TypeError):
    print("\n9. Standard Weapons (Error fetching)")

Fetching SRD reference data:

1. Classes (12 found)
   Values: Barbarian, Bard, Cleric, Druid, Fighter, Monk, Paladin, Ranger, Rogue, Sorcerer, Warlock, Wizard

2. Subclasses (12 found)
   Sample: Berserker, Champion, Devotion, Draconic, Evocation, Fiend, Hunter, Land...

3. Races (9) & Subraces (4)
   - Dragonborn
   - Dwarf
   - Elf
   - Gnome
   - Half-Elf
   - Half-Orc
   - Halfling
   - Human
   - Tiefling

4. Backgrounds (1 found)
   - Name: Acolyte (Index: acolyte)

5. Feats (1 found)
   Sample: Grappler...

6. Skills (18 found)
   Values: Acrobatics, Animal Handling, Arcana, Athletics, Deception, History, Insight, Intimidation, Investigation, Medicine, Nature, Perception, Performance, Persuasion, Religion, Sleight of Hand, Stealth, Survival

7. Alignments (9 found)
   Values: Chaotic Evil, Chaotic Good, Chaotic Neutral, Lawful Evil, Lawful Good, Lawful Neutral, Neutral, Neutral Evil, Neutral Good

8. Spells (Total SRD Spells: 319)

9. Standard Weapons (67 found)
   Sample: Club

## Strict API Validation

Loads the raw dataset (`dnd_chars_unique.json`) and validates each character against the strict **ruleset** fetched from the API.

**Logic:**
1.  **Fetch & Map:** It downloads all valid options (Classes, Races, Backgrounds, etc.) from `dnd5eapi.co` and creates normalized lookup maps.
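2.  **Strict Filtering:** It iterates through every character sheet. A character whose race, class, or background is not present in the API is flagged as **Invalid**. Unrecognized subclasses and alignments are set to null, and unrecognized skills and feats are dropped without rejecting the sheet.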
3.  **Split:**
    * **Positive Dataset (`dataset_positive_cleaned.json`):** Characters that are 100% compliant with the open-source API rules.
    * **Negative Dataset (`dataset_negative_raw.json`):** Characters containing non-SRD content, homebrew, or errors.

In [None]:
import os
import json
import re
import requests
from google.colab import drive

# CONFIGURATION
# Mount Google Drive (Colab) so the dataset files under MyDrive are reachable.
print("Connecting to Google Drive")
drive.mount('/content/drive', force_remount=True)

# Project folder on Drive; the input/output paths below are derived from it.
BASE_DIR = "/content/drive/MyDrive/DnD_Project_Data"
# Raw character dataset read by the validation loop.
INPUT_FILE = os.path.join(BASE_DIR, 'dnd_chars_unique.json')
# Output: cleaned entries that passed validation.
FILE_POS = os.path.join(BASE_DIR, 'dataset_positive_cleaned.json')
# Output: ids and error messages of the rejected entries.
FILE_NEG = os.path.join(BASE_DIR, 'dataset_negative_raw.json')

# Base URL of the REST API used to build the validation maps.
API_BASE = "https://www.dnd5eapi.co/api"

#  FUNCTIONS

def super_normalize(text):
    """Lower-case *text* and keep only the characters a-z and 0-9.

    Anything that is not a ``str`` yields the empty string, so the result
    can always be used as a lookup key.
    """
    if isinstance(text, str):
        lowered = text.lower()
        return re.sub(r'[^a-z0-9]', '', lowered)
    return ""

def get_scalar(data, key, default=None):
    """Return ``data[key]`` as a stripped string, unwrapping list-wrapped values.

    Args:
        data: Mapping to read from; anything that is not a ``dict`` yields ``default``.
        key: Field name to look up.
        default: Value returned when the field is missing, ``None``, or an empty list.

    Returns:
        ``str(value).strip()`` for the field value (the first element when the
        value is a list), or ``default`` when there is no usable value.
    """
    if not isinstance(data, dict):
        return default
    val = data.get(key)
    if isinstance(val, list):
        if not val:
            # An empty list carries no value; without this guard it would be
            # stringified to "[]" and treated as a real name downstream.
            return default
        val = val[0]
    if val is None:
        return default
    return str(val).strip()

def fetch_resource_map(endpoint):
    """Build a ``{normalized_name: official_name}`` lookup for an API endpoint.

    The endpoint is requested below ``API_BASE``. If the request or the
    parsing fails the error is printed and an empty dict is returned; a
    payload without a ``results`` key also yields an empty dict.
    """
    lookup = {}
    try:
        reply = requests.get(f"{API_BASE}/{endpoint}", timeout=20)
        reply.raise_for_status()
        payload = reply.json()
        if 'results' in payload:
            for entry in payload['results']:
                official = entry['name']
                # key: normalized spelling, value: name exactly as the API spells it
                lookup[super_normalize(official)] = official
    except Exception as err:
        print(f"Error fetching {endpoint}: {err}")
        return {}
    return lookup

#  BUILD VALIDATION MAPS
# One lookup map per category; map key and API endpoint share the same name.
print("Fetching SRD Rules and building validation maps...")

resources = {
    name: name
    for name in (
        'classes',
        'subclasses',
        'races',
        'backgrounds',
        'feats',
        'skills',
        'alignments',
    )
}

master_maps = {}
for key in resources:
    loaded = fetch_resource_map(resources[key])
    master_maps[key] = loaded
    print(f"   - {key.capitalize()}: {len(loaded)} valid entries loaded.")

print("Validation maps ready.")

# VALIDATION LOOP
# Splits the raw dataset into entries whose race, class and background are
# all known to the API (positive) and everything else (negative).
print(f" Processing dataset from: {INPUT_FILE}")

dataset_pos = []
dataset_neg = []

try:
    with open(INPUT_FILE, 'r', encoding='utf-8') as f:
        raw_data = json.load(f)

    for cid, data in raw_data.items():
        try:
            if not isinstance(data, dict):
                raise ValueError("Malformed record (not an object)")

            clean_entry = {"id": cid}

            # RACE (mandatory): prefer the pre-processed name, fall back to the raw one
            rc_d = data.get('race', {})
            raw_rc = get_scalar(rc_d, 'processedRace') or get_scalar(rc_d, 'race')
            norm_rc = super_normalize(raw_rc)

            if norm_rc not in master_maps['races']:
                raise ValueError(f"Invalid Race: {raw_rc}")
            clean_entry['race'] = master_maps['races'][norm_rc]

            # CLASS (mandatory): only the first listed class is validated
            c_dict = data.get('class', {})
            if not isinstance(c_dict, dict) or not c_dict:
                raise ValueError("Missing Class")
            raw_c = next(iter(c_dict))
            norm_c = super_normalize(raw_c)

            if norm_c not in master_maps['classes']:
                raise ValueError(f"Invalid Class: {raw_c}")
            clean_entry['class'] = master_maps['classes'][norm_c]

            # BACKGROUND (mandatory)
            bg = get_scalar(data, 'background')
            norm_bg = super_normalize(bg)

            if norm_bg not in master_maps['backgrounds']:
                raise ValueError(f"Invalid Background: {bg}")
            clean_entry['background'] = master_maps['backgrounds'][norm_bg]

            # SUBCLASS (optional): unknown values are stored as None, not rejected
            raw_sc = get_scalar(c_dict[raw_c], 'subclass')
            norm_sc = super_normalize(raw_sc)
            if norm_sc in master_maps['subclasses']:
                clean_entry['subclass'] = master_maps['subclasses'][norm_sc]
            else:
                clean_entry['subclass'] = None

            # ALIGNMENT (optional): None when not recognized
            al_dict = data.get('alignment', {})
            raw_al = get_scalar(al_dict, 'processedAlignment') or get_scalar(al_dict, 'alignment')
            norm_al = super_normalize(raw_al)
            clean_entry['alignment'] = master_maps['alignments'].get(norm_al)

            # SKILLS & FEATS: keep only valid items, discarding invalid ones without failing the whole sheet.
            # "or []" also covers fields that are present but null.
            clean_entry['skills'] = [
                master_maps['skills'][super_normalize(s)]
                for s in (data.get('skills') or [])
                if super_normalize(s) in master_maps['skills']
            ]

            clean_entry['feats'] = [
                master_maps['feats'][super_normalize(f)]
                for f in (data.get('feats') or [])
                if super_normalize(f) in master_maps['feats']
            ]

            # Add to positive list
            dataset_pos.append(clean_entry)

        except ValueError as e:
            # Add to negative list
            dataset_neg.append({"id": cid, "error": str(e)})
        except (TypeError, AttributeError) as e:
            # A structurally broken record must not abort the whole run
            # (which would leave no result files at all).
            dataset_neg.append({"id": cid, "error": f"Malformed record: {e}"})

    # SAVE RESULTS
    print(f"VALIDATION SUMMARY")
    print(f"STRICTLY VALID (SRD): {len(dataset_pos)}")
    print(f"REJECTED (Non-SRD):   {len(dataset_neg)}")

    with open(FILE_POS, 'w', encoding='utf-8') as f: json.dump(dataset_pos, f, indent=2)
    with open(FILE_NEG, 'w', encoding='utf-8') as f: json.dump(dataset_neg, f, indent=2)
    print(f"\nFiles saved to: {BASE_DIR}")

except FileNotFoundError:
    print(f"Error: Input file not found at {INPUT_FILE}")
except Exception as e:
    # Top-level boundary of the cell: report instead of raising.
    print(f"Unexpected Error: {e}")

Connecting to Google Drive
Mounted at /content/drive
Fetching SRD Rules and building validation maps...
   - Classes: 12 valid entries loaded.
   - Subclasses: 12 valid entries loaded.
   - Races: 9 valid entries loaded.
   - Backgrounds: 1 valid entries loaded.
   - Feats: 1 valid entries loaded.
   - Skills: 18 valid entries loaded.
   - Alignments: 9 valid entries loaded.
Validation maps ready.
 Processing dataset from: /content/drive/MyDrive/DnD_Project_Data/dnd_chars_unique.json
VALIDATION SUMMARY
STRICTLY VALID (SRD): 413
REJECTED (Non-SRD):   7533

Files saved to: /content/drive/MyDrive/DnD_Project_Data


## Step 2: Gap Analysis

This step analyzes the **Negative Dataset** (items rejected by the strict SRD validator) to identify high-frequency content.

**Purpose:**
Because the API only covers SRD content, the strict validator also rejects valid official (non-SRD) content. This analysis quantifies exactly what is missing.

**Process:**
1.  **Cross-Reference:** It matches the IDs from the negative dataset back to the raw input to retrieve the full character data.
2.  **Frequency Counting:** It counts every occurrence of a rejected Race, Class, Background, Feat, or Spell.
3.  **Reporting:** It generates Markdown reports (e.g., `Gap_Backgrounds.md`) listing the missing elements.

**Output:**
These reports guide the creation of the **Manual Whitelist** used in the next integration step.

In [None]:
import collections

# Counts, for every rejected character, which values are unknown to the API
# and writes one Markdown frequency table per category.
REPORT_DIR = os.path.join(BASE_DIR, "analysis_reports_gap")
os.makedirs(REPORT_DIR, exist_ok=True)

print(f"Loading data for analysis")

# Load Negative Dataset
with open(FILE_NEG, 'r', encoding='utf-8') as f:
    neg_data = json.load(f)

# Load Full Raw Dataset
print(f"   - Reading raw source: {INPUT_FILE}")
with open(INPUT_FILE, 'r', encoding='utf-8') as f:
    full_raw_data = json.load(f)

# master_maps is built without a 'spells' entry, which previously made the
# whole spells section below unreachable. Load the spell names on demand.
spell_map = master_maps.get('spells') or fetch_resource_map('spells')

# Initialize counters for each category
gap_counters = {
    "Races": collections.Counter(),
    "Classes": collections.Counter(),
    "Backgrounds": collections.Counter(),
    "Feats": collections.Counter(),
    "Skills": collections.Counter(),
    "Alignments": collections.Counter(),
    "Spells": collections.Counter(),
}

print("Analyzing rejection")

for entry in neg_data:
    cid = entry['id']

    # Cross-reference the rejected id with the raw record
    orig = full_raw_data.get(cid)
    if not isinstance(orig, dict):
        continue

    # RACE GAP
    rc_d = orig.get('race', {})
    # Handle mixed formats
    if isinstance(rc_d, dict):
        raw_rc = get_scalar(rc_d, 'processedRace') or get_scalar(rc_d, 'race')
    else:
        raw_rc = str(rc_d)

    if raw_rc:
        norm_rc = super_normalize(raw_rc)
        if norm_rc not in master_maps['races']:
            gap_counters["Races"][raw_rc] += 1

    # CLASS & SUBCLASS GAP (subclasses are reported in the Classes table with a prefix)
    c_d = orig.get('class', {})
    if c_d and isinstance(c_d, dict):
        raw_c = next(iter(c_d))
        norm_c = super_normalize(raw_c)

        # Check Base Class
        if norm_c not in master_maps['classes']:
            gap_counters["Classes"][raw_c] += 1

        # Check Subclass
        raw_sc = get_scalar(c_d[raw_c], 'subclass')
        if raw_sc:
            norm_sc = super_normalize(raw_sc)
            if norm_sc not in master_maps['subclasses']:
                gap_counters["Classes"][f"Subclass: {raw_sc}"] += 1

    # BACKGROUND GAP
    bg = get_scalar(orig, 'background')
    if bg:
        norm_bg = super_normalize(bg)
        if norm_bg not in master_maps['backgrounds']:
            gap_counters["Backgrounds"][bg] += 1

    # FEATS GAP ("or []" also covers fields that are present but null)
    for ft in (orig.get('feats') or []):
        if super_normalize(ft) not in master_maps['feats']:
            gap_counters["Feats"][ft] += 1

    # SKILLS GAP
    for sk in (orig.get('skills') or []):
        if super_normalize(sk) not in master_maps['skills']:
            gap_counters["Skills"][sk] += 1

    # ALIGNMENT GAP
    al_d = orig.get('alignment', {})
    raw_al = get_scalar(al_d, 'processedAlignment') or get_scalar(al_d, 'alignment')
    if raw_al and super_normalize(raw_al) not in master_maps['alignments']:
        gap_counters["Alignments"][raw_al] += 1

    # SPELLS GAP (skipped only when the spell list could not be fetched)
    if spell_map:
        spells_raw = orig.get('spells') or {}
        iterator = spells_raw.values() if isinstance(spells_raw, dict) else spells_raw
        for v in iterator:
            s_name = v if isinstance(v, str) else (get_scalar(v, 'processedSpell') or get_scalar(v, 'spell'))
            if s_name and super_normalize(s_name) not in spell_map:
                gap_counters["Spells"][s_name] += 1

# reports
print(f"\n Writing Reports to: {REPORT_DIR}")

for category, counter in gap_counters.items():
    if not counter: continue

    filepath = os.path.join(REPORT_DIR, f"Gap_{category}.md")

    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(f"# Gap Analysis: {category}\n\n")
        f.write(f"| Missing Value | Frequency |\n")
        f.write(f"| :--- | :--- |\n")

        for val, count in counter.most_common():
            # "|" and newlines would break the Markdown table row
            safe_val = str(val).replace("|", "-").replace("\n", " ")
            f.write(f"| {safe_val} | {count} |\n")

    print(f" Created: Gap_{category}.md")

Loading data for analysis
   - Reading raw source: /content/drive/MyDrive/DnD_Project_Data/dnd_chars_unique.json
Analyzing rejection

 Writing Reports to: /content/drive/MyDrive/DnD_Project_Data/analysis_reports_gap
 Created: Gap_Races.md
 Created: Gap_Classes.md
 Created: Gap_Backgrounds.md
 Created: Gap_Feats.md
 Created: Gap_Alignments.md


## Rules Enforcement

This step merges the API data with manual whitelists to create the final dataset. It enforces the **mathematical and logical rules of D&D 5th Edition** and **repairs** valuable training samples.

**Key Validations & Features:**

1.  **Hybrid Knowledge Base (API + Manual Parsing):**
    * *Source:* `dnd5eapi.co` (SRD) + `DnD_Integrated.docx` (Manual).
    * *Logic:* The system parses specific data from the manual file (e.g., reading `"Toll the Dead - 0"` to assign Level 0). This allows strict validation even for non-SRD content.

2.  **Universal Spell Slot Validation:**
    * *Rule:* Characters cannot cast spells higher than their level permits.
    * *Math:* Implements precise progression formulas for **Full Casters** ($\lceil Lvl/2 \rceil$), **Half-Casters** (Paladin/Ranger), and **Pact Magic** (Warlock caps at Lv5). Spells exceeding the max slot are pruned.

3.  **Mandatory Spellcasting & Repair:**
    * *Problem:* High-level casters (e.g., Wizard Lv10) with empty spell lists caused by parsing errors or incomplete inputs.
    * *Solution:* Instead of discarding these valid characters, the step selects 2 random valid spells (Lv0 or Lv1) from a class-specific pool. This preserves the data structure.

4.  **Subclass Logic & Unlock Levels:**
    * *Rule:* Subclasses are validated based on unlock levels (Lv1 for Clerics/Sorcerers/Warlocks, Lv2 for Druids/Wizards, Lv3 for all others).
    * *Edge Cases:* Specifically handles "Third-Casters" (Arcane Trickster/Eldritch Knight), enforcing spell requirements only from Level 3 onwards.

5.  **Stat & HP Integrity:**
    * *Stats:* Enforces valid ranges (typically 1-20, max 30 for epic items).
    * *Hit Points:* Validates HP against the class Hit Die (e.g., Wizard d6 vs Barbarian d12) to ensure mathematical consistency.

6.  **Alignment Normalization:**
    * *Logic:* Maps inconsistent inputs (e.g., "CG", "chaotic-good") to the 9 canonical alignment strings (plus "Unaligned" and "Any Alignment").

In [None]:
import re
import math
import requests
import time
import os
import json
import collections
import random
from google.colab import drive
try:
    from docx import Document
except ImportError:
    pass

# Mount Google Drive (Colab) so the project files under MyDrive are reachable.
print("Connecting to Google Drive")
drive.mount('/content/drive', force_remount=True)

# Project folder, raw input dataset and output folder of this step.
BASE_DIR = "/content/drive/MyDrive/DnD_Project_Data"
INPUT_FILE = os.path.join(BASE_DIR, 'dnd_chars_unique.json')
OUTPUT_DIR = os.path.join(BASE_DIR, "dataset_integrated")
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Output files for accepted / rejected characters, and the manually curated
# whitelist document that is parsed further down in this cell.
FILE_POS_INT = os.path.join(OUTPUT_DIR, 'dataset_integrated_positive.json')
FILE_NEG_INT = os.path.join(OUTPUT_DIR, 'dataset_integrated_negative.json')
WHITELIST_DOC_PATH = os.path.join(BASE_DIR, "analysis_reports_gap", "DnD_Integrated.docx")

# REST endpoint root and GraphQL endpoint of the same API.
API_BASE = "https://www.dnd5eapi.co/api"
GRAPHQL_URL = "https://www.dnd5eapi.co/graphql"

# 5E RULES CONSTANTS
# Three-letter ability keys every character must provide (see validate_stats).
REQUIRED_STATS = {'str', 'dex', 'con', 'int', 'wis', 'cha'}
# Largest ability score accepted by validate_stats.
MAX_STAT_VALUE = 30
# Longest race name accepted by the integration loop.
MAX_STRING_LEN = 60
# Upper bound on the number of whitelisted skills per character.
MAX_SKILLS_COUNT = 18
# NOTE(review): not referenced in the visible part of this file.
MAX_FEATS_COUNT = 10

# Level from which a subclass becomes mandatory, keyed by normalized class
# name; "default" applies to every class that is not listed.
SUBCLASS_UNLOCK_LEVELS = {
    "cleric": 1, "sorcerer": 1, "warlock": 1,
    "druid": 2, "wizard": 2,
    "default": 3
}

# Level from which a class must have at least one spell; classes that are
# absent have no spell requirement of their own.
SPELLCASTING_REQUIRED_AT = {
    "bard": 1, "cleric": 1, "druid": 1, "sorcerer": 1, "warlock": 1, "wizard": 1,
    "artificer": 1,
    "paladin": 2, "ranger": 2
}

# DEFAULT SPELL POOLS
# Low-level spells (Lv 0 & Lv 1) used to repair a caster whose spell list
# ended up empty; keyed by normalized class name.
DEFAULT_SPELL_POOLS = {
    "bard": ["Vicious Mockery", "Prestidigitation", "Minor Illusion", "Healing Word", "Detect Magic", "Thunderwave"],
    "cleric": ["Thaumaturgy", "Light", "Sacred Flame", "Guidance", "Bless", "Cure Wounds", "Healing Word", "Shield of Faith"],
    "druid": ["Druidcraft", "Produce Flame", "Shillelagh", "Thorn Whip", "Entangle", "Goodberry", "Thunderwave"],
    "sorcerer": ["Light", "Prestidigitation", "Fire Bolt", "Ray of Frost", "Shield", "Magic Missile", "Burning Hands"],
    "warlock": ["Eldritch Blast", "Prestidigitation", "Minor Illusion", "Hex", "Armor of Agathys", "Hellish Rebuke"],
    "wizard": ["Prestidigitation", "Light", "Mage Hand", "Fire Bolt", "Magic Missile", "Shield", "Mage Armor", "Sleep"],
    "artificer": ["Prestidigitation", "Mending", "Guidance", "Cure Wounds", "Detect Magic", "Grease"],

    # Half Casters (Lv 1)
    "paladin": ["Bless", "Cure Wounds", "Divine Favor", "Shield of Faith", "Command"],
    "ranger": ["Cure Wounds", "Fog Cloud", "Hunter's Mark", "Detect Magic", "Goodberry"],

    # Subclass Casters (used for the Eldritch Knight / Arcane Trickster repair)
    "fighter": ["Blade Ward", "Light", "Shield", "Magic Missile"],
    "rogue": ["Mage Hand", "Prestidigitation", "Minor Illusion", "Sleep"]
}

# Alignment Map
# Keys are alignment spellings after super_normalize(); values are the
# strings written to the output dataset.
ALIGNMENT_MAP = {
    "lg": "Lawful Good", "lawfulgood": "Lawful Good",
    "ng": "Neutral Good", "neutralgood": "Neutral Good", "good": "Neutral Good",
    "cg": "Chaotic Good", "chaoticgood": "Chaotic Good",
    "ln": "Lawful Neutral", "lawfulneutral": "Lawful Neutral",
    "n": "True Neutral", "neutral": "True Neutral", "trueneutral": "True Neutral", "tn": "True Neutral",
    "cn": "Chaotic Neutral", "chaoticneutral": "Chaotic Neutral",
    "le": "Lawful Evil", "lawfulevil": "Lawful Evil",
    "ne": "Neutral Evil", "neutralevil": "Neutral Evil", "evil": "Neutral Evil",
    "ce": "Chaotic Evil", "chaoticevil": "Chaotic Evil",
    "unaligned": "Unaligned", "any": "Any Alignment"
}

# FUNCTIONS

def super_normalize(text):
    """Return *text* lower-cased with everything except a-z and 0-9 removed.

    Non-string input yields the empty string.
    """
    if isinstance(text, str):
        lowered = text.lower()
        return re.sub(r'[^a-z0-9]', '', lowered)
    return ""

def extract_scalar(val):
    """Unwrap a list-wrapped value.

    A list yields its first element (``None`` when the list is empty);
    every other value is returned unchanged.
    """
    if not isinstance(val, list):
        return val
    if val:
        return val[0]
    return None

def calculate_real_level(class_obj):
    """Return ``(total_level, main_class_name)`` for a character's class field.

    Args:
        class_obj: Mapping of class name to class data. The data may be a dict
            with a ``'level'`` entry, a list whose first element is the level,
            or a bare level value. Anything that is not a dict is treated as
            "no classes".

    Returns:
        A tuple of the summed level over all classes (at least 1) and the name
        of the class with the highest individual level (``"Unknown"`` when no
        class has a positive level). On a tie the first class listed wins.

    Levels that cannot be read as a number count as 0 instead of raising, so
    a single malformed sheet cannot abort the caller's loop.
    """
    def _as_level(raw):
        # Values may be wrapped in a single-element list.
        if isinstance(raw, list):
            raw = raw[0] if raw else 0
        try:
            return int(raw)
        except (TypeError, ValueError):
            pass
        try:
            # Accept numeric strings such as "3.0".
            return int(float(raw))
        except (TypeError, ValueError, OverflowError):
            return 0

    total = 0
    main_class_name = "Unknown"
    max_lvl = 0

    if isinstance(class_obj, dict):
        for c_name, c_data in class_obj.items():
            if isinstance(c_data, dict):
                lvl = _as_level(c_data.get('level', 0))
            else:
                lvl = _as_level(c_data)

            total += lvl
            if lvl > max_lvl:
                max_lvl = lvl
                main_class_name = c_name

    return (total if total > 0 else 1), main_class_name

def validate_stats(attributes_dict):
    """Validate ability scores and return them keyed by three-letter name.

    Keys are matched on their first three lower-cased characters against
    ``REQUIRED_STATS``; other keys are ignored. Values may be list-wrapped.

    Args:
        attributes_dict: Mapping of ability name to score.

    Returns:
        ``{'str': int, 'dex': int, ...}`` containing all required abilities.

    Raises:
        ValueError: if the input is not a dict, a score is not an integer,
            is below 1 or above ``MAX_STAT_VALUE``, or a required ability
            is missing.
    """
    if not isinstance(attributes_dict, dict): raise ValueError("Stats Missing")
    clean = {}
    for k, v in attributes_dict.items():
        norm_k = str(k).lower()[:3]
        if norm_k not in REQUIRED_STATS:
            continue
        try:
            val_int = int(extract_scalar(v))
        except (TypeError, ValueError, OverflowError):
            raise ValueError(f"Invalid value for {k}") from None
        if val_int <= 0:  # Min 1
            raise ValueError(f"Invalid value for {k}")
        if val_int > MAX_STAT_VALUE:
            # Raised outside the conversion try-block so this specific
            # message is no longer replaced by the generic one.
            raise ValueError(f"Stat > {MAX_STAT_VALUE}")
        clean[norm_k] = val_int

    # Count distinct abilities: two aliases of the same ability (e.g. "str"
    # and "strength") must not make up for a missing one.
    if len(clean) < len(REQUIRED_STATS): raise ValueError(f"Missing one or more attributes")
    return clean

def get_max_spell_level(char_level, class_name):
    """Return the highest spell level available at ``char_level`` for a class.

    Args:
        char_level: Character level used for the progression lookup.
        class_name: Class name; matched case-insensitively by substring.

    Returns:
        * Warlock (pact magic): ``ceil(level / 2)``, capped at 5.
        * Paladin / Ranger (half-casters): 0 at level 1, then
          ``ceil(level / 4)``, capped at 5 (1st-level spells at 2, 2nd at 5,
          3rd at 9, 4th at 13, 5th at 17).
        * Artificer: ``ceil(level / 4)`` from level 1, capped at 5.
        * Every other class: ``ceil(level / 2)``, capped at 9.
    """
    cls = class_name.lower()
    if "warlock" in cls:
        if char_level < 1: return 0
        return min(5, math.ceil(char_level / 2.0))
    if any(x in cls for x in ("paladin", "ranger")):
        # Half-casters gain a new spell level every four levels, not every two.
        if char_level < 2: return 0
        return min(5, math.ceil(char_level / 4.0))
    if "artificer" in cls:
        if char_level < 1: return 0
        return min(5, math.ceil(char_level / 4.0))
    # Spell levels stop at 9; without the cap levels 19-20 would yield 10.
    return max(0, min(9, math.ceil(char_level / 2.0)))

# WHITELISTS CONTAINERS
# One set of normalized names per category. They are filled from the manual
# document below and, later in this cell, from the API.
whitelists = {
    "background": set(), "feats": set(), "races": set(),
    "subclasses": set(), "skills": set(), "spells": set(), "classes": set()
}

# normalized spell name -> spell level, for manual entries written as "Name - N"
manual_spells_lvl = {}

# --- 1. LOAD MANUAL WHITELISTS (DOCX) ---
# The document is read paragraph by paragraph. A paragraph that contains a
# category keyword together with ":" is treated as a section header and only
# switches the current category; every other non-empty paragraph is an entry
# of the current category.
if os.path.exists(WHITELIST_DOC_PATH):
    print(f" - Integrating Manual Whitelist: {os.path.basename(WHITELIST_DOC_PATH)}")
    try:
        doc = Document(WHITELIST_DOC_PATH)
        current_cat = None
        for p in doc.paragraphs:
            line = p.text.strip()
            if not line: continue
            low = line.lower()

            # Header detection. The order matters: "subclass" has to be tested
            # before "class", because the former contains the latter.
            if "background" in low and ":" in low: current_cat = "background"
            elif "feat" in low and ":" in low: current_cat = "feats"
            elif "race" in low and ":" in low: current_cat = "races"
            elif "subclass" in low and ":" in low: current_cat = "subclasses"
            elif "skill" in low and ":" in low: current_cat = "skills"
            elif "class" in low and ":" in low: current_cat = "classes"
            elif "spell" in low and ":" in low: current_cat = "spells"

            elif current_cat:
                # For "label: value" entries only the text after the last colon is used.
                val = line.split(":")[-1].strip() if ":" in line else line
                if current_cat == "spells":
                    # "Name - N" carries the spell level after the last dash.
                    parts = val.rsplit('-', 1)
                    if len(parts) == 2 and parts[1].strip().isdigit():
                        s_name = parts[0].strip()
                        s_lvl = int(parts[1].strip())
                        norm = super_normalize(s_name)
                        whitelists['spells'].add(norm)
                        manual_spells_lvl[norm] = s_lvl
                    else:
                        # No level given: whitelist the name only.
                        norm = super_normalize(val)
                        whitelists['spells'].add(norm)
                else:
                    whitelists[current_cat].add(super_normalize(val))

    except Exception as e:
        # Best effort: the step continues without the manual whitelist.
        print(f"Could not read manual whitelist ({e})")

print(f"   Manual Spells with Levels Loaded: {len(manual_spells_lvl)}")

# API DATA
# Fills api_classes_hd (class -> hit die), api_spells_lvl (spell -> level)
# and merges the API name lists into the whitelists.
print(" - Fetching API Data...")
session = requests.Session()
api_classes_hd = {}
api_spells_lvl = {}


def merge_api_list(endpoint, category_key):
    """Add the normalized names listed by an API index endpoint to ``whitelists[category_key]``."""
    resp = session.get(f"{API_BASE}/{endpoint}", timeout=10)
    resp.raise_for_status()
    for i in resp.json()['results']:
        whitelists[category_key].add(super_normalize(i['name']))


try:
    classes_resp = session.get(f"{API_BASE}/classes", timeout=10)
    classes_resp.raise_for_status()
    for c in classes_resp.json()['results']:
        # One detail request per class to read its hit die. Every request has
        # a timeout so that a stalled connection cannot hang the cell.
        detail_resp = session.get(f"{API_BASE}/classes/{c['index']}", timeout=10)
        detail_resp.raise_for_status()
        detail = detail_resp.json()
        norm_name = super_normalize(c['name'])
        api_classes_hd[norm_name] = detail.get('hit_die', 8)
        whitelists['classes'].add(norm_name)

    print("Fetching Spell Index (via GraphQL)")
    query = """
    query {
      spells(limit: 2000) {
        name
        level
      }
    }
    """
    r_spells_response = session.post(GRAPHQL_URL, json={'query': query}, timeout=30)
    r_spells_response.raise_for_status()
    # "data" can be null in a GraphQL error reply; treat that as "no spells".
    all_spells = (r_spells_response.json().get('data') or {}).get('spells') or []

    for s in all_spells:
        api_spells_lvl[super_normalize(s['name'])] = s['level']

    print(f"        Indexed {len(api_spells_lvl)} spells via GraphQL.")

    merge_api_list("subclasses", "subclasses")
    merge_api_list("races", "races")
    merge_api_list("backgrounds", "background")
    merge_api_list("feats", "feats")
    merge_api_list("skills", "skills")

except Exception as e:
    # Top-level boundary of this step: report and continue with whatever was loaded.
    print(f"API Error: {e}")

print("Validation Logic Ready.")

# INTEGRATION LOOP
# Validates every raw character against the whitelists and rule constants,
# repairs empty spell lists of casters, and writes the accepted / rejected
# datasets.
print(f"Validating & Integrating Data from: {INPUT_FILE}")

dataset_pos = []
dataset_neg = []
stats_rejected = collections.Counter()
augmented_count = 0

with open(INPUT_FILE, 'r', encoding='utf-8') as f:
    raw_data = json.load(f)

for cid, d in raw_data.items():
    if not isinstance(d, dict):
        # A record that is not an object cannot be inspected at all.
        stats_rejected["MALFORMED_RECORD"] += 1
        dataset_neg.append({"race": "None", "class": "Unknown", "level": 1, "error": "Malformed record"})
        continue

    race_obj = d.get('race', {})
    if isinstance(race_obj, dict):
        raw_race = extract_scalar(race_obj.get('processedRace')) or extract_scalar(race_obj.get('race'))
        raw_subrace = extract_scalar(race_obj.get('subrace'))
    else:
        raw_race = str(race_obj)
        raw_subrace = None
    raw_bg = extract_scalar(d.get('background')) or "Unknown"

    class_obj = d.get('class')
    lvl_val, class_name_val = calculate_real_level(class_obj)

    # Subclass of the main (highest-level) class, if that entry is a dict.
    raw_subclass = None
    if isinstance(class_obj, dict):
        main_class_entry = class_obj.get(class_name_val)
        if isinstance(main_class_entry, dict):
            raw_subclass = extract_scalar(main_class_entry.get('subclass'))

    try:
        c_norm = super_normalize(class_name_val)

        # RACE
        if not isinstance(raw_race, str):
            # Missing or non-text race name: reject instead of crashing on len().
            stats_rejected["INVALID_RACE"] += 1
            raise ValueError(f"Invalid/Homebrew Race: {raw_race}")

        if len(raw_race) > MAX_STRING_LEN: raise ValueError("Race Name too long")

        if super_normalize(raw_race) not in whitelists['races']:
            stats_rejected["INVALID_RACE"] += 1
            raise ValueError(f"Invalid/Homebrew Race: {raw_race}")

        # BACKGROUND
        if super_normalize(raw_bg) not in whitelists['background']:
            stats_rejected["INVALID_BACKGROUND"] += 1
            raise ValueError(f"Invalid/Homebrew Background: {raw_bg}")

        # STATS
        try:
            clean_stats = validate_stats(d.get('attributes') or d.get('stats') or {})
        except ValueError as e:
            stats_rejected["INVALID_STATS"] += 1
            raise ValueError(str(e))

        # HP: must be positive and not exceed a generous per-level bound
        try:
            hp_val = int(extract_scalar(d.get('HP')) or 0)
        except (TypeError, ValueError):
            raise ValueError(f"Invalid HP value: {d.get('HP')}") from None

        if c_norm not in api_classes_hd:
            stats_rejected["UNKNOWN_CLASS"] += 1
            raise ValueError(f"Unknown Class (No Hit Die info): {class_name_val}")

        hd_val = api_classes_hd[c_norm]
        max_possible_hp = (hd_val + 7) * lvl_val * 2
        if hp_val <= 0 or hp_val > max_possible_hp:
            stats_rejected["IMPOSSIBLE_HP"] += 1
            raise ValueError(f"HP {hp_val} out of valid range")

        # AC: optional, but when present it has to be numeric. It is parsed
        # here, before the spell repair, so that a rejected sheet is never
        # counted as augmented.
        ac_raw = extract_scalar(d.get('AC'))
        ac_val = None
        if ac_raw is not None:
            try:
                ac_val = int(ac_raw)
            except (TypeError, ValueError):
                raise ValueError(f"Invalid AC value: {ac_raw}") from None

        # SUBCLASS: mandatory once the unlock level is reached, ignored before
        subclass_val = None
        unlock_lvl = SUBCLASS_UNLOCK_LEVELS.get(c_norm, SUBCLASS_UNLOCK_LEVELS["default"])

        if lvl_val >= unlock_lvl:
            if not raw_subclass:
                stats_rejected["MISSING_SUBCLASS"] += 1
                raise ValueError("Missing Required Subclass")

            if super_normalize(raw_subclass) not in whitelists['subclasses']:
                stats_rejected["INVALID_SUBCLASS"] += 1
                raise ValueError(f"Invalid Subclass: {raw_subclass}")
            subclass_val = raw_subclass

        # ALIGNMENT: None when the spelling is not in ALIGNMENT_MAP
        raw_al_obj = d.get('alignment', {})
        if isinstance(raw_al_obj, dict):
            raw_al = extract_scalar(raw_al_obj.get('processedAlignment')) or extract_scalar(raw_al_obj.get('alignment'))
        else:
            raw_al = extract_scalar(raw_al_obj)
        al_val = ALIGNMENT_MAP.get(super_normalize(raw_al))

        # SKILLS & FEATS: unknown entries are dropped silently
        skills = [s for s in (d.get('skills') or []) if super_normalize(s) in whitelists['skills']]
        if len(skills) > MAX_SKILLS_COUNT: raise ValueError("Too many skills")

        feats = [f for f in (d.get('feats') or []) if super_normalize(f) in whitelists['feats']]

        # Below level 4 only a variant human keeps feats.
        is_variant_human = "variant" in str(raw_race).lower()
        if lvl_val < 4 and len(feats) > 0 and not is_variant_human:
            feats = []

        # SPELL CHECK
        spells_raw = d.get('spells') or []
        clean_spells = []
        if isinstance(spells_raw, dict):
            # Flatten {key: entry-or-list-of-entries} into a flat list of entries.
            spell_iter = [
                item
                for sublist in spells_raw.values()
                for item in (sublist if isinstance(sublist, list) else [sublist])
            ]
        elif isinstance(spells_raw, list):
            spell_iter = spells_raw
        else:
            spell_iter = [spells_raw]

        max_slot = get_max_spell_level(lvl_val, c_norm)

        for s in spell_iter:
            if isinstance(s, dict):
                # Unwrap list-wrapped names like every other field; a wrapped
                # name would otherwise normalize to "" and the spell be dropped.
                s_str = extract_scalar(s.get('processedSpell')) or extract_scalar(s.get('spell'))
            else:
                s_str = str(s)
            if not s_str: continue

            s_norm = super_normalize(s_str)
            spell_lvl = None

            if s_norm in api_spells_lvl:
                spell_lvl = api_spells_lvl[s_norm]
            elif s_norm in manual_spells_lvl:
                spell_lvl = manual_spells_lvl[s_norm]
            elif s_norm in whitelists['spells']:
                # Whitelisted without a known level: accepted as is.
                clean_spells.append(s_str)
                continue

            if spell_lvl is not None:
                # Cantrips always pass; other spells must fit the highest slot.
                if spell_lvl == 0 or spell_lvl <= max_slot:
                    clean_spells.append(s_str)

        # REPAIR
        required_lvl_for_spells = SPELLCASTING_REQUIRED_AT.get(c_norm)

        # Subclass casters need spells from level 3 on.
        sub_norm = super_normalize(subclass_val) if subclass_val else ""
        if "arcanetrickster" in sub_norm or "eldritchknight" in sub_norm:
            required_lvl_for_spells = 3

        if required_lvl_for_spells is not None:
            if lvl_val >= required_lvl_for_spells and not clean_spells:

                pool = DEFAULT_SPELL_POOLS.get(c_norm)
                if pool:
                    # Pick 2 random spells from the class pool.
                    # NOTE(review): random is not seeded, so the output differs between runs.
                    num_to_pick = min(len(pool), 2)
                    clean_spells = random.sample(pool, num_to_pick)
                    augmented_count += 1
                else:
                    stats_rejected["MISSING_REQUIRED_SPELLS"] += 1
                    raise ValueError(f"Class {class_name_val} requires spells, but none valid/default found.")

        # WEAPONS: when the field is a dict, iterating it yields its keys
        raw_weapons = d.get('weapons') or []
        weapons = [str(w) for w in raw_weapons if w]

        dataset_pos.append({
            "race": str(raw_race),
            "subrace": raw_subrace,
            "class": str(class_name_val),
            "subclass": subclass_val,
            "level": int(lvl_val),
            "background": str(raw_bg),
            "stats": clean_stats,
            "alignment": al_val,
            "hp": int(hp_val),
            "ac": ac_val,
            "skills": skills,
            "spells": clean_spells,
            "feats": feats,
            "weapons": weapons,
        })

    except ValueError as e:
        dataset_neg.append({
            "race": str(raw_race),
            "class": str(class_name_val),
            "level": int(lvl_val),
            "error": str(e)
        })

# SAVE
print(f"INTEGRATION SUMMARY")
print(f"ACCEPTED: {len(dataset_pos)}")
print(f"REJECTED: {len(dataset_neg)}")
print(f"AUGMENTED (Spells Randomized): {augmented_count}")
for reason, count in stats_rejected.most_common():
    print(f"   - {reason:<20} : {count}")
with open(FILE_POS_INT, 'w', encoding='utf-8') as f: json.dump(dataset_pos, f, indent=2)
with open(FILE_NEG_INT, 'w', encoding='utf-8') as f: json.dump(dataset_neg, f, indent=2)

print(f"\nFinal datasets saved to {OUTPUT_DIR}")

Connecting to Google Drive
Mounted at /content/drive
 - Integrating Manual Whitelist: DnD_Integrated.docx
   Manual Spells with Levels Loaded: 31
 - Fetching API Data...
Fetching Spell Index (via GraphQL)
        Indexed 319 spells via GraphQL.
Validation Logic Ready.
Validating & Integrating Data from: /content/drive/MyDrive/DnD_Project_Data/dnd_chars_unique.json
INTEGRATION SUMMARY
ACCEPTED: 2114
REJECTED: 5832
AUGMENTED (Spells Randomized): 1091
   - INVALID_BACKGROUND   : 2898
   - INVALID_RACE         : 2048
   - INVALID_SUBCLASS     : 817
   - UNKNOWN_CLASS        : 45
   - MISSING_SUBCLASS     : 14
   - INVALID_STATS        : 5
   - IMPOSSIBLE_HP        : 5

Final datasets saved to /content/drive/MyDrive/DnD_Project_Data/dataset_integrated


## Dataset Splitting and Formatting

Data Loading: Loads the integrated positive and negative datasets generated in the previous step.


Task 1: Generation (1/3): Uses the first half of the available positive character sheets

Task 2: Completion/Fill (1/3): Uses the second half of the positive character sheets. Fields are masked (set to NULL)

Task 3: Refusal (1/3): Uses a subset of negative data


Data Cleaning: Applies a specific filter for weapon names (e.g., converting "Longsword.1" back to "Longsword").

Train/Test Split: Performs an 80% Train / 20% Test split within each bucket

**Saves two files:**

dnd_train.json: The training data formatted for the LLM.

dnd_test.json: A separated benchmark file with task types labelled for evaluation.

In [None]:
import json
import random
import copy
import os
import re

# PATHS
# Everything lives under one project folder below /content/drive.
BASE_DIR = "/content/drive/MyDrive/DnD_Project_Data"
# Folder holding the integrated positive/negative datasets read by this cell.
INPUT_DIR = os.path.join(BASE_DIR, "dataset_integrated")
# Folder that receives dnd_train.json and dnd_test.json; created if missing.
OUTPUT_DIR = os.path.join(BASE_DIR, "processed_dataset")
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Input files: positive and negative character records, respectively.
INPUT_POS = os.path.join(INPUT_DIR, "dataset_integrated_positive.json")
INPUT_NEG = os.path.join(INPUT_DIR, "dataset_integrated_negative.json")

# Prompt
# Instruction used for both the generation task and the refusal task.
INSTR_GEN = "Generate a complete D&D 5e character sheet based on the provided attributes."
# Instruction used for the completion (fill-in-the-NULLs) task.
INSTR_FILL = "Complete this D&D 5e character sheet. Replace the NULL value(s) with correct value(s) consistent with the rules."
# Fixed target output paired with every negative (refusal) example.
GENERIC_REFUSAL_OUTPUT = {
    "message": "The provided character data contains invalid, homebrew, or rule-breaking content inconsistent with D&D 5e rules. Cannot generate a character sheet."
}

#  CLEANING FUNCTIONS
def clean_vtt_artifacts(char):
    """
    Return a deep copy of ``char`` with numeric suffixes stripped from weapon names.

    A trailing dot followed by digits is removed from each weapon name
    (e.g., 'Shortsword.1' -> 'Shortsword'). Only a list stored under the
    'weapons' key is processed; any other value is copied through unchanged.
    Entries in the list that are not strings are kept as they are instead of
    raising.

    Args:
        char: Character record. It is never modified.

    Returns:
        An independent copy of ``char`` with cleaned weapon names.
    """
    clean_char = copy.deepcopy(char)
    if 'weapons' in clean_char and isinstance(clean_char['weapons'], list):
        clean_char['weapons'] = [
            # Regex: a dot followed by digits at the end of the string.
            # Non-string entries (None, numbers, ...) are passed through so a
            # single malformed record cannot abort the whole run.
            re.sub(r'\.\d+$', '', w) if isinstance(w, str) else w
            for w in clean_char['weapons']
        ]
    return clean_char

def clean_dirty_stats(stats_raw):
    """Standardizes the visual input format of stats for negative examples.

    Each key is lower-cased and truncated to its first three characters
    (e.g. 'Strength' -> 'str'); if two keys collapse to the same prefix the
    later one wins. A list value is replaced by its first element (0 for an
    empty list). Values are converted to ``int`` when possible; anything that
    cannot be converted is kept as-is.

    Args:
        stats_raw: Raw stats mapping, or any other object.

    Returns:
        A new dict of normalized stats, or ``None`` when ``stats_raw`` is not
        a dict.
    """
    if not isinstance(stats_raw, dict):
        return None
    clean = {}
    for k, v in stats_raw.items():
        clean_k = k.lower()[:3]
        clean_v = v
        if isinstance(v, list):
            clean_v = v[0] if v else 0
        try:
            clean_v = int(clean_v)
        except (TypeError, ValueError, OverflowError):
            # Best effort: keep the raw value when it is not int-convertible
            # (None, free text, nested containers, NaN/Infinity). Only the
            # errors int() raises for bad input are caught, so interrupts and
            # exits are no longer swallowed.
            pass
        clean[clean_k] = clean_v
    return clean

def build_input_context(char):
    """Build the reduced prompt context for a character record.

    The result holds, in this order: race, subrace, class, subclass, level
    (1 when absent), background and the normalized stats. Dict or list values
    found in the text fields are replaced by their ``str()`` form; the stats
    go through ``clean_dirty_stats``.
    """
    def _as_text(value):
        # Containers are flattened to their string form; scalars pass through.
        return str(value) if isinstance(value, (dict, list)) else value

    normalized_stats = clean_dirty_stats(char.get("stats"))

    context = {}
    for field in ("race", "subrace", "class", "subclass"):
        context[field] = _as_text(char.get(field))
    context["level"] = char.get("level", 1)
    context["background"] = _as_text(char.get("background"))
    context["stats"] = normalized_stats
    return context

def create_masked_entry(char):
    """
    Build a masked copy of a character for the completion task.

    Between one and three of the maskable fields are picked at random and set
    to None on a deep copy of ``char`` (a picked field that is absent is added
    with value None). When 'stats' is picked and holds a dict, the keys are
    kept and every value becomes None instead.

    Returns a tuple ``(masked_copy, label)`` where ``label`` is the
    comma-separated list of masked fields, with dict stats reported as
    "stats (all)".
    """
    maskable = ['alignment', 'subclass', 'background', 'stats', 'spells', 'weapons']
    result = copy.deepcopy(char)

    # Draw the count first, then the fields, so the random stream is consumed
    # in a fixed order.
    how_many = random.randint(1, 3)
    chosen = random.sample(maskable, how_many)

    labels = []
    for field in chosen:
        if field != 'stats' or not isinstance(result.get('stats'), dict):
            # Plain field (or stats that is not a dict): blank the whole value.
            result[field] = None
            labels.append(field)
            continue
        # Dict stats: keep the ability names, blank each score.
        result['stats'] = dict.fromkeys(result['stats'])
        labels.append("stats (all)")

    return result, ", ".join(labels)

# LOADING DATA
# Read the integrated positive/negative datasets produced by the previous step.
print("Loading Dataset")
try:
    with open(INPUT_POS, 'r', encoding='utf-8') as f:
        data_pos = json.load(f)
    with open(INPUT_NEG, 'r', encoding='utf-8') as f:
        data_neg = json.load(f)
    print(f"    Positive Samples Loaded: {len(data_pos)}")
    print(f"    Negative Samples Loaded: {len(data_neg)}")
except FileNotFoundError:
    print("Error: Files not found. Please ensure previous steps are completed.")
    raise

# BALANCING LOGIC
# Three equally sized buckets: generation and fill each take half of the
# positives, refusal takes the same number of negatives.
print("\nCalculating Balance (3-Bucket Logic)")
items_pos = data_pos
items_neg = data_neg
# NOTE(review): no random.seed() is set in this cell, so the shuffle (and
# therefore the train/test membership) differs on every run.
random.shuffle(items_pos)
random.shuffle(items_neg)

total_pos = len(items_pos)
# Divide positives into 2 equal buckets (Half for Gen, Half for Fill)
bucket_size = total_pos // 2
if bucket_size == 0:
    # With fewer than two positives both positive buckets would be empty.
    raise ValueError(
        f"Need at least 2 positive samples to build the buckets, got {total_pos}."
    )

# The refusal bucket cannot be larger than the negatives actually available.
neg_used = min(bucket_size, len(items_neg))

print(f"    Positives available: {total_pos}")
print(f"    Bucket Size calculated (Pos / 2): {bucket_size}")
print(f"    Negatives needed (equal to one bucket): {bucket_size}")
print(f"    Negatives discarded: {len(items_neg) - neg_used}")
if neg_used < bucket_size:
    print(f"    Warning: only {len(items_neg)} negatives available; the refusal bucket will be smaller than the positive buckets.")

# Create Distinct Groups
# The two positive pools are adjacent, non-overlapping slices of the shuffled
# list; with an odd number of positives the last record is left unused.
pool_gen = items_pos[:bucket_size]
pool_fill = items_pos[bucket_size : bucket_size*2]
pool_refusal = items_neg[:bucket_size]

# Visual spot-check only: prints the first record of each positive pool.
# It does not verify that the pools are free of duplicate records.
print("    [Check] Intersection verification: Are the first items distinct?")
print(f"       Gen[0] Race: {pool_gen[0].get('race')} Class: {pool_gen[0].get('class')}")
print(f"       Fill[0] Race: {pool_fill[0].get('race')} Class: {pool_fill[0].get('class')}")

# TRAIN / TEST SPLIT
print("\nSplitting Train (80%) / Test (20%)...")
def split_and_report(name, pool):
    """Split ``pool`` positionally into train (first 80%) and test (the rest).

    The cut index is rounded down, so very small pools put everything in the
    test part. A one-line size report labelled with ``name`` is printed.

    Returns:
        ``(train, test)`` as two slices of ``pool``.
    """
    cut = int(len(pool) * 0.8)
    head, tail = pool[:cut], pool[cut:]
    print(f"    Bucket {name:<10} | Total: {len(pool)} -> Train: {len(head)}, Test: {len(tail)}")
    return head, tail

train_gen, test_gen = split_and_report("GEN (Pos)", pool_gen)
train_fill, test_fill = split_and_report("FILL (Pos)", pool_fill)
train_ref, test_ref = split_and_report("NEG (Ref)", pool_refusal)

# --- 4. DATASET CONSTRUCTION ---
print("\nConstructing Final Dataset")

# Sanity check for the weapon cleaner: take the first training record whose
# weapon list contains a dot and show it before and after cleaning.
for candidate in train_gen:
    if 'weapons' in candidate and any('.' in w for w in candidate['weapons']):
        dirty_weapon_sample = candidate
        break
else:
    dirty_weapon_sample = None

if not dirty_weapon_sample:
    print("No 'dirty' weapon names found in the initial check.")
else:
    print(f"    [Debug] Weapon Cleaning (Before): {dirty_weapon_sample['weapons']}")
    cleaned_sample = clean_vtt_artifacts(dirty_weapon_sample)
    print(f"    [Debug] Weapon Cleaning (After):  {cleaned_sample['weapons']}")

train_dataset = []

# Generation task: reduced context in, full cleaned sheet out.
for p in train_gen:
    p_clean = clean_vtt_artifacts(p)
    gen_record = {
        "instruction": INSTR_GEN,
        "input": build_input_context(p_clean),
        "output": p_clean,
    }
    train_dataset.append(gen_record)

# Fill task: masked sheet in, full cleaned sheet out.
for p in train_fill:
    p_clean = clean_vtt_artifacts(p)
    masked_inp, _ = create_masked_entry(p_clean)
    fill_record = {
        "instruction": INSTR_FILL,
        "input": masked_inp,
        "output": p_clean,
    }
    train_dataset.append(fill_record)

# Refusal task: context of an invalid record in, fixed refusal out.
for n in train_ref:
    refusal_record = {
        "instruction": INSTR_GEN,
        "input": build_input_context(n),
        "output": GENERIC_REFUSAL_OUTPUT,
    }
    train_dataset.append(refusal_record)

# Mix the three task types so they are interleaved in the training file.
random.shuffle(train_dataset)
print(f"    Train Set Constructed: {len(train_dataset)} total samples.")

# Test set
# Same three tasks, but each entry carries its task type and a ready-made
# prompt string so it can be evaluated directly.
print("\n Constructing Test Set")
test_benchmark = []

for p in test_gen:
    p_clean = clean_vtt_artifacts(p)
    clean_inp = build_input_context(p_clean)
    gen_case = {
        "task_type": "generation",
        "llm_prompt": f"{INSTR_GEN}\n\nInput Data:\n{json.dumps(clean_inp)}",
        "expected_output": p_clean,
    }
    test_benchmark.append(gen_case)

for p in test_fill:
    p_clean = clean_vtt_artifacts(p)
    masked_inp, fname = create_masked_entry(p_clean)
    fill_case = {
        "task_type": "completion",
        "masked_field": fname,
        "llm_prompt": f"{INSTR_FILL}\n\nCharacter Sheet:\n{json.dumps(masked_inp)}",
        "expected_output": p_clean,
    }
    test_benchmark.append(fill_case)

for n in test_ref:
    unified_inp = build_input_context(n)
    refusal_case = {
        "task_type": "refusal",
        "llm_prompt": f"{INSTR_GEN}\n\nInput Data:\n{json.dumps(unified_inp)}",
        "expected_output": GENERIC_REFUSAL_OUTPUT,
    }
    test_benchmark.append(refusal_case)

print(f"Test set constructed: {len(test_benchmark)} total samples.")

# SAVE
print("Saving")
for out_name, payload in (("dnd_train.json", train_dataset), ("dnd_test.json", test_benchmark)):
    with open(os.path.join(OUTPUT_DIR, out_name), 'w') as f:
        json.dump(payload, f, indent=2)

print(f"Files saved in: {OUTPUT_DIR}")

[1/6] Loading Dataset...
    Positive Samples Loaded: 2114
    Negative Samples Loaded: 5832

[2/6] Calculating Balance (3-Bucket Logic)...
    Positives available: 2114
    Bucket Size calculated (Pos / 2): 1057
    Negatives needed (equal to one bucket): 1057
    Negatives discarded: 4775
    [Check] Intersection verification: Are the first items distinct?
       Gen[0] Race: Orc Class: Fighter
       Fill[0] Race: Human Class: Rogue

[3/6] Splitting Train (80%) / Test (20%)...
    Bucket GEN (Pos)  | Total: 1057 -> Train: 845, Test: 212
    Bucket FILL (Pos) | Total: 1057 -> Train: 845, Test: 212
    Bucket NEG (Ref)  | Total: 1057 -> Train: 845, Test: 212

[4/6] Constructing Final Dataset & Cleaning...
    [Debug] Weapon Cleaning (Before): ['Handaxe', 'Handaxe.1', 'Longbow']
    [Debug] Weapon Cleaning (After):  ['Handaxe', 'Handaxe', 'Longbow']
    Train Set Constructed: 2535 total samples.

[5/6] Constructing Benchmark (Test Set)...
    Test Benchmark Constructed: 636 total sampl