In [2]:
# OCR Cross-Reference Tool
# Validates OCR-scanned card names against the CubeCobra card list
# using exact and fuzzy matching to correct common OCR errors.
#
# What it does:
#   1. Loads the official card list from a CubeCobra export .csv file
#   2. Loads OCR results from player .csv files in the newest draft folder
#   3. Validates each card via case-insensitive exact match, then fuzzy match
#   4. Saves detailed_Player.csv into {draft}/detailed OCR/ and clean_Player.csv into data/clean/{draft}

import re
import csv
import glob
import os
from difflib import get_close_matches

# Cutoff handed to difflib.get_close_matches when an OCR name has no exact match
SIMILARITY_THRESHOLD = 0.65

# Project root is the parent of the current working directory, i.e. the
# script is expected to be launched from inside the scripts/ folder
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# All inputs and outputs live in sub-folders of <project root>/data
DRAFTED_DECKS_DIR, CLEAN_OUTPUT_DIR, CARDLIST_DIR = (
    os.path.join(PROJECT_ROOT, 'data', sub_folder)
    for sub_folder in ('drafted_decks', 'clean', 'cardlist')
)

# Auto-detect the newest draft folder. Folders are ranked by name in
# descending order, so date-prefixed names (e.g. 20260125_Draft_7) put the
# most recent draft first.
draft_folders = sorted(
    (d for d in glob.glob(os.path.join(DRAFTED_DECKS_DIR, "*")) if os.path.isdir(d)),
    reverse=True,
)
if not draft_folders:
    print(f"❌ ERROR: No draft folders found in {DRAFTED_DECKS_DIR}")
    # exit() is a site-module helper for interactive sessions and reports
    # success (status 0); raise SystemExit with a failure status instead.
    raise SystemExit(1)
newest_folder = draft_folders[0]
draft_name = os.path.basename(newest_folder)

# Output folders: the detailed files go into a sub-folder of the draft
# itself, the clean lists into CLEAN_OUTPUT_DIR/<draft name>
clean_folder    = os.path.join(CLEAN_OUTPUT_DIR, draft_name)
detailed_folder = os.path.join(newest_folder, "detailed OCR")
for output_dir in (detailed_folder, clean_folder):
    # exist_ok: re-running on the same draft must not fail
    os.makedirs(output_dir, exist_ok=True)

# Collect the player OCR files: regular .csv files directly inside the draft
# folder, minus anything that looks like output from a previous run.
EXCLUDED_SUFFIXES = ("_corrections", "_rejected", "_validated", "_detailed", "_clean")
EXCLUDED_PREFIXES = ("detailed_", "clean_")
cards_files = sorted(
    f for f in glob.glob(os.path.join(newest_folder, "*.csv"))
    # The glob also matches directories whose name ends in .csv; those
    # cannot be opened as player files, so keep regular files only.
    if os.path.isfile(f)
    and not os.path.splitext(os.path.basename(f))[0].endswith(EXCLUDED_SUFFIXES)
    and not os.path.basename(f).startswith(EXCLUDED_PREFIXES)
)
if not cards_files:
    print(f"❌ ERROR: No player .csv files found in '{newest_folder}'")
    # exit() is a site-module helper for interactive sessions and reports
    # success (status 0); raise SystemExit with a failure status instead.
    raise SystemExit(1)

# Auto-detect newest cube list (dimlasN_cardlist.csv, highest N wins)
def _cube_list_version(path):
    """Return N from a dimlasN_cardlist file name, or None if it has no number."""
    # Match on the file name only so directory names can never interfere.
    # IGNORECASE because glob matching is case-insensitive on some platforms.
    found = re.search(r'dimlas(\d+)_cardlist', os.path.basename(path), re.IGNORECASE)
    return int(found.group(1)) if found else None


# The glob also accepts names without a number (e.g. dimlas_cardlist.csv);
# those are dropped here instead of crashing the version lookup.
cube_lists = [
    path for path in glob.glob(os.path.join(CARDLIST_DIR, "dimlas*_cardlist.csv"))
    if _cube_list_version(path) is not None
]
if not cube_lists:
    print(f"❌ ERROR: No cube list files found in {CARDLIST_DIR}")
    # exit() is a site-module helper for interactive sessions and reports
    # success (status 0); raise SystemExit with a failure status instead.
    raise SystemExit(1)
CUBE_LIST_FILE = max(cube_lists, key=_cube_list_version)

# Echo the resolved inputs and outputs before any file is processed
player_file_names = [os.path.basename(path) for path in cards_files]
print(f"Draft folder    : {newest_folder}")
print(f"Detailed output : {detailed_folder}")
print(f"Clean output    : {clean_folder}")
print(f"Player files    : {player_file_names}")
print(f"Cube list       : {CUBE_LIST_FILE}")

# Load official card list from CubeCobra CSV export (expects a 'name' column).
#   official_cards       : set of card names exactly as spelled in the export
#   official_cards_lower : lower-cased name -> official spelling, used for
#                          case-insensitive exact matching
official_cards = set()
official_cards_lower = {}
# utf-8-sig reads plain UTF-8 unchanged but drops a leading BOM, which would
# otherwise stick to the first header and hide the 'name' column.
with open(CUBE_LIST_FILE, 'r', encoding='utf-8-sig', newline='') as f:
    reader = csv.DictReader(f)
    # fieldnames is None when the file is completely empty
    fieldnames = reader.fieldnames or []
    name_col = next((c for c in fieldnames if c.strip().lower() == 'name'), None)
    if name_col is None:
        print("❌ ERROR: Cube list CSV has no 'name' column")
        # exit() is a site-module helper for interactive sessions and reports
        # success (status 0); raise SystemExit with a failure status instead.
        raise SystemExit(1)
    for row in reader:
        # DictReader fills columns missing from a short row with None
        card = (row[name_col] or '').strip()
        if card:
            official_cards.add(card)
            # First spelling in file order wins if two names differ only by
            # case, so the mapping is the same on every run.
            official_cards_lower.setdefault(card.lower(), card)
print(f"✓ Loaded {len(official_cards)} official cards\n")

# Process each player: read the OCR names, match them against the official
# list, then write a detailed audit file and a clean card list.

# Optional quantity prefix such as "2x Some Card"; only the name part is kept
QUANTITY_PREFIX_RE = re.compile(r'(\d+)x\s+(.+)')
# First-row values that mark a header line rather than a card name
HEADER_LABELS = ('name', 'card', 'card name', 'cardname')

for OCR_INPUT_FILE in cards_files:
    player_name = os.path.splitext(os.path.basename(OCR_INPUT_FILE))[0]
    print(f"Processing: {player_name}")

    # Read card names from the first CSV column, skipping blank rows.
    # utf-8-sig reads plain UTF-8 unchanged but drops a leading BOM, which
    # would otherwise make a 'name' header look like a card.
    raw_cards = []
    with open(OCR_INPUT_FILE, 'r', encoding='utf-8-sig', newline='') as f:
        reader = csv.reader(f)
        header = next(reader, None)
        # A first row that is not a known column label is data, not a header
        if header:
            first_val = header[0].strip()
            if first_val and first_val.lower() not in HEADER_LABELS:
                raw_cards.append(first_val)
        raw_cards.extend(row[0].strip() for row in reader if row and row[0].strip())

    if not raw_cards:
        print("  ❌ No cards found — skipping\n")
        continue

    # Validate cards, preserving input order
    seen = set()   # official names already accepted for this player
    results = []   # (ocr_card, status, official_name); official_name is None when unmatched

    for raw in raw_cards:
        m = QUANTITY_PREFIX_RE.match(raw)
        ocr_card = m.group(2) if m else raw

        # 1) case-insensitive exact match, 2) fuzzy match, 3) give up
        official_name = official_cards_lower.get(ocr_card.lower())
        if official_name is not None:
            status = 'exact' if ocr_card == official_name else 'exact_corrected'
        else:
            matches = get_close_matches(ocr_card, official_cards, n=1, cutoff=SIMILARITY_THRESHOLD)
            if matches:
                official_name = matches[0]
                status = 'fuzzy'
            else:
                status = 'unmatched'

        # Only the first occurrence of an official card is kept; any later
        # hit on the same card is recorded as a duplicate.
        if official_name is not None:
            if official_name in seen:
                status = 'duplicate'
            else:
                seen.add(official_name)

        results.append((ocr_card, status, official_name))

    n_exact      = sum(1 for _, s, _ in results if s in ('exact', 'exact_corrected'))
    n_corrected  = sum(1 for _, s, _ in results if s == 'fuzzy')
    n_filtered   = sum(1 for _, s, _ in results if s == 'unmatched')
    n_duplicates = sum(1 for _, s, _ in results if s == 'duplicate')

    # Write detailed_Player.csv → into {draft}/detailed OCR/
    # One row per OCR input, including unmatched and duplicate entries.
    detailed_path = os.path.join(detailed_folder, f"detailed_{player_name}.csv")
    with open(detailed_path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['status', 'official_name', 'ocr_input', 'note'])
        for ocr_card, status, official_name in results:
            if status in ('exact', 'exact_corrected'):
                # Case-only differences are reported as plain 'exact'
                writer.writerow(['exact', official_name, ocr_card, ''])
            elif status == 'fuzzy':
                writer.writerow(['corrected', official_name, ocr_card, f'corrected from: {ocr_card}'])
            elif status == 'unmatched':
                writer.writerow(['unmatched', '', ocr_card, 'no match found'])
            elif status == 'duplicate':
                writer.writerow(['duplicate', official_name, ocr_card, 'duplicate removed'])

    # Write clean_Player.csv → into data/clean/{draft_name}
    # Only accepted cards, under their official spelling.
    clean_path = os.path.join(clean_folder, f"clean_{player_name}.csv")
    with open(clean_path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['name'])
        for _, status, official_name in results:
            if status in ('exact', 'exact_corrected', 'fuzzy'):
                writer.writerow([official_name])

    print(f"  Cards processed : {len(results)}")
    print(f"  Exact matches   : {n_exact}")
    print(f"  Corrected       : {n_corrected}")
    print(f"  Unmatched       : {n_filtered}")
    print(f"  Duplicates      : {n_duplicates}")
    print(f"  ✓ detailed_{player_name}.csv → {detailed_folder}")
    print(f"  ✓ clean_{player_name}.csv    → {clean_folder}\n")

print("✓ All files processed")

Draft folder    : c:\Users\Dimlas\Desktop\Dimi\Github\CubeOCR\data\drafted_decks\20260125_Draft_7
Detailed output : c:\Users\Dimlas\Desktop\Dimi\Github\CubeOCR\data\drafted_decks\20260125_Draft_7\detailed OCR
Clean output    : c:\Users\Dimlas\Desktop\Dimi\Github\CubeOCR\data\clean\20260125_Draft_7
Player files    : ['Andrin.csv', 'Dimlas.csv', 'Fubu.csv', 'Joel K..csv', 'Lukas Stalder.csv', 'Matthias.csv', 'Noe T..csv', 'Sili.csv', 'Tinu.csv', 'Tommy.csv', 'Valentin.csv', 'Yannik.csv']
Cube list       : c:\Users\Dimlas\Desktop\Dimi\Github\CubeOCR\data\cardlist\dimlas5_cardlist.csv
✓ Loaded 540 official cards

Processing: Andrin
  Cards processed : 29
  Exact matches   : 23
  Corrected       : 5
  Unmatched       : 1
  Duplicates      : 0
  ✓ detailed_Andrin.csv → c:\Users\Dimlas\Desktop\Dimi\Github\CubeOCR\data\drafted_decks\20260125_Draft_7\detailed OCR
  ✓ clean_Andrin.csv    → c:\Users\Dimlas\Desktop\Dimi\Github\CubeOCR\data\clean\20260125_Draft_7

Processing: Dimlas
  Cards process