In [1]:
# Test - Merge OCR and VC Drafted Decks
# Compares OCR-validated deck lists against VC drafted_decks data.
# Requires: pip install pandas requests

import re, glob, os, time
import pandas as pd
import requests

# Directory layout. BASE_DIR is the parent of the current working directory;
# the input folders hang off BASE_DIR, the output folder sits next to the notebook.
_NOTEBOOK_DIR  = os.getcwd()
BASE_DIR       = os.path.abspath(os.path.join(_NOTEBOOK_DIR, os.pardir))
DRAFT_DATA_DIR = os.path.join(BASE_DIR, 'Draft csv data')     # searched for *_drafted_decks.csv
CARDLIST_DIR   = os.path.join(BASE_DIR, 'data', 'cardlist')   # searched for dimlas*_cardlist.csv
OUTPUT_DIR     = os.path.join(_NOTEBOOK_DIR, 'output')

# Make sure the output folder exists; harmless when it is already there.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ── Load cube list (name ↔ scryfall_id lookup) ──
def _cube_list_version(path):
    """Return the integer N from a ``dimlas<N>_cardlist`` file name, or None.

    Only the file name is inspected, so a look-alike pattern in the directory
    part of ``path`` cannot be picked up by mistake.
    """
    match = re.search(r'dimlas(\d+)_cardlist', os.path.basename(path))
    return int(match.group(1)) if match else None


# Highest version first. Files that match the glob but carry no numeric
# version (e.g. "dimlas_cardlist.csv") are skipped instead of crashing the sort.
cube_lists = sorted(
    (f for f in glob.glob(os.path.join(CARDLIST_DIR, 'dimlas*_cardlist.csv'))
     if _cube_list_version(f) is not None),
    key=_cube_list_version,
    reverse=True,
)
if not cube_lists:
    raise FileNotFoundError(f'No dimlas*_cardlist.csv found in {CARDLIST_DIR}')

# Read the file once; the unfiltered frame supplies the total row count below.
_cube_raw = pd.read_csv(cube_lists[0])
df_cube = _cube_raw.dropna(subset=['scryfall_id'])
id_to_name = dict(zip(df_cube['scryfall_id'], df_cube['name'].str.strip()))
# Reverse lookup; when several IDs share one name, the last one seen wins.
name_to_id = {v: k for k, v in id_to_name.items()}

total = len(_cube_raw)
# Count rows that have an ID (not unique IDs), so a duplicated ID is not
# reported as if a card were missing its ID.
found = len(df_cube)
print(f'Cube list: {os.path.basename(cube_lists[0])} — {found}/{total} cards with scryfall ID')
if found == total:
    print('✓ All cards have a scryfall ID.')

Cube list: dimlas5_cardlist.csv — 540/540 cards with scryfall ID
✓ All cards have a scryfall ID.


In [2]:
# ── Load both sources ────────────────────────────────────────────────────────

# 1) OCR: clean_*.csv from newest draft folder in data/clean/
CLEAN_DIR = os.path.join(BASE_DIR, 'data', 'clean')
# "Newest" means the folder whose path sorts last as a string.
clean_draft_folders = sorted(
    (d for d in glob.glob(os.path.join(CLEAN_DIR, '*')) if os.path.isdir(d)),
    reverse=True
)
if not clean_draft_folders:
    raise FileNotFoundError(f'No draft folders found in {CLEAN_DIR}')
newest_clean = clean_draft_folders[0]
print(f'OCR folder: {os.path.basename(newest_clean)}')

_CLEAN_PREFIX, _CLEAN_SUFFIX = 'clean_', '.csv'

ocr_frames = []
for f in sorted(glob.glob(os.path.join(newest_clean, 'clean_*.csv'))):
    # Cut exactly one leading "clean_" and one trailing ".csv". str.replace
    # would also delete those substrings from inside the player name itself.
    player = os.path.basename(f)[len(_CLEAN_PREFIX):-len(_CLEAN_SUFFIX)]
    df = pd.read_csv(f)
    # Normalise headers so "Name", " name " etc. are all found as "name".
    df.columns = df.columns.str.strip().str.lower()
    if 'name' not in df.columns:
        # Same exception type pandas would raise, but naming the offending file.
        raise KeyError(f"'name' column missing in {f}")
    df['player'] = player
    ocr_frames.append(df[['player', 'name']])

if not ocr_frames:
    raise FileNotFoundError(f'No clean_*.csv files found in {newest_clean}')

# One row per (player, card); rename by column name rather than by position.
df_ocr = pd.concat(ocr_frames, ignore_index=True).rename(columns={'name': 'card_name'})
df_ocr['card_name'] = df_ocr['card_name'].str.strip()
print(f'  {len(df_ocr)} cards from {df_ocr["player"].nunique()} players')

# 2) VC: newest drafted_decks.csv → resolve scryfallId to card name
# DRAFT_DATA_DIR is defined once in the first cell; "newest" here means the
# file whose name sorts last as a string.
drafted = sorted(glob.glob(os.path.join(DRAFT_DATA_DIR, '*_drafted_decks.csv')), reverse=True)
if not drafted:
    raise FileNotFoundError('No *_drafted_decks.csv found')
print(f'\nVC file: {os.path.basename(drafted[0])}')

df_vc = pd.read_csv(drafted[0])[['player', 'scryfallId']].copy()
df_vc.columns = ['player', 'scryfall_id']
df_vc['player']    = df_vc['player'].str.strip()
df_vc['card_name'] = df_vc['scryfall_id'].map(id_to_name)

# Batch-fetch any IDs not in the cube list (old printings / swapped cards).
# Rows with no scryfallId at all are excluded: there is nothing to look up,
# and a NaN cannot be serialised into the JSON request body.
missing_ids = (
    df_vc.loc[df_vc['card_name'].isna(), 'scryfall_id']
    .dropna()
    .unique()
    .tolist()
)
if missing_ids:
    print(f'  Fetching {len(missing_ids)} IDs not in cube list from Scryfall...')
    # Scryfall's /cards/collection endpoint accepts at most 75 identifiers per call.
    _SCRYFALL_BATCH = 75
    api_map = {}
    for i in range(0, len(missing_ids), _SCRYFALL_BATCH):
        batch = missing_ids[i:i + _SCRYFALL_BATCH]
        resp = requests.post('https://api.scryfall.com/cards/collection',
                             json={'identifiers': [{'id': s} for s in batch]}, timeout=30)
        resp.raise_for_status()
        for card in resp.json().get('data', []):
            # Names joined with " // " are reduced to the part before the separator.
            api_map[card['id']] = card['name'].split(' // ')[0].strip()
        time.sleep(0.1)  # short pause between requests to stay polite to the API
    # Fill only the names that are still missing; already-resolved rows are untouched.
    df_vc['card_name'] = df_vc['card_name'].fillna(df_vc['scryfall_id'].map(api_map))
    print(f'  Resolved {len(api_map)}/{len(missing_ids)} via API')

# Rows that still have no name are dropped; say so instead of dropping silently,
# otherwise a later count mismatch is hard to trace back to this step.
_n_unresolved = int(df_vc['card_name'].isna().sum())
if _n_unresolved:
    print(f'  ⚠ Dropping {_n_unresolved} row(s) whose card name could not be resolved')

df_vc = df_vc[['player', 'card_name', 'scryfall_id']].dropna(subset=['card_name'])
print(f'  {len(df_vc)} cards from {df_vc["player"].nunique()} players')

# ── Card count check ──────────────────────────────────────────────────────────
# Report whether the OCR frame and the VC frame hold the same number of rows.
if len(df_ocr) == len(df_vc):
    print(f'\n✓ Card counts match: {len(df_ocr)} cards in both sources.')
else:
    print(f'\n⚠ WARNING: Card count mismatch — OCR has {len(df_ocr)} cards, VC has {len(df_vc)} cards (difference: {abs(len(df_ocr) - len(df_vc))})')

OCR folder: 20260125_Draft_7
  345 cards from 12 players

VC file: 2026_01_25_drafted_decks.csv
  345 cards from 12 players

✓ Card counts match: 345 cards in both sources.


In [3]:
# ── Compare newest Draft csv data file vs newest data/final file ──────────────

# Input folders for this comparison.
DRAFT_CSV_DIR = os.path.join(BASE_DIR, 'Draft csv data')
FINAL_DIR     = os.path.join(BASE_DIR, 'data', 'final')

# List candidates, then order each list most-recently-modified first.
# NOTE(review): "newest" is decided by file modification time here, whereas the
# VC load in the previous cell picks the file whose name sorts last — confirm
# both always select the same drafted_decks file.
draft_files = glob.glob(os.path.join(DRAFT_CSV_DIR, '*_drafted_decks.csv'))
draft_files.sort(key=os.path.getmtime, reverse=True)
final_files = glob.glob(os.path.join(FINAL_DIR, '*.csv'))
final_files.sort(key=os.path.getmtime, reverse=True)

# Fail early with the folder name when either side has nothing to compare.
if not draft_files:
    raise FileNotFoundError(f'No *_drafted_decks.csv found in {DRAFT_CSV_DIR}')
if not final_files:
    raise FileNotFoundError(f'No CSV files found in {FINAL_DIR}')

newest_draft, newest_final = draft_files[0], final_files[0]
print(f'Draft file : {os.path.basename(newest_draft)}')
print(f'Final file : {os.path.basename(newest_final)}')

df_draft, df_final = (pd.read_csv(path) for path in (newest_draft, newest_final))

# Compare the two files as sets of scryfallId values (missing IDs ignored).
# Being a set comparison, it does not look at duplicates or at which player
# holds a card.
draft_ids = set(df_draft['scryfallId'].dropna())
final_ids = set(df_final['scryfallId'].dropna())

in_draft_only = draft_ids.difference(final_ids)
in_final_only = final_ids.difference(draft_ids)
in_both       = draft_ids.intersection(final_ids)

print(f'\nCards in both      : {len(in_both)}')
print(f'In draft only      : {len(in_draft_only)}')
print(f'In final only      : {len(in_final_only)}')

# For each side that has extras, list the distinct (player, scryfallId) rows.
for _header, _frame, _extra_ids in (
    ('\n── In Draft but not in Final ──', df_draft, in_draft_only),
    ('\n── In Final but not in Draft ──', df_final, in_final_only),
):
    if not _extra_ids:
        continue
    print(_header)
    rows = _frame.loc[_frame['scryfallId'].isin(_extra_ids), ['player', 'scryfallId']].drop_duplicates()
    display(rows.reset_index(drop=True))

# Final verdict.
if in_draft_only or in_final_only:
    print(f'\n⚠ WARNING: {len(in_draft_only)} card(s) in draft only, {len(in_final_only)} card(s) in final only.')
else:
    print('\n✓ Perfect match — both files contain the same cards.')

Draft file : 2026_01_25_drafted_decks.csv
Final file : 20260125_Draft_7.csv

Cards in both      : 345
In draft only      : 0
In final only      : 0

✓ Perfect match — both files contain the same cards.
