In [10]:
# Test: Compare both CSVs from both export zips
# Extracts matches + drafted_decks from the newest zip in 'Draft csv data' (original)
# and the newest zip in 'data/zip' (generated), then compares both files.
# Requires: pip install pandas

import glob, os, zipfile
import pandas as pd

# All locations are resolved relative to the parent of the current working directory.
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Folder searched for the "original" export zips.
DRAFT_DATA_DIR = os.path.join(BASE_DIR, 'Draft csv data')

# Folder searched for the "generated" export zips.
GENERATED_DIR = os.path.join(os.path.join(BASE_DIR, 'data'), 'zip')


def extract_csv_from_zip(zip_path, suffix):
    """Return (DataFrame, filename) for the first file ending with `suffix` in the zip.

    Parameters
    ----------
    zip_path : path of the zip archive to open.
    suffix   : trailing part of the member name to look for, e.g. '_matches.csv'.

    Returns
    -------
    (pandas.DataFrame, str) : the parsed CSV and the member name it was read from.

    Raises
    ------
    FileNotFoundError : if no usable member ends with `suffix`.

    Archives created by macOS Finder carry AppleDouble companions such as
    '__MACOSX/._2026_matches.csv'. They end with the same suffix as the real
    CSV but hold binary metadata, so they are skipped.
    """
    def _is_metadata(member):
        # Zip member names always use '/' as separator, regardless of platform.
        return member.startswith('__MACOSX/') or member.rsplit('/', 1)[-1].startswith('._')

    with zipfile.ZipFile(zip_path) as zf:
        # Archive order is preserved so "first" keeps its original meaning.
        names = [n for n in zf.namelist() if n.endswith(suffix) and not _is_metadata(n)]
        if not names:
            raise FileNotFoundError(f'No *{suffix} found inside {os.path.basename(zip_path)}')
        with zf.open(names[0]) as f:
            return pd.read_csv(f), names[0]


# ── Step 1: Load both CSVs from newest zip in Draft csv data/ ────────────────
# Candidates are ordered by file name, descending; the first one is used.
# NOTE(review): that is the newest export only if names start with a sortable
# date (as in the recorded output) — confirm the naming convention.
original_zips = glob.glob(os.path.join(DRAFT_DATA_DIR, '*.zip'))
original_zips.sort(reverse=True)
if len(original_zips) == 0:
    raise FileNotFoundError(f'No zip files found in {DRAFT_DATA_DIR}')

original_zip = original_zips[0]
(df_orig_matches, orig_matches_name) = extract_csv_from_zip(original_zip, '_matches.csv')
(df_orig_decks, orig_decks_name) = extract_csv_from_zip(original_zip, '_drafted_decks.csv')

# One print call, one line per argument: archive name, then each extracted CSV.
print(
    f'Original zip : {os.path.basename(original_zip)}',
    f'  └── {orig_matches_name}  ({len(df_orig_matches)} rows)',
    f'  └── {orig_decks_name}  ({len(df_orig_decks)} rows)',
    sep='\n',
)

Original zip : 2026_02_22_tournament_export.zip
  └── 2026_02_22_matches.csv  (24 rows)
  └── 2026_02_22_drafted_decks.csv  (352 rows)


In [11]:
# ── Step 2: Load both CSVs from newest zip in data/zip/ ──────────────────────
# Candidates are ordered by file name, descending; the first one is used.
# NOTE(review): that is the newest export only if names start with a sortable
# date (as in the recorded output) — confirm the naming convention.
generated_zips = glob.glob(os.path.join(GENERATED_DIR, '*.zip'))
generated_zips.sort(reverse=True)
if len(generated_zips) == 0:
    raise FileNotFoundError(f'No zip files found in {GENERATED_DIR}')

generated_zip = generated_zips[0]
(df_gen_matches, gen_matches_name) = extract_csv_from_zip(generated_zip, '_matches.csv')
(df_gen_decks, gen_decks_name) = extract_csv_from_zip(generated_zip, '_drafted_decks.csv')

# One print call, one line per argument: archive name, then each extracted CSV.
print(
    f'Generated zip: {os.path.basename(generated_zip)}',
    f'  └── {gen_matches_name}  ({len(df_gen_matches)} rows)',
    f'  └── {gen_decks_name}  ({len(df_gen_decks)} rows)',
    sep='\n',
)

Generated zip: 2026_02_22_tournament_export.zip
  └── 2026_02_22_matches.csv  (24 rows)
  └── 2026_02_22_drafted_decks.csv  (352 rows)


In [12]:
# ── Step 3: Compare matches ───────────────────────────────────────────────────
# Within each round, pairings may appear in a different order and
# player1/player2 columns may be swapped. Normalise so player1 is always
# alphabetically first, then sort by (round, player1, player2).

def normalize_matches(df):
    """Return a copy of `df` in a canonical form for order-insensitive comparison.

    Steps, all applied to a copy so the input frame is left untouched:
      1. strip surrounding whitespace from the column headers;
      2. wherever player1 compares greater than player2, exchange the two
         names and their win counts;
      3. order rows by (round, player1, player2) and renumber the index from 0.
    """
    canonical = df.copy()
    canonical.columns = canonical.columns.str.strip()

    out_of_order = canonical['player1'] > canonical['player2']
    for first, second in (('player1', 'player2'), ('player1Wins', 'player2Wins')):
        # .values drops the column labels, so the assignment is positional
        # (a labelled frame would be re-aligned by name and undo the swap).
        exchanged = canonical.loc[out_of_order, [second, first]].values
        canonical.loc[out_of_order, [first, second]] = exchanged

    ordered = canonical.sort_values(['round', 'player1', 'player2'])
    return ordered.reset_index(drop=True)

# Columns that take part in the comparison; anything else in the CSV is ignored.
CMP_COLS = ['round', 'player1', 'player1Wins', 'player2', 'player2Wins', 'draws']

# Canonical row order + compared columns only, with a fresh 0..n-1 index.
df_orig_cmp = normalize_matches(df_orig_matches)[CMP_COLS].reset_index(drop=True)
df_gen_cmp  = normalize_matches(df_gen_matches)[CMP_COLS].reset_index(drop=True)

if df_orig_cmp.equals(df_gen_cmp):
    print('✓ Matches files match perfectly!')
else:
    mismatch_header = f'Matches mismatch between {os.path.basename(original_zip)} and {os.path.basename(generated_zip)}:\n'
    print('⚠ Differences found in matches:')

    if len(df_orig_cmp) != len(df_gen_cmp):
        # DataFrame.compare() raises ValueError unless both frames are
        # identically labelled, so a row-count mismatch is reported directly
        # instead of being hidden behind that unrelated error.
        row_counts = f'Row count: original={len(df_orig_cmp)}, generated={len(df_gen_cmp)}'
        print(row_counts)
        raise AssertionError(mismatch_header + row_counts)

    diff = df_orig_cmp.compare(df_gen_cmp)
    if diff.empty:
        # equals() also checks dtypes, compare() only checks values. An empty
        # diff therefore most likely means a dtype difference, so show the
        # dtypes side by side rather than an empty table.
        diff = pd.concat(
            [df_orig_cmp.dtypes, df_gen_cmp.dtypes],
            axis=1,
            keys=['original_dtype', 'generated_dtype'],
        )
    display(diff)
    # Explicit raise: unlike `assert False`, it is not stripped under `python -O`.
    raise AssertionError(mismatch_header + f'{diff}')

✓ Matches files match perfectly!


In [13]:
# ── Step 4: Compare drafted_decks ─────────────────────────────────────────────
# Sort by player then scryfallId so card order within a deck doesn't matter.

def normalize_decks(df, tie_breakers=('decktype', 'quantity', 'archetype', 'tournament')):
    """Return a copy of `df` in a canonical row order for order-insensitive comparison.

    Parameters
    ----------
    df           : drafted-decks frame; must contain 'player' and 'scryfallId'
                   (after header whitespace is stripped).
    tie_breakers : extra columns used, in order, to break ties between rows
                   that share the same (player, scryfallId). Names that are
                   not present in `df` are ignored.

    Returns
    -------
    pandas.DataFrame : stripped headers, rows sorted, index renumbered from 0.
                       The input frame is not modified.

    Sorting on (player, scryfallId) alone leaves rows with an identical key in
    their original file order. Two files with the same content could then
    still normalise differently, for instance when one card is listed under
    two decktypes. The tie-breakers make the order depend on the content only.
    """
    df = df.copy()
    df.columns = df.columns.str.strip()
    sort_keys = ['player', 'scryfallId']
    sort_keys += [c for c in tie_breakers if c in df.columns and c not in sort_keys]
    df = df.sort_values(sort_keys).reset_index(drop=True)
    return df

# Columns that take part in the comparison; anything else in the CSV is ignored.
DECK_COLS = ['archetype', 'decktype', 'player', 'quantity', 'scryfallId', 'tournament']

df_orig_decks_cmp = normalize_decks(df_orig_decks)[DECK_COLS]
df_gen_decks_cmp  = normalize_decks(df_gen_decks)[DECK_COLS]

if df_orig_decks_cmp.reset_index(drop=True).equals(df_gen_decks_cmp.reset_index(drop=True)):
    print('✓ Drafted decks files match perfectly!')
else:
    # The frames differ. Collect every kind of difference that can be
    # isolated, show the details, then fail once with a summary.
    errors = []

    # Row count difference
    if len(df_orig_decks_cmp) != len(df_gen_decks_cmp):
        errors.append(f'Row count: original={len(df_orig_decks_cmp)}, generated={len(df_gen_decks_cmp)}')

    # Extra / missing cards (by player + scryfallId)
    KEY = ['player', 'scryfallId']
    orig_keys = df_orig_decks_cmp[KEY].drop_duplicates()
    gen_keys  = df_gen_decks_cmp[KEY].drop_duplicates()

    # Left merge with indicator=True: rows tagged 'left_only' have no partner on the other side.
    only_in_orig = orig_keys.merge(gen_keys, how='left', indicator=True).query('_merge=="left_only"').drop('_merge', axis=1)
    only_in_gen  = gen_keys.merge(orig_keys, how='left', indicator=True).query('_merge=="left_only"').drop('_merge', axis=1)

    if not only_in_orig.empty:
        print('⚠ Cards only in ORIGINAL (missing from generated):')
        display(only_in_orig)
        errors.append(f'{len(only_in_orig)} card(s) missing from generated')

    if not only_in_gen.empty:
        print('⚠ Cards only in GENERATED (extra, not in original):')
        display(only_in_gen)
        errors.append(f'{len(only_in_gen)} extra card(s) in generated')

    # Value differences on matching rows (same player+scryfallId).
    # A key may occur more than once per file, and a plain merge on KEY would
    # pair every occurrence with every other one, reporting false differences.
    # Numbering the occurrences pairs the n-th row of a key in one file with
    # the n-th row of that key in the other.
    OCCURRENCE = '_occurrence'
    orig_numbered = df_orig_decks_cmp.assign(
        **{OCCURRENCE: df_orig_decks_cmp.groupby(KEY, dropna=False).cumcount()}
    )
    gen_numbered = df_gen_decks_cmp.assign(
        **{OCCURRENCE: df_gen_decks_cmp.groupby(KEY, dropna=False).cumcount()}
    )
    merged = orig_numbered.merge(
        gen_numbered, on=KEY + [OCCURRENCE], suffixes=('_orig', '_gen')
    ).drop(columns=OCCURRENCE)

    diff_cols = [c for c in DECK_COLS if c not in KEY]
    orig_vals = merged[[f'{c}_orig' for c in diff_cols]].set_axis(diff_cols, axis=1)
    gen_vals  = merged[[f'{c}_gen' for c in diff_cols]].set_axis(diff_cols, axis=1)
    # NaN != NaN is True, so a value missing on both sides must be excluded
    # explicitly or every empty cell would count as a difference.
    both_missing = orig_vals.isna() & gen_vals.isna()
    value_diffs = merged[((orig_vals != gen_vals) & ~both_missing).any(axis=1)]
    if not value_diffs.empty:
        print('⚠ Value differences on matching cards:')
        display(value_diffs)
        errors.append(f'{len(value_diffs)} row(s) with value differences')

    if not errors:
        # equals() reported a difference that none of the checks above could
        # isolate (it also compares dtypes). Never pass silently in that case.
        dtype_mismatch = [c for c in DECK_COLS if df_orig_decks_cmp[c].dtype != df_gen_decks_cmp[c].dtype]
        errors.append(
            'Frames are not identical but no row-level difference was isolated; '
            f'columns with differing dtypes: {dtype_mismatch} '
            '(also check how often each player+scryfallId occurs in each file)'
        )
        print('⚠ ' + errors[-1])

    # Explicit raise: unlike `assert`, it is not stripped under `python -O`.
    raise AssertionError(
        f'Decks mismatch between {os.path.basename(original_zip)} and {os.path.basename(generated_zip)}:\n' + '\n'.join(errors)
    )


✓ Drafted decks files match perfectly!
