In [1]:
# Archetype Predictor
# Predicts the archetype of each drafted deck by comparing card scryfall IDs
# against the cleaned archetype win rate data.
#
# How it works:
#   For each card in the deck, look up which archetypes it appears in.
#   Each archetype gets a score equal to the sum of games_played for that card.
#   The archetype with the highest total score is the prediction.
#
# Input:  data/archetype_decktype_data/archetype_data.csv
#         data/clean/<draft_folder>/clean_<player>.csv
# Output: predicted archetype per player (printed as a summary table)

import pandas as pd
import os
import glob

# Project root is one level up from this scripts/ folder.
# NOTE: the root is derived from the kernel's current working directory, so the
# paths below only resolve when the notebook runs with scripts/ as the cwd.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))
ARCHETYPE_FILE = os.path.join(PROJECT_ROOT, 'data', 'archetype_decktype_data', 'archetype_data.csv')

# Auto-detect the newest draft folder.
# "Newest" here means last in lexicographic order of the sub-folder names;
# that matches chronological order only if the names sort by date.
CLEAN_ROOT = os.path.join(PROJECT_ROOT, 'data', 'clean')
draft_folders = sorted([
    d for d in os.listdir(CLEAN_ROOT)
    if os.path.isdir(os.path.join(CLEAN_ROOT, d))
])
if not draft_folders:
    # Fail with a message that names the directory instead of a bare
    # "IndexError: list index out of range" from draft_folders[-1].
    raise FileNotFoundError(f"No draft folders found in {CLEAN_ROOT}")
DRAFT_FOLDER = draft_folders[-1]
CLEAN_DIR = os.path.join(CLEAN_ROOT, DRAFT_FOLDER)
print(f"Using draft folder: {DRAFT_FOLDER}")

# Archetype labels the predictor scores against. Order matters: it fixes the
# column order of the per-archetype score output and the tie-break of max().
ARCHETYPES = [
    'Aggro', 'Aggro-Combo', 'Aggro-Control (Tempo)',
    'Combo', 'Combo-Control',
    'Control', 'Control-Aggro (Midrange)',
]

# Read the archetype table and collapse it into a lookup Series:
#   index  -> (scryfallId, archetype)
#   value  -> total games_played for that card in that archetype
archetype_df = pd.read_csv(ARCHETYPE_FILE)
games_by_card_and_archetype = archetype_df.groupby(['scryfallId', 'archetype'])['games_played']
card_scores = games_by_card_and_archetype.sum()

archetype_row_count = len(archetype_df)
unique_card_count = archetype_df['scryfallId'].nunique()
print(f"Loaded {archetype_row_count:,} archetype rows | {unique_card_count} unique cards")

Using draft folder: 20260125_Draft_7
Loaded 1,334 archetype rows | 580 unique cards


In [2]:
# Predict archetype for each clean deck
#
# For every clean_<player>.csv in CLEAN_DIR:
#   1. read the deck and collect its (whitespace-stripped) scryfall ids,
#   2. add each known card's games_played to the matching archetype score,
#   3. predict the archetype with the highest total score.
# Produces `results_df` (one row per player) and prints a per-deck report
# followed by a summary table.

# Label used when a deck has no scoring evidence at all (no matched cards, or
# only zero-game rows). Without it max() would silently return the first key
# of the all-zero dict, i.e. 'Aggro'.
NO_PREDICTION = 'Unknown'

results = []
clean_files = sorted(glob.glob(os.path.join(CLEAN_DIR, 'clean_*.csv')))
print(f"Found {len(clean_files)} deck(s) in {DRAFT_FOLDER}\n")

# Loop-invariant: the set of card ids present in the archetype data. Building
# it once avoids re-deriving the index level for every card of every deck.
known_card_ids = set(card_scores.index.get_level_values(0))

for filepath in clean_files:
    player = os.path.basename(filepath).replace('clean_', '').replace('.csv', '')

    deck_df = pd.read_csv(filepath)
    # Kept as a Series (with deck_df's row labels) so that an unmatched id can
    # be mapped back to its row even when the raw cell had padding whitespace.
    deck_ids = deck_df['scryfall_id'].dropna().str.strip()
    scryfall_ids = deck_ids.tolist()

    # Sum games_played per archetype across all cards in the deck
    scores = {arch: 0 for arch in ARCHETYPES}
    matched = 0
    unmatched = []

    for sid in scryfall_ids:
        if sid in known_card_ids:
            matched += 1
            for arch, gp in card_scores[sid].items():
                # Archetypes outside ARCHETYPES are ignored
                if arch in scores:
                    scores[arch] += gp
        else:
            # Compare against the stripped ids (not the raw column) so the
            # card name is found; fall back to the id if no row matches.
            row_labels = deck_ids.index[(deck_ids == sid).to_numpy()]
            card_name = deck_df.loc[row_labels, 'name'].values
            unmatched.append(card_name[0] if len(card_name) > 0 else sid)

    total_score = sum(scores.values())
    predicted = max(scores, key=scores.get) if total_score > 0 else NO_PREDICTION

    # Share of the total score per archetype, in percent (0 when nothing scored)
    breakdown = {arch: round(v / total_score * 100, 1) if total_score > 0 else 0
                 for arch, v in scores.items()}

    results.append({
        'player': player,
        'predicted_archetype': predicted,
        'cards_matched': matched,
        'cards_unmatched': len(unmatched),
        **{f'score_{a}': breakdown[a] for a in ARCHETYPES},
    })

    print(f"{player:20s} -> {predicted}")
    print(f"  matched: {matched}/{len(scryfall_ids)} cards")
    if unmatched:
        print(f"  unmatched: {unmatched}")
    print(f"  scores (%): { {a: breakdown[a] for a in ARCHETYPES} }")
    print()

# Explicit columns keep the frame well-formed (and the summary printable)
# even when no deck files were found.
summary_columns = ['player', 'predicted_archetype', 'cards_matched', 'cards_unmatched']
results_df = pd.DataFrame(
    results,
    columns=summary_columns + [f'score_{a}' for a in ARCHETYPES],
)
print("\n--- Summary ---")
print(results_df[summary_columns].to_string(index=False))

Found 12 deck(s) in 20260125_Draft_7

Andrin               -> Control
  matched: 26/28 cards
  unmatched: ['Kaito, Bane of Nightmares', "Bloodchief's Thirst"]
  scores (%): {'Aggro': 10.6, 'Aggro-Combo': 0.0, 'Aggro-Control (Tempo)': 17.1, 'Combo': 31.6, 'Combo-Control': 4.7, 'Control': 33.6, 'Control-Aggro (Midrange)': 2.6}

Dimlas               -> Aggro
  matched: 26/27 cards
  unmatched: ['Enduring Innocence']
  scores (%): {'Aggro': 60.2, 'Aggro-Combo': 0.0, 'Aggro-Control (Tempo)': 2.9, 'Combo': 8.0, 'Combo-Control': 4.0, 'Control': 25.0, 'Control-Aggro (Midrange)': 0.0}

Fubu                 -> Aggro
  matched: 22/27 cards
  unmatched: ['Baloth Prime', 'Greasewrench Goblin', 'Badgermole Cub', 'Shadowspear', 'Formidable Speaker']
  scores (%): {'Aggro': 26.2, 'Aggro-Combo': 24.3, 'Aggro-Control (Tempo)': 0.0, 'Combo': 15.8, 'Combo-Control': 5.2, 'Control': 9.6, 'Control-Aggro (Midrange)': 18.9}

Joel K.              -> Combo
  matched: 19/22 cards
  unmatched: ['Murktide Regent', 

In [3]:
# Method comparison: Raw Sum vs Normalized Per-Card
#
# Raw Sum (current):    score(arch) = sum of games_played(card, arch) across all cards
#                       Cards with more historical data pull harder.
#
# Normalized Per-Card:  score(arch) = sum of [games_played(card, arch) / total_games(card)]
#                       Each card contributes equally regardless of data volume.
#
# Ground truth loaded from data/archetype_decktype_data/<date>_drafted_decks.csv
# (auto-detects the most recent file matching that pattern)

import glob as _glob

# --- Load ground truth ---
# Uses the last *_drafted_decks.csv in lexicographic filename order.
gt_pattern = os.path.join(PROJECT_ROOT, 'data', 'archetype_decktype_data', '*_drafted_decks.csv')
gt_files = sorted(_glob.glob(gt_pattern))
if not gt_files:
    print("No ground truth file found - skipping comparison.")
else:
    gt_file = gt_files[-1]
    print(f"Ground truth: {os.path.basename(gt_file)}\n")
    gt_df = pd.read_csv(gt_file)
    # One archetype per player (all rows for a player share the same archetype)
    gt = gt_df.groupby('player')['archetype'].first().to_dict()

    # --- Precompute total games per card for normalization ---
    card_total_games = archetype_df.groupby('scryfallId')['games_played'].sum()
    # Loop-invariant set of card ids present in the archetype data; built once
    # instead of re-deriving the index level for every card of every deck.
    known_card_ids = set(card_scores.index.get_level_values(0))

    # --- Run both methods ---
    comparison = []

    for filepath in sorted(_glob.glob(os.path.join(CLEAN_DIR, 'clean_*.csv'))):
        player = os.path.basename(filepath).replace('clean_', '').replace('.csv', '')
        deck_df = pd.read_csv(filepath)
        scryfall_ids = deck_df['scryfall_id'].dropna().str.strip().tolist()

        raw_scores  = {arch: 0.0 for arch in ARCHETYPES}
        norm_scores = {arch: 0.0 for arch in ARCHETYPES}

        for sid in scryfall_ids:
            if sid not in known_card_ids:
                continue
            card_archs = card_scores[sid]
            total = card_total_games.get(sid, 1)
            for arch, gp in card_archs.items():
                if arch in ARCHETYPES:
                    raw_scores[arch] += gp
                    # A card with zero recorded games carries no information;
                    # skipping it avoids 0/0 = NaN poisoning the normalized sum.
                    if total > 0:
                        norm_scores[arch] += gp / total

        pred_raw  = max(raw_scores,  key=raw_scores.get)
        pred_norm = max(norm_scores, key=norm_scores.get)
        # '?' marks "no ground truth for this player" (missing key or empty value)
        truth = gt.get(player, '?')
        if pd.isna(truth):
            truth = '?'

        comparison.append({
            'player':     player,
            'truth':      truth,
            'raw_sum':    pred_raw,
            'normalized': pred_norm,
            'raw_ok':     pred_raw  == truth,
            'norm_ok':    pred_norm == truth,
        })

    # Explicit columns keep the frame usable even when no deck files were found.
    cmp_df = pd.DataFrame(
        comparison,
        columns=['player', 'truth', 'raw_sum', 'normalized', 'raw_ok', 'norm_ok'],
    )
    # Decks without ground truth can be neither right nor wrong, so they are
    # shown with '-' and left out of the accuracy figures below.
    has_truth = (cmp_df['truth'] != '?').astype(bool)

    # --- Print results ---
    print(f"{'Player':<20} {'Truth':<30} {'Raw Sum':<30} {'Normalized':<30} {'Raw':>4} {'Norm':>5}")
    print('-' * 115)
    for (_, row), truth_known in zip(cmp_df.iterrows(), has_truth):
        raw_mark  = ('OK' if row['raw_ok']  else 'X') if truth_known else '-'
        norm_mark = ('OK' if row['norm_ok'] else 'X') if truth_known else '-'
        print(f"{row['player']:<20} {row['truth']:<30} {row['raw_sum']:<30} {row['normalized']:<30} "
              f"{raw_mark:>4} {norm_mark:>5}")

    scored   = cmp_df[has_truth]
    n_scored = len(scored)
    raw_hit  = scored['raw_ok'].astype(bool)
    norm_hit = scored['norm_ok'].astype(bool)

    # With nothing to score, report 0% rather than NaN
    raw_acc  = raw_hit.mean()  if n_scored else 0.0
    norm_acc = norm_hit.mean() if n_scored else 0.0
    both_right = (raw_hit & norm_hit).sum()
    only_raw   = (raw_hit & ~norm_hit).sum()
    only_norm  = (~raw_hit & norm_hit).sum()

    print(f"\nAccuracy — Raw Sum: {raw_acc:.0%} ({raw_hit.sum()}/{n_scored})  "
          f"Normalized: {norm_acc:.0%} ({norm_hit.sum()}/{n_scored})")
    if n_scored < len(cmp_df):
        print(f"({len(cmp_df) - n_scored} deck(s) without ground truth excluded from accuracy)")
    print(f"Both correct: {both_right}  Only Raw: {only_raw}  Only Norm: {only_norm}")
    winner = 'Normalized' if norm_acc > raw_acc else ('Raw Sum' if raw_acc > norm_acc else 'Tie')
    print(f"Winner: {winner}")

Ground truth: 2026_01_25_drafted_decks.csv

Player               Truth                          Raw Sum                        Normalized                      Raw  Norm
-------------------------------------------------------------------------------------------------------------------
Andrin               Aggro-Control (Tempo)          Control                        Control                           X     X
Dimlas               Aggro                          Aggro                          Aggro                            OK    OK
Fubu                 Aggro-Combo                    Aggro                          Aggro                             X     X
Joel K.              Control                        Combo                          Control                           X    OK
Lukas Stalder        Control                        Control                        Control                          OK    OK
Matthias             Combo                          Combo                          Combo  