In [2]:
# Extractor and OCR
# Automatically reads player deck photos from Google Drive and extracts card names using OCR.
# Run this after each draft to generate deck lists for all players.
#
# What it does:
#   1. Connects to Google Drive using OAuth credentials
#   2. Finds the newest Season and Draft folder automatically
#   3. Downloads all player image files from that draft
#   4. Runs EasyOCR on each image to detect and extract card names
#   5. Saves a deck list (.csv) per player and an annotated image with red boxes around detected cards

from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
import os
import sys
import re
import csv
import easyocr
from PIL import Image, ImageDraw, ImageFont
import io

# Project root is one level up from this scripts/ folder
# NOTE: this is derived from the current working directory (os.getcwd()), not
# from the location of this file, so it only holds when the process is started
# with the scripts/ folder as its working directory.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))
# Put the project root first on the import path so the `config` import below
# is looked up there before anywhere else.
sys.path.insert(0, PROJECT_ROOT)

# Configuration
# SCOPES is handed to the OAuth helpers below; MAIN_FOLDER_ID is the Drive
# folder that the main section searches for Season folders.
from config import SCOPES, MAIN_FOLDER_ID


# Initialize EasyOCR reader with GPU enabled
# The reader is created once at module level and shared by every OCR call.
print("Initializing EasyOCR with GPU...")
reader = easyocr.Reader(['en'], gpu=True)
print("EasyOCR ready!\n")

# Authenticate with Google Drive
# Both files are expected directly inside PROJECT_ROOT.
TOKEN_PATH       = os.path.join(PROJECT_ROOT, 'token.json')
CREDENTIALS_PATH = os.path.join(PROJECT_ROOT, 'credentials.json')

# Reuse a previously saved token when one exists; otherwise run the
# interactive OAuth flow and cache the resulting credentials as JSON.
# NOTE(review): a cached token is loaded as-is; nothing here checks whether it
# is still valid or writes a refreshed token back to TOKEN_PATH.
if os.path.exists(TOKEN_PATH):
    creds = Credentials.from_authorized_user_file(TOKEN_PATH, SCOPES)
else:
    flow = InstalledAppFlow.from_client_secrets_file(CREDENTIALS_PATH, SCOPES)
    # port=0: presumably lets the OS choose a free local port -- TODO confirm
    creds = flow.run_local_server(port=0)
    with open(TOKEN_PATH, 'w') as token:
        token.write(creds.to_json())

# Drive v3 client used by all the helper functions below.
drive_service = build('drive', 'v3', credentials=creds)

def get_folders(parent_id, name_pattern=None):
    """Return the non-trashed sub-folders of a Drive folder.

    Args:
        parent_id: Drive ID of the folder to look in.
        name_pattern: Optional regular expression. When given, only folders
            whose name matches it anywhere (``re.search``) are returned.

    Returns:
        A list of ``{'id': ..., 'name': ...}`` dicts in the order the API
        returned them (empty when nothing matches).
    """
    query = (
        f"'{parent_id}' in parents "
        "and mimeType='application/vnd.google-apps.folder' "
        "and trashed=false"
    )

    folders = []
    page_token = None
    # files().list is paginated: keep requesting pages until the API stops
    # returning a nextPageToken, otherwise folders beyond the first page
    # would be silently dropped.
    while True:
        response = drive_service.files().list(
            q=query,
            fields="nextPageToken, files(id, name)",
            pageToken=page_token,
        ).execute()
        folders.extend(response.get('files', []))
        page_token = response.get('nextPageToken')
        if not page_token:
            break

    if name_pattern:
        folders = [f for f in folders if re.search(name_pattern, f['name'])]

    return folders

def get_files(parent_id, mime_type_filter=None):
    """Return the non-trashed children of a Drive folder, sorted by name.

    Args:
        parent_id: Drive ID of the folder to list.
        mime_type_filter: Optional substring; when given, only files whose
            MIME type contains it are requested (e.g. ``'image/'``).

    Returns:
        A list of ``{'id': ..., 'name': ..., 'mimeType': ...}`` dicts sorted
        by ``name``.
    """
    query = f"'{parent_id}' in parents and trashed=false"
    if mime_type_filter:
        query += f" and mimeType contains '{mime_type_filter}'"

    files = []
    page_token = None
    # files().list is paginated: follow nextPageToken until it is absent so
    # that files beyond the first page are not silently dropped.
    while True:
        response = drive_service.files().list(
            q=query,
            fields="nextPageToken, files(id, name, mimeType)",
            pageToken=page_token,
        ).execute()
        files.extend(response.get('files', []))
        page_token = response.get('nextPageToken')
        if not page_token:
            break

    return sorted(files, key=lambda x: x['name'])

def is_player_file(filename):
    """Tell whether a file name denotes a player file.

    A name is rejected when it contains a ``+``, contains ``result`` or
    ``standing`` (case-insensitive), or starts with ``r`` followed by a digit
    (case-insensitive). Everything else counts as a player file.
    """
    lowered = filename.lower()

    is_overview = (
        '+' in filename
        or 'result' in lowered
        or 'standing' in lowered
        or re.search(r'^r\d', lowered) is not None
    )
    return not is_overview

def download_image(file_id):
    """Fetch the content of a Drive file.

    Issues a ``get_media`` request for ``file_id`` through the module-level
    Drive client and returns whatever executing that request yields.
    """
    return drive_service.files().get_media(fileId=file_id).execute()

def extract_text_from_image(image_bytes):
    """Decode an encoded image and run EasyOCR on it.

    Args:
        image_bytes: The encoded image file content (as downloaded).

    Returns:
        A ``(image, results)`` tuple: the decoded PIL image exactly as opened
        (its mode is not changed), and the value returned by
        ``reader.readtext(..., detail=1)``.
    """
    # Local import: numpy is not imported at file level.
    import numpy as np

    # Convert bytes to PIL Image
    image = Image.open(io.BytesIO(image_bytes))

    # EasyOCR takes a numpy array. Normalise the OCR input to 8-bit RGB so
    # that palette ("P"), 1-bit and CMYK images are not handed over as index,
    # boolean or 4-channel arrays that would be misread. For an image that is
    # already RGB this yields the same pixel data as before. Only the OCR
    # input is converted; the caller still receives the original image.
    image_array = np.array(image.convert('RGB'))

    # Run EasyOCR
    results = reader.readtext(image_array, detail=1)

    return image, results

def boxes_are_adjacent(bbox1, bbox2, max_x_distance=30, max_y_distance=10):
    """Decide whether two bounding boxes sit next to each other on one line.

    Each box is a sequence of (x, y) points. The boxes count as adjacent when
    their vertical extents overlap or are at most ``max_y_distance`` apart,
    and the smaller of the two edge-to-edge horizontal distances (right edge
    of one to left edge of the other) is at most ``max_x_distance``.
    """
    def extent(bbox):
        # Axis-aligned extent of a box: (left, right, top, bottom).
        xs = [pt[0] for pt in bbox]
        ys = [pt[1] for pt in bbox]
        return min(xs), max(xs), min(ys), max(ys)

    left_a, right_a, top_a, bottom_a = extent(bbox1)
    left_b, right_b, top_b, bottom_b = extent(bbox2)

    # Reject as soon as one box lies entirely above the other by more than
    # the vertical tolerance.
    if bottom_a < top_b - max_y_distance or bottom_b < top_a - max_y_distance:
        return False

    # Horizontal closeness: the nearer pairing of opposite vertical edges.
    gap = min(abs(right_a - left_b), abs(right_b - left_a))
    return gap <= max_x_distance

def merge_bboxes(bboxes):
    """Return the axis-aligned rectangle enclosing every point of every box.

    ``bboxes`` is an iterable of boxes, each a sequence of (x, y) points.
    The result is a four-corner list in the order top-left, top-right,
    bottom-right, bottom-left. An input without any points raises
    ``ValueError`` (from ``min`` on an empty sequence).
    """
    # Flatten once, then split into coordinate columns.
    points = [pt for box in bboxes for pt in box]
    xs = [pt[0] for pt in points]
    ys = [pt[1] for pt in points]

    left, right = min(xs), max(xs)
    top, bottom = min(ys), max(ys)

    return [(left, top), (right, top), (right, bottom), (left, bottom)]

# Words that are discarded when they make up the entire text (compared
# case-insensitively).
_NOISE_WORDS = frozenset({'tap', 'untap', 'mana', 'cost', 'main', 'deck', 'sideboard'})

# Text consisting solely of digits and these characters is discarded
# (presumably mana-cost style fragments such as "{2}{W}" -- case-sensitive).
_SYMBOL_CHARS = frozenset('{}/WUBRGC')


def should_keep_text(text):
    """Check if text should be kept (basic filtering).

    Args:
        text: A single OCR text fragment.

    Returns:
        ``False`` when the text is shorter than 3 or longer than 50
        characters, contains no alphabetic character, is one of the noise
        words, or is made up only of digits and symbol characters; ``True``
        otherwise.
    """
    # The lower bound also covers single-character text, so no separate
    # length-1 check is needed.
    if len(text) < 3 or len(text) > 50:
        return False
    if not any(c.isalpha() for c in text):
        return False
    if text.lower() in _NOISE_WORDS:
        return False
    if all(c.isdigit() or c in _SYMBOL_CHARS for c in text):
        return False

    return True

def parse_and_merge_card_names(ocr_results):
    """Turn raw OCR detections into merged text entries.

    ``ocr_results`` is an iterable of ``(bbox, text, confidence)`` triples,
    where ``bbox`` is a sequence of (x, y) points. Detections with confidence
    below 0.05, or whose stripped text fails ``should_keep_text``, are
    dropped. The rest are clustered: a detection joins a cluster when its box
    is adjacent (``boxes_are_adjacent``) to any box already in it. Each
    cluster becomes one dict with keys ``text`` (member texts joined left to
    right with spaces), ``confidence`` (mean of the members), ``bbox``
    (enclosing rectangle) and ``y_position`` (taken from the leftmost
    member). The returned list is ordered by ``y_position``.
    """
    # Pass 1: discard low-confidence and implausible fragments.
    candidates = []
    for bbox, raw_text, confidence in ocr_results:
        if confidence < 0.05:
            continue

        text = raw_text.strip()
        if should_keep_text(text):
            candidates.append({
                'bbox': bbox,
                'text': text,
                'confidence': confidence,
                'x_min': min(p[0] for p in bbox),
                # y of the first bbox point (presumably the top-left corner)
                'y_position': bbox[0][1]
            })

    # Pass 2: grow a cluster from every detection not yet claimed.
    cards = []
    claimed = set()

    for seed_index, seed in enumerate(candidates):
        if seed_index in claimed:
            continue

        claimed.add(seed_index)
        cluster = [seed]

        # Re-scan until a full pass adds nothing: a newly added member can
        # make further detections reachable.
        grew = True
        while grew:
            grew = False
            for index, candidate in enumerate(candidates):
                if index in claimed:
                    continue

                touches_cluster = any(
                    boxes_are_adjacent(member['bbox'], candidate['bbox'])
                    for member in cluster
                )
                if touches_cluster:
                    cluster.append(candidate)
                    claimed.add(index)
                    grew = True

        # Read the cluster left to right.
        ordered = sorted(cluster, key=lambda d: d['x_min'])
        joined_text = ' '.join(d['text'] for d in ordered)
        enclosing_box = merge_bboxes([d['bbox'] for d in ordered])
        mean_confidence = sum(d['confidence'] for d in ordered) / len(ordered)

        cards.append({
            'text': joined_text,
            'confidence': mean_confidence,
            'bbox': enclosing_box,
            'y_position': ordered[0]['y_position']
        })

    # Top-to-bottom order for the final listing.
    return sorted(cards, key=lambda c: c['y_position'])

def draw_boxes_on_image(image, merged_cards):
    """Return a copy of ``image`` with each card's bbox outlined in red.

    ``merged_cards`` is an iterable of dicts carrying a ``bbox`` entry (a
    sequence of points). The input image itself is left untouched.
    """
    annotated = image.copy()
    pen = ImageDraw.Draw(annotated)

    for outline_points in (card['bbox'] for card in merged_cards):
        pen.polygon(outline_points, outline='red', width=3)

    return annotated

# Main execution
# Steps 1-4 walk the Drive hierarchy
#   MAIN_FOLDER_ID / "Season N" / "Pictures" / "<8 digits> Draft M"
# and collect the player image files of the newest draft. Each step stops the
# run with a non-zero exit status when the expected folder is missing.
print("Step 1: Looking for season folders...")
season_folders = get_folders(MAIN_FOLDER_ID, r'Season \d+')

# Newest season = highest season number in the folder name.
newest_season = max(season_folders,
                    key=lambda f: int(re.search(r'Season (\d+)', f['name']).group(1)),
                    default=None)

if not newest_season:
    print("Error: No Season folders found!")
    # sys.exit rather than the interactive-only exit() helper.
    sys.exit(1)

print(f"  → Found: {newest_season['name']}")

print(f"\nStep 2: Looking inside {newest_season['name']}...")
folders_in_season = get_folders(newest_season['id'])

# The pictures folder is matched by name, ignoring case.
pictures_folder = next((f for f in folders_in_season if f['name'].lower() == 'pictures'), None)

if not pictures_folder:
    print("Error: Pictures folder not found!")
    sys.exit(1)

print("  → Found: Pictures folder")

print("\nStep 3: Looking for Draft folders in Pictures...")
draft_folders = get_folders(pictures_folder['id'], r'\d{8}\s+Draft\s+\d+')

# Newest draft = largest 8-digit prefix of the "... Draft N" part (presumably
# a YYYYMMDD date, so numeric order equals chronological order). The key uses
# the same search pattern as the filter above, so every folder that passed the
# filter is guaranteed to yield a match even if its name has a leading prefix.
newest_draft = max(draft_folders,
                   key=lambda f: int(re.search(r'(\d{8})\s+Draft\s+\d+', f['name']).group(1)),
                   default=None)

if not newest_draft:
    print("Error: No Draft folders found!")
    sys.exit(1)

print(f"  → Found: {newest_draft['name']}")

print(f"\nStep 4: Getting image files from {newest_draft['name']}...")
all_files = get_files(newest_draft['id'], mime_type_filter='image/')
# Drop overview/result files; only per-player images are processed.
player_files = [f for f in all_files if is_player_file(f['name'])]

print(f"  → Found {len(player_files)} player image file(s) to process")

print("\nStep 5: Processing images with EasyOCR...")
print("-" * 60)

# Output layout:
#   <PROJECT_ROOT>/data/drafted_decks/<draft name, spaces -> underscores>/
#       <player>.csv
#       annotated_images/annotated_<original file name>
BASE_OUTPUT_DIR = os.path.join(PROJECT_ROOT, 'data', 'drafted_decks')
output_dir = os.path.join(BASE_OUTPUT_DIR, newest_draft['name'].replace(' ', '_'))
annotated_dir = os.path.join(output_dir, "annotated_images")

for directory in (output_dir, annotated_dir):
    os.makedirs(directory, exist_ok=True)

# Number of files that made it through every step without an exception.
processed_count = 0

for idx, file in enumerate(player_files, 1):
    print(f"\n[{idx}/{len(player_files)}] Processing: {file['name']}")

    # A failure on one file is reported and the loop moves on to the next.
    try:
        print("  → Downloading...")
        image_bytes = download_image(file['id'])

        print("  → Running OCR...")
        original_image, ocr_results = extract_text_from_image(image_bytes)

        # Merge neighbouring text fragments into card entries
        merged_cards = parse_and_merge_card_names(ocr_results)

        # Distinct card texts only (duplicates collapse)
        unique_cards = {card['text'] for card in merged_cards}

        print(f"  → Found {len(unique_cards)} unique cards")

        # Console report: every merged entry, duplicates included
        print("\n  Detected cards (with confidence):")
        print("  " + "-" * 50)
        for card in merged_cards:
            print(f"    • {card['text']} (confidence: {card['confidence']:.2%})")
        print("  " + "-" * 50)

        # Annotated copy of the photo, saved under the original file name
        annotated_image = draw_boxes_on_image(original_image, merged_cards)
        annotated_filename = os.path.join(annotated_dir, f"annotated_{file['name']}")
        annotated_image.save(annotated_filename)
        print(f"  → Annotated saved to: {annotated_filename}")

        # One CSV per player: a "name" header, then the sorted unique texts
        deck_filename = os.path.join(output_dir, f"{os.path.splitext(file['name'])[0]}.csv")

        with open(deck_filename, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(["name"])
            writer.writerows([card] for card in sorted(unique_cards))

        print(f"  → Deck list saved to: {deck_filename}")

        processed_count += 1

    except Exception as e:
        print(f"  ✗ Error: {e}")

Initializing EasyOCR with GPU...
EasyOCR ready!

Step 1: Looking for season folders...
  → Found: Season 5

Step 2: Looking inside Season 5...
  → Found: Pictures folder

Step 3: Looking for Draft folders in Pictures...
  → Found: 20260125 Draft 7

Step 4: Getting image files from 20260125 Draft 7...
  → Found 12 player image file(s) to process

Step 5: Processing images with EasyOCR...
------------------------------------------------------------

[1/12] Processing: Andrin.jpeg
  → Downloading...
  → Running OCR...
  → Found 29 unique cards

  Detected cards (with confidence):
  --------------------------------------------------
    • Faerie Mastermind (confidence: 99.56%)
    • Dreams of Steel and Oil (confidence: 64.93%)
    • Otawara, Soaring City (confidence: 75.39%)
    • Counterspell (confidence: 68.89%)
    • Verdant Catacombs (confidence: 78.47%)
    • Mesmeric Fien( (confidence: 63.43%)
    • Cryptic Coat (confidence: 99.96%)
    • Jack-in-the-Mox (confidence: 99.28%)
    • Fa