In [13]:
# Imports
import json
import requests
import pandas as pd
import chess.pgn
import io
import os
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from scipy import stats
import chess.engine
import sys
import logging
import math
import random
import tqdm
from enum import Enum
from elocator_test.complexity.model import ChessModel
import torch
from elocator_test.encoder import fen_encoder


# Configure logging to print to stdout
logging.basicConfig(
    level=logging.INFO, format="%(levelname)s: %(message)s", stream=sys.stdout
)

# Configure plotting style
sns.set(style="whitegrid")
# Replace with the actual path to your general population PGN file
GENERAL_PGN_FILE_PATH = '/Users/benjaminrosales/Desktop/Chess Data/Materials & Data/Lichess Published Datasets/lichess_db_standard_rated_2017-05.pgn'

# Path to your Stockfish executable
STOCKFISH_PATH = "/opt/homebrew/bin/stockfish"

# List of ADHD players' usernames (Lichess)
ADHD_USERNAMES = [
    "teoeo",
    "Tobermorey", 
    "apostatlet",
    "LovePump1000",
    "StuntmanAndy",
    "ChessyChesterton12",
    "yastoon",
    "SonnyDayz11",
    "Xiroir",
    "StellaAthena",
    "MagikPigeon",
    "pawnsgoback",
    "Dru403",
    "ellehooq", 
    "Euph4life",
    "Matthew-Marchand",
    "Rosey12",
    "s0mething213",
    "B1SH0P_B1SH0P",
    "Wildwood",
    "Kanaan92",
    "jonesmh"
]

#WrapperClass
class ElocatorModel:
    def __init__(self, model_path):
        self.model = ChessModel()
        self.model.load_state_dict(torch.load(model_path))
        self.model.eval()

In [14]:
class ElocatorAnalyzer:
    def __init__(self, model_path):
        self.model = ChessModel()
        self.model.load_state_dict(torch.load(model_path))
        self.model.eval()
        
        # Correct percentile ranges from the repo
        self.percentile_ranges = {
            1: (0, 0.006848618667572737),
            2: (0.006848618667572737, 0.007860606908798218),
            3: (0.007860606908798218, 0.0093873867765069),
            4: (0.0093873867765069, 0.010885232314467431),
            5: (0.010885232314467431, 0.01191701553761959),
            6: (0.01191701553761959, 0.012793240323662757),
            7: (0.012793240323662757, 0.013946877606213093),
            8: (0.013946877606213093, 0.015834777429699905),
            9: (0.015834777429699905, 0.02067287489771843),
            10: (0.02067287489771843, 1)
        }
    
    def map_prediction_to_complexity(self, prediction):
        """Maps raw model output to complexity score (1-10)"""
        for level, (low, high) in self.percentile_ranges.items():
            if low <= prediction <= high:
                return level
        return None  # Handle predictions outside expected range
    
    def get_position_complexity(self, fen):
        """Get complexity score for a single position"""
        try:
            # Sanitize the FEN string before processing
            clean_fen = sanitize_fen(fen)
            if clean_fen is None:
                return None
                
            encoded_position = fen_encoder(clean_fen)
            position_tensor = torch.FloatTensor(encoded_position).unsqueeze(0)
            
            with torch.no_grad():
                raw_prediction = self.model(position_tensor).item()
                complexity_score = self.map_prediction_to_complexity(raw_prediction)
                return complexity_score
                
        except Exception as e:
            logging.error(f"Error processing FEN {fen}: {str(e)}")
            return None
            
    def analyze_game(self, pgn_game):
        board = pgn_game.board()
        node = pgn_game
        positions = []
        
        # Get initial position
        try:
            complexity = self.get_position_complexity(board.fen())
            positions.append({
                'fen': board.fen(),
                'complexity': complexity,
                'move_number': 0
            })
        except Exception as e:
            logging.error(f"Error analyzing initial position: {str(e)}")
        
        # Process each move
        while node.variations:
            try:
                next_node = node.variations[0]
                move = next_node.move
                board.push(move)
                
                complexity = self.get_position_complexity(board.fen())
                if complexity is not None:  # Only append positions we can analyze
                    positions.append({
                        'fen': board.fen(),
                        'complexity': complexity,
                        'move_number': len(board.move_stack)
                    })
                
                node = next_node
            except Exception as e:
                logging.error(f"Error analyzing position at move {len(board.move_stack)}: {str(e)}")
                break
                
        return positions

# Create global instance
elocator = ElocatorAnalyzer('elocator_test/complexity/models/model.pth')

# Create global instance
elocator = ElocatorAnalyzer('elocator_test/complexity/models/model.pth')
# Create global instance
elocator = ElocatorAnalyzer('elocator_test/complexity/models/model.pth')
    
def get_position_complexity(self, fen):
        """Get complexity score for a single position"""
        encoded_position = fen_encoder(fen)
        position_tensor = torch.FloatTensor(encoded_position).unsqueeze(0)
        with torch.no_grad():
            complexity = self.model(position_tensor)
        return complexity.item()
    
def sanitize_fen(fen):
    """
    Sanitize and validate a FEN string for standard chess.
    Returns None if the FEN is invalid or from a variant game.
    """
    try:
        # If we see brackets or special characters, it's likely a variant game
        if '[' in fen or ']' in fen or '~' in fen:
            return None
            
        # Remove any non-standard characters
        valid_chars = 'rnbqkpRNBQKP12345678/- '
        cleaned_fen = ''.join(c for c in fen if c in valid_chars)
        
        # Get position part (everything before first space)
        position_part = cleaned_fen.split()[0] if ' ' in cleaned_fen else cleaned_fen
        
        # Validate basic FEN structure
        ranks = position_part.split('/')
        if len(ranks) != 8:
            return None
            
        # Validate each rank
        for rank in ranks:
            spaces = 0
            for char in rank:
                if char.isdigit():
                    spaces += int(char)
                else:
                    spaces += 1
            if spaces != 8:
                return None
                
        # Return standardized FEN string
        return f"{position_part} w - - 0 1"
        
    except Exception as e:
        logging.error(f"FEN sanitization failed: {fen}")
        logging.error(f"Error: {str(e)}")
        return None

In [15]:
def safe_int(value, default=None):
    try:
        return int(value)
    except (ValueError, TypeError):
        return default
"""
Setting up Time Functions
"""

def parse_clock_time(comment):
    match = re.search(r'\[%clk (\d+):(\d+):(\d+)\]', comment)  # Adjust regex if needed
    if match:
        hours = int(match.group(1))
        minutes = int(match.group(2))
        seconds = int(match.group(3))
        return hours * 3600 + minutes * 60 + seconds  # Total seconds
    return None

## Determine if a player is under time pressure based on van Harreveld et al. (2007) criteria ---

def is_under_time_pressure(time_remaining, initial_time, time_spent):
    """
    Determine time pressure, accounting for premoves and missing data
    - Premoves (time_spent = 0) should never count as time pressure
    - Missing time data should be handled safely
    """
    # Handle None/missing values
    if any(x is None for x in [time_remaining, initial_time]):
        return False
        
    # Handle invalid values
    try:
        time_remaining = float(time_remaining)
        initial_time = float(initial_time)
        # time_spent can be None for missing data or 0 for premoves
        time_spent = float(time_spent) if time_spent is not None else None
    except (TypeError, ValueError):
        return False
        
    # Invalid time states
    if initial_time <= 0 or time_remaining < 0:
        return False
        
    # If it's a premove (time_spent = 0) or missing time data,
    # only check absolute and relative time remaining
    absolute_pressure = time_remaining < 30  # Less than 30 seconds
    relative_pressure = time_remaining < (0.1 * initial_time)  # Less than 10% of initial time
        
    return absolute_pressure or relative_pressure

class TimeControlType(Enum):
    CLASSICAL = "Classical"
    RAPID = "Rapid"
    BLITZ = "Blitz"
    BULLET = "Bullet"
    UNKNOWN = "Unknown"

#Parsing and Categorizing Time Control
def parse_time_control(time_control):
    """Parse time control string from Lichess format (already in seconds)"""
    if not time_control or time_control == "unknown":
        return None, None, TimeControlType.UNKNOWN
        
    try:
        if "+" in time_control:
            base, increment = time_control.split("+")
            initial_seconds = int(base)  # Already in seconds, don't multiply
            increment_seconds = int(increment)
        else:
            initial_seconds = int(time_control)  # Already in seconds
            increment_seconds = 0
            
        # Categorize based on seconds
        if initial_seconds >= 1800:     # 30 minutes or more
            category = TimeControlType.CLASSICAL
        elif initial_seconds >= 600:     # 10 minutes or more
            category = TimeControlType.RAPID
        elif initial_seconds >= 180:     # 3 minutes or more
            category = TimeControlType.BLITZ
        else:                           # Less than 3 minutes
            category = TimeControlType.BULLET
            
        return initial_seconds, increment_seconds, category
        
    except (ValueError, TypeError):
        return None, None, TimeControlType.UNKNOWN

def calculate_material(board):
    # Returns material balance for both sides
    material = {"White": 0, "Black": 0}
    piece_values = {
        chess.PAWN: 1,
        chess.KNIGHT: 3,
        chess.BISHOP: 3,
        chess.ROOK: 5,
        chess.QUEEN: 9,
        chess.KING: 0,  # King is invaluable, but we set to 0 for simplicity
    }
    for piece_type in piece_values:
        value = piece_values[piece_type]
        material["White"] += len(board.pieces(piece_type, chess.WHITE)) * value
        material["Black"] += len(board.pieces(piece_type, chess.BLACK)) * value
    return material

def categorize_position_complexity(evaluation):
#position needs to handle both text and numeric evaluations because of "mate scores" 
    if evaluation is None:
        return 'Unknown'
    
    # Handle mate scores
    if isinstance(evaluation, str) and '#' in evaluation:
        return 'Decisive Advantage'
        
    try:
        eval_float = float(evaluation)
        if abs(eval_float) < 1:
            return 'Balanced'
        elif abs(eval_float) < 3:
            return 'Slight Advantage'
        else:
            return 'Decisive Advantage'
    except (ValueError, TypeError):
        return 'Unknown'

def categorize_move(eval_before, eval_after):
    if eval_before is None or eval_after is None:
        return "Unknown"

    # Handle mate scores
    if isinstance(eval_after, str) and '#' in eval_after:
        if '-' in eval_after:
            return "Forced Checkmate (Losing)"
        return "Forced Checkmate (Winning)"
    
    try:
        eval_before = float(eval_before)
        eval_after = float(eval_after)
    except (ValueError, TypeError):
        return "Unknown"

    ##numeric evaluation
    SATURATION_LIMIT = 1000  # Equivalent to a 10-pawn advantage
    
    ##eval change
    eval_change = eval_after - eval_before

    if abs(eval_after) >= SATURATION_LIMIT:
        return "Winning Position" if eval_after > 0 else "Losing Position"

    ##mistake calculator
    if eval_change <= -300:
        return "Blunder"
    elif eval_change <= -150:
        return "Mistake"
    elif eval_change <= -50:
        return "Inaccuracy"
    elif eval_change >= 300:
        return "Brilliant Move"
    elif eval_change >= 150:
        return "Great Move"
    elif eval_change >= 50:
        return "Good Move"
    else:
        return "Normal"


def raw_winning_chances(cp):
    MULTIPLIER = -0.00368208
    return 2 / (1 + math.exp(MULTIPLIER * cp)) - 1

def cp_winning_chances(cp):
    cp = max(-1000, min(cp, 1000))
    return raw_winning_chances(cp)

def mate_winning_chances(mate):
    cp = (21 - min(10, abs(mate))) * 100
    signed_cp = cp * (1 if mate > 0 else -1)
    return raw_winning_chances(signed_cp)

def eval_winning_chances(eval_str):
    if eval_str is None:
        return None
    if '#' in str(eval_str):
        # Mate in N moves
        mate_str = str(eval_str).replace('#', '')
        try:
            mate = int(mate_str)
            return mate_winning_chances(mate)
        except ValueError:
            return None
    else:
        try:
            cp = float(eval_str) * 100  # Convert from pawns to centipawns
            return cp_winning_chances(cp)
        except ValueError:
            return None


def safe_int(value, default=None):
    try:
        return int(value)
    except (ValueError, TypeError):
        return default


def parse_clock_time(comment):
    match = re.search(r'\[%clk (\d+):(\d+):(\d+)\]', comment)  # Adjust regex if needed
    if match:
        hours = int(match.group(1))
        minutes = int(match.group(2))
        seconds = int(match.group(3))
        return hours * 3600 + minutes * 60 + seconds  # Total seconds
    return None


def parse_evaluation(comment):
    match = re.search(r'%eval\s([+-]?[\d.]+|#-?\d+)', comment)
    if match:
        eval_str = match.group(1)
        if '#' in eval_str:
            # Mate in N moves
            return eval_str
        else:
            return float(eval_str)  # Convert to float
    return None


def categorize_error(eval_change, player_color="white"):
    if eval_change is None:
        return "Unknown"
        
    # Normalize eval_change to player's perspective
    if player_color.lower() == "black":
        eval_change = -eval_change
        
    # Now we're already in centipawns, no need to divide by 100
    if eval_change <= -300:  
        return "Blunder"
    elif eval_change <= -150:
        return "Mistake"
    elif eval_change <= -50:
        return "Inaccuracy"
    else:
        return "Normal"

def calculate_material(board):
    # Returns material balance for both sides
    material = {"White": 0, "Black": 0}
    piece_values = {
        chess.PAWN: 1,
        chess.KNIGHT: 3,
        chess.BISHOP: 3,
        chess.ROOK: 5,
        chess.QUEEN: 9,
        chess.KING: 0,  # King is invaluable, but we set to 0 for simplicity
    }
    for piece_type in piece_values:
        value = piece_values[piece_type]
        material["White"] += len(board.pieces(piece_type, chess.WHITE)) * value
        material["Black"] += len(board.pieces(piece_type, chess.BLACK)) * value
    return material

def categorize_game_phase(board):
    """
    Enhanced game phase calculation incorporating:
    material balance 
    Piecemobility
    pawn structure
    position
    """
    # Material values calibrated from empirical analysis
    PIECE_VALUES = {
        chess.KNIGHT: 782,
        chess.BISHOP: 830,
        chess.ROOK: 1289,
        chess.QUEEN: 2529
    }
    
    # Phase boundaries from statistical analysis
    ENDGAME_LIMIT = 3915   # ~Queen + Rook
    MIDGAME_LIMIT = 15258  # Total non-pawn material at start
    PHASE_MIDGAME = 128    # Full phase scale
    
    def calculate_piece_mobility(board, piece_type, square):
        """Calculate approximate mobility for a piece"""
        mobility = 0
        attacks = board.attacks(square)
        mobility = len([sq for sq in attacks if not board.is_attacked_by(not board.turn, sq)])
        return mobility
    
    def evaluate_pawn_structure(board):
        """Evaluate pawn structure impact on phase"""
        white_pawns = board.pieces(chess.PAWN, chess.WHITE)
        black_pawns = board.pieces(chess.PAWN, chess.BLACK)
        
        # Calculate pawn structure characteristics
        center_pawns = len([p for p in white_pawns | black_pawns 
                          if chess.square_file(p) in [3,4]])
        passed_pawns = 0
        for p in white_pawns:
            if not any(black_pawns & chess.BB_FILES[chess.square_file(p)]):
                passed_pawns += 1
        for p in black_pawns:
            if not any(white_pawns & chess.BB_FILES[chess.square_file(p)]):
                passed_pawns += 1
                
        return center_pawns * 0.1 + passed_pawns * 0.15
    
    # non-pawn material
    def evaluate_position(color):
        material = 0
        mobility_factor = 0
        
        for piece_type, value in PIECE_VALUES.items():
            pieces = board.pieces(piece_type, color)
            count = len(pieces)
            material += count * value
            
            #mobility consideration
            for square in pieces:
                mobility_factor += calculate_piece_mobility(board, piece_type, square) * 0.01
                
        return material, mobility_factor
    
    # Calculate both sides
    w_material, w_mobility = evaluate_position(chess.WHITE)
    b_material, b_mobility = evaluate_position(chess.BLACK)
    
    # Total non-pawn material with mobility adjustment
    total_material = w_material + b_material
    mobility_adjustment = (w_mobility + b_mobility) * 100
    
    # Pawn structure impact
    pawn_factor = evaluate_pawn_structure(board)
    
    # Adjust material based on mobility and pawn structure
    adjusted_material = total_material * (1 + pawn_factor) + mobility_adjustment
    
    # Clamp between endgame and midgame limits
    npm = max(ENDGAME_LIMIT, min(adjusted_material, MIDGAME_LIMIT))
    
    # Calculate phase score (0 = endgame, 128 = midgame)
    phase = ((npm - ENDGAME_LIMIT) * PHASE_MIDGAME) // (MIDGAME_LIMIT - ENDGAME_LIMIT)
    phase = max(0, min(phase, PHASE_MIDGAME))
    
    # Position-specific adjustments
    if len(board.move_stack) <= 20:  # First 10 moves
        phase = max(phase, 96)  # Ensure early moves are recognized as opening
    
    # Convert to categorical with clear documentation of thresholds
    if phase >= 96:      # 75% of PHASE_MIDGAME - Clear opening characteristics
        return "Opening"
    elif phase >= 32:    # 25% of PHASE_MIDGAME - Significant material remains
        return "Middlegame"
    else:                # Limited material or simplified position
        return "Endgame"

def categorize_move(eval_before, eval_after):
    if eval_before is None or eval_after is None:
        return "Unknown"

    #saturation limits in centipawns to a 10pawn advantage
    SATURATION_LIMIT = 1000  
    MATE_SCORE = 10000 

    # Calculate evaluation change
    eval_change = eval_after - eval_before

    # Handle mate scores (assuming the engine uses large numbers to indicate mate)
    if abs(eval_after) >= MATE_SCORE:
        if eval_after > 0:
            return "Forced Checkmate (Winning)"
        else:
            return "Forced Checkmate (Losing)"

    # Handle evaluation saturation
    if abs(eval_after) >= SATURATION_LIMIT:
        if eval_after > 0:
            return "Winning Position"
        else:
            return "Losing Position"

    # Categorize the move based on evaluation change
    if eval_change <= -300:
        return "Blunder"
    elif eval_change <= -150:
        return "Mistake"
    elif eval_change <= -50:
        return "Inaccuracy"
    elif eval_change >= 300:
        return "Brilliant Move"
    elif eval_change >= 150:
        return "Great Move"
    elif eval_change >= 50:
        return "Good Move"
    else:
        return "Normal"


def raw_winning_chances(cp):
    MULTIPLIER = -0.00368208
    return 2 / (1 + math.exp(MULTIPLIER * cp)) - 1


def cp_winning_chances(cp):
    cp = max(-1000, min(cp, 1000))
    return raw_winning_chances(cp)


def mate_winning_chances(mate):
    cp = (21 - min(10, abs(mate))) * 100
    signed_cp = cp * (1 if mate > 0 else -1)
    return raw_winning_chances(signed_cp)


def eval_winning_chances(evaluation):
    if evaluation is None:
        return None
    if isinstance(evaluation, str) and '#' in evaluation:
        # Mate in N moves
        mate_str = evaluation.replace('#', '')
        try:
            mate = int(mate_str)
            return mate_winning_chances(mate)
        except ValueError:
            return None
    else:
        try:
            cp = float(evaluation) * 100  # Convert from pawns to centipawns
            return cp_winning_chances(cp)
        except ValueError:
            return None
        

def calculate_eval_change(prev_evaluation, evaluation, player):
    if prev_evaluation is None or evaluation is None:
        return None
        
    def process_eval(eval_str):
        MAX_PAWNS = 15  # Cap at ±15 pawns
        
        if isinstance(eval_str, str) and '#' in eval_str:
            mate_num = int(eval_str.replace('#', ''))
            # Convert mate scores to pawns (not centipawns)
            return MAX_PAWNS if mate_num > 0 else -MAX_PAWNS
            
        try:
            # Keep everything in pawns and cap
            val = float(eval_str)
            return max(min(val, MAX_PAWNS), -MAX_PAWNS)
        except ValueError:
            return None
    
    try:
        prev_val = process_eval(prev_evaluation)
        curr_val = process_eval(evaluation)
        
        if prev_val is None or curr_val is None:
            return None
            
        # Calculate change (still in pawns)
        change = curr_val - prev_val
        if player.lower() == "black":
            change = -change
            
        return change
        
    except (ValueError, TypeError) as e:
        logging.error(f"Error calculating eval change: {str(e)}")
        return None

In [16]:
def perform_statistical_test(var, data, test_results, test_type="independent_t"):
    # Prepare data
    group1 = data[data["Group"] == "ADHD"][var].dropna()
    group2 = data[data["Group"] == "General"][var].dropna()

    # Check if data is sufficient
    if len(group1) < 10 or len(group2) < 10:
        logging.warning(f"Not enough data to perform statistical test on '{var}'.")
        return

    # Test for normality
    stat1, p1 = stats.shapiro(group1)
    stat2, p2 = stats.shapiro(group2)
    normal = p1 > 0.05 and p2 > 0.05

    # Test for equal variances
    stat_levene, p_levene = stats.levene(group1, group2)
    equal_var = p_levene > 0.05

    # Choose appropriate test
    if normal and equal_var and test_type == "independent_t":
        # Independent T-test
        stat, p = stats.ttest_ind(group1, group2, equal_var=True)
        test_name = "Independent t-test"
    elif normal and not equal_var and test_type == "independent_t":
        # Welch's T-test
        stat, p = stats.ttest_ind(group1, group2, equal_var=False)
        test_name = "Welch's t-test"
    else:
        # Mann-Whitney U Test
        stat, p = stats.mannwhitneyu(group1, group2, alternative="two-sided")
        test_name = "Mann-Whitney U test"

    test_results.append(
        {"Variable": var, "Test": test_name, "Statistic": stat, "p-value": p}
    )


def perform_chi_squared_test(category_var, data, test_results):
    contingency_table = pd.crosstab(data["Group"], data[category_var])
    if contingency_table.empty or contingency_table.shape[1] == 0:
        logging.warning(f"Contingency table is empty for variable '{category_var}'.")
        return
    chi2, p, dof, expected = stats.chi2_contingency(contingency_table)
    test_results.append(
        {
            "Variable": category_var,
            "Test": "Chi-Squared test",
            "Statistic": chi2,
            "p-value": p,
        }
    )

In [17]:
def fetch_lichess_games(username, max_games=4000):  # Increase max_games
    url = f"https://lichess.org/api/games/user/{username}"
    params = {
        "max": max_games,
        "moves": True,
        "evals": True,  # Include evaluations in the PGN comments
        "clocks": True,  # Include clock times in the PGN comments
    }
    headers = {"Accept": "application/x-chess-pgn"}
    response = requests.get(url, params=params, headers=headers)
    if response.status_code != 200:
        logging.warning(
            f"Failed to fetch games for user '{username}'. Status code: {response.status_code}"
        )
        return []
    pgn_text = response.text
    games = []
    pgn_io = io.StringIO(pgn_text)
    while True:
        game = chess.pgn.read_game(pgn_io)
        if game is None:
            break

        # Check if the game contains evaluations
        has_evaluation = False
        node = game
        while node.variations:
            next_node = node.variations[0]
            comment = next_node.comment
            if "%eval" in comment:
                has_evaluation = True
                break
            node = next_node

        if has_evaluation:
            games.append(game)

    logging.info(f"Fetched {len(games)} games with evaluations for user '{username}'.")
    return games


import random
from tqdm import tqdm

def count_games_in_pgn(pgn_file_path):
    """Count total games in PGN file with progress bar"""
    count = 0
    file_size = os.path.getsize(pgn_file_path)
    
    with open(pgn_file_path, "r", encoding="utf-8") as pgn_file:
        pbar = tqdm(total=file_size, desc="Counting games", unit='B', unit_scale=True)
        for line in pgn_file:
            if line.startswith('[Event "'):
                count += 1
            pbar.update(len(line.encode('utf-8')))
        pbar.close()
    return count

def validate_game_evaluations(game):
    """Validate game has proper evaluation structure"""
    try:
        node = game
        while node.variations:
            next_node = node.variations[0]
            if "%eval" in next_node.comment:
                return True
            node = next_node
        return False
    except (IndexError, AttributeError):
        return False

def process_pgn_file(pgn_file_path, max_games=10000, chunk_size=1000):
    games = []
    elo_distribution = {}
    
    try:
        with open(pgn_file_path, "r", encoding="utf-8") as pgn_file:
            pbar = tqdm(total=max_games, desc="Collecting games")
            
            while len(games) < max_games:
                chunk_count = 0
                current_chunk = []
                
                while chunk_count < chunk_size:
                    game = chess.pgn.read_game(pgn_file)
                    if game is None:
                        break
                        
                    # Quick validation before adding to chunk
                    if (game.headers and 
                        game.headers.get("Variant", "Standard").lower() == "standard" and
                        all(game.headers.get(key, "") != "" for key in ["WhiteElo", "BlackElo", "TimeControl"]) and
                        validate_game_evaluations(game)):  # Added evaluation validation
                        current_chunk.append(game)
                        chunk_count += 1
                
                if not current_chunk:
                    break
                
                sample_size = min(chunk_size // 2, max_games - len(games))
                sampled_games = random.sample(current_chunk, min(sample_size, len(current_chunk)))
                
                for game in sampled_games:
                    if len(games) >= max_games:
                        break
                    
                    white_elo = safe_int(game.headers.get("WhiteElo", 0))
                    black_elo = safe_int(game.headers.get("BlackElo", 0))
                    time_control = game.headers.get("TimeControl", "unknown")
                    
                    if all([white_elo, black_elo, time_control != "unknown"]):
                        avg_elo = (white_elo + black_elo) // 2
                        elo_bin = (avg_elo // 50) * 50
                        elo_distribution[elo_bin] = elo_distribution.get(elo_bin, 0) + 1
                        games.append(game)
                        pbar.update(1)
            
            pbar.close()
            
            logging.info("\nELO Distribution:")
            for elo_bin in sorted(elo_distribution.keys()):
                count = elo_distribution[elo_bin]
                logging.info(f"ELO {elo_bin}-{elo_bin+50}: {count} games")
            
        logging.info(f"Successfully collected {len(games)} games")
                           
    except Exception as e:
        logging.error(f"Failed to read PGN file '{pgn_file_path}': {e}")
    
    return games

# Usage
logging.info("Fetching general population games...")
general_games = process_pgn_file(GENERAL_PGN_FILE_PATH, max_games=10000)


def process_games(games, group_label, engine):
    all_moves = []
    total_games = len(games)
    rated_games = 0
    standard_games = 0
    eval_games = 0
    
    logging.info(f"\nProcessing {total_games} games for {group_label} group")
    
    for game in tqdm(games, desc=f"Processing {group_label} games"):
        try:
            # 1. Filter for standard chess and rated games
            variant = game.headers.get("Variant", "Standard")
            event = game.headers.get("Event", "Unknown")
            
            # Skip non-standard chess games
            if variant.lower() != "standard":
                continue
            standard_games += 1
            
            # Check if game is rated (checking both headers and event description)
            rated = ("rated" in event.lower() or 
                    game.headers.get("Rated", "False").lower() == "true")
            if not rated:
                continue
            rated_games += 1
            
            # 2. Initialize basic game data
            board = game.board()
            game_id = game.headers.get("Site", "Unknown")
            white = game.headers.get("White", "Unknown")
            black = game.headers.get("Black", "Unknown")
            result = game.headers.get("Result", "Unknown")
            white_elo = safe_int(game.headers.get("WhiteElo", None))
            black_elo = safe_int(game.headers.get("BlackElo", None))
            time_control = game.headers.get("TimeControl", "Unknown")
            
            # 3. ADHD player identification
            white_has_adhd = white in ADHD_USERNAMES
            black_has_adhd = black in ADHD_USERNAMES
            
            # 4. Time control parsing
            initial_time, increment, time_category = parse_time_control(time_control)
            
            # 5. Game traversal initialization
            node = game
            move_number = 0
            prev_evaluation = None
            current_material = calculate_material(board)
            prev_time_remaining = None
            prev_winning_chances = None
            
            # 6. Verify game has evaluations
            if not any("%eval" in node.variations[0].comment for node in game.mainline()):
                continue
            eval_games += 1

            # 7. Process moves
            while node.variations:
                next_node = node.variations[0]
                move = next_node.move
                san = board.san(move)
                move_number += 1
                player = "White" if board.turn else "Black"
                
                # Get position evaluation data
                try:
                    position_complexity = elocator.get_position_complexity(board.fen())
                except Exception as e:
                    position_complexity = None
                
                # ADHD status for current move
                is_adhd_move = (player == "White" and white_has_adhd) or \
                              (player == "Black" and black_has_adhd)
                
                # Extract move metadata
                comment = next_node.comment
                time_remaining = parse_clock_time(comment)
                evaluation = parse_evaluation(comment)
                
                # Time calculations
                time_spent = (prev_time_remaining - time_remaining) if all(x is not None for x in [prev_time_remaining, time_remaining]) else None
                time_spent = time_spent if time_spent and time_spent > 0 else None
                
                under_pressure = is_under_time_pressure(
                    time_remaining=time_remaining,
                    initial_time=initial_time,
                    time_spent=time_spent
                )
                
                # Calculate winning chances
                winning_chances = eval_winning_chances(evaluation)
                winning_chances_change = winning_chances - prev_winning_chances if all(x is not None for x in [prev_winning_chances, winning_chances]) else None
                
                # Skip positions without evaluations
                if evaluation is None:
                    board.push(move)
                    node = next_node
                    prev_time_remaining = time_remaining
                    current_material = calculate_material(board)
                    prev_winning_chances = winning_chances
                    continue
                
                # Execute move and calculate changes
                board.push(move)
                
                # Evaluation change calculation
                eval_change = calculate_eval_change(prev_evaluation, evaluation, player)
                if eval_change is not None:
                    eval_change = eval_change * 100

                
                new_material = calculate_material(board)
                material_diff = new_material[player] - current_material[player]
                is_sacrifice = material_diff < 0
                game_phase = categorize_game_phase(board)
                position_complexity_category = categorize_position_complexity(prev_evaluation)
                error_category = categorize_error(eval_change, player)  # Note: passing player here is important
                
                # Compile move data
                move_data = {
                    'game_id': game_id,
                    'event': event,
                    'date': game.headers.get("UTCDate", "Unknown"),
                    'result': result,
                    'white': white,
                    'black': black,
                    'white_elo': white_elo,
                    'black_elo': black_elo,
                    'adhd_player': white if white_has_adhd else (black if black_has_adhd else None),
                    'move_number': move_number,
                    'player': player,
                    'san': san,
                    'fen': board.fen(),
                    'game_phase': game_phase,
                    'is_adhd_move': is_adhd_move,
                    'position_complexity': position_complexity,
                    'position_complexity_category': position_complexity_category,
                    'evaluation': evaluation,
                    'eval_change': eval_change,
                    'error_category': categorize_error(eval_change, player),
                    'winning_chances': winning_chances,
                    'winning_chances_change': winning_chances_change,
                    'material_diff': material_diff,
                    'is_sacrifice': is_sacrifice,
                    'time_control': time_control,
                    'time_control_category': time_category.value if time_category else None,
                    'initial_time_seconds': initial_time,
                    'increment_seconds': increment,
                    'time_remaining': time_remaining,
                    'time_spent': time_spent,
                    'under_time_pressure': under_pressure,
                    'group': group_label
                }
                
                all_moves.append(move_data)
                
                # Update previous values
                prev_evaluation = evaluation
                prev_time_remaining = time_remaining
                current_material = new_material
                prev_winning_chances = winning_chances
                node = next_node
                
        except Exception as e:
            logging.error(f"Error processing game {game_id}: {e}")
            continue
    
    # Processing summary
    logging.info(f"\nProcessing Summary for {group_label}:")
    logging.info(f"Total games: {total_games}")
    logging.info(f"Standard chess games: {standard_games}")
    logging.info(f"Rated games: {rated_games}")
    logging.info(f"Games with evaluations: {eval_games}")
    logging.info(f"Total moves processed: {len(all_moves)}")
    
    # Convert to DataFrame
    moves_df = pd.DataFrame(all_moves)
    
    return moves_df

# If you want to specify a particular column order, you can reorder the DataFrame after creation.s
column_order = [
    'GameID', 'Event', 'Date', 'Result',
    'White', 'Black', 'WhiteElo', 'BlackElo', 'ADHDPlayer',
    'MoveNumber', 'Player', 'SAN', 'GamePhase', 'IsADHDMove',
    'Evaluation', 'EvalChange', 'ErrorCategory', 'PositionComplexity', 'Position Complexity Category'
    'MaterialDiff', 'IsSacrifice',
    'TimeControl', 'TimeControlCategory', 'InitialTimeSeconds',
    'IncrementSeconds', 'TimeRemaining', 'TimeSpent', 'UnderTimePressure',
    'Group', 'MoveCondition'
]


INFO: Fetching general population games...


Collecting games: 100%|██████████| 10000/10000 [03:59<00:00, 41.70it/s]

INFO: 
ELO Distribution:
INFO: ELO 800-850: 3 games
INFO: ELO 850-900: 6 games
INFO: ELO 900-950: 21 games
INFO: ELO 950-1000: 33 games
INFO: ELO 1000-1050: 76 games
INFO: ELO 1050-1100: 97 games
INFO: ELO 1100-1150: 131 games
INFO: ELO 1150-1200: 194 games
INFO: ELO 1200-1250: 266 games
INFO: ELO 1250-1300: 302 games
INFO: ELO 1300-1350: 370 games
INFO: ELO 1350-1400: 468 games
INFO: ELO 1400-1450: 553 games
INFO: ELO 1450-1500: 682 games
INFO: ELO 1500-1550: 671 games
INFO: ELO 1550-1600: 665 games
INFO: ELO 1600-1650: 663 games
INFO: ELO 1650-1700: 755 games
INFO: ELO 1700-1750: 685 games
INFO: ELO 1750-1800: 616 games
INFO: ELO 1800-1850: 586 games
INFO: ELO 1850-1900: 497 games
INFO: ELO 1900-1950: 441 games
INFO: ELO 1950-2000: 335 games
INFO: ELO 2000-2050: 255 games
INFO: ELO 2050-2100: 209 games
INFO: ELO 2100-2150: 145 games
INFO: ELO 2150-2200: 99 games
INFO: ELO 2200-2250: 67 games
INFO: ELO 2250-2300: 42 games
INFO: ELO 2300-2350: 22 games
INFO: ELO 2350-2400: 12 games
INF




In [18]:
# ----------------------- 1. Fetch and Process ADHD Players' Games -----------------------

adhd_games = []
for username in ADHD_USERNAMES:
    logging.info(f"Fetching games for user '{username}'...")
    user_games = fetch_lichess_games(username, max_games=4000)  # Adjust max_games as needed
    adhd_games.extend(user_games)

if not adhd_games:
    logging.warning("No ADHD games fetched. Exiting analysis.")
else:
    # Initialize the chess engine
    try:
        engine = chess.engine.SimpleEngine.popen_uci(STOCKFISH_PATH)
        logging.info(f"Initialized Stockfish engine at '{STOCKFISH_PATH}'.")
    except FileNotFoundError:
        logging.critical(f"Stockfish executable not found at '{STOCKFISH_PATH}'. Please update the path.")
        engine = None
    except Exception as e:
        logging.critical(f"Failed to initialize Stockfish engine: {e}")
        engine = None

    if engine is not None:
        # ----------------------- 2. Process ADHD Players' Games -----------------------
        
        logging.info("Processing ADHD players' games...")
        adhd_moves_df = process_games(adhd_games, group_label='ADHD', engine=engine)
        
        # ----------------------- 3. Fetch and Process General Population Games -----------------------
        
        logging.info("Fetching general population games...")
        if not os.path.exists(GENERAL_PGN_FILE_PATH):
            logging.error(f"PGN file not found at path: {GENERAL_PGN_FILE_PATH}")
            general_games = []
        else:
            general_games = process_pgn_file(GENERAL_PGN_FILE_PATH, max_games = 10000)  # Adjust max_games as needed
        
        if not general_games:
            logging.warning("No General population games to process.")
            general_moves_df = pd.DataFrame()
        else:
            logging.info("Processing general population games...")
            general_moves_df = process_games(general_games, group_label='General', engine=engine)
        
        # ----------------------- 4. Combine Datasets -----------------------

        logging.info("Combining datasets...")
        all_moves_df = pd.concat([adhd_moves_df, general_moves_df], ignore_index=True)

# ----------------------- 5. Data Cleaning -----------------------

logging.info("Cleaning data...")
required_columns = ['time_spent', 'evaluation', 'eval_change', 'white_elo', 'black_elo']
all_moves_df = all_moves_df.dropna(subset=required_columns)

# Ensure 'is_sacrifice' (not 'IsSacrifice') is boolean
all_moves_df['is_sacrifice'] = all_moves_df['is_sacrifice'].fillna(False).astype(bool)

# Convert relevant columns to numeric types
numeric_columns = ['time_spent', 'evaluation', 'eval_change', 'white_elo', 'black_elo']
for col in numeric_columns:
    all_moves_df[col] = pd.to_numeric(all_moves_df[col], errors='coerce')

# Drop rows with NaNs resulted from non-numeric conversion
all_moves_df = all_moves_df.dropna(subset=numeric_columns)

# Create ELO brackets for analysis
all_moves_df['elo_bracket'] = pd.cut(
    all_moves_df.apply(lambda row: max(row['white_elo'], row['black_elo']), axis=1),
    bins=[0, 1200, 1600, 2000, float('inf')],
    labels=['0-1200', '1200-1600', '1600-2000', '2000+']
)

# After cleaning, output the number of moves remaining
logging.info(f"Total number of moves after cleaning: {len(all_moves_df)}")

INFO: Fetching games for user 'teoeo'...
INFO: Fetched 1252 games with evaluations for user 'teoeo'.
INFO: Fetching games for user 'Tobermorey'...
INFO: Fetched 172 games with evaluations for user 'Tobermorey'.
INFO: Fetching games for user 'apostatlet'...
INFO: Fetched 415 games with evaluations for user 'apostatlet'.
INFO: Fetching games for user 'LovePump1000'...
INFO: Fetched 773 games with evaluations for user 'LovePump1000'.
INFO: Fetching games for user 'StuntmanAndy'...
INFO: Fetched 952 games with evaluations for user 'StuntmanAndy'.
INFO: Fetching games for user 'ChessyChesterton12'...
INFO: Fetched 311 games with evaluations for user 'ChessyChesterton12'.
INFO: Fetching games for user 'yastoon'...
INFO: Fetched 24 games with evaluations for user 'yastoon'.
INFO: Fetching games for user 'SonnyDayz11'...
INFO: Fetched 16 games with evaluations for user 'SonnyDayz11'.
INFO: Fetching games for user 'Xiroir'...
INFO: Fetched 104 games with evaluations for user 'Xiroir'.
INFO: Fet

Processing ADHD games: 100%|██████████| 8224/8224 [14:16<00:00,  9.60it/s]

INFO: 
Processing Summary for ADHD:
INFO: Total games: 8224
INFO: Standard chess games: 7830
INFO: Rated games: 5106
INFO: Games with evaluations: 5106
INFO: Total moves processed: 335386





INFO: Fetching general population games...


Collecting games: 100%|██████████| 10000/10000 [03:50<00:00, 43.41it/s]

INFO: 
ELO Distribution:
INFO: ELO 800-850: 5 games
INFO: ELO 850-900: 9 games
INFO: ELO 900-950: 22 games
INFO: ELO 950-1000: 30 games
INFO: ELO 1000-1050: 79 games
INFO: ELO 1050-1100: 86 games
INFO: ELO 1100-1150: 145 games
INFO: ELO 1150-1200: 203 games
INFO: ELO 1200-1250: 259 games
INFO: ELO 1250-1300: 274 games
INFO: ELO 1300-1350: 403 games
INFO: ELO 1350-1400: 476 games
INFO: ELO 1400-1450: 534 games
INFO: ELO 1450-1500: 670 games
INFO: ELO 1500-1550: 688 games
INFO: ELO 1550-1600: 628 games
INFO: ELO 1600-1650: 659 games
INFO: ELO 1650-1700: 727 games
INFO: ELO 1700-1750: 701 games
INFO: ELO 1750-1800: 641 games
INFO: ELO 1800-1850: 598 games
INFO: ELO 1850-1900: 527 games
INFO: ELO 1900-1950: 448 games
INFO: ELO 1950-2000: 330 games
INFO: ELO 2000-2050: 252 games
INFO: ELO 2050-2100: 225 games
INFO: ELO 2100-2150: 130 games
INFO: ELO 2150-2200: 86 games
INFO: ELO 2200-2250: 68 games
INFO: ELO 2250-2300: 35 games
INFO: ELO 2300-2350: 21 games
INFO: ELO 2350-2400: 8 games
INFO


Processing General games: 100%|██████████| 10000/10000 [26:58<00:00,  6.18it/s]

INFO: 
Processing Summary for General:
INFO: Total games: 10000
INFO: Standard chess games: 10000
INFO: Rated games: 10000
INFO: Games with evaluations: 10000
INFO: Total moves processed: 636180





INFO: Combining datasets...
INFO: Cleaning data...
INFO: Total number of moves after cleaning: 447391


In [19]:
def analyze_error_distribution(df):
    # Print actual value ranges
    print("\nEvaluation Change Statistics:")
    print(df['eval_change'].describe())
    
    # Count error categories
    print("\nError Category Distribution:")
    print(df['error_category'].value_counts(normalize=True).multiply(100).round(1))
    
    # Look at the largest eval changes
    print("\nLargest Evaluation Changes:")
    print(df.nlargest(5, 'eval_change')[['eval_change', 'error_category', 'san']])
    
    # Look at the smallest eval changes
    print("\nSmallest Evaluation Changes:")
    print(df.nsmallest(5, 'eval_change')[['eval_change', 'error_category', 'san']])

# Add after your data cleaning
analyze_error_distribution(all_moves_df)



Evaluation Change Statistics:
count    447391.000000
mean        -78.137922
std         175.756321
min       -3000.000000
25%         -72.000000
50%         -19.000000
75%          -3.000000
max         973.000000
Name: eval_change, dtype: float64

Error Category Distribution:
error_category
Normal        83.6
Inaccuracy     9.3
Blunder        3.5
Mistake        3.5
Name: proportion, dtype: float64

Largest Evaluation Changes:
        eval_change error_category   san
197728        973.0        Blunder   Kd8
230990        949.0         Normal   Kf3
149838        910.0         Normal  Rb4+
621993        860.0         Normal   Bf2
123540        854.0        Blunder    c5

Smallest Evaluation Changes:
        eval_change error_category   san
106351      -3000.0        Blunder  Qf5+
302884      -3000.0        Blunder    h6
312759      -3000.0        Blunder  Qf6+
353082      -3000.0         Normal  Qxg1
373728      -3000.0        Blunder   Qg5


In [20]:
# ======================================================================
# PUBLICATION DATA FIXES
# ======================================================================

import pandas as pd
import numpy as np
import os
import logging

# Apply this right after: logging.info(f"Total number of moves after cleaning: {len(all_moves_df)}")

logging.info("=== APPLYING PUBLICATION-READY DATA FIXES ===")

# ----------------------- 1. FIX ELO BRACKETS TO MATCH PAPER -----------------------
# Your paper uses ≤1000, 1001-1400, 1401-1800, 1801+ but current code uses different brackets

# Create average ELO (more accurate than max)
all_moves_df['avg_elo'] = (all_moves_df['white_elo'] + all_moves_df['black_elo']) / 2

# FIX: Replace your existing elo_bracket creation with this:
all_moves_df['elo_bracket'] = pd.cut(
    all_moves_df['avg_elo'],
    bins=[0, 1000, 1400, 1800, float('inf')],
    labels=['≤1000', '1001-1400', '1401-1800', '1801+'],
    include_lowest=True
)

logging.info("✓ Fixed ELO brackets to match paper format")

# ----------------------- 2. CREATE PUBLICATION PLAYER TYPES -----------------------

# Create clear player type variable for R ANOVA
all_moves_df['player_type'] = all_moves_df['group'].map({
    'ADHD': 'ADHD', 
    'General': 'Non-ADHD'  # This matches your R scripts!
})

# Create binary indicator for easier analysis
all_moves_df['is_adhd_player'] = (all_moves_df['player_type'] == 'ADHD').astype(int)

logging.info("✓ Created proper player_type variables")

# ----------------------- 3. FIX TIME CONTROL CATEGORIES -----------------------

# Make sure time control categories match R scripts exactly
time_control_mapping = {
    'Bullet': 'Bullet',
    'Blitz': 'Blitz', 
    'Rapid': 'Rapid',
    'Classical': 'Classical'
}

all_moves_df['time_control_category'] = all_moves_df['time_control_category'].map(time_control_mapping)

logging.info("✓ Standardized time control categories")

# ----------------------- 4. CLEAN MISSING VALUES FOR R -----------------------

# Remove rows with critical missing values for regression analysis
critical_vars = ['time_spent', 'move_number', 'position_complexity', 'elo_bracket', 
                'time_control_category', 'player_type']

before_clean = len(all_moves_df)
all_moves_df = all_moves_df.dropna(subset=critical_vars)
after_clean = len(all_moves_df)

logging.info(f"✓ Cleaned missing values: {before_clean:,} → {after_clean:,} rows")

# ----------------------- 5. CREATE EXPORT-READY DATAFRAMES -----------------------

# Create the main analysis dataset
combined_processed_data = all_moves_df.copy()

# Create game-level summary for between-subjects analysis
logging.info("Creating game-level player summary...")

game_summary = combined_processed_data.groupby(['game_id', 'player_type']).agg({
    'time_spent': ['mean', 'std', 'count'],
    'eval_change': ['mean', 'std'],
    'under_time_pressure': lambda x: (x == True).sum() / len(x),  # Proportion under pressure
    'white_elo': 'first',
    'black_elo': 'first',
    'avg_elo': 'first',
    'elo_bracket': 'first',
    'time_control_category': 'first',
    'adhd_player': 'first'
}).reset_index()

# Flatten column names
game_summary.columns = [
    'game_id', 'player_type', 'mean_time_spent', 'std_time_spent', 'n_moves',
    'mean_eval_change', 'std_eval_change', 'prop_under_pressure',
    'white_elo', 'black_elo', 'avg_elo', 'elo_bracket', 'time_control_category', 'adhd_player'
]

logging.info(f"✓ Created game summary: {len(game_summary):,} games")

# ----------------------- 6. CREATE PLAYER-LEVEL SUMMARY -----------------------

# This is what Professor Kleiman needs for ANOVA
player_summary = game_summary.groupby(['adhd_player', 'player_type']).agg({
    'mean_time_spent': ['mean', 'std', 'count'],
    'mean_eval_change': ['mean', 'std'],
    'prop_under_pressure': 'mean',
    'avg_elo': 'mean',
    'n_moves': 'sum'
}).reset_index()

# Flatten columns
player_summary.columns = [
    'player_name', 'player_type', 'overall_mean_time', 'std_mean_time', 'n_games',
    'overall_mean_eval_change', 'std_eval_change', 'avg_prop_under_pressure',
    'average_elo', 'total_moves'
]

logging.info(f"✓ Created player summary: {len(player_summary):,} players")

# ----------------------- 7. EXPORT FILES FOR R ANALYSIS -----------------------

# Create output directory
output_dir = "/Users/benjaminrosales/Desktop/Chess-Worker/Chess-Study/Publication_Data"
os.makedirs(output_dir, exist_ok=True)

# Export main dataset (what your R scripts expect)
main_file = f"{output_dir}/combined_processed_data.csv"
combined_processed_data.to_csv(main_file, index=False)
logging.info(f"✓ Exported main dataset: {main_file}")

# Export game-level summary
game_file = f"{output_dir}/game_level_summary.csv"
game_summary.to_csv(game_file, index=False)
logging.info(f"✓ Exported game summary: {game_file}")

# Export player-level summary (for ANOVA)
player_file = f"{output_dir}/player_level_summary.csv"
player_summary.to_csv(player_file, index=False)
logging.info(f"✓ Exported player summary: {player_file}")

# ----------------------- 8. CREATE DATA VERIFICATION REPORT -----------------------

print("\n" + "="*60)
print("PUBLICATION DATA VERIFICATION REPORT")
print("="*60)

print(f"\n📊 SAMPLE SIZES:")
print(f"Total moves: {len(combined_processed_data):,}")
print(f"ADHD moves: {len(combined_processed_data[combined_processed_data['player_type'] == 'ADHD']):,}")
print(f"Non-ADHD moves: {len(combined_processed_data[combined_processed_data['player_type'] == 'Non-ADHD']):,}")

print(f"\n🎯 ELO DISTRIBUTION:")
elo_dist = combined_processed_data.groupby(['elo_bracket', 'player_type']).size().unstack(fill_value=0)
print(elo_dist)

print(f"\n⏱️ TIME CONTROL DISTRIBUTION:")
time_dist = combined_processed_data.groupby(['time_control_category', 'player_type']).size().unstack(fill_value=0)
print(time_dist)

print(f"\n👥 PLAYER COUNTS:")
print(f"Total unique players: {len(player_summary)}")
print(f"ADHD players: {len(player_summary[player_summary['player_type'] == 'ADHD'])}")
print(f"Non-ADHD players: {len(player_summary[player_summary['player_type'] == 'Non-ADHD'])}")

print(f"\n✅ DATA QUALITY CHECKS:")
print(f"Missing elo_bracket: {combined_processed_data['elo_bracket'].isna().sum()}")
print(f"Missing player_type: {combined_processed_data['player_type'].isna().sum()}")
print(f"Missing time_spent: {combined_processed_data['time_spent'].isna().sum()}")
print(f"Missing position_complexity: {combined_processed_data['position_complexity'].isna().sum()}")

print(f"\n📁 EXPORTED FILES:")
print(f"1. {main_file}")
print(f"2. {game_file}")
print(f"3. {player_file}")

print("\n🎉 DATA IS NOW PUBLICATION-READY!")
print("="*60)

# ----------------------- 9. CREATE R LOADING SNIPPET -----------------------

r_code = f'''
# ======================================================================
# R CODE TO LOAD YOUR PUBLICATION DATA
# Copy this into your R scripts
# ======================================================================

# Load the main dataset
combined_processed_data <- read.csv("{main_file}")

# Load game-level summary
game_summary <- read.csv("{game_file}")

# Load player-level summary (for ANOVA)
player_summary <- read.csv("{player_file}")

# Verify data structure
print("Data loaded successfully!")
print(paste("Total moves:", nrow(combined_processed_data)))
print(paste("ELO brackets:", toString(unique(combined_processed_data$elo_bracket))))
print(paste("Player types:", toString(unique(combined_processed_data$player_type))))
'''

r_file = f"{output_dir}/load_data.R"
with open(r_file, 'w') as f:
    f.write(r_code)

logging.info(f"✓ Created R loading script: {r_file}")

print(f"\n📋 Next steps:")
print(f"1. Run your R regression scripts using combined_processed_data")
print(f"2. Use player_summary for between-subjects ANOVA")
print(f"3. All ELO brackets now match your paper exactly")
print(f"4. Professor Kleiman can run analyses immediately")


INFO: === APPLYING PUBLICATION-READY DATA FIXES ===
INFO: ✓ Fixed ELO brackets to match paper format
INFO: ✓ Created proper player_type variables
INFO: ✓ Standardized time control categories
INFO: ✓ Cleaned missing values: 447,391 → 447,391 rows
INFO: Creating game-level player summary...
INFO: ✓ Created game summary: 15,002 games
INFO: ✓ Created player summary: 22 players
INFO: ✓ Exported main dataset: /Users/benjaminrosales/Desktop/Chess-Worker/Chess-Study/Publication_Data/combined_processed_data.csv
INFO: ✓ Exported game summary: /Users/benjaminrosales/Desktop/Chess-Worker/Chess-Study/Publication_Data/game_level_summary.csv
INFO: ✓ Exported player summary: /Users/benjaminrosales/Desktop/Chess-Worker/Chess-Study/Publication_Data/player_level_summary.csv

PUBLICATION DATA VERIFICATION REPORT

📊 SAMPLE SIZES:
Total moves: 447,391
ADHD moves: 154,524
Non-ADHD moves: 292,867

🎯 ELO DISTRIBUTION:
player_type   ADHD  Non-ADHD
elo_bracket                 
≤1000         4113      1720
1001-1

  elo_dist = combined_processed_data.groupby(['elo_bracket', 'player_type']).size().unstack(fill_value=0)


In [21]:
print(adhd_moves_df['eval_change'].describe())
print(adhd_moves_df['evaluation'].describe())

count    330280.000000
mean        -80.381882
std         197.298415
min       -3000.000000
25%         -71.000000
50%         -16.000000
75%           0.000000
max        1307.000000
Name: eval_change, dtype: float64
count     335386.0
unique      6398.0
top            0.0
freq       15101.0
Name: evaluation, dtype: float64
