In [19]:
# Imports
import json
import requests
import pandas as pd
import chess.pgn
import io
import os
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from scipy import stats
import chess.engine
import sys
import logging
import math
import random
import tqdm
from enum import Enum
from elocator_test.complexity.model import ChessModel
import torch
from elocator_test.encoder import fen_encoder


# Configure logging to print to stdout
logging.basicConfig(
    level=logging.INFO, format="%(levelname)s: %(message)s", stream=sys.stdout
)

# Configure plotting style
sns.set(style="whitegrid")
# Replace with the actual path to your general population PGN file
GENERAL_PGN_FILE_PATH = "/Users/benjaminrosales/Desktop/Chess Study Materials & Data/Comparison Games/lichess_db_standard_rated_2017-05.pgn"

# Path to your Stockfish executable
STOCKFISH_PATH = "/opt/homebrew/bin/stockfish"

# List of ADHD players' usernames (Lichess)
ADHD_USERNAMES = [
    "teoeo",
    "Tobermorey",
    "apostatlet",
    "LovePump1000",
    "StuntmanAndy",
    "Banfy_B",
    "ChessyChesterton12",
    "yastoon",
    "Timy1976",
    "SonnyDayz11",
    "Xiroir",
    "StellaAthena",
    "MagikPigeon"
]

#WrapperClass
class ElocatorModel:
    def __init__(self, model_path):
        self.model = ChessModel()
        self.model.load_state_dict(torch.load(model_path))
        self.model.eval()

In [20]:
class ElocatorAnalyzer:
    def __init__(self, model_path):
        self.model = ChessModel()
        self.model.load_state_dict(torch.load(model_path))
        self.model.eval()
        
        # Correct percentile ranges from the repo
        self.percentile_ranges = {
            1: (0, 0.006848618667572737),
            2: (0.006848618667572737, 0.007860606908798218),
            3: (0.007860606908798218, 0.0093873867765069),
            4: (0.0093873867765069, 0.010885232314467431),
            5: (0.010885232314467431, 0.01191701553761959),
            6: (0.01191701553761959, 0.012793240323662757),
            7: (0.012793240323662757, 0.013946877606213093),
            8: (0.013946877606213093, 0.015834777429699905),
            9: (0.015834777429699905, 0.02067287489771843),
            10: (0.02067287489771843, 1)
        }
    
    def map_prediction_to_complexity(self, prediction):
        """Maps raw model output to complexity score (1-10)"""
        for level, (low, high) in self.percentile_ranges.items():
            if low <= prediction <= high:
                return level
        return None  # Handle predictions outside expected range
    
    def get_position_complexity(self, fen):
        """Get complexity score for a single position"""
        try:
            # Sanitize the FEN string before processing
            clean_fen = sanitize_fen(fen)
            if clean_fen is None:
                return None
                
            encoded_position = fen_encoder(clean_fen)
            position_tensor = torch.FloatTensor(encoded_position).unsqueeze(0)
            
            with torch.no_grad():
                raw_prediction = self.model(position_tensor).item()
                complexity_score = self.map_prediction_to_complexity(raw_prediction)
                return complexity_score
                
        except Exception as e:
            logging.error(f"Error processing FEN {fen}: {str(e)}")
            return None
            
    def analyze_game(self, pgn_game):
        board = pgn_game.board()
        node = pgn_game
        positions = []
        
        # Get initial position
        try:
            complexity = self.get_position_complexity(board.fen())
            positions.append({
                'fen': board.fen(),
                'complexity': complexity,
                'move_number': 0
            })
        except Exception as e:
            logging.error(f"Error analyzing initial position: {str(e)}")
        
        # Process each move
        while node.variations:
            try:
                next_node = node.variations[0]
                move = next_node.move
                board.push(move)
                
                complexity = self.get_position_complexity(board.fen())
                if complexity is not None:  # Only append positions we can analyze
                    positions.append({
                        'fen': board.fen(),
                        'complexity': complexity,
                        'move_number': len(board.move_stack)
                    })
                
                node = next_node
            except Exception as e:
                logging.error(f"Error analyzing position at move {len(board.move_stack)}: {str(e)}")
                break
                
        return positions

# Create global instance
elocator = ElocatorAnalyzer('elocator_test/complexity/models/model.pth')

# Create global instance
elocator = ElocatorAnalyzer('elocator_test/complexity/models/model.pth')
# Create global instance
elocator = ElocatorAnalyzer('elocator_test/complexity/models/model.pth')
    
def get_position_complexity(self, fen):
        """Get complexity score for a single position"""
        encoded_position = fen_encoder(fen)
        position_tensor = torch.FloatTensor(encoded_position).unsqueeze(0)
        with torch.no_grad():
            complexity = self.model(position_tensor)
        return complexity.item()
    
def sanitize_fen(fen):
    """
    Sanitize and validate a FEN string for standard chess.
    Returns None if the FEN is invalid or from a variant game.
    """
    try:
        # If we see brackets or special characters, it's likely a variant game
        if '[' in fen or ']' in fen or '~' in fen:
            return None
            
        # Remove any non-standard characters
        valid_chars = 'rnbqkpRNBQKP12345678/- '
        cleaned_fen = ''.join(c for c in fen if c in valid_chars)
        
        # Get position part (everything before first space)
        position_part = cleaned_fen.split()[0] if ' ' in cleaned_fen else cleaned_fen
        
        # Validate basic FEN structure
        ranks = position_part.split('/')
        if len(ranks) != 8:
            return None
            
        # Validate each rank
        for rank in ranks:
            spaces = 0
            for char in rank:
                if char.isdigit():
                    spaces += int(char)
                else:
                    spaces += 1
            if spaces != 8:
                return None
                
        # Return standardized FEN string
        return f"{position_part} w - - 0 1"
        
    except Exception as e:
        logging.error(f"FEN sanitization failed: {fen}")
        logging.error(f"Error: {str(e)}")
        return None

  self.model.load_state_dict(torch.load(model_path))


In [21]:
def safe_int(value, default=None):
    try:
        return int(value)
    except (ValueError, TypeError):
        return default
"""
Setting up Time Functions
"""

def parse_clock_time(comment):
    match = re.search(r'\[%clk (\d+):(\d+):(\d+)\]', comment)  # Adjust regex if needed
    if match:
        hours = int(match.group(1))
        minutes = int(match.group(2))
        seconds = int(match.group(3))
        return hours * 3600 + minutes * 60 + seconds  # Total seconds
    return None

## Determine if a player is under time pressure based on van Harreveld et al. (2007) criteria ---

def is_under_time_pressure(time_remaining, initial_time, time_spent):
    """
    Determine time pressure, accounting for premoves and missing data
    - Premoves (time_spent = 0) should never count as time pressure
    - Missing time data should be handled safely
    """
    # Handle None/missing values
    if any(x is None for x in [time_remaining, initial_time]):
        return False
        
    # Handle invalid values
    try:
        time_remaining = float(time_remaining)
        initial_time = float(initial_time)
        # time_spent can be None for missing data or 0 for premoves
        time_spent = float(time_spent) if time_spent is not None else None
    except (TypeError, ValueError):
        return False
        
    # Invalid time states
    if initial_time <= 0 or time_remaining < 0:
        return False
        
    # If it's a premove (time_spent = 0) or missing time data,
    # only check absolute and relative time remaining
    absolute_pressure = time_remaining < 30  # Less than 30 seconds
    relative_pressure = time_remaining < (0.1 * initial_time)  # Less than 10% of initial time
        
    return absolute_pressure or relative_pressure

class TimeControlType(Enum):
    CLASSICAL = "Classical"
    RAPID = "Rapid"
    BLITZ = "Blitz"
    BULLET = "Bullet"
    UNKNOWN = "Unknown"

#Parsing and Categorizing Time Control
def parse_time_control(time_control):
    """Parse time control string from Lichess format (already in seconds)"""
    if not time_control or time_control == "unknown":
        return None, None, TimeControlType.UNKNOWN
        
    try:
        if "+" in time_control:
            base, increment = time_control.split("+")
            initial_seconds = int(base)  # Already in seconds, don't multiply
            increment_seconds = int(increment)
        else:
            initial_seconds = int(time_control)  # Already in seconds
            increment_seconds = 0
            
        # Categorize based on seconds
        if initial_seconds >= 1800:     # 30 minutes or more
            category = TimeControlType.CLASSICAL
        elif initial_seconds >= 600:     # 10 minutes or more
            category = TimeControlType.RAPID
        elif initial_seconds >= 180:     # 3 minutes or more
            category = TimeControlType.BLITZ
        else:                           # Less than 3 minutes
            category = TimeControlType.BULLET
            
        return initial_seconds, increment_seconds, category
        
    except (ValueError, TypeError):
        return None, None, TimeControlType.UNKNOWN

def calculate_material(board):
    # Returns material balance for both sides
    material = {"White": 0, "Black": 0}
    piece_values = {
        chess.PAWN: 1,
        chess.KNIGHT: 3,
        chess.BISHOP: 3,
        chess.ROOK: 5,
        chess.QUEEN: 9,
        chess.KING: 0,  # King is invaluable, but we set to 0 for simplicity
    }
    for piece_type in piece_values:
        value = piece_values[piece_type]
        material["White"] += len(board.pieces(piece_type, chess.WHITE)) * value
        material["Black"] += len(board.pieces(piece_type, chess.BLACK)) * value
    return material

def categorize_position_complexity(evaluation):
    """
    Categorize position complexity, handling both numeric and mate evaluations
    """
    if evaluation is None:
        return 'Unknown'
    
    # Handle mate scores
    if isinstance(evaluation, str) and '#' in evaluation:
        return 'Decisive Advantage'  # Mate is always decisive
        
    try:
        eval_float = float(evaluation)
        if abs(eval_float) < 1:
            return 'Balanced'
        elif abs(eval_float) < 3:
            return 'Slight Advantage'
        else:
            return 'Decisive Advantage'
    except (ValueError, TypeError):
        return 'Unknown'

def categorize_move(eval_before, eval_after):
    """
    Categorize move quality, handling both numeric and mate evaluations
    """
    if eval_before is None or eval_after is None:
        return "Unknown"

    # Handle mate scores
    if isinstance(eval_after, str) and '#' in eval_after:
        if '-' in eval_after:
            return "Forced Checkmate (Losing)"
        return "Forced Checkmate (Winning)"
    
    try:
        eval_before = float(eval_before)
        eval_after = float(eval_after)
    except (ValueError, TypeError):
        return "Unknown"

    # Now proceed with numeric evaluation
    SATURATION_LIMIT = 1000  # Equivalent to a 10-pawn advantage
    
    # Calculate evaluation change
    eval_change = eval_after - eval_before

    if abs(eval_after) >= SATURATION_LIMIT:
        return "Winning Position" if eval_after > 0 else "Losing Position"

    # Categorize the move based on evaluation change
    if eval_change <= -300:
        return "Blunder"
    elif eval_change <= -150:
        return "Mistake"
    elif eval_change <= -50:
        return "Inaccuracy"
    elif eval_change >= 300:
        return "Brilliant Move"
    elif eval_change >= 150:
        return "Great Move"
    elif eval_change >= 50:
        return "Good Move"
    else:
        return "Normal"

def debug_data_pipeline(df, label):
    # Function definition here
    logging.info(f"Debugging {label}")
    # Process the DataFrame or print logs for debugging

def raw_winning_chances(cp):
    MULTIPLIER = -0.00368208
    return 2 / (1 + math.exp(MULTIPLIER * cp)) - 1

def cp_winning_chances(cp):
    cp = max(-1000, min(cp, 1000))
    return raw_winning_chances(cp)

def mate_winning_chances(mate):
    cp = (21 - min(10, abs(mate))) * 100
    signed_cp = cp * (1 if mate > 0 else -1)
    return raw_winning_chances(signed_cp)

def eval_winning_chances(eval_str):
    if eval_str is None:
        return None
    if '#' in str(eval_str):
        # Mate in N moves
        mate_str = str(eval_str).replace('#', '')
        try:
            mate = int(mate_str)
            return mate_winning_chances(mate)
        except ValueError:
            return None
    else:
        try:
            cp = float(eval_str) * 100  # Convert from pawns to centipawns
            return cp_winning_chances(cp)
        except ValueError:
            return None


def safe_int(value, default=None):
    try:
        return int(value)
    except (ValueError, TypeError):
        return default


def parse_clock_time(comment):
    match = re.search(r'\[%clk (\d+):(\d+):(\d+)\]', comment)  # Adjust regex if needed
    if match:
        hours = int(match.group(1))
        minutes = int(match.group(2))
        seconds = int(match.group(3))
        return hours * 3600 + minutes * 60 + seconds  # Total seconds
    return None


def parse_evaluation(comment):
    match = re.search(r'%eval\s([+-]?[\d.]+|#-?\d+)', comment)
    if match:
        eval_str = match.group(1)
        if '#' in eval_str:
            # Mate in N moves
            return eval_str
        else:
            return float(eval_str)  # Convert to float
    return None


def categorize_error(eval_change, player_color="white"):
    if eval_change is None:
        return "Unknown"
        
    # Normalize eval_change to player's perspective
    if player_color.lower() == "black":
        eval_change = -eval_change
        
    # Convert to pawn units (divide by 100 since we're in centipawns)
    eval_change = eval_change / 100
    
    # Standard threshold values in pawn units
    if eval_change <= -3.0:  # Loss of 3 pawns or more
        return "Blunder"
    elif eval_change <= -1.5:  # Loss of 1.5 pawns
        return "Mistake"
    elif eval_change <= -0.5:  # Loss of 0.5 pawns
        return "Inaccuracy"
    else:
        return "Normal"


def calculate_material(board):
    # Returns material balance for both sides
    material = {"White": 0, "Black": 0}
    piece_values = {
        chess.PAWN: 1,
        chess.KNIGHT: 3,
        chess.BISHOP: 3,
        chess.ROOK: 5,
        chess.QUEEN: 9,
        chess.KING: 0,  # King is invaluable, but we set to 0 for simplicity
    }
    for piece_type in piece_values:
        value = piece_values[piece_type]
        material["White"] += len(board.pieces(piece_type, chess.WHITE)) * value
        material["Black"] += len(board.pieces(piece_type, chess.BLACK)) * value
    return material

def categorize_game_phase(board):
    """
    Enhanced game phase calculation incorporating:
    - Material balance and distribution
    - Piece mobility
    - Pawn structure
    - Position characteristics
    """
    # Material values calibrated from empirical analysis
    PIECE_VALUES = {
        chess.KNIGHT: 782,
        chess.BISHOP: 830,
        chess.ROOK: 1289,
        chess.QUEEN: 2529
    }
    
    # Phase boundaries from statistical analysis
    ENDGAME_LIMIT = 3915   # ~Queen + Rook
    MIDGAME_LIMIT = 15258  # Total non-pawn material at start
    PHASE_MIDGAME = 128    # Full phase scale
    
    def calculate_piece_mobility(board, piece_type, square):
        """Calculate approximate mobility for a piece"""
        mobility = 0
        attacks = board.attacks(square)
        mobility = len([sq for sq in attacks if not board.is_attacked_by(not board.turn, sq)])
        return mobility
    
    def evaluate_pawn_structure(board):
        """Evaluate pawn structure impact on phase"""
        white_pawns = board.pieces(chess.PAWN, chess.WHITE)
        black_pawns = board.pieces(chess.PAWN, chess.BLACK)
        
        # Calculate pawn structure characteristics
        center_pawns = len([p for p in white_pawns | black_pawns 
                          if chess.square_file(p) in [3,4]])
        passed_pawns = 0
        for p in white_pawns:
            if not any(black_pawns & chess.BB_FILES[chess.square_file(p)]):
                passed_pawns += 1
        for p in black_pawns:
            if not any(white_pawns & chess.BB_FILES[chess.square_file(p)]):
                passed_pawns += 1
                
        return center_pawns * 0.1 + passed_pawns * 0.15
    
    # Calculate non-pawn material and mobility
    def evaluate_position(color):
        material = 0
        mobility_factor = 0
        
        for piece_type, value in PIECE_VALUES.items():
            pieces = board.pieces(piece_type, color)
            count = len(pieces)
            material += count * value
            
            # Add mobility consideration
            for square in pieces:
                mobility_factor += calculate_piece_mobility(board, piece_type, square) * 0.01
                
        return material, mobility_factor
    
    # Calculate for both sides
    w_material, w_mobility = evaluate_position(chess.WHITE)
    b_material, b_mobility = evaluate_position(chess.BLACK)
    
    # Total non-pawn material with mobility adjustment
    total_material = w_material + b_material
    mobility_adjustment = (w_mobility + b_mobility) * 100
    
    # Pawn structure impact
    pawn_factor = evaluate_pawn_structure(board)
    
    # Adjust material based on mobility and pawn structure
    adjusted_material = total_material * (1 + pawn_factor) + mobility_adjustment
    
    # Clamp between endgame and midgame limits
    npm = max(ENDGAME_LIMIT, min(adjusted_material, MIDGAME_LIMIT))
    
    # Calculate phase score (0 = endgame, 128 = midgame)
    phase = ((npm - ENDGAME_LIMIT) * PHASE_MIDGAME) // (MIDGAME_LIMIT - ENDGAME_LIMIT)
    phase = max(0, min(phase, PHASE_MIDGAME))
    
    # Position-specific adjustments
    if len(board.move_stack) <= 20:  # First 10 moves
        phase = max(phase, 96)  # Ensure early moves are recognized as opening
    
    # Convert to categorical with clear documentation of thresholds
    if phase >= 96:      # 75% of PHASE_MIDGAME - Clear opening characteristics
        return "Opening"
    elif phase >= 32:    # 25% of PHASE_MIDGAME - Significant material remains
        return "Middlegame"
    else:                # Limited material or simplified position
        return "Endgame"

def categorize_move(eval_before, eval_after):
    if eval_before is None or eval_after is None:
        return "Unknown"

    # Define saturation limits in centipawns
    SATURATION_LIMIT = 1000  # Equivalent to a 10-pawn advantage
    MATE_SCORE = 10000       # Arbitrary large value representing mate

    # Calculate evaluation change
    eval_change = eval_after - eval_before

    # Handle mate scores (assuming the engine uses large numbers to indicate mate)
    if abs(eval_after) >= MATE_SCORE:
        if eval_after > 0:
            return "Forced Checkmate (Winning)"
        else:
            return "Forced Checkmate (Losing)"

    # Handle evaluation saturation
    if abs(eval_after) >= SATURATION_LIMIT:
        if eval_after > 0:
            return "Winning Position"
        else:
            return "Losing Position"

    # Categorize the move based on evaluation change
    if eval_change <= -300:
        return "Blunder"
    elif eval_change <= -150:
        return "Mistake"
    elif eval_change <= -50:
        return "Inaccuracy"
    elif eval_change >= 300:
        return "Brilliant Move"
    elif eval_change >= 150:
        return "Great Move"
    elif eval_change >= 50:
        return "Good Move"
    else:
        return "Normal"


def debug_data_pipeline(df, label):
    # Function definition here
    logging.info(f"Debugging {label}")
    # Process the DataFrame or print logs for debugging


def raw_winning_chances(cp):
    MULTIPLIER = -0.00368208
    return 2 / (1 + math.exp(MULTIPLIER * cp)) - 1


def cp_winning_chances(cp):
    cp = max(-1000, min(cp, 1000))
    return raw_winning_chances(cp)


def mate_winning_chances(mate):
    cp = (21 - min(10, abs(mate))) * 100
    signed_cp = cp * (1 if mate > 0 else -1)
    return raw_winning_chances(signed_cp)


def eval_winning_chances(evaluation):
    if evaluation is None:
        return None
    if isinstance(evaluation, str) and '#' in evaluation:
        # Mate in N moves
        mate_str = evaluation.replace('#', '')
        try:
            mate = int(mate_str)
            return mate_winning_chances(mate)
        except ValueError:
            return None
    else:
        try:
            cp = float(evaluation) * 100  # Convert from pawns to centipawns
            return cp_winning_chances(cp)
        except ValueError:
            return None
        
def calculate_eval_change(prev_evaluation, evaluation, player):
    """
    Calculate evaluation change accounting for player color and mate scores
    Returns the change in centipawns from the player's perspective
    """
    if prev_evaluation is None or evaluation is None:
        return None
        
    # Convert mate scores to numeric values
    def process_eval(eval_str):
        if isinstance(eval_str, str) and '#' in eval_str:
            # Extract mate number
            mate_num = int(eval_str.replace('#', ''))
            # Convert to centipawns (± 10000 depending on mate for/against)
            return 10000 if mate_num > 0 else -10000
        return float(eval_str)
    
    try:
        prev_val = process_eval(prev_evaluation)
        curr_val = process_eval(evaluation)
        
        # Calculate change (current - prev because an increase is good)
        change = curr_val - prev_val
        
        # Flip for black (since evaluations are from White's perspective)
        if player.lower() == "black":
            change = -change
            
        return change
        
    except (ValueError, TypeError) as e:
        logging.error(f"Error calculating eval change: {str(e)}")
        return None

In [22]:
def perform_statistical_test(var, data, test_results, test_type="independent_t"):
    # Prepare data
    group1 = data[data["Group"] == "ADHD"][var].dropna()
    group2 = data[data["Group"] == "General"][var].dropna()

    # Check if data is sufficient
    if len(group1) < 10 or len(group2) < 10:
        logging.warning(f"Not enough data to perform statistical test on '{var}'.")
        return

    # Test for normality
    stat1, p1 = stats.shapiro(group1)
    stat2, p2 = stats.shapiro(group2)
    normal = p1 > 0.05 and p2 > 0.05

    # Test for equal variances
    stat_levene, p_levene = stats.levene(group1, group2)
    equal_var = p_levene > 0.05

    # Choose appropriate test
    if normal and equal_var and test_type == "independent_t":
        # Independent T-test
        stat, p = stats.ttest_ind(group1, group2, equal_var=True)
        test_name = "Independent t-test"
    elif normal and not equal_var and test_type == "independent_t":
        # Welch's T-test
        stat, p = stats.ttest_ind(group1, group2, equal_var=False)
        test_name = "Welch's t-test"
    else:
        # Mann-Whitney U Test
        stat, p = stats.mannwhitneyu(group1, group2, alternative="two-sided")
        test_name = "Mann-Whitney U test"

    test_results.append(
        {"Variable": var, "Test": test_name, "Statistic": stat, "p-value": p}
    )


def perform_chi_squared_test(category_var, data, test_results):
    contingency_table = pd.crosstab(data["Group"], data[category_var])
    if contingency_table.empty or contingency_table.shape[1] == 0:
        logging.warning(f"Contingency table is empty for variable '{category_var}'.")
        return
    chi2, p, dof, expected = stats.chi2_contingency(contingency_table)
    test_results.append(
        {
            "Variable": category_var,
            "Test": "Chi-Squared test",
            "Statistic": chi2,
            "p-value": p,
        }
    )

In [23]:
def fetch_lichess_games(username, max_games=4000):  # Increase max_games
    url = f"https://lichess.org/api/games/user/{username}"
    params = {
        "max": max_games,
        "moves": True,
        "evals": True,  # Include evaluations in the PGN comments
        "clocks": True,  # Include clock times in the PGN comments
    }
    headers = {"Accept": "application/x-chess-pgn"}
    response = requests.get(url, params=params, headers=headers)
    if response.status_code != 200:
        logging.warning(
            f"Failed to fetch games for user '{username}'. Status code: {response.status_code}"
        )
        return []
    pgn_text = response.text
    games = []
    pgn_io = io.StringIO(pgn_text)
    while True:
        game = chess.pgn.read_game(pgn_io)
        if game is None:
            break

        # Check if the game contains evaluations
        has_evaluation = False
        node = game
        while node.variations:
            next_node = node.variations[0]
            comment = next_node.comment
            if "%eval" in comment:
                has_evaluation = True
                break
            node = next_node

        if has_evaluation:
            games.append(game)

    logging.info(f"Fetched {len(games)} games with evaluations for user '{username}'.")
    return games


import random
from tqdm import tqdm

def count_games_in_pgn(pgn_file_path):
    """Count total games in PGN file with progress bar"""
    count = 0
    file_size = os.path.getsize(pgn_file_path)
    
    with open(pgn_file_path, "r", encoding="utf-8") as pgn_file:
        pbar = tqdm(total=file_size, desc="Counting games", unit='B', unit_scale=True)
        for line in pgn_file:
            if line.startswith('[Event "'):
                count += 1
            pbar.update(len(line.encode('utf-8')))
        pbar.close()
    return count

def validate_game_evaluations(game):
    """Validate game has proper evaluation structure"""
    try:
        node = game
        while node.variations:
            next_node = node.variations[0]
            if "%eval" in next_node.comment:
                return True
            node = next_node
        return False
    except (IndexError, AttributeError):
        return False

def process_pgn_file(pgn_file_path, max_games=10000, chunk_size=1000):
    games = []
    elo_distribution = {}
    
    try:
        with open(pgn_file_path, "r", encoding="utf-8") as pgn_file:
            pbar = tqdm(total=max_games, desc="Collecting games")
            
            while len(games) < max_games:
                chunk_count = 0
                current_chunk = []
                
                while chunk_count < chunk_size:
                    game = chess.pgn.read_game(pgn_file)
                    if game is None:
                        break
                        
                    # Quick validation before adding to chunk
                    if (game.headers and 
                        game.headers.get("Variant", "Standard").lower() == "standard" and
                        all(game.headers.get(key, "") != "" for key in ["WhiteElo", "BlackElo", "TimeControl"]) and
                        validate_game_evaluations(game)):  # Added evaluation validation
                        current_chunk.append(game)
                        chunk_count += 1
                
                if not current_chunk:
                    break
                
                sample_size = min(chunk_size // 2, max_games - len(games))
                sampled_games = random.sample(current_chunk, min(sample_size, len(current_chunk)))
                
                for game in sampled_games:
                    if len(games) >= max_games:
                        break
                    
                    white_elo = safe_int(game.headers.get("WhiteElo", 0))
                    black_elo = safe_int(game.headers.get("BlackElo", 0))
                    time_control = game.headers.get("TimeControl", "unknown")
                    
                    if all([white_elo, black_elo, time_control != "unknown"]):
                        avg_elo = (white_elo + black_elo) // 2
                        elo_bin = (avg_elo // 50) * 50
                        elo_distribution[elo_bin] = elo_distribution.get(elo_bin, 0) + 1
                        games.append(game)
                        pbar.update(1)
            
            pbar.close()
            
            logging.info("\nELO Distribution:")
            for elo_bin in sorted(elo_distribution.keys()):
                count = elo_distribution[elo_bin]
                logging.info(f"ELO {elo_bin}-{elo_bin+50}: {count} games")
            
        logging.info(f"Successfully collected {len(games)} games")
                           
    except Exception as e:
        logging.error(f"Failed to read PGN file '{pgn_file_path}': {e}")
    
    return games

# Usage
logging.info("Fetching general population games...")
general_games = process_pgn_file(GENERAL_PGN_FILE_PATH, max_games=10000)

# Usage with increased max_games
logging.info("Fetching general population games...")
general_games = process_pgn_file(
    GENERAL_PGN_FILE_PATH, 
    max_games=10000,
)

# Usage
logging.info("Fetching general population games...")
general_games = process_pgn_file(
    GENERAL_PGN_FILE_PATH, 
    max_games=10000,
)

def process_games(games, group_label, engine):
    all_moves = []
    total_games = len(games)
    rated_games = 0
    standard_games = 0
    eval_games = 0
    
    logging.info(f"\nProcessing {total_games} games for {group_label} group")
    
    for game in tqdm(games, desc=f"Processing {group_label} games"):
        try:
            # 1. Filter for standard chess and rated games
            variant = game.headers.get("Variant", "Standard")
            event = game.headers.get("Event", "Unknown")
            
            # Skip non-standard chess games
            if variant.lower() != "standard":
                continue
            standard_games += 1
            
            # Check if game is rated (checking both headers and event description)
            rated = ("rated" in event.lower() or 
                    game.headers.get("Rated", "False").lower() == "true")
            if not rated:
                continue
            rated_games += 1
            
            # 2. Initialize basic game data
            board = game.board()
            game_id = game.headers.get("Site", "Unknown")
            white = game.headers.get("White", "Unknown")
            black = game.headers.get("Black", "Unknown")
            result = game.headers.get("Result", "Unknown")
            white_elo = safe_int(game.headers.get("WhiteElo", None))
            black_elo = safe_int(game.headers.get("BlackElo", None))
            time_control = game.headers.get("TimeControl", "Unknown")
            
            # 3. ADHD player identification
            white_has_adhd = white in ADHD_USERNAMES
            black_has_adhd = black in ADHD_USERNAMES
            
            # 4. Time control parsing
            initial_time, increment, time_category = parse_time_control(time_control)
            
            # 5. Game traversal initialization
            node = game
            move_number = 0
            prev_evaluation = None
            current_material = calculate_material(board)
            prev_time_remaining = None
            prev_winning_chances = None
            
            # 6. Verify game has evaluations
            if not any("%eval" in node.variations[0].comment for node in game.mainline()):
                continue
            eval_games += 1

            # 7. Process moves
            while node.variations:
                next_node = node.variations[0]
                move = next_node.move
                san = board.san(move)
                move_number += 1
                player = "White" if board.turn else "Black"
                
                # Get position evaluation data
                try:
                    position_complexity = elocator.get_position_complexity(board.fen())
                except Exception as e:
                    position_complexity = None
                
                # ADHD status for current move
                is_adhd_move = (player == "White" and white_has_adhd) or \
                              (player == "Black" and black_has_adhd)
                
                # Extract move metadata
                comment = next_node.comment
                time_remaining = parse_clock_time(comment)
                evaluation = parse_evaluation(comment)
                
                # Time calculations
                time_spent = (prev_time_remaining - time_remaining) if all(x is not None for x in [prev_time_remaining, time_remaining]) else None
                time_spent = time_spent if time_spent and time_spent > 0 else None
                
                under_pressure = is_under_time_pressure(
                    time_remaining=time_remaining,
                    initial_time=initial_time,
                    time_spent=time_spent
                )
                
                # Calculate winning chances
                winning_chances = eval_winning_chances(evaluation)
                winning_chances_change = winning_chances - prev_winning_chances if all(x is not None for x in [prev_winning_chances, winning_chances]) else None
                
                # Skip positions without evaluations
                if evaluation is None:
                    board.push(move)
                    node = next_node
                    prev_time_remaining = time_remaining
                    current_material = calculate_material(board)
                    prev_winning_chances = winning_chances
                    continue
                
                # Execute move and calculate changes
                board.push(move)
                
                # Evaluation change calculation
                eval_change = calculate_eval_change(prev_evaluation, evaluation, player)
                if eval_change is not None:
                    eval_change = eval_change * 100

                
                new_material = calculate_material(board)
                material_diff = new_material[player] - current_material[player]
                is_sacrifice = material_diff < 0
                game_phase = categorize_game_phase(board)
                position_complexity_category = categorize_position_complexity(prev_evaluation)
                error_category = categorize_error(eval_change, player)  # Note: passing player here is important
                
                # Compile move data
                move_data = {
                    'game_id': game_id,
                    'event': event,
                    'date': game.headers.get("UTCDate", "Unknown"),
                    'result': result,
                    'white': white,
                    'black': black,
                    'white_elo': white_elo,
                    'black_elo': black_elo,
                    'adhd_player': white if white_has_adhd else (black if black_has_adhd else None),
                    'move_number': move_number,
                    'player': player,
                    'san': san,
                    'fen': board.fen(),
                    'game_phase': game_phase,
                    'is_adhd_move': is_adhd_move,
                    'position_complexity': position_complexity,
                    'position_complexity_category': position_complexity_category,
                    'evaluation': evaluation,
                    'eval_change': eval_change,
                    'error_category': categorize_error(eval_change, player),
                    'winning_chances': winning_chances,
                    'winning_chances_change': winning_chances_change,
                    'material_diff': material_diff,
                    'is_sacrifice': is_sacrifice,
                    'time_control': time_control,
                    'time_control_category': time_category.value if time_category else None,
                    'initial_time_seconds': initial_time,
                    'increment_seconds': increment,
                    'time_remaining': time_remaining,
                    'time_spent': time_spent,
                    'under_time_pressure': under_pressure,
                    'group': group_label
                }
                
                all_moves.append(move_data)
                
                # Update previous values
                prev_evaluation = evaluation
                prev_time_remaining = time_remaining
                current_material = new_material
                prev_winning_chances = winning_chances
                node = next_node
                
        except Exception as e:
            logging.error(f"Error processing game {game_id}: {e}")
            continue
    
    # Processing summary
    logging.info(f"\nProcessing Summary for {group_label}:")
    logging.info(f"Total games: {total_games}")
    logging.info(f"Standard chess games: {standard_games}")
    logging.info(f"Rated games: {rated_games}")
    logging.info(f"Games with evaluations: {eval_games}")
    logging.info(f"Total moves processed: {len(all_moves)}")
    
    # Convert to DataFrame
    moves_df = pd.DataFrame(all_moves)
    
    return moves_df

# If you want to specify a particular column order, you can reorder the DataFrame after creation.s
column_order = [
    'GameID', 'Event', 'Date', 'Result',
    'White', 'Black', 'WhiteElo', 'BlackElo', 'ADHDPlayer',
    'MoveNumber', 'Player', 'SAN', 'GamePhase', 'IsADHDMove',
    'Evaluation', 'EvalChange', 'ErrorCategory', 'PositionComplexity', 'Position Complexity Category'
    'MaterialDiff', 'IsSacrifice',
    'TimeControl', 'TimeControlCategory', 'InitialTimeSeconds',
    'IncrementSeconds', 'TimeRemaining', 'TimeSpent', 'UnderTimePressure',
    'Group', 'MoveCondition'
]


INFO: Fetching general population games...


Collecting games: 100%|██████████| 10000/10000 [04:07<00:00, 40.43it/s]

INFO: 
ELO Distribution:
INFO: ELO 800-850: 3 games
INFO: ELO 850-900: 5 games
INFO: ELO 900-950: 15 games
INFO: ELO 950-1000: 37 games
INFO: ELO 1000-1050: 80 games
INFO: ELO 1050-1100: 97 games
INFO: ELO 1100-1150: 138 games
INFO: ELO 1150-1200: 201 games
INFO: ELO 1200-1250: 235 games
INFO: ELO 1250-1300: 317 games
INFO: ELO 1300-1350: 380 games
INFO: ELO 1350-1400: 433 games
INFO: ELO 1400-1450: 561 games
INFO: ELO 1450-1500: 693 games
INFO: ELO 1500-1550: 665 games
INFO: ELO 1550-1600: 691 games
INFO: ELO 1600-1650: 701 games
INFO: ELO 1650-1700: 737 games
INFO: ELO 1700-1750: 644 games
INFO: ELO 1750-1800: 638 games
INFO: ELO 1800-1850: 578 games
INFO: ELO 1850-1900: 512 games
INFO: ELO 1900-1950: 433 games
INFO: ELO 1950-2000: 348 games
INFO: ELO 2000-2050: 266 games
INFO: ELO 2050-2100: 202 games
INFO: ELO 2100-2150: 139 games
INFO: ELO 2150-2200: 103 games
INFO: ELO 2200-2250: 53 games
INFO: ELO 2250-2300: 37 games
INFO: ELO 2300-2350: 15 games
INFO: ELO 2350-2400: 13 games
IN


Collecting games: 100%|██████████| 10000/10000 [04:08<00:00, 40.31it/s]

INFO: 
ELO Distribution:
INFO: ELO 800-850: 3 games
INFO: ELO 850-900: 7 games
INFO: ELO 900-950: 14 games
INFO: ELO 950-1000: 28 games
INFO: ELO 1000-1050: 75 games
INFO: ELO 1050-1100: 97 games
INFO: ELO 1100-1150: 131 games
INFO: ELO 1150-1200: 197 games
INFO: ELO 1200-1250: 265 games
INFO: ELO 1250-1300: 295 games
INFO: ELO 1300-1350: 386 games
INFO: ELO 1350-1400: 464 games
INFO: ELO 1400-1450: 541 games
INFO: ELO 1450-1500: 675 games
INFO: ELO 1500-1550: 693 games
INFO: ELO 1550-1600: 648 games
INFO: ELO 1600-1650: 660 games
INFO: ELO 1650-1700: 785 games
INFO: ELO 1700-1750: 692 games
INFO: ELO 1750-1800: 615 games
INFO: ELO 1800-1850: 586 games
INFO: ELO 1850-1900: 531 games
INFO: ELO 1900-1950: 452 games
INFO: ELO 1950-2000: 331 games
INFO: ELO 2000-2050: 259 games
INFO: ELO 2050-2100: 210 games
INFO: ELO 2100-2150: 130 games
INFO: ELO 2150-2200: 88 games
INFO: ELO 2200-2250: 53 games
INFO: ELO 2250-2300: 33 games
INFO: ELO 2300-2350: 20 games
INFO: ELO 2350-2400: 9 games
INFO


Collecting games: 100%|██████████| 10000/10000 [04:06<00:00, 40.59it/s]

INFO: 
ELO Distribution:
INFO: ELO 800-850: 3 games
INFO: ELO 850-900: 6 games
INFO: ELO 900-950: 18 games
INFO: ELO 950-1000: 34 games
INFO: ELO 1000-1050: 80 games
INFO: ELO 1050-1100: 106 games
INFO: ELO 1100-1150: 132 games
INFO: ELO 1150-1200: 198 games
INFO: ELO 1200-1250: 260 games
INFO: ELO 1250-1300: 300 games
INFO: ELO 1300-1350: 358 games
INFO: ELO 1350-1400: 462 games
INFO: ELO 1400-1450: 552 games
INFO: ELO 1450-1500: 684 games
INFO: ELO 1500-1550: 695 games
INFO: ELO 1550-1600: 685 games
INFO: ELO 1600-1650: 684 games
INFO: ELO 1650-1700: 742 games
INFO: ELO 1700-1750: 657 games
INFO: ELO 1750-1800: 637 games
INFO: ELO 1800-1850: 554 games
INFO: ELO 1850-1900: 515 games
INFO: ELO 1900-1950: 432 games
INFO: ELO 1950-2000: 348 games
INFO: ELO 2000-2050: 258 games
INFO: ELO 2050-2100: 191 games
INFO: ELO 2100-2150: 145 games
INFO: ELO 2150-2200: 102 games
INFO: ELO 2200-2250: 60 games
INFO: ELO 2250-2300: 36 games
INFO: ELO 2300-2350: 21 games
INFO: ELO 2350-2400: 10 games
I




In [24]:
# ----------------------- 1. Fetch and Process ADHD Players' Games -----------------------

adhd_games = []
for username in ADHD_USERNAMES:
    logging.info(f"Fetching games for user '{username}'...")
    user_games = fetch_lichess_games(username, max_games=4000)  # Adjust max_games as needed
    adhd_games.extend(user_games)

if not adhd_games:
    logging.warning("No ADHD games fetched. Exiting analysis.")
else:
    # Initialize the chess engine
    try:
        engine = chess.engine.SimpleEngine.popen_uci(STOCKFISH_PATH)
        logging.info(f"Initialized Stockfish engine at '{STOCKFISH_PATH}'.")
    except FileNotFoundError:
        logging.critical(f"Stockfish executable not found at '{STOCKFISH_PATH}'. Please update the path.")
        engine = None
    except Exception as e:
        logging.critical(f"Failed to initialize Stockfish engine: {e}")
        engine = None

    if engine is not None:
        # ----------------------- 2. Process ADHD Players' Games -----------------------
        
        logging.info("Processing ADHD players' games...")
        adhd_moves_df = process_games(adhd_games, group_label='ADHD', engine=engine)
        debug_data_pipeline(adhd_moves_df, "ADHD GAMES PROCESSING")
        
        # ----------------------- 3. Fetch and Process General Population Games -----------------------
        
        logging.info("Fetching general population games...")
        if not os.path.exists(GENERAL_PGN_FILE_PATH):
            logging.error(f"PGN file not found at path: {GENERAL_PGN_FILE_PATH}")
            general_games = []
        else:
            general_games = process_pgn_file(GENERAL_PGN_FILE_PATH, max_games = 10000)  # Adjust max_games as needed
        
        if not general_games:
            logging.warning("No General population games to process.")
            general_moves_df = pd.DataFrame()
        else:
            logging.info("Processing general population games...")
            general_moves_df = process_games(general_games, group_label='General', engine=engine)
            debug_data_pipeline(general_moves_df, "GENERAL GAMES PROCESSING")
        
        # ----------------------- 4. Combine Datasets -----------------------

        logging.info("Combining datasets...")
        all_moves_df = pd.concat([adhd_moves_df, general_moves_df], ignore_index=True)
        debug_data_pipeline(all_moves_df, "COMBINED DATASET")

# ----------------------- 5. Data Cleaning -----------------------

logging.info("Cleaning data...")
required_columns = ['time_spent', 'evaluation', 'eval_change', 'white_elo', 'black_elo']
all_moves_df = all_moves_df.dropna(subset=required_columns)

# Ensure 'is_sacrifice' (not 'IsSacrifice') is boolean
all_moves_df['is_sacrifice'] = all_moves_df['is_sacrifice'].fillna(False).astype(bool)

# Convert relevant columns to numeric types
numeric_columns = ['time_spent', 'evaluation', 'eval_change', 'white_elo', 'black_elo']
for col in numeric_columns:
    all_moves_df[col] = pd.to_numeric(all_moves_df[col], errors='coerce')

# Drop rows with NaNs resulted from non-numeric conversion
all_moves_df = all_moves_df.dropna(subset=numeric_columns)

# Create ELO brackets for analysis
all_moves_df['elo_bracket'] = pd.cut(
    all_moves_df.apply(lambda row: max(row['white_elo'], row['black_elo']), axis=1),
    bins=[0, 1200, 1600, 2000, float('inf')],
    labels=['0-1200', '1200-1600', '1600-2000', '2000+']
)

# After cleaning, output the number of moves remaining
logging.info(f"Total number of moves after cleaning: {len(all_moves_df)}")

INFO: Fetching games for user 'teoeo'...
INFO: Fetched 1252 games with evaluations for user 'teoeo'.
INFO: Fetching games for user 'Tobermorey'...
INFO: Fetched 189 games with evaluations for user 'Tobermorey'.
INFO: Fetching games for user 'apostatlet'...
INFO: Fetched 424 games with evaluations for user 'apostatlet'.
INFO: Fetching games for user 'LovePump1000'...
INFO: Fetched 576 games with evaluations for user 'LovePump1000'.
INFO: Fetching games for user 'StuntmanAndy'...
INFO: Fetched 825 games with evaluations for user 'StuntmanAndy'.
INFO: Fetching games for user 'Banfy_B'...
INFO: Fetching games for user 'ChessyChesterton12'...
INFO: Fetched 260 games with evaluations for user 'ChessyChesterton12'.
INFO: Fetching games for user 'yastoon'...
INFO: Fetched 24 games with evaluations for user 'yastoon'.
INFO: Fetching games for user 'Timy1976'...
INFO: Fetching games for user 'SonnyDayz11'...
INFO: Fetched 16 games with evaluations for user 'SonnyDayz11'.
INFO: Fetching games for

Processing ADHD games: 100%|██████████| 3930/3930 [05:52<00:00, 11.15it/s] 

INFO: 
Processing Summary for ADHD:
INFO: Total games: 3930
INFO: Standard chess games: 3671
INFO: Rated games: 2177
INFO: Games with evaluations: 2177
INFO: Total moves processed: 135548





INFO: Debugging ADHD GAMES PROCESSING
INFO: Fetching general population games...


Collecting games: 100%|██████████| 10000/10000 [04:15<00:00, 39.09it/s]

INFO: 
ELO Distribution:
INFO: ELO 800-850: 3 games
INFO: ELO 850-900: 6 games
INFO: ELO 900-950: 16 games
INFO: ELO 950-1000: 28 games
INFO: ELO 1000-1050: 75 games
INFO: ELO 1050-1100: 106 games
INFO: ELO 1100-1150: 125 games
INFO: ELO 1150-1200: 199 games
INFO: ELO 1200-1250: 238 games
INFO: ELO 1250-1300: 315 games
INFO: ELO 1300-1350: 393 games
INFO: ELO 1350-1400: 465 games
INFO: ELO 1400-1450: 548 games
INFO: ELO 1450-1500: 707 games
INFO: ELO 1500-1550: 659 games
INFO: ELO 1550-1600: 672 games
INFO: ELO 1600-1650: 678 games
INFO: ELO 1650-1700: 734 games
INFO: ELO 1700-1750: 662 games
INFO: ELO 1750-1800: 633 games
INFO: ELO 1800-1850: 573 games
INFO: ELO 1850-1900: 509 games
INFO: ELO 1900-1950: 478 games
INFO: ELO 1950-2000: 323 games
INFO: ELO 2000-2050: 240 games
INFO: ELO 2050-2100: 211 games
INFO: ELO 2100-2150: 120 games
INFO: ELO 2150-2200: 96 games
INFO: ELO 2200-2250: 78 games
INFO: ELO 2250-2300: 41 games
INFO: ELO 2300-2350: 20 games
INFO: ELO 2350-2400: 14 games
IN


Processing General games: 100%|██████████| 10000/10000 [27:19<00:00,  6.10it/s]

INFO: 
Processing Summary for General:
INFO: Total games: 10000
INFO: Standard chess games: 10000
INFO: Rated games: 10000
INFO: Games with evaluations: 10000
INFO: Total moves processed: 631299





INFO: Debugging GENERAL GAMES PROCESSING
INFO: Combining datasets...
INFO: Debugging COMBINED DATASET
INFO: Cleaning data...
INFO: Total number of moves after cleaning: 352716


In [25]:
def analyze_error_distribution(df):
    # Print actual value ranges
    print("\nEvaluation Change Statistics:")
    print(df['eval_change'].describe())
    
    # Count error categories
    print("\nError Category Distribution:")
    print(df['error_category'].value_counts(normalize=True).multiply(100).round(1))
    
    # Look at the largest eval changes
    print("\nLargest Evaluation Changes:")
    print(df.nlargest(5, 'eval_change')[['eval_change', 'error_category', 'san']])
    
    # Look at the smallest eval changes
    print("\nSmallest Evaluation Changes:")
    print(df.nsmallest(5, 'eval_change')[['eval_change', 'error_category', 'san']])

# Add after your data cleaning
analyze_error_distribution(all_moves_df)


Evaluation Change Statistics:
count    3.527160e+05
mean    -7.529939e+03
std      8.989324e+04
min     -1.005377e+06
25%     -7.900000e+01
50%     -2.200000e+01
75%     -4.000000e+00
max      9.993540e+05
Name: eval_change, dtype: float64

Error Category Distribution:
error_category
Normal        82.7
Inaccuracy     9.4
Blunder        4.4
Mistake        3.5
Name: proportion, dtype: float64

Largest Evaluation Changes:
        eval_change error_category  san
109659     999354.0        Blunder  Kg8
13428      999226.0        Blunder  Rf7
40739      999011.0         Normal  Ke2
30470      999009.0         Normal  Re8
16177      998999.0        Blunder   g5

Smallest Evaluation Changes:
        eval_change error_category   san
487739   -1005377.0        Blunder   Qa8
725849   -1005140.0         Normal  Qf2+
213712   -1005112.0         Normal  Rg5+
86255    -1002013.0        Blunder  Qf5+
328354   -1001949.0        Blunder   Qd5
