# Module imports

In [96]:
%pip install pandas matplotlib scikit-learn --quiet
# Install and import the necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import sqlite3
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Supress warnings
import warnings
warnings.filterwarnings('ignore')


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


# Loading the data

In [97]:
# Define the path to the database and the table name (from my private darts-data-tracker repo)
database_url = '../../darts-data-tracker/darts_stats.db'
game_table_name = 'darts_overall_match_stats'

In [98]:
def load_data():
    """
    Load the data from the database and return a pandas dataframe

    Returns:
        df (pd.DataFrame): A pandas dataframe containing the data from the database
    """
    conn = sqlite3.connect(database_url)
    df = pd.read_sql_query(f"SELECT * FROM {game_table_name}", conn)
    conn.close()
    return df

def convert_date_and_time(df):
    """
    Convert date and time columns into a combined datetime column, replace original columns, and sort.
    
    Args:
        df (pd.DataFrame): Input dataframe with separate date and time columns
        
    Returns:
        pd.DataFrame: Modified dataframe with combined datetime column in original date position
    """
    # Convert string dates to proper datetime objects
    df['date'] = pd.to_datetime(df['date'], format='%d %b, %Y')
    
    # Clean time values by extracting just hours:minutes
    df['time'] = df['time'].str.extract(r'(\d{2}:\d{2})', expand=False)
    
    # Merge date and time into single datetime column
    df['datetime'] = pd.to_datetime(
        df['date'].dt.strftime('%Y-%m-%d') + ' ' + df['time'],
        format='%Y-%m-%d %H:%M'
    )
    
    # Record original position of date column for later insertion
    original_column_position = df.columns.get_loc('date')
    
    # Remove obsolete columns
    df = df.drop(['date', 'time'], axis=1)
    
    # Reorganize columns to place datetime where date was originally
    column_list = df.columns.tolist()
    column_list.remove('datetime')  
    column_list.insert(original_column_position, 'datetime')  
    df = df[column_list]
    
    # Sort chronologically and reset index
    df = df.sort_values(by='datetime').reset_index(drop=True)
    
    return df

# Load the data
darts_matches = load_data()

# Apply the conversion and sorting
darts_matches = convert_date_and_time(darts_matches)

# Data Preprocessing

In [99]:
def drop_unnecessary_columns(df):
    """
    Drop unnecessary columns from the dataframe

    Args:
        df (pd.DataFrame): The dataframe to drop columns from

    Returns:
        pd.DataFrame: The dataframe with the specified columns dropped
    """
    try:
        df = df.drop(['uid', 'event_title', 'leg_count'], axis=1)
    except KeyError:
        pass
    return df

def create_dual_perspective_df(df):
    """
    Transforms match data to include both player perspectives per match.
    
    Creates two rows per match:
    1. Home team as player, away team as opponent
    2. Away team as player, home team as opponent
    
    Args:
        df (pd.DataFrame): Original dataframe with home/away columns
    
    Returns:
        pd.DataFrame: Restructured dataframe with player/opponent perspectives
    """
    
    # Create home team perspective
    home = df.rename(columns=lambda x: x.replace('home_team', 'player').replace('away_team', 'opponent'))
    
    # Create away team perspective with column swaps
    away = df.rename(columns=lambda x: x.replace('away_team', 'player').replace('home_team', 'opponent'))
    
    # Combine both perspectives
    dual_df = pd.concat([home, away], ignore_index=True)
    
    # Create target variable
    dual_df['player_won'] = (dual_df['player_legs_won'] > dual_df['opponent_legs_won']).astype(int)
    
    # Clean column order
    base_cols = ['uid', 'event_title', 'leg_count', 'datetime']
    player_cols = [c for c in dual_df if c.startswith('player')]
    opponent_cols = [c for c in dual_df if c.startswith('opponent')]
    
    df = dual_df[base_cols + player_cols + opponent_cols]
    df.sort_values(by='datetime', inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df

def manage_checkout_columns(df):
    """
    Split checkouts column into attempts and successful checkouts. 
    Drop checkouts of 100+ from the dataframe.
    Handles invalid checkout formats and zero denominators.
    Organizes checkout columns at end of player/opponent sections.
    Moves target variable to end and renames it.

    Args:
        df (pd.DataFrame): The dataframe to process

    Returns:
        pd.DataFrame: Processed dataframe with organized columns
    """
    # Remove 100+ checkout columns
    df = df.drop(['player_checkouts_100_plus', 'opponent_checkouts_100_plus'], axis=1)

    def split_checkout(col):
        """Split checkout column into made/attempted with error handling"""
        result = col.str.split('/', expand=True)
        return (
            result[0].fillna('0').astype(int),  # Made
            result[1].fillna('0').astype(int)   # Attempted
        )

    # Process player checkouts
    player_made, player_attempted = split_checkout(df['player_checkouts'])
    df = df.assign(
        player_checkouts_made=player_made,
        player_checkouts_attempted=player_attempted,
        player_checkout_percentage=np.where(
            player_attempted > 0,
            player_made / player_attempted * 100,
            0.0
        )
    )

    # Process opponent checkouts
    opponent_made, opponent_attempted = split_checkout(df['opponent_checkouts'])
    df = df.assign(
        opponent_checkouts_made=opponent_made,
        opponent_checkouts_attempted=opponent_attempted,
        opponent_checkout_percentage=np.where(
            opponent_attempted > 0,
            opponent_made / opponent_attempted * 100,
            0.0
        )
    )

    # Remove original checkout columns
    df = df.drop(['player_checkouts', 'opponent_checkouts'], axis=1)
    
    # Rename target variable
    df = df.rename(columns={'player_won': 'did_player_win'})

    # Reorganize column order
    cols = df.columns.tolist()
    
    # Columns to reposition
    player_cols = ['player_checkouts_made', 'player_checkouts_attempted']
    opponent_cols = ['opponent_checkouts_made', 'opponent_checkouts_attempted']
    target_col = 'did_player_win'
    
    # Remove from current positions
    cols = [c for c in cols if c not in player_cols + opponent_cols + [target_col]]
    
    # Insert player checkout columns after player_checkout_percentage
    player_cp_idx = cols.index('player_checkout_percentage')
    cols[player_cp_idx+1:player_cp_idx+1] = player_cols
    
    # Insert opponent checkout columns after opponent_checkout_percentage
    opponent_cp_idx = cols.index('opponent_checkout_percentage')
    cols[opponent_cp_idx+1:opponent_cp_idx+1] = opponent_cols
    
    # Add target column at end
    cols.append(target_col)
    
    return df[cols]

def convert_legs_won_columns_to_int(df):
    """
    Convert legs_won columns to int

    Args:
        df (pd.DataFrame): The dataframe to convert

    Returns:
        pd.DataFrame: The converted dataframe
    """
    df['player_legs_won'] = df['player_legs_won'].astype(int)
    df['opponent_legs_won'] = df['opponent_legs_won'].astype(int)
    return df

# darts_matches = drop_unnecessary_columns(darts_matches)
darts_matches = create_dual_perspective_df(darts_matches)
darts_matches = manage_checkout_columns(darts_matches)
darts_matches = convert_legs_won_columns_to_int(darts_matches)

In [100]:
# Print columns and their types
for column in darts_matches.columns:
    print(f"{column}: {darts_matches[column].dtype}")

uid: object
event_title: object
leg_count: object
datetime: datetime64[ns]
player_name: object
player_legs_won: int64
player_average: float64
player_100_plus_thrown: int64
player_140_plus_thrown: int64
player_180_thrown: int64
player_highest_checkout: int64
player_checkout_percentage: float64
player_checkouts_made: int64
player_checkouts_attempted: int64
opponent_name: object
opponent_legs_won: int64
opponent_average: float64
opponent_100_plus_thrown: int64
opponent_140_plus_thrown: int64
opponent_180_thrown: int64
opponent_highest_checkout: int64
opponent_checkout_percentage: float64
opponent_checkouts_made: int64
opponent_checkouts_attempted: int64
did_player_win: int64


# Feature Engineering

In [101]:
def drop_opponent_columns(df):
    """
    Drop any columns that start with 'opponent_' except for 'opponent_name'.

    Args:
        df (pd.DataFrame): The dataframe from which to drop columns.

    Returns:
        pd.DataFrame: The dataframe with specified opponent columns dropped.
    """
    return df.drop(columns=[col for col in df.columns if col.startswith('opponent_') and col != 'opponent_name'])

# Preprocessing
darts_matches = drop_opponent_columns(darts_matches)

# Print the shape of the dataframe where player_checkouts_attempted is 0 but player_checkouts_made is not 0
temp_more_checkouts_than_attempts = darts_matches[
    (darts_matches['player_checkouts_attempted'] < darts_matches['player_checkouts_made'])
]
print(f'Records where checkouts are not tracked properly: {temp_more_checkouts_than_attempts.shape}')

Records where checkouts are not tracked properly: (635, 16)


In [102]:
def regression_impute_checkout_attempts(df, random_state=9):
    """
    Fixes invalid checkout attempts using regression on valid patterns
    (More appropriate than logistic regression for continuous outcomes)

    Args:
        df (pd.DataFrame): The dataframe to process
        random_state (int): The random state for reproducibility

    Returns:
        pd.DataFrame: The processed dataframe with invalid checkout attempts fixed
    """
    df = df.copy()
    
    # Identify invalid cases
    invalid_mask = (df['player_checkouts_attempted'] < df['player_checkouts_made']) | \
                   ((df['player_checkouts_attempted'] == 0) & (df['player_checkouts_made'] > 0))
    
    if not invalid_mask.any():
        print("No invalid checkout attempts found")
        return df

    # Prepare valid/invalid data splits
    valid_data = df[~invalid_mask].dropna(subset=['player_checkouts_attempted'])
    invalid_data = df[invalid_mask]
    
    if valid_data.empty or invalid_data.empty:
        return df

    # Feature engineering
    features = [
        'player_checkouts_made',
        'player_legs_won', 
        'player_average',
        'player_100_plus_thrown',
        'player_140_plus_thrown',
        'player_180_thrown',
        'player_highest_checkout',
        'did_player_win'
    ]
    
    # Regression pipeline with regularization
    model = Pipeline([
        ('scaler', StandardScaler()),
        ('regressor', Ridge(alpha=1.0))
    ])
    
    # Train on valid data
    model.fit(valid_data[features], valid_data['player_checkouts_attempted'])
    
    # Predict attempts for invalid cases
    pred_attempts = model.predict(invalid_data[features])
    pred_attempts = np.ceil(pred_attempts).astype(int)
    
    # Apply logical constraints
    min_attempts = np.where(
        invalid_data['player_checkouts_made'] > 0,
        invalid_data['player_checkouts_made'] + 1, 
        1 
    )
    final_attempts = np.maximum(pred_attempts, min_attempts)
    
    # Update dataframe
    df.loc[invalid_mask, 'player_checkouts_attempted'] = final_attempts
    
    # Final validation and cleanup
    df['player_checkout_percentage'] = np.where(
        df['player_checkouts_attempted'] > 0,
        (df['player_checkouts_made'] / df['player_checkouts_attempted']) * 100,
        0
    ).clip(0, 100).round(1)
    
    # Ensure no remaining invalid cases
    final_check = df[df['player_checkouts_attempted'] < df['player_checkouts_made']]
    if not final_check.empty:
        df.loc[final_check.index, 'player_checkouts_attempted'] = \
            final_check['player_checkouts_made'] + 1
    
    print(f"Corrected {invalid_mask.sum()} invalid rows")
    return df

darts_matches = regression_impute_checkout_attempts(darts_matches)

Corrected 635 invalid rows


In [103]:
def get_historical_matches(df, current_date, player_name):
    """
    Get all matches for a player before the current date, sorted chronologically.

    Args:
        df (pd.DataFrame): The dataframe containing match data.
        current_date (datetime): The date to filter matches before.
        player_name (str): The name of the player for whom to retrieve matches.

    Returns:
        pd.DataFrame: A dataframe containing historical matches for the specified player.
    """
    return df[
        (df['player_name'] == player_name) &
        (df['datetime'] < current_date)
    ].sort_values('datetime')

def calculate_averages(historical_df, base_features, n_windows=[6], min_matches=1):
    """
    Calculate averages ensuring no NaN propagation.

    Args:
        historical_df (pd.DataFrame): The dataframe containing historical match data.
        base_features (list): List of features for which to calculate averages.
        n_windows (list): List of window sizes for rolling averages.
        min_matches (int): Minimum number of matches required to calculate rolling averages.

    Returns:
        dict: A dictionary containing calculated averages.
    """
    stats = {}
    
    # Career averages (all available history)
    if len(historical_df) >= 1:  # Need at least 1 match
        for feat in base_features:
            stats[f'career_avg_{feat}'] = historical_df[feat].mean()
    else:
        for feat in base_features:
            stats[f'career_avg_{feat}'] = None
    
    # Rolling averages
    for n in n_windows:
        window_df = historical_df.tail(n)
        if len(window_df) >= min_matches:
            for feat in base_features:
                stats[f'rolling_{n}_avg_{feat}'] = window_df[feat].mean()
        else:
            for feat in base_features:
                stats[f'rolling_{n}_avg_{feat}'] = None
    
    return stats

def append_stats(df, base_features, n_windows=[6], min_matches=1):
    """
    Perform row-by-row calculation that never references new columns.

    Args:
        df (pd.DataFrame): The dataframe to which stats will be appended.
        base_features (list): List of features for which to calculate averages.
        n_windows (list): List of window sizes for rolling averages.
        min_matches (int): Minimum number of matches required to calculate rolling averages.

    Returns:
        pd.DataFrame: The dataframe with appended statistics.
    """
    # Create copy to avoid modifying original dataframe
    result_df = df.copy()
    
    # Pre-initialize all stat columns with None
    stat_columns = []
    for feat in base_features:
        result_df[f'career_avg_{feat}'] = None
        stat_columns.append(f'career_avg_{feat}')
    for n in n_windows:
        for feat in base_features:
            col_name = f'rolling_{n}_avg_{feat}'
            result_df[col_name] = None
            stat_columns.append(col_name)
    
    # Process each row
    total = len(result_df)
    for idx, row in result_df.iterrows():
        # Get historical data using ORIGINAL dataframe
        historical = get_historical_matches(
            df,  # Use original DF without any stat columns
            row['datetime'],
            row['player_name']
        )
        
        # Calculate stats
        stats = calculate_averages(historical, base_features, n_windows, min_matches)
        
        # Update only the stat columns in result_df
        for col in stat_columns:
            result_df.at[idx, col] = stats.get(col, None)
        
        # Progress tracking
        if idx % 10 == 0:
            print(f'\rProcessed {idx+1}/{total} rows...', end=' ')
    return result_df

# Legs won is not included as it could be deceptive given match formats with different numbers of legs
BASE_FEATURES = [
    'player_average', 'player_100_plus_thrown',
    'player_140_plus_thrown', 'player_180_thrown', 'player_highest_checkout',
    'player_checkout_percentage', 'player_checkouts_made', 'player_checkouts_attempted'
]

# Apply the calculation
darts_matches = append_stats(
    df=darts_matches,
    base_features=BASE_FEATURES,
    n_windows=[6], 
    min_matches=1   
)

# Print columns after processing
print("\nColumns after processing:", darts_matches.columns.tolist())

Processed 4551/4556 rows...      
Columns after processing: ['uid', 'event_title', 'leg_count', 'datetime', 'player_name', 'player_legs_won', 'player_average', 'player_100_plus_thrown', 'player_140_plus_thrown', 'player_180_thrown', 'player_highest_checkout', 'player_checkout_percentage', 'player_checkouts_made', 'player_checkouts_attempted', 'opponent_name', 'did_player_win', 'career_avg_player_average', 'career_avg_player_100_plus_thrown', 'career_avg_player_140_plus_thrown', 'career_avg_player_180_thrown', 'career_avg_player_highest_checkout', 'career_avg_player_checkout_percentage', 'career_avg_player_checkouts_made', 'career_avg_player_checkouts_attempted', 'rolling_6_avg_player_average', 'rolling_6_avg_player_100_plus_thrown', 'rolling_6_avg_player_140_plus_thrown', 'rolling_6_avg_player_180_thrown', 'rolling_6_avg_player_highest_checkout', 'rolling_6_avg_player_checkout_percentage', 'rolling_6_avg_player_checkouts_made', 'rolling_6_avg_player_checkouts_attempted']


In [104]:
def add_h2h_record(df):
    """
    Adds historical head-to-head (H2H) records between players based on previous matches.
    Operates on the dual perspective dataframe to maintain match symmetry.
    
    Args:
        df (pd.DataFrame): Dual perspective dataframe with 'player_name', 'opponent_name', 
                          'did_player_win', and 'datetime' columns
                          
    Returns:
        pd.DataFrame: Original dataframe with added 'previous_h2h_wins' and 'previous_h2h_losses' columns
    """
    
    # Create working copy to preserve original data
    temp_df = df.copy()
    
    # Sort matches chronologically for accurate cumulative counts
    temp_df = temp_df.sort_values('datetime').reset_index(drop=True)
    
    # Track original index for final reordering
    temp_df['original_index'] = temp_df.index
    
    # Calculate cumulative match counts and wins for each player-opponent pair
    temp_df['cum_matches'] = temp_df.groupby(['player_name', 'opponent_name']).cumcount() + 1
    temp_df['cum_wins'] = temp_df.groupby(['player_name', 'opponent_name'])['did_player_win'].cumsum()
    
    # Shift results to exclude current match from H2H counts
    temp_df['previous_h2h_wins'] = temp_df.groupby(['player_name', 'opponent_name'])['cum_wins'].shift(1).fillna(0).astype(int)
    temp_df['previous_h2h_losses'] = (
        temp_df.groupby(['player_name', 'opponent_name'])['cum_matches'].shift(1).fillna(0) 
        - temp_df['previous_h2h_wins']
    ).astype(int)
    
    # Clean up temporary columns
    temp_df = temp_df.drop(['cum_matches', 'cum_wins', 'original_index'], axis=1)

    # Calculate the difference between wins and losses
    temp_df['previous_h2h_differential'] = temp_df['previous_h2h_wins'] - temp_df['previous_h2h_losses']
    
    # Reorder to match original input sequence
    return temp_df.sort_index()

darts_matches = add_h2h_record(darts_matches)

# Preparing model input format

In [105]:
# Rename the 'leg_count' column to 'is_set_play'
darts_matches = darts_matches.rename(columns={'leg_count': 'is_set_play'})

# Update 'is_set_play' to 1 if the string contains 'set' (case-insensitive), 0 otherwise
try:
    darts_matches['is_set_play'] = darts_matches['is_set_play'].str.contains('set', case=False, na=False).astype(int)
except:
    pass

with pd.option_context('display.max_columns', None):
    display(darts_matches)

Unnamed: 0,uid,event_title,is_set_play,datetime,player_name,player_legs_won,player_average,player_100_plus_thrown,player_140_plus_thrown,player_180_thrown,player_highest_checkout,player_checkout_percentage,player_checkouts_made,player_checkouts_attempted,opponent_name,did_player_win,career_avg_player_average,career_avg_player_100_plus_thrown,career_avg_player_140_plus_thrown,career_avg_player_180_thrown,career_avg_player_highest_checkout,career_avg_player_checkout_percentage,career_avg_player_checkouts_made,career_avg_player_checkouts_attempted,rolling_6_avg_player_average,rolling_6_avg_player_100_plus_thrown,rolling_6_avg_player_140_plus_thrown,rolling_6_avg_player_180_thrown,rolling_6_avg_player_highest_checkout,rolling_6_avg_player_checkout_percentage,rolling_6_avg_player_checkouts_made,rolling_6_avg_player_checkouts_attempted,previous_h2h_wins,previous_h2h_losses,previous_h2h_differential
0,17_Jan_2024_16:10_GMT+0000_Peter_Wright_Haruki...,International | Bahrain Darts Masters 2024,0,2024-01-17 16:10:00,Peter Wright,6,91.69,17,8,1,40,46.2,6,13,Haruki Muramatsu,1,,,,,,,,,,,,,,,,,0,0,0
1,17_Jan_2024_16:10_GMT+0000_Peter_Wright_Haruki...,International | Bahrain Darts Masters 2024,0,2024-01-17 16:10:00,Haruki Muramatsu,5,87.11,15,4,2,170,35.7,5,14,Peter Wright,0,,,,,,,,,,,,,,,,,0,0,0
2,17_Jan_2024_16:50_GMT+0000_Gerwyn_Price_Reynal...,International | Bahrain Darts Masters 2024,0,2024-01-17 16:50:00,Gerwyn Price,6,92.13,8,7,3,93,28.6,6,21,Reynaldo Rivera,1,,,,,,,,,,,,,,,,,0,0,0
3,17_Jan_2024_16:50_GMT+0000_Gerwyn_Price_Reynal...,International | Bahrain Darts Masters 2024,0,2024-01-17 16:50:00,Reynaldo Rivera,4,94.86,16,11,0,104,33.3,4,12,Gerwyn Price,0,,,,,,,,,,,,,,,,,0,0,0
4,17_Jan_2024_17:30_GMT+0000_Nathan_Aspinall_Lou...,International | Bahrain Darts Masters 2024,0,2024-01-17 17:30:00,Nathan Aspinall,6,97.20,14,8,2,102,28.6,6,21,Lourence Ilagan,1,,,,,,,,,,,,,,,,,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4551,16_Jan_2025_18:10_GMT+0000_Luke_Humphries_Step...,International | Bahrain Darts Masters 2025,0,2025-01-16 18:10:00,Luke Humphries,6,94.76,19,6,5,123,50.0,6,12,Stephen Bunting,0,99.200132,12.822368,8.098684,3.690789,107.638158,45.143421,6.460526,15.289474,99.405,19.333333,10.333333,3.5,143.833333,53.8,8.833333,18.833333,4,3,1
4552,16_Jan_2025_18:50_GMT+0000_Gerwyn_Price_Peter_...,International | Bahrain Darts Masters 2025,0,2025-01-16 18:50:00,Peter Wright,4,91.15,16,6,4,36,18.2,4,22,Gerwyn Price,0,93.277788,13.625,7.144231,2.375,90.048077,41.086538,4.884615,12.653846,93.261667,23.166667,11.666667,4.5,113.5,45.0,9.833333,22.5,2,4,-2
4553,16_Jan_2025_18:50_GMT+0000_Gerwyn_Price_Peter_...,International | Bahrain Darts Masters 2025,0,2025-01-16 18:50:00,Gerwyn Price,7,92.58,13,4,3,124,38.9,7,18,Peter Wright,1,97.705859,12.272727,7.070707,3.242424,98.777778,42.687879,5.616162,14.717172,96.155,24.166667,11.5,6.166667,112.833333,37.433333,11.166667,33.666667,4,2,2
4554,16_Jan_2025_19:30_GMT+0000_Stephen_Bunting_Ger...,International | Bahrain Darts Masters 2025,0,2025-01-16 19:30:00,Stephen Bunting,8,99.33,13,8,5,85,47.1,8,17,Gerwyn Price,1,97.212959,13.357143,8.622449,3.377551,101.612245,43.832653,5.918367,14.336735,99.888333,19.833333,13.833333,4.833333,111.0,51.0,9.833333,21.833333,1,2,-1


In [106]:
def get_opponent_match_stats(row):
    """
    Get the opponent match stats from the row

    Args:
        row (pd.Series): The row to get the opponent match stats from

    Returns:
        pd.DataFrame: The opponent match stats
    """
    match_uid = row['uid']
    opponent_name = row['opponent_name']

    opponent_match_stats = darts_matches[
        (darts_matches['uid'] == match_uid) &
        (darts_matches['player_name'] == opponent_name)
    ]

    if opponent_match_stats.empty:
        print(f'Warning: No opponent stats found for {opponent_name} in match {match_uid}')
        return None
    
    if len(opponent_match_stats) > 1:
        print(f'Warning: Multiple rows found for {opponent_name} in match {match_uid}, using first')
    
    return opponent_match_stats.iloc[[0]] 

def append_opponent_match_stats(df):
    """
    Append the opponent match stats to the dataframe

    Args:
        df (pd.DataFrame): The dataframe to append the opponent match stats to

    Returns:
        pd.DataFrame: The dataframe with the opponent match stats appended
    """
    # Pre-identify columns to rename to avoid processing multiple times
    career_cols = [col for col in df.columns if col.startswith('career_avg_')]
    rolling_cols = [col for col in df.columns if col.startswith('rolling_')]
    all_opponent_cols = [f'opponent_{col}' for col in career_cols + rolling_cols]
    
    # Ensure all opponent columns exist in the dataframe with float type to prevent type issues
    for col in all_opponent_cols:
        if col not in df:
            df[col] = np.nan  # Initialize with NaN float
    
    total_rows = df.shape[0]
    for index, row in df.iterrows():
        opponent_match_stats = get_opponent_match_stats(row)
        if opponent_match_stats is None:
            continue
        
        # Extract the single row as a Series
        opponent_stats = opponent_match_stats.iloc[0]
        
        # Iterate over predefined opponent columns to ensure consistency
        for orig_col in career_cols + rolling_cols:
            opponent_col = f'opponent_{orig_col}'
            value = opponent_stats.get(orig_col)
            if value is not None:
                # Ensure the value is a float to prevent dtype issues
                try:
                    df.at[index, opponent_col] = float(value)
                except (TypeError, ValueError) as e:
                    print(f"Error setting {opponent_col} for row {index}: {e}")
                    df.at[index, opponent_col] = np.nan
        
        print(f'Processing row {index + 1}/{total_rows}', end='\r')
    
    print()
    return df

darts_matches = append_opponent_match_stats(darts_matches)

Processing row 4556/4556


In [107]:
def reorganise_columns(df):
    """Reorganises the dataframe columns into logical groups while preserving order within categories.
    
    Args:
        df (pd.DataFrame): The dataframe to be reorganised.
        
    Returns:
        pd.DataFrame: The dataframe with columns reorganised.
    """
    # Get current column list
    all_columns = df.columns.tolist()
    
    # Move 'opponent_name' to come directly after 'player_name'
    if 'opponent_name' in all_columns:
        # Remove opponent_name from its current position
        all_columns.remove('opponent_name')
        # Find player_name's index
        player_idx = all_columns.index('player_name')
        # Insert opponent_name right after player_name
        all_columns.insert(player_idx + 1, 'opponent_name')
    
    # 1. Preserve columns up to opponent_name (now after player_name)
    opponent_idx = all_columns.index('opponent_name')
    fixed_columns = all_columns[:opponent_idx + 1]  # Includes up to opponent_name

    #2. Collect player match cols (preserve original order)
    player_match_cols = [c for c in all_columns if c.startswith('player_') and c != 'player_name']
    
    # 3. Collect career averages (preserve original order)
    career_cols = [c for c in all_columns if c.startswith('career_avg_')]
    
    # 4. Collect rolling averages (preserve original order)
    rolling_cols = [c for c in all_columns if c.startswith('rolling_')]
    
    # 5. Collect H2H columns
    h2h_cols = [c for c in all_columns if c.startswith('previous_h2h')]
    
    # 6. Collect opponent match stats columns
    opponent_cols = [c for c in all_columns if c.startswith('opponent_') and c != 'opponent_name']
    
    # 7. Final columns (did_player_win and any remaining)
    final_cols = ['did_player_win'] + \
                 [c for c in all_columns if c not in fixed_columns + player_match_cols + career_cols + rolling_cols + opponent_cols + h2h_cols + ['did_player_win']]

    # Combine all sections
    new_order = fixed_columns + player_match_cols + career_cols + rolling_cols + opponent_cols + h2h_cols + final_cols
    
    return df[new_order]

# Apply reorganisation
darts_matches = reorganise_columns(darts_matches)

# Verify new order
print(darts_matches.columns.tolist())

['uid', 'event_title', 'is_set_play', 'datetime', 'player_name', 'opponent_name', 'player_legs_won', 'player_average', 'player_100_plus_thrown', 'player_140_plus_thrown', 'player_180_thrown', 'player_highest_checkout', 'player_checkout_percentage', 'player_checkouts_made', 'player_checkouts_attempted', 'career_avg_player_average', 'career_avg_player_100_plus_thrown', 'career_avg_player_140_plus_thrown', 'career_avg_player_180_thrown', 'career_avg_player_highest_checkout', 'career_avg_player_checkout_percentage', 'career_avg_player_checkouts_made', 'career_avg_player_checkouts_attempted', 'rolling_6_avg_player_average', 'rolling_6_avg_player_100_plus_thrown', 'rolling_6_avg_player_140_plus_thrown', 'rolling_6_avg_player_180_thrown', 'rolling_6_avg_player_highest_checkout', 'rolling_6_avg_player_checkout_percentage', 'rolling_6_avg_player_checkouts_made', 'rolling_6_avg_player_checkouts_attempted', 'opponent_career_avg_player_average', 'opponent_career_avg_player_100_plus_thrown', 'oppon

In [108]:
# Specify here any tournaments to drop from the model. For example, I am going to be using
# the 2025 PDC World Championship as an evaluation set to compare to my ELO model, so I will drop
# all the 2025 PDC World Championship matches from the model.

DROP_TOURNAMENTS = ['International | PDC World Championship 2025']

def drop_tournaments(df, tournaments_to_drop):
    """
    Drop the tournaments from the dataframe

    Args:
        df (pd.DataFrame): The dataframe to drop the tournaments from
        tournaments_to_drop (list): The tournaments to drop

    Returns:
        pd.DataFrame: The dataframe with the tournaments dropped
    """
    try:
        return df[~df['event_title'].isin(tournaments_to_drop)]
    except Exception as e:
        print(f'Error dropping tournaments: {e}')
        return df

darts_matches = drop_tournaments(darts_matches, DROP_TOURNAMENTS)

In [109]:
def drop_unnecessary_model_columns(df):
    """
    Drop unnecessary columns for the model

    Args:
        df (pd.DataFrame): The dataframe to drop the columns from

    Returns:
        pd.DataFrame: The dataframe with the unnecessary columns dropped
    """
    # Drop uid, event_title, datetime, opponent_name and all columns beginning with 'player_
    player_column_names = [col for col in df.columns if col.startswith('player_')]
    try:
        return df.drop(['uid', 'event_title', 'datetime', 'opponent_name'] + player_column_names, axis=1)
    except Exception as e:
        print(f'Error dropping columns: {e}')
        return df
    
def drop_nan_rows(df):
    """
    Drop any rows with nan values

    Args:
        df (pd.DataFrame): The dataframe to drop the rows from

    Returns:
        pd.DataFrame: The dataframe with the nan rows dropped
    """
    return df.dropna()

def convert_dtypes(df):
    """Convert object columns to numeric types where possible."""
    for col in df.columns:
        if df[col].dtype == object:
            # Attempt conversion to numeric
            try:
                # First try converting directly to float
                df[col] = pd.to_numeric(df[col], errors='raise')
                
                # Check if all values are integers
                if (df[col].dropna() % 1 == 0).all():
                    df[col] = df[col].astype('Int64')  # Nullable integer type
                else:
                    df[col] = df[col].astype(float)
            except (ValueError, TypeError):
                # Leave as object if conversion fails
                pass
    return df

darts_matches = drop_unnecessary_model_columns(darts_matches)
darts_matches = drop_nan_rows(darts_matches)
darts_matches = convert_dtypes(darts_matches)

with pd.option_context('display.max_columns', None):
    display(darts_matches.head(1))

# Print df shape
print(f'DataFrame shape: {darts_matches.shape}')

Unnamed: 0,is_set_play,career_avg_player_average,career_avg_player_100_plus_thrown,career_avg_player_140_plus_thrown,career_avg_player_180_thrown,career_avg_player_highest_checkout,career_avg_player_checkout_percentage,career_avg_player_checkouts_made,career_avg_player_checkouts_attempted,rolling_6_avg_player_average,rolling_6_avg_player_100_plus_thrown,rolling_6_avg_player_140_plus_thrown,rolling_6_avg_player_180_thrown,rolling_6_avg_player_highest_checkout,rolling_6_avg_player_checkout_percentage,rolling_6_avg_player_checkouts_made,rolling_6_avg_player_checkouts_attempted,opponent_career_avg_player_average,opponent_career_avg_player_100_plus_thrown,opponent_career_avg_player_140_plus_thrown,opponent_career_avg_player_180_thrown,opponent_career_avg_player_highest_checkout,opponent_career_avg_player_checkout_percentage,opponent_career_avg_player_checkouts_made,opponent_career_avg_player_checkouts_attempted,opponent_rolling_6_avg_player_average,opponent_rolling_6_avg_player_100_plus_thrown,opponent_rolling_6_avg_player_140_plus_thrown,opponent_rolling_6_avg_player_180_thrown,opponent_rolling_6_avg_player_highest_checkout,opponent_rolling_6_avg_player_checkout_percentage,opponent_rolling_6_avg_player_checkouts_made,opponent_rolling_6_avg_player_checkouts_attempted,previous_h2h_wins,previous_h2h_losses,previous_h2h_differential,did_player_win
16,0,98.02,10.0,2.0,4.0,108.0,100.0,6.0,6.0,98.02,10.0,2.0,4.0,108.0,100.0,6.0,6.0,92.13,8.0,7.0,3.0,93.0,28.6,6.0,21.0,92.13,8.0,7.0,3.0,93.0,28.6,6.0,21.0,0,0,0,0


DataFrame shape: (3894, 37)
