In [78]:
%pip install pandas
import sqlite3
import pandas as pd
import json


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [66]:
# Path of the SQLite database file handed to read_db_into_df (relative path).
database_url = '../../darts-data-tracker/darts_stats.db'
# Table read into `matches_df` (one row per match).
game_table_name = 'darts_overall_match_stats'
# Table read into `throw_df` (one row per visit to the board).
throw_table_name = 'darts_individual_throw_stats'

In [67]:
def read_db_into_df(database_url, table_name):
    """Read every row of one SQLite table into a pandas DataFrame.

    The connection is opened for the duration of the query only and is always
    closed afterwards, including when the query raises.

    Args:
        database_url (str): The file path to the SQLite database.
        table_name (str): The name of the table to read from the database. It is
            interpolated into the SQL text as-is, so it must come from trusted
            code (table names cannot be passed as bound parameters).

    Returns:
        pd.DataFrame: A DataFrame containing the data from the specified table.
    """
    conn = sqlite3.connect(database_url)
    try:
        return pd.read_sql_query(f"SELECT * FROM {table_name}", conn)
    finally:
        # sqlite3 connections are not closed by garbage collection at a
        # predictable time, so release the file handle explicitly.
        conn.close()

def get_unique_uids(df):
    """Return the distinct values of the 'uid' column.

    Args:
        df (pd.DataFrame): Any DataFrame that has a 'uid' column.

    Returns:
        Array of the distinct 'uid' values, in order of first appearance
        (the result of ``Series.unique()``).
    """
    uid_column = df['uid']
    return uid_column.unique()

def filter_df_by_uid(df, uid):
    """Select the rows whose 'uid' column equals the given value.

    Args:
        df (pd.DataFrame): The DataFrame to filter; must have a 'uid' column.
        uid (str): The user ID to keep.

    Returns:
        pd.DataFrame: The matching rows, with their original index labels.
    """
    has_uid = df['uid'].eq(uid)
    return df.loc[has_uid]

# Load the full throw table and preview its first rows.
throw_df = read_db_into_df(database_url, throw_table_name)
display(throw_df.head())

Unnamed: 0,uid,visit_num,remaining_home_score,remaining_away_score,throwing_team,score_thrown,throw_1,throw_2,throw_3,was_checkout
0,02_Jul_2024_13:05_BST_Brendan_Dolan_Mickey_Man...,1,440,501,Home,61,,,,0
1,02_Jul_2024_13:05_BST_Brendan_Dolan_Mickey_Man...,2,440,361,Away,140,,,,0
2,02_Jul_2024_13:05_BST_Brendan_Dolan_Mickey_Man...,3,344,361,Home,96,,,,0
3,02_Jul_2024_13:05_BST_Brendan_Dolan_Mickey_Man...,4,344,181,Away,180,,,,0
4,02_Jul_2024_13:05_BST_Brendan_Dolan_Mickey_Man...,5,299,181,Home,45,,,,0


In [68]:
def assign_leg_number_by_uid(df):
    """Assign a leg number to each throw row, restarting at 1 for every match.

    Rows are processed in their existing order. The leg number restarts at 1
    whenever the 'uid' value differs from the previous row's, and it increases
    by 1 on the row *after* a checkout, so the checkout row itself still belongs
    to the leg it finished.

    The computation is vectorised, so it does not depend on the index being an
    integer range and does not loop over rows in Python.

    Args:
        df (pd.DataFrame): DataFrame containing throw data with 'was_checkout'
            and 'uid' columns. A row counts as a checkout when 'was_checkout'
            is the string '1' or the integer 1.

    Returns:
        pd.DataFrame: The same DataFrame object, with an integer 'leg_number'
        column added (or overwritten) in place.
    """
    # 1 for checkout rows, 0 otherwise; str() lets integer-typed columns work too.
    is_checkout = (df['was_checkout'].astype(str) == '1').astype('int64')

    # A new run starts on every row whose uid differs from the row before it.
    run_id = df['uid'].ne(df['uid'].shift()).cumsum()

    # Checkouts seen so far within the run, including the current row ...
    checkouts_so_far = is_checkout.groupby(run_id).cumsum()

    # ... minus the current row's own checkout gives the number of legs already
    # finished before this row; the first leg is number 1.
    leg_numbers = checkouts_so_far - is_checkout + 1

    # Assign positionally so duplicate index labels cannot cause misalignment.
    df['leg_number'] = leg_numbers.to_numpy(dtype='int64')
    return df

# Add the 'leg_number' column to the throw table, then show up to the first 1000 rows.
throw_df = assign_leg_number_by_uid(throw_df)
display(throw_df.head(1000))

Processing row 226000/226531

Unnamed: 0,uid,visit_num,remaining_home_score,remaining_away_score,throwing_team,score_thrown,throw_1,throw_2,throw_3,was_checkout,leg_number
0,02_Jul_2024_13:05_BST_Brendan_Dolan_Mickey_Man...,1,440,501,Home,61,,,,0,1
1,02_Jul_2024_13:05_BST_Brendan_Dolan_Mickey_Man...,2,440,361,Away,140,,,,0,1
2,02_Jul_2024_13:05_BST_Brendan_Dolan_Mickey_Man...,3,344,361,Home,96,,,,0,1
3,02_Jul_2024_13:05_BST_Brendan_Dolan_Mickey_Man...,4,344,181,Away,180,,,,0,1
4,02_Jul_2024_13:05_BST_Brendan_Dolan_Mickey_Man...,5,299,181,Home,45,,,,0,1
...,...,...,...,...,...,...,...,...,...,...,...
995,02_Jul_2024_13:50_BST_Ross_Smith_Darryl_Pilgrim,81,204,128,Away,96,,,,0,8
996,02_Jul_2024_13:50_BST_Ross_Smith_Darryl_Pilgrim,82,104,128,Home,100,,,,0,8
997,02_Jul_2024_13:50_BST_Ross_Smith_Darryl_Pilgrim,83,104,40,Away,88,,,,0,8
998,02_Jul_2024_13:50_BST_Ross_Smith_Darryl_Pilgrim,84,0,40,Home,104,,,,1,8


In [69]:
def convert_date_and_time(df):
    """Normalise the 'date' and 'time' columns and sort matches chronologically.

    The input DataFrame is left untouched; a new, re-indexed DataFrame is returned.

    Args:
        df (pd.DataFrame): DataFrame containing match data with a 'date' column
            in 'DD Mon, YYYY' form (e.g. '17 Jan, 2024') and a 'time' column.

    Returns:
        pd.DataFrame: A new DataFrame whose 'date' is 'YYYY-MM-DD', whose 'time'
        is 'HH:MM', sorted by date then time, with a fresh 0..n-1 index.

    Raises:
        ValueError: If a 'date' value does not match the 'DD Mon, YYYY' format.
    """
    def convert_date_to_date(frame):
        """Rewrite 'date' from DD Mon, YYYY to YYYY-MM-DD and sort by it."""
        iso_dates = pd.to_datetime(frame['date'], format='%d %b, %Y').dt.strftime('%Y-%m-%d')
        # assign() builds a new frame, so the caller's DataFrame is not modified.
        return frame.assign(date=iso_dates).sort_values(by='date')

    def convert_time_to_time(frame):
        """Reduce 'time' to HH:MM, then sort by date and time and reset the index.

        Handles multiple time formats:
        - HH:MM BST
        - HH:MM CUT
        - HH:MM GMT+0000
        - HH:MM:SS GMT+0000

        The timezone suffix is discarded, not converted.
        """
        # Keep the leading HH:MM; drop optional seconds and everything after the
        # first run of whitespace. Values that do not match are left as they are.
        hh_mm = frame['time'].str.replace(r'(\d{2}:\d{2})(:\d{2})?\s+.*', r'\1', regex=True)
        hh_mm = hh_mm.str.strip()

        # ISO dates and zero-padded HH:MM strings sort chronologically as text.
        ordered = frame.assign(time=hh_mm).sort_values(by=['date', 'time'])
        return ordered.reset_index(drop=True)

    return convert_time_to_time(convert_date_to_date(df))


# Load the match table, normalise its 'date'/'time' columns and sort it, then preview.
matches_df = read_db_into_df(database_url, game_table_name)
matches_df = convert_date_and_time(matches_df)

display(matches_df.head())

Unnamed: 0,uid,event_title,date,time,leg_count,home_team_name,away_team_name,home_team_legs_won,away_team_legs_won,home_team_average,...,home_team_180_thrown,away_team_180_thrown,home_team_highest_checkout,away_team_highest_checkout,home_team_checkouts_100_plus,away_team_checkouts_100_plus,home_team_checkout_percentage,away_team_checkout_percentage,home_team_checkouts,away_team_checkouts
0,17_Jan_2024_16:10_GMT+0000_Peter_Wright_Haruki...,International | Bahrain Darts Masters 2024,2024-01-17,16:10,Best of 11 legs,Peter Wright,Haruki Muramatsu,6,5,91.69,...,1,2,40,170,0,1,46.2%,35.7%,6/13,5/14
1,17_Jan_2024_16:50_GMT+0000_Gerwyn_Price_Reynal...,International | Bahrain Darts Masters 2024,2024-01-17,16:50,Best of 11 legs,Gerwyn Price,Reynaldo Rivera,6,4,92.13,...,3,0,93,104,0,1,28.6%,33.3%,6/21,4/12
2,17_Jan_2024_17:30_GMT+0000_Nathan_Aspinall_Lou...,International | Bahrain Darts Masters 2024,2024-01-17,17:30,Best of 11 legs,Nathan Aspinall,Lourence Ilagan,6,5,97.2,...,2,1,102,80,1,0,28.6%,62.5%,6/21,5/8
3,17_Jan_2024_18:05_GMT+0000_Rob_Cross_Tomoya_Goto,International | Bahrain Darts Masters 2024,2024-01-17,18:05,Best of 11 legs,Rob Cross,Tomoya Goto,6,3,94.66,...,0,3,112,96,1,0,54.5%,37.5%,6/11,3/8
4,17_Jan_2024_18:40_GMT+0000_Luke_Littler_Man_Lo...,International | Bahrain Darts Masters 2024,2024-01-17,18:40,Best of 11 legs,Luke Littler,Man Lok Leung,6,3,97.92,...,4,3,151,107,3,1,26.1%,37.5%,6/23,3/8


In [75]:
def fix_blank_leg_count(df):
    """Fill in the match length where 'leg_count' is missing its number.

    Rows whose 'leg_count' is exactly 'Best of  legs' (two spaces, no number)
    are rewritten to 'Best of N legs', where N = 2 * max(home legs won, away
    legs won) - 1. All other rows are left unchanged.

    Args:
        df (pd.DataFrame): DataFrame containing match data with columns
            'leg_count', 'home_team_legs_won', and 'away_team_legs_won'.

    Returns:
        pd.DataFrame: A copy of the input with corrected 'leg_count' values.
    """
    result = df.copy()
    needs_fix = result['leg_count'] == 'Best of  legs'

    # Legs won by each side, as numbers, for the affected rows only
    legs_won = pd.DataFrame({
        'home': pd.to_numeric(result.loc[needs_fix, 'home_team_legs_won']),
        'away': pd.to_numeric(result.loc[needs_fix, 'away_team_legs_won']),
    })

    # The larger of the two tallies is taken as the winner's; a winner with
    # k legs implies best of 2k - 1
    winner_legs = legs_won.max(axis=1)
    inferred_length = 2 * winner_legs - 1

    result.loc[needs_fix, 'leg_count'] = 'Best of ' + inferred_length.astype(str) + ' legs'
    return result

def update_tournament_names(df):
    """Drop a leading 'International | ' from each event title and trim whitespace.

    The 'event_title' column is overwritten on the DataFrame that was passed in.

    Args:
        df (pd.DataFrame): DataFrame containing match data with an 'event_title' column.

    Returns:
        pd.DataFrame: The same DataFrame object, with cleaned 'event_title' values
        (leading and trailing whitespace removed).
    """
    without_prefix = df['event_title'].str.replace(r'^International \| ', '', regex=True)
    df['event_title'] = without_prefix.str.strip()
    return df

# Fill in missing "Best of N legs" values, clean the event titles, then preview.
matches_df = fix_blank_leg_count(matches_df)
matches_df = update_tournament_names(matches_df)
display(matches_df.head())

Unnamed: 0,uid,event_title,date,time,leg_count,home_team_name,away_team_name,home_team_legs_won,away_team_legs_won,home_team_average,...,home_team_180_thrown,away_team_180_thrown,home_team_highest_checkout,away_team_highest_checkout,home_team_checkouts_100_plus,away_team_checkouts_100_plus,home_team_checkout_percentage,away_team_checkout_percentage,home_team_checkouts,away_team_checkouts
0,17_Jan_2024_16:10_GMT+0000_Peter_Wright_Haruki...,Bahrain Darts Masters 2024,2024-01-17,16:10,Best of 11 legs,Peter Wright,Haruki Muramatsu,6,5,91.69,...,1,2,40,170,0,1,46.2%,35.7%,6/13,5/14
1,17_Jan_2024_16:50_GMT+0000_Gerwyn_Price_Reynal...,Bahrain Darts Masters 2024,2024-01-17,16:50,Best of 11 legs,Gerwyn Price,Reynaldo Rivera,6,4,92.13,...,3,0,93,104,0,1,28.6%,33.3%,6/21,4/12
2,17_Jan_2024_17:30_GMT+0000_Nathan_Aspinall_Lou...,Bahrain Darts Masters 2024,2024-01-17,17:30,Best of 11 legs,Nathan Aspinall,Lourence Ilagan,6,5,97.2,...,2,1,102,80,1,0,28.6%,62.5%,6/21,5/8
3,17_Jan_2024_18:05_GMT+0000_Rob_Cross_Tomoya_Goto,Bahrain Darts Masters 2024,2024-01-17,18:05,Best of 11 legs,Rob Cross,Tomoya Goto,6,3,94.66,...,0,3,112,96,1,0,54.5%,37.5%,6/11,3/8
4,17_Jan_2024_18:40_GMT+0000_Luke_Littler_Man_Lo...,Bahrain Darts Masters 2024,2024-01-17,18:40,Best of 11 legs,Luke Littler,Man Lok Leung,6,3,97.92,...,4,3,151,107,3,1,26.1%,37.5%,6/23,3/8


In [109]:
def convert_game_to_json(matches_df, throw_df):
    """Convert one match and its throws to a JSON-compatible dictionary.

    Only the first row of ``matches_df`` is read; the caller is expected to pass
    a frame already filtered down to a single match.

    Args:
        matches_df (pd.DataFrame): DataFrame containing the match row.
        throw_df (pd.DataFrame): DataFrame containing that match's throw rows.

    Returns:
        tuple: ``(uid, match_dict)`` where ``uid`` is a string built from the team
        names, event title and Unix timestamp, and ``match_dict`` has the keys
        'info' (match details, with the per-team summary nested under
        'info' -> 'teams') and 'throws' (the timeline from throw_timeline_to_json).
    """

    def first_value(column):
        """Return the value of ``column`` in the first row of ``matches_df``."""
        return matches_df[column].values[0]

    def date_and_time_to_timestamp():
        """Combine 'date' and 'time' into an integer Unix timestamp.

        The combined value carries no timezone, so pandas treats it as UTC.
        """
        return int(pd.Timestamp(first_value('date') + ' ' + first_value('time')).timestamp())

    def create_uid():
        """Build the match identifier from team names, event title and timestamp."""
        home_team_name = first_value('home_team_name').replace(' ', '_')
        away_team_name = first_value('away_team_name').replace(' ', '_')
        tournament_name = first_value('event_title').replace(' ', '_')
        return f'{home_team_name}_vs_{away_team_name}_{tournament_name}_{date_and_time_to_timestamp()}'

    def team_summary(side, legs_won, opponent_legs_won):
        """Build the summary dictionary for one side ('home' or 'away')."""
        # 'checkouts' is stored as '<completed>/<attempted>', e.g. '6/13'.
        checkout_parts = first_value(f'{side}_team_checkouts').split('/')
        completed = int(checkout_parts[0])
        attempted = int(checkout_parts[1])
        if attempted == 0:
            percentage = 0
        else:
            percentage = round((completed / attempted) * 100, 1)

        return {
            'name': first_value(f'{side}_team_name'),
            'score': legs_won,
            # Compare the integer scores, not the raw column values: the raw values
            # may be text, in which case '10' would sort below '9', and a numpy
            # comparison result would not be JSON-serialisable.
            'is_winner': legs_won > opponent_legs_won,
            'average': float(first_value(f'{side}_team_average')),
            '180s': int(first_value(f'{side}_team_180_thrown')),
            'checkouts': {
                'checkouts_attempted': attempted,
                'checkouts_completed': completed,
                'checkouts_percentage': percentage,
                'highest_checkout': int(first_value(f'{side}_team_highest_checkout'))
            }
        }

    # A format string that mentions 'legs' (e.g. 'Best of 11 legs') is leg play;
    # anything else is treated as set play.
    set_play = 'legs' not in first_value('leg_count')

    home_legs_won = int(first_value('home_team_legs_won'))
    away_legs_won = int(first_value('away_team_legs_won'))

    # Match information; the team summaries are nested inside it under 'teams'.
    return_dic_info = {
        'event_title': first_value('event_title'),
        'is_set_play': set_play,
        'format': first_value('leg_count'),
        'date': first_value('date'),
        'time': first_value('time'),
        'teams': {
            'home_team': team_summary('home', home_legs_won, away_legs_won),
            'away_team': team_summary('away', away_legs_won, home_legs_won),
        },
    }

    return_dic = {
        'info': return_dic_info,
        'throws': throw_timeline_to_json(throw_df, set_play),
    }

    return create_uid(), return_dic

def throw_timeline_to_json(df, is_set_play, legs_to_win_set=3):
    """Convert one match's throw rows into a nested, JSON-compatible timeline.

    Rows are read in their existing order. A new leg entry starts whenever
    'leg_number' differs from the previous row's. The input DataFrame is not
    modified.

    Args:
        df (pd.DataFrame): Throw rows for a single match. Reads the columns
            'leg_number', 'visit_num', 'remaining_home_score',
            'remaining_away_score', 'throwing_team', 'score_thrown', 'throw_1',
            'throw_2', 'throw_3' and 'was_checkout'.
        is_set_play (bool): When True the legs are grouped into sets; when False
            the timeline is a flat list of legs.
        legs_to_win_set (int): Number of legs a side must win to close a set.
            Only used for set play. Defaults to 3.

    Returns:
        dict: ``{'sets': [{'set_number', 'legs': [...]}, ...]}`` for set play,
        otherwise ``{'legs': [{'leg_number', 'visits': [...]}, ...]}``.
    """

    def clean_throw(value):
        """Return None for the 'N/A' placeholder or a missing value, else the value."""
        # Check for missing first: comparing pd.NA with == does not give a bool.
        if pd.isna(value) or value == 'N/A':
            return None
        return value

    def build_visit(visit):
        """Build the dictionary for one visit from a row given as a dict."""
        return {
            'visit_number': int(visit['visit_num']),
            'scores': {
                'home': int(visit['remaining_home_score']),
                'away': int(visit['remaining_away_score'])
            },
            'throwing_team': visit['throwing_team'].lower(),
            'score_thrown': int(visit['score_thrown']),
            'throws': [
                clean_throw(visit['throw_1']),
                clean_throw(visit['throw_2']),
                clean_throw(visit['throw_3'])
            ],
            # str() so that both the text '1' and the integer 1 count as a checkout.
            'was_checkout': str(visit['was_checkout']) == '1'
        }

    def map_legs_to_sets(df):
        """Return a dict mapping each leg number to the set it belongs to.

        The winner of a leg is taken to be the team that threw its last visit.
        A set ends as soon as either side has won ``legs_to_win_set`` legs in it.
        """
        # One row per leg: the last visit of that leg, in playing order.
        last_visits = df.drop_duplicates('leg_number', keep='last')

        set_of_leg = {}
        current_set = 1
        legs_won = {'home': 0, 'away': 0}
        for leg_number, last_thrower in zip(last_visits['leg_number'], last_visits['throwing_team']):
            set_of_leg[leg_number] = current_set

            # Any thrower other than 'Home' is counted as the away side.
            winner = 'home' if last_thrower == 'Home' else 'away'
            legs_won[winner] += 1
            if legs_won[winner] == legs_to_win_set:
                current_set += 1
                legs_won = {'home': 0, 'away': 0}

        return set_of_leg

    def create_leg_throw_timeline(df):
        """Build ``{'legs': [...]}``: a flat list of legs, each with its visits."""
        legs = []
        for visit in df.to_dict('records'):
            leg_number = int(visit['leg_number'])
            if not legs or legs[-1]['leg_number'] != leg_number:
                legs.append({'leg_number': leg_number, 'visits': []})
            legs[-1]['visits'].append(build_visit(visit))
        return {'legs': legs}

    def create_set_throw_timeline(df):
        """Build ``{'sets': [...]}``: sets containing legs containing visits."""
        set_of_leg = map_legs_to_sets(df)

        sets = []
        for visit in df.to_dict('records'):
            leg_number = int(visit['leg_number'])
            set_number = int(set_of_leg[visit['leg_number']])

            if not sets or sets[-1]['set_number'] != set_number:
                sets.append({'set_number': set_number, 'legs': []})

            # Each leg is appended exactly once, to the set it belongs to. Working
            # on the tail of the lists means no finished leg is carried over into
            # the next set.
            legs = sets[-1]['legs']
            if not legs or legs[-1]['leg_number'] != leg_number:
                legs.append({'leg_number': leg_number, 'visits': []})

            legs[-1]['visits'].append(build_visit(visit))

        return {'sets': sets}

    if is_set_play:
        return create_set_throw_timeline(df)
    return create_leg_throw_timeline(df)

def convert_all_games_to_json(df, throw_df):
    """Convert every match in ``df`` to its JSON-compatible dictionary.

    Progress is printed on a single, continuously rewritten line.

    Args:
        df (pd.DataFrame): Match data with a 'uid' column; each distinct uid is
            converted once, in order of first appearance.
        throw_df (pd.DataFrame): Throw data for all matches, with a 'uid' column.

    Returns:
        dict: Maps the uid produced by convert_game_to_json to that match's
        dictionary. If two matches produce the same uid, the later one replaces
        the earlier one.
    """
    return_dic = {}
    uids = get_unique_uids(df)
    total_uids = len(uids)

    # Split the throw table once instead of re-scanning it for every match.
    # sort=False keeps groups in order of appearance; rows keep their order.
    throws_by_uid = {uid: rows for uid, rows in throw_df.groupby('uid', sort=False)}
    # Matches with no recorded throws get an empty frame with the same columns.
    no_throws = throw_df.iloc[0:0]

    for index, uid in enumerate(uids):
        individual_matches = filter_df_by_uid(df, uid)
        individual_throws = throws_by_uid.get(uid, no_throws)
        game_uid, game_json = convert_game_to_json(individual_matches, individual_throws)
        return_dic[game_uid] = game_json

        # Print progress on the same line
        print(f"\rProcessing {index + 1} out of {total_uids} uids", end='')

    return return_dic

# Convert every match and write the result as indented JSON.
# open() does not create directories, so 'data/' must already exist.
json_games = convert_all_games_to_json(matches_df, throw_df)
with open('data/matches.json', 'w') as f:
    json.dump(json_games, f, indent=4)

Processing 2263 out of 2263 uids