In [1]:
# Basic
from collections import Counter
import math
import numpy as np
import os
import pprint

# yaml specific
import yaml

# Data handling
from fuzzywuzzy import fuzz, process
import pandas as pd
from tqdm import tqdm

In [2]:
# Config variables
raw_data_path = "raw_data"
clean_data_path = "clean_data"
tournament_name = "IPL"

### Utility functions. 

In [3]:
def parse_yaml(path):
    """
    Parses a given yaml file and returns the object
    Args:
        path - path of the yaml file to be parsed
    """
    with open(path, 'r') as stream:
        try:
            data = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(exc)
    return data

In [4]:
def get_all_matches_raw_data(tournament_name):
    """
        Parses all the matches inside the tournament_name folder and returns a list of dicts
        Args:
            tournament_name - Name of tournament to be parsed
    """
    
    tournament_path = os.path.join(raw_data_path, tournament_name)
    match_files = os.listdir(tournament_path)
    matches = []
    print(f"Parsing {len(match_files)} matches for {tournament_name}")
    for match_file in tqdm(match_files):
        if ".yaml" not in match_file:
            continue
        match_path = os.path.join(tournament_path, match_file)
        match = parse_yaml(match_path)
        matches.append(match)
    return matches

In [5]:
def get_best_possible_name_match(query_name, players_with_same_surname):
    """
    This function does a fuzzy match to find where to insert query_name in the players database. It returns the best matched player_name
    Args:
        query_name - query name to be checked with
        players_with_same_surname - a subset of the players dataframe for checking with the query_name
    """
    
    #print(f"Getting best match for {query_name}")
    
    # collecting all capital letters in the name to be stored as initials
    initials = [c for c in query_name if c.isupper()]
    
    # creating a hash map between the player_name and player_full_name for easy access
    name_full_name_map = {}
    for index, row in players_with_same_surname.iterrows():
        name_full_name_map[row["player_name"]] = row["player_full_name"]
    
    # List of all choices for the fuzzy algorithm to run on. 
    choices_name = np.array(players_with_same_surname.player_name)
    potential_matches = process.extract(query_name, choices_name, processor=None, limit=50)
    
    # IF no potential match was found, return null string
    if len(potential_matches) == 0:
        return ""
    
    #print(f"All matches: {potential_matches}")
    top_score = potential_matches[0][1]
    best_match = ""
    for potential_match in potential_matches:
        score = potential_match[1]
        match_name = potential_match[0]
    
        # if you get a perfect score, that should definitely be the correct match
        if score == 100:
            best_match = match_name
            break
        
        # if you dont get a perfect score, check if all the initials are present in the players full name, that should be the best match
        initial_bool = []
        for initial in initials:
            #print(f"Checking if {initial} is present in {name_full_name_map[match_name]}")
            if initial in name_full_name_map[match_name]:
                initial_bool.append(True)
            else:
                initial_bool.append(False)
            name_full_name_map[match_name] = name_full_name_map[match_name].replace(initial, '', 1)

        if all(initial_bool):
            best_match = match_name
            break
        
        # if either of methods fail, return the best match the fuzzy scoring algo returned
        if score > top_score:
            top_score = score
            best_match = match_name

    return best_match

In [6]:
def update_players(update_column, update_value, check_column, check_value):
    """
    This function 
        - reads the player.csv file
        - updates update_column with update_value where check_column is check_value
        - writes it back
    Args:
        query_name - query name to be checked with
        players_with_same_surname - a subset of the players dataframe for checking with the query_name
    """
    
    #print(f"Updating {update_column} with {update_value} where {check_column} is {check_value}")
    players = pd.read_csv(os.path.join(clean_data_path, "player.csv"))
    players = players.loc[:, ~players.columns.str.contains('^Unnamed')]
    players.loc[players[check_column] == check_value, update_column] = update_value
    #players[update_column] = np.where(players[check_column] == check_value, update_value)
    players.to_csv(os.path.join(clean_data_path, "player.csv"), index=False)

# This notebook will be used to curate 2 tables. 
## 1) Match 2) Ball

## Match table

In [7]:
matches = get_all_matches_raw_data(tournament_name)

  0%|                                                                                          | 0/818 [00:00<?, ?it/s]

Parsing 818 matches for IPL


100%|████████████████████████████████████████████████████████████████████████████████| 818/818 [03:11<00:00,  4.27it/s]


#### We need to first match all player names found in this dataset to the player_ids we have. Our player table contains full names (eg: David Warner) while this dataset has only the short names typically used in score cards (eg. DA Warner). So using a combination of fuzzy and boolean matching to map these

In [8]:
players = pd.read_csv(os.path.join(clean_data_path, "player.csv"))

### Creating a unique player names set with information from each ball of all the 817 IPL matches. Considering batsman bowler and non-striker

In [9]:
unique_player_names = set()
for match in tqdm(matches):
    for inning in match['innings']:
        for inning_number in inning:
            for ball in inning[inning_number]['deliveries']:
                for ball_number in ball:
                    unique_player_names.add(ball[ball_number]['batsman'])
                    unique_player_names.add(ball[ball_number]['bowler'])
                    unique_player_names.add(ball[ball_number]['non_striker'])

100%|██████████████████████████████████████████████████████████████████████████████| 816/816 [00:00<00:00, 4152.47it/s]


In [10]:
len(unique_player_names)

580

### for each one of these unique players, we are trying to find a best match map from the players sheet and updating the player_display_name

In [11]:
for player_name in tqdm(unique_player_names):
    players_with_same_surname = players[players.player_name.str.contains(player_name.split(" ")[-1])]
    best_match_name = get_best_possible_name_match(player_name, players_with_same_surname)
    # if null string was returned, skip the player
    if best_match_name:
        update_players("player_display_name", player_name, "player_name", best_match_name)

  return func(self, *args, **kwargs)
100%|████████████████████████████████████████████████████████████████████████████████| 580/580 [00:29<00:00, 19.53it/s]


### Beyond this if we find any inaccuracies in the mapping, lets just manually correct them