In [15]:
# Basic
from collections import Counter
import math
import numpy as np
import os
import pprint

# yaml specific
import yaml

# Data handling
from fuzzywuzzy import fuzz, process
import pandas as pd
from tqdm import tqdm

In [3]:
# Config variables
raw_data_path = "raw_data"
clean_data_path = "clean_data"
tournament_name = "IPL"

In [4]:
def parse_yaml(path):
    """
    Parses a given yaml file and returns the object
    Args:
        path - path of the yaml file to be parsed
    """
    with open(path, 'r') as stream:
        try:
            data = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(exc)
    return data

In [5]:
def get_all_matches_raw_data(tournament_name):
    tournament_path = os.path.join(raw_data_path, tournament_name)
    match_files = os.listdir(tournament_path)
    matches = []
    print(f"Parsing {len(match_files)} matches for {tournament_name}")
    for match_file in tqdm(match_files):
        match_path = os.path.join(tournament_path, match_file)
        match = parse_yaml(match_path)
        matches.append(match)
    return matches

In [65]:
def get_best_possible_name_match(query_name, players_with_same_surname):
    #players_with_same_surname = players[players.player_name.str.contains(query_name.split(" ")[-1])]
    
    # collecting all capital letters in the name to be stored as initials
    initials = [c for c in query_name if c.isupper()]
    
    # creating a hash map between the player_name and player_full_name for easy access
    name_full_name_map = {}
    for index, row in players_with_same_surname.iterrows():
        name_full_name_map[row["player_name"]] = row["player_full_name"]
    
    # List of all choices for the fuzzy algorithm to run on. 
    choices_name = np.array(players_with_same_surname.player_name)
    potential_matches = process.extract(query_name, choices_name, limit=50)
    top_score = potential_matches[0][1]
    best_match = ""
    for potential_match in potential_matches:

        score = potential_match[1]
        match_name = potential_match[0]
    
        # if you get a perfect score, that should definitely be the correct match
        if score == 100:
            best_match = match_name
            break
        
        # if you dont get a perfect score, check if all the initials are present in the players full name, that should be the best match
        if all([initial in name_full_name_map[match_name] for initial in initials]):
            best_match = match_name
            break
        
        # if either of methods fail, return the best match the fuzzy scoring algo returned
        if score > top_score:
            top_score = score
            best_match = match_name

    return best_match

In [70]:
def update_players(update_column, update_value, check_column, check_value):
    players = pd.read_csv(os.path.join(clean_data_path, "player.csv"))
    players = players.loc[:, ~players.columns.str.contains('^Unnamed')]
    players.loc[players[check_column] == check_value, update_column] = update_value
    #players[update_column] = np.where(players[check_column] == check_value, update_value)
    players.to_csv(os.path.join(clean_data_path, "player.csv"), index=False)

# This notebook will be used to curate 2 tables. 
## 1) Match 2) Ball

## Match table

In [6]:
matches = get_all_matches_raw_data(tournament_name)

  0%|                                                                                          | 0/817 [00:00<?, ?it/s]

Parsing 817 matches for IPL


100%|████████████████████████████████████████████████████████████████████████████████| 817/817 [03:52<00:00,  3.51it/s]


#### We need to first match all player names found in this dataset to the player_ids we have. Our player table contains full names (eg: David Warner) while this dataset has only the short names typically used in score cards (eg. DA Warner). So lets use fuzzy matching to map these

In [16]:
players = pd.read_csv(os.path.join(clean_data_path, "player.csv"))

array(['Rupert Kitzinger', 'Zulqarnain Haider', 'Daniel Zvidzai', ...,
       'Audley Sanson', 'Robert Samuels', 'Marlon Samuels'], dtype=object)

In [45]:
a = [True, True, False]
all(a)

False

In [63]:
unique_player_names = set()
for match in matches:
    for inning in match['innings']:
        for inning_number in inning:
            for ball in inning[inning_number]['deliveries']:
                print(ball)
                for ball_number in ball:
                    unique_player_names.add(ball[ball_number]['batsman'])
                    unique_player_names.add(ball[ball_number]['bowler'])
                    unique_player_names.add(ball[ball_number]['non_striker'])
    break

{0.1: {'batsman': 'DA Warner', 'bowler': 'TS Mills', 'non_striker': 'S Dhawan', 'runs': {'batsman': 0, 'extras': 0, 'total': 0}}}
{0.2: {'batsman': 'DA Warner', 'bowler': 'TS Mills', 'non_striker': 'S Dhawan', 'runs': {'batsman': 0, 'extras': 0, 'total': 0}}}
{0.3: {'batsman': 'DA Warner', 'bowler': 'TS Mills', 'non_striker': 'S Dhawan', 'runs': {'batsman': 4, 'extras': 0, 'total': 4}}}
{0.4: {'batsman': 'DA Warner', 'bowler': 'TS Mills', 'non_striker': 'S Dhawan', 'runs': {'batsman': 0, 'extras': 0, 'total': 0}}}
{0.5: {'batsman': 'DA Warner', 'bowler': 'TS Mills', 'extras': {'wides': 2}, 'non_striker': 'S Dhawan', 'runs': {'batsman': 0, 'extras': 2, 'total': 2}}}
{0.6: {'batsman': 'S Dhawan', 'bowler': 'TS Mills', 'non_striker': 'DA Warner', 'runs': {'batsman': 0, 'extras': 0, 'total': 0}}}
{0.7: {'batsman': 'S Dhawan', 'bowler': 'TS Mills', 'extras': {'legbyes': 1}, 'non_striker': 'DA Warner', 'runs': {'batsman': 0, 'extras': 1, 'total': 1}}}
{1.1: {'batsman': 'S Dhawan', 'bowler': 

In [71]:
for player_name in unique_player_names:
    players_with_same_surname = players[players.player_name.str.contains(player_name.split(" ")[-1])]
    best_match_name = get_best_possible_name_match(player_name, players_with_same_surname)
    update_players("player_display_name", player_name, "player_name", best_match_name)

('Aravind', 95)
('Sreenath Aravind', 86)
('N Choudhary', 91)
('Narendra Choudhary', 90)
('Fahad Choudhary', 86)
('Vikram Choudhary', 86)
('Vinay Choudhary', 86)
('Satyam Choudhary', 86)
('Pradeep Choudhary', 86)
('Nitesh Choudhary', 86)
('Nishu Choudhary', 86)
('Kishan Choudhary', 86)
('Akash Choudhary', 86)
('Richard Mills', 86)
('Frederick Mills', 86)
('Andrew Mills', 86)
('Stu Mills', 82)
('Tymal Mills', 74)
('Shikhar Dhawan', 86)
('Rashid Khan', 100)
('Kurtley Watson', 86)
('Matthew Watson', 86)
('Frederic Watson', 86)
('William Watson', 86)
('Ryan Watson', 80)
('Todd Watson', 76)
('Helen Watson', 76)
('George Watson', 76)
('Dane Watson', 76)
('Robert Watson', 76)
('John Watson', 76)
('James Watson', 76)
('David Watson', 76)
('Ashley Watson', 76)
('Aaron Watson', 76)
('Shane Watson', 76)
('Ashish Nehra', 86)
('Broderick Warner', 86)
('William Warner', 86)
('David Warner', 86)
('Yuvraj Singh', 100)
('Ben Cutting', 82)
('Satyajit Jadhav', 86)
('Narendra Jadhav', 86)
('Himanshu Jadhav