In [24]:
# Basic
from collections import Counter
import datetime
import math
import numpy as np
import os
import pprint
import sys

# yaml specific
import yaml

# Data handling
from fuzzywuzzy import fuzz, process
import pandas as pd
from tqdm import tqdm

# Appending root folder to path for easy importing
sys.path.append("../")

# my library
from src.db_utils import update_player, add_player 

In [25]:
# Config variables
raw_data_path = os.path.join("..", "raw_data") 
clean_data_path = os.path.join("..", "clean_data") 

# ingest each tournament sequentially
tournament_name = "IPL"
#tournament_name = "BBL"

### Utility functions. 

In [26]:
def find_players_not_mapped(matches):
    """
    Utility function to quickly run through all deliveries of every match and see if there is any exception coming (if there is any player unmapped)
    Args:
        matches - a list of match objects parsed from the yaml files
    """
    players_not_found_or_mapped = []
    for match in tqdm(matches):
        try:
            for inning in match['innings']:
                for inning_number in inning:
                    for ball in inning[inning_number]['deliveries']:
                        for ball_number in ball:

                            batsman = player_id_map[ball[ball_number]['batsman']]
                            bowler = player_id_map[ball[ball_number]['bowler']]
                            non_striker = player_id_map[ball[ball_number]['non_striker']]

                            if "wicket" in ball[ball_number]:
                                player_dismissed = player_id_map[ball[ball_number]["wicket"]["player_out"]]
                                dismissal_type = ball[ball_number]["wicket"]["kind"]

                                if "fielders" in ball[ball_number]["wicket"]:
                                    # There is "(sub)" when a substitute fielder is involved in a wicket
                                    fielders = ",".join([str(player_id_map[fielder.replace(" (sub)", "")]) for fielder in ball[ball_number]["wicket"]["fielders"]])
        except Exception as e:
            print(f"Exception {e} happened in ball number {ball_number} ")
            print(f"ball {ball[ball_number]}")
            print(f"match info: {match['info']}")
            players_not_found_or_mapped.append(e.args[0])
            
    return players_not_found_or_mapped

In [27]:
def parse_yaml(path):
    """
    Parses a given yaml file and returns the object
    Args:
        path - path of the yaml file to be parsed
    """
    with open(path, 'r') as stream:
        try:
            data = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(exc)
    return data

In [28]:
def get_all_matches_raw_data(tournament_name):
    """
        Parses all the matches inside the tournament_name folder and returns a list of dicts
        Args:
            tournament_name - Name of tournament to be parsed
    """
    
    tournament_path = os.path.join(raw_data_path, tournament_name)
    match_files = os.listdir(tournament_path)
    matches = []
    print(f"Parsing {len(match_files)} matches for {tournament_name}")
    for match_file in tqdm(match_files):
        if ".yaml" not in match_file:
            continue
        match_path = os.path.join(tournament_path, match_file)
        match = parse_yaml(match_path)
        matches.append(match)
    return matches

In [29]:
def get_best_possible_name_match(query_name, players_with_same_surname):
    """
    This function does a fuzzy match to find where to insert query_name in the players database. It returns the best matched player_name. 
    eg: DA Warner needs to be mapped to David Warner  
    Args:
        query_name - query name to be checked with
        players_with_same_surname - a subset of the players dataframe for checking with the query_name
    """
    
    #print(f"Getting best match for {query_name}")
    
    # collecting all capital letters in the name to be stored as initials
    initials = [c for c in query_name if c.isupper()]
    
    # creating a hash map between the player_name and player_full_name for easy access
    name_full_name_map = {}
    for index, row in players_with_same_surname.iterrows():
        name_full_name_map[row["player_name"]] = row["player_full_name"]
    
    # List of all choices for the fuzzy algorithm to run on. 
    choices_name = np.array(players_with_same_surname.player_name)
    potential_matches = process.extract(query_name, choices_name, processor=None, limit=50)
    
    # IF no potential match was found, return null string
    if len(potential_matches) == 0:
        return ""
    
    #print(f"All matches: {potential_matches}")
    top_score = potential_matches[0][1]
    best_match = ""
    for potential_match in potential_matches:
        score = potential_match[1]
        match_name = potential_match[0]
    
        # if you get a perfect score, that should definitely be the correct match
        if score == 100:
            best_match = match_name
            break
        
        # if you dont get a perfect score, check if all the initials are present in the players full name, that should be the best match
        initial_bool = []
        #char_index = []
        for initial in initials:
            #print(f"Checking if {initial} is present in {name_full_name_map[match_name]}")
            if initial in name_full_name_map[match_name]:
                initial_bool.append(True)
                #char_index.append(name_full_name_map[match_name].index(initial))
            else:
                initial_bool.append(False)
            name_full_name_map[match_name] = name_full_name_map[match_name].replace(initial, '', 1)
        
        is_initials_index_increasing = True
        
        # This logic is unfortunately failing for players like "CV Varun" that needs to be mapped to "Varun Chakravarthy"
        '''
        for i in range(1, len(char_index)):
            if char_index[i] < char_index[i-1]:
                is_initials_index_increasing = False
        '''
        
        if all(initial_bool) and is_initials_index_increasing:
            best_match = match_name
            break
        
        # if either of methods fail, return the best match the fuzzy scoring algo returned
        if score > top_score:
            top_score = score
            best_match = match_name

    return best_match

# This notebook will be used to curate 3 tables. 
## 1) Venue 2) Match 3) Ball

In [30]:
matches = get_all_matches_raw_data(tournament_name)

  0%|                                                                                          | 0/818 [00:00<?, ?it/s]

Parsing 818 matches for IPL


100%|████████████████████████████████████████████████████████████████████████████████| 818/818 [03:54<00:00,  3.49it/s]


In [44]:
matches[0]['innings'][0]['1st innings']['deliveries'][0][0.1]

{'batsman': 'DA Warner',
 'bowler': 'TS Mills',
 'non_striker': 'S Dhawan',
 'runs': {'batsman': 0, 'extras': 0, 'total': 0}}

#### We need to first match all player names found in this dataset to the player_ids we have. Our player table contains full names (eg: David Warner) while this dataset has only the short names typically used in score cards (eg. DA Warner). So using a combination of fuzzy and boolean matching to map these

In [46]:
players = pd.read_csv(os.path.join(clean_data_path, "player.csv"))

### Creating a unique player names set with information from each ball of all the 817 IPL matches. Considering batsman bowler and non-striker

In [47]:
unique_player_names = set()
for match in tqdm(matches):
    for inning in match['innings']:
        for inning_number in inning:
            for ball in inning[inning_number]['deliveries']:
                for ball_number in ball:
                    unique_player_names.add(ball[ball_number]['batsman'])
                    unique_player_names.add(ball[ball_number]['bowler'])
                    unique_player_names.add(ball[ball_number]['non_striker'])

100%|██████████████████████████████████████████████████████████████████████████████| 816/816 [00:00<00:00, 3551.83it/s]


In [50]:
unique_player_names

{'A Ashish Reddy',
 'A Chandila',
 'A Chopra',
 'A Choudhary',
 'A Dananjaya',
 'A Flintoff',
 'A Kumble',
 'A Mishra',
 'A Mithun',
 'A Mukund',
 'A Nehra',
 'A Nel',
 'A Nortje',
 'A Singh',
 'A Symonds',
 'A Uniyal',
 'A Zampa',
 'AA Bilakhia',
 'AA Chavan',
 'AA Jhunjhunwala',
 'AA Kazi',
 'AA Noffke',
 'AB Agarkar',
 'AB Barath',
 'AB Dinda',
 'AB McDonald',
 'AB de Villiers',
 'AC Blizzard',
 'AC Gilchrist',
 'AC Thomas',
 'AC Voges',
 'AD Hales',
 'AD Mascarenhas',
 'AD Mathews',
 'AD Nath',
 'AD Russell',
 'AF Milne',
 'AG Murtaza',
 'AG Paunikar',
 'AJ Finch',
 'AJ Turner',
 'AJ Tye',
 'AL Menaria',
 'AM Nayar',
 'AM Rahane',
 'AM Salvi',
 'AN Ahmed',
 'AN Ghosh',
 'AP Dole',
 'AP Majumdar',
 'AP Tare',
 'AR Bawne',
 'AR Patel',
 'AS Joseph',
 'AS Rajpoot',
 'AS Raut',
 'AS Roy',
 'AS Yadav',
 'AT Carey',
 'AT Rayudu',
 'AUK Pathan',
 'Abdul Samad',
 'Abdur Razzak',
 'Abhishek Sharma',
 'Anand Rajan',
 'Anirudh Singh',
 'Ankit Sharma',
 'Ankit Soni',
 'Anureet Singh',
 'Arshde

### for each one of these unique players, we are trying to find a best match map from the players sheet and updating the player_display_name

In [51]:
for player_name in tqdm(unique_player_names):
    players_with_same_surname = players[players.player_name.str.contains(player_name.split(" ")[-1])]
    best_match_name = get_best_possible_name_match(player_name, players_with_same_surname)
    # if null string was returned, skip the player
    if best_match_name:
        update_player("player_display_name", player_name, "player_name", best_match_name)

100%|████████████████████████████████████████████████████████████████████████████████| 580/580 [00:37<00:00, 15.55it/s]


### Beyond this if we find any inaccuracies in the mapping, lets just manually correct them

## Venue table

In [52]:
unique_venues = set()
all_venues = []
for match in tqdm(matches):
    unique_venues.add(match['info']['venue'])
    all_venues.append(match['info']['venue'])

100%|████████████████████████████████████████████████████████████████████████████| 816/816 [00:00<00:00, 274815.49it/s]


### Trying to find if there are duplicate names for the same stadium

In [53]:
unique_venues_list = list(unique_venues)
similarity_scores = []
for i in range(len(unique_venues_list)-1):
    for j in range(i+1, len(unique_venues_list)):
        similarity_score = fuzz.WRatio(unique_venues_list[i], unique_venues_list[j])
        similarity_scores.append([i, j, similarity_score])

similarity_scores_sorted = sorted(similarity_scores, key=lambda x : x[2], reverse=True)
counter = 0
for pair in similarity_scores_sorted:
    print(f"{unique_venues_list[pair[0]]} - {unique_venues_list[pair[1]]} --> {pair[2]}")
    if counter > 20:
        break
print(Counter(all_venues))

M Chinnaswamy Stadium - M.Chinnaswamy Stadium --> 100
Punjab Cricket Association Stadium, Mohali - Punjab Cricket Association IS Bindra Stadium, Mohali --> 95
Saurashtra Cricket Association Stadium - Maharashtra Cricket Association Stadium --> 94
Himachal Pradesh Cricket Association Stadium - Maharashtra Cricket Association Stadium --> 87
M Chinnaswamy Stadium - Punjab Cricket Association Stadium, Mohali --> 86
M Chinnaswamy Stadium - Saurashtra Cricket Association Stadium --> 86
M Chinnaswamy Stadium - Vidarbha Cricket Association Stadium, Jamtha --> 86
M Chinnaswamy Stadium - Shaheed Veer Narayan Singh International Stadium --> 86
M Chinnaswamy Stadium - Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium --> 86
M Chinnaswamy Stadium - Himachal Pradesh Cricket Association Stadium --> 86
M Chinnaswamy Stadium - Nehru Stadium --> 86
M Chinnaswamy Stadium - Rajiv Gandhi International Stadium, Uppal --> 86
M Chinnaswamy Stadium - Dubai International Cricket Stadium --> 86
M Chinnaswamy S

### These stadiums have duplicates so removing them, but they need to be mapped to the same venue_id when iterating over the matches

In [54]:
if tournament_name == "IPL":
    unique_venues_list.remove("M.Chinnaswamy Stadium")
    unique_venues_list.remove("Punjab Cricket Association IS Bindra Stadium, Mohali")
if tournament_name == "BBL":
    unique_venues_list.remove("Brisbane Cricket Ground")

In [55]:
pad=0
# if venue.csv already exists, read it and append the new data into it. if it does not exist, create new file
if os.path.exists(os.path.join(clean_data_path, "venue.csv")):
    old_venue_csv = pd.read_csv(os.path.join(clean_data_path, "venue.csv"))
    pad = len(old_venue_csv)

venue_columns = ["venue_id", "venue_name"]
venue_csv = pd.DataFrame(columns = venue_columns)
    
for i in range(len(unique_venues_list)):
    venue_id = i+1+pad
    venue_name = unique_venues_list[i]
    venue_csv = venue_csv.append({"venue_id" : venue_id, 
                                  "venue_name" : venue_name}, ignore_index=True)
    
if os.path.exists(os.path.join(clean_data_path, "venue.csv")):
    old_venue_csv = pd.read_csv(os.path.join(clean_data_path, "venue.csv"))
    venue_csv = old_venue_csv.append(venue_csv)
    
venue_csv.to_csv(os.path.join(clean_data_path, "venue.csv"), index=False)

### Utility maps for tournaments, venues, teams and players

In [56]:
df_tournament = pd.read_csv(os.path.join(clean_data_path, "tournament.csv"))
df_tournament = df_tournament.loc[:, ~df_tournament.columns.str.contains('^Unnamed')]
tournament_id_map = dict(zip(df_tournament.tournament_name, df_tournament.tournament_id))
tournament_fullname_id_map = dict(zip(df_tournament.tournament_full_name, df_tournament.tournament_id))

In [58]:
df_venue = pd.read_csv(os.path.join(clean_data_path, "venue.csv"))
df_venue = df_venue.loc[:, ~df_venue.columns.str.contains('^Unnamed')]
venue_id_map = dict(zip(df_venue.venue_name, df_venue.venue_id))

# Mapping the duplicates as well to its correct venue ids
# IPL maps
if tournament_name == "IPL":
    venue_id_map["Punjab Cricket Association IS Bindra Stadium, Mohali"] = venue_id_map["Punjab Cricket Association Stadium, Mohali"]
    venue_id_map["M.Chinnaswamy Stadium"] = venue_id_map["M Chinnaswamy Stadium"]
# BBL maps
if tournament_name == "BBL":
    venue_id_map["Brisbane Cricket Ground"] = venue_id_map["Brisbane Cricket Ground, Woolloongabba"]

In [59]:
df_team = pd.read_csv(os.path.join(clean_data_path, "team.csv"))
df_team = df_team.loc[:, ~df_team.columns.str.contains('^Unnamed')]
team_id_map = dict(zip(df_team.team_name, df_team.team_id))

### Use this cell for all manual corrections to update/insert in players table

In [60]:
# updates

# IPL
update_player("player_display_name", "RG Sharma", "player_name", "Rohit Sharma")
update_player("player_display_name", "SA Yadav", "player_name", "Suryakumar Yadav")
update_player("player_display_name", "CRD Fernando", "player_name", "Dilhara Fernando")
update_player("player_display_name", "DPMD Jayawardene", "player_name", "Mahela Jayawardene")
update_player("player_display_name", "R Powell", "player_name", "Rovman Powell")
update_player("player_display_name", "RK Singh", "player_name", "Rinku Singh")
update_player("player_display_name", "JPR Scantlebury-Searles", "player_name", "Javon Searles")
update_player("player_display_name", "Milind Kumar", "player_name", "Milind Kumar")
update_player("player_display_name", "NB Singh", "player_name", "Nathu Singh")
update_player("player_display_name", "AS Yadav", "player_name", "Arjun Yadav")
update_player("player_display_name", "VRV Singh", "player_full_name", "Vikram Raj Vir Singh")
update_player("player_display_name", "R Bishnoi", "player_full_name", "Rajesh Bishnoi")
update_player("player_display_name", "KH Devdhar", "player_full_name", "Kedar Hemant Devdhar")
update_player("player_display_name", "Harmeet Singh (2)", "player_full_name", "Harmeet Singh")
update_player("player_display_name", "AV Wankhade", "player_full_name", "Apoorv Vijay Wankhade")
update_player("player_display_name", "B Aparajith", "player_full_name", "Baba Aparajith")
update_player("player_display_name", "Anmolpreet Singh", "player_full_name", "Anmolpreet Singh")
update_player("player_display_name", "A Mishra", "player_full_name", "Amit Mishra")
update_player("player_display_name", "", "player_full_name", "Aditya Mishra")
update_player("player_display_name", "B Kumar", "player_full_name", "Bhuvneshwar Kumar Singh")
update_player("player_display_name", "", "player_full_name", "Kumar Dinesh Boresa")



# BBL
update_player("player_display_name", "CJ Green", "player_full_name", "Christopher James Green")
update_player("player_display_name", "C Green", "player_full_name", "Cameron Green")
update_player("player_display_name", "DP Hughes", "player_full_name", "Daniel Peter Hughes")
update_player("player_display_name", "TH David", "player_full_name", "Timothy Hays David")



# inserts

# IPL
add_player("Ankit Soni", "Ankit Soni", "Ankit Soni", "Right-hand bat", "Legbreak googly", "02/08/1993", "2", str(team_id_map["Gujarat Lions"]))
add_player("J Suchith", "Jagadeesha Suchith", "Jagadeesha Suchith", "Left-hand bat", "Slow left-arm orthodox", "01/16/1994", "2", str(team_id_map["Kings XI Punjab"]))
add_player("SD Lad", "Siddhesh Lad", "Siddhesh Dinesh Lad", "Right-hand bat", "Right-arm offbreak", "23/05/92", "2", str(team_id_map["Mumbai Indians"]))
add_player("S Kaushik", "Shivil Kaushik", "Shivil Sharma Kaushik", "Left-hand bat", "Slow left-arm wrist-spin", "07/09/95", "2", str(team_id_map["Gujarat Lions"]))
add_player("KM Asif", "KM Asif", "KM Asif", "Right-hand bat", "Right-arm medium", "24/07/93", "2", str(team_id_map["Chennai Super Kings"]))
add_player("AS Roy", "Anukul Roy", "Anukul Sudhakar Roy", "Left-hand bat", "Slow left-arm orthodox", "30/11/98", "2", str(team_id_map["Mumbai Indians"]))
add_player("YBK Jaiswal", "Yashasvi Jaiswal", "Yashasvi Bhupendra Kumar Jaiswal", "Left-hand bat", "", "28/12/01", "2", str(team_id_map["Rajasthan Royals"]))
add_player("Abdul Samad", "Abdul Samad", "Abdul Samad", "Right-hand bat", "Right-arm legbreak", "10/28/01", "2", str(team_id_map["Sunrisers Hyderabad"]))
add_player("Kartik Tyagi", "Kartik Tyagi", "Kartik Tyagi", "Right-hand bat", "Right-arm fast", "11/08/00", "2", str(team_id_map["Rajasthan Royals"]))
add_player("Lalit Yadav", "Lalit Yadav", "Lalit Yadav", "Right-hand bat", "Right-arm offbreak", "03/01/97", "2", str(team_id_map["Delhi Daredevils"]))
add_player("Ravi Bishnoi", "Ravi Bishnoi", "Ravi Bishnoi", "Right-hand bat", "Legbreak googly", "05/09/00", "2", str(team_id_map["Kings XI Punjab"]))
add_player("RA Shaikh", "Rahil Shaikh", "Rahil Akhil Ahmed Shaikh", "Left-hand bat", "Left-arm medium", "12/06/85", "2", str(team_id_map["Mumbai Indians"]))
add_player("AN Ahmed", "AN Ahmed", "AN Ahmed", "Left-hand bat", "Left-arm medium", "12/06/85", "2", str(team_id_map["Mumbai Indians"]))
add_player("AA Kazi", "Abrar Kazi", "Abrar Anjum Kazi", "Left-hand bat", "Slow left-arm orthodox", "10/29/89", "2", str(team_id_map["Royal Challengers Bangalore"]))
add_player("T Mishra", "Tanmay Mishra", "Tanmay Mishra", "Right-hand bat", "Right-arm medium-fast", "12/22/86", "2", str(team_id_map["Deccan Chargers"]))

# BBL
add_player("AG Harriott", "Andrew Harriott", "Andrew Harriott", "Right-hand bat", "Right-arm medium-fast", "05/03/92", "4", str(team_id_map["Melbourne Renegades"]))
add_player("B Doggett", "Brendan Doggett", "Brendan Doggett", "Right-hand bat", "Right-arm fast-medium", "03/05/94", "4", str(team_id_map["Sydney Thunder"]) + "," + str(team_id_map["Brisbane Heat"]))
add_player("CP Simpson", "Chris Simpson", "Christopher Patrick Simpson", "Right-hand bat", "Right-arm offbreak", "09/01/82", "4", str(team_id_map["Sydney Thunder"]))
add_player("D Morton", "Daniel Morton", "Daniel Morton", "Right-hand bat", "Right-arm offbreak", "", "4", str(team_id_map["Perth Scorchers"]))
add_player("Dilbar Hussain", "Dilbar Hussain", "Dilbar Hussain", "Right-hand bat", "Right-arm fast-medium", "", "5", str(team_id_map["Melbourne Stars"]))
add_player("GA West", "GA West", "GA West", "", "", "", "", str(team_id_map["Adelaide Strikers"]))
add_player("H Kerr", "Hayden Kerr", "Hayden Kerr", "", "", "", "", str(team_id_map["Sydney Sixers"]))
add_player("Haris Rauf", "Haris Rauf", "Haris Rauf", "Right-hand bat", "Right-arm fast", "11/07/93", "5", str(team_id_map["Melbourne Stars"]))
add_player("J Fraser-McGurk", "Jake Fraser-McGurk", "Jake Fraser-McGurk", "Right-hand bat", "Legbreak googly", "11/04/02", "4", str(team_id_map["Melbourne Renegades"]))
add_player("CR Swan", "CR Swan", "CR Swan", "", "", "", "4", "72")
add_player("JA Prestwidge", "JA Prestwidge", "JA Prestwidge", "", "", "", "4", "199, 72")
add_player("JP Wood", "JP Wood", "JP Wood", "", "", "", "4", "72")
add_player("K White (sub)", "K White (sub)", "K White (sub)", "", "", "", "4", "512")
add_player("L Bowe", "L Bowe", "L Bowe", "", "", "", "4", "512, 568")
add_player("LR Morris", "LR Morris", "LR Morris", "", "", "", "4", "568")
add_player("M Perry", "M Perry", "M Perry", "", "", "", "4", "199")
add_player("MJ Owen", "MJ Owen", "MJ Owen", "", "", "", "4", "45")
add_player("NA McSweeney", "NA McSweeney", "NA McSweeney", "", "", "", "4", "199")
add_player("NT Ellis", "NT Ellis", "NT Ellis", "", "", "", "4", "45")
add_player("Noor Ahmad", "Noor Ahmad", "Noor Ahmad", "", "", "", "4", "199")
add_player("O Davies", "O Davies", "O Davies", "", "", "", "4", "512")
add_player("P Hatzoglou", "P Hatzoglou", "P Hatzoglou", "", "", "", "4", "199")
add_player("RJG Lockyear", "RJG Lockyear", "RJG Lockyear", "", "", "", "4", "45, 512")
add_player("RR Ayre (sub)", "RR Ayre (sub)", "RR Ayre (sub)", "", "", "", "4", "455")
add_player("SJ Coyte (2)", "SJ Coyte (2)", "SJ Coyte (2)", "", "", "", "4", "512")
add_player("T Sangha", "T Sangha", "T Sangha", "", "", "", "4", "512")
add_player("XC Bartlett", "XC Bartlett", "XC Bartlett", "", "", "", "4", "72")
add_player("K White", "K White", "K White", "", "", "", "4", "447")
add_player("RR Ayre", "RR Ayre", "RR Ayre", "", "", "", "4", "241")



In [61]:
df_player = pd.read_csv(os.path.join(clean_data_path, "player.csv"))
df_player = df_player.loc[:, ~df_player.columns.str.contains('^Unnamed')]
player_id_map = dict(zip(df_player.player_display_name, df_player.player_id))

In [62]:
player_id_map

{nan: 9445,
 'Z Khan': 3,
 'DNT Zoysa': 17,
 'VH Zol': 53,
 'EJG Morgan': 94,
 'TS Mills': 128,
 'MJ Lumb': 136,
 'P Kumar': 186,
 'A Zampa': 197,
 'LS Livingstone': 216,
 'E Lewis': 255,
 'AD Mascarenhas': 292,
 'GR Napier': 490,
 'MM Ali': 553,
 'SS Agarwal': 581,
 'Mujeeb Ur Rahman': 604,
 'Mohammad Nabi': 608,
 'Rashid Khan': 3949,
 'A Singh': 793,
 'S Lamichhane': 807,
 'Mustafizur Rahman': 876,
 'Mohammad Ashraful': 900,
 'Mashrafe Mortaza': 909,
 'Shakib Al Hasan': 974,
 'Abdur Razzak': 1211,
 'RN ten Doeschate': 1275,
 'RE van der Merwe': 1286,
 'RW Price': 1506,
 'T Taibu': 1546,
 'S Sohal': 1603,
 'R Sharma': 1627,
 'Harmeet Singh': 1628,
 'Gagandeep Singh': 2367,
 'O Thomas': 1640,
 'JE Taylor': 1644,
 'CJ Anderson': 1674,
 'DAJ Bracewell': 1697,
 'TA Boult': 1698,
 'SE Bond': 1699,
 'C de Grandhomme': 1755,
 'JEC Franklin': 1775,
 'SP Fleming': 1782,
 'LH Ferguson': 1788,
 'MJ Guptill': 1792,
 'MJ Henry': 1820,
 'SC Kuggeleijn': 1845,
 'C Munro': 1868,
 'AF Milne': 1884,
 '

## Utility code to catch exceptions and add those entries to the player database. 
### 1) Run this cell
### 2) copy all the print statements and add it to the updates cell and run it.
### 3) Finally run the above cell to update player maps. 
### This is to be done so that ingestion of matches and balls takes place smoothly without any exceptions

In [63]:
print_statements = set()
country_id = 4 # set this to the host nation for maximum correct values

for match in matches:
        
    if match["info"]["toss"]["decision"] == "field":
        team_name_batting_second = match["info"]["toss"]["winner"]
        if team_name_batting_second == match["info"]["teams"][0]:
            team_name_batting_first = match["info"]["teams"][1]
        elif team_name_batting_second == match["info"]["teams"][1]:
            team_name_batting_first = match["info"]["teams"][0]
    elif match["info"]["toss"]["decision"] == "bat":
        team_name_batting_first = match["info"]["toss"]["winner"]
        if team_name_batting_first == match["info"]["teams"][0]:
            team_name_batting_second = match["info"]["teams"][1]
        elif team_name_batting_first == match["info"]["teams"][1]:
            team_name_batting_second = match["info"]["teams"][0]
    
    for inning in match['innings']:
        for inning_number in inning:
            for ball in inning[inning_number]['deliveries']:
                for ball_number in ball:
                    
                    try:
                        batsman = player_id_map[ball[ball_number]['batsman']]
                    except Exception as e:
                        if int(inning_number[0]) % 2 == 1:
                            player_nf = ball[ball_number]['batsman']
                            print_statement = f'add_player("{player_nf}", "{player_nf}", "{player_nf}", "", "", "", "{country_id}", "{str(team_id_map[team_name_batting_first])}")'
                            print_statements.add(print_statement)
                        else:
                            player_nf = ball[ball_number]['batsman']
                            print_statement = f'add_player("{player_nf}", "{player_nf}", "{player_nf}", "", "", "", "{country_id}", "{str(team_id_map[team_name_batting_second])}")'
                            print_statements.add(print_statement)
                    
                    try:
                        bowler = player_id_map[ball[ball_number]['bowler']]
                    except Exception as e:
                        if int(inning_number[0]) % 2 == 1:
                            player_nf = ball[ball_number]['bowler']
                            print_statement = f'add_player("{player_nf}", "{player_nf}", "{player_nf}", "", "", "", "{country_id}", "{str(team_id_map[team_name_batting_second])}")'
                            print_statements.add(print_statement)
                        else:
                            player_nf = ball[ball_number]['bowler']
                            print_statement = f'add_player("{player_nf}", "{player_nf}", "{player_nf}", "", "", "", "{country_id}", "{str(team_id_map[team_name_batting_first])}")'
                            print_statements.add(print_statement)
                            
                    try:
                        non_striker = player_id_map[ball[ball_number]['non_striker']]
                    except Exception as e:
                        if int(inning_number[0]) % 2 == 1:
                            player_nf = ball[ball_number]['non_striker']
                            print_statement = f'add_player("{player_nf}", "{player_nf}", "{player_nf}", "", "", "", "{country_id}", "{str(team_id_map[team_name_batting_first])}")'
                            print_statements.add(print_statement)
                        else:
                            player_nf = ball[ball_number]['non_striker']
                            print_statement = f'add_player("{player_nf}", "{player_nf}", "{player_nf}", "", "", "", "{country_id}", "{str(team_id_map[team_name_batting_second])}")'
                            print_statements.add(print_statement)
                    

                    if "wicket" in ball[ball_number]:
                        try:
                            player_dismissed = player_id_map[ball[ball_number]["wicket"]["player_out"]]
                            dismissal_type = ball[ball_number]["wicket"]["kind"]
                        except Exception as e:
                            pass

                        if "fielders" in ball[ball_number]["wicket"]:
                            
                            try:
                                # There is "(sub)" when a substitute fielder is involved in a wicket
                                fielders = ",".join([str(player_id_map[fielder.replace(" (sub)", "")]) for fielder in ball[ball_number]["wicket"]["fielders"]])
                            except Exception as e:
                                if int(inning_number[0]) % 2 == 1:
                                    player_nf = ball[ball_number]['wicket']['fielders'][0].replace(" (sub)", "")
                                    print_statement = f'add_player("{player_nf}", "{player_nf}", "{player_nf}", "", "", "", "{country_id}", "{str(team_id_map[team_name_batting_second])}")'
                                    print_statements.add(print_statement)
                                else:
                                    player_nf = ball[ball_number]['wicket']['fielders'][0].replace(" (sub)", "")
                                    print_statement = f'add_player("{player_nf}", "{player_nf}", "{player_nf}", "", "", "", "{country_id}", "{str(team_id_map[team_name_batting_first])}")'
                                    print_statements.add(print_statement)
for stmt in sorted(list(print_statements)):
    print(stmt)

## Match and Ball table

In [64]:
match_columns = ["match_id", "tournament_id", "venue_id", "match_date", "team_1", "team_2", "toss_winner", "toss_decision", "player_of_match", 
                 "match_winner", "match_win_by_runs", "match_win_by_wickets", "highlights_url", "match_url", "match_description"]
#match_csv = pd.DataFrame(columns = match_columns)

ball_columns = ["ball_id", "match_id", "ball_number", "innings_number", "batsman", "bowler", "non_striker", "batsman_runs", "wide_runs", "noball_runs",
                "bye_runs", "legbye_runs", "extras_runs", "total_runs", "player_dismissed", "dismissal_type", "fielders", "ball_description", "ball_url"]
#ball_csv = pd.DataFrame(columns = ball_columns)

if os.path.exists("./clean_data/match.csv"):
    old_match_csv = pd.read_csv("./clean_data/match.csv")
    i = len(old_match_csv)
else:
    i=0
    
if os.path.exists("./clean_data/ball.csv"):
    old_ball_csv = pd.read_csv("./clean_data/ball.csv")
    j = len(old_ball_csv)
else:
    j=0
    
match_csv = {}
ball_csv = {}

for match in tqdm((matches), position=0, leave=True):
    
    # First adding entries to the match table
    try:
        
        match_id = i
        if match['info']['competition'] in tournament_id_map:
            tournament_id = tournament_id_map[match['info']['competition']] # For ODI and TEST "competition" to be changed to "match_Type"
        elif match['info']['competition'] in tournament_fullname_id_map:
            tournament_id = tournament_fullname_id_map[match['info']['competition']]
            
        venue_id = venue_id_map[match['info']['venue']]
        
        # if date is a date instance, parse and read. if not directly read
        match_date = [",".join([str(date.strftime('%Y-%m-%d')) if isinstance(date, datetime.date) else str(date) for date in match['info']['dates']])]

        team_1_best_fuzzy_match = process.extractOne(match['info']['teams'][0], team_id_map.keys())[0]
        team_2_best_fuzzy_match = process.extractOne(match['info']['teams'][1], team_id_map.keys())[0]
        team_1 = team_id_map[team_1_best_fuzzy_match]
        team_2 = team_id_map[team_2_best_fuzzy_match]

        toss_winner_best_fuzzy_match = process.extractOne(match['info']['toss']['winner'], team_id_map.keys())[0]
        toss_winner = team_id_map[toss_winner_best_fuzzy_match]
        toss_decision = match['info']['toss']['decision']
        
        # Match was canceled/washed out
        if "result" in match['info']['outcome'] and match['info']['outcome']['result'] == "no result":
            match_winner = "NA"
            match_win_by_wickets = "NA"
            match_win_by_runs = "NA"
            player_of_match = "NA"
            
        # Match was tied
        elif "result" in match['info']['outcome'] and match['info']['outcome']['result'] == "tie":
            match_winner = "TIE"
            match_win_by_wickets = "NA"
            match_win_by_runs = "NA"
            player_of_match = ",".join([str(player_id_map[player]) for player in match['info']['player_of_match']])

        elif "runs" in match['info']['outcome']['by']:
            match_win_by_runs = match['info']['outcome']['by']['runs']
            match_win_by_wickets = 'NA'
            match_winner_best_fuzzy_match = process.extractOne(match['info']['outcome']['winner'], team_id_map.keys())[0]
            match_winner = team_id_map[match_winner_best_fuzzy_match]
            player_of_match = ",".join([str(player_id_map[player]) for player in match['info']['player_of_match']])

        elif "wickets" in match['info']['outcome']['by']:
            match_win_by_wickets = match['info']['outcome']['by']['wickets']
            match_win_by_runs = 'NA'
            match_winner_best_fuzzy_match = process.extractOne(match['info']['outcome']['winner'], team_id_map.keys())[0]
            match_winner = team_id_map[match_winner_best_fuzzy_match]
            player_of_match = ",".join([str(player_id_map[player]) for player in match['info']['player_of_match']])
        
        # For ambitious future deep learning projects
        highlights_url = 'NA'
        match_url = 'NA'
        match_description = 'NA'
        
        match_csv[i] = {
                        "match_id" : match_id, 
                        "tournament_id" : tournament_id,
                        "venue_id" : venue_id,
                        "match_date" : match_date, 
                        "team_1" : team_1,
                        "team_2" : team_2,
                        "toss_winner" : toss_winner,
                        "toss_decision" : toss_decision,
                        "player_of_match" : player_of_match,
                        "match_winner" : match_winner,
                        "match_win_by_runs" : match_win_by_runs,
                        "match_win_by_wickets" : match_win_by_wickets,
                        "highlights_url" : highlights_url,
                        "match_url" : match_url,
                        "match_description" : match_description
        }
    
    except Exception as e:
        
        print("Exception happened in match: ", e)
        print(match['info'])
        
        break
    
    
    # Then adding entries to the ball table
    
    try:
        for inning in match['innings']:
            for inning_number in inning:
                for ball in inning[inning_number]['deliveries']:
                    for ball_number in ball:
                        ball_id = j
                        innings_number = inning_number[0]
                        batsman = player_id_map[ball[ball_number]['batsman']]
                        bowler = player_id_map[ball[ball_number]['bowler']]
                        non_striker = player_id_map[ball[ball_number]['non_striker']]
                        
                        wide_runs = 0
                        bye_runs = 0
                        noball_runs = 0
                        legbye_runs = 0
                        batsman_runs = 0
                        extras_runs = 0
                        total_runs = 0
                        
                        player_dismissed = "NA"
                        dismissal_type = "NA"
                        fielders = "NA"
                        
                        if "extras" in ball[ball_number]:
                            if "wides" in ball[ball_number]["extras"]:
                                wide_runs = ball[ball_number]["extras"]["wides"]
                            if "legbyes" in ball[ball_number]["extras"]:
                                legbye_runs = ball[ball_number]["extras"]["legbyes"]
                            if "noballs" in ball[ball_number]["extras"]:
                                noball_runs = ball[ball_number]["extras"]["noballs"]
                            if "byes" in ball[ball_number]["extras"]:
                                bye_runs = ball[ball_number]["extras"]["byes"]
                        
                        if "runs" in ball[ball_number]:
                            batsman_runs = ball[ball_number]["runs"]["batsman"]
                            extras_runs = ball[ball_number]["runs"]["extras"]
                            total_runs = ball[ball_number]["runs"]["total"]
                            
                        if "wicket" in ball[ball_number]:
                            player_dismissed = player_id_map[ball[ball_number]["wicket"]["player_out"]]
                            dismissal_type = ball[ball_number]["wicket"]["kind"]
                            
                            if "fielders" in ball[ball_number]["wicket"]:
                                # There is "(sub)" when a substitute fielder is involved in a wicket
                                fielders = ",".join([str(player_id_map[fielder.replace(" (sub)", "")]) for fielder in ball[ball_number]["wicket"]["fielders"]])
                        
                        # For ambitious future deep learning projects
                        ball_description = "NA"
                        ball_url = "NA"
                        
                        ball_csv[j] = {
                                        "ball_id" : j, 
                                        "match_id" : match_id,
                                        "ball_number" : ball_number,
                                        "innings_number" : innings_number, 
                                        "batsman" : batsman,
                                        "bowler" : bowler,
                                        "non_striker" : non_striker,
                                        "batsman_runs" : batsman_runs,
                                        "wide_runs" : wide_runs,
                                        "bye_runs" : bye_runs,
                                        "noball_runs" : noball_runs,
                                        "legbye_runs" : legbye_runs,
                                        "extras_runs" : extras_runs,
                                        "total_runs" : total_runs,
                                        "player_dismissed" : player_dismissed,
                                        "dismissal_type" : dismissal_type,
                                        "fielders" : fielders,
                                        "ball_description" : ball_description,
                                        "ball_url" : ball_url,
                        }
                        
                        j += 1
    
    except Exception as e:
        print(f"Exception {e} happened in ball number {ball_number} ")
        print(f"ball {ball[ball_number]}")
        print(f"match info: {match['info']}")
        break
        
    i += 1

100%|████████████████████████████████████████████████████████████████████████████████| 816/816 [01:39<00:00,  8.18it/s]


In [65]:
df_match_csv = pd.DataFrame.from_dict(match_csv, "index")
df_ball_csv = pd.DataFrame.from_dict(ball_csv, "index")

if os.path.exists(os.path.join(clean_data_path, "match.csv")):
    old_match_csv = pd.read_csv(os.path.join(clean_data_path, "match.csv"))
    df_match_csv = old_match_csv.append(df_match_csv)
    
if os.path.exists(os.path.join(clean_data_path, "ball.csv")):
    old_ball_csv = pd.read_csv(os.path.join(clean_data_path, "ball.csv"))
    df_ball_csv = old_ball_csv.append(df_ball_csv)


df_match_csv.to_csv(os.path.join(clean_data_path, "match.csv"), index=False)
df_ball_csv.to_csv(os.path.join(clean_data_path, "ball.csv"), index=False)