In [1]:
# Basic
from collections import Counter
import math
import numpy as np
import os
import pprint

# yaml specific
import yaml

# Data handling
import pandas as pd
from tqdm import tqdm

In [2]:
# Config variables
raw_data_path = "raw_data"
clean_data_path = "clean_data"
tournament_name = "IPL"

## Utility functions. Move them to separate files later

In [3]:
def parse_yaml(path):
    """
    Parses a given yaml file and returns the object
    Args:
        path - path of the yaml file to be parsed
    """
    with open(path, 'r') as stream:
        try:
            data = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(exc)
    return data

In [4]:
def get_all_matches_raw_data(tournament_name):
    tournament_path = os.path.join(raw_data_path, tournament_name)
    match_files = os.listdir(tournament_path)
    matches = []
    print(f"Parsing {len(match_files)} matches for {tournament_name}")
    for match_file in tqdm(match_files):
        match_path = os.path.join(tournament_path, match_file)
        match = parse_yaml(match_path)
        matches.append(match)
    return matches

In [5]:
def get_clean_teams(major_teams):
    # remove bad chars from string
    bad_chars = [';', ':', '!', "*", "'", '"', '[', ']', ',']
    teams = []
    for team in major_teams.split(", "):
        for char in bad_chars:
            team = team.replace(char, "")
        teams.append(team.strip().rstrip())
    return teams

# Defining tables
### Initially handling everything through CSVs. Later, we will see if using a DB is required

In [6]:
players = pd.read_csv("raw_data/players.csv")
players = players.loc[:, ~players.columns.str.contains('^Unnamed')]

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


## Country table

### Focusing only on the top 20 countries with professional cricket players

In [7]:
countries = []
for key, value in players["COUNTRY"].value_counts().nlargest(20).iteritems():
    print(f"{key} --> {value}")
    countries.append(key)

England --> 20908
India --> 13257
South Africa --> 10095
Australia --> 6921
Pakistan --> 6846
Sri Lanka --> 6079
New Zealand --> 3731
United Arab Emirates --> 3331
West Indies --> 3317
Bangladesh --> 1815
Afghanistan --> 1410
Zimbabwe --> 1237
Ireland --> 794
Scotland --> 663
United States of America --> 618
Netherlands --> 495
Malaysia --> 462
Canada --> 436
Hong Kong --> 428
Nepal --> 411


In [8]:
country_columns = ["country_id", "country_name"]
country_csv = pd.DataFrame(columns = country_columns)
for i in range(len(countries)):
    country_id = i+1
    country_name = countries[i]
    country_csv = country_csv.append({"country_id" : country_id, 
                                      "country_name" : country_name}, ignore_index=True)

In [9]:
country_csv

Unnamed: 0,country_id,country_name
0,1,England
1,2,India
2,3,South Africa
3,4,Australia
4,5,Pakistan
5,6,Sri Lanka
6,7,New Zealand
7,8,United Arab Emirates
8,9,West Indies
9,10,Bangladesh


In [10]:
country_csv.to_csv(os.path.join(clean_data_path, "country.csv"), index=False)

## Team table


### Considering only those teams where the players belong to the above 20 countries and aged < 50 (as of 2019)

In [11]:
required_columns = ['NAME', 'COUNTRY', 'Full name', 'Birthdate', 'Major teams', 'Batting style', 'Bowling style', 'Other']
required_players = players[(players["COUNTRY"].isin(countries)) & (players["Age"] < 50)][required_columns]

In [12]:
teams = []
for index, row in tqdm(required_players.iterrows()):
    #print(f'{row["NAME"]} -- > {row["Major teams"]}')
    # skip columns that are not string
    if type(row["Major teams"]) is not str:
        continue
    for team in get_clean_teams(row["Major teams"]):
        teams.append(team)

34016it [00:02, 15088.93it/s]


### If a team has less than 'threshold' registered professional players, ignore that team

In [13]:
threshold = 10
threshold_count = 0
teams_set = set()
for team, count in Counter(teams).most_common():
    if count > threshold:
        threshold_count += 1
        teams_set.add(team)

In [14]:
teams_list = list(teams_set)

In [15]:
team_columns = ["team_id", "team_name", "team_label"]
team_csv = pd.DataFrame(columns = team_columns)
for i in tqdm(range(len(teams_list))):
    team_id = i+1
    team_name = teams_list[i]
    team_label = "".join(word[0] for word in team_name.split(" "))
    team_csv = team_csv.append({"team_id" : team_id, 
                                "team_name" : team_name, 
                                "team_label" : team_label}, ignore_index=True)

100%|█████████████████████████████████████████████████████████████████████████████| 1216/1216 [00:02<00:00, 522.96it/s]


In [16]:
team_csv

Unnamed: 0,team_id,team_name,team_label
0,1,Tasmania Under-17s,TU
1,2,Australian Capital Territory Under-23s,ACTU
2,3,Assam Under-14s,AU
3,4,Middlesex Cricket Board,MCB
4,5,Sri Lanka Cricket Development XI,SLCDX
...,...,...,...
1211,1212,Canterbury A,CA
1212,1213,Karachi Port Trust,KPT
1213,1214,Australia,A
1214,1215,Kurunegala Youth Cricket Club,KYCC


In [17]:
team_csv.to_csv(os.path.join(clean_data_path, "team.csv"), index=False)

## Player table

### Considering only those players who belong to the 20 countries and aged < 50 (as of 2019)

### Utility maps to ingest players

In [19]:
df_country = pd.read_csv(os.path.join(clean_data_path, "country.csv"))
df_country = df_country.loc[:, ~df_country.columns.str.contains('^Unnamed')]
country_id_map = dict(zip(df_country.country_name, df_country.country_id))

In [20]:
df_team = pd.read_csv(os.path.join(clean_data_path, "team.csv"))
df_team = df_team.loc[:, ~df_team.columns.str.contains('^Unnamed')]
team_id_map = dict(zip(df_team.team_name, df_team.team_id))

In [21]:
team_id_map

{'Tasmania Under-17s': 1,
 'Australian Capital Territory Under-23s': 2,
 'Assam Under-14s': 3,
 'Middlesex Cricket Board': 4,
 'Sri Lanka Cricket Development XI': 5,
 'England Cricket Board XI': 6,
 'Takhar Province Under-16s': 7,
 'West Indies Women': 8,
 'Federally Administered Tribal Areas': 9,
 'Sapphires': 10,
 'Cricket Association of Bengal Under-17s': 11,
 'Andhra Under-14s': 12,
 'Jamaica Women': 13,
 'Kilinochchi Combined Schools': 14,
 'United Arab Emirates Women': 15,
 'Queensland Colts': 16,
 'Kaluthara Maha Vidyalaya': 17,
 'Andhra Women': 18,
 'Northerns': 19,
 'Northern Territory Under-19s': 20,
 'Sheikh Jamal Dhanmondi Club': 21,
 'Australia Under-21s Women': 22,
 'Eastern Province Under-19s': 23,
 'Panadura Sports Club': 24,
 'The Rest': 25,
 'Free State Under-13s': 26,
 'Pakistan Under-23s': 27,
 'Pakistan Under-15s': 28,
 'AMC Abbottabad': 29,
 'Sylhet Division': 30,
 'KwaZulu-Natal Under-13s': 31,
 'Surrey Cricket Board': 32,
 'Laghman Province Under-16s': 33,
 'Uni

In [22]:
players = []
count = 0
for index, row in tqdm(required_players.iterrows()):
    
    # If player doesnt have a proper team skip
    if type(row["Major teams"]) is not str:
        continue
        
    player = {}
    player["player_name"] = row["NAME"]
    player["player_full_name"] = row["Full name"]
    player["batting_style"] = row["Batting style"]
    player["bowling_style"] = row["Bowling style"]
    player["birthdate"] = row["Birthdate"]
    player["country_id"] = country_id_map[row["COUNTRY"]]
    
    team_list_ids = ""
    for team in get_clean_teams(row["Major teams"]):
        if team in team_id_map:
            team_list_ids += str(team_id_map[team]) + ","
    # removing last comma
    team_list_ids = team_list_ids[:-1]
    
    player["list_team_ids"] = team_list_ids
    players.append(player)

34016it [00:03, 11296.70it/s]


In [25]:
player_columns = ["player_id", "player_display_name", "player_name", "player_full_name", "batting_style", "bowling_style", "birthdate", "country_id"]
player_csv = pd.DataFrame(columns = player_columns)
for i in tqdm(range(len(players))):
    player_id = i+1
    player_display_name = "" # This needs to be updated later with fuzzy matching
    player_name = players[i]["player_name"]
    player_full_name = players[i]["player_full_name"]
    batting_style = players[i]["batting_style"]
    bowling_style = players[i]["bowling_style"]
    birthdate = players[i]["birthdate"]
    country_id = players[i]["country_id"]
    list_team_ids = players[i]["list_team_ids"]
    player_csv = player_csv.append({"player_id" : player_id, 
                                      "player_display_name" : player_display_name,
                                      "player_name" : player_name,
                                      "player_full_name" : player_full_name,
                                      "batting_style" : batting_style,
                                      "bowling_style" : bowling_style,
                                      "birthdate" : birthdate,
                                      "country_id" : country_id,
                                      "list_team_ids" : list_team_ids
                                     }, ignore_index=True)

100%|███████████████████████████████████████████████████████████████████████████| 32870/32870 [04:01<00:00, 136.29it/s]


In [30]:
player_csv[player_csv.player_name.str.contains("Kohli", na=False)]

Unnamed: 0,player_id,player_display_name,player_name,player_full_name,batting_style,bowling_style,birthdate,country_id,list_team_ids
4748,4749,,Somesh Kohli,Somesh K Kohli,,,06/05/86,16,6424511067937
9513,9514,,Virat Kohli,Virat Kohli,Right-hand bat,Right-arm medium,05/11/88,2,1035704658921186
9514,9515,,Taruwar Kohli,Taruwar Sushil Kohli,Right-hand bat,Right-arm medium,17/12/88,2,92262232445
9515,9516,,Parth Kohli,Parth Sachin Kohli,Right-hand bat,Legbreak googly,09/08/96,2,435659755
9536,9537,,Akanksha Kohli,Akanksha Kohli,Right-hand bat,Right-arm medium-fast,13/08/89,2,384


In [32]:
player_csv.to_csv(os.path.join(clean_data_path, "player.csv"), index=False)