In [1]:
# Basic
from collections import Counter
import math
import numpy as np
import os
import pprint

# yaml specific
import yaml

# Data handling
import pandas as pd
from tqdm import tqdm

In [2]:
# Config variables
raw_data_path = "raw_data"
clean_data_path = "clean_data"
tournament_name = "IPL"

### Utility functions. 

In [3]:
def get_clean_teams(major_teams):
    # remove bad chars from string
    bad_chars = [';', ':', '!', "*", "'", '"', '[', ']', ',']
    teams = []
    for team in major_teams.split(", "):
        for char in bad_chars:
            team = team.replace(char, "")
        teams.append(team.strip().rstrip())
    return teams

# This notebook will be used to curate 4 tables. 
## 1) Country 2) Tournament 3) Team 4) Player
### Initially handling everything through CSVs. Later, we will see if using a DB is required

In [4]:
players = pd.read_csv("raw_data/players.csv")
players = players.loc[:, ~players.columns.str.contains('^Unnamed')]

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


## Country table

### Focusing only on the top 20 countries with professional cricket players

In [5]:
countries = []
for key, value in players["COUNTRY"].value_counts().nlargest(20).iteritems():
    print(f"Number of players in {key} = {value}")
    countries.append(key)

Number of players in England = 20908
Number of players in India = 13257
Number of players in South Africa = 10095
Number of players in Australia = 6921
Number of players in Pakistan = 6846
Number of players in Sri Lanka = 6079
Number of players in New Zealand = 3731
Number of players in United Arab Emirates = 3331
Number of players in West Indies = 3317
Number of players in Bangladesh = 1815
Number of players in Afghanistan = 1410
Number of players in Zimbabwe = 1237
Number of players in Ireland = 794
Number of players in Scotland = 663
Number of players in United States of America = 618
Number of players in Netherlands = 495
Number of players in Malaysia = 462
Number of players in Canada = 436
Number of players in Hong Kong = 428
Number of players in Nepal = 411


In [6]:
country_columns = ["country_id", "country_name"]
country_csv = pd.DataFrame(columns = country_columns)
for i in range(len(countries)):
    country_id = i+1
    country_name = countries[i]
    country_csv = country_csv.append({"country_id" : country_id, 
                                      "country_name" : country_name}, ignore_index=True)

In [7]:
country_csv

Unnamed: 0,country_id,country_name
0,1,England
1,2,India
2,3,South Africa
3,4,Australia
4,5,Pakistan
5,6,Sri Lanka
6,7,New Zealand
7,8,United Arab Emirates
8,9,West Indies
9,10,Bangladesh


In [8]:
country_csv.to_csv(os.path.join(clean_data_path, "country.csv"), index=False)

## Tournament table

### Each zip file in https://cricsheet.org/ is going to be considered as a separate tournament. This list is going to be manually curated due to its short size

In [9]:
tournaments = [
    {
        "tournament_name" : "TEST",
        "tournament_format" : "TEST",
        "tournament_full_name" : "Test International Match"
    },
    {
        "tournament_name" : "ODI",
        "tournament_format" : "ODI",
        "tournament_full_name" : "ODI Match"
    },
    {
        "tournament_name" : "T20I",
        "tournament_format" : "T20",
        "tournament_full_name" : "T20 International Match"
    },
    {
        "tournament_name" : "BBL",
        "tournament_format" : "T20",
        "tournament_full_name" : "Big Bash League"
    },
    {
        "tournament_name" : "IPL",
        "tournament_format" : "T20",
        "tournament_full_name" : "Indian Premier Leargue"
    },
    {
        "tournament_name" : "CPL",
        "tournament_format" : "T20",
        "tournament_full_name" : "Caribbean Premier Leargue"
    },
    {
        "tournament_name" : "T20B",
        "tournament_format" : "T20",
        "tournament_full_name" : "T20 Blast"
    },
    {
        "tournament_name" : "PSL",
        "tournament_format" : "T20",
        "tournament_full_name" : "Pakistan Super League"
    },
    {
        "tournament_name" : "SSM",
        "tournament_format" : "T20",
        "tournament_full_name" : "Super Smash"
    },
    {
        "tournament_name" : "WBB",
        "tournament_format" : "T20",
        "tournament_full_name" : "Womens Big Bash League"
    }
]

In [10]:
tournament_columns = ["tournament_id", "tournament_name", "tournament_format", "tournament_full_name"]
tournament_csv = pd.DataFrame(columns = tournament_columns)
for i in range(len(tournaments)):
    tournament_id = i+1
    tournament_name = tournaments[i]["tournament_name"]
    tournament_format = tournaments[i]["tournament_format"]
    tournament_full_name = tournaments[i]["tournament_full_name"]
    tournament_csv = tournament_csv.append({"tournament_id" : tournament_id, 
                                      "tournament_name" : tournament_name,
                                      "tournament_format" : tournament_format, 
                                      "tournament_full_name" : tournament_full_name}, ignore_index=True)

In [11]:
tournament_csv

Unnamed: 0,tournament_id,tournament_name,tournament_format,tournament_full_name
0,1,TEST,TEST,Test International Match
1,2,ODI,ODI,ODI Match
2,3,T20I,T20,T20 International Match
3,4,BBL,T20,Big Bash League
4,5,IPL,T20,Indian Premier Leargue
5,6,CPL,T20,Caribbean Premier Leargue
6,7,T20B,T20,T20 Blast
7,8,PSL,T20,Pakistan Super League
8,9,SSM,T20,Super Smash
9,10,WBB,T20,Womens Big Bash League


In [12]:
tournament_csv.to_csv(os.path.join(clean_data_path, "tournament.csv"), index=False)

## Team table


### Considering only those teams where the players belong to the above 20 countries and aged < 50 (as of 2019)

In [13]:
# considering only the top 10 countries as the remaining countries barely have 10 important players that can be seperately ingested. 
required_countries = countries[:-10]
required_columns = ['NAME', 'COUNTRY', 'Full name', 'Birthdate', 'Major teams', 'Batting style', 'Bowling style', 'Other']
required_players = players[(players["COUNTRY"].isin(countries)) & (players["Age"] < 50)][required_columns]

In [14]:
teams = []
for index, row in tqdm(required_players.iterrows()):
    # skip columns that are not string
    if type(row["Major teams"]) is not str:
        continue
    for team in get_clean_teams(row["Major teams"]):
        teams.append(team)

34016it [00:02, 14864.18it/s]


### If a team has less than 'threshold' registered professional players, ignore that team

In [15]:
threshold = 30
threshold_count = 0
teams_set = set()
for team, count in Counter(teams).most_common():
    if count > threshold:
        threshold_count += 1
        teams_set.add(team)
teams_list = list(teams_set)

In [16]:
len(teams_list)

484

In [17]:
team_columns = ["team_id", "team_name", "team_label"]
team_csv = pd.DataFrame(columns = team_columns)
for i in tqdm(range(len(teams_list))):
    team_id = i+1
    team_name = teams_list[i]
    team_label = "".join(word[0] for word in team_name.split(" "))
    team_csv = team_csv.append({"team_id" : team_id, 
                                "team_name" : team_name, 
                                "team_label" : team_label}, ignore_index=True)

100%|███████████████████████████████████████████████████████████████████████████████| 484/484 [00:00<00:00, 516.27it/s]


In [18]:
team_csv

Unnamed: 0,team_id,team_name,team_label
0,1,Tamil Union Cricket and Athletic Club,TUCaAC
1,2,Australia,A
2,3,Wellington,W
3,4,Nottinghamshire,N
4,5,North West Under-13s,NWU
...,...,...,...
479,480,Wales Minor Counties,WMC
480,481,Otago Women,OW
481,482,Wiltshire,W
482,483,Shakthi Ladies,SL


In [19]:
team_csv.to_csv(os.path.join(clean_data_path, "team.csv"), index=False)

## Player table

### Considering only those players who belong to the 20 countries and aged < 50 (as of 2019)

### Utility maps to ingest players

In [20]:
df_country = pd.read_csv(os.path.join(clean_data_path, "country.csv"))
df_country = df_country.loc[:, ~df_country.columns.str.contains('^Unnamed')]
country_id_map = dict(zip(df_country.country_name, df_country.country_id))

In [21]:
df_team = pd.read_csv(os.path.join(clean_data_path, "team.csv"))
df_team = df_team.loc[:, ~df_team.columns.str.contains('^Unnamed')]
team_id_map = dict(zip(df_team.team_name, df_team.team_id))

In [22]:
players = []
for index, row in tqdm(required_players.iterrows()):
    
    # If player doesnt have a proper team skip
    if type(row["Major teams"]) is not str:
        continue
    
    # Getting the list of team ids this player has played in
    team_list_ids = ""
    for team in get_clean_teams(row["Major teams"]):
        if team in team_id_map:
            team_list_ids += str(team_id_map[team]) + ","
            
    # If the player doesnt play in atleast 2 teams skip them
    if len(team_list_ids.split(",")) < 3 :
        continue
        
    # removing last comma
    team_list_ids = team_list_ids[:-1]
    
    player = {}
    player["player_name"] = row["NAME"]
    player["player_full_name"] = row["Full name"]
    player["batting_style"] = row["Batting style"]
    player["bowling_style"] = row["Bowling style"]
    player["birthdate"] = row["Birthdate"]
    player["country_id"] = country_id_map[row["COUNTRY"]]
    player["list_team_ids"] = team_list_ids
    
    players.append(player)

34016it [00:02, 12908.57it/s]


In [23]:
player_columns = ["player_id", "player_display_name", "player_name", "player_full_name", "batting_style", "bowling_style", "birthdate", "country_id"]
player_csv = pd.DataFrame(columns = player_columns)
for i in tqdm(range(len(players))):
    player_id = i+1
    player_display_name = "" # This needs to be updated later with fuzzy matching
    player_name = players[i]["player_name"]
    player_full_name = players[i]["player_full_name"]
    batting_style = players[i]["batting_style"]
    bowling_style = players[i]["bowling_style"]
    birthdate = players[i]["birthdate"]
    country_id = players[i]["country_id"]
    list_team_ids = players[i]["list_team_ids"]
    player_csv = player_csv.append({"player_id" : player_id, 
                                      "player_display_name" : player_display_name,
                                      "player_name" : player_name,
                                      "player_full_name" : player_full_name,
                                      "batting_style" : batting_style,
                                      "bowling_style" : bowling_style,
                                      "birthdate" : birthdate,
                                      "country_id" : country_id,
                                      "list_team_ids" : list_team_ids
                                     }, ignore_index=True)

100%|█████████████████████████████████████████████████████████████████████████████| 7913/7913 [00:29<00:00, 269.37it/s]


In [24]:
player_csv.to_csv(os.path.join(clean_data_path, "player.csv"), index=False)