In [1]:
# Basic
from collections import Counter
import datetime
import math
import numpy as np
import os
import pprint

# yaml specific
import yaml

# Data handling
from fuzzywuzzy import fuzz, process
import pandas as pd
from tqdm import tqdm

# my library
from db_utils import update_player, add_player 

In [2]:
# Config variables
# Both paths are relative directory names, resolved against the kernel's working directory.
raw_data_path = "raw_data"  # not read by any cell visible in this part of the notebook
clean_data_path = "clean_data"  # directory the CSV-loading cells join file names onto
# NOTE(review): presumably the default tournament code; no visible cell reads it — confirm before removing
tournament_name = "IPL"

In [31]:
# Tournament lookup: read tournament.csv, discard every column whose name
# starts with "Unnamed", then index tournament ids by tournament name.
df_tournament = pd.read_csv(os.path.join(clean_data_path, "tournament.csv")).loc[
    :, lambda frame: ~frame.columns.str.contains('^Unnamed')
]
tournament_id_map = dict(
    zip(df_tournament["tournament_name"], df_tournament["tournament_id"])
)

In [32]:
# Venue lookup: read venue.csv, discard every column whose name starts with
# "Unnamed", then index venue ids by venue name.
df_venue = pd.read_csv(os.path.join(clean_data_path, "venue.csv")).loc[
    :, lambda frame: ~frame.columns.str.contains('^Unnamed')
]
venue_id_map = dict(zip(df_venue["venue_name"], df_venue["venue_id"]))

# Duplicate spellings of a venue are pointed at the id of the correct venue.
venue_id_map.update({
    "Punjab Cricket Association IS Bindra Stadium, Mohali": 1,
    "M.Chinnaswamy Stadium": 23,
})

In [5]:
# Team lookup: read team.csv, discard every column whose name starts with
# "Unnamed", then index team ids by team name.
df_team = pd.read_csv(os.path.join(clean_data_path, "team.csv")).loc[
    :, lambda frame: ~frame.columns.str.contains('^Unnamed')
]
team_id_map = dict(zip(df_team["team_name"], df_team["team_id"]))

In [33]:
# Player lookup: read player.csv, discard every column whose name starts with
# "Unnamed", then index player ids by display name.
df_player = pd.read_csv(os.path.join(clean_data_path, "player.csv")).loc[
    :, lambda frame: ~frame.columns.str.contains('^Unnamed')
]
player_id_map = dict(zip(df_player["player_display_name"], df_player["player_id"]))

In [123]:
# Match-level and ball-by-ball tables; in both, every column whose name starts
# with "Unnamed" is discarded right after loading.
# NOTE(review): the output stored for this cell looks like the tail of a warning
# reported by IPython — if it is a pandas DtypeWarning, pinning `dtype=` on the
# offending read_csv call would silence it. Confirm on a fresh run.
df_match = pd.read_csv(os.path.join(clean_data_path, "match.csv")).loc[
    :, lambda frame: ~frame.columns.str.contains('^Unnamed')
]

df_ball = pd.read_csv(os.path.join(clean_data_path, "ball.csv")).loc[
    :, lambda frame: ~frame.columns.str.contains('^Unnamed')
]

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [142]:
def total_runs(player_id, tournaments=None, venue_id=None, past_n_years=None, overs_range=None, against_bowler=None, against_spin=None, against_pace=None):
    """
        Balls faced by a player, narrowed down by the optional filters.

        Despite the name, this returns the matching ball-by-ball rows, not a number;
        callers aggregate them themselves (e.g. sum the 'batsman_runs' column).
        Reads the notebook-level df_ball, df_match and tournament_id_map.

        Args:
            player_id - (integer) id of the target player
            tournaments - (string) comma separated tournament codes eg: "IPL,BBL". Spaces around codes are ignored
            venue_id - (integer) id of venue. 
            past_n_years - (integer) value telling how many years in the past you want to consider. The current year is always included
            overs_range - (string) should be in the format 'a-b'. if you want to specifically know how the player is playing in a certain overs range. eg: "0-6"
            against_bowler - (integer) player id of a specific bowler
            against_spin - (boolean) mark it true if you want data only specific to spin. dont mark this if you supply 'against_bowler'. NOT IMPLEMENTED YET
            against_pace - (boolean) mark it true if you want data only specific to pace. dont mark this if you supply 'against_bowler'. NOT IMPLEMENTED YET
        Returns:
            (DataFrame) the rows of df_ball where 'batsman' is player_id and every supplied filter matches
        Raises:
            KeyError - if a tournament code is not a key of tournament_id_map
            ValueError / IndexError - if overs_range is not two integers separated by '-'
    """
    # Local import: only needed for the "filter not implemented" notice below
    import warnings

    # Grabbing all balls faced by this player
    required_balls = df_ball[df_ball["batsman"] == player_id]

    # Grabbing all matches to map ids
    required_matches = df_match

    # Run the required_matches dataframe through each match filter

    if tournaments is not None:
        # Strip each code so that "IPL, BBL" works as well as "IPL,BBL"; skip empty pieces
        tournaments_to_consider = [
            tournament_id_map[code.strip()]
            for code in tournaments.split(",")
            if code.strip()
        ]
        required_matches = required_matches[required_matches['tournament_id'].isin(tournaments_to_consider)]

    if venue_id is not None:
        required_matches = required_matches[required_matches['venue_id'] == venue_id]

    if past_n_years is not None:
        current_year = datetime.date.today().year
        # The years must be strings: str.join() raises TypeError on integers
        years_to_consider = [str(current_year - i) for i in range(past_n_years + 1)]
        # astype(str) keeps missing dates from producing a NaN mask (they simply do not match)
        match_dates = required_matches['match_date'].astype(str)
        required_matches = required_matches[match_dates.str.contains('|'.join(years_to_consider))]

    match_ids_to_consider = np.array(required_matches['match_id'])
    required_balls = required_balls[required_balls['match_id'].isin(match_ids_to_consider)]

    if overs_range is not None:
        overs_bounds = overs_range.split("-")
        start_over_to_consider = int(overs_bounds[0])
        end_over_to_consider = int(overs_bounds[1])
        # NOTE(review): the range is applied, inclusively, to the 'ball_number' column —
        # confirm that column is expressed in overs
        required_balls = required_balls[(required_balls['ball_number'] >= start_over_to_consider) & (required_balls['ball_number'] <= end_over_to_consider)]

    if against_bowler is not None:
        required_balls = required_balls[required_balls['bowler'] == against_bowler]

    # TODO: filter by bowling type. Until then, say so instead of silently
    # returning balls against every kind of bowling.
    if against_spin or against_pace:
        warnings.warn(
            "against_spin / against_pace filtering is not implemented yet; "
            "returning balls against all bowling types",
            stacklevel=2,
        )

    return required_balls

In [150]:
# Example query: KA Pollard at M Chinnaswamy Stadium, restricted to the "15-20" overs range
player_id = player_id_map["KA Pollard"]
venue_id = venue_id_map["M Chinnaswamy Stadium"]
overs_range = "15-20"
required_balls = total_runs(player_id, venue_id=venue_id, overs_range=overs_range)

# Aggregate once and reuse, instead of recomputing inside every f-string
balls_faced = len(required_balls)
runs_scored = required_balls['batsman_runs'].sum()
# Strike rate = runs per 100 balls; it is undefined (NaN) when no ball matched,
# so guard the division instead of dividing by zero
strike_rate = (runs_scored / balls_faced) * 100 if balls_faced else float("nan")

print(f"Overs range {overs_range}")
print(f"Total balls faced {balls_faced}")
print(f"Total runs {runs_scored}")
print(f"Strike Rate {strike_rate}")

Overs range 15-20
Total balls faced 59
Total runs 95
Strike Rate 161.01694915254237
