In [None]:
# Make the shared helper importable: the directory two levels above the current
# working directory is appended to sys.path before spider2_utils is imported.
# load_csv_database is passed in as context, so this cell does not need to be
# included when the notebook is parsed.
import os
import sys

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir)))
from spider2_utils import load_csv_database


-setup-

In [None]:
import pandas as pd

# Load the "IPL" database through the shared helper.
# NOTE(review): rows_limit=-1 presumably means "no row limit" — confirm in spider2_utils.
_database = load_csv_database("IPL", rows_limit=-1)

# Bind each table to a module-level name matching its key in _database.
ball_by_ball, batsman_scored, match, player, player_match = (
    _database[table_name]
    for table_name in ("ball_by_ball", "batsman_scored", "match", "player", "player_match")
)

# Question
Please help me find the names of the top 5 players with the highest average runs per match in season 5, along with their batting averages.

# Step 1: Filter ball_by_ball for Season 5 Matches
Subset the ball_by_ball table to only include records from matches in season 5.

In [None]:
# IDs of every match whose season_id is 5.
season5_match_ids = match.loc[match['season_id'] == 5, 'match_id']

# Keep only the deliveries that belong to one of those matches.
ball_by_ball_season5 = ball_by_ball.loc[ball_by_ball['match_id'].isin(season5_match_ids)]

# Step 2: Join ball_by_ball_season5 with batsman_scored
Merge the filtered ball_by_ball with batsman_scored on match_id, over_id, ball_id, and innings_no.

In [None]:
# Inner join: a delivery is kept only if batsman_scored has a row with the
# same (match_id, over_id, ball_id, innings_no) key.
runs_scored = pd.merge(
    ball_by_ball_season5,
    batsman_scored,
    how='inner',
    on=['match_id', 'over_id', 'ball_id', 'innings_no'],
)

# Step 3: Select Relevant Columns
Keep only the striker (renamed to player_id), match_id, and runs_scored (renamed to runs) columns.

In [None]:
# Rename first, then select by the final column names. This cell overwrites
# runs_scored, so selecting 'striker'/'runs_scored' before renaming raised
# KeyError when the cell was executed a second time. rename() ignores labels
# that are no longer present, which makes this form safe to re-run.
# NOTE(review): assumes the merged frame has no pre-existing 'player_id' or
# 'runs' column — confirm against the table schemas.
runs_scored = (
    runs_scored
    .rename(columns={'striker': 'player_id', 'runs_scored': 'runs'})
    [['player_id', 'match_id', 'runs']]
)

# Step 4: Group by Player and Match to Calculate Total Runs
Aggregate the total runs scored by each player in each match.

In [None]:
# One row per (player_id, match_id) holding the sum of that player's runs in that match.
total_runs = (
    runs_scored
    .groupby(['player_id', 'match_id'])['runs']
    .sum()
    .rename('total_runs')
    .reset_index()
)

# Step 5: Group by Player to Calculate Batting Averages
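For each player, sum the per-match run totals and count the matches they batted in, then compute the batting average as total runs divided by that match count, rounded to three decimals.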

In [None]:
# Per player: total runs across matches, number of per-match rows, and their
# ratio rounded to 3 decimals. total_runs has one row per (player, match), so
# counting match_id counts that player's matches.
batting_averages = (
    total_runs
    .groupby('player_id')
    .agg(runs=('total_runs', 'sum'), num_matches=('match_id', 'count'))
    .reset_index()
    .assign(batting_avg=lambda per_player: per_player['runs'].div(per_player['num_matches']).round(3))
)

# Step 6: Select Top 5 Players by Batting Average
Sort by batting average descending and take the top 5 players.

In [None]:
# Rank on the exact runs/num_matches ratio rather than the rounded batting_avg,
# so rounding to 3 decimals cannot create artificial ties at the top-5 cutoff.
# Genuine ties are broken by ascending player_id so the selection is
# deterministic from run to run. The helper column is dropped again, leaving
# top5 with the same columns as batting_averages.
top5 = (
    batting_averages
    .assign(_exact_avg=batting_averages['runs'] / batting_averages['num_matches'])
    .sort_values(['_exact_avg', 'player_id'], ascending=[False, True])
    .head(5)
    .drop(columns='_exact_avg')
)

# Step 7: Join with Player Table to Get Player Names
Merge the top 5 batting averages with the player table to get player names.

In [None]:
# Attach player_name via player_id, keep only the two reported columns, and
# present the rows from highest to lowest batting average with a fresh index.
top5_with_names = (
    pd.merge(top5, player, on='player_id')
    .loc[:, ['player_name', 'batting_avg']]
    .sort_values(by='batting_avg', ascending=False)
    .reset_index(drop=True)
)
top5_with_names

In [None]:
# player_name	batting_avg
# CH Gayle	52.357
# KP Pietersen	38.125
# S Dhawan	37.933
# CL White	36.846
# SR Watson	36.429