# Data Wrangling

Create a function to read in the data and then manipulate it to include: 

- Which ball of the over it is
- Innings
- Cumulative runs scored in the innings up to and including that ball
- Whether it is a powerplay or not ([Powerplay](https://en.wikipedia.org/wiki/Powerplay_(cricket)) means fielding restrictions, so it is harder to get a wicket through catches)

In [1]:
import json
import os

import pandas as pd

In [2]:
def _first_dismissal(delivery, player_registry):
    """
    Build the wicket-related columns for one delivery.

    Only the first entry of the delivery's 'wickets' list is used. A missing
    fielder (no 'fielders' key, an empty list, or an entry without a 'name')
    yields empty strings instead of raising.
    """
    wickets = delivery.get('wickets')
    if not wickets:
        return {
            "wicket": 0,
            "player_out": "",
            "player_out_id": "",
            "fielders_name": "",
            "fielders_id": "",
            "wicket_type": "",
        }

    first = wickets[0]
    player_out = first['player_out']

    # Fall back to a single empty entry so the lookup below never indexes
    # into an empty list.
    fielders = first.get('fielders') or [{}]
    fielder_name = fielders[0].get('name', "")

    return {
        "wicket": 1,
        "player_out": player_out,
        "player_out_id": player_registry.get(player_out, "Unknown") if player_out else "",
        "fielders_name": fielder_name,
        "fielders_id": player_registry.get(fielder_name, "Unknown") if fielder_name else "",
        "wicket_type": first['kind'],
    }


def parse_cricket_json(file_path):
    """
    Parse the deliveries of one match JSON file into a DataFrame.

    Each row is one delivery and carries the batting team, over number,
    batter / bowler / non-striker names and registry ids, extras (wides,
    no-balls, leg-byes, byes), details of the first dismissal on the ball
    (player out, first fielder, kind) and the runs scored.

    Parameters
    ----------
    file_path : str or os.PathLike
        Path to the match JSON file. The file name up to its first dot is
        used as ``game_id`` (e.g. ``'data/211028.json'`` -> ``'211028'``).

    Returns
    -------
    pandas.DataFrame
        One row per delivery, in file order. Names that are absent from the
        registry get the id ``"Unknown"``; wicket columns are empty strings
        (and ``wicket`` is 0) when no wicket fell.

    Raises
    ------
    KeyError
        If a required key ('info', 'innings', 'team', 'over', 'deliveries',
        'batter', 'bowler', 'non_striker', 'runs', ...) is missing.
    """
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(file_path, encoding="utf-8") as file:
        data = json.load(file)

    player_registry = data['info']['registry']['people']
    # basename() understands the platform's separators and fspath() lets
    # callers pass pathlib.Path objects as well as plain strings.
    game_id = os.path.basename(os.fspath(file_path)).split('.')[0]
    season = data['info']['season']

    deliveries_data = []

    for inning in data['innings']:
        team_name = inning['team']
        # Tolerate an innings entry that has no 'overs' key (it simply
        # contributes no rows) rather than aborting the whole file.
        for over in inning.get('overs', []):
            over_number = over['over']
            for delivery in over['deliveries']:
                batter = delivery['batter']
                bowler = delivery['bowler']
                non_striker = delivery['non_striker']
                extras = delivery.get('extras', {})
                runs = delivery['runs']

                deliveries_data.append({
                    "game_id": game_id,
                    "season": season,
                    "team": team_name,
                    "over": over_number,
                    "batter": batter,
                    "batter_id": player_registry.get(batter, "Unknown"),
                    "bowler": bowler,
                    "bowler_id": player_registry.get(bowler, "Unknown"),
                    "non_striker": non_striker,
                    "non_striker_id": player_registry.get(non_striker, "Unknown"),
                    "wides": extras.get('wides', 0),
                    "noballs": extras.get('noballs', 0),
                    "legbyes": extras.get('legbyes', 0),
                    "byes": extras.get('byes', 0),
                    # wicket, player_out, player_out_id, fielders_name,
                    # fielders_id, wicket_type -- kept in this position so the
                    # column order is unchanged.
                    **_first_dismissal(delivery, player_registry),
                    "runs_batter": runs['batter'],
                    "runs_extras": runs['extras'],
                    "runs_total": runs['total'],
                })

    return pd.DataFrame(deliveries_data)

# Smoke-test the parser on a single example match file and preview the
# first rows of the resulting DataFrame.
test_df = parse_cricket_json('data/211028.json') # example file path
test_df.head()

Unnamed: 0,game_id,season,team,over,batter,batter_id,bowler,bowler_id,non_striker,non_striker_id,...,byes,wicket,player_out,player_out_id,fielders_name,fielders_id,wicket_type,runs_batter,runs_extras,runs_total
0,211028,2005,England,0,ME Trescothick,ea42ddb9,B Lee,dd09ff8e,GO Jones,2e929b99,...,0,0,,,,,,0,0,0
1,211028,2005,England,0,ME Trescothick,ea42ddb9,B Lee,dd09ff8e,GO Jones,2e929b99,...,0,0,,,,,,1,0,1
2,211028,2005,England,0,GO Jones,2e929b99,B Lee,dd09ff8e,ME Trescothick,ea42ddb9,...,0,0,,,,,,0,0,0
3,211028,2005,England,0,GO Jones,2e929b99,B Lee,dd09ff8e,ME Trescothick,ea42ddb9,...,0,0,,,,,,0,0,0
4,211028,2005,England,0,GO Jones,2e929b99,B Lee,dd09ff8e,ME Trescothick,ea42ddb9,...,0,0,,,,,,0,0,0


In [3]:
def add_columns(df):
    """
    Add derived per-ball columns to a deliveries DataFrame.

    The frame must already have 'team', 'over' and 'runs_total' columns.
    The new columns are assigned onto ``df`` itself, which is also returned:

    - team_over: "<team>_<over>" key identifying one over of one team
    - over_ball: 1-based position of the row within its team_over group
    - inning: 1 for the first team that appears in the frame, 2 otherwise
    - runs_cumulative: running sum of runs_total within each inning,
      including the current row
    - powerplay: 1 when ``over`` is 5 or less, else 0
    """
    batting_team = df['team']
    over_number = df['over']

    # one key per (team, over) pair so balls can be counted inside each over
    df['team_over'] = batting_team + "_" + over_number.astype('str')

    # cumcount() is zero-based; shift so the first ball of an over is ball 1
    zero_based_ball = df.groupby('team_over').cumcount()
    df['over_ball'] = zero_based_ball + 1

    # teams in order of first appearance; the first one is treated as inning 1
    teams = batting_team.unique()
    inning_numbers = []
    for team in batting_team:
        if team == teams[0]:
            inning_numbers.append(1)
        else:
            inning_numbers.append(2)
    df['inning'] = inning_numbers

    # running total of all runs (bat + extras) within each inning
    runs_by_inning = df.groupby('inning')['runs_total']
    df['runs_cumulative'] = runs_by_inning.cumsum()

    # flag overs numbered 0 through 5 as powerplay overs
    powerplay_flags = []
    for over in over_number:
        if over <= 5:
            powerplay_flags.append(1)
        else:
            powerplay_flags.append(0)
    df['powerplay'] = powerplay_flags

    return df


In [4]:
# Add the derived per-ball columns to the parsed example match and preview
# the result. add_columns assigns the new columns onto the frame it is given
# and returns that same frame.
df_final = add_columns(test_df)
df_final.head()

Unnamed: 0,game_id,season,team,over,batter,batter_id,bowler,bowler_id,non_striker,non_striker_id,...,fielders_id,wicket_type,runs_batter,runs_extras,runs_total,team_over,over_ball,inning,runs_cumulative,powerplay
0,211028,2005,England,0,ME Trescothick,ea42ddb9,B Lee,dd09ff8e,GO Jones,2e929b99,...,,,0,0,0,England_0,1,1,0,1
1,211028,2005,England,0,ME Trescothick,ea42ddb9,B Lee,dd09ff8e,GO Jones,2e929b99,...,,,1,0,1,England_0,2,1,1,1
2,211028,2005,England,0,GO Jones,2e929b99,B Lee,dd09ff8e,ME Trescothick,ea42ddb9,...,,,0,0,0,England_0,3,1,1,1
3,211028,2005,England,0,GO Jones,2e929b99,B Lee,dd09ff8e,ME Trescothick,ea42ddb9,...,,,0,0,0,England_0,4,1,1,1
4,211028,2005,England,0,GO Jones,2e929b99,B Lee,dd09ff8e,ME Trescothick,ea42ddb9,...,,,0,0,0,England_0,5,1,1,1
