In [1]:
import os
import json
import pandas as pd

def process_match_file(file_path):
    with open(file_path) as f:
        data = json.load(f)
    
    # Extract match info
    match_info = {
    'match_id': data['info']['match_type_number'],
    'date': pd.to_datetime(data['info']['dates'][0]),
    'venue': data['info']['venue'],
    'city': data['info']['city'],
    'team1': data['info']['teams'][0],
    'team2': data['info']['teams'][1],
    'toss_winner': 0 if data['info']['toss']['winner'] == data['info']['teams'][0] else 1 ,
    'toss_decision': data['info']['toss']['decision'],
    'winner': 0 if data['info']['outcome']['winner'] == data['info']['teams'][0] else 1,
    }

    # Extract players per team
    team1 = match_info['team1']
    team2 = match_info['team2']
    players_team1 = data['info']['players'].get(team1, [])
    players_team2 = data['info']['players'].get(team2, [])

    # Add player names as separate columns
    for i in range(11):
        match_info[f'team1_player{i+1}'] = players_team1[i] if i < len(players_team1) else None
        match_info[f'team2_player{i+1}'] = players_team2[i] if i < len(players_team2) else None

    
    rows = []
    for inning in data['innings']:
        team = inning['team']
        target_info = inning.get('target', {})
        target_runs = target_info.get('runs', None)
        target_overs = target_info.get('overs', None)
        
        current_runs = 0
        wickets = 0
        balls_bowled = 0
        partnership_runs = 0
        for over in inning['overs']:
            over_num = over['over']
            ball_in_over = 0
            
            for delivery in over['deliveries']:
                ball_in_over += 1
                balls_bowled += 1
                current_runs += delivery['runs']['total']
                
                wicket = 0
                wicket_type = None
                player_out = None
                partnership_runs += delivery['runs']['batter']

                if 'wickets' in delivery:
                    wicket = 1
                    wickets += 1
                    wicket_type = delivery['wickets'][0]['kind']
                    player_out = delivery['wickets'][0]['player_out']
                    partnership_runs = 0
                row = {
                    'match_id': match_info['match_id'],
                    'inning': 1 if team == match_info['team1'] else 2,
                    'batting_team': team,
                    'bowling_team': match_info['team2'] if team == match_info['team1'] else match_info['team1'],
                    'over': over_num,
                    'ball_in_over': ball_in_over,
                    'batter': delivery['batter'],
                    'bowler': delivery['bowler'],
                    'runs_off_bat': delivery['runs']['batter'],
                    'extras': delivery['runs']['extras'],
                    'total_runs': delivery['runs']['total'],
                    'is_wicket': wicket,
                    'wicket_type': wicket_type,
                    'player_out': player_out,
                    'current_score': current_runs,
                    'partnership_runs': partnership_runs, 
                    'wickets_lost': wickets,
                    'balls_bowled': balls_bowled,
                    'run_rate': (current_runs / balls_bowled) * 6 if balls_bowled > 0 else 0,
                    'target_runs': target_runs,
                    'target_overs': target_overs,
                    'runs_remaining': target_runs - current_runs if target_runs else None,
                    'balls_remaining': (target_overs * 6 - balls_bowled) if target_overs else None,
                    'required_run_rate': (target_runs - current_runs) / ((target_overs * 6 - balls_bowled)/6) 
                                        if target_runs and target_overs and balls_bowled < target_overs * 6 else None,
                    'projected_score': ((current_runs / balls_bowled) * 6) * (target_overs - balls_bowled / 6) + current_runs
                        if balls_bowled > 0 and target_overs is not None else 0,
                    'pressure_index': (
                        "high" if (
                            target_runs is not None and target_overs is not None and balls_bowled < target_overs * 6 and (
                                (wickets <= 2 and ((target_runs - current_runs) / ((target_overs * 6 - balls_bowled) / 6) >= 8)) or
                                (3 <= wickets <= 5 and ((target_runs - current_runs) / ((target_overs * 6 - balls_bowled) / 6) >= 7.5)) or
                                (wickets >= 6 and ((target_runs - current_runs) / ((target_overs * 6 - balls_bowled) / 6) >= 5))
                            )
                        ) else
                        "medium" if (
                            target_runs is not None and target_overs is not None and balls_bowled < target_overs * 6 and (
                                (wickets <= 2 and 6 <= ((target_runs - current_runs) / ((target_overs * 6 - balls_bowled) / 6)) < 8) or
                                (3 <= wickets <= 5 and 5.5 <= ((target_runs - current_runs) / ((target_overs * 6 - balls_bowled) / 6)) < 7.5) or
                                (wickets >= 6 and ((target_runs - current_runs) / ((target_overs * 6 - balls_bowled) / 6)) < 5)
                            )
                        ) else
                        "low" if (
                            target_runs is not None and target_overs is not None and balls_bowled < target_overs * 6 and (
                                (wickets <= 2 and ((target_runs - current_runs) / ((target_overs * 6 - balls_bowled) / 6)) < 6) or
                                (3 <= wickets <= 5 and ((target_runs - current_runs) / ((target_overs * 6 - balls_bowled) / 6)) < 5.5)
                            )
                        ) else
                        None
                    ),
                    'winner': data['info']['outcome']['winner']
                }
                rows.append(row)

    return pd.DataFrame(rows), pd.DataFrame([match_info])


# Process all JSON files
all_ball_dfs = []
all_match_dfs = []

folder = "Data Files"
for filename in os.listdir(folder):
    if filename.endswith(".json"):
        filepath = os.path.join(folder, filename)
        try:
            ball_df, match_df = process_match_file(filepath)
            all_ball_dfs.append(ball_df)
            all_match_dfs.append(match_df)
        except Exception as e:
            print(f"Error processing {filename}: {e}")

# Combine all DataFrames
combined_ball_df = pd.concat(all_ball_dfs, ignore_index=True)
combined_match_df = pd.concat(all_match_dfs, ignore_index=True)


Error processing 1144497.json: 'winner'
Error processing 1144530.json: 'winner'
Error processing 247461.json: 'winner'
Error processing 433568.json: 'winner'
Error processing 433577.json: 'winner'
Error processing 433581.json: 'city'
Error processing 433583.json: 'city'
Error processing 433589.json: 'city'
Error processing 65249.json: 'winner'
Error processing 65272.json: 'winner'
Error processing 65273.json: 'winner'
Error processing 656401.json: 'city'
Error processing 656405.json: 'city'
Error processing 656423.json: 'city'
Error processing 656433.json: 'city'
Error processing 656435.json: 'city'
Error processing 656461.json: 'city'
Error processing 656463.json: 'city'
Error processing 656481.json: 'city'
Error processing 656483.json: 'city'
Error processing 656485.json: 'city'
Error processing 656487.json: 'city'
Error processing 656493.json: 'city'
Error processing 656495.json: 'city'


In [2]:
combined_match_df.to_csv("Match Dataset.csv")
combined_ball_df.to_csv("Ball_by_Ball Dataset.csv")