In [None]:
from google.colab import drive
drive.mount('/content/drive')

import os, json
import pandas as pd
from tqdm import tqdm

# Path to the folder
json_folder = "/content/drive/MyDrive/t20s_male_json"

all_matches = []

for fname in tqdm(os.listdir(json_folder)):
    if not fname.endswith(".json"):
        continue

    try:
        with open(os.path.join(json_folder, fname), 'r') as f:
            data = json.load(f)

        outcome = data['info'].get('outcome', {})
        winner = outcome.get('winner', 'NA')

        # Determine if win was defending or chasing
        if 'by' in outcome:
            if 'runs' in outcome['by']:
                won_by = 'defending'
                win_margin = f"by {outcome['by']['runs']} runs"
            elif 'wickets' in outcome['by']:
                won_by = 'chasing'
                win_margin = f"by {outcome['by']['wickets']} wickets"
            else:
                won_by = 'NA'
                win_margin = 'NA'
        else:
            won_by = 'NA'
            win_margin = 'NA'

        match_info = {
            'match_id': fname.replace('.json', ''),
            'team1': data['info']['teams'][0],
            'team2': data['info']['teams'][1],
            'venue': data['info'].get('venue', ''),
            'date': data['info']['dates'][0],
            'winner': winner,
            'won_by': won_by,
            'win_margin': win_margin
        }

        innings_stats = []

        for i, inning in enumerate(data['innings']):
            team = inning['team']
            overs_data = inning['overs']

            stats = {
                'team': team,
                'runs': 0,
                'fours': 0,
                'sixes': 0,
                'powerplay_runs': 0,
                'middle_overs_runs': 0,
                'death_overs_runs': 0,
                'powerplay_fours': 0,
                'middle_overs_fours': 0,
                'death_overs_fours': 0,
                'powerplay_sixes': 0,
                'middle_overs_sixes': 0,
                'death_overs_sixes': 0,
                'powerplay_dots': 0,
                'middle_overs_dots': 0,
                'death_overs_dots': 0,
                'powerplay_wkts': 0,
                'middle_overs_wkts': 0,
                'death_overs_wkts': 0,
                'total_wickets': 0,
                'top_scorer': '',
                'top_score': 0
            }

            batter_scores = {}

            for over_data in overs_data:
                over = over_data['over']
                deliveries = over_data['deliveries']

                for delivery in deliveries:
                    batter = delivery['batter']
                    shot_runs = delivery['runs']['batter']
                    total_runs = delivery['runs']['total']
                    stats['runs'] += total_runs
                    batter_scores[batter] = batter_scores.get(batter, 0) + shot_runs

                    # Phase classification
                    if over < 6:
                        phase = 'powerplay'
                    elif over < 15:
                        phase = 'middle_overs'
                    else:
                        phase = 'death_overs'

                    # Boundary counts
                    if shot_runs == 4:
                        stats['fours'] += 1
                        stats[f'{phase}_fours'] += 1
                    elif shot_runs == 6:
                        stats['sixes'] += 1
                        stats[f'{phase}_sixes'] += 1

                    # Runs per phase
                    stats[f'{phase}_runs'] += total_runs

                    # Dot balls
                    if total_runs == 0:
                        stats[f'{phase}_dots'] += 1

                    # Wickets
                    if 'wickets' in delivery:
                        stats[f'{phase}_wkts'] += len(delivery['wickets'])
                        stats['total_wickets'] += len(delivery['wickets'])

            # Top scorer
            if batter_scores:
                top_batter = max(batter_scores, key=batter_scores.get)
                stats['top_scorer'] = top_batter
                stats['top_score'] = batter_scores[top_batter]

            innings_stats.append(stats)

        # Combine innings with match info
        if len(innings_stats) == 2:
            combined = {**match_info}
            for idx, inns in enumerate(innings_stats, start=1):
                for key, val in inns.items():
                    combined[f'innings_{idx}_{key}'] = val
            all_matches.append(combined)

    except Exception as e:
        print(f"Skipped {fname}: {e}")

# Save to CSV
df = pd.DataFrame(all_matches)
df.to_csv("enriched_t20_matches.csv", index=False)
print(f"Saved enriched_t20_matches.csv with {len(df)} matches.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


100%|██████████| 2741/2741 [00:33<00:00, 83.04it/s] 


Saved enriched_t20_matches.csv with 2676 matches.


In [2]:
# Mount Drive
from google.colab import drive
drive.mount('/content/drive')

# Setup
import os, json
import pandas as pd
from tqdm import tqdm

# Change to your IPL JSON folder shortcut
json_folder = "/content/drive/MyDrive/ipl_male_json"

all_matches = []

for fname in tqdm(os.listdir(json_folder)):
    if not fname.endswith(".json"):
        continue

    try:
        with open(os.path.join(json_folder, fname), 'r') as f:
            data = json.load(f)

        # === Basic Info ===
        outcome = data['info'].get('outcome', {})
        winner = outcome.get('winner', 'NA')
        won_by = 'NA'; win_margin = 'NA'

        if 'by' in outcome:
            if 'runs' in outcome['by']:
                won_by = 'defending'
                win_margin = f"by {outcome['by']['runs']} runs"
            elif 'wickets' in outcome['by']:
                won_by = 'chasing'
                win_margin = f"by {outcome['by']['wickets']} wickets"

        match_info = {
            'match_id': fname.replace('.json', ''),
            'team1': data['info']['teams'][0],
            'team2': data['info']['teams'][1],
            'venue': data['info'].get('venue', ''),
            'date': data['info']['dates'][0],
            'winner': winner,
            'won_by': won_by,
            'win_margin': win_margin
        }

        innings_stats = []

        for inning in data['innings']:
            team = inning['team']
            overs_data = inning['overs']

            stats = {
                'team': team,
                'runs': 0,
                'fours': 0,
                'sixes': 0,
                'total_wickets': 0,
                'top_scorer': '',
                'top_score': 0,
                'powerplay_runs': 0,
                'middle_overs_runs': 0,
                'death_overs_runs': 0,
                'powerplay_wkts': 0,
                'middle_overs_wkts': 0,
                'death_overs_wkts': 0,
                'powerplay_dots': 0,
                'middle_overs_dots': 0,
                'death_overs_dots': 0,
                'overs_faced': 0
            }

            batter_scores = {}
            over_runs = [0]*20  # Init with 20 overs

            for over_block in overs_data:
                over_no = int(over_block['over'])  # 0-indexed
                deliveries = over_block['deliveries']

                # Sum runs for this over
                over_total = sum(d['runs']['total'] for d in deliveries)
                if over_no < 20:
                    over_runs[over_no] = over_total
                    stats['overs_faced'] += 1

                for delivery in deliveries:
                    batter = delivery['batter']
                    shot = delivery['runs']['batter']
                    total = delivery['runs']['total']
                    batter_scores[batter] = batter_scores.get(batter, 0) + shot
                    stats['runs'] += total

                    # Phase
                    if over_no < 6:
                        phase = 'powerplay'
                    elif over_no < 15:
                        phase = 'middle_overs'
                    else:
                        phase = 'death_overs'

                    # Fours and Sixes
                    if shot == 4:
                        stats['fours'] += 1
                    elif shot == 6:
                        stats['sixes'] += 1

                    # Phase stats
                    stats[f'{phase}_runs'] += total
                    if total == 0:
                        stats[f'{phase}_dots'] += 1
                    if 'wickets' in delivery:
                        wkts = len(delivery['wickets'])
                        stats[f'{phase}_wkts'] += wkts
                        stats['total_wickets'] += wkts

            # Top scorer
            if batter_scores:
                top = max(batter_scores, key=batter_scores.get)
                stats['top_scorer'] = top
                stats['top_score'] = batter_scores[top]

            # Add per-over runs
            for i in range(20):
                stats[f'over_{i+1}_runs'] = over_runs[i]

            innings_stats.append(stats)

        # === Combine innings & match info ===
        if len(innings_stats) == 2:
            combined = match_info.copy()
            for idx, inns in enumerate(innings_stats, start=1):
                prefix = f'innings_{idx}'
                combined[f'{prefix}_team'] = inns['team']
                combined[f'{prefix}_runs'] = inns['runs']
                combined[f'{prefix}_fours'] = inns['fours']
                combined[f'{prefix}_sixes'] = inns['sixes']
                combined[f'{prefix}_top_scorer'] = inns['top_scorer']
                combined[f'{prefix}_top_score'] = inns['top_score']
                combined[f'{prefix}_total_wickets'] = inns['total_wickets']
                combined[f'{prefix}_overs_faced'] = inns['overs_faced']

                for p in ['powerplay', 'middle_overs', 'death_overs']:
                    for stat in ['runs', 'wkts', 'dots']:
                        combined[f'{prefix}_{p}_{stat}'] = inns[f'{p}_{stat}']

                for i in range(20):
                    combined[f'{prefix}_over_{i+1}_runs'] = inns[f'over_{i+1}_runs']

            all_matches.append(combined)

    except Exception as e:
        print(f" Skipped {fname}: {e}")

# === Save final CSV ===
df = pd.DataFrame(all_matches)
df.to_csv("enriched_ipl_matches.csv", index=False)
print(f"Done! {len(df)} matches processed and saved as enriched_ipl_matches.csv")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


100%|██████████| 1170/1170 [00:32<00:00, 35.79it/s] 


Done! 1149 matches processed and saved as enriched_ipl_matches.csv
