In [7]:
import os
import requests
import zipfile
from io import BytesIO

# Cricsheet ZIP archive (JSON flavour) to download for each match format.
formats = {
    "ODI": "https://cricsheet.org/downloads/odis_json.zip",
    "T20": "https://cricsheet.org/downloads/t20s_json.zip",
    "Test": "https://cricsheet.org/downloads/tests_json.zip",
    "IPL": "https://cricsheet.org/downloads/ipl_json.zip"
}

# Root folder: receives <format>.zip plus an extracted <format>/ directory.
BASE_DIR = "cricsheet_data"
# (connect, read) timeouts in seconds. requests has no default timeout, so
# without this a stalled connection would block the cell forever.
REQUEST_TIMEOUT = (10, 120)
os.makedirs(BASE_DIR, exist_ok=True)

for fmt, url in formats.items():
    print(f"‚¨áÔ∏è Downloading {fmt} data from {url}")
    response = requests.get(url, timeout=REQUEST_TIMEOUT)

    if response.status_code == 200:
        # Keep a copy of the archive on disk next to the extracted files.
        zip_path = os.path.join(BASE_DIR, f"{fmt}.zip")
        with open(zip_path, "wb") as f:
            f.write(response.content)

        # Extract straight from the bytes already in memory.
        extract_dir = os.path.join(BASE_DIR, fmt)
        os.makedirs(extract_dir, exist_ok=True)
        with zipfile.ZipFile(BytesIO(response.content)) as z:
            z.extractall(extract_dir)

        print(f"‚úÖ Extracted {fmt} JSONs to {extract_dir}")
    else:
        # Non-200 responses are reported and the loop moves on to the next format.
        print(f"‚ùå Failed to download {fmt} data. Status: {response.status_code}")

‚¨áÔ∏è Downloading ODI data from https://cricsheet.org/downloads/odis_json.zip
‚úÖ Extracted ODI JSONs to cricsheet_data\ODI
‚¨áÔ∏è Downloading T20 data from https://cricsheet.org/downloads/t20s_json.zip
‚úÖ Extracted T20 JSONs to cricsheet_data\T20
‚¨áÔ∏è Downloading Test data from https://cricsheet.org/downloads/tests_json.zip
‚úÖ Extracted Test JSONs to cricsheet_data\Test
‚¨áÔ∏è Downloading IPL data from https://cricsheet.org/downloads/ipl_json.zip
‚úÖ Extracted IPL JSONs to cricsheet_data\IPL


In [None]:
# step2_parse_cricsheet.py
# Parse extracted Cricsheet JSONs

import os
import json
import pandas as pd

# Root folder produced by the download step; holds one sub-folder per format.
BASE_DIR = "cricsheet_data"

# Formats handled by this step, in processing order.
_MATCH_FORMATS = ("Test", "ODI", "T20", "IPL")

# Format name -> folder containing that format's extracted match JSONs.
INPUT_DIRS = {name: os.path.join(BASE_DIR, name) for name in _MATCH_FORMATS}

# Destination folder for the generated CSV files (created if absent).
OUTPUT_DIR = os.path.join(BASE_DIR, "processed")
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Format name -> accumulated delivery-level rows (one dict per delivery).
detailed_data = {name: [] for name in _MATCH_FORMATS}

def extract_first_wicket_player_out(delivery: dict) -> str:
    """Name the player dismissed by the first wicket recorded on a delivery.

    Returns the ``player_out`` value of the first entry in ``delivery["wickets"]``.
    The literal string ``'None'`` is returned when ``wickets`` is missing, is not
    a list, is empty, or its first entry has no ``player_out`` key.
    """
    dismissals = delivery.get("wickets")
    if not isinstance(dismissals, list) or not dismissals:
        return "None"
    return dismissals[0].get("player_out", "None")

def parse_json_file(filepath: str, target_match_type: str):
    """Parse one Cricsheet match JSON into delivery-level rows.

    Every delivery becomes one dict that combines the match-level fields
    (id, date, venue, toss, result, ...) with the ball-level fields (batting
    team, over, batter, bowler, runs, wicket). Rows are appended to the
    module-level ``detailed_data[target_match_type]`` list; nothing is returned.

    Args:
        filepath: Path to the match JSON. The file name without its extension
            is used as ``match_id``.
        target_match_type: Key of ``detailed_data`` that receives the rows.
            A key that does not exist yet is created.

    Files that cannot be read or decoded are reported on stdout and skipped.
    """
    try:
        # errors="ignore" drops undecodable bytes instead of rejecting the file.
        with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
            match_data = json.load(f)
    except json.JSONDecodeError as je:
        print(f" *** JSONDecodeError in {filepath}: {je}")
        return
    except Exception as er:
        print(f" error reading json {filepath}: {er}")
        return

    # `or {}` / `or []` also covers keys that are present but null.
    match_info = match_data.get("info") or {}
    teams = match_info.get("teams") or []
    toss = match_info.get("toss") or {}
    outcome = match_info.get("outcome") or {}

    # Stable match_id from the file name; match_date is the first listed date.
    match_id = os.path.splitext(os.path.basename(filepath))[0]
    dates = match_info.get("dates") or []
    match_date = str(dates[0]) if isinstance(dates, list) and dates else None

    # Matches without a winner carry the reason in outcome["result"]
    # ("draw", "tie", "no result"). Use it so ties and abandoned games are not
    # all reported as "draw"; "draw" stays the fallback when neither is given.
    winner = outcome.get("winner") or outcome.get("result", "draw")

    match_details = {
        "match_id": match_id,
        "match_date": match_date,
        "match_type": match_info.get("match_type"),
        "season": match_info.get("season"),
        "city": match_info.get("city"),
        "venue": match_info.get("venue"),
        "toss_winner": toss.get("winner"),
        "toss_decision": toss.get("decision"),
        "winner": winner,
        "player_of_match": ", ".join(match_info.get("player_of_match") or []),
        "teams": ", ".join(teams),
    }

    # Resolve the destination list once; unknown match types get a fresh list.
    rows = detailed_data.setdefault(target_match_type, [])

    for inning in match_data.get("innings") or []:
        team = inning.get("team", "Unknown")
        for over in inning.get("overs") or []:
            over_number = over.get("over")
            for delivery in over.get("deliveries") or []:
                runs = delivery.get("runs") or {}
                delivery_row = {
                    "team": team,
                    "over": over_number,
                    "batter": delivery.get("batter"),
                    "bowler": delivery.get("bowler"),
                    "non_striker": delivery.get("non_striker"),
                    "runs_batter": runs.get("batter", 0),
                    "runs_extras": runs.get("extras", 0),
                    "runs_total": runs.get("total", 0),
                    "wicket": extract_first_wicket_player_out(delivery),
                }
                # Match-level columns first, then the per-delivery columns.
                rows.append({**match_details, **delivery_row})

def process_format(match_type: str, input_dir: str):
    """Parse every ``.json`` file found under ``input_dir`` (recursively).

    Each matching file (extension compared case-insensitively) is passed to
    ``parse_json_file`` together with ``match_type``. Progress is reported on
    stdout. If ``input_dir`` is not a directory, a message is printed and
    nothing is parsed.
    """
    if os.path.isdir(input_dir):
        print(f"\nProcessing {match_type} JSONs from: {input_dir}")
        file_count = 0
        for folder, _subdirs, entries in os.walk(input_dir):
            json_names = [entry for entry in entries if entry.lower().endswith(".json")]
            for json_name in json_names:
                file_count += 1
                parse_json_file(os.path.join(folder, json_name), match_type)

        print(f"‚úÖ {match_type}: scanned {file_count} files, rows so far: {len(detailed_data[match_type])}")
    else:
        print(f"‚ùå Missing folder for {match_type}: {input_dir}")

# Parse every configured format folder into detailed_data.
for mt, in_dir in INPUT_DIRS.items():
    process_format(mt, in_dir)

# One DataFrame per format, built from the accumulated delivery rows.
test_df, odi_df, t20_df, ipl_df = (
    pd.DataFrame(detailed_data.get(key, []))
    for key in ("Test", "ODI", "T20", "IPL")
)

# Matching CSV destinations under the processed/ folder.
test_csv, odi_csv, t20_csv, ipl_csv = (
    os.path.join(OUTPUT_DIR, filename)
    for filename in ("test.csv", "ODI.csv", "T20.csv", "IPL.csv")
)

# Write each frame without the index column.
for frame, csv_path in (
    (test_df, test_csv),
    (odi_df, odi_csv),
    (t20_df, t20_csv),
    (ipl_df, ipl_csv),
):
    frame.to_csv(csv_path, index=False)

print("\nüìÅ Output CSVs written to:", OUTPUT_DIR)



Processing Test JSONs from: cricsheet_data\Test
‚úÖ Test: scanned 880 files, rows so far: 1701764

Processing ODI JSONs from: cricsheet_data\ODI
‚úÖ ODI: scanned 3019 files, rows so far: 1598807

Processing T20 JSONs from: cricsheet_data\T20
‚úÖ T20: scanned 4673 files, rows so far: 1058045

Processing IPL JSONs from: cricsheet_data\IPL
‚úÖ IPL: scanned 1169 files, rows so far: 278205

üìÅ Output CSVs written to: cricsheet_data\processed


In [9]:

# ---- Preview the first rows of every non-empty format DataFrame ----
def _print_head(title, frame):
    """Print ``title`` followed by ``frame.head()``; do nothing for an empty frame."""
    if frame.empty:
        return
    print(title)
    print(frame.head())


_print_head("\n Test DataFrame:", test_df)
_print_head("\n ODI DataFrame:", odi_df)
_print_head("\n T20 DataFrame:", t20_df)
_print_head("\n IPL DataFrame:", ipl_df)




 Test DataFrame:
  match_id  match_date match_type   season   city  \
0  1000851  2016-11-03       Test  2016/17  Perth   
1  1000851  2016-11-03       Test  2016/17  Perth   
2  1000851  2016-11-03       Test  2016/17  Perth   
3  1000851  2016-11-03       Test  2016/17  Perth   
4  1000851  2016-11-03       Test  2016/17  Perth   

                                          venue   toss_winner toss_decision  \
0  Western Australia Cricket Association Ground  South Africa           bat   
1  Western Australia Cricket Association Ground  South Africa           bat   
2  Western Australia Cricket Association Ground  South Africa           bat   
3  Western Australia Cricket Association Ground  South Africa           bat   
4  Western Australia Cricket Association Ground  South Africa           bat   

         winner player_of_match                    teams          team  over  \
0  South Africa        K Rabada  Australia, South Africa  South Africa     0   
1  South Africa        K Rab