In [2]:
import pandas as pd
import json

def parse_cricket_json(file_path):
    """
    Parses the T20 match deliveries from a JSON file, including wickets, wides, no-balls, legbyes,
    byes, fielder's name and id, kind of dismissal, and player out details.
    """
    with open(file_path) as file:
        data = json.load(file)

    innings = data['innings']
    player_registry = data['info']['registry']['people']
    game_id = file_path.split('/')[-1].split('.')[0]
    season = data['info']['season']

    deliveries_data = []

    for inning in innings:
        team_name = inning['team']
        for over in inning['overs']:
            over_number = over['over']
            for delivery in over['deliveries']:
                batter_id = player_registry.get(delivery['batter'], "Unknown")
                bowler_id = player_registry.get(delivery['bowler'], "Unknown")
                non_striker_id = player_registry.get(delivery['non_striker'], "Unknown")
                wides = delivery.get('extras', {}).get('wides', 0)
                noballs = delivery.get('extras', {}).get('noballs', 0)
                legbyes = delivery.get('extras', {}).get('legbyes', 0)
                byes = delivery.get('extras', {}).get('byes', 0)
                wicket_info = delivery.get('wickets')
                wicket = 1 if wicket_info else 0
                player_out = wicket_info[0]['player_out'] if wicket_info else ""
                player_out_id = player_registry.get(player_out, "Unknown") if player_out else ""
                fielders = [wicket_info[0]['fielders'][0]['name'] if wicket_info and 'fielders' in wicket_info[0] else ""]
                fielders_id = [player_registry.get(fielders[0], "Unknown") if fielders[0] else ""]
                kind = [wicket_info[0]['kind'] if wicket_info else ""]

                delivery_info = {
                    "game_id": game_id,
                    "season": season,
                    "team": team_name,
                    "over": over_number,
                    "batter": delivery['batter'],
                    "batter_id": batter_id,
                    "bowler": delivery['bowler'],
                    "bowler_id": bowler_id,
                    "non_striker": delivery['non_striker'],
                    "non_striker_id": non_striker_id,
                    "wides": wides,
                    "noballs": noballs,
                    "legbyes": legbyes,
                    "byes": byes,
                    "wicket": wicket,
                    "player_out": player_out,
                    "player_out_id": player_out_id,
                    "fielders_name": fielders[0],
                    "fielders_id": fielders_id[0],
                    "wicket_type": kind[0],
                    "runs_batter": delivery['runs']['batter'],
                    "runs_extras": delivery['runs']['extras'],
                    "runs_total": delivery['runs']['total']
                }
                deliveries_data.append(delivery_info)

    return pd.DataFrame(deliveries_data)

# Test the function with the given JSON file path
test_df = parse_cricket_json('data/int_T20/1346916.json') # Example File Path 
test_df.head()



Unnamed: 0,game_id,season,team,over,batter,batter_id,bowler,bowler_id,non_striker,non_striker_id,...,byes,wicket,player_out,player_out_id,fielders_name,fielders_id,wicket_type,runs_batter,runs_extras,runs_total
0,1346916,2022/23,Dubai Capitals,0,RV Uthappa,1c17e270,R Rampaul,d68e7f48,JE Root,a343262c,...,0,0,,,,,,0,0,0
1,1346916,2022/23,Dubai Capitals,0,RV Uthappa,1c17e270,R Rampaul,d68e7f48,JE Root,a343262c,...,0,0,,,,,,0,0,0
2,1346916,2022/23,Dubai Capitals,0,RV Uthappa,1c17e270,R Rampaul,d68e7f48,JE Root,a343262c,...,0,0,,,,,,1,0,1
3,1346916,2022/23,Dubai Capitals,0,JE Root,a343262c,R Rampaul,d68e7f48,RV Uthappa,1c17e270,...,0,0,,,,,,1,0,1
4,1346916,2022/23,Dubai Capitals,0,RV Uthappa,1c17e270,R Rampaul,d68e7f48,JE Root,a343262c,...,0,0,,,,,,0,0,0


In [3]:
test_df.to_csv("cricket_test.csv")

In [3]:
import os
import pandas as pd

def process_cricket_jsons(input_folder, output_folder):
    """
    Iterates through all JSON files in the input folder, tries to convert each to a DataFrame using parse_cricket_json,
    saves the DataFrame as a CSV in the output folder, prints the progress status, and skips files that cause issues.
    """
    # Ensure the output folder exists
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    # List all JSON files in the input folder
    json_files = [f for f in os.listdir(input_folder) if f.endswith(".json")]
    total_files = len(json_files)
    processed_files = 0
    
    for filename in json_files:
        try:
            file_path = os.path.join(input_folder, filename)
            # Use the provided parsing function
            df = parse_cricket_json(file_path)
            # Define the output file path
            output_file_path = os.path.join(output_folder, filename.replace(".json", ".csv"))
            # Save the DataFrame as a CSV
            df.to_csv(output_file_path, index=False)
            
            print(f"Processed and saved: {output_file_path}")
        except Exception as e:
            print(f"Skipping file {filename} due to an error: {e}")
        
        # Update the processed files count and print the progress after each file, including skipped ones
        processed_files += 1
        progress_percentage = (processed_files / total_files) * 100
        print(f"Progress: {progress_percentage:.2f}%")

# Example usage
input_folder = 'path/to/input_folder'  # Update this to your input folder path
output_folder = 'path/to/output_folder'  # Update this to your output folder path
process_cricket_jsons(input_folder, output_folder)



KeyError: 'kind'