In [17]:
#importing necessary libraries
import os
import json
import pandas as pd
import numpy as np 

In [18]:
#Reusable Cleanup Function
def clean_and_select_dataframe(df, desired_columns=None, df_name=""):
    
    # Convert column names to lowercase first
    df.columns = df.columns.str.lower()

    # Apply specific column selection if desired_columns are provided
    if desired_columns:
        actual_desired_cols = [col for col in desired_columns if col in df.columns]
        if len(actual_desired_cols) < len(desired_columns):
            missing_cols = set(desired_columns) - set(actual_desired_cols)
            print(f"Warning: For {df_name}, some desired columns were not found: {missing_cols}")
        
        df = df.loc[:, actual_desired_cols].copy()
    
    # Strip whitespace from all string columns
    for col in df.select_dtypes(include='object').columns:
        
        df.loc[:, col] = df[col].astype(str).str.strip()

    # Replace common empty/placeholder strings with NaN for robust empty column detection
    
    df.replace(["", "NA", "-", "N/A", "NULL", "None"], np.nan, inplace=True) 

    # Remove 'Unnamed:' columns
    unnamed_cols = [col for col in df.columns if 'unnamed:' in str(col).lower()]
    if unnamed_cols:
        print(f"Dropping 'Unnamed:' columns from {df_name}: {unnamed_cols}")
        df.drop(columns=unnamed_cols, inplace=True)

    # Remove columns that are entirely empty (all NaN values)
    empty_cols = [col for col in df.columns if df[col].isnull().all()]
    if empty_cols:
        print(f"Dropping entirely empty columns from {df_name}: {empty_cols}")
        df.drop(columns=empty_cols, inplace=True)

    # Drop rows where all values are NaN 
    df.dropna(how="all", inplace=True)
    
    # Perform numeric conversion BEFORE filling remaining NaNs with "NA" string
    num_cols = df.select_dtypes(include=["int64", "float64"]).columns
    df.loc[:, num_cols] = df[num_cols].apply(lambda col: pd.to_numeric(col, errors="coerce").fillna(0).astype(int))

    # After numeric conversions, fill any remaining NaNs with NA
    df.fillna("NA", inplace=True) 
    
    return df



In [19]:
# Column whitelists handed to clean_and_select_dataframe, one per output table.

# Match-summary columns for ODI matches
ODI_SUMMARY_DESIRED_COLUMNS = [
    "match_id",
    "match_format",
    "date",
    "city",
    "venue",
    "season",
    "gender",
    "team_1",
    "team_2",
    "toss_winner",
    "toss_decision",
    "winner",
    "win_by_runs",
    "win_by_wickets",
    "overs",
    "balls_per_over",
    "player_of_match",
]

# T20 and IPL summaries: the ODI columns followed by chase-target and powerplay fields
T20_IPL_SUMMARY_DESIRED_COLUMNS = [
    *ODI_SUMMARY_DESIRED_COLUMNS,
    "target_runs",
    "target_overs",
    "powerplays",
]

# Test summaries: a start/end date pair instead of a single date, and no "overs" column
TEST_SUMMARY_DESIRED_COLUMNS = [
    "match_id",
    "match_format",
    "start_date",
    "end_date",
    "city",
    "venue",
    "gender",
    "season",
    "team_1",
    "team_2",
    "toss_winner",
    "toss_decision",
    "winner",
    "win_by_runs",
    "win_by_wickets",
    "balls_per_over",
    "player_of_match",
]

# Ball-by-ball columns shared by every format
COMMON_BALL_BY_BALL_DESIRED_COLS = [
    "match_id",
    "inning_team",
    "over_number",
    "batter",
    "non_striker",
    "bowler",
    "legbyes",
    "wides",
    "no_balls",
    "byes",
    "runs_batter",
    "runs_extras",
    "runs_total",
    "wicket_kind",
    "player_out",
]

# Player registry columns
PLAYER_REGISTRY_DESIRED_COLS = ["match_id", "player_name", "player_id"]



In [20]:
#ODI Data Processing Section
# Reads every ODI match JSON in `odi_folder` and builds three cleaned tables:
# match summary, ball-by-ball deliveries and the per-match player registry.

# Path to your ODI JSON files
odi_folder = r"D:/vscode/cricket data analysis/Extracted_jsons/odis_json"

# Lists to collect data
odi_summary_list = []
odi_ball_by_ball_list = []
odi_player_registry_list = []

# Loop through all ODI JSON files and extract data.
# os.listdir() returns names in arbitrary order; sorting keeps the output
# row order the same from run to run.
for filename in sorted(os.listdir(odi_folder)):
    if not filename.endswith(".json"):
        continue
    file_path = os.path.join(odi_folder, filename)

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        info = data.get("info", {})
        toss = info.get("toss", {})
        outcome = info.get("outcome", {})
        margin = outcome.get("by", {})

        # Match Summary Data Extraction (the file name doubles as the match id)
        match_summary = {
            "match_id": filename,
            "match_format": info.get("match_type", "ODI"),
            "date": info.get("dates", ["NA"])[0],
            "city": info.get("city") or "NA",
            "venue": info.get("venue") or "NA",
            "season": info.get("season") or "NA",
            "gender": info.get("gender") or "NA",
            "team_1": info.get("teams", ["NA", "NA"])[0],
            "team_2": info.get("teams", ["NA", "NA"])[1],
            "toss_winner": toss.get("winner") or "NA",
            "toss_decision": toss.get("decision") or "NA",
            "winner": outcome.get("winner") or "No Result",
            "win_by_runs": margin.get("runs", 0),
            "win_by_wickets": margin.get("wickets", 0),
            "overs": info.get("overs", 0),
            "balls_per_over": info.get("balls_per_over", 6),
            "player_of_match": info.get("player_of_match", ["NA"])[0]
        }

        # Rows for this file are staged locally and only committed once the
        # whole file has parsed, so a failure midway cannot leave a summary
        # row or a partial set of deliveries behind.
        match_balls = []
        match_players = []

        # Ball-by-ball Data Extraction
        for inning in data.get("innings", []):
            batting_team = inning.get("team") or "NA"
            for over_info in inning.get("overs", []):
                over_number = over_info.get("over", -1)
                for delivery in over_info.get("deliveries", []):
                    extras_detail = delivery.get("extras", {})
                    runs = delivery.get("runs", {})

                    # Only the first dismissal recorded on a delivery is kept.
                    if "wickets" in delivery:
                        wicket_type = delivery["wickets"][0].get("kind") or "NA"
                        player_out = delivery["wickets"][0].get("player_out") or "NA"
                    else:
                        wicket_type = None
                        player_out = None

                    match_balls.append({
                        "match_id": filename,
                        "inning_team": batting_team,
                        "over_number": over_number,
                        "batter": delivery.get("batter") or "NA",
                        "non_striker": delivery.get("non_striker") or "NA",
                        "bowler": delivery.get("bowler") or "NA",
                        "legbyes": extras_detail.get("legbyes", 0),
                        "wides": extras_detail.get("wides", 0),
                        "no_balls": extras_detail.get("noballs", 0),
                        "byes": extras_detail.get("byes", 0),
                        "runs_batter": runs.get("batter", 0),
                        "runs_extras": runs.get("extras", 0),
                        "runs_total": runs.get("total", 0),
                        "wicket_kind": wicket_type,
                        "player_out": player_out
                    })

        # Player Registry Data Extraction
        registry = info.get("registry", {}).get("people", {})
        for player_name, player_id in registry.items():
            match_players.append({
                "match_id": filename,
                "player_name": player_name or "NA",
                "player_id": player_id or "NA"
            })

        # Commit the staged rows for this file.
        odi_summary_list.append(match_summary)
        odi_ball_by_ball_list.extend(match_balls)
        odi_player_registry_list.extend(match_players)

    except Exception as e:
        # Best effort: report the file and carry on with the rest.
        print(f"Error in ODI file {filename}: {e}")

#Convert ODI data to DataFrames
df_odi_summary = pd.DataFrame(odi_summary_list)
df_odi_balls = pd.DataFrame(odi_ball_by_ball_list)
df_odi_registry = pd.DataFrame(odi_player_registry_list)

# Apply Cleanup to ODI DataFrames
df_odi_summary = clean_and_select_dataframe(df_odi_summary, ODI_SUMMARY_DESIRED_COLUMNS, "df_odi_summary")
df_odi_balls = clean_and_select_dataframe(df_odi_balls, COMMON_BALL_BY_BALL_DESIRED_COLS, "df_odi_balls")
df_odi_registry = clean_and_select_dataframe(df_odi_registry, PLAYER_REGISTRY_DESIRED_COLS, "df_odi_registry")

#Convert 'date' column to datetime (specific to df_odi_summary)
# "NA" placeholders become NaN so they parse as missing; anything that does not
# parse as a date ends up as the string "NA" again after the final fill.
if "date" in df_odi_summary.columns:
    df_odi_summary.loc[:, "date"] = df_odi_summary["date"].replace("NA", np.nan)
    df_odi_summary.loc[:, "date"] = pd.to_datetime(df_odi_summary["date"], errors="coerce")
    df_odi_summary.loc[:, "date"] = df_odi_summary["date"].fillna("NA")


In [21]:
#T20 Data Processing Section
# Reads every T20 match JSON in `t20_folder` and builds three cleaned tables:
# match summary (with chase target and powerplays), ball-by-ball deliveries
# and the per-match player registry.

#Path to your T20 JSON files
t20_folder = r"D:/vscode/cricket data analysis/Extracted_jsons/t20s_json"

#Lists to collect data
t20_summary_list = []
t20_ball_by_ball_list = []
t20_player_registry_list = []

# Loop through all T20 JSON files and extract data.
# os.listdir() returns names in arbitrary order; sorting keeps the output
# row order the same from run to run.
for filename in sorted(os.listdir(t20_folder)):
    if not filename.endswith(".json"):
        continue
    file_path = os.path.join(t20_folder, filename)

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        info = data.get("info", {})
        toss = info.get("toss", {})
        outcome = info.get("outcome", {})
        margin = outcome.get("by", {})

        # Initialize target and powerplay for summary, default to None
        summary_target_runs = None
        summary_target_overs = None
        summary_powerplays = None

        # Powerplays are taken from the first innings (stored as a JSON string
        # so the list fits in one CSV cell); the target from the second innings.
        for idx, inning in enumerate(data.get("innings", [])):
            if idx == 0:
                summary_powerplays = json.dumps(inning.get("powerplays", []))
            elif idx == 1:
                target_info = inning.get("target", {})
                summary_target_runs = target_info.get("runs", None)
                summary_target_overs = target_info.get("overs", None)

        # Match Summary Data Extraction (the file name doubles as the match id)
        t20_match_summary = {
            "match_id": filename,
            "match_format": info.get("match_type") or "T20",
            "date": info.get("dates", ["NA"])[0],
            "city": info.get("city") or "NA",
            "venue": info.get("venue") or "NA",
            "season": info.get("season") or "NA",
            "gender": info.get("gender") or "NA",
            "team_1": info.get("teams", ["NA", "NA"])[0],
            "team_2": info.get("teams", ["NA", "NA"])[1],
            "toss_winner": toss.get("winner") or "NA",
            "toss_decision": toss.get("decision") or "NA",
            "winner": outcome.get("winner") or "No Result",
            "win_by_runs": margin.get("runs", 0),
            "win_by_wickets": margin.get("wickets", 0),
            "overs": info.get("overs", 0),
            "balls_per_over": info.get("balls_per_over", 6),
            "player_of_match": info.get("player_of_match", ["NA"])[0],
            "target_runs": summary_target_runs,
            "target_overs": summary_target_overs,
            "powerplays": summary_powerplays
        }

        # Rows for this file are staged locally and only committed once the
        # whole file has parsed, so a failure midway cannot leave a summary
        # row or a partial set of deliveries behind.
        match_balls = []
        match_players = []

        # Ball-by-ball Data Extraction
        for inning in data.get("innings", []):
            batting_team = inning.get("team") or "NA"
            for over_info in inning.get("overs", []):
                over_number = over_info.get("over", -1)
                for delivery in over_info.get("deliveries", []):
                    extras_detail = delivery.get("extras", {})
                    runs = delivery.get("runs", {})

                    # Only the first dismissal recorded on a delivery is kept.
                    if "wickets" in delivery:
                        wicket_type = delivery["wickets"][0].get("kind") or "NA"
                        player_out = delivery["wickets"][0].get("player_out") or "NA"
                    else:
                        wicket_type = None
                        player_out = None

                    match_balls.append({
                        "match_id": filename,
                        "inning_team": batting_team,
                        "over_number": over_number,
                        "batter": delivery.get("batter") or "NA",
                        "non_striker": delivery.get("non_striker") or "NA",
                        "bowler": delivery.get("bowler") or "NA",
                        "legbyes": extras_detail.get("legbyes", 0),
                        "wides": extras_detail.get("wides", 0),
                        "no_balls": extras_detail.get("noballs", 0),
                        "byes": extras_detail.get("byes", 0),
                        "runs_batter": runs.get("batter", 0),
                        "runs_extras": runs.get("extras", 0),
                        "runs_total": runs.get("total", 0),
                        "wicket_kind": wicket_type,
                        "player_out": player_out
                    })

        # Player Registry Data Extraction
        registry = info.get("registry", {}).get("people", {})
        for player_name, player_id in registry.items():
            match_players.append({
                "match_id": filename,
                "player_name": player_name or "NA",
                "player_id": player_id or "NA"
            })

        # Commit the staged rows for this file.
        t20_summary_list.append(t20_match_summary)
        t20_ball_by_ball_list.extend(match_balls)
        t20_player_registry_list.extend(match_players)

    except Exception as e:
        # Best effort: report the file and carry on with the rest.
        print(f"Error in T20 file {filename}: {e}")

#Convert T20 data to DataFrames
df_t20_summary = pd.DataFrame(t20_summary_list)
df_t20_balls = pd.DataFrame(t20_ball_by_ball_list)
df_t20_registry = pd.DataFrame(t20_player_registry_list)

#Apply Cleanup to T20 DataFrames
df_t20_summary = clean_and_select_dataframe(df_t20_summary, T20_IPL_SUMMARY_DESIRED_COLUMNS, "df_t20_summary")
df_t20_balls = clean_and_select_dataframe(df_t20_balls, COMMON_BALL_BY_BALL_DESIRED_COLS, "df_t20_balls")
df_t20_registry = clean_and_select_dataframe(df_t20_registry, PLAYER_REGISTRY_DESIRED_COLS, "df_t20_registry")

# Convert 'date' column to datetime (specific to df_t20_summary)
# "NA" placeholders become NaN so they parse as missing; anything that does not
# parse as a date ends up as the string "NA" again after the final fill.
if "date" in df_t20_summary.columns:
    df_t20_summary.loc[:, "date"] = df_t20_summary["date"].replace("NA", np.nan)
    df_t20_summary.loc[:, "date"] = pd.to_datetime(df_t20_summary["date"], errors="coerce")
    df_t20_summary.loc[:, "date"] = df_t20_summary["date"].fillna("NA")


In [22]:
#IPL Data Processing Section
# Reads every IPL match JSON in `ipl_folder` and builds three cleaned tables:
# match summary (with chase target and powerplays), ball-by-ball deliveries
# and the per-match player registry.

#Path to IPL JSON files
ipl_folder = r"D:/vscode/cricket data analysis/Extracted_jsons/ipl_json"

#Lists to collect data
ipl_summary_list = []
ipl_ball_by_ball_list = []
ipl_player_registry_list = []

# Loop through IPL JSON files and extract data.
# os.listdir() returns names in arbitrary order; sorting keeps the output
# row order the same from run to run.
for filename in sorted(os.listdir(ipl_folder)):
    if not filename.endswith(".json"):
        continue
    file_path = os.path.join(ipl_folder, filename)

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        info = data.get("info", {})
        toss = info.get("toss", {})
        outcome = info.get("outcome", {})
        margin = outcome.get("by", {})

        # Initialize target and powerplay for summary, default to None
        summary_target_runs = None
        summary_target_overs = None
        summary_powerplays = None

        # Powerplays are taken from the first innings (stored as a JSON string
        # so the list fits in one CSV cell); the target from the second innings.
        for idx, inning in enumerate(data.get("innings", [])):
            if idx == 0:
                summary_powerplays = json.dumps(inning.get("powerplays", []))
            elif idx == 1:
                target_info = inning.get("target", {})
                summary_target_runs = target_info.get("runs", None)
                summary_target_overs = target_info.get("overs", None)

        # Match Summary Data Extraction (the file name doubles as the match id)
        ipl_match_summary = {
            "match_id": filename,
            "match_format": info.get("match_type") or "IPL",
            "date": info.get("dates", ["NA"])[0],
            "city": info.get("city") or "NA",
            "venue": info.get("venue") or "NA",
            "season": info.get("season") or "NA",
            "gender": info.get("gender") or "NA",
            "team_1": info.get("teams", ["NA", "NA"])[0],
            "team_2": info.get("teams", ["NA", "NA"])[1],
            "toss_winner": toss.get("winner") or "NA",
            "toss_decision": toss.get("decision") or "NA",
            "winner": outcome.get("winner") or "No Result",
            "win_by_runs": margin.get("runs", 0),
            "win_by_wickets": margin.get("wickets", 0),
            "overs": info.get("overs", 0),
            "balls_per_over": info.get("balls_per_over", 6),
            "player_of_match": info.get("player_of_match", ["NA"])[0],
            "target_runs": summary_target_runs,
            "target_overs": summary_target_overs,
            "powerplays": summary_powerplays
        }

        # Rows for this file are staged locally and only committed once the
        # whole file has parsed, so a failure midway cannot leave a summary
        # row or a partial set of deliveries behind.
        match_balls = []
        match_players = []

        # Ball-by-ball Data Extraction
        for inning in data.get("innings", []):
            batting_team = inning.get("team") or "NA"
            for over_info in inning.get("overs", []):
                over_number = over_info.get("over", -1)
                for delivery in over_info.get("deliveries", []):
                    extras_detail = delivery.get("extras", {})
                    runs = delivery.get("runs", {})

                    # Only the first dismissal recorded on a delivery is kept.
                    if "wickets" in delivery:
                        wicket_type = delivery["wickets"][0].get("kind") or "NA"
                        player_out = delivery["wickets"][0].get("player_out") or "NA"
                    else:
                        wicket_type = None
                        player_out = None

                    match_balls.append({
                        "match_id": filename,
                        "inning_team": batting_team,
                        "over_number": over_number,
                        "batter": delivery.get("batter") or "NA",
                        "non_striker": delivery.get("non_striker") or "NA",
                        "bowler": delivery.get("bowler") or "NA",
                        "legbyes": extras_detail.get("legbyes", 0),
                        "wides": extras_detail.get("wides", 0),
                        "no_balls": extras_detail.get("noballs", 0),
                        "byes": extras_detail.get("byes", 0),
                        "runs_batter": runs.get("batter", 0),
                        "runs_extras": runs.get("extras", 0),
                        "runs_total": runs.get("total", 0),
                        "wicket_kind": wicket_type,
                        "player_out": player_out
                    })

        # Player Registry Data Extraction
        registry = info.get("registry", {}).get("people", {})
        for player_name, player_id in registry.items():
            match_players.append({
                "match_id": filename,
                "player_name": player_name or "NA",
                "player_id": player_id or "NA"
            })

        # Commit the staged rows for this file.
        ipl_summary_list.append(ipl_match_summary)
        ipl_ball_by_ball_list.extend(match_balls)
        ipl_player_registry_list.extend(match_players)

    except Exception as e:
        # Best effort: report the file and carry on with the rest.
        print(f"Error in IPL file {filename}: {e}")

#Convert IPL data to DataFrames
df_ipl_summary = pd.DataFrame(ipl_summary_list)
df_ipl_balls = pd.DataFrame(ipl_ball_by_ball_list)
df_ipl_registry = pd.DataFrame(ipl_player_registry_list)

#Apply Cleanup to IPL DataFrames
df_ipl_summary = clean_and_select_dataframe(df_ipl_summary, T20_IPL_SUMMARY_DESIRED_COLUMNS, "df_ipl_summary")
df_ipl_balls = clean_and_select_dataframe(df_ipl_balls, COMMON_BALL_BY_BALL_DESIRED_COLS, "df_ipl_balls")
df_ipl_registry = clean_and_select_dataframe(df_ipl_registry, PLAYER_REGISTRY_DESIRED_COLS, "df_ipl_registry")

#Convert 'date' column to datetime (specific to df_ipl_summary)
# "NA" placeholders become NaN so they parse as missing; anything that does not
# parse as a date ends up as the string "NA" again after the final fill.
if "date" in df_ipl_summary.columns:
    df_ipl_summary.loc[:, "date"] = df_ipl_summary["date"].replace("NA", np.nan)
    df_ipl_summary.loc[:, "date"] = pd.to_datetime(df_ipl_summary["date"], errors="coerce")
    df_ipl_summary.loc[:, "date"] = df_ipl_summary["date"].fillna("NA")


In [23]:
#Test Data Processing Section
# Reads every Test match JSON in `test_folder` and builds three cleaned tables:
# match summary (with start/end dates), ball-by-ball deliveries and the
# per-match player registry.

#Path to Test JSON files
test_folder = r"D:/vscode/cricket data analysis/Extracted_jsons/tests_json"

# Lists to collect data
test_summary_list = []
test_ball_by_ball_list = []
test_player_registry_list = []

# Loop through Test JSON files and extract data.
# os.listdir() returns names in arbitrary order; sorting keeps the output
# row order the same from run to run.
for filename in sorted(os.listdir(test_folder)):
    if not filename.endswith(".json"):
        continue
    file_path = os.path.join(test_folder, filename)

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        info = data.get("info", {})
        toss = info.get("toss", {})
        outcome = info.get("outcome", {})
        margin = outcome.get("by", {})

        # First listed date is the start; the last one is the end. A match
        # with a single listed date starts and ends on that date.
        dates = info.get("dates", [])
        start_date = dates[0] if dates else "NA"
        end_date = dates[-1] if len(dates) > 1 else start_date

        # Match Summary Data Extraction (the file name doubles as the match id).
        # `or "NA"` (rather than a .get default) also covers keys that are
        # present but null, matching the other format sections.
        test_match_summary = {
            "match_id": filename,
            "match_format": info.get("match_type", "Test"),
            "start_date": start_date,
            "end_date": end_date,
            "city": info.get("city") or "NA",
            "venue": info.get("venue") or "NA",
            "gender": info.get("gender") or "NA",
            "season": info.get("season") or "NA",
            "team_1": info.get("teams", ["NA", "NA"])[0],
            "team_2": info.get("teams", ["NA", "NA"])[1],
            "toss_winner": toss.get("winner") or "NA",
            "toss_decision": toss.get("decision") or "NA",
            "winner": outcome.get("winner") or "No Result",
            "win_by_runs": margin.get("runs", 0),
            "win_by_wickets": margin.get("wickets", 0),
            "balls_per_over": info.get("balls_per_over", 6),
            "player_of_match": info.get("player_of_match", ["NA"])[0]
        }

        # Rows for this file are staged locally and only committed once the
        # whole file has parsed, so a failure midway cannot leave a summary
        # row or a partial set of deliveries behind.
        match_balls = []
        match_players = []

        # Ball-by-ball Data Extraction
        for inning in data.get("innings", []):
            batting_team = inning.get("team") or "NA"
            for over_info in inning.get("overs", []):
                over_number = over_info.get("over", -1)
                for delivery in over_info.get("deliveries", []):
                    extras_detail = delivery.get("extras", {})
                    runs = delivery.get("runs", {})

                    # Only the first dismissal recorded on a delivery is kept.
                    if "wickets" in delivery:
                        wicket_type = delivery["wickets"][0].get("kind") or "NA"
                        player_out = delivery["wickets"][0].get("player_out") or "NA"
                    else:
                        wicket_type = None
                        player_out = None

                    match_balls.append({
                        "match_id": filename,
                        "inning_team": batting_team,
                        "over_number": over_number,
                        "batter": delivery.get("batter") or "NA",
                        "non_striker": delivery.get("non_striker") or "NA",
                        "bowler": delivery.get("bowler") or "NA",
                        "legbyes": extras_detail.get("legbyes", 0),
                        "wides": extras_detail.get("wides", 0),
                        "no_balls": extras_detail.get("noballs", 0),
                        "byes": extras_detail.get("byes", 0),
                        "runs_batter": runs.get("batter", 0),
                        "runs_extras": runs.get("extras", 0),
                        "runs_total": runs.get("total", 0),
                        "wicket_kind": wicket_type,
                        "player_out": player_out
                    })

        # Player Registry Data Extraction
        registry = info.get("registry", {}).get("people", {})
        for player_name, player_id in registry.items():
            match_players.append({
                "match_id": filename,
                "player_name": player_name or "NA",
                "player_id": player_id or "NA"
            })

        # Commit the staged rows for this file.
        test_summary_list.append(test_match_summary)
        test_ball_by_ball_list.extend(match_balls)
        test_player_registry_list.extend(match_players)

    except Exception as e:
        # Best effort: report the file and carry on with the rest.
        print(f"Error in Test file {filename}: {e}")

# Convert Test data to DataFrames
df_test_summary = pd.DataFrame(test_summary_list)
df_test_balls = pd.DataFrame(test_ball_by_ball_list)
df_test_registry = pd.DataFrame(test_player_registry_list)

# Apply Cleanup to Test DataFrames
df_test_summary = clean_and_select_dataframe(df_test_summary, TEST_SUMMARY_DESIRED_COLUMNS, "df_test_summary")
df_test_balls = clean_and_select_dataframe(df_test_balls, COMMON_BALL_BY_BALL_DESIRED_COLS, "df_test_balls")
df_test_registry = clean_and_select_dataframe(df_test_registry, PLAYER_REGISTRY_DESIRED_COLS, "df_test_registry")

# Convert 'date' columns to datetime (specific to df_test_summary)
# "NA" placeholders become NaN so they parse as missing; anything that does not
# parse as a date ends up as the string "NA" again after the final fill.
if "start_date" in df_test_summary.columns:
    df_test_summary.loc[:, "start_date"] = df_test_summary["start_date"].replace("NA", np.nan)
    df_test_summary.loc[:, "start_date"] = pd.to_datetime(df_test_summary["start_date"], errors="coerce")
    df_test_summary.loc[:, "start_date"] = df_test_summary["start_date"].fillna("NA")
if "end_date" in df_test_summary.columns:
    df_test_summary.loc[:, "end_date"] = df_test_summary["end_date"].replace("NA", np.nan)
    df_test_summary.loc[:, "end_date"] = pd.to_datetime(df_test_summary["end_date"], errors="coerce")
    df_test_summary.loc[:, "end_date"] = df_test_summary["end_date"].fillna("NA")


In [24]:

# Build the combined player registry and write every cleaned table to CSV.

# Start from a copy of the ODI registry, then append each other format's
# registry in turn (T20, IPL, Test), skipping any that came out empty.
df_combined_registry = df_odi_registry.copy()
for extra_registry in (df_t20_registry, df_ipl_registry, df_test_registry):
    if extra_registry.empty:
        continue
    df_combined_registry = pd.concat(
        [df_combined_registry, extra_registry], ignore_index=True
    )

# Keep the first row seen for each (player_name, player_id) pair.
df_combined_registry = df_combined_registry.drop_duplicates(
    subset=["player_name", "player_id"]
)

# Output file name -> table, written to the working directory in this order.
csv_outputs = {
    "odi_match_summary.csv": df_odi_summary,
    "odi_ball_by_ball.csv": df_odi_balls,
    "t20_match_summary.csv": df_t20_summary,
    "t20_ball_by_ball.csv": df_t20_balls,
    "ipl_match_summary.csv": df_ipl_summary,
    "ipl_ball_by_ball.csv": df_ipl_balls,
    "test_match_summary.csv": df_test_summary,
    "test_ball_by_ball.csv": df_test_balls,
    "player_registry.csv": df_combined_registry,
}
for csv_name, table in csv_outputs.items():
    table.to_csv(csv_name, index=False)