In [None]:
# from kfp import dsl
# from kfp.dsl import (
#     component,
# )

# from kfp import compiler
# from google.cloud import aiplatform

import pandas as pd
import requests


# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)

In [11]:
# Base URL of the cricket API; process_match appends "/{match_id}/{inning}/{over}" to it.
API_BASE_URL = "https://big-data-project-api-248863766350.europe-west3.run.app/cricket"

In [None]:
def process_delivery(delivery, match_id, inning, over, cumulative_score, cumulative_wickets):
    """Flatten one delivery record into a row dict and advance the running totals.

    Args:
        delivery: Mapping with the keys "batter", "bowler", "non_striker" and
            "runs" (itself a mapping with "batter", "extras" and "total").
            An optional "wickets" entry is counted by its length.
        match_id: Identifier copied into the row unchanged.
        inning: Innings number copied into the row unchanged.
        over: Over number copied into the row unchanged.
        cumulative_score: Running score before this delivery.
        cumulative_wickets: Running wicket count before this delivery.

    Returns:
        A tuple ``(row, cumulative_score, cumulative_wickets)`` where the two
        totals already include this delivery and are also stored in ``row``.

    Raises:
        KeyError: If one of the required keys is missing from ``delivery``.
    """
    # Identification columns come first so they lead the resulting frame.
    row = {"match_id": match_id, "inning": inning, "over": over}

    for player_role in ("batter", "bowler", "non_striker"):
        row[player_role] = delivery[player_role]

    runs = delivery["runs"]
    for run_kind in ("batter", "extras", "total"):
        row[f"runs_{run_kind}"] = runs[run_kind]

    # Advance the running totals with what happened on this ball.
    cumulative_score += row["runs_total"]
    if "wickets" in delivery:
        cumulative_wickets += len(delivery["wickets"])

    row["cumulative_score"] = cumulative_score
    row["cumulative_wickets"] = cumulative_wickets
    return row, cumulative_score, cumulative_wickets

def process_players(data, first_team):
    """Build the fixed-width ``team_{1,2}_player_{1..11}`` columns for a match.

    Every squad listed under ``data["players"]`` is cut down to 11 names, or
    padded with "Placeholder <i>" entries when it has fewer than 11.

    Args:
        data: Mapping whose "players" entry maps team name -> list of player names.
        first_team: Team name whose players become the ``team_1_*`` columns; the
            first other team found in ``data["players"]`` becomes ``team_2_*``.

    Returns:
        Dict mapping "team_1_player_1" ... "team_2_player_11" to player names.

    Raises:
        KeyError: If "players" is missing or ``first_team`` is not one of its keys.
        IndexError: If there is no team other than ``first_team``.
    """
    def _fit_to_eleven(names):
        # Trim long squads; pad short ones so every match yields the same columns.
        if len(names) >= 11:
            return names[:11]
        shortfall = 11 - len(names)
        return names + [f"Placeholder {i}" for i in range(shortfall)]

    squads = {name: _fit_to_eleven(members) for name, members in data["players"].items()}

    first_squad = squads[first_team]
    opponents = [name for name in squads if name != first_team]
    second_squad = squads[opponents[0]]

    lineup = {
        f"team_1_player_{slot}": name
        for slot, name in enumerate(first_squad, start=1)
    }
    lineup.update(
        (f"team_2_player_{slot}", name)
        for slot, name in enumerate(second_squad, start=1)
    )
    return lineup


def process_match(match_id, timeout=30):
    """Download one match over by over and return one DataFrame row per over.

    For each innings (1 and 2) up to 20 overs are requested from
    ``{API_BASE_URL}/{match_id}/{inning}/{over}``. Fetching of an innings stops
    as soon as the API answers with the string "Invalid data provided."
    (presumably meaning the over does not exist -- confirm against the API).

    Every delivery of an over is folded into the running score/wicket totals,
    but only the row built from the LAST delivery of the over is kept, so each
    row describes the state at the end of that over. Match-level fields, the
    first-innings summary and the 22 player columns are then attached to it.
    Rows of innings 1 carry zeros for the first-innings summary columns because
    the summary is only known once innings 1 has been read completely.

    Args:
        match_id: Match identifier inserted into the request URL.
        timeout: Seconds to wait for each HTTP response before requests raises
            a timeout error, so a stalled connection cannot hang the notebook.

    Returns:
        pandas.DataFrame with one row per over that contained deliveries
        (empty when the API returned no usable over).
    """
    rows = []
    cumulative_score = {1: 0, 2: 0}
    cumulative_wickets = {1: 0, 2: 0}
    first_inning_summary = {}
    # Bound up front: if over 0 of innings 1 is missing, later overs must not
    # fail with a NameError when the player columns are attached.
    teams = {}

    for inning in [1, 2]:
        # Counted explicitly; the loop variable is one too high after a `break`.
        overs_bowled = 0

        for over in range(20):
            api_url = f"{API_BASE_URL}/{match_id}/{inning}/{over}"
            response = requests.get(api_url, timeout=timeout)
            data = response.json()

            if data == "Invalid data provided.":
                break

            if inning == 1 and over == 0:
                teams = process_players(data, first_team=data.get("team"))

            # Reset per over so an over without deliveries cannot reuse (and
            # mutate) the dict that was already appended for the previous over.
            row = None
            for delivery in data.get("deliveries", []):
                row, cumulative_score[inning], cumulative_wickets[inning] = process_delivery(
                    delivery, match_id, inning, over,
                    cumulative_score[inning], cumulative_wickets[inning]
                )

            if row is None:
                # Nothing was bowled in this over: no row and no over counted.
                continue

            overs_bowled += 1
            row.update({
                "winner": data.get("winner"),
                "toss_decision": data.get("toss_decision"),
                "toss_winner": data.get("toss_winner"),
                "season": data.get("season"),
                "city": data.get("city"),
                "current_team": data.get("team"),
                "first_inning_total_score": first_inning_summary.get("total_score", 0),
                "first_inning_total_wickets": first_inning_summary.get("total_wickets", 0),
                "first_inning_run_rate": first_inning_summary.get("run_rate", 0)
            })
            row.update(teams)
            rows.append(row)

        if inning == 1:
            # Only innings 1 feeds the summary that innings-2 rows carry.
            first_inning_summary = {
                "total_score": cumulative_score[1],
                "total_wickets": cumulative_wickets[1],
                "run_rate": cumulative_score[1] / overs_bowled if overs_bowled else 0.0
            }

    return pd.DataFrame(rows)

def load_data(match_ids):
    """Fetch every match in ``match_ids`` and stack the results into one frame.

    Args:
        match_ids: Iterable of match identifiers, each passed to ``process_match``.

    Returns:
        pandas.DataFrame with the rows of all matches in input order and a
        fresh 0..n-1 index. An empty DataFrame is returned when ``match_ids``
        is empty.
    """
    all_matches = [process_match(match_id) for match_id in match_ids]

    if not all_matches:
        # pd.concat raises ValueError on an empty list, so short-circuit here.
        return pd.DataFrame()

    return pd.concat(all_matches, ignore_index=True)

# Match ids to download; load_data calls process_match once per id.
MATCH_IDS = [1, 2]


# Fetch and combine all listed matches, then show the frame as the cell output.
df = load_data(MATCH_IDS)
df


Unnamed: 0,match_id,inning,over,batter,bowler,runs_batter,runs_extras,runs_total,cumulative_score,cumulative_wickets,winner,toss_decision,toss_winner,season,city,current_team,first_inning_total_score,first_inning_total_wickets,first_inning_run_rate,team_1_player_1,team_1_player_2,team_1_player_3,team_1_player_4,team_1_player_5,team_1_player_6,team_1_player_7,team_1_player_8,team_1_player_9,team_1_player_10,team_1_player_11,team_2_player_1,team_2_player_2,team_2_player_3,team_2_player_4,team_2_player_5,team_2_player_6,team_2_player_7,team_2_player_8,team_2_player_9,team_2_player_10,team_2_player_11
0,1,1,0,MJ Clarke,DR Tuffey,0,0,0,10,1,Australia,bat,Australia,2004/05,Auckland,Australia,0,0,0.00,AC Gilchrist,MJ Clarke,A Symonds,RT Ponting,DR Martyn,SM Katich,MEK Hussey,JR Hopes,B Lee,MS Kasprowicz,GD McGrath,BB McCullum,SP Fleming,MS Sinclair,SB Styris,CD McMillan,CL Cairns,HJH Marshall,AR Adams,JW Wilson,KD Mills,DR Tuffey
1,1,1,1,RT Ponting,KD Mills,1,0,1,22,2,Australia,bat,Australia,2004/05,Auckland,Australia,0,0,0.00,AC Gilchrist,MJ Clarke,A Symonds,RT Ponting,DR Martyn,SM Katich,MEK Hussey,JR Hopes,B Lee,MS Kasprowicz,GD McGrath,BB McCullum,SP Fleming,MS Sinclair,SB Styris,CD McMillan,CL Cairns,HJH Marshall,AR Adams,JW Wilson,KD Mills,DR Tuffey
2,1,1,2,A Symonds,DR Tuffey,1,0,1,26,2,Australia,bat,Australia,2004/05,Auckland,Australia,0,0,0.00,AC Gilchrist,MJ Clarke,A Symonds,RT Ponting,DR Martyn,SM Katich,MEK Hussey,JR Hopes,B Lee,MS Kasprowicz,GD McGrath,BB McCullum,SP Fleming,MS Sinclair,SB Styris,CD McMillan,CL Cairns,HJH Marshall,AR Adams,JW Wilson,KD Mills,DR Tuffey
3,1,1,3,A Symonds,KD Mills,0,0,0,46,3,Australia,bat,Australia,2004/05,Auckland,Australia,0,0,0.00,AC Gilchrist,MJ Clarke,A Symonds,RT Ponting,DR Martyn,SM Katich,MEK Hussey,JR Hopes,B Lee,MS Kasprowicz,GD McGrath,BB McCullum,SP Fleming,MS Sinclair,SB Styris,CD McMillan,CL Cairns,HJH Marshall,AR Adams,JW Wilson,KD Mills,DR Tuffey
4,1,1,4,DR Martyn,DR Tuffey,1,0,1,54,3,Australia,bat,Australia,2004/05,Auckland,Australia,0,0,0.00,AC Gilchrist,MJ Clarke,A Symonds,RT Ponting,DR Martyn,SM Katich,MEK Hussey,JR Hopes,B Lee,MS Kasprowicz,GD McGrath,BB McCullum,SP Fleming,MS Sinclair,SB Styris,CD McMillan,CL Cairns,HJH Marshall,AR Adams,JW Wilson,KD Mills,DR Tuffey
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,2,2,10,MS Kasprowicz,PD Collingwood,0,0,0,68,8,England,bat,England,2005,Southampton,Australia,179,8,8.95,ME Trescothick,GO Jones,A Flintoff,KP Pietersen,MP Vaughan,PD Collingwood,AJ Strauss,VS Solanki,J Lewis,D Gough,SJ Harmison,AC Gilchrist,ML Hayden,A Symonds,MJ Clarke,MEK Hussey,RT Ponting,DR Martyn,B Lee,JN Gillespie,MS Kasprowicz,GD McGrath
71,2,2,11,B Lee,A Flintoff,1,0,1,72,8,England,bat,England,2005,Southampton,Australia,179,8,8.95,ME Trescothick,GO Jones,A Flintoff,KP Pietersen,MP Vaughan,PD Collingwood,AJ Strauss,VS Solanki,J Lewis,D Gough,SJ Harmison,AC Gilchrist,ML Hayden,A Symonds,MJ Clarke,MEK Hussey,RT Ponting,DR Martyn,B Lee,JN Gillespie,MS Kasprowicz,GD McGrath
72,2,2,12,GD McGrath,PD Collingwood,1,0,1,78,9,England,bat,England,2005,Southampton,Australia,179,8,8.95,ME Trescothick,GO Jones,A Flintoff,KP Pietersen,MP Vaughan,PD Collingwood,AJ Strauss,VS Solanki,J Lewis,D Gough,SJ Harmison,AC Gilchrist,ML Hayden,A Symonds,MJ Clarke,MEK Hussey,RT Ponting,DR Martyn,B Lee,JN Gillespie,MS Kasprowicz,GD McGrath
73,2,2,13,GD McGrath,A Flintoff,1,0,1,79,9,England,bat,England,2005,Southampton,Australia,179,8,8.95,ME Trescothick,GO Jones,A Flintoff,KP Pietersen,MP Vaughan,PD Collingwood,AJ Strauss,VS Solanki,J Lewis,D Gough,SJ Harmison,AC Gilchrist,ML Hayden,A Symonds,MJ Clarke,MEK Hussey,RT Ponting,DR Martyn,B Lee,JN Gillespie,MS Kasprowicz,GD McGrath
