In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

In [2]:
# Tournament results index page; its table rows carry the links to the individual scorecards.
match_results = "https://www.espncricinfo.com/records/tournament/team-match-results/icc-cricket-world-cup-2023-24-15338"
# Site root that is prepended to the scorecard hrefs scraped from the results table.
base_url = 'https://www.espncricinfo.com'

# SCRAPING MATCH IDs AND SCORECARD URLS

In [3]:
# Download the results index. The timeout stops a stalled connection from
# hanging the notebook indefinitely.
response = requests.get(match_results, timeout=30)
if response.status_code != 200:
    raise Exception('Failed to load page {}'.format(match_results))
doc = BeautifulSoup(response.text, 'html.parser')

# urls[i] and matchId[i] always describe the same match; both lists are
# filled together, one entry per table row.
urls = []
matchId = []
table = doc.tbody
if table is None:
    # Fail with a readable message instead of "NoneType has no attribute find_all".
    raise Exception('No results table found on page {}'.format(match_results))

for row in table.find_all('tr'):
    columns = row.find_all('td')
    if not columns:
        # Rows without <td> cells carry no match data.
        continue
    # The seventh cell holds the scorecard link; its <span> text is the match id.
    link = columns[6].a
    if link is None or link.span is None or not link.get('href'):
        raise Exception('Unexpected results row layout on page {}'.format(match_results))
    scoreCard = base_url + link.get('href')
    matchId.append(link.span.text.strip())
    urls.append(scoreCard)

# SCRAPING BATTING DATA

In [4]:
all_match_data_batting = []

def get_batting_scorecard():
    """Scrape the batting scorecard of every match listed in ``urls``.

    Each scorecard page is downloaded, the two team names are read from the
    match header, and both innings are parsed with ``get_batting_data``.

    The module-level ``all_match_data_batting`` list is replaced with the
    fresh results only after every page has been parsed, so calling this
    function again does not duplicate matches and a failed run does not leave
    a half-filled list behind.

    Returns:
        list: ``all_match_data_batting``, holding one
        ``[match, [first_innings_rows, second_innings_rows], match_id]`` entry
        per URL, in the order of ``urls``.

    Raises:
        Exception: if a scorecard page does not answer with HTTP 200.
        requests.exceptions.Timeout: if a page does not respond within the timeout.
    """
    # CSS classes of the innings heading; the text before the first
    # non-breaking space is used as the team name.
    title_class = 'ds-text-title-xs ds-font-bold ds-text-typo'
    scraped = []

    # urls and matchId are filled in lockstep by the index-scraping cell, so
    # they can be paired directly instead of tracking a manual counter.
    for url, match_id in zip(urls, matchId):
        r = requests.get(url, timeout=30)
        if r.status_code != 200:
            raise Exception('Failed to load page {}'.format(url))
        soup = BeautifulSoup(r.text, 'html.parser')

        # Finding the rival teams
        teams = soup.find('div', class_ = 'ds-flex ds-flex-col ds-mt-3 md:ds-mt-0 ds-mt-0 ds-mb-1').find_all('div', class_= 'ci-team-score')
        matchup = [child.span.text for child in teams]
        match = matchup[0] + " vs " + matchup[1]

        # Finding the batting data: the second 'ds-mt-3' div holds the innings cards
        divs = soup.find('div', 'ds-grow').find_all('div', class_ = 'ds-mt-3')
        innings = divs[1].find_all('div', class_ = 'ds-rounded-lg ds-mt-2')
        first_innings = innings[0]
        second_innings = innings[1]

        # Collect the rows of each innings, tagged with its order and batting team
        batting_data = []
        for order, innings_div in ((1, first_innings), (2, second_innings)):
            team = innings_div.find('span', class_ = title_class).text.split('\xa0')[0]
            batting_data.append(get_batting_data(innings_div, order, team))

        scraped.append([match, batting_data, match_id])

    # Replace the contents in place so the module-level name stays bound to
    # the same list object.
    all_match_data_batting[:] = scraped
    return all_match_data_batting

In [5]:
# Parse the batting table of a single innings into plain row lists
def get_batting_data(innings, order, team_inning):
    """Extract one row per batter from an innings scorecard.

    Args:
        innings: parsed element containing a ``ci-scorecard-table`` table.
        order: innings number stored as the first field of every row.
        team_inning: team name stored as the second field of every row.

    Returns:
        list: rows of ``[order, team_inning, battingPos, batsmanName,
        dismissal, runs, balls, fours, sixes, strikeRate]``. Table rows that
        lack the expected cells or nested tags are skipped, and no more than
        twelve rows are collected.
    """
    table_body = innings.find('table', class_ = 'ci-scorecard-table').tbody
    batters = []

    for tr in table_body.find_all('tr'):
        cells = tr.find_all('td')

        # Twelve rows already collected: ignore whatever follows
        if len(batters) > 11:
            continue

        try:
            name = cells[0].div.span.span.text.strip()
            how_out = cells[1].text.strip().replace(" ", "")
            record = [
                order,
                team_inning,
                len(batters) + 1,   # batting position, 1-based
                name,
                "not_out" if how_out == "notout" else "out",
                cells[2].strong.text,
                cells[3].text,
                cells[5].text,
                cells[6].text,
                cells[7].text,
            ]
        except (AttributeError, IndexError):
            # Not a batter row (missing cells or nested tags)
            continue

        batters.append(record)

    return batters

In [6]:
# Run the batting scraper: one HTTP request per scorecard URL collected above.
batting_data = get_batting_scorecard()

In [7]:
# Each scraped record is [match, [first_innings_rows, second_innings_rows], match_id];
# 'Team' is the column that receives the nested two-innings list.
batting_columns = ['Match', 'Team', 'Match_id']

In [8]:
# One row per match; the per-innings batting rows are still nested inside 'Team'.
batting_df = pd.DataFrame(batting_data, columns = batting_columns)

In [9]:
# Preview the first rows to check the nested layout.
batting_df.head()

Unnamed: 0,Match,Team,Match_id
0,India vs Australia,"[[[1, India, 1, Rohit Sharma (c), out, 47, 31,...",ODI # 4705
1,South Africa vs Australia,"[[[1, South Africa, 1, Quinton de Kock †, out,...",ODI # 4704
2,India vs New Zealand,"[[[1, India, 1, Rohit Sharma (c), out, 47, 29,...",ODI # 4703
3,India vs Netherlands,"[[[1, India, 1, Rohit Sharma (c), out, 61, 54,...",ODI # 4702
4,England vs Pakistan,"[[[1, England, 1, Dawid Malan, out, 31, 39, 5,...",ODI # 4701


In [10]:
# Split the two-element 'Team' list into separate columns:
# 'Team1' receives the first-innings rows, 'Team2' the second-innings rows.
batting_df[['Team1', 'Team2']] = pd.DataFrame(batting_df['Team'].tolist(), index = batting_df.index)

In [11]:
# 'Team' is redundant once its contents live in 'Team1' and 'Team2'.
batting_df = batting_df.drop(['Team'], axis=1)

In [12]:
# Preview the frame after the split into 'Team1' and 'Team2'.
batting_df.head()

Unnamed: 0,Match,Match_id,Team1,Team2
0,India vs Australia,ODI # 4705,"[[1, India, 1, Rohit Sharma (c), out, 47, 31, ...","[[2, Australia, 1, David Warner, out, 7, 3, 1,..."
1,South Africa vs Australia,ODI # 4704,"[[1, South Africa, 1, Quinton de Kock †, out, ...","[[2, Australia, 1, Travis Head, out, 62, 48, 9..."
2,India vs New Zealand,ODI # 4703,"[[1, India, 1, Rohit Sharma (c), out, 47, 29, ...","[[2, New Zealand, 1, Devon Conway, out, 13, 15..."
3,India vs Netherlands,ODI # 4702,"[[1, India, 1, Rohit Sharma (c), out, 61, 54, ...","[[2, Netherlands, 1, Wesley Barresi, out, 4, 5..."
4,England vs Pakistan,ODI # 4701,"[[1, England, 1, Dawid Malan, out, 31, 39, 5, ...","[[2, Pakistan, 1, Abdullah Shafique, out, 0, 2..."


In [13]:
# Explodes the Team1 and Team2 columns of a DataFrame into one row per batter.
def explode_batting_teams(df):
    """Flatten the nested per-innings batting rows into a single DataFrame.

    Args:
        df: DataFrame with the columns ``Match``, ``Match_id``, ``Team1`` and
            ``Team2``, where ``Team1``/``Team2`` each hold a list of 10-field
            batting rows.

    Returns:
        pandas.DataFrame: all ``Team1`` rows (in match order) followed by all
        ``Team2`` rows, with a fresh 0..n-1 index. Columns are the ten batting
        fields, an empty ``match_id`` column (kept because later cells drop it
        by name), then ``Match`` and ``Match_id``. An empty ``df`` yields an
        empty frame with the same columns.
    """
    # Fields of one scraped batting row, in scrape order
    innings_columns = ["order", "team_inning", "battingPos", "name", "out/notout", "runs", "balls", "fours", "sixes", "sr"]
    output_columns = innings_columns + ["match_id", "Match", "Match_id"]

    # Collect the small per-match frames and concatenate once at the end;
    # growing a DataFrame inside the loop copies every earlier row again.
    team1_frames = []
    team2_frames = []

    for match in df[["Match", "Match_id", "Team1", "Team2"]].itertuples(index=False):
        for rows, bucket in ((match.Team1, team1_frames), (match.Team2, team2_frames)):
            frame = pd.DataFrame(rows, columns=innings_columns)
            frame["Match"] = match.Match
            frame["Match_id"] = match.Match_id
            bucket.append(frame)

    frames = team1_frames + team2_frames
    if not frames:
        # pd.concat rejects an empty list; return the empty schema instead.
        return pd.DataFrame(columns=output_columns)

    # reindex adds the all-NaN 'match_id' column and fixes the column order.
    return pd.concat(frames, ignore_index=True).reindex(columns=output_columns)

In [14]:
# Flatten to one row per batter; a copy is passed so batting_df itself is left untouched.
exploded_batting_df = explode_batting_teams(batting_df.copy())

In [15]:
# Preview the first twelve flattened rows.
exploded_batting_df.head(12)

Unnamed: 0,order,team_inning,battingPos,name,out/notout,runs,balls,fours,sixes,sr,match_id,Match,Match_id
0,1,India,1,Rohit Sharma (c),out,47,31,4,3,151.61,,India vs Australia,ODI # 4705
1,1,India,2,Shubman Gill,out,4,7,0,0,57.14,,India vs Australia,ODI # 4705
2,1,India,3,Virat Kohli,out,54,63,4,0,85.71,,India vs Australia,ODI # 4705
3,1,India,4,Shreyas Iyer,out,4,3,1,0,133.33,,India vs Australia,ODI # 4705
4,1,India,5,KL Rahul †,out,66,107,1,0,61.68,,India vs Australia,ODI # 4705
5,1,India,6,Ravindra Jadeja,out,9,22,0,0,40.9,,India vs Australia,ODI # 4705
6,1,India,7,Suryakumar Yadav,out,18,28,1,0,64.28,,India vs Australia,ODI # 4705
7,1,India,8,Mohammed Shami,out,6,10,1,0,60.0,,India vs Australia,ODI # 4705
8,1,India,9,Jasprit Bumrah,out,1,3,0,0,33.33,,India vs Australia,ODI # 4705
9,1,India,10,Kuldeep Yadav,out,10,18,0,0,55.55,,India vs Australia,ODI # 4705


In [16]:
# Regroup the rows match by match: Match_id descending (string comparison),
# and within a match the first innings (order 1) ahead of the second.
batting_summary_data = exploded_batting_df.sort_values(
    ['Match_id', 'order'],
    ascending=[False, True],
)

In [17]:
# Preview the first ten rows after the match-wise sort.
batting_summary_data.head(10)

Unnamed: 0,order,team_inning,battingPos,name,out/notout,runs,balls,fours,sixes,sr,match_id,Match,Match_id
0,1,India,1,Rohit Sharma (c),out,47,31,4,3,151.61,,India vs Australia,ODI # 4705
1,1,India,2,Shubman Gill,out,4,7,0,0,57.14,,India vs Australia,ODI # 4705
2,1,India,3,Virat Kohli,out,54,63,4,0,85.71,,India vs Australia,ODI # 4705
3,1,India,4,Shreyas Iyer,out,4,3,1,0,133.33,,India vs Australia,ODI # 4705
4,1,India,5,KL Rahul †,out,66,107,1,0,61.68,,India vs Australia,ODI # 4705
5,1,India,6,Ravindra Jadeja,out,9,22,0,0,40.9,,India vs Australia,ODI # 4705
6,1,India,7,Suryakumar Yadav,out,18,28,1,0,64.28,,India vs Australia,ODI # 4705
7,1,India,8,Mohammed Shami,out,6,10,1,0,60.0,,India vs Australia,ODI # 4705
8,1,India,9,Jasprit Bumrah,out,1,3,0,0,33.33,,India vs Australia,ODI # 4705
9,1,India,10,Kuldeep Yadav,out,10,18,0,0,55.55,,India vs Australia,ODI # 4705


In [18]:
# Remove the helper columns: 'match_id' is empty and 'order' was only needed for sorting.
batting_summary_data = batting_summary_data.drop(columns=['match_id', 'order'])

In [19]:
# Display the final batting table.
batting_summary_data

Unnamed: 0,team_inning,battingPos,name,out/notout,runs,balls,fours,sixes,sr,Match,Match_id
0,India,1,Rohit Sharma (c),out,47,31,4,3,151.61,India vs Australia,ODI # 4705
1,India,2,Shubman Gill,out,4,7,0,0,57.14,India vs Australia,ODI # 4705
2,India,3,Virat Kohli,out,54,63,4,0,85.71,India vs Australia,ODI # 4705
3,India,4,Shreyas Iyer,out,4,3,1,0,133.33,India vs Australia,ODI # 4705
4,India,5,KL Rahul †,out,66,107,1,0,61.68,India vs Australia,ODI # 4705
...,...,...,...,...,...,...,...,...,...,...,...
471,England,10,Adil Rashid,not_out,15,13,0,1,115.38,England vs New Zealand,ODI # 4658
472,England,11,Mark Wood,not_out,13,14,0,0,92.85,England vs New Zealand,ODI # 4658
873,New Zealand,1,Devon Conway,not_out,152,121,19,3,125.61,England vs New Zealand,ODI # 4658
874,New Zealand,2,Will Young,out,0,1,0,0,0.00,England vs New Zealand,ODI # 4658


In [20]:
# Make sure the output directory exists; exist_ok=True keeps a re-run from failing.
os.makedirs('./data', exist_ok=True)

In [21]:
# Write the batting summary without the row index. index=False is the
# documented way to omit it; index=None only worked because None is falsy.
batting_summary_data.to_csv('./data/wc2023_batting_summary.csv', index=False)

# SCRAPING BOWLING DATA

In [22]:
all_match_data_bowling = []

def get_bowling_scorecard():
    """Scrape the bowling figures of every match listed in ``urls``.

    Each scorecard page is downloaded, the two team names are read from the
    match header, and the bowling table of both innings is parsed with
    ``get_bowling_data``. The bowlers of an innings are labelled with the
    team named in the heading of the *other* innings.

    The module-level ``all_match_data_bowling`` list is replaced with the
    fresh results only after every page has been parsed, so calling this
    function again does not duplicate matches and a failed run does not leave
    a half-filled list behind.

    Returns:
        list: ``all_match_data_bowling``, holding one
        ``[match, [first_innings_rows, second_innings_rows], match_id]`` entry
        per URL, in the order of ``urls``.

    Raises:
        Exception: if a scorecard page does not answer with HTTP 200.
        requests.exceptions.Timeout: if a page does not respond within the timeout.
    """
    # CSS classes of the innings heading; the text before the first
    # non-breaking space is used as the team name.
    title_class = 'ds-text-title-xs ds-font-bold ds-text-typo'
    scraped = []

    # urls and matchId are filled in lockstep by the index-scraping cell, so
    # they can be paired directly instead of tracking a manual counter.
    for url, match_id in zip(urls, matchId):
        r = requests.get(url, timeout=30)
        if r.status_code != 200:
            raise Exception('Failed to load page {}'.format(url))
        soup = BeautifulSoup(r.text, 'html.parser')

        # Find the rival teams
        teams = soup.find('div', class_ = 'ds-flex ds-flex-col ds-mt-3 md:ds-mt-0 ds-mt-0 ds-mb-1').find_all('div', class_= 'ci-team-score')
        matchup = [child.span.text for child in teams]
        match = matchup[0] + " vs " + matchup[1]

        # Find the bowling data: the second 'ds-mt-3' div holds the innings cards
        divs = soup.find('div', 'ds-grow').find_all('div', class_ = 'ds-mt-3')
        innings = divs[1].find_all('div', class_ = 'ds-rounded-lg ds-mt-2')
        first_innings = innings[0]
        second_innings = innings[1]

        first_title_team = first_innings.find('span', class_ = title_class).text.split('\xa0')[0]
        second_title_team = second_innings.find('span', class_ = title_class).text.split('\xa0')[0]

        # Each innings' bowlers are tagged with the opposite heading's team name
        bowling_data = [
            get_bowling_data(first_innings, 1, second_title_team),
            get_bowling_data(second_innings, 2, first_title_team),
        ]

        scraped.append([match, bowling_data, match_id])

    # Replace the contents in place so the module-level name stays bound to
    # the same list object.
    all_match_data_bowling[:] = scraped
    return all_match_data_bowling

In [23]:
# Parse the bowling table of a single innings into plain row lists
def get_bowling_data(innings,order, team_inning):
    """Extract one row per bowler from an innings scorecard.

    Args:
        innings: parsed element containing the bowling table.
        order: innings number stored as the first field of every row.
        team_inning: team name stored as the second field of every row.

    Returns:
        list: rows of ``[order, team_inning, bowlerName, overs, maidens, runs,
        wickets, economy, dots, fours, sixes, wides, no_balls]``. Table rows
        that lack the expected cells or nested tags are skipped.
    """
    table_body = innings.find('table', class_ = 'ds-w-full ds-table ds-table-md ds-table-auto').tbody
    figures = []

    for tr in table_body.find_all('tr'):
        cells = tr.find_all('td')

        try:
            record = [
                order,
                team_inning,
                cells[0].div.span.text.strip(),            # bowler name
                cells[1].text,                              # overs
                cells[2].text,                              # maidens
                cells[3].text,                              # runs
                cells[4].strong.text,                       # wickets
                *(cells[pos].text for pos in range(5, 11)), # economy .. no-balls
            ]
        except (AttributeError, IndexError):
            # Not a bowler row (missing cells or nested tags)
            continue

        figures.append(record)

    return figures

In [24]:
# Run the bowling scraper: one HTTP request per scorecard URL collected above.
bowling_data = get_bowling_scorecard()

In [25]:
# Each scraped record is [match, [first_innings_rows, second_innings_rows], match_id];
# 'Team' is the column that receives the nested two-innings list.
bowling_columns = ['Match', 'Team', 'Match_id']

In [26]:
# One row per match; the per-innings bowling rows are still nested inside 'Team'.
bowling_df = pd.DataFrame(bowling_data, columns=bowling_columns)

In [27]:
# Preview the first rows to check the nested layout.
bowling_df.head()

Unnamed: 0,Match,Team,Match_id
0,India vs Australia,"[[[1, Australia, Mitchell Starc, 10, 0, 55, 3,...",ODI # 4705
1,South Africa vs Australia,"[[[1, Australia, Mitchell Starc, 10, 1, 34, 3,...",ODI # 4704
2,India vs New Zealand,"[[[1, New Zealand, Trent Boult, 10, 0, 86, 1, ...",ODI # 4703
3,India vs Netherlands,"[[[1, Netherlands, Aryan Dutt, 7, 0, 52, 0, 7....",ODI # 4702
4,England vs Pakistan,"[[[1, Pakistan, Shaheen Shah Afridi, 10, 1, 72...",ODI # 4701


In [28]:
# Split the two-element 'Team' list into separate columns:
# 'Team1' receives the first-innings rows, 'Team2' the second-innings rows.
bowling_df[['Team1', 'Team2']] = pd.DataFrame(bowling_df['Team'].tolist(), index=bowling_df.index)

In [29]:
# 'Team' is redundant once its contents live in 'Team1' and 'Team2'.
bowling_df = bowling_df.drop(['Team'], axis=1)

In [30]:
# Preview the frame after the split into 'Team1' and 'Team2'.
bowling_df.head()

Unnamed: 0,Match,Match_id,Team1,Team2
0,India vs Australia,ODI # 4705,"[[1, Australia, Mitchell Starc, 10, 0, 55, 3, ...","[[2, India, Jasprit Bumrah, 9, 2, 43, 2, 4.77,..."
1,South Africa vs Australia,ODI # 4704,"[[1, Australia, Mitchell Starc, 10, 1, 34, 3, ...","[[2, South Africa, Marco Jansen, 4.2, 0, 35, 0..."
2,India vs New Zealand,ODI # 4703,"[[1, New Zealand, Trent Boult, 10, 0, 86, 1, 8...","[[2, India, Jasprit Bumrah, 10, 1, 64, 1, 6.40..."
3,India vs Netherlands,ODI # 4702,"[[1, Netherlands, Aryan Dutt, 7, 0, 52, 0, 7.4...","[[2, India, Jasprit Bumrah, 9, 1, 33, 2, 3.66,..."
4,England vs Pakistan,ODI # 4701,"[[1, Pakistan, Shaheen Shah Afridi, 10, 1, 72,...","[[2, England, David Willey, 10, 0, 56, 3, 5.60..."


In [33]:
# Explodes the Team1 and Team2 columns of a DataFrame into one row per bowler.
def explode_teams_bowling(df):
    """Flatten the nested per-innings bowling rows into a single DataFrame.

    Args:
        df: DataFrame with the columns ``Match``, ``Match_id``, ``Team1`` and
            ``Team2``, where ``Team1``/``Team2`` each hold a list of 13-field
            bowling rows.

    Returns:
        pandas.DataFrame: all ``Team1`` rows (in match order) followed by all
        ``Team2`` rows, with a fresh 0..n-1 index. Columns are the thirteen
        bowling fields, an empty ``match_id`` column (kept because later cells
        drop it by name), then ``Match`` and ``Match_id``. An empty ``df``
        yields an empty frame with the same columns.
    """
    # Fields of one scraped bowling row, in scrape order
    innings_columns = ["order", "team_inning", "bowlerName", "overs", "maidens", "runs", "wickets", "economy", "dots", "fours", "sixes", "wides", "no_balls"]
    output_columns = innings_columns + ["match_id", "Match", "Match_id"]

    # Collect the small per-match frames and concatenate once at the end;
    # growing a DataFrame inside the loop copies every earlier row again.
    team1_frames = []
    team2_frames = []

    for match in df[["Match", "Match_id", "Team1", "Team2"]].itertuples(index=False):
        for rows, bucket in ((match.Team1, team1_frames), (match.Team2, team2_frames)):
            frame = pd.DataFrame(rows, columns=innings_columns)
            frame["Match"] = match.Match
            frame["Match_id"] = match.Match_id
            bucket.append(frame)

    frames = team1_frames + team2_frames
    if not frames:
        # pd.concat rejects an empty list; return the empty schema instead.
        return pd.DataFrame(columns=output_columns)

    # reindex adds the all-NaN 'match_id' column and fixes the column order.
    return pd.concat(frames, ignore_index=True).reindex(columns=output_columns)

In [34]:
# Flatten to one row per bowler; a copy is passed so bowling_df itself is left untouched.
exploded_bowling_df = explode_teams_bowling(bowling_df.copy())

In [35]:
# Preview the first ten flattened rows.
exploded_bowling_df.head(10)

Unnamed: 0,order,team_inning,bowlerName,overs,maidens,runs,wickets,economy,dots,fours,sixes,wides,no_balls,match_id,Match,Match_id
0,1,Australia,Mitchell Starc,10.0,0,55,3,5.5,30,4,1,3,0,,India vs Australia,ODI # 4705
1,1,Australia,Josh Hazlewood,10.0,0,60,2,6.0,22,4,1,1,0,,India vs Australia,ODI # 4705
2,1,Australia,Glenn Maxwell,6.0,0,35,1,5.83,19,4,1,0,0,,India vs Australia,ODI # 4705
3,1,Australia,Pat Cummins,10.0,0,34,2,3.4,30,0,0,2,0,,India vs Australia,ODI # 4705
4,1,Australia,Adam Zampa,10.0,0,44,1,4.4,22,1,0,1,0,,India vs Australia,ODI # 4705
5,1,Australia,Mitchell Marsh,2.0,0,5,0,2.5,7,0,0,0,0,,India vs Australia,ODI # 4705
6,1,Australia,Travis Head,2.0,0,4,0,2.0,8,0,0,0,0,,India vs Australia,ODI # 4705
7,1,Australia,Mitchell Starc,10.0,1,34,3,3.4,46,3,1,4,0,,South Africa vs Australia,ODI # 4704
8,1,Australia,Josh Hazlewood,8.0,3,12,2,1.5,39,1,0,0,0,,South Africa vs Australia,ODI # 4704
9,1,Australia,Pat Cummins,9.4,0,51,3,5.27,32,5,1,2,0,,South Africa vs Australia,ODI # 4704


In [36]:
# Regroup the rows match by match: Match_id descending (string comparison),
# and within a match the first innings (order 1) ahead of the second.
bowling_summary_data = exploded_bowling_df.sort_values(
    ['Match_id', 'order'],
    ascending=[False, True],
)

In [37]:
# Preview the first twelve rows after the match-wise sort.
bowling_summary_data.head(12)

Unnamed: 0,order,team_inning,bowlerName,overs,maidens,runs,wickets,economy,dots,fours,sixes,wides,no_balls,match_id,Match,Match_id
0,1,Australia,Mitchell Starc,10,0,55,3,5.5,30,4,1,3,0,,India vs Australia,ODI # 4705
1,1,Australia,Josh Hazlewood,10,0,60,2,6.0,22,4,1,1,0,,India vs Australia,ODI # 4705
2,1,Australia,Glenn Maxwell,6,0,35,1,5.83,19,4,1,0,0,,India vs Australia,ODI # 4705
3,1,Australia,Pat Cummins,10,0,34,2,3.4,30,0,0,2,0,,India vs Australia,ODI # 4705
4,1,Australia,Adam Zampa,10,0,44,1,4.4,22,1,0,1,0,,India vs Australia,ODI # 4705
5,1,Australia,Mitchell Marsh,2,0,5,0,2.5,7,0,0,0,0,,India vs Australia,ODI # 4705
6,1,Australia,Travis Head,2,0,4,0,2.0,8,0,0,0,0,,India vs Australia,ODI # 4705
298,2,India,Jasprit Bumrah,9,2,43,2,4.77,37,8,0,0,0,,India vs Australia,ODI # 4705
299,2,India,Mohammed Shami,7,1,47,1,6.71,28,6,1,3,0,,India vs Australia,ODI # 4705
300,2,India,Ravindra Jadeja,10,0,43,0,4.3,29,1,1,1,0,,India vs Australia,ODI # 4705


In [38]:
# Remove the helper columns: 'match_id' is empty and 'order' was only needed for sorting.
bowling_summary_data = bowling_summary_data.drop(columns=['match_id', 'order'])

In [39]:
# Preview the final bowling table.
bowling_summary_data.head(10)

Unnamed: 0,team_inning,bowlerName,overs,maidens,runs,wickets,economy,dots,fours,sixes,wides,no_balls,Match,Match_id
0,Australia,Mitchell Starc,10,0,55,3,5.5,30,4,1,3,0,India vs Australia,ODI # 4705
1,Australia,Josh Hazlewood,10,0,60,2,6.0,22,4,1,1,0,India vs Australia,ODI # 4705
2,Australia,Glenn Maxwell,6,0,35,1,5.83,19,4,1,0,0,India vs Australia,ODI # 4705
3,Australia,Pat Cummins,10,0,34,2,3.4,30,0,0,2,0,India vs Australia,ODI # 4705
4,Australia,Adam Zampa,10,0,44,1,4.4,22,1,0,1,0,India vs Australia,ODI # 4705
5,Australia,Mitchell Marsh,2,0,5,0,2.5,7,0,0,0,0,India vs Australia,ODI # 4705
6,Australia,Travis Head,2,0,4,0,2.0,8,0,0,0,0,India vs Australia,ODI # 4705
298,India,Jasprit Bumrah,9,2,43,2,4.77,37,8,0,0,0,India vs Australia,ODI # 4705
299,India,Mohammed Shami,7,1,47,1,6.71,28,6,1,3,0,India vs Australia,ODI # 4705
300,India,Ravindra Jadeja,10,0,43,0,4.3,29,1,1,1,0,India vs Australia,ODI # 4705


In [41]:
# Write the bowling summary without the row index. index=False is the
# documented way to omit it; index=None only worked because None is falsy.
bowling_summary_data.to_csv('./data/wc2023_bowling_summary.csv', index=False)