# Import necessary libraries

In [2]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import re
import json
import csv

In [15]:
# Browser-like request headers (the User-Agent string identifies as Firefox on Ubuntu).
headers = {
    # The Accept value is split across lines purely for readability; adjacent
    # string literals are concatenated into the single header value.
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/webp,image/apng,*/*;q=0.8,"
        "application/signed-exchange;v=b3;q=0.9"
    ),
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8",
    "Dnt": "1",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:123.0) Gecko/20100101 Firefox/123.0",
}

In [16]:
# Both season pages share one URL layout; only the year (repeated twice) differs.
_results_url = 'https://www.espncricinfo.com/records/year/team-match-results/{year}-{year}/test-matches-1'
url2023 = _results_url.format(year=2023)
url2024 = _results_url.format(year=2024)

# Scraping data from table tags

In [17]:
def extract_match(soup:BeautifulSoup):
    """Build a DataFrame of decided matches from a parsed results page.

    Every ``table > tbody > tr`` row of ``soup`` is read. A row is expected to
    hold seven cells, in order: team 1, team 2, winner, margin, ground, match
    date, and a cell whose text is the match id and whose ``<a>`` link points
    to the scorecard.

    Rows are skipped when they have fewer than seven cells, when the last cell
    carries no link, or when the margin is ``'-'`` (no decided result).

    The teams are re-ordered from the margin text:

    * margin mentions "wicket": ``team2`` is the winner, ``team1`` the other side;
    * margin mentions "run": ``team1`` is the winner, ``team2`` the other side;
    * anything else: the order listed on the page is kept.

    Args:
        soup: Parsed results page.

    Returns:
        A ``pandas.DataFrame`` with the columns ``matchid``, ``team1``,
        ``team2``, ``winner``, ``margin``, ``matchDate`` and ``scorecard_link``
        (the ground is read from the page but not part of the result). The
        columns are present even when no row qualifies.
    """
    columns = ['matchid', 'team1', 'team2', 'winner', 'margin', 'matchDate', 'scorecard_link']
    records = []

    for row in soup.select('table > tbody > tr'):
        cells = row.find_all('td')
        if len(cells) < 7:
            # Not a full result row (e.g. a header or spacer row).
            continue

        anchor = cells[6].find('a')
        href = anchor.get('href') if anchor is not None else None
        if not href:
            # Without a link there is no scorecard to scrape later on.
            continue

        listed_team1, listed_team2, winner, margin = (
            cell.text.strip() for cell in cells[:4]
        )
        if margin == '-':
            continue

        other_side = listed_team1 if listed_team1 != winner else listed_team2

        # 'wicket' also matches 'wickets', and 'run' also matches 'runs'.
        if 'wicket' in margin:
            team1, team2 = other_side, winner
        elif 'run' in margin:
            team1, team2 = winner, other_side
        else:
            # Unrecognised margin text: keep the listed order instead of
            # reusing whatever the previous row happened to assign.
            team1, team2 = listed_team1, listed_team2

        records.append({
            'matchid': cells[6].text.strip(),
            'team1': team1,
            'team2': team2,
            'winner': winner,
            'margin': margin,
            'matchDate': cells[5].text.strip(),
            'scorecard_link': "https://www.espncricinfo.com" + href.strip(),
        })

    # Passing the column list keeps the schema stable for an empty result.
    return pd.DataFrame(records, columns=columns)

# Scrape data from both links and combine them into one

In [18]:
# Fetch and parse each season's results page in turn. The request sends the
# browser-like `headers` defined above, fails loudly on an HTTP error status
# instead of parsing an error page, and cannot hang forever thanks to the timeout.
season_frames = {}

for season_url in (url2023, url2024):
    page = requests.get(season_url, headers=headers, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')
    season_frames[season_url] = extract_match(soup)

match2023df = season_frames[url2023]
match2024df = season_frames[url2024]

In [19]:
# Keep only the first 12 rows of the 2023 results.
# NOTE(review): presumably these are the 2023 matches that fall inside the
# WTC 2023-2025 cycle; confirm the count against the results page.
match2023df = match2023df.iloc[:12]

# Stack 2024 on top of 2023 and renumber the rows 0..n-1.
matchdf = pd.concat([match2024df, match2023df], ignore_index=True)

# Drop matches involving Afghanistan or Ireland, as they are not part of the WTC 2023-2025 cycle

In [20]:
# Drop every record from matchdf where team1 or team2 is 'Afghanistan' or 'Ireland'.
excluded_teams = ['Afghanistan', 'Ireland']
involves_excluded = matchdf['team1'].isin(excluded_teams) | matchdf['team2'].isin(excluded_teams)
matchdf = matchdf[~involves_excluded]

# Scraping from each match

In [46]:
def scrape_match_data(link:str):
    """Download one scorecard page and return its batting and bowling figures as JSON.

    Each ``div`` with class ``ds-rounded-lg ds-mt-2`` is treated as one innings.
    Its first ``table > tbody`` is read as the batting table and its second as
    the bowling table. The innings title is the text of the innings header
    ``div`` up to the first ``(``.

    Args:
        link: URL of the scorecard page.

    Returns:
        A JSON string encoding a dict with three keys, all always present:

        * ``'bowling_summary'``: innings title -> bowler -> stats with keys
          ``O, M, R, W, EC, WD, NB``;
        * ``'batting_summary'``: innings title -> batsman -> stats with keys
          ``R, B, M, 4s, 6s, SR``;
        * ``'series'``: text of the page's ``h1.ds-text-title-xs`` heading.

        All stat values are the raw cell text.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if the page has no ``h1.ds-text-title-xs`` heading.
    """
    # Stat key -> cell position. Batting starts at cell 2 because cell 1 is not
    # exported (NOTE(review): presumably the dismissal text - confirm).
    batting_keys = ('R', 'B', 'M', '4s', '6s', 'SR')
    bowling_keys = ('O', 'M', 'R', 'W', 'EC', 'WD', 'NB')
    title_class = 'ds-flex ds-px-4 ds-border-b ds-border-line ds-py-3 ds-bg-ui-fill-translucent-hover'

    scorecard = requests.get(link, timeout=30)
    scorecard.raise_for_status()
    soup = BeautifulSoup(scorecard.content, 'html.parser')

    series_tag = soup.select_one('h1.ds-text-title-xs')
    if series_tag is None:
        raise ValueError(f"No series heading found on scorecard page: {link}")
    series = series_tag.get_text()

    # Created once, before any loop, so they exist even for a page without innings.
    batting_summary = {}
    bowling_summary = {}

    for inning_div in soup.find_all('div', class_='ds-rounded-lg ds-mt-2'):
        title_tag = inning_div.find('div', class_=title_class)
        tables = inning_div.select('table > tbody')
        if title_tag is None or len(tables) < 2:
            # Not an innings block with both a batting and a bowling table.
            continue

        inning = title_tag.get_text().replace('\xa0', ' ').split('(')[0].strip()

        batsman_stats = {}
        for tr in tables[0].select('tr:not(.ds-hidden)'):
            tds = tr.select('td')
            if not tds:
                continue
            if tds[0].get_text() == 'Extras':
                # Batsman rows end at the 'Extras' row.
                break
            if len(tds) < 8:
                continue
            batsman_stats[tds[0].get_text()] = {
                key: tds[position].get_text()
                for position, key in enumerate(batting_keys, start=2)
            }

        bowler_stats = {}
        for tr in tables[1].select('tr:not(.ds-hidden)'):
            tds = tr.select('td:not(.ds-hidden)')
            if len(tds) < 8:
                continue
            bowler_stats[tds[0].get_text()] = {
                key: tds[position].get_text()
                for position, key in enumerate(bowling_keys, start=1)
            }

        batting_summary[inning] = batsman_stats
        bowling_summary[inning] = bowler_stats

    return json.dumps({
        'bowling_summary': bowling_summary,
        'batting_summary': batting_summary,
        'series': series,
    })

### Create the CSV files and write their header rows

In [55]:
# Start each output file with its header row. Mode 'w' truncates anything left
# over from an earlier run, so later appends always follow a single header.
for filename, titles in (
    ('bowling_stats.csv',
     ['bowler', 'over', 'maiden', 'runs', 'wickets', 'economy', 'wide', 'noball', 'inning', 'series']),
    ('batting_stats.csv',
     ['batsman', 'runs', 'balls', 'minutes', 'fours', 'sixes', 'strike_rate', 'inning', 'series']),
):
    with open(filename, 'w', newline='\n') as stats_file:
        csv.writer(stats_file, delimiter=',').writerow(titles)

### Write into files

In [59]:
# Append one CSV row per bowler and per batsman for every scraped match.
# Both files are opened once for the whole run rather than once per match.
# newline='' is what the csv module expects, and the explicit UTF-8 encoding
# keeps player names with non-ASCII characters writable on every platform.
with open('bowling_stats.csv', 'a', newline='', encoding='utf-8') as bowling, \
        open('batting_stats.csv', 'a', newline='', encoding='utf-8') as batting:

    bowling_writer = csv.writer(bowling, delimiter=',')
    batting_writer = csv.writer(batting, delimiter=',')

    for scorecard_link in matchdf['scorecard_link']:
        json_data = json.loads(scrape_match_data(scorecard_link))
        series = json_data['series']

        for summary_key, writer in (
            ('bowling_summary', bowling_writer),
            ('batting_summary', batting_writer),
        ):
            # Row layout: player name, that player's stats in stored order,
            # then the innings title and the series.
            writer.writerows(
                [player, *stats.values(), inning, series]
                for inning, players in json_data[summary_key].items()
                for player, stats in players.items()
            )

        # Flush after every match so finished matches are on disk even if a
        # later scorecard fails.
        bowling.flush()
        batting.flush()

        print(f"Finished scraping for link: {scorecard_link}")

Finished scraping for link: https://www.espncricinfo.com/series/australia-in-new-zealand-2023-24-1388188/new-zealand-vs-australia-2nd-test-1388227/full-scorecard
Finished scraping for link: https://www.espncricinfo.com/series/england-in-india-2023-24-1389386/india-vs-england-5th-test-1389403/full-scorecard
Finished scraping for link: https://www.espncricinfo.com/series/australia-in-new-zealand-2023-24-1388188/new-zealand-vs-australia-1st-test-1388226/full-scorecard
Finished scraping for link: https://www.espncricinfo.com/series/england-in-india-2023-24-1389386/india-vs-england-4th-test-1389402/full-scorecard
Finished scraping for link: https://www.espncricinfo.com/series/england-in-india-2023-24-1389386/india-vs-england-3rd-test-1389401/full-scorecard
Finished scraping for link: https://www.espncricinfo.com/series/south-africa-in-new-zealand-2023-24-1388186/new-zealand-vs-south-africa-2nd-test-1388222/full-scorecard
Finished scraping for link: https://www.espncricinfo.com/series/south-