In [6]:
import os
import time
import pandas as pd
import re
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, StaleElementReferenceException, ElementClickInterceptedException, ElementNotInteractableException

In [7]:
# Create Chrome options
chrome_options = Options()
# Run in headless mode (no browser window)
# chrome_options.add_argument("--headless")

# Disable notifications
chrome_options.add_argument("--disable-notifications")

# Create a Service object
service = Service('chromedriver.exe')

driver = webdriver.Chrome(service=service, options=chrome_options)
driver.implicitly_wait(5)

driver.get('https://www.cricbuzz.com/cricket-scorecard-archives')

# 1. Start with Series Archive Page
## Fetch and Store all the series link href to scrape later

In [8]:
if os.path.exists('supporting_cache_data/all_years_series_hrefs.csv'):
  years_series_data = pd.read_csv('supporting_cache_data/all_years_series_hrefs.csv').to_dict(orient='records')
  print("Fetched from cache")
else:
  years = driver.find_elements(By.CSS_SELECTOR, ".cb-yr-tmline > a")
  print(f"Total of {len(years)} years found")
  years_series_data = []
  year_i = 0
  year_retry = 0
  while year_i < len(years) and year_retry <= 5:
    try:
      ActionChains(driver).move_to_element(years[year_i]).click().perform()
      year_name = years[year_i].text.strip()
      time.sleep(2)
      try:
        series_containers = driver.find_elements(By.CSS_SELECTOR, ".cb-schdl > div.cb-col")
        for series_container in series_containers:
          try:
            # Get the Series type by looking at the previous h2 sibling and contains class name "cb-srs-cat"
            series_type = series_container.find_element(By.XPATH, "./preceding-sibling::h2[1][contains(@class, 'cb-srs-cat')]").text
            if not series_type:
              raise ValueError("Series type not found")
            # Get the Series name and href link
            series = series_container.find_elements(By.CSS_SELECTOR, ".cb-srs-lst-itm > a")

            for serie in series:
              years_series_data.append({
                "year": year_name,
                "series_type": series_type,
                "series_name": serie.text.strip(),
                "series_href": serie.get_attribute("href"),
              })
          except (TimeoutException, NoSuchElementException) as e:
            print("Element not found or timeout occurred.", e.msg)
      except (TimeoutException, NoSuchElementException) as e:
        print("Element not found or timeout occurred.", e.msg)
      year_i += 1
      year_retry = 0
      # Print progress in percentage
      progress = (year_i / len(years)) * 100
      print(f"Year {year_name} processed after {year_retry} retries. Progress: {progress:.2f}%")
    except StaleElementReferenceException:
      years = driver.find_elements(By.CSS_SELECTOR, ".cb-yr-tmline > a")
      year_retry += 1
      if year_retry == 5: print(f"StaleElementReferenceException Error with {year_name} after 5 retries")
  # Save to CSV if data exists
  if years_series_data:
    pd.DataFrame(years_series_data).to_csv('supporting_cache_data/all_years_series_hrefs.csv', index=False, header=True)
  else:
    print("No data collected. Skipping CSV save.")

pd.DataFrame(years_series_data)
#3262

Fetched from cache


Unnamed: 0,year,series_type,series_name,series_href
0,2021,International,"Bangladesh tour of New Zealand, 2022",https://www.cricbuzz.com/cricket-series/3876/b...
1,2021,International,"India tour of South Africa, 2021-22",https://www.cricbuzz.com/cricket-series/3656/i...
2,2021,International,"Ireland tour of USA, 2021",https://www.cricbuzz.com/cricket-series/3866/i...
3,2021,International,"West Indies tour of Pakistan, 2021-22",https://www.cricbuzz.com/cricket-series/3858/w...
4,2021,International,"The Ashes, 2021-22",https://www.cricbuzz.com/cricket-series/3532/t...
...,...,...,...,...
3257,1887,International,"England in Australia, 1887",https://www.cricbuzz.com/cricket-series/1022/e...
3258,1888,International,"Australia in England, 1888",https://www.cricbuzz.com/cricket-series/1024/a...
3259,1888,International,"England in Australia, 1888",https://www.cricbuzz.com/cricket-series/1023/e...
3260,1889,International,England in South Africa Test Series,https://www.cricbuzz.com/cricket-series/1025/e...


# 2. Start scraping each Series Page
## Fetch and Store all the match link href to scrape later

In [9]:
if os.path.exists('supporting_cache_data/all_years_series_match_hrefs.csv'):
  years_series_match_data = pd.read_csv('supporting_cache_data/all_years_series_match_hrefs.csv').to_dict(orient='records')
  print("Fetched from cache")
else:
  years_series_match_data = []
  series_i = 0
  while series_i < len(years_series_data):
    driver.get(years_series_data[series_i]['series_href'])
    time.sleep(2)
    match_rows = driver.find_elements(By.CSS_SELECTOR, "#series-matches div.cb-series-matches")
    # print(f"Series: {years_series_data[series_i]['series_name']} - Total of {len(match_rows)} matches found")
    series_match_i = 0
    series_retry = 0
    for match in match_rows:
      try:
        # Extract match dates, might have both start and end dates
        # %b - Abbreviated month name
        # %d - Day of the month as a zero-padded decimal
        # %a - Abbreviated weekday name
        # %-I - hour(12 hour clock) as a decimal number
        # %M - Minute as a zero padded decimal number
        # %p - AM or PM
        # %z - UTC offset in the form +HHMM or -HHMM
        match_dates = match.find_element(By.CSS_SELECTOR, "div.cb-col.schedule-date").text.strip()
        if match_dates:
          if " - " in match_dates:
            match_date_start, match_date_end_str = list(map(str.strip, match_dates.split(" - ")))
            try:
              match_date_end = datetime.strptime(f"{years_series_data[series_i]['year']} {match_date_end_str} +0530", "%Y %b %d, %a %z").date()
            except ValueError:
              match_date_end = datetime.strptime(f"{int(years_series_data[series_i]['year'])+1} {match_date_end_str} +0530", "%Y %b %d %z").date()
          else:
            match_date_start = match_dates.strip()
            match_date_end = None
        
        # Extract Match Start Time
        match_time = match.find_element(By.CSS_SELECTOR, "span.schedule-date").text.strip()
        if '(' in match_time:
          match_time = match_time.split('(')[0].strip()
        match_time_parts = match_time.split(':')
        match_time_parts[0] = match_time_parts[0].zfill(2)  # Zero-pad the hour part
        match_time = ':'.join(match_time_parts)
        try:
          match_datetime_str = f"{years_series_data[series_i]['year']} {match_date_start} {match_time} +0530"
          match_datetime_start = datetime.strptime(match_datetime_str, "%Y %b %d, %a %I:%M %p %z")
        except ValueError:
          match_datetime_str = f"{int(years_series_data[series_i]['year'])+1} {match_date_start} {match_time} +0530"
          match_datetime_start = datetime.strptime(match_datetime_str, "%Y %b %d, %a %I:%M %p %z")

        if match_datetime_start.date() >= datetime.now().date():
          # Skip future matches
          continue
        try:
          match_link = match.find_element(By.CSS_SELECTOR, "a.text-hvr-underline")
        except NoSuchElementException:
          continue
        match_href = match_link.get_attribute('href')
        match_venue = match_link.find_element(By.XPATH, "following-sibling::div[1][contains(@class, 'text-gray')]").text.strip()

        match_name, match_no = map(str.strip, match_link.text.strip().split(", ", maxsplit=1))
        match_team1, match_team2 = map(str.strip, match_name.split(" vs "))  # Team names
        
        if 'T20' in match_no:
          match_type = 'T20'
        elif 'ODI' in match_no:
          match_type = 'ODI'
        elif 'Test' in match_no:
          match_type = 'Test'
        else:
          # Look at the top left corner for the match type
          try:
            sep = driver.find_element(By.CSS_SELECTOR, "span.cb-nav-dt")
            if sep:
              match_types = sep.find_element(By.XPATH, "./preceding-sibling::span[1]").text.strip()
              if "," in match_types:
                match_type = match_types
              elif "Test" in match_types:
                match_type = "Test"
              elif "ODI" in match_types:
                match_type = "ODI"
              elif "T20" in match_types:
                match_type = "T20"
              else:
                match_type = None
          except NoSuchElementException:
            match_type = None

        years_series_match_data.append({
          "year": years_series_data[series_i]["year"],
          "series_type": years_series_data[series_i]["series_type"],
          "series_name": years_series_data[series_i]["series_name"],
          "match_no": match_no,
          "match_type": match_type,
          "match_name": match_name,
          "match_href": match_href,
          "match_team1": match_team1,
          "match_team2": match_team2,
          "match_datetime_start": match_datetime_start,
          "match_date_end": match_date_end,
          "match_venue": match_venue,
        })
        series_match_i += 1
        series_retry = 0
      except StaleElementReferenceException:
        match_rows = driver.find_elements(By.CSS_SELECTOR, "#series-matches div.cb-series-matches")
        series_retry += 1
        if series_retry == 5: print(f"StaleElementReferenceException Error in {years_series_data[series_i]['series_href']} after 5 retries")
    # Print progress in percentage
    progress = (series_i / len(years_series_data)) * 100
    print(f"Series {years_series_data[series_i]['series_name']} processed with {len(match_rows)} matches after {series_retry} retries. Progress: {progress:.2f}%")
    series_i += 1

  # Save to CSV
  pd.DataFrame(years_series_match_data).to_csv('supporting_cache_data/all_years_series_match_hrefs.csv', index=False, header=True)
pd.DataFrame(years_series_match_data)

Series Bangladesh tour of New Zealand, 2022 processed with 3 matches after 0 retries. Progress: 0.00%
Series India tour of South Africa, 2021-22 processed with 6 matches after 0 retries. Progress: 0.03%
Series Ireland tour of USA, 2021 processed with 4 matches after 0 retries. Progress: 0.06%
Series West Indies tour of Pakistan, 2021-22 processed with 6 matches after 0 retries. Progress: 0.09%
Series The Ashes, 2021-22 processed with 5 matches after 0 retries. Progress: 0.12%
Series Afghanistan tour of Australia Only Test, 2021 (Postponed) processed with 0 matches after 0 retries. Progress: 0.15%
Series Netherlands tour of South Africa, 2021 (Postponed) processed with 1 matches after 0 retries. Progress: 0.18%
Series Pakistan tour of Bangladesh, 2021 processed with 5 matches after 0 retries. Progress: 0.21%
Series New Zealand tour of India, 2021 processed with 5 matches after 0 retries. Progress: 0.25%
Series ICC Mens T20 World Cup Africa Qualifier 2021 processed with 12 matches after 

Unnamed: 0,year,series_type,series_name,match_no,match_type,match_name,match_href,match_team1,match_team2,match_datetime_start,match_date_end,match_venue
0,2021,International,"Bangladesh tour of New Zealand, 2022",2-day Practice Match,Test,NEW ZEALAND XI vs BANGLADESH,https://www.cricbuzz.com/cricket-scores/41931/...,NEW ZEALAND XI,BANGLADESH,2021-12-28 03:30:00+05:30,2021-12-29,"Bay Oval No2, Mount Maunganui"
1,2021,International,"Bangladesh tour of New Zealand, 2022",1st Test,Test,NEW ZEALAND vs BANGLADESH,https://www.cricbuzz.com/cricket-scores/40536/...,NEW ZEALAND,BANGLADESH,2021-01-01 03:30:00+05:30,2021-01-05,"Bay Oval, Mount Maunganui"
2,2021,International,"Bangladesh tour of New Zealand, 2022",2nd Test,Test,NEW ZEALAND vs BANGLADESH,https://www.cricbuzz.com/cricket-scores/40538/...,NEW ZEALAND,BANGLADESH,2021-01-09 03:30:00+05:30,2021-01-13,"Hagley Oval, Christchurch"
3,2021,International,"India tour of South Africa, 2021-22",1st Test,Test,SOUTH AFRICA vs INDIA,https://www.cricbuzz.com/cricket-scores/38397/...,SOUTH AFRICA,INDIA,2021-12-26 13:30:00+05:30,2021-12-30,"SuperSport Park, Centurion"
4,2021,International,"India tour of South Africa, 2021-22",2nd Test,Test,SOUTH AFRICA vs INDIA,https://www.cricbuzz.com/cricket-scores/38401/...,SOUTH AFRICA,INDIA,2021-01-03 13:30:00+05:30,2021-01-07,"The Wanderers Stadium, Johannesburg"
...,...,...,...,...,...,...,...,...,...,...,...,...
33521,1889,International,England in South Africa Test Series,1st Test,Test,SOUTH AFRICA vs ENGLAND,https://www.cricbuzz.com/cricket-scores/8629/r...,SOUTH AFRICA,ENGLAND,1889-03-12 05:21:00+05:30,1889-03-13,"St George's Park, Gqeberha"
33522,1889,International,England in South Africa Test Series,2nd Test,Test,SOUTH AFRICA vs ENGLAND,https://www.cricbuzz.com/cricket-scores/8630/r...,SOUTH AFRICA,ENGLAND,1889-03-25 05:21:00+05:30,1889-03-26,"Newlands, Cape Town"
33523,1890,International,"Australia in England, 1890",1st Test,Test,ENGLAND vs AUSTRALIA,https://www.cricbuzz.com/cricket-scores/8631/e...,ENGLAND,AUSTRALIA,1890-07-21 05:21:00+05:30,1890-07-23,"Lord's, London"
33524,1890,International,"Australia in England, 1890",2nd Test,Test,ENGLAND vs AUSTRALIA,https://www.cricbuzz.com/cricket-scores/8632/e...,ENGLAND,AUSTRALIA,1890-08-11 05:21:00+05:30,1890-08-12,"Kennington Oval, London"


In [None]:
players_href = dict()
def get_player_details(element, default_role):
  if element:
    res = []
    captain = None
    for player in element.find_elements(By.CSS_SELECTOR, 'a'):
      txt = player.text.strip()
      href = player.get_attribute('href')
      profile_id, name = href.lstrip('https://www.cricbuzz.com/profiles/').split('/')
      name = name.replace("-", " ").capitalize()
      if '(c & wk)' in txt:
        captain = name
        role = 'captain and wk'
      elif '(c)' in txt:
        captain = name
        role = 'captain'
      elif '(w)' in txt or '(wk)' in txt:
        role = 'wicket_keeper'
      else:
        role = default_role
        
      res.append({
        "profile_id": profile_id,
        "role": role,
      })
      if profile_id not in players_href:
        players_href[profile_id] = {
          "name": name,
          "href": href,
        }
    return res, captain
  return None, None

# 3. Scrape commentary from each Match Page
## 

In [None]:
import pytz
local_tz = pytz.timezone('Asia/Kolkata')

if os.path.exists('supporting_cache_data/ipl_commentary_data.csv'):
  years_series_match_commentary_data = pd.read_csv('supporting_cache_data/ipl_commentary_data.csv').to_dict(orient='records')
  years_series_match_over_data = pd.read_csv('supporting_cache_data/ipl_over_data.csv').to_dict(orient='records')
  print("Fetched from cache")
else:
  years_series_match_commentary_data = []
  years_series_match_over_data = []
  match_i = 0
  while match_i < len(years_series_match_data):
    if not 'Indian' in years_series_match_data[match_i]['series_name'] or int(years_series_match_data[match_i]['year']) < 2017:
      match_i += 1
      continue

    ##################### Scrape Scorecard Page #####################
    if '/live-cricket-scores/' in years_series_match_data[match_i]['match_href']:
      years_series_match_data[match_i]['match_href'] = years_series_match_data[match_i]['match_href'].replace('/live-cricket-scores/', '/cricket-scores/')
    driver.get(years_series_match_data[match_i]['match_href'].replace('/cricket-scores/', '/live-cricket-scorecard/'))
    match_id = years_series_match_data[match_i]['match_href'].lstrip('https://www.cricbuzz.com/cricket-scores/').split('/')[0]
    time.sleep(2)

    match_status_el = driver.find_element(By.CSS_SELECTOR, "div.cb-scrcrd-status").text.strip()
    # "There is no scorecard available for this match." also matches "div.cb-scrcrd-status"
    if any(['Match abandoned' in s.text.strip() for s in driver.find_elements(By.CSS_SELECTOR, "div.cb-scrcrd-status")]):
      match_status = 'abandoned'
      match_i += 1
      continue
    elif any(['Match rescheduled' in s.text.strip() for s in driver.find_elements(By.CSS_SELECTOR, "div.cb-scrcrd-status")]):
      match_status = 'rescheduled'
      match_i += 1
      continue
    elif any(['Match postponed' in s.text.strip() for s in driver.find_elements(By.CSS_SELECTOR, "div.cb-scrcrd-status")]):
      match_status = 'postponed'
      match_i += 1
      continue
    elif any(['No result' in s.text.strip() for s in driver.find_elements(By.CSS_SELECTOR, "div.cb-scrcrd-status")]):
      match_status = 'no_result'
      match_i += 1
      continue
    elif 'Match tied' in match_status_el:
      match_status = 'tied'
      match_winning_team, match_tie_breaker = map(lambda x: x.replace('the ', '').strip('()').strip(), match_status_el.split(" won ", maxsplit=1))
    else:
      match_status = 'completed'
      match_winning_team = match_status_el.split(" won ", maxsplit=1)[0]
      match_tie_breaker = None

    # Extract the toss if exists
    try:
      toss_hdr = driver.find_element(By.XPATH, "//div[contains(@class, 'cb-col-100')]/div[contains(text(), 'Toss')]")
      toss = toss_hdr.find_element(By.XPATH, "./following-sibling::div[1][contains(@class, 'cb-col')]").text.strip()
    except NoSuchElementException:
      toss = None

    match_date_hdr = driver.find_element(By.XPATH, "//div[contains(@class, 'cb-col-100')]/div[contains(text(), 'Date')]")
    match_date = match_date_hdr.find_element(By.XPATH, "./following-sibling::div[1][contains(@class, 'cb-col')]").text.strip()

    match_time_hdr = driver.find_element(By.XPATH, "//div[contains(@class, 'cb-col-100')]/div[contains(text(), 'Time')]")
    match_time = match_time_hdr.find_element(By.XPATH, "./following-sibling::div[1][contains(@class, 'cb-col')]").text.strip()

    match_time_parts = match_time.split(':')
    match_time_parts[0] = match_time_parts[0].zfill(2)  # Zero-pad the hour part
    match_time = ':'.join(match_time_parts)

    match_datetime_str = f"{match_date} {match_time} +0530"
    match_datetime = datetime.strptime(match_datetime_str, "%A, %B %d, %Y %I:%M %p %z")

    if match_datetime >= datetime.now(local_tz):
      match_i += 1
      continue

    score1 = driver.find_element(By.CSS_SELECTOR, "#innings_1 .cb-scrd-hdr-rw .pull-right").text.strip()
    team1_score, team1_wickets = map(str.strip, score1.split(" ", maxsplit=1)[0].split("-"))

    score2 = driver.find_element(By.CSS_SELECTOR, "#innings_2 .cb-scrd-hdr-rw .pull-right").text.strip()
    team2_score, team2_wickets = map(str.strip, score2.split(" ", maxsplit=1)[0].split("-"))

    try:
      umpires_hdr = driver.find_element(By.XPATH, "//div[contains(@class, 'cb-col-100')]/div[contains(text(), 'Umpires')]")
      umpires = umpires_hdr.find_element(By.XPATH, "./following-sibling::div[1][contains(@class, 'cb-col')]").text.strip()
    except NoSuchElementException:
      umpires = None

    try:
      third_umpires_hdr = driver.find_element(By.XPATH, "//div[contains(@class, 'cb-col-100')]/div[contains(text(), 'Third Umpire')]")
      third_umpires = third_umpires_hdr.find_element(By.XPATH, "./following-sibling::div[1][contains(@class, 'cb-col')]").text.strip()
    except NoSuchElementException:
      third_umpires = None

    try:
      match_referee_hdr = driver.find_element(By.XPATH, "//div[contains(@class, 'cb-col-100')]/div[contains(text(), 'Match Referee')]")
      match_referee = match_referee_hdr.find_element(By.XPATH, "./following-sibling::div[1][contains(@class, 'cb-col')]").text.strip()
    except NoSuchElementException:
      match_referee = None

    try:
      playing_11s_hdrs = driver.find_elements(By.XPATH, "//div[contains(@class, 'cb-col-100')]/div[contains(text(), 'Playing')]")
      playing_11s = list(map(lambda x: x.find_element(By.XPATH, "./following-sibling::div[1][contains(@class, 'cb-col')]"), playing_11s_hdrs))

      assert len(playing_11s) == 2, f"Expected 2 playing 11s, but found {len(playing_11s)}"
      team1_players, team1_captain = get_player_details(playing_11s[0], 'playing_11')
      team2_players, team2_captain = get_player_details(playing_11s[1], 'playing_11')
    except NoSuchElementException:
      team1_players = team1_captain = team2_players = team2_captain = None
      print(f"Error: Playing 11s not found for match {years_series_match_data[match_i]['match_href']}")

    try:
      bench_hdrs = driver.find_elements(By.XPATH, "//div[contains(@class, 'cb-col-100')]/div[contains(text(), 'Bench')]")
      benchs = list(map(lambda x: x.find_element(By.XPATH, "./following-sibling::div[1][contains(@class, 'cb-col')]"), bench_hdrs))

      team1_bench, _ = get_player_details(benchs[0], 'bench')
      if not team1_bench:
        team1_bench = benchs[0].text.strip()
      team2_bench, _ = get_player_details(benchs[1], 'bench')
      if not team2_bench:
        team2_bench = benchs[1].text.strip()
    except NoSuchElementException:
      team1_bench = team2bench = None
      print(f"Error: Bench not found for match {years_series_match_data[match_i]['match_href']}")

    try:
      support_staff_hdrs = driver.find_elements(By.XPATH, "//div[contains(@class, 'cb-col-100')]/div[contains(text(), 'Support Staff')]")
      support_staffs = list(map(lambda x: x.find_element(By.XPATH, "./following-sibling::div[1][contains(@class, 'cb-col')]"), support_staff_hdrs))

      team1_support_staff, _ = get_player_details(support_staffs[0], 'support_staff')
      if not team1_support_staff:
        team1_support_staff = support_staffs[0].text.strip()
      team2_support_staff, _ = get_player_details(support_staffs[1], 'support_staff')
      if not team2_support_staff:
        team2_support_staff = support_staffs[1].text.strip()
    except NoSuchElementException:
      team1_support_staff = team2_support_staff = None
      print(f"Error: support_staffs not found for match {years_series_match_data[match_i]['match_href']}")

    ##################### Scrape Match Facts Page for Venue Details #####################
    driver.get(years_series_match_data[match_i]['match_href'].replace('/cricket-scores/', '/cricket-match-facts/'))
    time.sleep(2)
    
    stadium_el = driver.find_element(By.XPATH, "//div[contains(@class, 'cb-col-rt')]/div[contains(@class, 'text-bold') and contains(text(), 'Stadium:')]")
    match_stadium = stadium_el.find_element(By.XPATH, "./following-sibling::div[1][contains(@class, 'cb-mat-fct-itm')]").text.strip()

    city_el = driver.find_element(By.XPATH, "//div[contains(@class, 'cb-col-rt')]/div[contains(@class, 'text-bold') and contains(text(), 'City:')]")
    match_city = city_el.find_element(By.XPATH, "./following-sibling::div[1][contains(@class, 'cb-mat-fct-itm')]").text.strip()

    try:
      capacity_el = driver.find_element(By.XPATH, "//div[contains(@class, 'cb-col-rt')]/div[contains(@class, 'text-bold') and contains(text(), 'Capacity:')]")
      match_capacity = capacity_el.find_element(By.XPATH, "./following-sibling::div[1][contains(@class, 'cb-mat-fct-itm')]").text.strip()
    except NoSuchElementException:
      match_capacity = None
    try:
      hosts_el = driver.find_element(By.XPATH, "//div[contains(@class, 'cb-col-rt')]/div[contains(@class, 'text-bold') and contains(text(), 'Hosts to:')]")
      match_venue_host_teams = hosts_el.find_element(By.XPATH, "./following-sibling::div[1][contains(@class, 'cb-mat-fct-itm')]").text.strip()
    except NoSuchElementException:
      match_venue_host_teams = None
    ##################### Scrape Full Commentary Page #####################
    driver.get(years_series_match_data[match_i]['match_href'].replace('/cricket-scores/', '/cricket-full-commentary/'))
    time.sleep(2)
    
    match_retry = 0
    while match_retry <= 5:
      try:
        if not toss:
          try:
            toss_el = driver.find_element(By.XPATH, "//div[contains(@class, 'text-bold') and contains(text(), 'Toss')]")
            toss = toss_el.find_element(By.XPATH, "./following-sibling::div[1][contains(@class, 'cb-col')]").text.strip()
          except NoSuchElementException:
            try:
              toss = driver.find_element(By.XPATH, "//*[contains(text(), 'won the toss')]").text.strip()
            except NoSuchElementException:
              toss = None
        subtabs = driver.find_elements(By.CSS_SELECTOR, "a.cb-nav-pill-1")
        if len(subtabs) < 3:
          print("Subtabs < 3")
          raise NoSuchElementException("Subtabs not found")
        subtab_i = 0
        team1_name = subtabs[1].text.replace('Inns', '').strip()
        team2_name = subtabs[2].text.replace('Inns', '').strip()
        while subtab_i < len(subtabs):
          if subtabs[subtab_i].text != 'Preview':
            ActionChains(driver).move_to_element(subtabs[subtab_i]).click().perform()
            time.sleep(2)
            over_balls_el = driver.find_elements(By.CSS_SELECTOR, "div[ng-bind='comm.over']")[::-1]

            for ball_i, over_ball in enumerate(over_balls_el):
              parent_el = over_ball.find_element(By.XPATH, "./parent::*")
              comm = parent_el.find_element(By.XPATH, "./following-sibling::p[1]").text.strip()

              years_series_match_commentary_data.append({
                "year": years_series_match_data[match_i]["year"],
                "series_type": years_series_match_data[match_i]["series_type"],
                "series_name": years_series_match_data[match_i]["series_name"],
                "match_no": years_series_match_data[match_i]["match_no"],
                "match_type": "T20 IPL",
                "match_id": match_id,
                "match_venue": {
                  "stadium": match_stadium,
                  "city": match_city,
                  "capacity": match_capacity,
                  "host_teams": match_venue_host_teams,
                },
                "match_status": match_status,
                "match_winning_team": match_winning_team,
                "match_tie_breaker": match_tie_breaker,
                "match_toss": toss,
                "umpires": umpires,
                "match_referee": match_referee,
                "third_umpires": third_umpires,
                "match_datetime": match_datetime,
                "team1_name": team1_name,
                "team2_name": team2_name,
                "team1_score": team1_score,
                "team1_wickets": team1_wickets,
                "team2_score": team2_score,
                "team2_wickets": team2_wickets,
                "team1_captain": team1_captain,
                "team1_players": team1_players,
                "team1_bench": team1_bench,
                "team1_support_staff": team1_support_staff,
                "team2_captain": team2_captain,
                "team2_players": team2_players,
                "team2_bench": team2_bench,
                "team2_support_staff": team2_support_staff,
                "match_innings": subtab_i,
                "ball_no": ball_i+1,
                "over_no": over_ball.text.strip(),
                "ball_commentary": comm,
              })
            
            overs = driver.find_elements(By.CSS_SELECTOR, ".cb-com-ovr-sum")[::-1]
            for over_i, over in enumerate(overs):
              over_total_runs = over.find_element(By.CSS_SELECTOR, 'span[ng-bind="comm.overSeparator.runs"]').text.strip()
              over_summary = over.find_element(By.CSS_SELECTOR, 'div[ng-bind="comm.overSeparator.o_summary"]').text.strip()
              
              over_batsman1_name = over.find_element(By.CSS_SELECTOR, 'div[ng-bind="comm.overSeparator.batStrikerNames[0]"]').text.strip()
              over_batsman1_curr_scr, over_batsman1_played_balls = over.find_element(By.CSS_SELECTOR, '''div[ng-bind="' ' + comm.overSeparator.batStrikerRuns + '(' + comm.overSeparator.batStrikerBalls +')'"]''').text.strip().rstrip(')').split('(', maxsplit=1)
              
              over_batsman2_name = over.find_element(By.CSS_SELECTOR, 'div[ng-bind="comm.overSeparator.batNonStrikerNames[0]"]').text.strip()
              over_batsman2_curr_scr, over_batsman2_played_balls = over.find_element(By.CSS_SELECTOR, '''div[ng-bind="' ' + comm.overSeparator.batNonStrikerRuns + '(' + comm.overSeparator.batNonStrikerBalls +')'"]''').text.strip().rstrip(')').split('(', maxsplit=1)

              over_bowler_name = over.find_element(By.CSS_SELECTOR, 'div[ng-bind="comm.overSeparator.bowlNames[0]"]').text.strip()
              over_bowler_bowled_overs, over_bowler_bowled_maidens, over_bowler_bowled_runs, over_bowler_bowled_wickets = over.find_element(By.CSS_SELECTOR, '''div[ng-bind="' ' + comm.overSeparator.bowlOvers + '-' + comm.overSeparator.bowlMaidens + '-'+ comm.overSeparator.bowlRuns + '-'+ comm.overSeparator.bowlWickets"]''').text.strip().split("-", maxsplit=3)

              years_series_match_over_data.append({
                "year": years_series_match_data[match_i]["year"],
                "series_type": years_series_match_data[match_i]["series_type"],
                "series_name": years_series_match_data[match_i]["series_name"],
                "match_no": years_series_match_data[match_i]["match_no"],
                "match_type": "T20 IPL",
                "match_id": match_id,
                "match_venue": {
                  "stadium": match_stadium,
                  "city": match_city,
                  "capacity": match_capacity,
                  "host_teams": match_venue_host_teams,
                },
                "match_status": match_status,
                "match_winning_team": match_winning_team,
                "match_tie_breaker": match_tie_breaker,
                "match_toss": toss,
                "umpires": umpires,
                "match_referee": match_referee,
                "third_umpires": third_umpires,
                "match_datetime": match_datetime,
                "team1_name": team1_name,
                "team2_name": team2_name,
                "team1_score": team1_score,
                "team1_wickets": team1_wickets,
                "team2_score": team2_score,
                "team2_wickets": team2_wickets,
                "team1_captain": team1_captain,
                "team1_players": team1_players,
                "team1_bench": team1_bench,
                "team1_support_staff": team1_support_staff,
                "team2_captain": team2_captain,
                "team2_players": team2_players,
                "team2_bench": team2_bench,
                "team2_support_staff": team2_support_staff,
                "match_innings": subtab_i,
                "over_no": over_i+1,
                
                "over_total_runs": over_total_runs,
                "over_summary": over_summary,
                "over_batsman1_name": over_batsman1_name,
                "over_batsman1_curr_scr": over_batsman1_curr_scr,
                "over_batsman1_played_balls": over_batsman1_played_balls,
                "over_batsman2_name": over_batsman2_name,
                "over_batsman2_curr_scr": over_batsman2_curr_scr,
                "over_batsman2_played_balls": over_batsman2_played_balls,
                "over_bowler_name": over_bowler_name,
                "over_bowler_bowled_overs": over_bowler_bowled_overs,
                "over_bowler_bowled_maidens": over_bowler_bowled_maidens,
                "over_bowler_bowled_runs": over_bowler_bowled_runs,
                "over_bowler_bowled_wickets": over_bowler_bowled_wickets,
              })
          subtab_i += 1
        break
      except Exception as e:
        print(e.msg)
        # Retry by refreshing the page
        driver.refresh()
        time.sleep(5)
        match_retry += 1
    
    # Print progress in percentage
    progress = (match_i / len(years_series_match_data)) * 100
    print(f"Match {match_i} {years_series_match_data[match_i]['match_name']} processed with {ball_i} balls after {match_retry} retries. Progress: {progress:.2f}%")
    match_i += 1

  # Save to CSV
  pd.DataFrame(years_series_match_commentary_data).to_csv('supporting_cache_data/ipl_commentary_data.csv', index=False, header=True)
  pd.DataFrame(years_series_match_over_data).to_csv('supporting_cache_data/ipl_over_data.csv', index=False, header=True)

players_href_df = pd.DataFrame(players_href.values())
players_href_df['profile_id'] = players_href_df['href'].str.replace('https://www.cricbuzz.com/profiles/', '').str.split('/').str[0]
players_href_df.to_csv('supporting_cache_data/ipl_players_href.csv', index=False, header=True)

print(len(years_series_match_commentary_data))
print(len(years_series_match_over_data))
print(len(players_href))

Match 11769 Lucknow Super Giants vs Delhi Capitals processed with 120 balls after 0 retries. Progress: 35.10%
Match 11770 Punjab Kings vs Gujarat Titans processed with 127 balls after 0 retries. Progress: 35.11%
Match 11771 Rajasthan Royals vs Kolkata Knight Riders processed with 109 balls after 0 retries. Progress: 35.11%
Match 11772 Sunrisers Hyderabad vs Lucknow Super Giants processed with 105 balls after 0 retries. Progress: 35.11%
Match 11773 Royal Challengers Bengaluru vs Chennai Super Kings processed with 121 balls after 0 retries. Progress: 35.12%
Match 19518 SUNRISERS HYDERABAD vs ROYAL CHALLENGERS BENGALURU processed with 122 balls after 0 retries. Progress: 58.22%
Match 19519 RISING PUNE SUPERGIANT vs MUMBAI INDIANS processed with 121 balls after 0 retries. Progress: 58.22%
Match 19520 GUJARAT LIONS vs KOLKATA KNIGHT RIDERS processed with 95 balls after 0 retries. Progress: 58.22%
Match 19521 PUNJAB KINGS vs RISING PUNE SUPERGIANT processed with 121 balls after 0 retries. Pr

In [128]:
driver.quit()

# 4. Clean Data

In [None]:
commentary_df = pd.DataFrame(years_series_match_commentary_data)
commentary_df['match_id'] = commentary_df.match_id.astype(int)
commentary_df.to_csv('supporting_cache_data/ipl_commentary_data_cleaned.csv', index=False, header=True)

overs_df = pd.DataFrame(years_series_match_over_data)
overs_df['match_id'] = commentary_df.match_id.astype(int)
overs_df.to_csv('supporting_cache_data/ipl_over_data_cleaned.csv', index=False, header=True)

players_href_df = pd.DataFrame(players_href.values())
players_href_df['profile_id'] = players_href_df['href'].str.replace('https://www.cricbuzz.com/profiles/', '').str.split('/').str[0].astype(int)
players_href_df.to_csv('supporting_cache_data/ipl_players_href_cleaned.csv', index=False, header=True)


In [131]:
commentary_df.head()

Unnamed: 0,year,series_type,series_name,match_no,match_type,match_id,match_venue,match_status,match_winning_team,match_tie_breaker,...,team1_players,team1_bench,team1_support_staff,team2_captain,team2_players,team2_bench,team2_support_staff,ball_no,over_no,ball_commentary
0,2021,T20 League,Indian Premier League 2021,1st Match,T20 IPL,35612,"{'stadium': 'MA Chidambaram Stadium', 'city': ...",completed,Royal Challengers Bangalore,,...,[],"Nathan Coulter-Nile, Piyush Chawla, Dhawal Kul...",,,[],"Adam Zampa, Devdutt Padikkal, Sachin Baby, Nav...",,1,0.1,"Siraj to Rohit, 2 runs, straightaway into the ..."
1,2021,T20 League,Indian Premier League 2021,1st Match,T20 IPL,35612,"{'stadium': 'MA Chidambaram Stadium', 'city': ...",completed,Royal Challengers Bangalore,,...,[],"Nathan Coulter-Nile, Piyush Chawla, Dhawal Kul...",,,[],"Adam Zampa, Devdutt Padikkal, Sachin Baby, Nav...",,2,0.2,"Siraj to Rohit, no run, full ball on middle, d..."
2,2021,T20 League,Indian Premier League 2021,1st Match,T20 IPL,35612,"{'stadium': 'MA Chidambaram Stadium', 'city': ...",completed,Royal Challengers Bangalore,,...,[],"Nathan Coulter-Nile, Piyush Chawla, Dhawal Kul...",,,[],"Adam Zampa, Devdutt Padikkal, Sachin Baby, Nav...",,3,0.3,"Siraj to Rohit, no run, indications of the pac..."
3,2021,T20 League,Indian Premier League 2021,1st Match,T20 IPL,35612,"{'stadium': 'MA Chidambaram Stadium', 'city': ...",completed,Royal Challengers Bangalore,,...,[],"Nathan Coulter-Nile, Piyush Chawla, Dhawal Kul...",,,[],"Adam Zampa, Devdutt Padikkal, Sachin Baby, Nav...",,4,0.4,"Siraj to Rohit, 2 runs, short of length around..."
4,2021,T20 League,Indian Premier League 2021,1st Match,T20 IPL,35612,"{'stadium': 'MA Chidambaram Stadium', 'city': ...",completed,Royal Challengers Bangalore,,...,[],"Nathan Coulter-Nile, Piyush Chawla, Dhawal Kul...",,,[],"Adam Zampa, Devdutt Padikkal, Sachin Baby, Nav...",,5,0.5,"Siraj to Rohit, no run, short of length delive..."


In [124]:
overs_df.match_id.astype(int).nunique(), commentary_df.match_id.astype(int).nunique(), players_href_df.profile_id.astype(int).nunique()

(524, 524, 641)

In [119]:
overs_df[~overs_df.match_id.astype(int).isin(commentary_df.match_id.astype(int).unique())].match_id.unique()

array([], dtype=object)

In [126]:
commentary_df.match_tie_breaker.unique()

array([nan, 'Super Over', None, 'one - over eliminator', '2nd Super Over'],
      dtype=object)

In [None]:
commentary_df.groupby(['match_id']).agg({'ball_no': 'max', })