In [6]:
import os
import time
import pandas as pd
import re
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, StaleElementReferenceException, ElementClickInterceptedException, ElementNotInteractableException

In [7]:
# Browser session setup: builds the Chrome driver that every scraping cell below
# reuses through the notebook-level name `driver`.
# Create Chrome options
chrome_options = Options()
# Run in headless mode (no browser window)
# chrome_options.add_argument("--headless")

# Disable notifications
chrome_options.add_argument("--disable-notifications")

# Create a Service object
# NOTE(review): 'chromedriver.exe' is a bare relative file name, so it is presumably
# resolved against the notebook's working directory (or PATH) — confirm on non-Windows hosts.
service = Service('chromedriver.exe')

driver = webdriver.Chrome(service=service, options=chrome_options)
# Implicit wait: element lookups keep polling for up to 5 seconds before giving up.
driver.implicitly_wait(5)

# Entry point of the crawl: the scorecard archive page that lists the year tabs.
driver.get('https://www.cricbuzz.com/cricket-scorecard-archives')

# 1. Start with Series Archive Page
## Fetch and store every series link (href) so the series pages can be scraped later

In [8]:
# Build `years_series_data`: one record per series link found on the archive page.
# The result is cached as CSV; when the cache file exists the browser is not used.
if os.path.exists('supporting_cache_data/all_years_series_hrefs.csv'):
  years_series_data = pd.read_csv('supporting_cache_data/all_years_series_hrefs.csv').to_dict(orient='records')
  print("Fetched from cache")
else:
  years = driver.find_elements(By.CSS_SELECTOR, ".cb-yr-tmline > a")
  print(f"Total of {len(years)} years found")
  years_series_data = []
  year_i = 0
  year_retry = 0
  while year_i < len(years) and year_retry <= 5:
    # Reset per attempt so the error message below never reports a previous year's name.
    year_name = None
    # Rows are buffered per year and committed only when the whole year succeeded;
    # otherwise a stale-element retry would append the same series twice.
    year_rows = []
    try:
      ActionChains(driver).move_to_element(years[year_i]).click().perform()
      year_name = years[year_i].text.strip()
      time.sleep(2)
      try:
        series_containers = driver.find_elements(By.CSS_SELECTOR, ".cb-schdl > div.cb-col")
        for series_container in series_containers:
          try:
            # Get the Series type by looking at the previous h2 sibling and contains class name "cb-srs-cat"
            series_type = series_container.find_element(By.XPATH, "./preceding-sibling::h2[1][contains(@class, 'cb-srs-cat')]").text
            if not series_type:
              raise ValueError("Series type not found")
            # Get the Series name and href link
            series = series_container.find_elements(By.CSS_SELECTOR, ".cb-srs-lst-itm > a")

            for serie in series:
              year_rows.append({
                "year": year_name,
                "series_type": series_type,
                "series_name": serie.text.strip(),
                "series_href": serie.get_attribute("href"),
              })
          except (TimeoutException, NoSuchElementException) as e:
            print("Element not found or timeout occurred.", e.msg)
      except (TimeoutException, NoSuchElementException) as e:
        print("Element not found or timeout occurred.", e.msg)
      years_series_data.extend(year_rows)
      year_i += 1
      # Print progress in percentage (before resetting the retry counter, so the
      # message reports the retries this year actually needed).
      progress = (year_i / len(years)) * 100
      print(f"Year {year_name} processed after {year_retry} retries. Progress: {progress:.2f}%")
      year_retry = 0
    except StaleElementReferenceException:
      # The year tabs were re-rendered: fetch fresh handles and retry the same index.
      years = driver.find_elements(By.CSS_SELECTOR, ".cb-yr-tmline > a")
      year_retry += 1
      if year_retry == 5:
        # The exception may fire before the tab text was read, so fall back to the position.
        year_label = year_name if year_name else f"year #{year_i + 1}"
        print(f"StaleElementReferenceException Error with {year_label} after 5 retries")

  # Only a complete crawl is cached: the cache branch above cannot resume, so a partial
  # file would be treated as the full series list on every later run.
  if years_series_data and year_i >= len(years):
    os.makedirs('supporting_cache_data', exist_ok=True)
    pd.DataFrame(years_series_data).to_csv('supporting_cache_data/all_years_series_hrefs.csv', index=False, header=True)
  elif years_series_data:
    print(f"Stopped after {year_i} of {len(years)} years. Partial data kept in memory only; skipping CSV save.")
  else:
    print("No data collected. Skipping CSV save.")

pd.DataFrame(years_series_data)
# NOTE(review): 3262 is presumably the row count seen on an earlier run — confirm before relying on it.
#3262

Fetched from cache


Unnamed: 0,year,series_type,series_name,series_href
0,2021,International,"Bangladesh tour of New Zealand, 2022",https://www.cricbuzz.com/cricket-series/3876/b...
1,2021,International,"India tour of South Africa, 2021-22",https://www.cricbuzz.com/cricket-series/3656/i...
2,2021,International,"Ireland tour of USA, 2021",https://www.cricbuzz.com/cricket-series/3866/i...
3,2021,International,"West Indies tour of Pakistan, 2021-22",https://www.cricbuzz.com/cricket-series/3858/w...
4,2021,International,"The Ashes, 2021-22",https://www.cricbuzz.com/cricket-series/3532/t...
...,...,...,...,...
3257,1887,International,"England in Australia, 1887",https://www.cricbuzz.com/cricket-series/1022/e...
3258,1888,International,"Australia in England, 1888",https://www.cricbuzz.com/cricket-series/1024/a...
3259,1888,International,"England in Australia, 1888",https://www.cricbuzz.com/cricket-series/1023/e...
3260,1889,International,England in South Africa Test Series,https://www.cricbuzz.com/cricket-series/1025/e...


# 2. Start scraping each Series Page
## Fetch and store every match link (href) so the match pages can be scraped later

In [None]:
# English weekday abbreviations indexed by `date.weekday()` (Monday == 0).
_WEEKDAY_ABBR = ("mon", "tue", "wed", "thu", "fri", "sat", "sun")


def _weekday_token(date_text):
  """Return the text after the last comma of `date_text` ("Thu" for "Feb 29, Thu"), or None."""
  if "," not in date_text:
    return None
  return date_text.rsplit(",", 1)[1].strip() or None


def _parse_with_year_rollover(base_year, text, formats, weekday=None):
  """Parse a schedule string that carries no year of its own.

  The string is prefixed with `base_year` and, failing that, `base_year + 1`; each
  candidate is tried against every format in `formats` (all must start with "%Y ").
  `strptime` does not check that a parsed weekday name agrees with the date, so when
  `weekday` is given the first candidate whose weekday matches is preferred. That is
  what tells a January match of a season-spanning series apart from the same day one
  year earlier.

  Args:
    base_year: Year the series is filed under (int or numeric string).
    text: Year-less text, e.g. "Feb 29, Thu 03:00 AM +0530".
    formats: Iterable of strptime formats to try, in order.
    weekday: Optional weekday abbreviation used to pick between the two years.

  Returns:
    The parsed datetime. When no candidate matches `weekday`, the first candidate
    that parsed at all is returned (base year first, then the following year).

  Raises:
    ValueError: if the text cannot be parsed for either year.
  """
  first_valid = None
  for year in (int(base_year), int(base_year) + 1):
    for fmt in formats:
      try:
        candidate = datetime.strptime(f"{year} {text}", fmt)
      except ValueError:
        continue
      if weekday is None or _WEEKDAY_ABBR[candidate.weekday()] == weekday.lower()[:3]:
        return candidate
      if first_valid is None:
        first_valid = candidate
      break
  if first_valid is None:
    raise ValueError(f"Could not parse schedule text {text!r} for year {base_year} or the following year")
  return first_valid


# Build `years_series_match_data`: one record per past match that has a scorecard link.
# The result is cached as CSV; when the cache file exists the browser is not used.
if os.path.exists('supporting_cache_data/all_years_series_match_hrefs.csv'):
  years_series_match_data = pd.read_csv('supporting_cache_data/all_years_series_match_hrefs.csv').to_dict(orient='records')
  print("Fetched from cache")
else:
  years_series_match_data = []
  # Rows that could not be parsed; kept for inspection instead of aborting the crawl.
  skipped_match_rows = []
  for series_i, series_row in enumerate(years_series_data):
    driver.get(series_row['series_href'])
    time.sleep(2)
    match_rows = driver.find_elements(By.CSS_SELECTOR, "#series-matches div.cb-series-matches")
    series_retry = 0
    # A row with an empty date cell inherits the date of the row above it. The carry is
    # reset per series so it can never leak in from the previous series.
    match_date_start = None
    match_date_end = None
    for row_i in range(len(match_rows)):
      # One initial attempt plus up to five retries with freshly fetched row handles.
      for _attempt in range(6):
        if row_i >= len(match_rows):
          # The re-fetched page has fewer rows than before; nothing left at this index.
          break
        match = match_rows[row_i]
        try:
          # Extract match dates, might have both start and end dates
          # %b - Abbreviated month name
          # %d - Day of the month as a zero-padded decimal
          # %a - Abbreviated weekday name
          # %I - Hour (12-hour clock) as a zero-padded decimal
          # %M - Minute as a zero padded decimal number
          # %p - AM or PM
          # %z - UTC offset in the form +HHMM or -HHMM
          match_dates = match.find_element(By.CSS_SELECTOR, "div.cb-col.schedule-date").text.strip()
          if match_dates:
            if " - " in match_dates:
              row_date_start, match_date_end_str = map(str.strip, match_dates.split(" - "))
              # The weekday-less format is kept as a second choice because the original
              # fallback used it. NOTE(review): confirm whether the site ever omits the weekday.
              row_date_end = _parse_with_year_rollover(
                series_row['year'],
                f"{match_date_end_str} +0530",
                ("%Y %b %d, %a %z", "%Y %b %d %z"),
                _weekday_token(match_date_end_str),
              ).date()
              match_date_start, match_date_end = row_date_start, row_date_end
            else:
              match_date_start = match_dates
              match_date_end = None
          if match_date_start is None:
            raise ValueError("row has no date and there is no earlier row to inherit one from")

          # Extract Match Start Time
          match_time = match.find_element(By.CSS_SELECTOR, "span.schedule-date").text.strip()
          if '(' in match_time:
            match_time = match_time.split('(')[0].strip()
          match_time_parts = match_time.split(':')
          match_time_parts[0] = match_time_parts[0].zfill(2)  # Zero-pad the hour part
          match_time = ':'.join(match_time_parts)
          # Kept as a notebook-level name because a later debugging cell displays it.
          match_datetime_str = f"{series_row['year']} {match_date_start} {match_time} +0530"
          match_datetime_start = _parse_with_year_rollover(
            series_row['year'],
            f"{match_date_start} {match_time} +0530",
            ("%Y %b %d, %a %I:%M %p %z",),
            _weekday_token(match_date_start),
          )

          if match_datetime_start.date() >= datetime.now().date():
            # Skip future matches
            break
          try:
            match_link = match.find_element(By.CSS_SELECTOR, "a.text-hvr-underline")
          except NoSuchElementException:
            # Rows without a scorecard link have nothing to scrape later.
            break
          match_href = match_link.get_attribute('href')
          match_venue = match_link.find_element(By.XPATH, "following-sibling::div[1][contains(@class, 'text-gray')]").text.strip()

          # Expected link text is "<team1> vs <team2>, <match no>". Titles that do not follow
          # that shape keep their href and leave the missing parts empty instead of raising.
          match_name, _, match_no = map(str.strip, match_link.text.strip().partition(", "))
          if " vs " in match_name:
            match_team1, match_team2 = map(str.strip, match_name.split(" vs ", maxsplit=1))  # Team names
          else:
            match_team1, match_team2 = match_name, None

          if 'T20' in match_no:
            match_type = 'T20'
          elif 'ODI' in match_no:
            match_type = 'ODI'
          elif 'Test' in match_no:
            match_type = 'Test'
          else:
            # Look at the top left corner for the match type
            try:
              sep = driver.find_element(By.CSS_SELECTOR, "span.cb-nav-dt")
              match_types = sep.find_element(By.XPATH, "./preceding-sibling::span[1]").text.strip()
              if "," in match_types:
                match_type = match_types
              elif "Test" in match_types:
                match_type = "Test"
              elif "ODI" in match_types:
                match_type = "ODI"
              elif "T20" in match_types:
                match_type = "T20"
              else:
                match_type = None
            except NoSuchElementException:
              match_type = None

          years_series_match_data.append({
            "year": series_row["year"],
            "series_type": series_row["series_type"],
            "series_name": series_row["series_name"],
            "match_no": match_no,
            "match_type": match_type,
            "match_name": match_name,
            "match_href": match_href,
            "match_team1": match_team1,
            "match_team2": match_team2,
            "match_datetime_start": match_datetime_start,
            "match_date_end": match_date_end,
            "match_venue": match_venue,
          })
          break
        except StaleElementReferenceException:
          # The rows were re-rendered: fetch fresh handles and retry the same index.
          match_rows = driver.find_elements(By.CSS_SELECTOR, "#series-matches div.cb-series-matches")
          series_retry += 1
        except (NoSuchElementException, ValueError) as e:
          # One malformed row must not abort a crawl over thousands of series.
          skipped_match_rows.append({
            "series_href": series_row["series_href"],
            "row_index": row_i,
            "error": str(e),
          })
          print(f"Skipping row {row_i} of {series_row['series_href']}: {e}")
          break
      else:
        # Every attempt hit a stale element; give up on this row only.
        print(f"StaleElementReferenceException Error in {series_row['series_href']} after 5 retries")
    # Print progress in percentage
    progress = ((series_i + 1) / len(years_series_data)) * 100
    print(f"Series {series_row['series_name']} processed with {len(match_rows)} matches after {series_retry} retries. Progress: {progress:.2f}%")

  # Save to CSV
  os.makedirs('supporting_cache_data', exist_ok=True)
  pd.DataFrame(years_series_match_data).to_csv('supporting_cache_data/all_years_series_match_hrefs.csv', index=False, header=True)
pd.DataFrame(years_series_match_data)

Series CSA Four-Day Series Division One 2024-25 processed with 29 matches after 0 retries. Progress: 20.82%
Series The Ford Trophy 2024-25 processed with 32 matches after 0 retries. Progress: 20.85%
Series ACC Mens T20 Emerging Teams Asia Cup 2024 processed with 15 matches after 0 retries. Progress: 20.88%
Series Ranji Trophy Elite 2024-25 processed with 119 matches after 0 retries. Progress: 20.91%
Series Ranji Trophy Plate 2024-25 processed with 16 matches after 0 retries. Progress: 20.94%
Series South American Mens Championships 2024 processed with 16 matches after 0 retries. Progress: 20.97%
Series Sheffield Shield 2024-25 processed with 31 matches after 0 retries. Progress: 21.00%
Series Irani Cup 2024 processed with 1 matches after 0 retries. Progress: 21.03%
Series ICC CWC Challenge League A, 2024-26 processed with 15 matches after 0 retries. Progress: 21.06%
Series Australia Domestic One-Day Cup 2024-25 processed with 22 matches after 0 retries. Progress: 21.09%
Series Australi

In [61]:
# Debug probe: displays whatever value `match_datetime_str` last held in the kernel.
# The recorded output is a Feb 29 date in 2023, which is not a leap year, so that
# string cannot be parsed as written.
# NOTE(review): this cell depends on state left behind by the scraping cell and fails on a
# fresh kernel when the cache branch is taken — presumably a leftover from debugging.
match_datetime_str

'2023 Feb 29, Thu 03:00 AM +0530'

In [None]:
# Draft scraper limited to series whose name contains "Indian". It collects one record
# per schedule row into `match_data` and shows the result as `match_df`.
# NOTE(review): `scraper` is not defined in any visible cell — presumably a helper object
# exposing `.driver` and `.wait_for_elements(by, selector)`; confirm before running.
i = 0
n = len(years_series_data)
match_data = []
while i < n:
  if "Indian" in years_series_data[i]["series_name"]:
    url = years_series_data[i]["series_href"]
    scraper.driver.get(url)
    match_rows = scraper.wait_for_elements(By.CSS_SELECTOR, "#series-matches div.cb-series-matches")
    for match in match_rows:
      match_link = match.find_element(By.CSS_SELECTOR, "a.text-hvr-underline")
      match_href = match_link.get_attribute('href')
      match_venue = match_link.find_element(By.XPATH, "following-sibling::div[1][contains(@class, 'text-gray')]").text

      match_name, match_no = map(str.strip, match_link.text.strip().split(",", maxsplit=1))
      match_team1, match_team2 = map(str.strip, match_name.split(" vs "))  # Team names

      # A row with an empty date cell keeps the date of the previous row; the element is
      # looked up once per row instead of twice.
      row_date = match.find_element(By.CSS_SELECTOR, "div.cb-col.schedule-date").text.strip()
      if row_date:
        match_date = row_date
      match_time = match.find_element(By.CSS_SELECTOR, "span.schedule-date").text.strip()
      match_time_parts = match_time.split(':')
      match_time_parts[0] = match_time_parts[0].zfill(2)  # Zero-pad the hour part
      match_time = ':'.join(match_time_parts)
      match_datetime_str = f"{years_series_data[i]['year']} {match_date} {match_time} +0530"
      # %b - Abbreviated month name
      # %d - Day of the month as a zero-padded decimal
      # %a - Abbreviated weekday name
      # %I - Hour (12-hour clock) as a zero-padded decimal
      # %M - Minute as a zero padded decimal number
      # %p - AM or PM
      # %z - UTC offset in the form +HHMM or -HHMM
      match_datetime = datetime.strptime(match_datetime_str, "%Y %b %d, %a %I:%M %p %z")
      match_data.append({
        "year": years_series_data[i]["year"],
        "series_type": years_series_data[i]["series_type"],
        "series_name": years_series_data[i]["series_name"],
        "match_no": match_no,
        "match_name": match_name,
        "match_href": match_href,
        "match_team1": match_team1,
        "match_team2": match_team2,
        "match_venue": match_venue,
        "match_datetime": match_datetime,
      })
    # break
  i += 1
match_df = pd.DataFrame(match_data)
match_df

In [None]:
# One-off export, kept commented out: derive `match_id` from the match URL and write the
# frame to match_data.csv.
# NOTE(review): `str.lstrip` removes any leading characters found in its argument, not a
# literal prefix, so the ID extraction below needs checking before it is re-enabled.
# match_df['match_id'] = match_df['match_href'].str.lstrip('https://www.cricbuzz.com/cricket-scores/').str.split('/').str[0]
# match_df.to_csv('match_data.csv', index=False, header=True)
# Reload the saved matches from disk; this rebinds `match_df` and `match_data`, replacing
# whatever the scraping cell above left in those names.
match_df = pd.read_csv('match_data.csv', header=0)
match_data = match_df.to_dict(orient='records')

In [None]:
def get_player_details(element, default_role):
  """Collect one record per player profile link found inside `element`.

  Args:
    element: Container whose `find_elements(By.CSS_SELECTOR, 'a')` yields the player
      links (e.g. a Selenium WebElement). A falsy value means "no container".
    default_role: Role given to players whose link text has no captain or
      wicket-keeper marker.

  Returns:
    A `(players, captain)` tuple. `players` is a list of dicts with the keys
    "profile_id", "name", "href" and "role"; `captain` is the name of the last link
    marked "(c)", or None. `(None, None)` is returned when `element` is falsy.
    Links without an href, or whose href is not of the form
    ".../profiles/<id>/<slug>", are skipped.
  """
  if not element:
    return None, None

  res = []
  captain = None
  for player in element.find_elements(By.CSS_SELECTOR, 'a'):
    txt = player.text.strip()
    href = player.get_attribute('href')
    if not href or '/profiles/' not in href:
      continue
    # Cut at the "/profiles/" marker rather than using str.lstrip(), which removes a
    # set of characters and only worked here because profile IDs start with a digit.
    profile_path = href.split('/profiles/', 1)[1].split('?', 1)[0].split('#', 1)[0]
    # Dropping empty segments tolerates a trailing slash; extra segments are ignored.
    segments = [segment for segment in profile_path.split('/') if segment]
    if len(segments) < 2:
      continue
    profile_id, name = segments[0], segments[1]
    # NOTE(review): capitalize() upper-cases only the first letter ("Virat kohli");
    # kept as is because existing data may depend on it.
    name = name.replace("-", " ").capitalize()
    if '(c)' in txt:
      captain = name
      role = 'captain'
    elif '(w)' in txt or '(wk)' in txt:
      role = 'wicket_keeper'
    else:
      role = default_role
    res.append({
      "profile_id": profile_id,
      "name": name,
      "href": href,
      "role": role,
    })
  return res, captain

In [None]:
# End the browser session started in the setup cell; `driver` cannot be used afterwards.
driver.quit()