In [None]:
from cricbuzz_scraper import CricbuzzScraper

In [None]:
import os
import time
import pandas as pd
import re
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, StaleElementReferenceException, ElementClickInterceptedException, ElementNotInteractableException

In [None]:
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

# Configure Chrome for unattended scraping.
chrome_options = Options()
# Run in headless mode (no browser window)
chrome_options.add_argument("--headless")
# Disable notifications
chrome_options.add_argument("--disable-notifications")

# Selenium 4 receives the chromedriver location through a Service object. The
# older `executable_path=` keyword was deprecated in 4.0 and later removed, so
# passing it raises TypeError on current releases.
driver = webdriver.Chrome(service=Service('chromedriver.exe'), options=chrome_options)
# Element lookups wait up to 5 seconds for a match before giving up.
driver.implicitly_wait(5)

# Open the series archive page that the following cells scrape.
driver.get('https://www.cricbuzz.com/cricket-scorecard-archives')

# 1. Start with Series Archive Page
## Fetch and store every series link (href) so the series pages can be scraped later

In [None]:
# Collect the anchor elements directly under `.cb-yr-tmline`; later cells click each
# one and read its text as the year label.
years = driver.find_elements(By.CSS_SELECTOR, ".cb-yr-tmline > a")
# Display how many year links were found.
len(years)

124

In [44]:
# Build (or reload from the CSV cache) `years_series_data`: a list of dicts with the
# keys year, series_type, series_name and series_href, one per series link found
# after clicking each entry of `years`.
series_cache_path = 'supporting_cache_data/all_years_series_hrefs.csv'

if os.path.exists(series_cache_path):
  years_series_data = pd.read_csv(series_cache_path).to_dict(orient='records')
  print("Fetched from cache")
else:
  years_series_data = []
  year_i = 0
  year_retry = 0
  # Bound up front so the retry message can never hit an undefined name when the
  # very first click is the one that goes stale.
  year_name = None
  while year_i < len(years) and year_retry <= 5:
    try:
      ActionChains(driver).move_to_element(years[year_i]).click().perform()
      year_name = years[year_i].text.strip()
      # Fixed pause after the click before reading the series list.
      time.sleep(2)
      # Rows for this year are buffered and only merged once the whole year has been
      # read; a stale-element retry would otherwise append the same rows twice.
      year_records = []
      try:
        series_containers = driver.find_elements(By.CSS_SELECTOR, ".cb-schdl > div.cb-col")
        for series_container in series_containers:
          try:
            # Series type = text of the nearest preceding <h2> sibling, which must
            # carry the "cb-srs-cat" class.
            series_type = series_container.find_element(By.XPATH, "./preceding-sibling::h2[1][contains(@class, 'cb-srs-cat')]").text
            if not series_type:
              # Not caught by the handlers below, so an empty heading aborts the cell.
              raise ValueError("Series type not found")
            # Get the Series name and href link
            series = series_container.find_elements(By.CSS_SELECTOR, ".cb-srs-lst-itm > a")

            for serie in series:
              year_records.append({
                "year": year_name,
                "series_type": series_type,
                "series_name": serie.text.strip(),
                "series_href": serie.get_attribute("href"),
              })
          except (TimeoutException, NoSuchElementException) as e:
            print("Element not found or timeout occurred.", e.msg)
      except (TimeoutException, NoSuchElementException) as e:
        print("Element not found or timeout occurred.", e.msg)
      years_series_data.extend(year_records)
      year_i += 1
      year_retry = 0
      # Print progress in percentage
      progress = (year_i / len(years)) * 100
      print(f"Year {year_name} processed. Progress: {progress:.2f}%")
    except StaleElementReferenceException:
      # The cached elements went stale: look the year links up again and retry
      # the same index.
      years = driver.find_elements(By.CSS_SELECTOR, ".cb-yr-tmline > a")
      year_retry += 1
      if year_retry == 5: print(f"StaleElementReferenceException Error with {year_name} after 5 retries")

  if year_i < len(years):
    # The loop gave up early. Writing the file now would make every later run load
    # an incomplete list from the cache, so keep the rows in memory only.
    print(f"Only {year_i} of {len(years)} years were processed. Skipping CSV save so an incomplete cache is not reused.")
  elif years_series_data:
    # Save to CSV; create the cache folder first so a fresh checkout does not fail here.
    os.makedirs(os.path.dirname(series_cache_path), exist_ok=True)
    pd.DataFrame(years_series_data).to_csv(series_cache_path, index=False, header=True)
  else:
    print("No data collected. Skipping CSV save.")

pd.DataFrame(years_series_data)
# Original author's note: 3262 (presumably the row count of an earlier run).

Year 2021 processed. Progress: 0.81%
Year 2022 processed. Progress: 1.61%
Year 2023 processed. Progress: 2.42%
Year 2024 processed. Progress: 3.23%
Year 2025 processed. Progress: 4.03%
Year 2011 processed. Progress: 4.84%
Year 2012 processed. Progress: 5.65%
Year 2013 processed. Progress: 6.45%
Year 2014 processed. Progress: 7.26%
Year 2015 processed. Progress: 8.06%
Year 2016 processed. Progress: 8.87%
Year 2017 processed. Progress: 9.68%
Year 2018 processed. Progress: 10.48%
Year 2019 processed. Progress: 11.29%
Year 2020 processed. Progress: 12.10%
Year 2001 processed. Progress: 12.90%
Year 2002 processed. Progress: 13.71%
Year 2003 processed. Progress: 14.52%
Year 2004 processed. Progress: 15.32%
Year 2005 processed. Progress: 16.13%
Year 2006 processed. Progress: 16.94%
Year 2007 processed. Progress: 17.74%
Year 2008 processed. Progress: 18.55%
Year 2009 processed. Progress: 19.35%
Year 2010 processed. Progress: 20.16%
Year 1991 processed. Progress: 20.97%
Year 1992 processed. Pro

Unnamed: 0,year,series_type,series_name,series_href
0,2021,International,"Bangladesh tour of New Zealand, 2022",https://www.cricbuzz.com/cricket-series/3876/b...
1,2021,International,"India tour of South Africa, 2021-22",https://www.cricbuzz.com/cricket-series/3656/i...
2,2021,International,"Ireland tour of USA, 2021",https://www.cricbuzz.com/cricket-series/3866/i...
3,2021,International,"West Indies tour of Pakistan, 2021-22",https://www.cricbuzz.com/cricket-series/3858/w...
4,2021,International,"The Ashes, 2021-22",https://www.cricbuzz.com/cricket-series/3532/t...
...,...,...,...,...
3257,1887,International,"England in Australia, 1887",https://www.cricbuzz.com/cricket-series/1022/e...
3258,1888,International,"Australia in England, 1888",https://www.cricbuzz.com/cricket-series/1024/a...
3259,1888,International,"England in Australia, 1888",https://www.cricbuzz.com/cricket-series/1023/e...
3260,1889,International,England in South Africa Test Series,https://www.cricbuzz.com/cricket-series/1025/e...


# 2. Start scraping each Series Page
## Fetch and store every match link (href) so the match pages can be scraped later

In [None]:
# Load (or scrape) the per-year series links into the `years_series_df` DataFrame.
# NOTE(review): the section heading mentions match links, but every record built
# here is a series link and the file read/written is the series cache, so the cache
# check now tests that same file. Confirm this is the intended cache.
years_series_cache_path = 'supporting_cache_data/all_years_series_hrefs.csv'

if os.path.exists(years_series_cache_path):
  # Force the four text columns to str after the CSV round trip.
  years_series_df = pd.read_csv(years_series_cache_path).astype({
    'year': str,
    'series_name': str,
    'series_href': str,
    'series_type': str,
  })
  print("Fetched from cache")
else:
  years_series_match_data = []
  series_i = 0
  series_retry = 0
  # Bound up front so the retry message can never hit an undefined name.
  year_name = None
  while series_i < len(years) and series_retry <= 5:
    try:
      ActionChains(driver).move_to_element(years[series_i]).click().perform()
      year_name = years[series_i].text.strip()
      # Fixed pause after the click before reading the series list.
      time.sleep(2)
      # Buffer this year's rows so a stale-element retry cannot append them twice.
      year_records = []
      try:
        series_containers = driver.find_elements(By.CSS_SELECTOR, ".cb-schdl > div.cb-col")
        for series_container in series_containers:
          try:
            # Series type = text of the nearest preceding <h2> sibling with class
            # "cb-srs-cat". Without the [1] step, find_element returns the first
            # matching heading in document order for every container.
            series_type = series_container.find_element(By.XPATH, "./preceding-sibling::h2[1][contains(@class, 'cb-srs-cat')]").text

            # Get the Series name and href link
            series = series_container.find_elements(By.CSS_SELECTOR, ".cb-srs-lst-itm > a")

            for serie in series:
              year_records.append({
                "year": year_name,
                "series_type": series_type,
                "series_name": serie.text.strip(),
                "series_href": serie.get_attribute("href"),
              })
          except (TimeoutException, NoSuchElementException) as e:
            print("Element not found or timeout occurred.", e.msg)
      except (TimeoutException, NoSuchElementException) as e:
        print("Element not found or timeout occurred.", e.msg)
      years_series_match_data.extend(year_records)
      series_i += 1
      series_retry = 0
      # Print progress in percentage
      progress = (series_i / len(years)) * 100
      print(f"Year {year_name} processed. Progress: {progress:.2f}%")
    except StaleElementReferenceException:
      # The cached elements went stale: look the year links up again and retry
      # the same index.
      years = driver.find_elements(By.CSS_SELECTOR, ".cb-yr-tmline > a")
      series_retry += 1
      if series_retry == 5: print(f"StaleElementReferenceException Error with {year_name} after 5 retries")
  print(len(years_series_match_data))
  # Convert to DataFrame
  years_series_df = pd.DataFrame(years_series_match_data)
  if series_i < len(years):
    # The loop gave up early; do not persist an incomplete list as the cache.
    print(f"Only {series_i} of {len(years)} years were processed. Skipping CSV save so an incomplete cache is not reused.")
  else:
    # Save to CSV; create the cache folder first so a fresh checkout does not fail here.
    os.makedirs(os.path.dirname(years_series_cache_path), exist_ok=True)
    years_series_df.to_csv(years_series_cache_path, index=False, header=True)
years_series_df

In [None]:
# Visit every series whose name contains "Indian" and collect one record per match
# row listed on that series page, then show the result as `match_df`.
# NOTE(review): `scraper` is not created in any cell shown here; presumably a
# CricbuzzScraper instance exposing `.driver` and `.wait_for_elements` — confirm.
match_data = []
for series_row in years_series_data:
  if "Indian" not in series_row["series_name"]:
    continue
  scraper.driver.get(series_row["series_href"])
  match_rows = scraper.wait_for_elements(By.CSS_SELECTOR, "#series-matches div.cb-series-matches")
  for match in match_rows:
    match_link = match.find_element(By.CSS_SELECTOR, "a.text-hvr-underline")
    match_href = match_link.get_attribute('href')
    # Venue = text of the <div> right after the link, which must carry a "text-gray" class.
    match_venue = match_link.find_element(By.XPATH, "following-sibling::div[1][contains(@class, 'text-gray')]").text

    # The link text is split at the first comma into match name and match number;
    # the name is then split on " vs " into exactly two team names.
    match_name, match_no = map(str.strip, match_link.text.strip().split(",", maxsplit=1))
    match_team1, match_team2 = map(str.strip, match_name.split(" vs "))  # Team names

    # A row with empty date text keeps the date from the previously processed row.
    # Look the element up once instead of twice.
    date_text = match.find_element(By.CSS_SELECTOR, "div.cb-col.schedule-date").text.strip()
    if date_text:
      match_date = date_text
    match_time = match.find_element(By.CSS_SELECTOR, "span.schedule-date").text.strip()
    match_time_parts = match_time.split(':')
    match_time_parts[0] = match_time_parts[0].zfill(2)  # Zero-pad the hour part
    match_time = ':'.join(match_time_parts)
    # The year comes from the archive entry and the offset is hard-coded to +0530.
    match_datetime_str = f"{series_row['year']} {match_date} {match_time} +0530"
    # Format codes:
    # %Y - Year with century
    # %b - Abbreviated month name
    # %d - Day of the month as a zero-padded decimal
    # %a - Abbreviated weekday name
    # %I - Hour (12 hour clock) as a decimal number
    # %M - Minute as a zero padded decimal number
    # %p - AM or PM
    # %z - UTC offset in the form +HHMM or -HHMM
    match_datetime = datetime.strptime(match_datetime_str, "%Y %b %d, %a %I:%M %p %z")
    match_data.append({
      "year": series_row["year"],
      "series_type": series_row["series_type"],
      "series_name": series_row["series_name"],
      "match_no": match_no,
      "match_name": match_name,
      "match_href": match_href,
      "match_team1": match_team1,
      "match_team2": match_team2,
      "match_venue": match_venue,
      "match_datetime": match_datetime,
    })
match_df = pd.DataFrame(match_data)
match_df

In [None]:
# Reload the match list from 'match_data.csv' and rebuild `match_data` as a list of
# row dicts, replacing whatever `match_df` / `match_data` held before this cell.
# The two commented-out lines below look like the one-off export that produced the
# file — TODO confirm.
# NOTE(review): str.lstrip() removes a set of characters, not a prefix; if the export
# is ever re-enabled, extract the id with a prefix-aware method instead.
# match_df['match_id'] = match_df['match_href'].str.lstrip('https://www.cricbuzz.com/cricket-scores/').str.split('/').str[0]
# match_df.to_csv('match_data.csv', index=False, header=True)
match_df = pd.read_csv('match_data.csv', header=0)
match_data = match_df.to_dict(orient='records')

In [None]:
def get_player_details(element, default_role):
  """Extract player records from the profile links inside `element`.

  Args:
    element: Object exposing `find_elements(by, selector)`; each returned link
      must expose `.text` and `.get_attribute('href')`. A falsy value means
      "nothing to parse".
    default_role: Role assigned to players whose link text carries neither a
      captain nor a wicket-keeper marker.

  Returns:
    `(players, captain)`, where `players` is a list of dicts with the keys
    `profile_id`, `name`, `href` and `role`, and `captain` is the name of the
    last link marked `(c)` (or None). Returns `(None, None)` when `element` is
    falsy.

  Links whose href does not contain `/profiles/<id>/<slug>` are skipped.
  """
  if not element:
    return None, None

  res = []
  captain = None
  for player in element.find_elements(By.CSS_SELECTOR, 'a'):
    txt = player.text.strip()
    href = player.get_attribute('href')
    # Pull the id and slug out of the path explicitly. str.lstrip() strips a *set of
    # characters*, not a prefix, and a two-way unpack of split('/') breaks on a
    # trailing slash or extra path segment.
    profile = re.search(r'/profiles/([^/?#]+)/([^/?#]+)', href or '')
    if profile is None:
      continue
    profile_id, name = profile.groups()
    # capitalize() upper-cases only the first letter ("virat-kohli" -> "Virat kohli");
    # kept as-is so stored names do not change.
    name = name.replace("-", " ").capitalize()
    if '(c)' in txt:
      captain = name
      role = 'captain'
    elif '(w)' in txt or '(wk)' in txt:
      role = 'wicket_keeper'
    else:
      role = default_role
    res.append({
      "profile_id": profile_id,
      "name": name,
      "href": href,
      "role": role,
    })
  return res, captain

In [None]:
# End the WebDriver session started in the setup cell; `driver` cannot be used for
# further page loads after this call.
driver.quit()