In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

def get_match_urls(series_url, timeout=30):
    """Collect the unique match-page URLs linked from a series page.

    Args:
        series_url: URL of the series page to scan for match links.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        Absolute URLs for every anchor whose ``href`` contains ``/match/``,
        de-duplicated and kept in the order they first appear on the page.

    Raises:
        requests.HTTPError: If the series page answers with a 4xx/5xx status.
    """
    # Local import: the cell's import block does not provide urljoin.
    from urllib.parse import urljoin

    response = requests.get(series_url, timeout=timeout)
    # Fail loudly on an error page; otherwise the whole run "succeeds"
    # with zero matches and writes an empty CSV.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    base_url = "https://www.espncricinfo.com"
    # urljoin leaves already-absolute hrefs untouched and resolves relative
    # ones, where plain string concatenation would produce malformed URLs.
    match_links = [
        urljoin(base_url, link['href'])
        for link in soup.find_all('a', href=True)
        if '/match/' in link['href']  # Adjust based on the actual URL pattern
    ]
    # dict.fromkeys removes duplicates while keeping page order, so repeated
    # runs visit matches in the same sequence.
    return list(dict.fromkeys(match_links))

def scrape_match_data(match_url, timeout=30):
    """Scrape one match page into rows of match details plus commentary text.

    Args:
        match_url: URL of the match page to download.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list with one row per non-empty ``div.ds-text-tight-m`` element:
        ``[match_name, match_date, match_venue, team1, team2, score1, score2,
        result, ball_info]``. Fields whose elements are not found stay empty
        strings (``"Unknown"`` for the match name). The list is empty when the
        page has no matching elements or answered with a non-2xx status.
    """
    response = requests.get(match_url, timeout=timeout)
    if not response.ok:
        # An error page would otherwise be parsed as if it were a match page.
        print(f"Skipping {match_url}: HTTP {response.status_code}")
        return []
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract metadata (single h1 lookup, stripped like every other field)
    heading = soup.find('h1')
    match_name = heading.text.strip() if heading else "Unknown"

    details = soup.find_all("span", class_="ds-text-tight-m")
    match_date, match_venue = "", ""
    if len(details) >= 2:
        match_date = details[0].text.strip()
        match_venue = details[1].text.strip()

    # Extract scores; teams and scores are only filled in when both are present
    teams = soup.find_all("span", class_="ds-text-title")
    scores = soup.find_all("strong")
    team1, team2, score1, score2 = "", "", "", ""
    if len(teams) >= 2 and len(scores) >= 2:
        team1, team2 = teams[0].text.strip(), teams[1].text.strip()
        score1, score2 = scores[0].text.strip(), scores[1].text.strip()

    # Extract match result
    result_tag = soup.find("p", class_="ds-text-tight-m")
    result = result_tag.text.strip() if result_tag else ""

    # Extract ball-by-ball commentary
    commentary_data = []
    for comment in soup.find_all('div', class_='ds-text-tight-m'):  # Adjust class based on site structure
        ball_info = comment.text.strip()
        if ball_info:
            commentary_data.append([match_name, match_date, match_venue, team1, team2, score1, score2, result, ball_info])

    return commentary_data

def main():
    """Scrape every match of the configured series and save the rows as CSV.

    Each match URL returned by ``get_match_urls`` is announced, scraped with
    ``scrape_match_data``, and followed by a two-second pause. The combined
    rows are written to ``IPL_2022_Commentary.csv`` without the index column.
    """
    series_url = "https://www.espncricinfo.com/series/ipl-2022-1298423"  # Adjust based on actual series URL
    csv_columns = [
        "Match Name", "Match Date", "Match Venue",
        "Team 1", "Team 2", "Team 1 Score", "Team 2 Score",
        "Match Won By", "Ball Commentary",
    ]

    collected_rows = []
    for url in get_match_urls(series_url):
        print(f"Scraping: {url}")
        collected_rows += scrape_match_data(url)
        time.sleep(2)  # Pause between requests to stay polite to the site

    # Save to CSV
    pd.DataFrame(collected_rows, columns=csv_columns).to_csv("IPL_2022_Commentary.csv", index=False)
    print("Scraping complete! Data saved to IPL_2022_Commentary.csv")

if __name__ == "__main__":
    main()


Scraping complete! Data saved to IPL_2022_Commentary.csv


In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

def get_match_urls(series_url, timeout=30):
    """Collect the unique full-scorecard URLs linked from a series page.

    Args:
        series_url: URL of the series page to scan for scorecard links.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        Absolute URLs for every anchor whose ``href`` contains
        ``/full-scorecard``, de-duplicated and kept in the order they first
        appear on the page.

    Raises:
        requests.HTTPError: If the series page answers with a 4xx/5xx status.
    """
    # Local import: the cell's import block does not provide urljoin.
    from urllib.parse import urljoin

    response = requests.get(series_url, timeout=timeout)
    # Fail loudly on an error page; otherwise the whole run "succeeds"
    # with zero matches and writes an empty CSV.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    base_url = "https://www.espncricinfo.com"
    # urljoin leaves already-absolute hrefs untouched and resolves relative
    # ones, where plain string concatenation would produce malformed URLs.
    match_links = [
        urljoin(base_url, link['href'])
        for link in soup.find_all('a', href=True)
        if '/full-scorecard' in link['href']  # Ensuring we get full match details
    ]
    # dict.fromkeys removes duplicates while keeping page order, so repeated
    # runs visit matches in the same sequence.
    return list(dict.fromkeys(match_links))

def scrape_match_data(match_url, timeout=30):
    """Scrape one scorecard page into rows of match details plus commentary text.

    Args:
        match_url: URL of the match page to download.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list with one row per non-empty ``div.ci-html-content`` element:
        ``[match_name, match_date, match_venue, team1, team2, score1, score2,
        result, ball_info]``. Fields whose elements are not found stay empty
        strings (``"Unknown"`` for the match name). The list is empty when the
        page has no matching elements or answered with a non-2xx status.
    """
    response = requests.get(match_url, timeout=timeout)
    if not response.ok:
        # An error page would otherwise be parsed as if it were a match page.
        print(f"Skipping {match_url}: HTTP {response.status_code}")
        return []
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract metadata (single h1 lookup, stripped like every other field)
    heading = soup.find('h1')
    match_name = heading.text.strip() if heading else "Unknown"

    details = soup.find_all("div", class_="ds-text-tight-m ds-font-regular")
    match_date, match_venue = "", ""
    if len(details) >= 2:
        match_date = details[0].text.strip()
        match_venue = details[1].text.strip()

    # Extract scores; team names and scores are filled in independently
    teams = soup.find_all("span", class_="ds-text-title")
    scores = soup.find_all("div", class_="ds-text-compact-m")
    team1, team2, score1, score2 = "", "", "", ""
    if len(teams) >= 2:
        team1, team2 = teams[0].text.strip(), teams[1].text.strip()
    if len(scores) >= 2:
        score1, score2 = scores[0].text.strip(), scores[1].text.strip()

    # Extract match result
    result_tag = soup.find("p", class_="ds-text-tight-m")
    result = result_tag.text.strip() if result_tag else ""

    # Extract ball-by-ball commentary
    commentary_data = []
    for comment in soup.find_all('div', class_='ci-html-content'):  # Adjust class based on site structure
        ball_info = comment.text.strip()
        if ball_info:
            commentary_data.append([match_name, match_date, match_venue, team1, team2, score1, score2, result, ball_info])

    return commentary_data

def main():
    """Scrape every scorecard of the configured series and save the rows as CSV.

    Each match URL returned by ``get_match_urls`` is announced, scraped with
    ``scrape_match_data``, and followed by a two-second pause. The combined
    rows are written to ``IPL_2022_Commentary.csv`` without the index column.
    """
    series_url = "https://www.espncricinfo.com/series/ipl-2022-1298423"  # Adjust based on actual series URL
    csv_columns = [
        "Match Name", "Match Date", "Match Venue",
        "Team 1", "Team 2", "Team 1 Score", "Team 2 Score",
        "Match Won By", "Ball Commentary",
    ]

    collected_rows = []
    for url in get_match_urls(series_url):
        print(f"Scraping: {url}")
        collected_rows += scrape_match_data(url)
        time.sleep(2)  # Pause between requests to stay polite to the site

    # Save to CSV
    pd.DataFrame(collected_rows, columns=csv_columns).to_csv("IPL_2022_Commentary.csv", index=False)
    print("Scraping complete! Data saved to IPL_2022_Commentary.csv")

if __name__ == "__main__":
    main()


Scraping complete! Data saved to IPL_2022_Commentary.csv


In [13]:
!pip install selenium pandas webdriver-manager


Defaulting to user installation because normal site-packages is not writeable


In [15]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time

def get_commentary(url, wait_seconds=5):
    """Scrape ball-by-ball commentary from Cricbuzz using Selenium.

    Args:
        url: Address of the commentary page to load in headless Chrome.
        wait_seconds: Seconds to pause after loading so JavaScript-rendered
            content can appear before the page is read.

    Returns:
        A list of ``[match_title, commentary_text]`` rows, one per non-empty
        element matched by the commentary XPath. ``match_title`` is the page
        title up to the first ``|``, stripped of surrounding whitespace.
    """
    options = Options()
    options.add_argument("--headless")  # Run without opening browser
    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1920,1080")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    try:
        driver.get(url)
        time.sleep(wait_seconds)  # Allow JavaScript to load

        match_title = driver.title.split('|')[0].strip()

        balls = driver.find_elements("xpath", "//div[contains(@class, 'cb-col cb-col-100 cb-com-ln')]")
        stripped_texts = (ball.text.strip() for ball in balls)
        commentary_data = [[match_title, text] for text in stripped_texts if text]
    finally:
        # Always shut the browser down; without this a failure while loading
        # or reading the page leaves a headless Chrome process running.
        driver.quit()

    return commentary_data

def main():
    """Scrape one Cricbuzz match page and store its commentary as CSV.

    The rows returned by ``get_commentary`` are written to
    ``Cricbuzz_Ball_by_Ball_Commentary.csv`` without the index column.
    """
    match_url = "https://www.cricbuzz.com/live-cricket-scores/40381"  # Replace with any live match

    commentary_frame = pd.DataFrame(
        get_commentary(match_url),
        columns=["Match Name", "Ball Commentary"],
    )
    commentary_frame.to_csv("Cricbuzz_Ball_by_Ball_Commentary.csv", index=False)
    print("✅ Scraping Complete! Data saved to Cricbuzz_Ball_by_Ball_Commentary.csv")

if __name__ == "__main__":
    main()


✅ Scraping Complete! Data saved to Cricbuzz_Ball_by_Ball_Commentary.csv


In [25]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

import re  # used by the delivery patterns below; this cell's import block does not include it

# Cricbuzz commentary page URL (change the match ID if needed)
url = "https://www.cricbuzz.com/live-cricket-scores/66168/gt-vs-rr-final-indian-premier-league-2023"

# Headers to avoid bot detection
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Fetch page content; the timeout keeps a stalled connection from hanging the cell
response = requests.get(url, headers=headers, timeout=30)
if response.status_code != 200:
    print(f"❌ Failed to fetch page. Status code: {response.status_code}")
else:
    print("✅ Successfully fetched page!")

# Parse the page content
soup = BeautifulSoup(response.content, "lxml")

# Find ball-by-ball commentary section.
# Matching on the single class "cb-com-ln" accepts any div carrying that class,
# so this check and the extraction below can no longer disagree the way two
# different exact class strings did.
commentary_section = soup.find("div", class_="cb-com-ln")

if not commentary_section:
    print("❌ Commentary section not found. Cricbuzz may have updated its structure.")
else:
    print("✅ Commentary section found!")

# Extract all commentary events
commentary_entries = soup.find_all("div", class_="cb-com-ln")

# "<over>.<ball> <bowler> to <batter>, <description>"
# Example: "19.2 Boult to Gill, FOUR!"
# Names may span several words; the batter ends at the first comma.
delivery_pattern = re.compile(
    r"(\d+)\.(\d+)\s*([^,\n]+?)\s+to\s+([^,\n]+?)\s*,\s*(.*)", re.DOTALL
)
runs_pattern = re.compile(r"(\d+)\s+runs?\b")

# Initialize list to store data
data = []

# Loop through each commentary entry
for entry in commentary_entries:
    delivery = delivery_pattern.match(entry.text.strip())
    if delivery is None:
        continue  # Skip empty entries and anything that is not a delivery

    # Extract Over and Ball Number
    over, ball_no = int(delivery.group(1)), int(delivery.group(2))

    # Extract Bowler and Batter Name
    bowler_name = delivery.group(3).strip()
    batter_name = delivery.group(4).strip()

    # Classify on the leading outcome phrase only (text before the first "," or
    # "!"), so descriptive words later in the commentary such as
    # "wide of off stump" do not change the ball type.
    # NOTE(review): assumes the outcome comes first in the description - confirm
    # against live Cricbuzz markup.
    outcome = re.split(r"[,!]", delivery.group(5), maxsplit=1)[0]
    run_count = runs_pattern.search(outcome)

    # Extract Ball Type and Runs
    if "FOUR" in outcome:
        ball_type = "legal"
        shot_type = "boundary"
        runs = 4
    elif "SIX" in outcome:
        ball_type = "legal"
        shot_type = "boundary"
        runs = 6
    elif "wide" in outcome:
        ball_type = "wide"
        shot_type = "none"
        runs = 1
    elif "no ball" in outcome:
        ball_type = "no ball"
        shot_type = "none"
        runs = 1
    elif run_count:
        # "1 run", "2 runs", ... were previously all recorded as 0
        ball_type = "legal"
        shot_type = "other"
        runs = int(run_count.group(1))
    else:
        ball_type = "legal"
        shot_type = "other"
        runs = 0

    # Append to list
    data.append([ball_no, over, bowler_name, batter_name, ball_type, shot_type, runs])

# Convert to DataFrame
df = pd.DataFrame(data, columns=["Ball No", "Over", "Bowler Name", "Batter Name", "Ball Type", "Shot Type", "Runs Scored"])

# Display DataFrame
if not df.empty:
    print("\n✅ Extracted Ball-by-Ball Commentary Data:\n")
    display(df)
else:
    print("\n❌ No data extracted. Cricbuzz may have changed its page structure. Try changing selectors.\n")


✅ Successfully fetched page!
❌ Commentary section not found. Cricbuzz may have updated its structure.

❌ No data extracted. Cricbuzz may have changed its page structure. Try changing selectors.



In [33]:
import pandas as pd
import re

# 🏏 Paste Cricbuzz commentary here (Manually copied from Cricbuzz)
commentary_text = """
19.2 Boult to Gill, FOUR! Crunched past cover
19.1 Boult to Gill, 1 run, taps to point, quick single
18.6 Chahal to Miller, OUT! Caught at deep mid-wicket
18.5 Chahal to Miller, SIX! Massive hit over long-on
"""

# "<over>.<ball> <bowler> to <batter>, <details>"
# Names may span several words ("Rashid Khan to de Kock"); the batter ends at
# the first comma.
DELIVERY_PATTERN = re.compile(r"(\d+)\.(\d+)\s+([^,]+?)\s+to\s+([^,]+?)\s*,\s*(.*)")
RUNS_PATTERN = re.compile(r"(\d+)\s+runs?\b")


def classify_delivery(ball_details):
    """Derive ``(ball_type, shot_type, runs)`` from a delivery description.

    Only the leading outcome phrase (the text before the first ``,`` or ``!``)
    is inspected, so descriptive words later in the commentary such as
    "wide of off stump" do not change the classification.

    Args:
        ball_details: Text following "<bowler> to <batter>," on a line.

    Returns:
        A ``(ball_type, shot_type, runs)`` tuple. Defaults are
        ``("legal", "other", 0)``.
    """
    outcome = re.split(r"[,!]", ball_details, maxsplit=1)[0].strip()
    ball_type, shot_type, runs = "legal", "other", 0

    run_count = RUNS_PATTERN.search(outcome)
    if "FOUR" in outcome:
        shot_type, runs = "boundary", 4
    elif "SIX" in outcome:
        shot_type, runs = "boundary", 6
    elif "OUT" in outcome:
        shot_type = "wicket"
    elif run_count:
        # Covers "1 run", "2 runs", "3 runs" and any larger count
        runs = int(run_count.group(1))
    elif "wide" in outcome:
        ball_type, runs = "wide", 1
    elif "no ball" in outcome:
        ball_type, runs = "no ball", 1

    return ball_type, shot_type, runs


def parse_commentary_line(line):
    """Turn one commentary line into a data row.

    Args:
        line: A single line of pasted commentary.

    Returns:
        ``[ball_no, over, bowler, batter, ball_type, shot_type, runs]``, or
        ``None`` when the line is not in the
        "<over>.<ball> <bowler> to <batter>, <details>" format.
    """
    delivery = DELIVERY_PATTERN.match(line.strip())
    if delivery is None:
        return None

    over = int(delivery.group(1))
    ball_no = int(delivery.group(2))
    bowler = delivery.group(3).strip()
    batter = delivery.group(4).strip()
    ball_type, shot_type, runs = classify_delivery(delivery.group(5))
    return [ball_no, over, bowler, batter, ball_type, shot_type, runs]


# 🏏 Process commentary line by line
data = []
for line in commentary_text.strip().split("\n"):
    row = parse_commentary_line(line)
    if row is not None:
        data.append(row)

# Convert to DataFrame
df = pd.DataFrame(data, columns=["Ball No", "Over", "Bowler", "Batter", "Ball Type", "Shot Type", "Runs Scored"])

# 📊 Display Data in Jupyter Notebook (the cell's last expression is rendered)
df


Unnamed: 0,Ball No,Over,Bowler,Batter,Ball Type,Shot Type,Runs Scored
0,2,19,Boult,Gill,legal,boundary,4
1,1,19,Boult,Gill,legal,other,1
2,6,18,Chahal,Miller,legal,wicket,0
3,5,18,Chahal,Miller,legal,boundary,6
