In [5]:
pip install requests beautifulsoup4 pandas


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [4]:
import asyncio
from playwright.async_api import async_playwright
import pandas as pd
import time

async def scrape_ipl_commentary():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto("https://www.espncricinfo.com/series/indian-premier-league-2022-1298423/match-ball-by-ball-commentary")

        # Scroll down multiple times to load all commentary
        for _ in range(10):
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            await asyncio.sleep(2)  # Allow time for new content to load

        commentary_elements = await page.locator(".match-commentary-wrapper").all()
        data = []

        for row in commentary_elements:
            try:
                ball_no = await row.locator(".ball-number").text_content()
                over = ball_no.split(".")[0]
                bowler = await row.locator(".bowler-name").text_content()
                batter = await row.locator(".batter-name").text_content()
                ball_type = await row.locator(".ball-type").text_content() if await row.locator(".ball-type").count() > 0 else "Normal"
                shot_type = await row.locator(".shot-type").text_content() if await row.locator(".shot-type").count() > 0 else "NA"
                speed = await row.locator(".speed").text_content() if await row.locator(".speed").count() > 0 else "Unknown"
                runs = await row.locator(".runs").text_content()

                data.append([ball_no, over, bowler, batter, ball_type, shot_type, speed, runs])
            
            except Exception:
                continue

        await browser.close()

    # Convert to DataFrame
    df = pd.DataFrame(data, columns=["Ball No", "Over", "Bowler", "Batter", "Ball Type", "Shot Type", "Speed", "Runs"])
    df.to_csv("IPL_2022_Ball_by_Ball_Playwright.csv", index=False)
    print("✅ Scraping Completed! Data saved as 'IPL_2022_Ball_by_Ball_Playwright.csv'")

# Run the async function properly in Jupyter Notebook
await scrape_ipl_commentary()


✅ Scraping Completed! Data saved as 'IPL_2022_Ball_by_Ball_Playwright.csv'


In [7]:
import requests
import pandas as pd

# URL of ESPN API (Get this from the Network tab)
API_URL = "https://site.web.api.espn.com/apis/v2/sports/cricket/series/1298423/events"

# Set User-Agent to avoid bot detection
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
}

# Fetch the data from the API
response = requests.get(API_URL, headers=HEADERS)

# Check if data was retrieved successfully
if response.status_code == 200:
    data = response.json()

    # Extract ball-by-ball commentary (Modify this based on API response structure)
    ball_by_ball_data = []
    for match in data.get("events", []):
        match_name = match["name"]
        match_id = match["id"]
        
        # Construct the commentary API URL for the match
        commentary_url = f"https://site.web.api.espn.com/apis/v2/sports/cricket/events/{match_id}/commentary"
        comm_response = requests.get(commentary_url, headers=HEADERS)
        
        if comm_response.status_code == 200:
            commentary = comm_response.json()
            for entry in commentary.get("commentary", []):
                ball_no = entry.get("overNumber", "NA")
                over = entry.get("over", "NA")
                bowler = entry.get("bowler", {}).get("displayName", "NA")
                batter = entry.get("batsman", {}).get("displayName", "NA")
                ball_type = entry.get("deliveryType", "NA")
                shot_type = entry.get("shotType", "NA")
                speed = entry.get("deliverySpeed", "NA")
                runs = entry.get("runs", "NA")
                
                ball_by_ball_data.append([match_name, ball_no, over, bowler, batter, ball_type, shot_type, speed, runs])

    # Convert to DataFrame
    df = pd.DataFrame(ball_by_ball_data, columns=["Match", "Ball No", "Over", "Bowler", "Batter", "Ball Type", "Shot Type", "Speed", "Runs"])
    
    # Save to CSV
    df.to_csv("IPL_2022_Ball_by_Ball.csv", index=False)
    print("✅ Data successfully saved to IPL_2022_Ball_by_Ball.csv")

else:
    print(f"⚠️ Failed to fetch data. HTTP Status Code: {response.status_code}")


⚠️ Failed to fetch data. HTTP Status Code: 404


In [8]:
!pip install selenium webdriver-manager pandas


Defaulting to user installation because normal site-packages is not writeable


In [10]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time

# Set up Selenium WebDriver
options = webdriver.ChromeOptions()
options.add_argument("--headless")  
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)

# ESPN Cricinfo ball-by-ball commentary page
url = "https://www.espncricinfo.com/series/indian-premier-league-2022-1298423/gujarat-titans-vs-rajasthan-royals-final-1312200/ball-by-ball-commentary"
driver.get(url)

# Wait for the page to load
time.sleep(5)

# Scroll multiple times to load all commentary
for _ in range(15):  # Increase the scroll count if needed
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)  # Allow time for new data to load

# Extract ball-by-ball details
commentary_blocks = driver.find_elements(By.CSS_SELECTOR, "div.ds-border-b.ds-border-line.ds-py-2")

# Debug: Print number of found items
print(f"Found {len(commentary_blocks)} commentary entries.")

# Initialize lists to store extracted data
ball_numbers = []
overs = []
bowler_names = []
batter_names = []
ball_types = []
shot_types = []
ball_speeds = []
runs_scored = []

# Loop through each commentary block
for block in commentary_blocks:
    try:
        ball_info = block.find_element(By.CSS_SELECTOR, "div.ds-text-tight-s.ds-font-regular.ds-text-typo-mid3").text.strip()
        over = ball_info.split(".")[0]
    except:
        ball_info, over = None, None

    try:
        text = block.find_element(By.CSS_SELECTOR, "div.ds-text-tight-m.ds-font-regular.ds-px-4.ds-py-3").text.strip()
    except:
        text = None

    if text and "to" in text:
        parts = text.split(" to ")
        bowler_name = parts[0].strip()
        batter_name = parts[1].split(",")[0].strip()
    else:
        bowler_name = batter_name = None

    # Classify ball type
    if "no run" in text:
        ball_type = "dot"
        shot_type = "none"
    elif "FOUR" in text:
        ball_type = "legal"
        shot_type = "boundary"
    elif "SIX" in text:
        ball_type = "legal"
        shot_type = "boundary"
    elif "wide" in text:
        ball_type = "wide"
        shot_type = "none"
    elif "no ball" in text:
        ball_type = "no ball"
        shot_type = "none"
    else:
        ball_type = "legal"
        shot_type = "other"

    # Extract ball speed
    ball_speed = None
    if "km/h" in text:
        speed_parts = text.split("km/h")
        ball_speed = speed_parts[0].split()[-1] + " km/h"

    # Extract runs scored
    runs = "0"
    if "FOUR" in text:
        runs = "4"
    elif "SIX" in text:
        runs = "6"
    elif any(x in text for x in ["1 run", "2 runs", "3 runs"]):
        runs = text.split()[0]

    # Append extracted data
    ball_numbers.append(ball_info)
    overs.append(over)
    bowler_names.append(bowler_name)
    batter_names.append(batter_name)
    ball_types.append(ball_type)
    shot_types.append(shot_type)
    ball_speeds.append(ball_speed)
    runs_scored.append(runs)

# Close WebDriver
driver.quit()

# Create DataFrame
df = pd.DataFrame({
    "Ball No": ball_numbers,
    "Over": overs,
    "Bowler Name": bowler_names,
    "Batter Name": batter_names,
    "Ball Type": ball_types,
    "Shot Type": shot_types,
    "Speed of Ball": ball_speeds,
    "Runs Scored": runs_scored
})

# Save to CSV
df.to_csv("GT_vs_RR_Final_Ball_by_Ball.csv", index=False)

print("✅ Data extraction complete. Check 'GT_vs_RR_Final_Ball_by_Ball.csv'.")


Found 0 commentary entries.
✅ Data extraction complete. Check 'GT_vs_RR_Final_Ball_by_Ball.csv'.


In [19]:
!pip install requests beautifulsoup4 pandas


Defaulting to user installation because normal site-packages is not writeable


In [22]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Cricbuzz commentary page URL (change the match ID if needed)
url = "https://www.cricbuzz.com/live-cricket-scores/66168/gt-vs-rr-final-indian-premier-league-2023"

# Headers to avoid bot detection
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Fetch page content
response = requests.get(url, headers=headers)
if response.status_code != 200:
    print(f"❌ Failed to fetch page. Status code: {response.status_code}")
else:
    print("✅ Successfully fetched page!")

# Parse the page content
soup = BeautifulSoup(response.content, "lxml")

# Find ball-by-ball commentary section
commentary_section = soup.find("div", class_="cb-col cb-col-100 cb-com-ln")

if not commentary_section:
    print("❌ Commentary section not found. Cricbuzz may have updated its structure.")
else:
    print("✅ Commentary section found!")

# Extract all commentary events
commentary_entries = soup.find_all("div", class_="cb-col cb-col-100 cb-col-rt cb-com-ln")

# Initialize list to store data
data = []

# Loop through each commentary entry
for entry in commentary_entries:
    text = entry.text.strip()

    if not text:
        continue  # Skip empty entries

    # Example: "19.2 Boult to Gill, FOUR!"
    parts = text.split(" ")
    if len(parts) < 3:
        continue

    # Extract Over and Ball Number
    over_ball = parts[0]
    if "." not in over_ball:
        continue
    over, ball_no = map(int, over_ball.split("."))

    # Extract Bowler and Batter Name
    bowler_name = parts[1]
    batter_name = parts[3]

    # Extract Ball Type and Runs
    if "FOUR" in text:
        ball_type = "legal"
        shot_type = "boundary"
        runs = 4
    elif "SIX" in text:
        ball_type = "legal"
        shot_type = "boundary"
        runs = 6
    elif "wide" in text:
        ball_type = "wide"
        shot_type = "none"
        runs = 1
    elif "no ball" in text:
        ball_type = "no ball"
        shot_type = "none"
        runs = 1
    else:
        ball_type = "legal"
        shot_type = "other"
        runs = 0

    # Append to list
    data.append([ball_no, over, bowler_name, batter_name, ball_type, shot_type, runs])

# Convert to DataFrame
df = pd.DataFrame(data, columns=["Ball No", "Over", "Bowler Name", "Batter Name", "Ball Type", "Shot Type", "Runs Scored"])

# Display DataFrame
if not df.empty:
    print("\n✅ Extracted Ball-by-Ball Commentary Data:\n")
    display(df)
else:
    print("\n❌ No data extracted. Cricbuzz may have changed its page structure. Try changing selectors.\n")


✅ Successfully fetched page!
❌ Commentary section not found. Cricbuzz may have updated its structure.

❌ No data extracted. Cricbuzz may have changed its page structure. Try changing selectors.



In [23]:
import requests
import pandas as pd

# ESPN CricInfo Ball-by-Ball API (Replace with correct match ID)
match_id = "1312200"  # GT vs RR Final
url = f"https://hs-consumer-api.espncricinfo.com/v1/pages/match/{match_id}/commentary"

# Fetch data
headers = {"User-Agent": "Mozilla/5.0"}
response = requests.get(url, headers=headers)

if response.status_code == 200:
    print("✅ Successfully fetched ball-by-ball commentary!")
    data = response.json()
else:
    print(f"❌ Failed to fetch data. Status Code: {response.status_code}")
    data = {}

# Extract ball-by-ball commentary
if "commentary" in data:
    commentary = data["commentary"]["items"]
    extracted_data = []

    for ball in commentary:
        over = ball.get("oversActual", "")
        bowler = ball.get("bowler", {}).get("longName", "Unknown")
        batter = ball.get("batsman", {}).get("longName", "Unknown")
        ball_type = "legal" if "delivery" in ball["event"] else ball["event"]
        shot_type = "boundary" if ball.get("scoreValue", 0) >= 4 else "other"
        runs = ball.get("scoreValue", 0)

        extracted_data.append([over, bowler, batter, ball_type, shot_type, runs])

    # Convert to DataFrame
    df = pd.DataFrame(extracted_data, columns=["Over", "Bowler", "Batter", "Ball Type", "Shot Type", "Runs Scored"])
    
    # Display Data
    display(df)
else:
    print("❌ No ball-by-ball commentary found.")


❌ Failed to fetch data. Status Code: 403
❌ No ball-by-ball commentary found.


In [25]:
import pandas as pd
import re

# Paste copied Cricbuzz commentary here
commentary_text = """
19.2 Boult to Gill, FOUR! Crunched past cover
19.1 Boult to Gill, 1 run, taps to point, quick single
18.6 Chahal to Miller, OUT! Caught at deep mid-wicket
18.5 Chahal to Miller, SIX! Massive hit over long-on
"""

# Process commentary line by line
data = []
for line in commentary_text.strip().split("\n"):
    match = re.match(r"(\d+)\.(\d+) (\w+) to (\w+), (.*)", line)
    
    if match:
        over = int(match.group(1))
        ball_no = int(match.group(2))
        bowler = match.group(3)
        batter = match.group(4)
        ball_details = match.group(5)

        # Determine ball type & shot type
        if "FOUR" in ball_details:
            ball_type = "legal"
            shot_type = "boundary"
            runs = 4
        elif "SIX" in ball_details:
            ball_type = "legal"
            shot_type = "boundary"
            runs = 6
        elif "OUT" in ball_details:
            ball_type = "legal"
            shot_type = "wicket"
            runs = 0
        elif "wide" in ball_details:
            ball_type = "wide"
            shot_type = "none"
            runs = 1
        elif "no ball" in ball_details:
            ball_type = "no ball"
            shot_type = "none"
            runs = 1
        else:
            ball_type = "legal"
            shot_type = "other"
            runs = int(re.search(r"(\d+) run", ball_details).group(1)) if re.search(r"(\d+) run", ball_details) else 0

        data.append([ball_no, over, bowler, batter, ball_type, shot_type, runs])

# Convert to DataFrame
df = pd.DataFrame(data, columns=["Ball No", "Over", "Bowler", "Batter", "Ball Type", "Shot Type", "Runs Scored"])

# Display Data
display(df)


Unnamed: 0,Ball No,Over,Bowler,Batter,Ball Type,Shot Type,Runs Scored
0,2,19,Boult,Gill,legal,boundary,4
1,1,19,Boult,Gill,legal,other,1
2,6,18,Chahal,Miller,legal,wicket,0
3,5,18,Chahal,Miller,legal,boundary,6
