In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import json
import time
import re

# Setup Chrome options
options = Options()
options.add_argument("--headless")  # Comment out to see browser activity
options.add_argument("--disable-gpu")
options.add_argument("--disable-extensions")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--no-sandbox")
options.set_capability('goog:loggingPrefs', {'performance': 'ALL'})

# Initialize WebDriver
service = Service('/usr/bin/chromedriver')  # Update with your chromedriver path
driver = webdriver.Chrome(service=service, options=options)

# Collected data storage
collected_data = {
    'api_responses': [],
    'clicked_buttons': [],
    'captured_urls': set()
}

def setup_network_logging():
    """Configure network logging to capture API calls"""
    driver.execute_cdp_cmd('Network.enable', {})
    driver.execute_cdp_cmd('Network.setRequestInterception', {'patterns': [{
        'urlPattern': '*',
        'resourceType': 'XHR',
        'interceptionStage': 'HeadersReceived'
    }]})

def capture_network_responses():
    """Capture and log network responses"""
    logs = driver.get_log('performance')
    for entry in logs:
        try:
            log = json.loads(entry['message'])['message']
            if log.get('method') == 'Network.responseReceived':
                response = log.get('params', {}).get('response', {})
                response_url = response.get('url', '')
                
                # Focus on specific API endpoints
                if 'hs-consumer-api.espncricinfo.com/v1/pages/match/comments' in response_url:
                    try:
                        # Extract response body using Chrome DevTools Protocol
                        request_id = log['params']['requestId']
                        response_body = driver.execute_cdp_cmd(
                            'Network.getResponseBody',
                            {'requestId': request_id}
                        )
                        
                        # Store the response
                        collected_data['api_responses'].append({
                            'url': response_url,
                            'body': response_body.get('body', '{}')
                        })
                        
                        # Track unique URLs
                        collected_data['captured_urls'].add(response_url)
                        
                        print(f"Captured Comments API URL: {response_url}")
                    
                    except Exception as e:
                        print(f"Error extracting response body: {e}")
        
        except Exception as e:
            print(f"Error processing log entry: {e}")

def click_dynamic_buttons():
    """
    Find and click buttons that load dynamic content.
    Customize the selectors based on the actual website structure.
    """
    try:
        # Wait for potential dynamic content buttons
        wait = WebDriverWait(driver, 10)
        
        # Examples of potential button selectors - MODIFY THESE for your specific website
        button_selectors = [
            # CSS selectors for buttons that load more content
            'button.load-more',
            'div[data-testid="load-more-comments"]',
            'a.next-page',
            'button#load-more-comments',
            
            # XPath selectors (sometimes more reliable)
            '//button[contains(text(), "Load More")]',
            '//button[contains(@class, "comments-load-more")]'
        ]
        
        # Try each selector
        for selector in button_selectors:
            try:
                # Try CSS selector first
                if selector.startswith('//'):
                    buttons = driver.find_elements(By.XPATH, selector)
                else:
                    buttons = driver.find_elements(By.CSS_SELECTOR, selector)
                
                # Click each found button
                for button in buttons:
                    if button.is_displayed() and button.is_enabled():
                        print(f"Clicking button: {selector}")
                        collected_data['clicked_buttons'].append(selector)
                        
                        # Scroll to button to ensure it's in view
                        driver.execute_script("arguments[0].scrollIntoView(true);", button)
                        time.sleep(0.5)  # Short wait to stabilize page
                        
                        # Click the button
                        button.click()
                        
                        # Wait for potential content load
                        time.sleep(2)
                        
                        # Capture network responses after clicking
                        capture_network_responses()
            
            except Exception as e:
                print(f"Error with selector {selector}: {e}")
    
    except Exception as e:
        print(f"Error in click_dynamic_buttons: {e}")

def main_scraping_process():
    """Main scraping workflow"""
    try:
        # Setup network logging
        setup_network_logging()
        
        # Navigate to the target page
        driver.get("https://www.espncricinfo.com/series/australia-vs-india-2024-25-1426547/australia-vs-india-2nd-test-1426556/ball-by-ball-commentary")
        
        # Wait for initial page load
        time.sleep(3)
        
        # Scroll the page to trigger lazy loading
        scroll_page()
        
        # Click dynamic buttons to load more content
        click_dynamic_buttons()
        
        # Final network response capture
        capture_network_responses()
        
    except Exception as e:
        print(f"Error in main scraping process: {e}")

def scroll_page():
    """Scroll the entire page to load dynamic content"""
    try:
        # Get initial page height
        page_height = driver.execute_script("return document.body.scrollHeight")
        
        # Scroll parameters
        scroll_increment = 200
        current_position = 0
        
        while current_position < page_height:
            # Scroll down
            driver.execute_script(f"window.scrollBy(0, {scroll_increment});")
            current_position += scroll_increment
            time.sleep(0.5)
            
            # Recalculate page height (in case of dynamic content)
            page_height = driver.execute_script("return document.body.scrollHeight")
            
            # Capture any network responses during scrolling
            capture_network_responses()
    
    except Exception as e:
        print(f"Error during page scrolling: {e}")

try:
    # Run the main scraping process
    main_scraping_process()
    
    # Save collected data
    with open('./dynamic_content_data.json', 'w') as f:
        # Convert set to list for JSON serialization
        collected_data['captured_urls'] = list(collected_data['captured_urls'])
        json.dump(collected_data, f, indent=4)
    
    print("\nScraping Summary:")
    print(f"Captured API Responses: {len(collected_data['api_responses'])}")
    print(f"Clicked Buttons: {collected_data['clicked_buttons']}")
    print(f"Unique URLs Captured: {len(collected_data['captured_urls'])}")

finally:
    # Cleanup
    driver.quit()


Scraping Summary:
Captured API Responses: 0
Clicked Buttons: []
Unique URLs Captured: 0


In [15]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import json
import time
import re

# Setup Chrome options
options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument("--disable-extensions")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--no-sandbox")
options.set_capability('goog:loggingPrefs', {'performance': 'ALL'})

# Initialize WebDriver
service = Service('/usr/bin/chromedriver')
driver = webdriver.Chrome(service=service, options=options)

# Dictionary to store different types of API URLs
api_url_types = {
    'comments_urls': set(),
    'other_comments_urls': set()
}

api_responses = []

def capture_and_categorize_urls():
    """Capture API URLs and categorize them."""
    logs = driver.get_log('performance')
    for entry in logs:
        try:
            log = json.loads(entry['message'])['message']
            if log.get('method') == 'Network.responseReceived':
                response_url = log.get('params', {}).get('response', {}).get('url', '')

                # Check for comments API URLs
                if 'hs-consumer-api.espncricinfo.com/v1/pages/match/comments' in response_url:
                    print(f"Captured Fetch Request URL: {response_url}")
                    request_id = log['params']['requestId']
                    response_body = driver.execute_cdp_cmd(
                        'Network.getResponseBody',
                        {'requestId': request_id}
                    )
                    # Add to responses list
                    api_responses.append({
                        "url": response_url,
                        "body": json.loads(response_body.get('body', '{}'))  # Parse JSON body
                    })

                    match = re.search(r'/comments/(\w+)', response_url)
                    if match:
                        if match.group(1) == 'default':
                            api_url_types['comments_urls'].add(response_url)
                        else:
                            api_url_types['other_comments_urls'].add(response_url)

        except Exception as e:
            print(f"Error capturing URL: {e}")
            continue

def check_for_buttons():
    """Check for buttons on the page."""
    try:
        buttons = driver.find_elements(By.TAG_NAME, 'button')
        print(f"Found {len(buttons)} buttons on the page.")
        for idx, button in enumerate(buttons):
            print(f"Button {idx + 1}:")
            print(f"  Text: {button.text}")
            print(f"  Enabled: {button.is_enabled()}")
            print(f"  Visible: {button.is_displayed()}")

        # Example: Interact with the first button if it exists
        if buttons:
            print("Clicking the first button...")
            buttons[0].click()
            time.sleep(2)  # Allow time to observe the result

    except Exception as e:
        print(f"Error checking buttons: {e}")

try:
    # Open the target webpage
    driver.get("https://www.espncricinfo.com/series/australia-vs-india-2024-25-1426547/australia-vs-india-2nd-test-1426556/ball-by-ball-commentary")

    # Allow time for initial page load
    time.sleep(3)

    # Check for buttons on the page
    check_for_buttons()

    # Setup scroll parameters
    scroll_increment = 200
    scroll_pause_time = 1
    page_height = driver.execute_script("return document.body.scrollHeight")
    current_position = 0

    while current_position < page_height:
        # Scroll down
        driver.execute_script(f"window.scrollBy(0, {scroll_increment});")
        current_position += scroll_increment
        time.sleep(scroll_pause_time)

        # Update page height
        page_height = driver.execute_script("return document.body.scrollHeight")

        # Capture URLs
        capture_and_categorize_urls()

    print("Scrolling and URL capture completed.")

    # Save the categorized URLs to a file
    with open('./comments_api_urls.json', 'w') as f:
        json.dump(api_responses, f, indent=4)
        json.dump({k: list(v) for k, v in api_url_types.items()}, f, indent=4)
        print("API URLs saved to 'comments_api_urls.json'.")

    # Print out the differences
    print("\nDifferences in Comments API URLs:")
    print("Default Comments URLs:", len(api_url_types['comments_urls']))
    print("Other Comments URLs:", len(api_url_types['other_comments_urls']))

finally:
    driver.quit()


Found 1 buttons on the page.
Button 1:
  Text: 
  Enabled: True
  Visible: False
Clicking the first button...
Error checking buttons: Message: element not interactable
  (Session info: chrome=131.0.6778.85)
Stacktrace:
#0 0x61ef4b9d38da <unknown>
#1 0x61ef4b4fd38d <unknown>
#2 0x61ef4b54c63c <unknown>
#3 0x61ef4b540265 <unknown>
#4 0x61ef4b56f412 <unknown>
#5 0x61ef4b53fba8 <unknown>
#6 0x61ef4b56f5de <unknown>
#7 0x61ef4b58d9e7 <unknown>
#8 0x61ef4b56f1b3 <unknown>
#9 0x61ef4b53de92 <unknown>
#10 0x61ef4b53ee6e <unknown>
#11 0x61ef4b9a233f <unknown>
#12 0x61ef4b9a645d <unknown>
#13 0x61ef4b990987 <unknown>
#14 0x61ef4b9a6bd1 <unknown>
#15 0x61ef4b97872e <unknown>
#16 0x61ef4b9c25e8 <unknown>
#17 0x61ef4b9c27ea <unknown>
#18 0x61ef4b9d2548 <unknown>
#19 0x7a427233dac3 <unknown>

Captured Fetch Request URL: https://hs-consumer-api.espncricinfo.com/v1/pages/match/comments?lang=en&seriesId=1426547&matchId=1426556&inningNumber=3&commentType=ALL&sortDirection=DESC
Error capturing URL: Messa

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Setup Chrome options
options = Options()
# options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument("--disable-extensions")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--no-sandbox")

# Initialize WebDriver
service = Service('/usr/bin/chromedriver')
driver = webdriver.Chrome(service=service, options=options)

def check_for_buttons():
    """Check for buttons on the page and interact with them."""
    try:
        # Locate all button elements
        # buttons = driver.find_element(By.CLASS_NAME, "ds-flex ds-items-center ds-space-x-2 ds-overflow-x-auto")
        buttons = driver.find_element(By.XPATH, "//span[contains(@class, 'ds-inline-flex') and .//span[text()='New']]")
        buttons.click()

        # innings_dropdown = driver.find_element(By.XPATH, "//span[text()='SL 2nd Innings']/ancestor::div[contains(@class, 'ds-popper-wrapper')]")
        # driver.execute_script("arguments[0].click();", innings_dropdown)

        innings_dropdown = driver.find_element(By.XPATH, "//div[contains(@class, 'ds-popper-wrapper') and .//span[contains(@class, 'ds-text-tight-s')]]")
        innings_dropdown.click()

        li_elements = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.XPATH, "//ul[contains(@class, 'ds-flex ds-flex-col')]//li"))
        )
        li_elements[3].click()

        print(innings_dropdown.text)


        # Find and click dropdown elements
        # full_commentary = driver.find_element(By.XPATH, "//span[contains(text(), 'Full commentary')]")
        # full_commentary.click()

        # JavaScript click method
        # buttons = driver.find_element(By.XPATH, "//span[contains(text(), 'New')]")
        # new_butto.click()

        # wait = WebDriverWait(driver, 10)  # Wait for a maximum of 10 seconds
        # buttons = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".ds-w-full.ds-flex")))
        # driver.execute_script("arguments[0].scrollIntoView();", button)
        # button.click()
        # buttons = driver.find_elements(By.TAG_NAME, 'button')
        # buttons = driver.find_element(By.XPATH, "//button[contains(@class, 'ds-w-full') and contains(@class, 'ds-flex')]")

        print(f"Found {len(buttons)} buttons on the page.")

        for idx, button in enumerate(buttons):
            print(f"Button {idx + 1}:")
            print(f"  Text: {button.text}")
            print(f"  Enabled: {button.is_enabled()}")
            print(f"  Visible: {button.is_displayed()}")

            # Interact with the first button
            if button.is_enabled() and button.is_displayed():
                print(f"Clicking Button {idx + 1}...")
                button.click()  # Simulate clicking the button
                time.sleep(2)  # Wait to see the result of the button click (optional)

                # Optionally: Wait for specific page updates after clicking
                WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "h1")))  # Example wait condition

                print("Button clicked, and page is updated!")

    except Exception as e:
        print(f"Error checking buttons: {e}")

try:
    # Open the target webpage
    driver.get("https://www.espncricinfo.com/series/australia-vs-india-2024-25-1426547/australia-vs-india-2nd-test-1426556/ball-by-ball-commentary")

    # Allow time for initial page load
    time.sleep(3)

    # Check for buttons on the page and interact with them
    check_for_buttons()

    # Setup scroll parameters (Optional, if you still want to scroll the page)
    scroll_increment = 200
    scroll_pause_time = 1
    page_height = driver.execute_script("return document.body.scrollHeight")
    current_position = 0

    while current_position < page_height:
        # Scroll down
        driver.execute_script(f"window.scrollBy(0, {scroll_increment});")
        current_position += scroll_increment
        time.sleep(scroll_pause_time)

        # Update page height
        page_height = driver.execute_script("return document.body.scrollHeight")

finally:
    driver.quit()
