<a href="https://colab.research.google.com/github/BakhturinaPolina/goodreads-romance-research/blob/main/scraping_ratings_information_expanded_romantic_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Step 1: Install Dependencies and Imports**

In [9]:
# Cell 1: Install dependencies and import libraries (Updated with anti-detection for Selenium)
# Explanation in comments: We install packages if not already present in Colab.
# This ensures everything runs smoothly. Selenium needs ChromeDriver setup for headless browsing.
# Updated: Added options to hide bot detection (e.g., disable automation flags, custom user-agent) to avoid empty pages on Goodreads.

# Install required packages (run this once per Colab session)
!pip install beautifulsoup4 requests pandas selenium tqdm
# Note: Removed webdriver_manager as it's not needed with system chromedriver

# For Selenium in Colab: Install Chrome and ChromeDriver
!apt-get update -qq  # Quiet update to avoid verbose output
!apt install -y -qq chromium-chromedriver  # Quiet install

# Ensure chromedriver is in /usr/bin (Colab often has it here already)
import os
chromedriver_path = '/usr/lib/chromium-browser/chromedriver'
if os.path.exists(chromedriver_path) and not os.path.exists('/usr/bin/chromedriver'):
    !cp {chromedriver_path} /usr/bin
else:
    print("Debug: chromedriver already exists in /usr/bin or source path. Skipping copy.")

# Import libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException
from tqdm import tqdm  # For progress bars
import json  # For handling JSON-like data (e.g., reviews)
import os  # For file operations
import sys  # For system paths (debug)

# Set up Selenium Chrome options for Colab (headless, no sandbox) with anti-detection
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')  # Run without visible browser window
chrome_options.add_argument('--no-sandbox')  # Required for Colab
chrome_options.add_argument('--disable-dev-shm-usage')  # Avoid shared memory issues
chrome_options.add_argument('--disable-gpu')  # Extra stability for headless mode
chrome_options.add_argument('window-size=1920x1080')  # Set a reasonable window size
chrome_options.binary_location = '/usr/bin/chromium-browser'  # Point to the installed Chromium

# Anti-detection options (to avoid bot blocks and empty pages)
chrome_options.add_argument('--disable-blink-features=AutomationControlled')  # Hide Selenium flag
chrome_options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')  # Mimic real browser user-agent
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])  # Exclude automation switches
chrome_options.add_experimental_option('useAutomationExtension', False)  # Disable automation extension

# Debug: Print system paths for troubleshooting
print(f"Debug: Python version: {sys.version}")
print(f"Debug: Chromedriver path: /usr/bin/chromedriver")
print(f"Debug: Chromium binary: {chrome_options.binary_location}")

# Initialize the WebDriver with try-except for error handling
try:
    driver = webdriver.Chrome(options=chrome_options)  # Use system chromedriver (no service/manager)
    print("Debug: WebDriver initialized successfully.")
except WebDriverException as e:
    print(f"Error: Failed to initialize WebDriver: {e}")
    raise  # Re-raise to stop if critical

# Debug print: Confirm WebDriver is set up by loading a test page
try:
    driver.get('https://www.goodreads.com/')  # Test with Goodreads home (as in your code)
    print(f"Debug: WebDriver test - Page title: {driver.title}")  # Should print "Goodreads | Meet your next favorite book"
    print(f"Debug: Test page source sample: {driver.page_source[:500]}")  # Print sample to verify content
    print("Debug: All dependencies installed and imported successfully. Ready to proceed.")
except Exception as e:
    print(f"Error: Test page load failed: {e}")

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
chromium-chromedriver is already the newest version (1:85.0.4183.83-0ubuntu2.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 36 not upgraded.
Debug: chromedriver already exists in /usr/bin or source path. Skipping copy.
Debug: Python version: 3.11.13 (main, Jun  4 2025, 08:57:29) [GCC 11.4.0]
Debug: Chromedriver path: /usr/bin/chromedriver
Debug: Chromium binary: /usr/bin/chromium-browser
Debug: WebDriver initialized successfully.
Debug: WebDriver test - Page title: Goodreads | Meet your next favorite book
Debug: Test page source sample: <html class="desktop withSiteHeaderTopFullImage
 picture es5array es5date es5function es5object strictmode es5string json es5syntax es5undefined es5 no-touchevents cssanimations flexbox flexwrap csstransforms localstorage"><head><script src="https://rules.qu

**Step 2: Define Subgenres and Global Variables**

In [12]:
# Cell 2: Define subgenres, URLs, and global configurations
# Explanation: Here we list the subgenres and their Goodreads URLs as provided.
# We also set configurable variables for scraping limits, delays, etc.
# This makes the code flexible—e.g., change MIN_BOOKS_PER_SUBGENRE for testing.

# List of subgenres and their shelf/genre URLs
subgenres = {
    "Contemporary Romance": "https://www.goodreads.com/shelf/show/contemporary-romance",
    "Historical Romance": "https://www.goodreads.com/shelf/show/historical-romance",
    "Paranormal Romance": "https://www.goodreads.com/shelf/show/paranormal-romance",
    "Romantic Suspense": "https://www.goodreads.com/shelf/show/romantic-suspense",
    "Romantic Fantasy": "https://www.goodreads.com/genres/fantasy-romance",
    "Science Fiction Romance": "https://www.goodreads.com/genres/science-fiction-romance"
}

# Configurable scraping limits
MIN_BOOKS_PER_SUBGENRE = 200  # Minimum to collect (we'll stop if we reach this and can't get more)
MAX_BOOKS_PER_SUBGENRE = 300  # Maximum to aim for (if available on pages)
MAX_REVIEWS_PER_BOOK = 200  # Cap for reviews if >200; set to None for no cap
ALL_REVIEWS = False  # Set to True to scrape ALL reviews regardless of count (warning: can be slow!)
DELAY_MIN = 2  # Minimum delay between requests (seconds)
DELAY_MAX = 5  # Maximum delay (for randomness to mimic human behavior)

# User-agent for requests (to avoid blocks; rotate if needed)
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Empty list to store all scraped book data (will convert to DataFrame later)
all_books = []

# Debug prints: Show configurations
print("Debug: Subgenres defined:")
for genre, url in subgenres.items():
    print(f"  - {genre}: {url}")
print(f"Debug: Config - Books per subgenre: {MIN_BOOKS_PER_SUBGENRE} to {MAX_BOOKS_PER_SUBGENRE}")
print(f"Debug: Config - Max reviews per book: {MAX_REVIEWS_PER_BOOK} (all_reviews={ALL_REVIEWS})")
print(f"Debug: Config - Delays between requests: {DELAY_MIN}-{DELAY_MAX} seconds")
print("Debug: Ready to scrape book lists.")

Debug: Subgenres defined:
  - Contemporary Romance: https://www.goodreads.com/shelf/show/contemporary-romance
  - Historical Romance: https://www.goodreads.com/shelf/show/historical-romance
  - Paranormal Romance: https://www.goodreads.com/shelf/show/paranormal-romance
  - Romantic Suspense: https://www.goodreads.com/shelf/show/romantic-suspense
  - Romantic Fantasy: https://www.goodreads.com/genres/fantasy-romance
  - Science Fiction Romance: https://www.goodreads.com/genres/science-fiction-romance
Debug: Config - Books per subgenre: 200 to 300
Debug: Config - Max reviews per book: 200 (all_reviews=False)
Debug: Config - Delays between requests: 2-5 seconds
Debug: Ready to scrape book lists.


**Step 3: Function to Scrape Book Lists from Subgenre Pages**

In [13]:
# Cell 3: Function to scrape book lists from a subgenre URL (Using Selenium for pagination)
# Explanation: This function uses Selenium to load the subgenre page, extract books, and click "next" for pagination.
# It collects unique books by tracking book_ids in a set.
# Selenium handles dynamic content and proper pagination better than requests (e.g., avoids repeats by simulating clicks).
# Note: This is slower than BS4, so we use waits and delays. Run with care to avoid bans.
# Assumes global 'driver' from Cell 1. We parse with BeautifulSoup after loading for easier tag finding.
# We scroll to the next button and use execute_script for reliable clicks in headless mode.
# Adapted from your provided code: Added handling for potential overlays (wait for invisibility), JS click fallback,
# longer timeouts, anti-detection options (user-agent), and page source print on timeout for debugging.
# New: Added a retry on timeout (up to 2 times), and an initial test load of Goodreads home for verification.

def scrape_subgenre_books(genre, base_url):
    """
    Scrape book details from a subgenre's paginated list using Selenium.
    Args:
        genre (str): Subgenre name (e.g., "Contemporary Romance")
        base_url (str): The starting URL for the subgenre
    Returns:
        list: List of delegation dicts with book info (title, author, url, book_id, subgenre)
    """
    collected_books = []  # List to hold books for this subgenre (each a dict with details)
    seen_ids = set()  # Set to track unique book_ids and skip duplicates (prevents adding the same book multiple times)
    current_url = base_url  # Start with the base URL; we'll update it as we paginate
    page = 1  # Track page number for debugging and logging
    max_retries = 2  # Number of retries on load timeout
    print(f"Debug: Starting Selenium scrape for {genre} at {base_url}")  # Debug print to show start

    try:
        # Test load: Try loading Goodreads home first to verify driver (as in your code)
        driver.get('https://www.goodreads.com/')  # Load home page
        time.sleep(3)  # Short delay
        home_title = driver.title  # Get title
        print(f"Debug: Test load - Goodreads home title: {home_title}")  # Should be "Goodreads | Meet your next favorite book"
        if "Goodreads" not in home_title:
            print("Warning: Test load failed - Driver may not be loading content properly.")  # Alert if test fails

        # Now load the actual subgenre page
        driver.get(current_url)
        time.sleep(random.uniform(3, 6))  # Slightly longer initial delay for full load (adjusted from your code's time.sleep(2))
        print(f"Debug: Loaded/initial page: {driver.current_url}")  # Confirm the page loaded
        print(f' Debug: Initial page source sample: {driver.page_source[:500]}')  # Print sample to verify content (new debug)

        # Main loop: Continue until we have enough books or no more pages
        while len(collected_books) < MAX_BOOKS_PER_SUBGENRE:
            retry_count = 0  # Reset retry counter for this page
            page_loaded = False  # Flag for successful load
            while retry_count < max_retries and not page_loaded:
                # Wait for book items to appear on the page (Goodreads loads dynamically)
                try:
                    # Increased timeout to 30s for slower loads; wait for presence of elementList
                    WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CLASS_NAME, 'elementList')))  # Wait for at least one book container
                    print(f"Debug: Page {page} loaded successfully on attempt {retry_count + 1}.")  # Success message
                    page_loaded = True  # Flag as loaded
                except TimeoutException:
                    retry_count += 1  # Increment retry
                    print(f"Debug: Timeout on page {page}, attempt {retry_count}. Retrying...")  # Log retry
                    time.sleep(5)  # Wait before retry
                    driver.refresh()  # Refresh the page on retry
                    if retry_count >= max_retries:
                        print(f"Debug: Max retries exceeded for page {page}. Printing page source for diagnosis...")  # Log failure
                        print(driver.page_source[:1000])  # Print first 1000 chars of HTML to inspect
                        print("Debug: Stopping due to persistent timeout.")  # Stop the loop
                        return collected_books  # Early return if failed

            if not page_loaded:
                break  # Exit if still not loaded after retries

            # Check for and handle potential overlays (adapted from your extract_characters function)
            try:
                WebDriverWait(driver, 10).until(EC.invisibility_of_element_located((By.XPATH, "//span[@tabindex='-1']")))  # Wait for any overlay to disappear
                print("Debug: Overlay handled (invisible).")  # Log success
            except TimeoutException:
                print("Debug: No overlay found or it didn't disappear; proceeding anyway.")  # Not critical, continue

            # Parse the current page source with BeautifulSoup (easier for complex HTML extraction than pure Selenium)
            soup = BeautifulSoup(driver.page_source, 'html.parser')  # Get the full HTML after JS loads

            # Find all book items on the page (adjust selector based on page type)
            if "shelf/show" in base_url:
                book_items = soup.find_all('div', class_='elementList')  # For shelf pages like Contemporary Romance
            else:
                book_items = soup.find_all('div', class_='grid-item')  # For genre pages (e.g., Romantic Fantasy); adjust if structure differs

            # If no items found, stop the loop
            if not book_items:
                print(f"Debug: No book items found on page {page}. Stopping.")  # Debug if selector fails
                break

            print(f"Debug: Found {len(book_items)} book items on page {page}")  # Show how many potential books found

            added_this_page = 0  # Counter for new books added this page (to detect if we're at the end)
            for item in book_items:  # Loop through each potential book element
                # Extract title from the item
                title_tag = item.find('a', class_='bookTitle')  # Find the <a> tag with class 'bookTitle'
                title = title_tag.text.strip() if title_tag else None  # Strip whitespace; None if not found

                # Extract author
                author_tag = item.find('a', class_='authorName')  # Find the author link
                author = author_tag.text.strip() if author_tag else None  # Strip and handle missing

                # Extract partial URL and book_id
                if title_tag and title_tag['href']:  # Check if tag exists and has href
                    url_partial = title_tag['href']  # Get the relative URL
                    full_url = f"https://www.goodreads.com{url_partial.split('?')[0]}"  # Build full clean URL (remove query params)
                    book_id_match = re.search(r'/show/(\d+)', url_partial)  # Regex to extract numeric ID from URL
                    book_id = book_id_match.group(1) if book_id_match else None  # Get the ID or None
                else:
                    full_url = None
                    book_id = None

                # Add the book if it's valid (has ID, title, author) and not already seen
                if book_id and title and author and book_id not in seen_ids:
                    seen_ids.add(book_id)  # Mark as seen
                    collected_books.append({  # Add dict to list
                        'book_id': book_id,
                        'title': title,
                        'author': author,
                        'url': full_url,
                        'subgenre': genre
                    })
                    print(f"Debug: Added book - ID: {book_id}, Title: {title[:50]}..., Author: {author}, URL: {full_url}")  # Truncate title for readability
                    added_this_page += 1  # Increment counter
                elif book_id in seen_ids:
                    print(f"Debug: Skipped duplicate book ID: {book_id}")  # Log duplicates
                else:
                    print("Debug: Skipped invalid book (missing title/author/ID)")  # Log invalid items

                # Check if we've reached the max books limit
                if len(collected_books) >= MAX_BOOKS_PER_SUBGENRE:
                    print(f"Debug: Reached max books ({MAX_BOOKS_PER_SUBGENRE}) for {genre}.")
                    break  # Exit the for loop

            # If no new books were added this page, likely end of unique content
            if added_this_page == 0:
                print(f"Debug: No new unique books on page {page}. Stopping.")
                break

            # Find and click the "next" button to go to the next page (adapted with fallback from your code)
            try:
               ーミ next_button = WebDriverWait(driver, 10).until(  # Wait up to 10s for clickable button
                    EC.element_to_be_clickable((By.XPATH, "//a[@class='next_page' and not(contains(@class, 'disabled'))]"))  # XPath for non.disabled next button
                )
                driver.execute_script("arguments[0].scrollIntoView(true);", next_button)  # Scroll to ensure it's visible (helps in headless)
                try:
                    next_button.click()  # Try direct click
                except:
                    print("Debug: Direct click failed; trying JS click (possible overlay).")  # Fallback if direct fails
                    driver.execute_script("arguments[0].click();", next_button)  # JS click as in your code
                print(f"Debug: Clicked 'next' button. Moving to page {page + 1}")  # Confirm action
                time.sleep(random.uniform(DELAY_MIN, DELAY_MAX))  # Delay after click to let new page load
                page += 1  # Increment page counter
            except TimeoutException:
                print(f"Debug: No clickable 'next' button found on page {page}. End of list.")  # If no button, stop
                break
            except WebDriverException as e:
                print(f"Error: Failed to click next: {e}. Stopping.")  # Handle click errors
                break

            print(f"Debug: Unique books collected so far for {genre}: {len(collected_books)}")  # Progress update

    except WebDriverException as e:
        print(f"Error: Selenium error for {genre}: {e}")  # Catch general Selenium errors

    # Check if we collected enough (warn if below minimum)
    if len(collected_books) < MIN_BOOKS_PER_SUBGENRE:
        print(f"Warning: Only collected {len(collected_books)} unique books for {genre} (less than min {MIN_BOOKS_PER_SUBGENRE})")

    print(f"Debug: Finished scraping {genre}. Total unique books: {len(collected_books)}")  # Final message
    return collected_books  # Return the list of books

# Example usage (for testing): Scrape one subgenre and add to all_books
# Reset all_books for fresh test; change test_genre if needed
all_books = []  # Reset for testing
test_genre = "Contemporary Romance"
all_books.extend(scrape_subgenre_books(test_genre, subgenres[test_genre]))  # Call the function and add results
print(f"Debug: Total books after test scrape: {len(all_books)}")  # Show total collected

SyntaxError: invalid syntax (ipython-input-2536797224.py, line 140)