<a href="https://colab.research.google.com/github/BakhturinaPolina/goodreads-romance-research/blob/main/scraping_ratings_information_expanded_romantic_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Step 1: Install Dependencies and Imports**

In [1]:
# Cell 1: Install dependencies and import libraries
# Explanation in comments: We install packages if not already present in Colab.
# This ensures everything runs smoothly. Selenium needs ChromeDriver setup for headless browsing.
# Updated: Using system chromedriver (no webdriver_manager) for Colab compatibility.
# Added try-except and more debug prints for troubleshooting.

# Install required packages (run this once per Colab session)
!pip install beautifulsoup4 requests pandas selenium tqdm
# Note: Removed webdriver_manager as it's not needed with system chromedriver

# For Selenium in Colab: Install Chrome and ChromeDriver
!apt-get update -qq  # Quiet update to avoid verbose output
!apt install -y -qq chromium-chromedriver  # Quiet install

# Ensure chromedriver is available as /usr/bin/chromedriver (Colab often has it there already).
# Three distinct situations are reported separately so the debug output is never misleading:
#   1. target already present   -> nothing to do (also avoids cp's "same file" error)
#   2. only the source present  -> copy it into /usr/bin
#   3. neither present          -> warn, because WebDriver start-up is likely to fail
import os
import shutil

chromedriver_path = '/usr/lib/chromium-browser/chromedriver'
chromedriver_target = '/usr/bin/chromedriver'

if os.path.exists(chromedriver_target):
    print("Debug: chromedriver already exists in /usr/bin. Skipping copy.")
elif os.path.exists(chromedriver_path):
    # shutil.copy preserves the permission bits, so the copy stays executable.
    shutil.copy(chromedriver_path, chromedriver_target)
    print(f"Debug: Copied chromedriver from {chromedriver_path} to {chromedriver_target}.")
else:
    print(f"Warning: chromedriver not found at {chromedriver_path} or {chromedriver_target}. "
          "WebDriver initialization may fail.")

# Import libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException
from tqdm import tqdm  # For progress bars
import json  # For handling JSON-like data (e.g., reviews)
import os  # For file operations
import sys  # For system paths (debug)

# Set up Selenium Chrome options for Colab (headless, no sandbox).
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')  # Run without a visible browser window
chrome_options.add_argument('--no-sandbox')  # Required for Colab
chrome_options.add_argument('--disable-dev-shm-usage')  # Avoid shared memory issues
chrome_options.add_argument('--disable-gpu')  # Extra stability for headless mode
# Chrome expects the size as "width,height"; the previous "1920x1080" form does not match that format.
chrome_options.add_argument('--window-size=1920,1080')
chrome_options.binary_location = '/usr/bin/chromium-browser'  # Point to the installed Chromium

# Debug: print the interpreter version and the browser paths used below, for troubleshooting.
print(f"Debug: Python version: {sys.version}")
print("Debug: Chromedriver path: /usr/bin/chromedriver")
print(f"Debug: Chromium binary: {chrome_options.binary_location}")

# If this cell is re-run in the same kernel, a browser from the previous run is still alive.
# Quit it first so repeated runs do not pile up orphaned Chromium processes.
_previous_driver = globals().get('driver')
if _previous_driver is not None:
    try:
        _previous_driver.quit()
        print("Debug: Closed WebDriver left over from a previous run of this cell.")
    except Exception as e:
        # Best effort only: a dead session must not block creating a fresh one.
        print(f"Debug: Could not close previous WebDriver cleanly: {e}")

# Initialize the WebDriver; a failure here is fatal for the rest of the notebook, so re-raise.
try:
    driver = webdriver.Chrome(options=chrome_options)  # Use system chromedriver (no service/manager)
except WebDriverException as e:
    print(f"Error: Failed to initialize WebDriver: {e}")
    raise
else:
    print("Debug: WebDriver initialized successfully.")

# Smoke test: load the Goodreads home page and show its title.
# A failure is only reported (not raised) so the remaining cells can still be defined.
try:
    driver.get('https://www.goodreads.com/')
    print(f"Debug: WebDriver test - Page title: {driver.title}")
    print("Debug: All dependencies installed and imported successfully. Ready to proceed.")
except Exception as e:
    print(f"Error: Test page load failed: {e}")

Collecting selenium
  Downloading selenium-4.34.2-py3-none-any.whl.metadata (7.5 kB)
Collecting trio~=0.30.0 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting outcome (from trio~=0.30.0->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.12.2->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.34.2-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.30.0-py3-none-any.whl (499 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m499.2/499.2 kB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio_websocket-0.12.2-py3-none-any.whl (21 kB)
Downloading outcome-1.3.0.post

**Step 2: Define Subgenres and Global Variables**

In [2]:
# Cell 2: Define subgenres, URLs, and global configurations
# Explanation: Here we list the subgenres and their Goodreads URLs as provided.
# We also set configurable variables for scraping limits, delays, etc.
# This makes the code flexible—e.g., change MIN_BOOKS_PER_SUBGENRE for testing.

# Mapping of subgenre display name -> Goodreads listing URL.
# Two URL families are present: "/shelf/show/..." (shelf pages) and "/genres/..." (genre pages).
subgenres = {
    "Contemporary Romance": "https://www.goodreads.com/shelf/show/contemporary-romance",
    "Historical Romance": "https://www.goodreads.com/shelf/show/historical-romance",
    "Paranormal Romance": "https://www.goodreads.com/shelf/show/paranormal-romance",
    "Romantic Suspense": "https://www.goodreads.com/shelf/show/romantic-suspense",
    "Romantic Fantasy": "https://www.goodreads.com/genres/fantasy-romance",
    "Science Fiction Romance": "https://www.goodreads.com/genres/science-fiction-romance"
}

# Configurable scraping limits
MIN_BOOKS_PER_SUBGENRE = 200  # Target minimum; the book-list scraper only prints a warning when it collects fewer
MAX_BOOKS_PER_SUBGENRE = 300  # Hard cap; the book-list scraper stops once this many unique books are collected
MAX_REVIEWS_PER_BOOK = 200  # Cap for reviews if >200; set to None for no cap
ALL_REVIEWS = False  # Set to True to scrape ALL reviews regardless of count (warning: can be slow!)
DELAY_MIN = 2  # Lower bound (seconds) of the random pause between page loads
DELAY_MAX = 5  # Upper bound (seconds); the pause is drawn uniformly between the two bounds

# User-agent header for HTTP requests (to avoid blocks; rotate if needed).
# NOTE(review): not used by the Selenium-based book-list scraper; presumably consumed by later cells.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# Accumulator for all scraped book dicts (to be converted to a DataFrame later).
all_books = []

# Debug prints: echo the configuration so each run's output records the settings used.
print("Debug: Subgenres defined:")
for genre, url in subgenres.items():
    print(f"  - {genre}: {url}")
print(f"Debug: Config - Books per subgenre: {MIN_BOOKS_PER_SUBGENRE} to {MAX_BOOKS_PER_SUBGENRE}")
print(f"Debug: Config - Max reviews per book: {MAX_REVIEWS_PER_BOOK} (all_reviews={ALL_REVIEWS})")
print(f"Debug: Config - Delays between requests: {DELAY_MIN}-{DELAY_MAX} seconds")
print("Debug: Ready to scrape book lists.")

Debug: Subgenres defined:
  - Contemporary Romance: https://www.goodreads.com/shelf/show/contemporary-romance
  - Historical Romance: https://www.goodreads.com/shelf/show/historical-romance
  - Paranormal Romance: https://www.goodreads.com/shelf/show/paranormal-romance
  - Romantic Suspense: https://www.goodreads.com/shelf/show/romantic-suspense
  - Romantic Fantasy: https://www.goodreads.com/genres/fantasy-romance
  - Science Fiction Romance: https://www.goodreads.com/genres/science-fiction-romance
Debug: Config - Books per subgenre: 200 to 300
Debug: Config - Max reviews per book: 200 (all_reviews=False)
Debug: Config - Delays between requests: 2-5 seconds
Debug: Ready to scrape book lists.


**Step 3: Function to Scrape Book Lists from Subgenre Pages**

In [6]:
# Cell 3: Function to scrape book lists from a subgenre URL (Using Selenium for pagination)
# Explanation: This function uses Selenium to load the subgenre page, extract books, and click "next" for pagination.
# It collects unique books by tracking book_ids in a set.
# Selenium handles dynamic content and proper pagination better than requests (e.g., avoids repeats by simulating clicks).
# Note: Driving a browser is slower than fetching pages with requests and parsing them with BS4, so we use waits and delays. Run with care to avoid bans.
# Assumes global 'driver' from Cell 1.

def _extract_book_fields(item):
    """Return (title, author, full_url, book_id) parsed from one listing element; missing parts are None."""
    title_tag = item.find('a', class_='bookTitle')
    author_tag = item.find('a', class_='authorName')
    title = title_tag.text.strip() if title_tag else None
    author = author_tag.text.strip() if author_tag else None

    # .get() rather than ['href']: an anchor without an href must be treated as an
    # invalid entry, not abort the whole scrape with a KeyError.
    href = title_tag.get('href') if title_tag else None
    if not href:
        return title, author, None, None

    clean_href = href.split('?')[0]  # Drop tracking/query parameters
    # Only prefix the site root for relative links; an absolute link is kept as-is.
    if clean_href.startswith('http'):
        full_url = clean_href
    else:
        full_url = f"https://www.goodreads.com{clean_href}"

    id_match = re.search(r'/show/(\d+)', href)
    book_id = id_match.group(1) if id_match else None
    return title, author, full_url, book_id


def scrape_subgenre_books(genre, base_url):
    """
    Scrape book details from a subgenre's paginated list using Selenium.

    Relies on the notebook globals `driver`, `DELAY_MIN`, `DELAY_MAX`,
    `MIN_BOOKS_PER_SUBGENRE` and `MAX_BOOKS_PER_SUBGENRE`.

    Args:
        genre (str): Subgenre name (e.g., "Contemporary Romance")
        base_url (str): The starting URL for the subgenre
    Returns:
        list: List of dicts with book info (book_id, title, author, url, subgenre).
              Entries are unique by book_id; at most MAX_BOOKS_PER_SUBGENRE are returned.
              Selenium errors are logged and whatever was collected so far is returned.
    """
    collected_books = []  # Books gathered for this subgenre
    seen_ids = set()  # book_ids already collected, used to skip duplicates
    page = 1  # Page counter, for debug output only

    # Shelf pages and genre pages wrap each book in a different container class.
    # The same class must be used for the wait and for the parse; waiting for the
    # shelf class on a genre page would time out before anything is extracted.
    item_class = 'elementList' if "shelf/show" in base_url else 'grid-item'

    print(f"Debug: Starting Selenium scrape for {genre} at {base_url}")

    try:
        # Load the initial page
        driver.get(base_url)
        time.sleep(random.uniform(DELAY_MIN, DELAY_MAX))  # Initial delay for page load
        print(f"Debug: Loaded initial page: {driver.current_url}")

        while len(collected_books) < MAX_BOOKS_PER_SUBGENRE:
            # Wait until at least one book container is present in the DOM
            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, item_class))
                )
                print(f"Debug: Page {page} loaded successfully.")
            except TimeoutException:
                print(f"Debug: Timeout waiting for book items on page {page}. Stopping.")
                break

            # Parse the rendered page with BeautifulSoup for easier extraction
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            book_items = soup.find_all('div', class_=item_class)

            if not book_items:
                print(f"Debug: No book items found on page {page}. Stopping.")
                break

            print(f"Debug: Found {len(book_items)} book items on page {page}")

            added_this_page = 0
            for item in book_items:
                title, author, full_url, book_id = _extract_book_fields(item)

                # Add if valid and unique
                if book_id and title and author and book_id not in seen_ids:
                    seen_ids.add(book_id)
                    collected_books.append({
                        'book_id': book_id,
                        'title': title,
                        'author': author,
                        'url': full_url,
                        'subgenre': genre
                    })
                    print(f"Debug: Added book - ID: {book_id}, Title: {title[:50]}..., Author: {author}, URL: {full_url}")
                    added_this_page += 1
                elif book_id in seen_ids:
                    print(f"Debug: Skipped duplicate book ID: {book_id}")
                else:
                    print("Debug: Skipped invalid book (missing title/author/ID)")

                # Stop if max reached
                if len(collected_books) >= MAX_BOOKS_PER_SUBGENRE:
                    print(f"Debug: Reached max books ({MAX_BOOKS_PER_SUBGENRE}) for {genre}.")
                    break

            # Quota met: leave now instead of requesting one more page that would be discarded.
            if len(collected_books) >= MAX_BOOKS_PER_SUBGENRE:
                break

            # A page that yields nothing new means pagination is looping or exhausted.
            if added_this_page == 0:
                print(f"Debug: No new unique books on page {page}. Stopping.")
                break

            # Find and click the "next" link for pagination
            try:
                next_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, "//a[@class='next_page' and not(contains(@class, 'disabled'))]"))
                )
                driver.execute_script("arguments[0].scrollIntoView(true);", next_button)  # Scroll to button
                next_button.click()
                print(f"Debug: Clicked 'next' button. Moving to page {page + 1}")
                time.sleep(random.uniform(DELAY_MIN, DELAY_MAX))  # Delay after click
                page += 1
            except TimeoutException:
                print(f"Debug: No clickable 'next' button found on page {page}. End of list.")
                break
            except WebDriverException as e:
                print(f"Error: Failed to click next: {e}. Stopping.")
                break

            print(f"Debug: Unique books collected so far for {genre}: {len(collected_books)}")

    except WebDriverException as e:
        # Keep whatever was collected before the browser error.
        print(f"Error: Selenium error for {genre}: {e}")

    # The minimum is advisory: falling short only produces a warning.
    if len(collected_books) < MIN_BOOKS_PER_SUBGENRE:
        print(f"Warning: Only collected {len(collected_books)} unique books for {genre} (less than min {MIN_BOOKS_PER_SUBGENRE})")

    print(f"Debug: Finished scraping {genre}. Total unique books: {len(collected_books)}")
    return collected_books

# Smoke test of the scraper: collect a single subgenre into a freshly emptied all_books.
test_genre = "Contemporary Romance"
all_books = []  # Start from an empty list so repeated runs do not accumulate
all_books += scrape_subgenre_books(test_genre, subgenres[test_genre])
print(f"Debug: Total books after test scrape: {len(all_books)}")

Debug: Starting Selenium scrape for Contemporary Romance at https://www.goodreads.com/shelf/show/contemporary-romance
Debug: Loaded initial page: https://www.goodreads.com/shelf/show/contemporary-romance
Debug: Timeout waiting for book items on page 1. Stopping.
Debug: Finished scraping Contemporary Romance. Total unique books: 0
Debug: Total books after test scrape: 0
