<a href="https://colab.research.google.com/github/BakhturinaPolina/goodreads-romance-research/blob/main/scraping_ratings_information_expanded_romantic_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Step 1: Install Dependencies and Imports**

In [1]:
# Cell 1: Install dependencies and import libraries
# Explanation in comments: We install packages if not already present in Colab.
# This ensures everything runs smoothly. Selenium needs ChromeDriver setup for headless browsing.
# Updated: Using system chromedriver (no webdriver_manager) for Colab compatibility.
# Added try-except and more debug prints for troubleshooting.

# Install required packages (run this once per Colab session)
!pip install beautifulsoup4 requests pandas selenium tqdm
# Note: Removed webdriver_manager as it's not needed with system chromedriver

# For Selenium in Colab: Install Chrome and ChromeDriver
!apt-get update -qq  # Quiet update to avoid verbose output
!apt install -y -qq chromium-chromedriver  # Quiet install

# Ensure chromedriver is in /usr/bin (Colab often has it here already)
# We check if it exists to avoid the "same file" error
import os
chromedriver_path = '/usr/lib/chromium-browser/chromedriver'
if os.path.exists(chromedriver_path) and not os.path.exists('/usr/bin/chromedriver'):
    !cp {chromedriver_path} /usr/bin
else:
    print("Debug: chromedriver already exists in /usr/bin or source path. Skipping copy.")

# Import libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException
from tqdm import tqdm  # For progress bars
import json  # For handling JSON-like data (e.g., reviews)
import os  # For file operations
import sys  # For system paths (debug)

# Build the Chrome options used for headless browsing inside Colab.
chrome_options = webdriver.ChromeOptions()
for chrome_flag in (
    '--headless',               # Run without visible browser window
    '--no-sandbox',             # Required for Colab
    '--disable-dev-shm-usage',  # Avoid shared memory issues
    '--disable-gpu',            # Extra stability for headless mode
    'window-size=1920x1080',    # Set a reasonable window size
):
    chrome_options.add_argument(chrome_flag)
# Point Selenium at the installed Chromium binary.
chrome_options.binary_location = '/usr/bin/chromium-browser'

# Print environment details that help when troubleshooting driver start-up.
print(f"Debug: Python version: {sys.version}")
print(f"Debug: Chromedriver path: /usr/bin/chromedriver")
print(f"Debug: Chromium binary: {chrome_options.binary_location}")

# Start the browser with the system chromedriver (no service object / driver manager).
# A failure here is fatal for the rest of the notebook, so it is reported and re-raised.
try:
    driver = webdriver.Chrome(options=chrome_options)
except WebDriverException as init_error:
    print(f"Error: Failed to initialize WebDriver: {init_error}")
    raise
else:
    print("Debug: WebDriver initialized successfully.")

# Smoke test: load the Goodreads home page and show its title.
# Failures are only reported (not re-raised) so the cell still completes.
try:
    driver.get('https://www.goodreads.com/')
    page_title = driver.title  # Should be "Goodreads | Meet your next favorite book" or similar
    print(f"Debug: WebDriver test - Page title: {page_title}")
    print("Debug: All dependencies installed and imported successfully. Ready to proceed.")
except Exception as load_error:
    print(f"Error: Test page load failed: {load_error}")

Collecting selenium
  Downloading selenium-4.34.2-py3-none-any.whl.metadata (7.5 kB)
Collecting trio~=0.30.0 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting outcome (from trio~=0.30.0->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.12.2->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.34.2-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m99.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.30.0-py3-none-any.whl (499 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m499.2/499.2 kB[0m [31m41.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio_websocket-0.12.2-py3-none-any.whl (21 kB)
Downloading outcome-1.3.0.post

**Step 2: Define Subgenres and Global Variables**

In [2]:
# Cell 2: Define subgenres, URLs, and global configurations
# Explanation: Here we list the subgenres and their Goodreads URLs as provided.
# We also set configurable variables for scraping limits, delays, etc.
# This makes the code flexible—e.g., change MIN_BOOKS_PER_SUBGENRE for testing.

# Subgenre name -> Goodreads listing URL.
# The first four entries are shelf pages, the last two are genre pages.
_GOODREADS_ROOT = "https://www.goodreads.com"
subgenres = {
    name: f"{_GOODREADS_ROOT}/{path}"
    for name, path in [
        ("Contemporary Romance", "shelf/show/contemporary-romance"),
        ("Historical Romance", "shelf/show/historical-romance"),
        ("Paranormal Romance", "shelf/show/paranormal-romance"),
        ("Romantic Suspense", "shelf/show/romantic-suspense"),
        ("Romantic Fantasy", "genres/fantasy-romance"),
        ("Science Fiction Romance", "genres/science-fiction-romance"),
    ]
}

# Scraping limits (tune these for quick test runs).
# MIN: warn if fewer books than this were collected; MAX: stop collecting at this many.
MIN_BOOKS_PER_SUBGENRE, MAX_BOOKS_PER_SUBGENRE = 200, 300
# Cap on reviews per book; set to None for no cap.
MAX_REVIEWS_PER_BOOK = 200
# Set to True to scrape ALL reviews regardless of count (warning: can be slow!).
ALL_REVIEWS = False
# Bounds, in seconds, of the random pause between requests.
DELAY_MIN, DELAY_MAX = 2, 5

# Browser-like User-Agent sent with every requests call (rotate if needed).
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 Safari/537.36'
    )
}

# Accumulator for every scraped book record (converted to a DataFrame later).
all_books = []

# Echo the active configuration so the run is self-documenting.
print("Debug: Subgenres defined:")
for genre, url in subgenres.items():
    print(f"  - {genre}: {url}")
print(f"Debug: Config - Books per subgenre: {MIN_BOOKS_PER_SUBGENRE} to {MAX_BOOKS_PER_SUBGENRE}")
print(f"Debug: Config - Max reviews per book: {MAX_REVIEWS_PER_BOOK} (all_reviews={ALL_REVIEWS})")
print(f"Debug: Config - Delays between requests: {DELAY_MIN}-{DELAY_MAX} seconds")
print("Debug: Ready to scrape book lists.")

Debug: Subgenres defined:
  - Contemporary Romance: https://www.goodreads.com/shelf/show/contemporary-romance
  - Historical Romance: https://www.goodreads.com/shelf/show/historical-romance
  - Paranormal Romance: https://www.goodreads.com/shelf/show/paranormal-romance
  - Romantic Suspense: https://www.goodreads.com/shelf/show/romantic-suspense
  - Romantic Fantasy: https://www.goodreads.com/genres/fantasy-romance
  - Science Fiction Romance: https://www.goodreads.com/genres/science-fiction-romance
Debug: Config - Books per subgenre: 200 to 300
Debug: Config - Max reviews per book: 200 (all_reviews=False)
Debug: Config - Delays between requests: 2-5 seconds
Debug: Ready to scrape book lists.


**Step 3: Function to Scrape Book Lists from Subgenre Pages**

In [4]:
# Cell 3: Function to scrape book lists from a subgenre URL (Updated)
# Explanation: This function paginates through a subgenre's shelf/genre pages,
# extracts book title, author, URL, and ID, and collects up to MAX_BOOKS_PER_SUBGENRE.
# Uses requests + BeautifulSoup for static HTML parsing (faster than Selenium).
# Updated: Better selectors for shelf vs. genre pages, more debug prints, improved book_id regex.
# Adds random delays to avoid rate-limiting.

def _parse_book_item(item, genre):
    """Convert one listing element into a book dict, or return None if key data is missing."""
    title_tag = item.find('a', class_='bookTitle')
    author_tag = item.find('a', class_='authorName')
    if title_tag is None or author_tag is None:
        return None

    title = title_tag.text.strip()
    author = author_tag.text.strip()
    # .get() returns None (instead of raising KeyError) for an anchor without an href
    url_partial = title_tag.get('href')
    if not (title and author and url_partial):
        return None

    # Book ID is the run of digits after /show/ in e.g. /book/show/12345-title
    book_id_match = re.search(r'/show/(\d+)', url_partial)
    if book_id_match is None:
        return None

    return {
        'book_id': book_id_match.group(1),
        'title': title,
        'author': author,
        'url': f"https://www.goodreads.com{url_partial.split('?')[0]}",  # Drop query string
        'subgenre': genre
    }


def scrape_subgenre_books(genre, base_url, timeout=30):
    """
    Scrape book details from a subgenre's paginated list.

    Pages are fetched with requests and parsed with BeautifulSoup until
    MAX_BOOKS_PER_SUBGENRE unique books are collected, a page yields nothing
    new, a page looks like the last one, or a request fails.

    Args:
        genre (str): Subgenre name (e.g., "Contemporary Romance")
        base_url (str): The starting URL for the subgenre
        timeout (float): Seconds to wait for each HTTP request before giving up
    Returns:
        list: List of dicts with book info (book_id, title, author, url, subgenre),
              containing each book_id at most once
    """
    collected_books = []  # Unique books for this subgenre, in discovery order
    seen_book_ids = set()  # IDs already collected, used to drop repeated listings
    page = 1  # Start from page 1
    is_shelf_page = "shelf/show" in base_url  # Detect if it's a shelf or genre page
    item_class = 'elementList' if is_shelf_page else 'grid-item'  # Listing container per page type
    expected_per_page = 50 if is_shelf_page else 30  # Shelves: ~50, Genres: ~30
    print(f"Debug: Starting scrape for {genre} at {base_url} (Type: {'Shelf' if is_shelf_page else 'Genre'})")

    while len(collected_books) < MAX_BOOKS_PER_SUBGENRE:
        # Construct paginated URL
        if is_shelf_page:
            paginated_url = f"{base_url}?page={page}"
        else:
            paginated_url = f"{base_url}?page={page}&sort=popular"  # Add sort for consistency on genre pages

        print(f"Debug: Fetching page {page} for {genre}: {paginated_url}")

        try:
            # Without a timeout a stalled connection would block the notebook indefinitely
            response = requests.get(paginated_url, headers=HEADERS, timeout=timeout)
            response.raise_for_status()  # Raise error if not 200 OK
        except requests.RequestException as e:
            print(f"Error: Failed to fetch page {page} for {genre}: {e}")
            break  # Stop on error

        soup = BeautifulSoup(response.content, 'html.parser')
        book_items = soup.find_all('div', class_=item_class)

        if not book_items:
            # Print a snippet of HTML to help diagnose a bad selector or a block page
            print("Debug: No book items found. HTML snippet from page:")
            print(str(soup)[:500])  # First 500 chars of HTML for diagnosis
            print(f"Debug: No more books or bad selector on page {page} for {genre}. Stopping.")
            break

        print(f"Debug: Found {len(book_items)} book items on page {page}")

        new_on_page = 0  # How many previously unseen books this page contributed
        for item in book_items:
            book = _parse_book_item(item, genre)
            if book is None:
                print("Debug: Skipped invalid book (missing title/author/ID)")
                continue
            if book['book_id'] in seen_book_ids:
                # The same book can be listed again on a later (or repeated) page
                print(f"Debug: Skipped duplicate book - ID: {book['book_id']}")
                continue

            seen_book_ids.add(book['book_id'])
            collected_books.append(book)
            new_on_page += 1
            print(f"Debug: Added book - ID: {book['book_id']}, Title: {book['title'][:50]}..., Author: {book['author']}, URL: {book['url']}")

            # Stop if we've reached max for this subgenre
            if len(collected_books) >= MAX_BOOKS_PER_SUBGENRE:
                print(f"Debug: Reached max books ({MAX_BOOKS_PER_SUBGENRE}) for {genre}.")
                break

        if len(collected_books) >= MAX_BOOKS_PER_SUBGENRE:
            break  # Done; no need to sleep or look at further pages

        # A page with nothing new means further paging will not make progress
        # (e.g. the site keeps returning the same listing); stop instead of looping forever.
        if new_on_page == 0:
            print(f"Debug: No new books on page {page} for {genre}. Stopping.")
            break

        # If fewer items than expected, assume end of list
        if len(book_items) < expected_per_page:
            print(f"Debug: Fewer than {expected_per_page} items on page {page}. Likely end of list.")
            break

        page += 1  # Go to next page
        print(f"Debug: Books collected so far for {genre}: {len(collected_books)}")

        # Random delay before the next request to avoid bans
        delay = random.uniform(DELAY_MIN, DELAY_MAX)
        print(f"Debug: Sleeping for {delay:.2f} seconds...")
        time.sleep(delay)

    # Check if we met the minimum
    if len(collected_books) < MIN_BOOKS_PER_SUBGENRE:
        print(f"Warning: Only collected {len(collected_books)} books for {genre} (less than min {MIN_BOOKS_PER_SUBGENRE})")

    print(f"Debug: Finished scraping {genre}. Total books: {len(collected_books)}")
    return collected_books

# Smoke test: scrape a single subgenre and keep the result in all_books.
# Change test_genre to try a different subgenre.
test_genre = "Contemporary Romance"
all_books = []  # Start from an empty list so re-running the cell does not accumulate
all_books += scrape_subgenre_books(test_genre, subgenres[test_genre])
print(f"Debug: Total books after test scrape: {len(all_books)}")

Debug: Starting scrape for Contemporary Romance at https://www.goodreads.com/shelf/show/contemporary-romance (Type: Shelf)
Debug: Fetching page 1 for Contemporary Romance: https://www.goodreads.com/shelf/show/contemporary-romance?page=1
Debug: Found 52 book items on page 1
Debug: Added book - ID: 56732449, Title: The Love Hypothesis (Paperback)..., Author: Ali Hazelwood, URL: https://www.goodreads.com/book/show/56732449-the-love-hypothesis
Debug: Added book - ID: 27213238, Title: The Hating Game (Paperback)..., Author: Sally  Thorne, URL: https://www.goodreads.com/book/show/27213238-the-hating-game
Debug: Added book - ID: 52867387, Title: Beach Read (Paperback)..., Author: Emily Henry, URL: https://www.goodreads.com/book/show/52867387-beach-read
Debug: Added book - ID: 58690308, Title: Book Lovers (Paperback)..., Author: Emily Henry, URL: https://www.goodreads.com/book/show/58690308-book-lovers
Debug: Added book - ID: 36199084, Title: The Kiss Quotient (The Kiss Quotient, #1)..., Autho