<a href="https://colab.research.google.com/github/BakhturinaPolina/goodreads-romance-research/blob/main/scraping_ratings_information_expanded_romantic_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cell 1: Install Dependencies and Imports

In [1]:
# Cell 1: Install dependencies and import libraries (Updated with anti-detection for Selenium)
# This cell installs all required packages and sets up Selenium with anti-detection measures

# Install required packages (run this once per Colab session)
!pip install beautifulsoup4 requests pandas selenium tqdm

# For Selenium in Colab: Install Chrome and ChromeDriver
!apt-get update -qq  # Quiet update to avoid verbose output
!apt install -y -qq chromium-chromedriver  # Quiet install

# Ensure chromedriver is in /usr/bin (Colab often has it here already)
import os
chromedriver_path = '/usr/lib/chromium-browser/chromedriver'
if os.path.exists(chromedriver_path) and not os.path.exists('/usr/bin/chromedriver'):
    !cp {chromedriver_path} /usr/bin
    print("Debug: Copied chromedriver to /usr/bin")
else:
    print("Debug: chromedriver already exists in /usr/bin or source path not found")

# Import libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException, NoSuchElementException
from tqdm import tqdm  # For progress bars
import json  # For handling JSON-like data
import os  # For file operations
import sys  # For system paths
from datetime import datetime  # For timestamps in debugging

# Set up Selenium Chrome options for Colab (headless, no sandbox) with anti-detection
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')  # Run without visible browser window
chrome_options.add_argument('--no-sandbox')  # Required for Colab
chrome_options.add_argument('--disable-dev-shm-usage')  # Avoid shared memory issues
chrome_options.add_argument('--disable-gpu')  # Extra stability for headless mode
chrome_options.add_argument('window-size=1920x1080')  # Set a reasonable window size
chrome_options.binary_location = '/usr/bin/chromium-browser'  # Point to the installed Chromium

# Anti-detection options (to avoid bot blocks and empty pages)
chrome_options.add_argument('--disable-blink-features=AutomationControlled')  # Hide Selenium flag
chrome_options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option('useAutomationExtension', False)

# Additional anti-detection measures
chrome_options.add_argument('--disable-web-security')
chrome_options.add_argument('--disable-features=VizDisplayCompositor')
chrome_options.add_argument('--disable-extensions')
chrome_options.add_argument('--disable-default-apps')

# Debug: Print system paths for troubleshooting
print(f"Debug: Python version: {sys.version}")
print(f"Debug: Current time: {datetime.now()}")
print(f"Debug: Chromedriver expected path: /usr/bin/chromedriver")
print(f"Debug: Chromium binary: {chrome_options.binary_location}")

# Initialize the WebDriver with try-except for error handling
try:
    driver = webdriver.Chrome(options=chrome_options)
    print("Debug: WebDriver initialized successfully")

    # Execute CDP commands to hide webdriver
    driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
        'source': '''
            Object.defineProperty(navigator, 'webdriver', {
                get: () => undefined
            })
        '''
    })
    print("Debug: Added anti-detection CDP commands")

except WebDriverException as e:
    print(f"Error: Failed to initialize WebDriver: {e}")
    raise

# Test WebDriver with Goodreads
try:
    print("\nDebug: Testing WebDriver with Goodreads...")
    driver.get('https://www.goodreads.com/')
    time.sleep(3)
    print(f"Debug: Page title: {driver.title}")
    print(f"Debug: Current URL: {driver.current_url}")

    # Check if page loaded properly
    if driver.page_source and len(driver.page_source) > 1000:
        print(f"Debug: Page loaded successfully (source length: {len(driver.page_source)} chars)")
    else:
        print("Warning: Page source seems empty or too short")

    print("Debug: All dependencies installed and WebDriver ready!")

except Exception as e:
    print(f"Error: Test page load failed: {e}")
    print(f"Debug: Page source sample: {driver.page_source[:500] if driver.page_source else 'No source'}")

Collecting selenium
  Downloading selenium-4.34.2-py3-none-any.whl.metadata (7.5 kB)
Collecting trio~=0.30.0 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting outcome (from trio~=0.30.0->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.12.2->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.34.2-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m102.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.30.0-py3-none-any.whl (499 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m499.2/499.2 kB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio_websocket-0.12.2-py3-none-any.whl (21 kB)
Downloading outcome-1.3.0.pos

# Cell 2: Define Subgenres and Global Variables

In [2]:
# Cell 2: Define subgenres, URLs, and global configurations
# Fixed: Updated problematic genre URLs to use paginated shelf pages

# List of subgenres and their shelf/genre URLs (FIXED for Romantic Fantasy and Science Fiction Romance)
subgenres = {
    "Contemporary Romance": "https://www.goodreads.com/shelf/show/contemporary-romance",
    "Historical Romance": "https://www.goodreads.com/shelf/show/historical-romance",
    "Paranormal Romance": "https://www.goodreads.com/shelf/show/paranormal-romance",
    "Romantic Suspense": "https://www.goodreads.com/shelf/show/romantic-suspense",
    "Romantic Fantasy": "https://www.goodreads.com/shelf/show/fantasy-romance",  # FIXED: Changed from /genres/ to /shelf/show/
    "Science Fiction Romance": "https://www.goodreads.com/shelf/show/science-fiction-romance"  # FIXED: Changed to shelf URL
}

# Alternative URLs in case the above don't work
alternative_urls = {
    "Romantic Fantasy": "https://www.goodreads.com/shelf/show/romance-fantasy",
    "Science Fiction Romance": "https://www.goodreads.com/shelf/show/sci-fi-romance"
}

# Configurable scraping limits
MIN_BOOKS_PER_SUBGENRE = 200  # Minimum to collect
MAX_BOOKS_PER_SUBGENRE = 300  # Maximum to aim for
MAX_PAGES_PER_SUBGENRE = 15  # Safety cap on pagination (15 pages * 30 books/page = ~450 books)
MAX_REVIEWS_PER_BOOK = 200  # Cap for reviews
ALL_REVIEWS = False  # Set to True to scrape ALL reviews
DELAY_MIN = 3  # Minimum delay between requests (seconds) - Increased for safety
DELAY_MAX = 7  # Maximum delay - Increased for safety
MAX_RETRIES = 3  # Max retries for page loads

# User-agents to rotate (helps avoid detection)
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
]

# Headers for requests
HEADERS = {
    'User-Agent': random.choice(USER_AGENTS),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate, br',
    'DNT': '1',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1'
}

# Empty list to store all scraped book data
all_books = []

# Debug prints
print("Debug: Subgenres defined:")
for genre, url in subgenres.items():
    print(f"  - {genre}: {url}")
print(f"\nDebug: Configuration:")
print(f"  - Books per subgenre: {MIN_BOOKS_PER_SUBGENRE} to {MAX_BOOKS_PER_SUBGENRE}")
print(f"  - Max pages per subgenre: {MAX_PAGES_PER_SUBGENRE}")
print(f"  - Max reviews per book: {MAX_REVIEWS_PER_BOOK} (all_reviews={ALL_REVIEWS})")
print(f"  - Delays between requests: {DELAY_MIN}-{DELAY_MAX} seconds")
print(f"  - Max retries per page: {MAX_RETRIES}")
print(f"  - User agents available: {len(USER_AGENTS)}")
print("\nDebug: Configuration complete. Ready to define scraping functions.")

Debug: Subgenres defined:
  - Contemporary Romance: https://www.goodreads.com/shelf/show/contemporary-romance
  - Historical Romance: https://www.goodreads.com/shelf/show/historical-romance
  - Paranormal Romance: https://www.goodreads.com/shelf/show/paranormal-romance
  - Romantic Suspense: https://www.goodreads.com/shelf/show/romantic-suspense
  - Romantic Fantasy: https://www.goodreads.com/shelf/show/fantasy-romance
  - Science Fiction Romance: https://www.goodreads.com/shelf/show/science-fiction-romance

Debug: Configuration:
  - Books per subgenre: 200 to 300
  - Max pages per subgenre: 15
  - Max reviews per book: 200 (all_reviews=False)
  - Delays between requests: 3-7 seconds
  - Max retries per page: 3
  - User agents available: 4

Debug: Configuration complete. Ready to define scraping functions.


# Updated Cell 2: Configuration for 500 Books per Genre

# Cell 3: Cookie Management Function

In [3]:
# Cell 3: Cookie management function with environment variables and complete cookies
# This cell handles cookie loading and login verification

import os
from datetime import datetime, timedelta

def load_cookies(driver, auto_login=False):
    """
    Load cookies to simulate logged-in state or perform auto-login.
    Args:
        driver: Selenium WebDriver instance
        auto_login: If True, attempt automated login if cookies fail
    Returns:
        bool: True if login successful, False otherwise
    """
    print("\n" + "="*50)
    print("Debug: Starting cookie/login process...")
    print("="*50)

    # Complete cookie list from your provided data
    cookies = [
        {
            'name': '_session_id2',
            'value': '6c4ac9d7944645498873dcfaf76b294f',
            'domain': 'www.goodreads.com',
            'path': '/',
            'secure': True,
            'expiry': int((datetime.now() + timedelta(days=365)).timestamp())  # 1 year from now
        },
        {
            'name': 'at-main',
            'value': 'Atza|IwEBIP_WVjU2wDt5XXIwnYO37c_HBxHHpvejaw457NkT7-DHU7flVrkMM02XEH0cYeYRpEYlSuc7aN4cr-0ME-ruM28LUkrC7ODW8WKTpaqeH-sCuNVv1z9YM7xBS-Z4T0jhtLlPe8xzqTMozRae_ZNoxsqvpLNQxyDu0Woeei-Ip3E_PYxuZeikLbIDzpe17BVcrNocfqj4fl5KszGyF7ExHAvfVTGVBH_UeX5rQbSY6ZOCTHiIoca2U5sZfChiCoUH3wk',
            'domain': '.goodreads.com',
            'path': '/',
            'secure': True,
            'httpOnly': True,
            'expiry': int((datetime.now() + timedelta(days=365*2)).timestamp())
        },
        {
            'name': 'ccsid',
            'value': '724-3321647-0772806',
            'domain': 'www.goodreads.com',
            'path': '/',
            'secure': False,
            'expiry': int((datetime.now() + timedelta(days=365*2)).timestamp())
        },
        {
            'name': 'csm-hit',
            'value': 'tb:s-2R1WY81WFQ4QG5VKQVVG|1754352395507&t:1754352395508',
            'domain': 'www.goodreads.com',
            'path': '/',
            'secure': False,
            'expiry': int((datetime.now() + timedelta(days=365)).timestamp())
        },
        {
            'name': 'lc-main',
            'value': 'en_US',
            'domain': '.goodreads.com',
            'path': '/',
            'secure': False,
            'httpOnly': True,
            'expiry': int((datetime.now() + timedelta(days=365*2)).timestamp())
        },
        {
            'name': 'locale',
            'value': 'en',
            'domain': 'www.goodreads.com',
            'path': '/',
            'secure': False
            # Session cookie, no expiry
        },
        {
            'name': 'logged_out_browsing_page_count',
            'value': '1',
            'domain': 'www.goodreads.com',
            'path': '/',
            'secure': False,
            'expiry': int((datetime.now() + timedelta(days=365*2)).timestamp())
        },
        {
            'name': 'sess-at-main',
            'value': 'U8NDSc1MTsE8derMKmAGMZ+Uq9chMayYrfiC+B46wCE=',
            'domain': '.goodreads.com',
            'path': '/',
            'secure': True,
            'httpOnly': True,
            'expiry': int((datetime.now() + timedelta(days=365*2)).timestamp())
        },
        {
            'name': 'session-id',
            'value': '140-3588248-4268607',
            'domain': '.goodreads.com',
            'path': '/',
            'secure': False,
            'httpOnly': True,
            'expiry': int((datetime.now() + timedelta(days=365*2)).timestamp())
        },
        {
            'name': 'session-id-time',
            'value': '2385072442l',
            'domain': '.goodreads.com',
            'path': '/',
            'secure': False,
            'httpOnly': True,
            'expiry': int((datetime.now() + timedelta(days=365*2)).timestamp())
        },
        {
            'name': 'session-token',
            'value': '0+8P54n7E6d+0GGShiBALFOEgtTQdUdna1ExcAeRw3Ul16HkHsMc2W2ZpSYd+dn2CmsRT4KTntQt8WF4Of+YP1EitZ4QW4VQMG/hg3NoH61WK00ztRFxR6GkLoQiygwTiqkHExpG3pEjipb2/x256UoQDqyJcqAdFLyXukWtVKwfCFNsWzxJZMP/gmSX/Ml1mSdPMmkb9yJ5gb+ugF6z5a1F2Hr01Tt1Ynz77AY8fV5BunhvuaXJYoMJjEKHvWoLmurVyyWT/YZ71mOlJJXAtphVwQmemG3C27hNxcCL3cC3x1N6iA36seE5LzPJmOkxNjbGZ5EnO3s+IVXsenDzWijTZZz3dAu9eDn03BYbLEF0fsf7cW4b3g==',
            'domain': '.goodreads.com',
            'path': '/',
            'secure': False,
            'httpOnly': True,
            'expiry': int((datetime.now() + timedelta(days=365*2)).timestamp())
        },
        {
            'name': 'ubid-main',
            'value': '135-7477912-2392604',
            'domain': '.goodreads.com',
            'path': '/',
            'secure': False,
            'httpOnly': True,
            'expiry': int((datetime.now() + timedelta(days=365*2)).timestamp())
        },
        {
            'name': 'x-main',
            'value': 'hf5q7qpx3x0HgBwzCGe?XOExwXUrkUVkjz1Z15Z2ptg0JUWQ9klSPvPcAXY9G41Z',
            'domain': '.goodreads.com',
            'path': '/',
            'secure': False,
            'httpOnly': True,
            'expiry': int((datetime.now() + timedelta(days=365*2)).timestamp())
        }
    ]

    try:
        # First, navigate to Goodreads to set domain
        print("Debug: Navigating to Goodreads homepage...")
        driver.get('https://www.goodreads.com/')
        time.sleep(2)

        # Clear existing cookies
        driver.delete_all_cookies()
        print("Debug: Cleared all existing cookies")

        # Add cookies
        print(f"Debug: Adding {len(cookies)} cookies...")
        successfully_added = 0
        for i, cookie in enumerate(cookies):
            try:
                # Remove 'expiry' key if it's a session cookie
                if 'expiry' not in cookie or cookie.get('name') == 'locale':
                    cookie_to_add = {k: v for k, v in cookie.items() if k != 'expiry'}
                else:
                    cookie_to_add = cookie.copy()

                driver.add_cookie(cookie_to_add)
                successfully_added += 1
                if i < 5 or i % 5 == 0:  # Log first 5 and then every 5th
                    print(f"Debug: Added cookie {i+1}/{len(cookies)}: {cookie['name']}")
            except Exception as e:
                print(f"Warning: Failed to add cookie '{cookie['name']}': {e}")

        print(f"Debug: Successfully added {successfully_added}/{len(cookies)} cookies")

        # Refresh to apply cookies
        print("Debug: Refreshing page to apply cookies...")
        driver.refresh()
        time.sleep(3)

        # Verify login by checking for profile elements
        print("Debug: Verifying login status...")
        logged_in = False

        # Check multiple indicators of being logged in
        login_indicators = [
            (By.CLASS_NAME, 'siteHeader__personalMenu'),
            (By.CLASS_NAME, 'profileMenu'),
            (By.XPATH, "//span[@class='headerPersonalNav__icon']"),
            (By.XPATH, "//a[@href='/review/list']"),  # "My Books" link
            (By.XPATH, "//a[contains(@href, '/user/show/')]"),  # Profile link
            (By.CLASS_NAME, 'dropdown__trigger--profileMenu')  # Profile dropdown
        ]

        for by, value in login_indicators:
            try:
                element = driver.find_element(by, value)
                if element:
                    print(f"Debug: Found login indicator: {value}")
                    logged_in = True
                    break
            except NoSuchElementException:
                continue

        if logged_in:
            print("Success: Cookie-based login successful!")
            print(f"Debug: Current page title: {driver.title}")
            return True
        else:
            print("Warning: No login indicators found. Cookies may be expired.")
            print(f"Debug: Page title: {driver.title}")
            print(f"Debug: Current URL: {driver.current_url}")

            # Check if we see any sign-in prompts
            try:
                sign_in_element = driver.find_element(By.XPATH, "//a[contains(text(), 'Sign In')]")
                if sign_in_element:
                    print("Debug: Found 'Sign In' link - definitely not logged in")
            except NoSuchElementException:
                print("Debug: No 'Sign In' link found")

            # Optional: Try automated login
            if auto_login:
                print("\nDebug: Attempting automated login...")
                return attempt_auto_login(driver)

            return False

    except Exception as e:
        print(f"Error in load_cookies: {e}")
        import traceback
        traceback.print_exc()
        return False

def attempt_auto_login(driver):
    """
    Attempt automated login using environment variables
    Set these environment variables before running:
    - GOODREADS_EMAIL: Your Goodreads email
    - GOODREADS_PASSWORD: Your Goodreads password
    """
    try:
        # Get credentials from environment variables
        email = os.environ.get('GOODREADS_EMAIL')
        password = os.environ.get('GOODREADS_PASSWORD')

        if not email or not password:
            print("ERROR: Environment variables GOODREADS_EMAIL and GOODREADS_PASSWORD not set!")
            print("\nTo set them in Colab, run these commands in a cell:")
            print("import os")
            print("os.environ['GOODREADS_EMAIL'] = 'your_email@example.com'")
            print("os.environ['GOODREADS_PASSWORD'] = 'your_password'")
            print("\nOr set them in your system before running the notebook.")
            return False

        print("Debug: Environment variables found. Attempting login...")
        print(f"Debug: Using email: {email[:3]}...{email[-10:]}")  # Partially masked for security

        # Navigate to login page
        print("Debug: Navigating to login page...")
        driver.get('https://www.goodreads.com/user/sign_in')
        time.sleep(3)

        # Check for login form
        try:
            # Wait for form elements
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.ID, 'user_email'))
            )

            email_field = driver.find_element(By.ID, 'user_email')
            password_field = driver.find_element(By.ID, 'user_password')

            print("Debug: Login form found. Filling credentials...")

            # Clear and fill fields
            email_field.clear()
            email_field.send_keys(email)

            password_field.clear()
            password_field.send_keys(password)

            # Find and click sign in button
            sign_in_button = driver.find_element(By.NAME, 'commit')
            print("Debug: Clicking sign in button...")
            sign_in_button.click()

            # Wait for login to process
            time.sleep(5)

            # Check if login successful
            if "sign_in" not in driver.current_url.lower():
                print("Success: Automated login successful!")
                print(f"Debug: Redirected to: {driver.current_url}")
                return True
            else:
                print("Error: Login failed. Possible reasons:")
                print("  - Invalid credentials")
                print("  - CAPTCHA required")
                print("  - Account locked")

                # Check for error messages
                try:
                    error_elem = driver.find_element(By.CLASS_NAME, 'flash_error')
                    print(f"  - Error message: {error_elem.text}")
                except NoSuchElementException:
                    pass

                return False

        except TimeoutException:
            print("Error: Login form not found. Page may have changed.")
            print(f"Debug: Current URL: {driver.current_url}")
            return False
        except NoSuchElementException as e:
            print(f"Error: Could not find login form element: {e}")
            return False

    except Exception as e:
        print(f"Error in automated login: {e}")
        import traceback
        traceback.print_exc()
        return False

# Instructions for setting environment variables in Colab
print("="*70)
print("IMPORTANT: Setting up credentials")
print("="*70)
print("\nTo use automated login, set environment variables:")
print("\nOption 1 - In a code cell (temporary):")
print("```python")
print("import os")
print("os.environ['GOODREADS_EMAIL'] = 'your_email@example.com'")
print("os.environ['GOODREADS_PASSWORD'] = 'your_password'")
print("```")
print("\nOption 2 - Using Colab secrets (permanent for notebook):")
print("1. Click the key icon 🔑 in the left sidebar")
print("2. Add new secrets: GOODREADS_EMAIL and GOODREADS_PASSWORD")
print("3. Access them with:")
print("```python")
print("from google.colab import userdata")
print("os.environ['GOODREADS_EMAIL'] = userdata.get('GOODREADS_EMAIL')")
print("os.environ['GOODREADS_PASSWORD'] = userdata.get('GOODREADS_PASSWORD')")
print("```")
print("\n" + "="*70)

# Test cookie loading
print("\nDebug: Testing cookie loading...")
login_success = load_cookies(driver, auto_login=False)  # Set to True to try auto-login if cookies fail
print(f"Debug: Login test result: {'Success' if login_success else 'Failed'}")

if not login_success:
    print("\nDebug: Cookies may be expired. Consider:")
    print("1. Getting fresh cookies from your browser")
    print("2. Setting environment variables and using auto_login=True")
    print("3. Checking if Goodreads has changed their login system")

IMPORTANT: Setting up credentials

To use automated login, set environment variables:

Option 1 - In a code cell (temporary):
```python
import os
os.environ['GOODREADS_EMAIL'] = 'your_email@example.com'
os.environ['GOODREADS_PASSWORD'] = 'your_password'
```

Option 2 - Using Colab secrets (permanent for notebook):
1. Click the key icon 🔑 in the left sidebar
2. Add new secrets: GOODREADS_EMAIL and GOODREADS_PASSWORD
3. Access them with:
```python
from google.colab import userdata
os.environ['GOODREADS_EMAIL'] = userdata.get('GOODREADS_EMAIL')
os.environ['GOODREADS_PASSWORD'] = userdata.get('GOODREADS_PASSWORD')
```


Debug: Testing cookie loading...

Debug: Starting cookie/login process...
Debug: Navigating to Goodreads homepage...
Debug: Cleared all existing cookies
Debug: Adding 13 cookies...
Debug: Added cookie 1/13: _session_id2
Debug: Added cookie 2/13: at-main
Debug: Added cookie 3/13: ccsid
Debug: Added cookie 4/13: csm-hit
Debug: Added cookie 5/13: lc-main
Debug: Added cookie 6/

# Cell 4: Main Scraping Function

In [4]:
# Cell 4: Main book scraping function with all fixes implemented
# This is the core function that scrapes books from each subgenre

def scrape_subgenre_books(genre, base_url, max_pages=None):
    """
    Scrape book details from a subgenre's paginated list using Selenium.
    Fixed version with improved selectors and error handling.
    """
    print("\n" + "="*70)
    print(f"STARTING SCRAPE: {genre}")
    print(f"URL: {base_url}")
    print(f"Time: {datetime.now()}")
    print("="*70)

    # Use global max pages if not specified
    if max_pages is None:
        max_pages = MAX_PAGES_PER_SUBGENRE

    collected_books = []
    seen_ids = set()
    page = 1
    consecutive_failures = 0
    max_consecutive_failures = 3

    try:
        # Load cookies first (if not already loaded)
        if page == 1:
            print("Debug: Ensuring logged-in state...")
            load_cookies(driver, auto_login=False)

        # Navigate to the subgenre page
        print(f"\nDebug: Loading initial page: {base_url}")
        driver.get(base_url)
        time.sleep(random.uniform(3, 5))

        # Check if page loaded properly
        if "Page not found" in driver.title or "404" in driver.title:
            print(f"ERROR: 404 - Page not found for {genre}")
            print(f"Debug: Trying alternative URL if available...")
            if genre in alternative_urls:
                base_url = alternative_urls[genre]
                print(f"Debug: Using alternative URL: {base_url}")
                driver.get(base_url)
                time.sleep(random.uniform(3, 5))
            else:
                return collected_books

        print(f"Debug: Initial page loaded. Title: {driver.title}")
        print(f"Debug: Current URL: {driver.current_url}")

        # Main pagination loop
        while len(collected_books) < MAX_BOOKS_PER_SUBGENRE and page <= max_pages:
            print(f"\n--- Page {page} ---")
            retry_count = 0
            page_success = False

            while retry_count < MAX_RETRIES and not page_success:
                try:
                    # Wait for book elements to load
                    print(f"Debug: Waiting for book elements (attempt {retry_count + 1})...")

                    # Try multiple possible selectors
                    book_selectors = [
                        (By.CLASS_NAME, 'elementList'),
                        (By.CLASS_NAME, 'tableList'),
                        (By.CSS_SELECTOR, 'tr.bookalike'),
                        (By.CSS_SELECTOR, 'div.leftContainer div.elementList')
                    ]

                    books_found = False
                    for by, selector in book_selectors:
                        try:
                            WebDriverWait(driver, 10).until(
                                EC.presence_of_element_located((by, selector))
                            )
                            print(f"Debug: Found books using selector: {selector}")
                            books_found = True
                            break
                        except TimeoutException:
                            continue

                    if not books_found:
                        raise TimeoutException("No book elements found with any selector")

                    page_success = True
                    print(f"Debug: Page {page} loaded successfully")

                except TimeoutException:
                    retry_count += 1
                    print(f"Debug: Timeout on page {page}, attempt {retry_count}")

                    if retry_count < MAX_RETRIES:
                        print("Debug: Refreshing page...")
                        driver.refresh()
                        time.sleep(5)
                    else:
                        print("ERROR: Max retries exceeded")
                        print(f"Debug: Page source preview: {driver.page_source[:1000]}")
                        consecutive_failures += 1
                        break

            if not page_success:
                if consecutive_failures >= max_consecutive_failures:
                    print(f"ERROR: {consecutive_failures} consecutive page failures. Stopping.")
                    break
                continue

            # Reset consecutive failures on success
            consecutive_failures = 0

            # Parse page with BeautifulSoup
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Find all book items (try multiple selectors)
            book_items = []

            # Primary selector for shelf pages
            book_items.extend(soup.find_all('div', class_='elementList'))

            # Also try table rows (some pages use tables)
            book_items.extend(soup.find_all('tr', class_='bookalike'))

            # Remove None values
            book_items = [item for item in book_items if item]

            print(f"Debug: Found {len(book_items)} potential book items")

            if not book_items:
                print("Warning: No book items found on this page")
                consecutive_failures += 1
                if consecutive_failures >= max_consecutive_failures:
                    print("ERROR: No books found on multiple pages. Stopping.")
                    break

            # Extract book details
            books_added_this_page = 0
            for i, item in enumerate(book_items):
                try:
                    # Extract title
                    title_elem = item.find('a', class_='bookTitle')
                    if not title_elem:
                        # Try alternative selector
                        title_elem = item.find('a', class_='gr-h3__noDecoration--pageTitle')

                    title = title_elem.text.strip() if title_elem else None

                    # Extract author
                    author_elem = item.find('a', class_='authorName')
                    if not author_elem:
                        # Try alternative selectors
                        author_elem = item.find('span', itemprop='name')

                    author = author_elem.text.strip() if author_elem else None

                    # Extract URL and book ID
                    if title_elem and title_elem.get('href'):
                        url_partial = title_elem['href']
                        full_url = f"https://www.goodreads.com{url_partial.split('?')[0]}"

                        # Extract book ID from URL
                        book_id_match = re.search(r'/show/(\d+)', url_partial)
                        if not book_id_match:
                            book_id_match = re.search(r'/book/show/(\d+)', url_partial)

                        book_id = book_id_match.group(1) if book_id_match else None
                    else:
                        full_url = None
                        book_id = None

                    # Validate and add book
                    if book_id and title and author and book_id not in seen_ids:
                        seen_ids.add(book_id)
                        book_data = {
                            'book_id': book_id,
                            'title': title,
                            'author': author,
                            'url': full_url,
                            'subgenre': genre,
                            'page_scraped': page,
                            'timestamp': datetime.now().isoformat()
                        }
                        collected_books.append(book_data)
                        books_added_this_page += 1

                        if len(collected_books) % 10 == 0:  # Log every 10th book
                            print(f"Debug: Collected {len(collected_books)} books so far")

                        # Stop if we've reached the maximum
                        if len(collected_books) >= MAX_BOOKS_PER_SUBGENRE:
                            print(f"Debug: Reached maximum books ({MAX_BOOKS_PER_SUBGENRE})")
                            break

                    elif book_id in seen_ids:
                        print(f"Debug: Skipped duplicate book ID: {book_id}")
                    elif not all([book_id, title, author]):
                        missing = []
                        if not book_id: missing.append("ID")
                        if not title: missing.append("title")
                        if not author: missing.append("author")
                        if i < 3:  # Only log first few to avoid spam
                            print(f"Debug: Skipped book missing: {', '.join(missing)}")

                except Exception as e:
                    print(f"Warning: Error extracting book {i+1}: {e}")
                    continue

            print(f"Debug: Added {books_added_this_page} new books from page {page}")
            print(f"Debug: Total books collected: {len(collected_books)}")

            # If no new books added, might be at the end
            if books_added_this_page == 0:
                print("Warning: No new books added from this page")
                consecutive_failures += 1
                if consecutive_failures >= max_consecutive_failures:
                    print("Debug: No new content. Stopping pagination.")
                    break

            # Check if we should continue to next page
            if len(collected_books) >= MAX_BOOKS_PER_SUBGENRE or page >= max_pages:
                print(f"Debug: Stopping - reached {'max books' if len(collected_books) >= MAX_BOOKS_PER_SUBGENRE else 'max pages'}")
                break

            # Find and click next button
            try:
                print("Debug: Looking for next button...")

                # Multiple strategies for finding next button
                next_button = None

                # Strategy 1: Look for "next" link with class
                try:
                    next_button = driver.find_element(By.XPATH, "//a[@class='next_page' and @rel='next']")
                except NoSuchElementException:
                    pass

                # Strategy 2: Look for any element with "next_page" class
                if not next_button:
                    try:
                        next_button = driver.find_element(By.CLASS_NAME, "next_page")
                    except NoSuchElementException:
                        pass

                # Strategy 3: Look for "Next »" text
                if not next_button:
                    try:
                        next_button = driver.find_element(By.XPATH, "//a[contains(text(),'Next')]")
                    except NoSuchElementException:
                        pass

                if next_button:
                    # Check if button is disabled
                    if 'disabled' in next_button.get_attribute('class'):
                        print("Debug: Next button is disabled. Reached end of results.")
                        break

                    print("Debug: Next button found. Scrolling to it...")
                    driver.execute_script("arguments[0].scrollIntoView(true);", next_button)
                    time.sleep(1)

                    # Try clicking with JavaScript first (more reliable)
                    print("Debug: Clicking next button...")
                    driver.execute_script("arguments[0].click();", next_button)

                    page += 1
                    print(f"Debug: Navigated to page {page}")

                    # Random delay to mimic human behavior
                    delay = random.uniform(DELAY_MIN, DELAY_MAX)
                    print(f"Debug: Waiting {delay:.1f} seconds before next page...")
                    time.sleep(delay)

                else:
                    print("Debug: No next button found. Reached end of pagination.")
                    break

            except Exception as e:
                print(f"Debug: Error with pagination: {e}")
                break

        # Final summary for this subgenre
        print(f"\n{'='*70}")
        print(f"COMPLETED: {genre}")
        print(f"Total pages scraped: {page}")
        print(f"Total unique books collected: {len(collected_books)}")
        print(f"Time: {datetime.now()}")
        print(f"{'='*70}\n")

    except Exception as e:
        print(f"ERROR: Fatal error while scraping {genre}: {e}")
        import traceback
        traceback.print_exc()

    return collected_books

# Test with a single subgenre
print("Debug: Testing with Contemporary Romance...")
test_books = scrape_subgenre_books("Contemporary Romance", subgenres["Contemporary Romance"], max_pages=2)
print(f"\nTest Results: Found {len(test_books)} books")
if test_books:
    print("Sample book:", test_books[0])

Debug: Testing with Contemporary Romance...

STARTING SCRAPE: Contemporary Romance
URL: https://www.goodreads.com/shelf/show/contemporary-romance
Time: 2025-08-05 20:37:38.500284
Debug: Ensuring logged-in state...

Debug: Starting cookie/login process...
Debug: Navigating to Goodreads homepage...
Debug: Cleared all existing cookies
Debug: Adding 13 cookies...
Debug: Added cookie 1/13: _session_id2
Debug: Added cookie 2/13: at-main
Debug: Added cookie 3/13: ccsid
Debug: Added cookie 4/13: csm-hit
Debug: Added cookie 5/13: lc-main
Debug: Added cookie 6/13: locale
Debug: Added cookie 11/13: session-token
Debug: Successfully added 13/13 cookies
Debug: Refreshing page to apply cookies...
Debug: Verifying login status...
Debug: Found login indicator: //span[@class='headerPersonalNav__icon']
Success: Cookie-based login successful!
Debug: Current page title: Recent updates | Goodreads

Debug: Loading initial page: https://www.goodreads.com/shelf/show/contemporary-romance
Debug: Initial page lo

# Cell 5: Scrape All Subgenres

In [5]:
# Cell 5: Main execution - Scrape all subgenres
# This cell orchestrates the scraping of all subgenres

# Reset the books list
all_books = []
scraping_summary = {}

print("="*80)
print("STARTING FULL SCRAPING RUN")
print(f"Target: {len(subgenres)} subgenres")
print(f"Books per subgenre: {MIN_BOOKS_PER_SUBGENRE}-{MAX_BOOKS_PER_SUBGENRE}")
print(f"Start time: {datetime.now()}")
print("="*80)

# Iterate through all subgenres
for idx, (genre, url) in enumerate(subgenres.items(), 1):
    print(f"\n{'#'*80}")
    print(f"SUBGENRE {idx}/{len(subgenres)}: {genre}")
    print(f"{'#'*80}")

    start_time = time.time()

    try:
        # Scrape books for this subgenre
        genre_books = scrape_subgenre_books(genre, url)

        # Store results
        all_books.extend(genre_books)
        scraping_summary[genre] = {
            'books_found': len(genre_books),
            'success': len(genre_books) > 0,
            'time_taken': round(time.time() - start_time, 2)
        }

        print(f"\nSummary for {genre}:")
        print(f"  - Books collected: {len(genre_books)}")
        print(f"  - Time taken: {scraping_summary[genre]['time_taken']} seconds")
        print(f"  - Status: {'SUCCESS' if len(genre_books) >= MIN_BOOKS_PER_SUBGENRE else 'PARTIAL' if len(genre_books) > 0 else 'FAILED'}")

        # Save intermediate results (in case of crashes)
        if len(all_books) > 0:
            df_intermediate = pd.DataFrame(all_books)
            df_intermediate.to_csv(f'books_intermediate_{idx}_{genre.replace(" ", "_")}.csv', index=False)
            print(f"Debug: Saved intermediate results ({len(all_books)} books total)")

        # Delay between subgenres
        if idx < len(subgenres):
            delay = random.uniform(10, 20)
            print(f"\nDebug: Waiting {delay:.1f} seconds before next subgenre...")
            time.sleep(delay)

    except Exception as e:
        print(f"ERROR: Failed to scrape {genre}: {e}")
        scraping_summary[genre] = {
            'books_found': 0,
            'success': False,
            'time_taken': round(time.time() - start_time, 2),
            'error': str(e)
        }

# Final summary
print("\n" + "="*80)
print("SCRAPING COMPLETE - FINAL SUMMARY")
print("="*80)

for genre, stats in scraping_summary.items():
    status = 'SUCCESS' if stats['success'] and stats['books_found'] >= MIN_BOOKS_PER_SUBGENRE else 'PARTIAL' if stats['books_found'] > 0 else 'FAILED'
    print(f"{genre:30} | Books: {stats['books_found']:4} | Time: {stats['time_taken']:6.1f}s | Status: {status}")

print(f"\nTotal unique books collected: {len(all_books)}")
print(f"Total time: {sum(s['time_taken'] for s in scraping_summary.values()):.1f} seconds")
print(f"End time: {datetime.now()}")

# Check for problematic genres
problem_genres = [g for g, s in scraping_summary.items() if s['books_found'] == 0]
if problem_genres:
    print(f"\nWARNING: The following genres returned 0 books:")
    for pg in problem_genres:
        print(f"  - {pg}")
        if pg in alternative_urls:
            print(f"    Consider manually checking: {alternative_urls[pg]}")

STARTING FULL SCRAPING RUN
Target: 6 subgenres
Books per subgenre: 200-300
Start time: 2025-08-05 20:38:00.116878

################################################################################
SUBGENRE 1/6: Contemporary Romance
################################################################################

STARTING SCRAPE: Contemporary Romance
URL: https://www.goodreads.com/shelf/show/contemporary-romance
Time: 2025-08-05 20:38:00.117429
Debug: Ensuring logged-in state...

Debug: Starting cookie/login process...
Debug: Navigating to Goodreads homepage...
Debug: Cleared all existing cookies
Debug: Adding 13 cookies...
Debug: Added cookie 1/13: _session_id2
Debug: Added cookie 2/13: at-main
Debug: Added cookie 3/13: ccsid
Debug: Added cookie 4/13: csm-hit
Debug: Added cookie 5/13: lc-main
Debug: Added cookie 6/13: locale
Debug: Added cookie 11/13: session-token
Debug: Successfully added 13/13 cookies
Debug: Refreshing page to apply cookies...
Debug: Verifying login status...
Debug: 

# Cell 6: Save and Analyze Results

In [6]:
# Cell 6: Save results and perform basic analysis
# This cell saves the scraped data and provides analysis

# Create DataFrame from collected books
if len(all_books) > 0:
    df_books = pd.DataFrame(all_books)

    print("="*80)
    print("SAVING AND ANALYZING RESULTS")
    print("="*80)

    # Remove duplicates (if any slipped through)
    initial_count = len(df_books)
    df_books = df_books.drop_duplicates(subset=['book_id'], keep='first')
    final_count = len(df_books)

    if initial_count != final_count:
        print(f"Debug: Removed {initial_count - final_count} duplicate entries")

    # Save to CSV
    filename = f'romance_books_scraped_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv'
    df_books.to_csv(filename, index=False)
    print(f"Success: Saved {len(df_books)} unique books to {filename}")

    # Analysis
    print("\nDATA ANALYSIS:")
    print("-"*50)

    # Books per subgenre
    print("\nBooks per subgenre:")
    subgenre_counts = df_books['subgenre'].value_counts()
    for genre, count in subgenre_counts.items():
        percentage = (count / len(df_books)) * 100
        print(f"  {genre:30} | {count:4} books ({percentage:5.1f}%)")

    # Check for missing subgenres
    missing_genres = set(subgenres.keys()) - set(subgenre_counts.index)
    if missing_genres:
        print("\nMissing subgenres (0 books):")
        for mg in missing_genres:
            print(f"  - {mg}")

    # Top authors
    print("\nTop 10 authors by book count:")
    top_authors = df_books['author'].value_counts().head(10)
    for author, count in top_authors.items():
        print(f"  {author:40} | {count:3} books")

    # Sample of collected books
    print("\nSample of collected books:")
    print("-"*50)
    sample = df_books.sample(min(5, len(df_books)))
    for _, book in sample.iterrows():
        print(f"Title: {book['title'][:60]}...")
        print(f"Author: {book['author']}")
        print(f"Subgenre: {book['subgenre']}")
        print(f"URL: {book['url']}")
        print("-"*50)

    # Data quality check
    print("\nDATA QUALITY CHECK:")
    print(f"  - Total unique books: {len(df_books)}")
    print(f"  - Books with valid IDs: {df_books['book_id'].notna().sum()}")
    print(f"  - Books with titles: {df_books['title'].notna().sum()}")
    print(f"  - Books with authors: {df_books['author'].notna().sum()}")
    print(f"  - Books with URLs: {df_books['url'].notna().sum()}")

    # Check for any books with missing data
    missing_data = df_books[df_books.isnull().any(axis=1)]
    if len(missing_data) > 0:
        print(f"\nWarning: {len(missing_data)} books have missing data")
        print("Missing data summary:")
        print(missing_data.isnull().sum())

else:
    print("ERROR: No books were collected!")
    print("\nDebugging suggestions:")
    print("1. Check if the URLs are correct and accessible")
    print("2. Verify cookies are valid and fresh")
    print("3. Review debug output from scraping functions")
    print("4. Try scraping a single subgenre with max_pages=1 to isolate issues")
    print("5. Manually visit one of the URLs in a browser to check structure")

# Clean up driver
try:
    driver.quit()
    print("\nDebug: WebDriver closed successfully")
except:
    pass

SAVING AND ANALYZING RESULTS
Debug: Removed 64 duplicate entries
Success: Saved 1736 unique books to romance_books_scraped_20250805_204440.csv

DATA ANALYSIS:
--------------------------------------------------

Books per subgenre:
  Contemporary Romance           |  300 books ( 17.3%)
  Historical Romance             |  300 books ( 17.3%)
  Paranormal Romance             |  296 books ( 17.1%)
  Romantic Suspense              |  288 books ( 16.6%)
  Science Fiction Romance        |  285 books ( 16.4%)
  Romantic Fantasy               |  267 books ( 15.4%)

Top 10 authors by book count:
  Ruby Dixon                               |  51 books
  J.D. Robb                                |  39 books
  Lisa Kleypas                             |  36 books
  Nora Roberts                             |  34 books
  Sherrilyn Kenyon                         |  33 books
  Julia Quinn                              |  32 books
  Linda Howard                             |  28 books
  Laurann Dohner       