<a href="https://colab.research.google.com/github/BakhturinaPolina/goodreads-romance-research/blob/main/Scraping_Additional_Metadata_from_Goodreads_Book_Pages.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cell 1: Install Dependencies and Imports

In [8]:
# Install required packages (quiet to avoid verbose output)
!pip install beautifulsoup4 requests pandas selenium tqdm lxml webdriver-manager --quiet

# Install Chrome browser using deb package
!apt-get update --quiet > /dev/null
!apt-get install -y libvulkan1 > /dev/null # Install missing dependency
!wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb --quiet
!dpkg -i google-chrome-stable_current_amd64.deb > /dev/null
!apt-get install -f --yes > /dev/null # Install remaining dependencies

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Import libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementClickInterceptedException, StaleElementReferenceException, InvalidSelectorException
from tqdm import tqdm
import json
import os
from datetime import datetime, timedelta

print("Cell 1: Setup complete.")

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Cell 1: Setup complete.


# Cell 2: Cookie and Login Functions

In [9]:
def load_cookies(driver, auto_login=False):
    """
    Load cookies to simulate logged-in state or perform auto-login.
    """
    print("\n=== Starting cookie/login process ===")

    # Complete list of 13 cookies (NOTE: These may expire; refresh from a logged-in browser session if needed)
    cookies = [
        {
            'name': '_session_id2',
            'value': '6c4ac9d7944645498873dcfaf76b294f',
            'domain': 'www.goodreads.com',
            'path': '/',
            'secure': True,
            'expiry': int((datetime.now() + timedelta(days=365)).timestamp())  # 1 year from now
        },
        {
            'name': 'at-main',
            'value': 'Atza|IwEBIP_WVjU2wDt5XXIwnYO37c_HBxHHpvejaw457NkT7-DHU7flVrkMM02XEH0cYeYRpEYlSuc7aN4cr-0ME-ruM28LUkrC7ODW8WKTpaqeH-sCuNVv1z9YM7xBS-Z4T0jhtLlPe8xzqTMozRae_ZNoxsqvpLNQxyDu0Woeei-Ip3E_PYxuZeikLbIDzpe17BVcrNocfqj4fl5KszGyF7ExHAvfVTGVBH_UeX5rQbSY6ZOCTHiIoca2U5sZfChiCoUH3wk',
            'domain': '.goodreads.com',
            'path': '/',
            'secure': True,
            'httpOnly': True,
            'expiry': int((datetime.now() + timedelta(days=365*2)).timestamp())
        },
        {
            'name': 'ccsid',
            'value': '724-3321647-0772806',
            'domain': 'www.goodreads.com',
            'path': '/',
            'secure': False,
            'expiry': int((datetime.now() + timedelta(days=365*2)).timestamp())
        },
        {
            'name': 'csm-hit',
            'value': 'tb:s-2R1WY81WFQ4QG5VKQVVG|1754352395507&t:1754352395508',
            'domain': 'www.goodreads.com',
            'path': '/',
            'secure': False,
            'expiry': int((datetime.now() + timedelta(days=365)).timestamp())
        },
        {
            'name': 'lc-main',
            'value': 'en_US',
            'domain': '.goodreads.com',
            'path': '/',
            'secure': False,
            'httpOnly': True,
            'expiry': int((datetime.now() + timedelta(days=365*2)).timestamp())
        },
        {
            'name': 'locale',
            'value': 'en',
            'domain': 'www.goodreads.com',
            'path': '/',
            'secure': False
        },
        {
            'name': 'logged_out_browsing_page_count',
            'value': '1',
            'domain': 'www.goodreads.com',
            'path': '/',
            'secure': False,
            'expiry': int((datetime.now() + timedelta(days=365*2)).timestamp())
        },
        {
            'name': 'sess-at-main',
            'value': 'U8NDSc1MTsE8derMKmAGMZ+Uq9chMayYrfiC+B46wCE=',
            'domain': '.goodreads.com',
            'path': '/',
            'secure': True,
            'httpOnly': True,
            'expiry': int((datetime.now() + timedelta(days=365*2)).timestamp())
        },
        {
            'name': 'session-id',
            'value': '140-3588248-4268607',
            'domain': '.goodreads.com',
            'path': '/',
            'secure': False,
            'httpOnly': True,
            'expiry': int((datetime.now() + timedelta(days=365*2)).timestamp())
        },
        {
            'name': 'session-id-time',
            'value': '2385072442l',
            'domain': '.goodreads.com',
            'path': '/',
            'secure': False,
            'httpOnly': True,
            'expiry': int((datetime.now() + timedelta(days=365*2)).timestamp())
        },
        {
            'name': 'session-token',
            'value': '0+8P54n7E6d+0GGShiBALFOEgtTQdUdna1ExcAeRw3Ul16HkHsMc2W2ZpSYd+dn2CmsRT4KTntQt8WF4Of+YP1EitZ4QW4VQMG/hg3NoH61WK00ztRFxR6GkLoQiygwTiqkHExpG3pEjipb2/x256UoQDqyJcqAdFLyXukWtVKwfCFNsWzxJZMP/gmSX/Ml1mSdPMmkb9yJ5gb+ugF6z5a1F2Hr01Tt1Ynz77AY8fV5BunhvuaXJYoMJjEKHvWoLmurVyyWT/YZ71mOlJJXAtphVwQmemG3C27hNxcCL3cC3x1N6iA36seE5LzPJmOkxNjbGZ5EnO3s+IVXsenDzWijTZZz3dAu9eDn03BYbLEF0fsf7cW4b3g==',
            'domain': '.goodreads.com',
            'path': '/',
            'secure': False,
            'httpOnly': True,
            'expiry': int((datetime.now() + timedelta(days=365*2)).timestamp())
        },
        {
            'name': 'ubid-main',
            'value': '135-7477912-2392604',
            'domain': '.goodreads.com',
            'path': '/',
            'secure': False,
            'httpOnly': True,
            'expiry': int((datetime.now() + timedelta(days=365*2)).timestamp())
        },
        {
            'name': 'x-main',
            'value': 'hf5q7qpx3x0HgBwzCGe?XOExwXUrkUVkjz1Z15Z2ptg0JUWQ9klSPvPcAXY9G41Z',
            'domain': '.goodreads.com',
            'path': '/',
            'secure': False,
            'httpOnly': True,
            'expiry': int((datetime.now() + timedelta(days=365*2)).timestamp())
        }
    ]

    for attempt in range(2):  # Retry cookie loading once if needed
        driver.get('https://www.goodreads.com/')
        print(f"{datetime.now()}: Navigating to Goodreads homepage for cookie loading (attempt {attempt+1}).")
        time.sleep(3)
        driver.delete_all_cookies()
        print(f"{datetime.now()}: Cleared existing cookies.")
        for cookie in cookies:
            if 'expiry' in cookie:
                cookie['expiry'] = int(cookie['expiry'])
            driver.add_cookie(cookie)
        driver.refresh()
        print(f"{datetime.now()}: Loaded {len(cookies)} cookies and refreshed page.")
        time.sleep(3)

        if is_logged_in(driver):
            print(f"{datetime.now()}: Success: Cookie-based login successful!")
            return True
        else:
            print(f"{datetime.now()}: Cookie login failed (attempt {attempt+1}).")
            # Insight: Print page_source for debugging login failures
            print(f"{datetime.now()}: Debugging - Current page source:\n{driver.page_source[:500]}...")  # Truncated for brevity

    print(f"{datetime.now()}: Cookie login failed after retries. Falling back to auto-login if enabled.")
    if auto_login:
        return attempt_auto_login(driver)
    return False

def attempt_auto_login(driver):
    email = os.environ.get('GOODREADS_EMAIL')
    password = os.environ.get('GOODREADS_PASSWORD')
    if not email or not password:
        print(f"{datetime.now()}: ERROR: GOODREADS_EMAIL and GOODREADS_PASSWORD not set. Skipping auto-login.")
        return False
    print(f"{datetime.now()}: Attempting auto-login with provided credentials.")
    driver.get('https://www.goodreads.com/user/sign_in')
    time.sleep(3)
    driver.find_element(By.ID, 'user_email').send_keys(email)
    driver.find_element(By.ID, 'user_password').send_keys(password)
    driver.find_element(By.NAME, 'commit').click()
    time.sleep(5)
    return is_logged_in(driver)

def is_logged_in(driver):
    indicators = [
        (By.CLASS_NAME, 'siteHeader__personalMenu'),
        (By.XPATH, "//a[contains(@href, '/user/show/')]")
    ]
    for by, value in indicators:
        try:
            driver.find_element(by, value)
            return True
        except NoSuchElementException:
            continue
    return False

# Set up Selenium with anti-detection
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--disable-blink-features=AutomationControlled')
chrome_options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36')  # Updated to latest Chrome version

driver = webdriver.Chrome(options=chrome_options)
if not load_cookies(driver, auto_login=True):  # Enable auto-login fallback
    raise ValueError("Login failed. Cannot proceed with scraping.")

print("Cell 2: Login functions ready and login successful.")


=== Starting cookie/login process ===
2025-08-07 18:26:43.173389: Navigating to Goodreads homepage for cookie loading (attempt 1).
2025-08-07 18:26:46.441573: Cleared existing cookies.
2025-08-07 18:26:56.059926: Loaded 13 cookies and refreshed page.
2025-08-07 18:27:03.897012: Success: Cookie-based login successful!
Cell 2: Login functions ready and login successful.


# Cell 3: Main Scraping Functions and Execution

In [11]:
def extract_abstract(driver, soup):  # Insight: Pass driver for waits; use explicit wait from provided code
    print(f"{datetime.now()}: Extracting abstract.")
    try:
        element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".BookPageMetadataSection__description span.Formatted"))
        )
        text = element.text.strip()
        print(f"{datetime.now()}: Scraped description: {text[:100]}...")
        return text
    except TimeoutException:
        print(f"{datetime.now()}: No description found. Debugging - Current page source:\n{driver.page_source[:500]}...")
        return 'N/A'

def extract_core_identifiers(soup, url):
    print(f"{datetime.now()}: Extracting core identifiers.")
    title_elem = soup.find('h1', {'data-testid': 'bookTitle'}) or soup.find('h1', class_='gr-h1')
    title = title_elem.get_text(strip=True) if title_elem else 'N/A'
    print(f"{datetime.now()}: Scraped title: {title}")

    authors = [a.text.strip().replace('(Goodreads Author)', '').strip() for a in soup.find_all('span', {'data-testid': 'name'}) if 'Goodreads Author' not in a.text]
    author = authors[0] if authors else 'N/A'
    all_authors = '; '.join(authors) if len(authors) > 1 else author
    print(f"{datetime.now()}: Scraped author: {author}")
    print(f"{datetime.now()}: Scraped all authors: {all_authors}")

    book_id_match = re.search(r'(\d+)', url)
    book_id = book_id_match.group(1) if book_id_match else 'N/A'
    print(f"{datetime.now()}: Scraped book_id: {book_id}")

    return {
        'title': title,  # Updated: Use lowercase 'title'
        'author': author,
        'all_authors': all_authors,
        'book_id': book_id
    }

def extract_ratings_reviews(driver, soup):  # Updated: Parse histogram widths from provided HTML
    print(f"{datetime.now()}: Extracting ratings and reviews metadata.")
    rating_elem = soup.find('div', class_='RatingStatistics__rating')
    average_rating = float(rating_elem.get_text(strip=True)) if rating_elem else 'N/A'
    print(f"{datetime.now()}: Scraped average_rating: {average_rating}")

    ratings_count_elem = soup.find('span', {'data-testid': 'ratingsCount'})
    num_ratings = int(re.search(r'(\d+[\d,]*)', ratings_count_elem.get_text(strip=True)).group(1).replace(',', '')) if ratings_count_elem else 'N/A'
    print(f"{datetime.now()}: Scraped num_ratings: {num_ratings}")

    reviews_count_elem = soup.find('span', {'data-testid': 'reviewsCount'})
    num_reviews = int(re.search(r'(\d+[\d,]*)', reviews_count_elem.get_text(strip=True)).group(1).replace(',', '')) if reviews_count_elem else 'N/A'
    print(f"{datetime.now()}: Scraped num_reviews: {num_reviews}")

    distribution = {}
    try:
        # Wait for histogram
        WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.CLASS_NAME, 'RatingsHistogram')))
        histogram_bars = soup.find_all('div', class_='RatingsHistogram__bar')
        stars = [5, 4, 3, 2, 1]
        for i, bar in enumerate(histogram_bars):
            if i >= len(stars):
                break
            fill_elem = bar.find('div', {'data-testid': f'fill-{stars[i]}'})
            if fill_elem and 'style' in fill_elem.attrs:
                width_match = re.search(r'width:([\d.]+)%', fill_elem['style'])
                if width_match:
                    percentage = round(float(width_match.group(1)))  # Round to nearest int
                    distribution[f'{stars[i]}_star'] = percentage
        print(f"{datetime.now()}: Scraped rating_distribution: {distribution}")
    except TimeoutException:
        print(f"{datetime.now()}: Failed to load histogram. Debugging - Relevant page source:\n" +
              str(soup.find('div', class_='ReviewsSectionStatistics__histogram'))[:500] + "...")

    return {
        'average_rating': average_rating,
        'num_ratings': num_ratings,
        'num_reviews': num_reviews,
        'rating_distribution': distribution
    }

def extract_publication_details(soup):
    print(f"{datetime.now()}: Extracting publication details.")
    pub_info_elem = soup.find('p', {'data-testid': 'publicationInfo'})
    pub_text = pub_info_elem.get_text(strip=True) if pub_info_elem else ''
    print(f"{datetime.now()}: Scraped pub_info: {pub_text}")

    year_match = re.search(r'(\d{4})', pub_text)
    pub_date = int(year_match.group(1)) if year_match else 'N/A'
    print(f"{datetime.now()}: Scraped pub_date: {pub_date}")

    publisher = pub_text.split('by ')[-1].strip() if 'by ' in pub_text else 'N/A'
    print(f"{datetime.now()}: Scraped publisher: {publisher}")

    format_elem = soup.find('p', {'data-testid': 'pagesFormat'})
    format_text = format_elem.get_text(strip=True) if format_elem else ''
    print(f"{datetime.now()}: Scraped format_info: {format_text}")

    pages_match = re.search(r'(\d+) pages', format_text)
    page_count = int(pages_match.group(1)) if pages_match else 'N/A'
    print(f"{datetime.now()}: Scraped page_count: {page_count}")

    return {
        'pub_info': pub_text,
        'pub_date': pub_date,
        'publisher': publisher,
        'isbn': soup.find('span', itemprop='isbn').text if soup.find('span', itemprop='isbn') else 'N/A',
        'format_info': format_text,
        'page_count': page_count,
        'language': 'English'  # Default or parse if available
    }

def extract_genres_shelves(driver, soup, shelves_url):  # Updated: Scrape full shelves from shelves page
    print(f"{datetime.now()}: Extracting genres and shelves.")
    # Initial genres from main page
    genres = []
    genre_section = soup.find('div', class_='BookPageMetadataSection__genres')
    if genre_section:
        genre_links = genre_section.find_all('a', class_='Button')
        genres = [link.text.strip() for link in genre_links[:10]]  # Top 10
    primary_genre = genres[0] if genres else 'N/A'
    genres_str = '; '.join(genres) if genres else 'N/A'
    print(f"{datetime.now()}: Scraped initial genres (main page): {genres_str}")

    # Navigate to shelves page for full list
    all_shelves = 'N/A'
    if shelves_url:
        print(f"{datetime.now()}: Navigating to shelves URL: {shelves_url}")
        driver.get(shelves_url)
        time.sleep(random.uniform(4, 7))
        # Expand on shelves page
        try:
            shelves_expand = driver.find_elements(By.XPATH, "//div[@class='TruncatedContent']//button[contains(@aria-label, 'Show all') or contains(text(), 'more')]")
            for btn in shelves_expand:
                driver.execute_script("arguments[0].click();", btn)
                time.sleep(2)
        except:
            print(f"{datetime.now()}: No expanders on shelves page.")

        shelves_soup = BeautifulSoup(driver.page_source, 'html.parser')
        # Scrape all shelves (assuming listed in <li> or <a> in a shelves container; adjust if needed)
        shelves_elements = shelves_soup.select('div.shelfList a, li.shelfItem')  # Common selectors for shelves
        all_shelves_list = [elem.text.strip() for elem in shelves_elements if elem.text.strip()]
        all_shelves = '; '.join(all_shelves_list) if all_shelves_list else 'N/A'
        print(f"{datetime.now()}: Scraped all shelves (from shelves page): {all_shelves[:200]}...")  # Truncated print

    return {
        'genres': genres_str,
        'primary_genre': primary_genre,
        'all_shelves': all_shelves  # New field
    }

def extract_characters_and_places(driver, soup, shelves_soup):  # Updated: Parse from shelves page if needed
    print(f"{datetime.now()}: Extracting characters and places from main and shelves page.")
    characters_str = 'N/A'
    places_str = 'N/A'
    literary_awards = 'N/A'  # New field

    # Parse from main soup first
    characters_section = soup.find('dt', string=re.compile('Characters', re.I))
    if characters_section:
        dd = characters_section.find_next_sibling('dd')
        if dd:
            character_links = dd.select('div.TruncatedContent__text.TruncatedContent__text--small[data-testid="contentContainer"] a')
            characters_str = '; '.join([link.text.strip() for link in character_links if link.text.strip()]) or 'N/A'

    places_section = soup.find('dt', string=re.compile('Places|Setting', re.I))
    if places_section:
        dd = places_section.find_next_sibling('dd')
        if dd:
            place_links = dd.select('div.TruncatedContent__text.TruncatedContent__text--small[data-testid="contentContainer"] a')
            places_str = '; '.join([link.text.strip() for link in place_links if link.text.strip()]) or 'N/A'

    # Fallback to shelves_soup (work details page)
    if (characters_str == 'N/A' or places_str == 'N/A') and shelves_soup:
        # Characters
        characters_section = shelves_soup.find('dt', string=re.compile('Characters', re.I))
        if characters_section:
            dd = characters_section.find_next_sibling('dd')
            if dd:
                character_links = dd.select('div.TruncatedContent__text.TruncatedContent__text--small[data-testid="contentContainer"] a')
                characters_str = '; '.join([link.text.strip() for link in character_links if link.text.strip()]) or 'N/A'

        # Places (from Setting)
        places_section = shelves_soup.find('dt', string=re.compile('Setting|Places', re.I))
        if places_section:
            dd = places_section.find_next_sibling('dd')
            if dd:
                place_links = dd.select('div.TruncatedContent__text.TruncatedContent__text--small[data-testid="contentContainer"] a')
                places_str = '; '.join([link.text.strip() for link in place_links if link.text.strip()]) or 'N/A'

        # Literary awards (new)
        awards_section = shelves_soup.find('dt', string=re.compile('Literary awards', re.I))
        if awards_section:
            dd = awards_section.find_next_sibling('dd')
            if dd:
                awards_spans = dd.select('span[data-testid="award"] a')  # Updated to select <a> inside spans
                literary_awards = '; '.join([span.get_text(strip=True) for span in awards_spans if span.get_text(strip=True)]) or 'N/A'

    print(f"{datetime.now()}: Scraped characters: {characters_str}")
    print(f"{datetime.now()}: Scraped places: {places_str}")
    print(f"{datetime.now()}: Scraped literary awards: {literary_awards}")

    return {
        'characters': characters_str,
        'places': places_str,
        'literary_awards': literary_awards  # New field
    }


def collect_reviews(driver, book_id, max_reviews=50):  # Reduced for testing; change to 100 for full
    book_id = str(int(book_id))  # Fix: Ensure clean integer string (removes .0)
    reviews_url = f"https://www.goodreads.com/book/show/{book_id}/reviews?review_filters=has_user_rating"
    print(f"{datetime.now()}: Navigating to reviews URL: {reviews_url}")
    driver.get(reviews_url)
    time.sleep(random.uniform(5, 7))  # Longer initial wait for load

    # Wait for reviews container
    try:
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'ReviewsSection')))
    except TimeoutException:
        print(f"{datetime.now()}: Reviews container not found. Refreshing page once.")
        driver.refresh()
        time.sleep(random.uniform(5, 7))

    reviews = []
    loaded = 0
    pbar = tqdm(total=max_reviews, desc="Collecting reviews")
    retries = 0
    max_retries = 3
    while loaded < max_reviews and retries < max_retries:
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        review_elements = soup.find_all('section', class_='ReviewCard')[loaded:]
        for rev in review_elements:
            rating_elem = rev.find('span', class_='RatingStars')
            rating = rating_elem['aria-label'] if rating_elem else 'N/A'
            text_elem = rev.find('span', class_='ReviewText')
            text = text_elem.get_text(strip=True) if text_elem else 'N/A'
            reviews.append({'rating': rating, 'text': text})
            loaded += 1
            pbar.update(1)
            if loaded >= max_reviews:
                break

        try:
            load_more = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), 'Load More') or @aria-label='Load more reviews']")))
            print(f"{datetime.now()}: Clicking 'Load More' (loaded so far: {loaded})")
            driver.execute_script("arguments[0].click();", load_more)
            time.sleep(random.uniform(2, 4) * (retries + 1))  # Exponential backoff
            retries = 0
        except TimeoutException:
            retries += 1
            print(f"{datetime.now()}: No more 'Load More' button or timeout (retry {retries}/{max_retries}).")
            time.sleep(2)
            if retries >= max_retries:
                print(f"{datetime.now()}: Max retries reached. Stopping review collection.")
                break
    pbar.close()

    # Print scraped reviews summary and sample
    print(f"{datetime.now()}: Collected {len(reviews)} reviews.")
    if len(reviews) < max_reviews:
        print(f"{datetime.now()}: WARNING: Fewer reviews collected than max ({max_reviews}). Possible page issue.")
    if reviews:
        print(f"{datetime.now()}: Sample of first 5 reviews:")
        for idx, rev in enumerate(reviews[:5]):
            print(f"Review {idx+1}: Rating = {rev['rating']}, Text = {rev['text'][:100]}...")  # First 100 chars for brevity

    return reviews


def enrich_book_metadata(row):
    start_time = datetime.now()
    print(f"\n{start_time}: Starting metadata enrichment for book: {row['title']} (URL: {row['url']})") # Updated: Use lowercase 'title' and 'url'

    for attempt in range(3):  # Increased to 3 retries for page load
        try:
            driver.get(row['url']) # Updated: Use lowercase 'url'
            time.sleep(random.uniform(6, 10))  # Increased initial load time

            # Global expand (for main page)
            print(f"{datetime.now()}: Expanding global sections...")
            for expand_attempt in range(3):  # 3 retries for expansion
                try:
                    # Separate XPaths to avoid invalid selector
                    button_xpath = "//div[contains(@class, 'BookPageMetadataSection')]//button[contains(text(), 'more') or contains(text(), 'More') or @aria-label='Show all' or contains(@class, 'ExpandableContent__button') or @aria-label='Show all items in the list']"
                    a_xpath = "//a[contains(@href, '/work/shelves/') and contains(@aria-label, 'show all top genres')]"
                    expand_elements = driver.find_elements(By.XPATH, f"{button_xpath} | {a_xpath}")  # Use | for union
                    if not expand_elements:
                        print(f"{datetime.now()}: No expand elements found (attempt {expand_attempt+1}). Refreshing page.")
                        driver.refresh()
                        time.sleep(3)
                        continue

                    for elem in expand_elements:
                        try:
                            driver.execute_script("arguments[0].scrollIntoView();", elem)
                            WebDriverWait(driver, 10).until(EC.invisibility_of_element_located((By.XPATH, "//span[@tabindex='-1']")))
                            WebDriverWait(driver, 10).until(EC.element_to_be_clickable(elem))
                            try:
                                elem.click()
                                print(f"{datetime.now()}: Successfully clicked expand element.")
                            except ElementClickInterceptedException:
                                print(f"{datetime.now()}: Click intercepted (attempt {expand_attempt+1}). Using JS fallback.")
                                driver.execute_script("arguments[0].click();", elem)
                            except StaleElementReferenceException:
                                print(f"{datetime.now()}: Stale element during click (attempt {expand_attempt+1}). Skipping this element.")
                                continue
                            except InvalidSelectorException as ise:
                                print(f"{datetime.now()}: Invalid selector during expand click: {ise} (attempt {expand_attempt+1}). Skipping this element.")
                                continue
                            time.sleep(2)
                        except TimeoutException:
                            print(f"{datetime.now()}: Timeout on expand element (attempt {expand_attempt+1}). Element type: {elem.tag_name}")
                            continue
                        except StaleElementReferenceException:
                            print(f"{datetime.now()}: Stale element detected during expansion (attempt {expand_attempt+1}). Relocating...")
                            break  # Break to re-locate all elements
                        except Exception as e:
                            print(f"{datetime.now()}: Unexpected error on expand: {e} (attempt {expand_attempt+1}).")
                    time.sleep(3)  # Delay between attempts
                except InvalidSelectorException as e:
                    print(f"{datetime.now()}: Invalid XPath selector: {e}. Skipping expansion.")
                    break

            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Updated: Extract shelves URL with more flexible selector
            shelves_link = soup.find('a', href=re.compile(r'/work/shelves/'), attrs={'aria-label': re.compile(r'(Tap to show all|show all top genres)', re.I)}) or \
                           soup.find('a', class_=re.compile(r'Button--tag'), string=re.compile(r'\.\.\.show all')) or \
                           soup.find('a', href=re.compile(r'/work/shelves/'))  # Fallback to any matching href
            shelves_url = 'https://www.goodreads.com' + shelves_link['href'] if shelves_link else None
            if shelves_url:
                print(f"{datetime.now()}: Extracted shelves URL: {shelves_url}")
            else:
                genres_section_html = str(soup.find('div', class_='BookPageMetadataSection__genres'))[:500] if soup.find('div', class_='BookPageMetadataSection__genres') else ""
                print(f"{datetime.now()}: Failed to extract shelves URL. Debugging - Genres section HTML:\n{genres_section_html}...")

            # Navigate to shelves page and parse
            shelves_soup = None  # Reset
            if shelves_url:
                driver.get(shelves_url)
                time.sleep(random.uniform(4, 7))
                # Expand on shelves page
                try:
                    shelves_expand = driver.find_elements(By.XPATH, "//div[@class='TruncatedContent']//button[contains(@aria-label, 'Show all') or contains(text(), 'more')]")
                    for btn in shelves_expand:
                        driver.execute_script("arguments[0].click();", btn)
                        time.sleep(2)
                except:
                    print(f"{datetime.now()}: No expanders on shelves page.")
                shelves_soup = BeautifulSoup(driver.page_source, 'html.parser')

            metadata = extract_core_identifiers(soup, row['url']) # Updated: Use lowercase 'url'
            metadata.update(extract_ratings_reviews(driver, soup))
            metadata.update(extract_publication_details(soup))
            metadata.update(extract_genres_shelves(driver, soup, shelves_url))
            metadata.update(extract_characters_and_places(driver, soup, shelves_soup))
            metadata['description'] = extract_abstract(driver, soup)

            duration = (datetime.now() - start_time).total_seconds()
            print(f"{datetime.now()}: Metadata enrichment complete for {row['title']} (took {duration:.2f} seconds).") # Updated: Use lowercase 'title'
            return pd.Series(metadata)
        except Exception as e:
            print(f"{datetime.now()}: Error loading page for {row['title']} (attempt {attempt+1}): {e}. Retrying...") # Updated: Use lowercase 'title'
            time.sleep(5)
    raise ValueError(f"Failed to enrich metadata for {row['title']} after retries.") # Updated: Use lowercase 'title'

# --- Re-initialize driver at the start of the cell ---
print(f"{datetime.now()}: Re-initializing Selenium driver.")
# Set up Selenium with anti-detection
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--disable-blink-features=AutomationControlled')
chrome_options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36')  # Updated to latest Chrome version

# Ensure driver is properly initialized using webdriver-manager
from webdriver_manager.chrome import ChromeDriverManager
driver = webdriver.Chrome(service=webdriver.chrome.service.Service(ChromeDriverManager().install()), options=chrome_options)

# Reload cookies and attempt login
if not load_cookies(driver, auto_login=True):  # Enable auto-login fallback
    raise ValueError("Login failed. Cannot proceed with scraping.")
print(f"{datetime.now()}: Driver re-initialized and login re-attempted.")
# --- End of driver re-initialization ---


# Load CSV
csv_path = '/content/drive/MyDrive/Romantic_Extened_Dataset_Scraping/romantic_books_final_scraped.csv'
df = pd.read_csv(csv_path)
print(f"{datetime.now()}: Loaded DataFrame with {len(df)} rows.")

# For testing: Process only first 3 rows (uncomment to enable)
df = df.head(3)  # Enabled for testing
print(f"{datetime.now()}: Testing on sample of {len(df)} books (first 3 rows).")

# Insight: Add success/fail counters (inspired by provided code)
success_count = {'abstract': 0, 'characters': 0, 'distribution': 0, 'total': 0}
fail_count = {'abstract': 0, 'characters': 0, 'distribution': 0, 'total': 0}

# Enrich metadata with progress bar
enriched_data = []
for _, row in tqdm(df.iterrows(), total=len(df), desc="Enriching metadata for books"):
    try:
        enriched = enrich_book_metadata(row)
        enriched_data.append(enriched)
        # Count successes (check if key fields were scraped successfully)
        if enriched.get('description') != 'N/A':
            success_count['abstract'] += 1
        else:
            fail_count['abstract'] += 1
        if enriched.get('characters') != 'N/A':
            success_count['characters'] += 1
        else:
            fail_count['characters'] += 1
        if enriched.get('rating_distribution'):
            success_count['distribution'] += 1
        else:
            fail_count['distribution'] += 1
        success_count['total'] += 1
    except Exception as e:
        print(f"{datetime.now()}: Error enriching metadata for {row['title']}: {e}. Skipping.") # Updated: Use lowercase 'title'
        fail_count['total'] += 1
df = pd.concat([df] + enriched_data, axis=1)

# Collect reviews for each book with progress
for i, row in tqdm(df.iterrows(), total=len(df), desc="Collecting reviews for books"):
    try:
        start_time = datetime.now()
        print(f"\n{start_time}: Starting review collection for book: {row['title']} (ID: {row['book_id']})") # Updated: Use lowercase 'title'
        df.at[i, 'reviews'] = json.dumps(collect_reviews(driver, row['book_id']))
        duration = (datetime.now() - start_time).total_seconds()
        print(f"{datetime.now()}: Review collection complete for {row['title']} (took {duration:.2f} seconds).") # Updated: Use lowercase 'title'
        success_count['total'] += 1  # Count as overall success
    except Exception as e:
        print(f"{datetime.now()}: Error collecting reviews for {row['title']}: {e}. Skipping.") # Updated: Use lowercase 'title'
        fail_count['total'] += 1

# Insight: Print success/fail summary
print(f"{datetime.now()}: Scraping summary - Abstracts: {success_count['abstract']} success, {fail_count['abstract']} fails | "
      f"Characters: {success_count['characters']} success, {fail_count['characters']} fails | "
      f"Distributions: {success_count['distribution']} success, {fail_count['distribution']} fails | "
      f"Total: {success_count['total']} success, {fail_count['total']} fails.")

# Save enriched CSV
enriched_path = '/content/drive/MyDrive/Romantic_Extened_Dataset_Scraping/romantic_books_enriched.csv'
df.to_csv(enriched_path, index=False)
print(f"{datetime.now()}: Enriched CSV saved to {enriched_path}")

driver.quit()
print(f"{datetime.now()}: Cell 3: Enrichment complete.")

2025-08-07 18:28:04.138573: Re-initializing Selenium driver.

=== Starting cookie/login process ===
2025-08-07 18:28:07.511420: Navigating to Goodreads homepage for cookie loading (attempt 1).
2025-08-07 18:28:10.736139: Cleared existing cookies.
2025-08-07 18:28:12.006636: Loaded 13 cookies and refreshed page.
2025-08-07 18:28:15.775396: Success: Cookie-based login successful!
2025-08-07 18:28:15.775669: Driver re-initialized and login re-attempted.
2025-08-07 18:28:19.327494: Loaded DataFrame with 2860 rows.
2025-08-07 18:28:19.328821: Testing on sample of 3 books (first 3 rows).


Enriching metadata for books:   0%|          | 0/3 [00:00<?, ?it/s]


2025-08-07 18:28:19.348729: Starting metadata enrichment for book: The Love Hypothesis (Paperback) (URL: https://www.goodreads.com/book/show/56732449-the-love-hypothesis)
2025-08-07 18:28:30.295430: Expanding global sections...
2025-08-07 18:28:40.875392: Timeout on expand element (attempt 1). Element type: button
2025-08-07 18:28:54.376013: Timeout on expand element (attempt 2). Element type: button
2025-08-07 18:29:07.411567: Timeout on expand element (attempt 3). Element type: button


Enriching metadata for books:  33%|███▎      | 1/3 [00:51<01:43, 51.79s/it]

2025-08-07 18:29:11.003134: Failed to extract shelves URL. Debugging - Genres section HTML:
<div class="BookPageMetadataSection__genres" data-testid="genresList"><ul aria-label="Top genres for this book" class="CollapsableList"><span tabindex="-1"><span class="BookPageMetadataSection__genrePlainText"><span class="Text Text__body3 Text__subdued">Genres</span></span><span class="BookPageMetadataSection__genreButton"><a class="Button Button--tag Button--medium" href="https://www.goodreads.com/genres/romance"><span class="Button__labelItem">Romance</span></a></span><span class="BookPageMet...
2025-08-07 18:29:11.003403: Extracting core identifiers.
2025-08-07 18:29:11.004012: Scraped title: The Love Hypothesis
2025-08-07 18:29:11.013335: Scraped author: Ali Hazelwood
2025-08-07 18:29:11.013394: Scraped all authors: Ali Hazelwood; Ali Hazelwood
2025-08-07 18:29:11.013583: Scraped book_id: 56732449
2025-08-07 18:29:11.013604: Extracting ratings and reviews metadata.
2025-08-07 18:29:11.01453

Enriching metadata for books:  67%|██████▋   | 2/3 [01:47<00:54, 54.29s/it]

2025-08-07 18:30:07.106148: Failed to extract shelves URL. Debugging - Genres section HTML:
<div class="BookPageMetadataSection__genres" data-testid="genresList"><ul aria-label="Top genres for this book" class="CollapsableList"><span tabindex="-1"><span class="BookPageMetadataSection__genrePlainText"><span class="Text Text__body3 Text__subdued">Genres</span></span><span class="BookPageMetadataSection__genreButton"><a class="Button Button--tag Button--medium" href="https://www.goodreads.com/genres/romance"><span class="Button__labelItem">Romance</span></a></span><span class="BookPageMet...
2025-08-07 18:30:07.106554: Extracting core identifiers.
2025-08-07 18:30:07.107029: Scraped title: The Hating Game
2025-08-07 18:30:07.112386: Scraped author: Sally  Thorne
2025-08-07 18:30:07.112435: Scraped all authors: Sally  Thorne; Sally  Thorne
2025-08-07 18:30:07.112466: Scraped book_id: 27213238
2025-08-07 18:30:07.112484: Extracting ratings and reviews metadata.
2025-08-07 18:30:07.113298: S

Enriching metadata for books: 100%|██████████| 3/3 [02:38<00:00, 52.76s/it]


2025-08-07 18:30:57.529046: Failed to extract shelves URL. Debugging - Genres section HTML:
<div class="BookPageMetadataSection__genres" data-testid="genresList"><ul aria-label="Top genres for this book" class="CollapsableList"><span tabindex="-1"><span class="BookPageMetadataSection__genrePlainText"><span class="Text Text__body3 Text__subdued">Genres</span></span><span class="BookPageMetadataSection__genreButton"><a class="Button Button--tag Button--medium" href="https://www.goodreads.com/genres/fiction"><span class="Button__labelItem">Fiction</span></a></span><span class="BookPageMet...
2025-08-07 18:30:57.529263: Extracting core identifiers.
2025-08-07 18:30:57.529724: Scraped title: Beach Read
2025-08-07 18:30:57.536156: Scraped author: Emily Henry
2025-08-07 18:30:57.536204: Scraped all authors: Emily Henry; Emily Henry
2025-08-07 18:30:57.536233: Scraped book_id: 52867387
2025-08-07 18:30:57.536247: Extracting ratings and reviews metadata.
2025-08-07 18:30:57.537065: Scraped aver

Collecting reviews for books:   0%|          | 0/25 [00:00<?, ?it/s]


2025-08-07 18:30:57.656323: Starting review collection for book: The Love Hypothesis (Paperback) (ID: 56732449.0)
2025-08-07 18:30:57.656481: Navigating to reviews URL: https://www.goodreads.com/book/show/56732449/reviews?review_filters=has_user_rating
2025-08-07 18:31:16.137417: Reviews container not found. Refreshing page once.



Collecting reviews:   0%|          | 0/50 [00:00<?, ?it/s][A

2025-08-07 18:31:35.147623: No more 'Load More' button or timeout (retry 1/3).
2025-08-07 18:31:47.965931: No more 'Load More' button or timeout (retry 2/3).
2025-08-07 18:32:00.558759: No more 'Load More' button or timeout (retry 3/3).


Collecting reviews:   0%|          | 0/50 [00:38<?, ?it/s]
Collecting reviews for books:   4%|▍         | 1/25 [01:04<25:57, 64.91s/it]

2025-08-07 18:32:02.559013: Max retries reached. Stopping review collection.
2025-08-07 18:32:02.560191: Collected 0 reviews.
2025-08-07 18:32:02.567595: Review collection complete for The Love Hypothesis (Paperback) (took 64.91 seconds).

2025-08-07 18:32:02.568775: Starting review collection for book: The Hating Game (Paperback) (ID: 27213238.0)
2025-08-07 18:32:02.568852: Navigating to reviews URL: https://www.goodreads.com/book/show/27213238/reviews?review_filters=has_user_rating
2025-08-07 18:32:20.129830: Reviews container not found. Refreshing page once.



Collecting reviews:   0%|          | 0/50 [00:00<?, ?it/s][A

2025-08-07 18:32:37.454524: No more 'Load More' button or timeout (retry 1/3).


Collecting reviews for books:   4%|▍         | 1/25 [01:48<43:27, 108.64s/it]Exception ignored in: <generator object tqdm.__iter__ at 0x7a034706e180>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/tqdm/std.py", line 1196, in __iter__
    self.close()
  File "/usr/local/lib/python3.11/dist-packages/tqdm/std.py", line 1302, in close
    self.display(pos=0)
  File "/usr/local/lib/python3.11/dist-packages/tqdm/std.py", line 1495, in display
    self.sp(self.__str__() if msg is None else msg)
  File "/usr/local/lib/python3.11/dist-packages/tqdm/std.py", line 459, in print_status
    fp_write('\r' + s + (' ' * max(last_len[0] - len_s, 0)))
  File "/usr/local/lib/python3.11/dist-packages/tqdm/std.py", line 453, in fp_write
    fp_flush()
  File "/usr/local/lib/python3.11/dist-packages/tqdm/utils.py", line 196, in inner
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/ipykernel/iostream.py", l

KeyboardInterrupt: 