In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from datetime import datetime
import os
from requests import get
import json
import bs4
import glob
import ast

# Lift every pandas display limit so frames are shown in full
# (all rows, all columns, untruncated cell text).
for _display_option in ('display.max_rows', 'display.max_columns', 'display.max_colwidth'):
    pd.set_option(_display_option, None)

In [None]:
def get_mybooks(new_export_path, download_dir=None, timeout=300):
    """Download today's Goodreads library export and load it as a DataFrame.

    Opens Chrome on the Goodreads sign-in page and waits up to 60 s for the
    logged-in home page to appear (this code enters no credentials, so the
    sign-in is presumably done by hand in the opened window). It then requests
    a new export if none dated today is listed, downloads it, and moves the
    file to ``new_export_path``.

    Parameters
    ----------
    new_export_path : str
        Destination path of the downloaded CSV.
    download_dir : str, optional
        Directory Chrome downloads into. Defaults to ``data`` under the current
        working directory (previously a machine-specific absolute path).
    timeout : float, optional
        Maximum number of seconds to wait for the export to be generated, and
        then again for the download to land on disk.

    Returns
    -------
    pandas.DataFrame
        The contents of the export CSV.

    Raises
    ------
    TimeoutError
        If the export link or the downloaded file does not appear in time.
    """
    download_dir = os.path.abspath('data' if download_dir is None else download_dir)
    os.makedirs(download_dir, exist_ok=True)
    goodreads_export = os.path.join(download_dir, 'goodreads_library_export.csv')

    # Remove the previous download so the new one is saved under the expected name.
    if os.path.isfile(goodreads_export):
        os.remove(goodreads_export)

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_experimental_option('prefs', {'download.default_directory': download_dir})
    driver = webdriver.Chrome(options=chrome_options)

    try:
        login_url = 'https://www.goodreads.com/ap/signin?language=en_US&openid.assoc_handle=amzn_goodreads_web_na&openid.claimed_id=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.identity=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0%2Fidentifier_select&openid.mode=checkid_setup&openid.ns=http%3A%2F%2Fspecs.openid.net%2Fauth%2F2.0&openid.pape.max_auth_age=0&openid.return_to=https%3A%2F%2Fwww.goodreads.com%2Fap-handler%2Fsign-in'
        driver.get(login_url)
        WebDriverWait(driver, 60).until(EC.presence_of_element_located((By.CLASS_NAME, "homePrimaryColumn")))

        driver.get('https://www.goodreads.com/review/import')
        # Wait for the export list itself instead of sleeping a fixed time.
        file_list = WebDriverWait(driver, 60).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'fileList'))
        )

        current_date = datetime.now().strftime('%m/%d/%Y')
        if current_date not in file_list.text:
            driver.find_element(By.CLASS_NAME, 'js-LibraryExport').click()

        # Poll until an export dated today is listed, then click its link.
        deadline = time.monotonic() + timeout
        while True:
            # Re-locate the element on every pass; the page may re-render it.
            file_list = driver.find_element(By.CLASS_NAME, 'fileList')
            if current_date in file_list.text:
                file_list.find_element(By.TAG_NAME, 'a').click()
                break
            if time.monotonic() >= deadline:
                raise TimeoutError(f'Goodreads export was not ready after {timeout} seconds')
            time.sleep(3)

        # Wait for the finished file rather than assuming a fixed delay is enough.
        deadline = time.monotonic() + timeout
        while not os.path.isfile(goodreads_export):
            if time.monotonic() >= deadline:
                raise TimeoutError(f'{goodreads_export} was not downloaded after {timeout} seconds')
            time.sleep(1)
    finally:
        # Always release the browser, including when a wait above fails.
        driver.quit()

    os.replace(goodreads_export, new_export_path)
    return pd.read_csv(new_export_path)

In [None]:
def check_mybooks(new_export_path, this_months_scrape_path):
    """Work out which books in the library export still need to be scraped.

    A book counts as already scraped when ``metadata/<id>_metadata.json``
    exists or when its id appears in the ``book_id`` column of
    ``this_months_scrape_path``.

    Parameters
    ----------
    new_export_path : str
        Path of the Goodreads export CSV. If the file is missing, a new export
        is fetched with ``get_mybooks``.
    this_months_scrape_path : str
        Path of the CSV of scraped books; it may not exist yet.

    Returns
    -------
    tuple[list, list]
        ``(book_ids, books_to_scrape)``: every ``Book Id`` in the export, and
        the subset that has not been scraped, both in export order.
    """
    books_already_scraped = set()
    # A missing directory simply means nothing has been scraped yet.
    if os.path.isdir('metadata'):
        for file_name in os.listdir('metadata'):
            try:
                books_already_scraped.add(int(file_name.replace('_metadata.json', '')))
            except ValueError:
                # Stray entries (e.g. .DS_Store) are not book metadata; skip them.
                continue

    try:
        goodreads_export = pd.read_csv(new_export_path)
    except FileNotFoundError:
        goodreads_export = get_mybooks(new_export_path)

    try:
        recent_df = pd.read_csv(this_months_scrape_path)
    except FileNotFoundError:
        pass  # no scrape file yet: only the metadata directory counts
    else:
        books_already_scraped.update(recent_df['book_id'].tolist())

    book_ids = goodreads_export['Book Id'].tolist()
    books_to_scrape = [book_id for book_id in book_ids if book_id not in books_already_scraped]

    return book_ids, books_to_scrape

In [None]:
# Scratch check: set.remove() raises KeyError when the element is absent,
# which stops a Run All; set.discard() is the no-error equivalent.
test = set()
test.discard('a')

In [None]:
import pandas as pd

# Scratch cell: load data/books.csv and display it.
# The commented-out lines below are one-off clean-up steps kept for reference;
# none of them run.
df = pd.read_csv('data/books.csv')
# df = df[~df['similar_books'].isna()]
# df['year'] = df['year'].astype(int)
# df = df.drop(columns='Unnamed: 0')
# df = df.drop_duplicates(subset=['id'])
# The bare `df` displays the frame; the trailing comment is a disabled export.
df#.to_csv('data/goodreads_books_data.csv', index=False)

# Disabled: normalise the export's column names to snake_case.
# df = pd.read_csv('data/goodreads_library_export.csv')
# df.columns = [col.lower().replace(' ','_') for col in df.columns]
# df#.to_csv('data/goodreads_library_export.csv', index=False)

In [None]:
import asyncio
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
import json
import nest_asyncio

# Apply nest_asyncio so asyncio can be run from inside Jupyter/IPython.
# This is often necessary before calling asyncio.run() on Playwright's async
# functions from a notebook cell.
nest_asyncio.apply()

# --- DOM extraction helper ---

def extract_dom_data(page, book_data):
    """Parse a Goodreads book page and store the extracted fields in ``book_data``.

    Parameters
    ----------
    page : str, bytes, or object with a synchronous ``content()`` method
        Either the page HTML itself, or a page object whose ``content()``
        returns the HTML directly. For Playwright's async API, pass
        ``await page.content()`` rather than the page object.
    book_data : dict
        Updated in place and returned.

    Returns
    -------
    dict
        ``book_data`` with these keys set:
        ``1_star`` .. ``5_star`` (int, 0 when missing or unparsable),
        ``genres`` (labels joined with ``|``), ``series`` (last path segment of
        the series link, or ``""``), ``year`` (text after the last ``", "`` of
        the publication info, or ``""``) and ``description`` (or ``""``).

    Raises
    ------
    TypeError
        If ``page.content()`` returns an awaitable instead of HTML.
    """
    import inspect

    if isinstance(page, (str, bytes)):
        # The caller already holds the HTML.
        html_content = page
    else:
        html_content = page.content()
        if inspect.isawaitable(html_content):
            if inspect.iscoroutine(html_content):
                html_content.close()  # avoid a "never awaited" warning
            raise TypeError(
                "page.content() returned an awaitable; pass `await page.content()` instead"
            )

    soup = BeautifulSoup(html_content, "html.parser")

    # --- Star counts ---
    stars = {}
    for i in range(1, 6):
        label = soup.find(attrs={"data-testid": f"labelTotal-{i}"})
        # The first whitespace-separated token is the count, e.g. "1,234".
        tokens = label.get_text().split() if label else []
        text = tokens[0].replace(",", "") if tokens else ""
        stars[f"{i}_star"] = int(text) if text.isdigit() else 0

    book_data.update(stars)

    # --- Genres ---
    genre_nodes = soup.select(".BookPageMetadataSection__genreButton .Button__labelItem")
    # "...more" is the expand button, not a genre.
    genres = [node.get_text() for node in genre_nodes if node.get_text() != "...more"]
    book_data['genres'] = "|".join(genres)

    # --- Series ---
    series_el = soup.select_one("h3.Text__italic a")
    if series_el and series_el.get('href'):
        book_data['series'] = series_el['href'].split('/')[-1]
    else:
        book_data['series'] = ""

    # --- Publication Year ---
    pub_el = soup.find(attrs={"data-testid": "publicationInfo"})
    if pub_el:
        book_data['year'] = pub_el.get_text().split(", ")[-1].strip()
    else:
        book_data['year'] = ""

    # --- Description ---
    desc_el = soup.select_one("[data-testid='description'] span.Formatted")

    if not desc_el:
        # Fall back to the alternative description container.
        desc_el = soup.select_one(".DetailsLayoutRightParagraph__widthConstrained span.Formatted")

    if desc_el:
        book_data['description'] = desc_el.get_text(separator="\n", strip=True)
    else:
        book_data['description'] = ""

    return book_data

# --- Asynchronous function to run Playwright ---

async def run_scraper(url="https://www.goodreads.com/book/show/122449053-biblioteca-de-manualidades-cortinas-y-estores?from_search=true&from_srp=mn95XrkXj6&qid=3"):
    """Fetch one Goodreads book page with Playwright and print the extracted data.

    Parameters
    ----------
    url : str, optional
        Book page to scrape. Defaults to the sample page used for testing.

    The rendered HTML is handed to ``extract_dom_data`` as a string and the
    resulting dict is printed as indented JSON. Nothing is returned.
    """
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"

    print("🚀 Launching Playwright browser...")
    async with async_playwright() as p:
        # Headless Chromium for speed.
        browser = await p.chromium.launch(headless=True)
        try:
            # Set the user agent on the page itself rather than as an extra header.
            page = await browser.new_page(user_agent=user_agent)

            print(f"🌍 Navigating to: {url}")
            # Wait until the network is idle so client-side rendered content
            # has likely loaded before the HTML is read.
            await page.goto(url, wait_until="networkidle")

            html_content = await page.content()
        finally:
            # Close the browser even if navigation fails or times out.
            await browser.close()
            print("✅ Browser closed.")

    # Parsing needs only the HTML string, so it runs after the browser is gone.
    extracted_data = extract_dom_data(html_content, {})

    print("\n✨ Extracted Data:")
    print(json.dumps(extracted_data, indent=4, ensure_ascii=False))

# Run the async scraper to completion from this notebook cell.
# nest_asyncio.apply() earlier in this cell is what allows asyncio.run() here.
asyncio.run(run_scraper())

In [None]:
# main
# Sample the clock once so both file names come from the same instant.
run_started = datetime.now()
this_month = run_started.strftime('%m-%Y')
this_day = run_started.strftime('%d-%m-%Y')
new_export_path = f'data/{this_day}_goodreads_library_export.csv'
this_months_scrape_path = f'data/{this_month}_goodreads_scraped.csv'

# The per-book JSON files are written here; make sure the directory exists.
os.makedirs('metadata', exist_ok=True)

# Books still pending after the previous pass. If a pass makes no progress the
# loop stops instead of re-scraping the same failing books forever.
previous_pending = None

while True:
    book_ids, books_to_scrape = check_mybooks(new_export_path, this_months_scrape_path)
    if not books_to_scrape:
        # delete_metadata()
        print('ALL BOOKS HAVE BEEN SCRAPED')
        break

    pending = set(books_to_scrape)
    if pending == previous_pending:
        print(f'Giving up on {len(pending)} book(s) that could not be scraped: {sorted(pending)}')
        break
    previous_pending = pending

    for i, book_id in enumerate(books_to_scrape):
        try:
            print(f'\nScraping book-id:{book_id} ({i+1}/{len(books_to_scrape)})')
            start = datetime.now()
            book = scrape_book(book_id)
            if book:
                # Context manager so the file is flushed and closed immediately.
                with open(f'metadata/{book_id}_metadata.json', 'w') as metadata_file:
                    json.dump(book, metadata_file)
            else:
                print(f'    scrape_book() returned empty')
            print(f'{datetime.now() - start}')
        except Exception as e:
            # Best effort: report the failure and carry on with the next book.
            print(e)

    books = condense_books('metadata')
    book_df = pd.DataFrame(books)

    if os.path.isfile(this_months_scrape_path):
        old_df = pd.read_csv(this_months_scrape_path)
        book_df = pd.concat([old_df, book_df])
        # Keep only books that are still in the library export.
        book_df = book_df[book_df['book_id'].isin(book_ids)]
        # 'genres' is left out of the duplicate check.
        book_df = book_df.drop_duplicates(subset=[col for col in book_df.columns if col != 'genres'])

    book_df.to_csv(this_months_scrape_path, index=False, encoding='utf-8')

In [None]:
def _fill_round_int(series, fill_value):
    """Replace missing values with ``fill_value``, round, and cast to int."""
    return series.fillna(fill_value).round().astype(int)


# --- Library export: keep the needed columns under snake_case names ---
goodreads_export = pd.read_csv(new_export_path)
# goodreads_export = pd.read_csv('data/20-01-2025_goodreads_library_export.csv')

# Prefer the original publication year; fall back to the edition's year.
goodreads_export['Original Publication Year'] = (
    goodreads_export['Original Publication Year'].fillna(goodreads_export['Year Published'])
)
goodreads_export = goodreads_export[
    ['Book Id', 'Author', 'My Rating', 'Number of Pages', 'Original Publication Year']
].rename(
    columns={
        'Book Id': 'book_id',
        'Author': 'author',
        'My Rating': 'my_rating',
        'Number of Pages': 'num_pages',
        'Original Publication Year': 'year',
    }
)

# Page counts more than one standard deviation below the mean are blanked out.
threshold = goodreads_export['num_pages'].mean() - goodreads_export['num_pages'].std()
goodreads_export.loc[goodreads_export['num_pages'] < threshold, 'num_pages'] = np.nan

# --- Scraped data, joined to the export on book_id ---
book_df = pd.read_csv(this_months_scrape_path)
# book_df = pd.read_csv('data/01-2025_goodreads_scraped.csv')
df = goodreads_export.merge(book_df, on='book_id')

# Resolve competing columns: the export's value (_x) wins, the scraped one (_y) fills gaps.
df = df.assign(
    author=df['author_x'].fillna(df['author_y']),
    num_pages=df['num_pages_x'].fillna(df['num_pages_y']),
    year=df['year_x'].fillna(df['year_y']),
).drop(columns=['author_x', 'author_y', 'num_pages_x', 'num_pages_y', 'year_x', 'year_y'])

# Genres are stored as a Python-literal string; missing values become an empty list.
df['genres'] = df['genres'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else [])

# Missing year / page count fall back to the column mean, missing review count to 0.
df['year'] = _fill_round_int(df['year'], df['year'].mean())
df['num_pages'] = _fill_round_int(df['num_pages'], df['num_pages'].mean())
df['num_reviews'] = _fill_round_int(df['num_reviews'], 0)

# A rating of 0 is treated as "not rated".
df['my_rating'] = df['my_rating'].replace(0, np.nan)

df['age'] = datetime.now().year - df['year']

# Average rating recomputed from the star histogram.
weighted_stars = (
    (df['5 stars'] * 5) + (df['4 stars'] * 4) + (df['3 stars'] * 3) + (df['2 stars'] * 2) + df['1 star']
)
df['average_rating'] = weighted_stars / df['num_ratings']

df = df[[
    'book_id', 'title', 'author', 'year', 'age', 'series', 'num_pages', 'genres',
    'num_ratings', 'num_reviews', 'my_rating', 'average_rating',
    '5 stars', '4 stars', '3 stars', '2 stars', '1 star',
]]

In [None]:
def fit_quadratic(row):
    """Fit ``a*x**2 + b*x + c`` through a row of per-star values.

    The values are taken to sit at x = 1, 2, ..., len(row) (star levels 1-5 for
    the usual five-column input).

    Parameters
    ----------
    row : array-like of float
        One value per star level, lowest level first.

    Returns
    -------
    pandas.Series
        ``[a, b, c]`` in that order. All three are NaN when ``row`` contains a
        NaN or infinite value (e.g. a book with zero ratings), because a
        least-squares fit through non-finite data is not possible.
    """
    values = np.asarray(row, dtype=float)
    if not np.isfinite(values).all():
        return pd.Series([np.nan, np.nan, np.nan])

    x = np.arange(1, values.size + 1)
    a, b, c = np.polyfit(x, values, 2)
    return pd.Series([a, b, c])

def _rescale_to_1_2(series):
    """Shift ``series`` so its minimum is 0, scale by its maximum, then add 1."""
    shifted = series - series.min()
    return shifted * (1 / shifted.max()) + 1


def _pull_toward_mean(score, log_count):
    """Move each score toward the column mean by ``1 / log_count`` of its distance."""
    return score - (score - score.mean()) / log_count


# Calculating quadratic modeling coefficients from each star level's share of all ratings
star_share_columns = [
    ('1 star', '1_star_percentage'),
    ('2 stars', '2_star_percentage'),
    ('3 stars', '3_star_percentage'),
    ('4 stars', '4_star_percentage'),
    ('5 stars', '5_star_percentage'),
]
for _star_col, _share_col in star_share_columns:
    df[_share_col] = df[_star_col] / df['num_ratings']

coefficients = df[[share for _, share in star_share_columns]].apply(fit_quadratic, axis=1)
df['a'], df['b'], df['c'] = (coefficients[position] for position in range(3))

# Pre-processing columns for rankings
df['num_ratings_ln'] = np.log1p(df['num_ratings'])
df['num_pages_ln'] = np.log1p(df['num_pages'])
df['2a_shifted'] = _rescale_to_1_2(df['a'])
df['b_shifted'] = _rescale_to_1_2(df['b'])
df['c_shifted'] = _rescale_to_1_2(df['c'])

# Types of rankings
df['num_adjusted_rating'] = _pull_toward_mean(df['average_rating'], df['num_ratings_ln'])
df['coeff_2a_rating'] = df['num_adjusted_rating'] * df['2a_shifted']
df['coeff_b_rating'] = df['num_adjusted_rating'] / df['b_shifted']
df['coeff_c_rating'] = df['num_adjusted_rating'] * df['c_shifted']
df['joined_rating'] = (df['num_adjusted_rating'] * df['c_shifted'] * df['2a_shifted']) / df['b_shifted']
df['final_rating'] = _pull_toward_mean(df['joined_rating'], df['num_ratings_ln'])

# The same rankings divided by log page count
for _ranking in ('num_adjusted', 'coeff_2a', 'coeff_b', 'coeff_c', 'joined'):
    df[f'{_ranking}_page_rating'] = df[f'{_ranking}_rating'] / df['num_pages_ln']
df['final_page_rating'] = _pull_toward_mean(df['joined_page_rating'], df['num_ratings_ln'])

In [None]:
# Columns included in the correlation matrix, grouped by where they come from.
numeric_cols = [
    # book attributes and raw counts
    'age', 'num_pages', 'num_pages_ln', 'num_ratings', 'num_ratings_ln', 'num_reviews',
    'my_rating', 'average_rating',
    # star histogram and shares
    '1 star', '2 stars', '3 stars', '4 stars', '5 stars',
    '1_star_percentage', '2_star_percentage', '3_star_percentage',
    '4_star_percentage', '5_star_percentage',
    # quadratic fit coefficients
    'a', 'b', 'c',
    # rankings
    'num_adjusted_rating', 'coeff_2a_rating', 'coeff_b_rating', 'coeff_c_rating',
    'joined_rating', 'final_rating',
    # page-adjusted rankings
    'num_adjusted_page_rating', 'coeff_2a_page_rating', 'coeff_b_page_rating',
    'coeff_c_page_rating', 'joined_page_rating', 'final_page_rating',
]
corr_df = df[numeric_cols].corr()

# Annotated heatmap of the pairwise correlations.
fig, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(corr_df, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Rank every book by final_page_rating (best first) with a clean 0..n-1 index,
# then keep only the ones without a personal rating.
ranked = df.sort_values(by='final_page_rating', ascending=False).reset_index().drop(columns='index')
fresh = ranked[ranked['my_rating'].isna()]

# Show one genre at a time; swap 'Fiction' for any of:
# Fiction, Nonfiction, Memoir, Classics, History, Politics, Philosophy, Business
has_genre = ['Fiction' in genre_list for genre_list in fresh['genres']]
fresh[has_genre]

In [None]:
# Scratch cell: load books_data.csv from the working directory and display it.
test = pd.read_csv('books_data.csv')
# The bare `test` displays the frame; the trailing comment is an unfinished lookup.
test#['author'].iloc[]