# BOOKS  GOODREADS web scraping

###  To gather data for my book analysis project, I used web scraping to extract information from Goodreads, a popular online platform for book enthusiasts. My goal was to collect details on over 7,000 books, including their titles, authors, genres, publication dates, ratings, reviews, and additional metadata. Because the scraper occasionally hit errors, I scraped one list page at a time, each containing 100 books. This approach gave me the data I needed for the analysis; in the future I aim to refine the scraping code to make data retrieval more efficient and comprehensive.

In [1442]:
# Import libraries

import pandas as pd
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import Select
import time
from bs4 import BeautifulSoup
import requests as req
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_columns', None)

In [1443]:
options = Options()

# Experimental options, applied in their original order.
# "detach" fixes the browser window closing unexpectedly.
for option_name, option_value in (
    ('excludeSwitches', ['enable-automation']),
    ('useAutomationExtension', False),
    ("detach", True),
):
    options.add_experimental_option(option_name, option_value)

# Command-line switches, applied in their original order.
# Left disabled on purpose:
#   options.add_argument(r"user-data-dir=cookies")
#   options.add_argument('--headless')   # enable to run without a visible window
for switch in (
    "--incognito",
    "--remote-allow-origins=*",
    '--start-minimized',
    '--disable-gpu',
    '--disable-dev-shm-usage',
    '--no-sandbox',
    '--disable-extensions',
    '--disable-infobars',
):
    options.add_argument(switch)

In [1444]:
# Address of the Goodreads "Best Books Ever" list (no page number given)
url = 'https://www.goodreads.com/list/show/1.Best_Books_Ever'

In [1445]:
# Start a Chrome session using `options` and load the list page stored in `url`
driver = webdriver.Chrome(options=options)
driver.get(url)

In [1446]:
# Quick probe: grab every link nested inside a table cell and display the
# text of the second one to check that the selector reaches the book titles
lst = driver.find_elements(By.XPATH, '//td//a')
lst[1].text

'The Hunger Games (The Hunger Games, #1)'

In [1447]:
# NOTE(review): `url_list` is not assigned in any cell visible here (the list
# built further down is named `urls`) — confirm where it comes from before
# re-running this cell.
url_list[1]

'https://www.goodreads.com/list/show/1.Best_Books_Ever?page=3'

In [1448]:
# Build the address of every page of the list (pages 1 through 100)

base_url = "https://www.goodreads.com/list/show/1.Best_Books_Ever?page="
urls = [f"{base_url}{page_number}" for page_number in range(1, 101)]

# Show the generated addresses, one per line
for url in urls:
    print(url)


https://www.goodreads.com/list/show/1.Best_Books_Ever?page=1
https://www.goodreads.com/list/show/1.Best_Books_Ever?page=2
https://www.goodreads.com/list/show/1.Best_Books_Ever?page=3
https://www.goodreads.com/list/show/1.Best_Books_Ever?page=4
https://www.goodreads.com/list/show/1.Best_Books_Ever?page=5
https://www.goodreads.com/list/show/1.Best_Books_Ever?page=6
https://www.goodreads.com/list/show/1.Best_Books_Ever?page=7
https://www.goodreads.com/list/show/1.Best_Books_Ever?page=8
https://www.goodreads.com/list/show/1.Best_Books_Ever?page=9
https://www.goodreads.com/list/show/1.Best_Books_Ever?page=10
https://www.goodreads.com/list/show/1.Best_Books_Ever?page=11
https://www.goodreads.com/list/show/1.Best_Books_Ever?page=12
https://www.goodreads.com/list/show/1.Best_Books_Ever?page=13
https://www.goodreads.com/list/show/1.Best_Books_Ever?page=14
https://www.goodreads.com/list/show/1.Best_Books_Ever?page=15
https://www.goodreads.com/list/show/1.Best_Books_Ever?page=16
https://www.goodr

In [1449]:
# Scraping one page at a time because errors and layout changes between pages are frequent - this cell handles page 71

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

def _first_text(driver, locators, default="N/A"):
    """Return the text of the first element matched by *locators*.

    Args:
        driver: Selenium WebDriver showing the page to search.
        locators: Iterable of ``(by, value)`` pairs, tried in order.
        default: Value returned when no locator matches.

    Returns:
        The ``.text`` of the first element found, or *default*.
    """
    for by, value in locators:
        try:
            return driver.find_element(by, value).text
        except NoSuchElementException:
            # This candidate is absent on the current page; try the next one.
            continue
    return default


def _dismiss_modal(driver, wait_seconds=10):
    """Click the modal's close button on the current page, best effort.

    Waits up to *wait_seconds* for the floating overlay to be invisible and
    then up to *wait_seconds* for the close button to become clickable.  A
    timeout on either wait is ignored.
    """
    try:
        WebDriverWait(driver, wait_seconds).until(
            EC.invisibility_of_element_located((By.CLASS_NAME, 'Overlay--floating'))
        )
        close_button = WebDriverWait(driver, wait_seconds).until(
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, 'div.modal__close.button[type="button"]')
            )
        )
        close_button.click()
    except TimeoutException:
        # Nothing to close within the allotted time; carry on scraping.
        pass


def _scrape_book_page(driver):
    """Read the fields of the book page currently loaded in *driver*.

    Returns:
        A dict with the keys ``title``, ``author``, ``version_price``,
        ``stars``, ``ratings``, ``reviews``, ``sinopsis``, ``genre``,
        ``num_pages``, ``published`` and ``num_books``.  A field whose
        element cannot be located holds ``"N/A"``.
    """
    # Common ancestor of every positional XPath below.
    section = '//*[@id="__next"]/div[2]/main/div[1]/div[2]/div[2]/div[2]'

    def xpaths(*suffixes):
        # Candidate locations for one field, in the order they are tried.
        return [(By.XPATH, section + suffix) for suffix in suffixes]

    # Several fields are addressed by position, so each of them lists
    # alternative XPaths that are tried one after another.
    field_locators = {
        'title': [(By.CLASS_NAME, 'Text.Text__title1')],
        'author': [(By.CLASS_NAME, 'ContributorLink__name')],
        'version_price': [(By.CSS_SELECTOR, 'button.Button--buy')],
        'stars': [(By.CLASS_NAME, 'RatingStatistics__rating')],
        'ratings': [(By.CSS_SELECTOR, 'div.BookPageMetadataSection__ratingStats > a > div:nth-child(2) > div > span:nth-child(1)')],
        'reviews': [(By.CSS_SELECTOR, 'div.BookPageMetadataSection__ratingStats > a > div:nth-child(2) > div > span:nth-child(2)')],
        'sinopsis': xpaths(
            '/div[4]/div/div[1]/div/div/span',
            '/div[5]/div/div[1]/div/div/span',
        ),
        'genre': xpaths(
            '/div[5]/ul/span[1]/span[2]',
            '/div[6]/ul/span[1]/span[2]/a/span',
            '/div[5]/div/span[1]/span/div/p[2]',
        ),
        'num_pages': xpaths(
            '/div[6]/div/span[1]/span/div/p[1]',
            '/div[7]/div/span[1]/span/div/p[1]',
            '/div[5]/div/span[1]/span/div/p[1]',
        ),
        'published': xpaths(
            '/div[6]/div/span[1]/span/div/p[2]',
            '/div[7]/div/span[1]/span/div/p[2]',
            '/div[5]/div/span[1]/span/div/p[2]',
        ),
        'num_books': xpaths(
            '/div[8]/div[2]/div/div[1]/div[2]/div[1]/div[1]/span',
            '/div[9]/div[2]/div/div[1]/div[2]/div[1]/div[1]/span',
            '/div[7]/div[2]/div/div[1]/div[2]/div[1]/div[1]/span',
        ),
    }
    return {
        field: _first_text(driver, locators)
        for field, locators in field_locators.items()
    }


def get_book_details(driver, book_element=None, num_books=100, timeout=90):
    """Scrape the details of each book on the list page loaded in *driver*.

    For every table row from 1 to *num_books*, the book's title link is
    clicked, the modal (if any) is dismissed, the book page is scraped and
    the browser navigates back to the list.

    Args:
        driver: Selenium WebDriver already showing a list page.
        book_element: Unused; kept so existing two-argument calls keep
            working.
        num_books: Number of table rows to visit (100 by default).
        timeout: Seconds to wait for each row's title link to be present.

    Returns:
        A list with one dict per visited book; every dict has the keys
        ``title``, ``author``, ``version_price``, ``stars``, ``ratings``,
        ``reviews``, ``sinopsis``, ``genre``, ``num_pages``, ``published``
        and ``num_books``, with ``"N/A"`` for values that were not found.

    Raises:
        TimeoutException: If a row's title link is not present within
            *timeout* seconds.
    """
    book_details = []

    for row in range(1, num_books + 1):
        # The link is looked up again on every iteration because the list
        # page is reloaded by driver.back().
        book_xpath = f'//*[@id="all_votes"]/table/tbody/tr[{row}]/td[3]/a/span'
        title_link = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.XPATH, book_xpath))
        )
        title_link.click()

        _dismiss_modal(driver)
        book_details.append(_scrape_book_page(driver))

        # Go back to the list of books
        driver.back()

    return book_details


# Initialize WebDriver
driver = webdriver.Chrome()

# URL for page 71
url_page_71 = "https://www.goodreads.com/list/show/1.Best_Books_Ever?page=71"

try:
    # Load page 71 and scrape every book listed on it.  get_book_details
    # never reads the value of its second parameter, so None is passed as a
    # placeholder instead of a name that is not defined at this point.
    driver.get(url_page_71)
    book_details_list_page_71 = get_book_details(driver, None)
finally:
    # Quit WebDriver even when scraping fails, so no browser is left running
    driver.quit()

# Print the details extracted from page 71
print(book_details_list_page_71)




In [1451]:
# Turn each page's scraped results into its own DataFrame (pages 1 to 71).
# NOTE(review): only `book_details_list_page_71` is assigned in the code
# visible here; the lists for pages 1-70 presumably come from earlier runs of
# the scraping cell with a different page number — confirm they are in scope.
books_df = pd.DataFrame(book_details_list_page_1)
books_df2 = pd.DataFrame(book_details_list_page_2)
books_df3 = pd.DataFrame(book_details_list_page_3)
books_df4 = pd.DataFrame(book_details_list_page_4)
books_df5 = pd.DataFrame(book_details_list_page_5)
books_df6 = pd.DataFrame(book_details_list_page_6)
books_df7 = pd.DataFrame(book_details_list_page_7)
books_df8 = pd.DataFrame(book_details_list_page_8)
books_df9 = pd.DataFrame(book_details_list_page_9)
books_df10 = pd.DataFrame(book_details_list_page_10)
books_df11 = pd.DataFrame(book_details_list_page_11)
books_df12 = pd.DataFrame(book_details_list_page_12)
books_df13 = pd.DataFrame(book_details_list_page_13)
books_df14 = pd.DataFrame(book_details_list_page_14)
books_df15 = pd.DataFrame(book_details_list_page_15)
books_df16 = pd.DataFrame(book_details_list_page_16)
books_df17 = pd.DataFrame(book_details_list_page_17)
books_df18 = pd.DataFrame(book_details_list_page_18)
books_df19 = pd.DataFrame(book_details_list_page_19)
books_df20 = pd.DataFrame(book_details_list_page_20)
books_df21 = pd.DataFrame(book_details_list_page_21)
books_df22 = pd.DataFrame(book_details_list_page_22)
books_df23 = pd.DataFrame(book_details_list_page_23)
books_df24 = pd.DataFrame(book_details_list_page_24)
books_df25 = pd.DataFrame(book_details_list_page_25)
books_df26 = pd.DataFrame(book_details_list_page_26)
books_df27 = pd.DataFrame(book_details_list_page_27)
books_df28 = pd.DataFrame(book_details_list_page_28)
books_df29 = pd.DataFrame(book_details_list_page_29)
books_df30 = pd.DataFrame(book_details_list_page_30)
books_df31 = pd.DataFrame(book_details_list_page_31)
books_df32 = pd.DataFrame(book_details_list_page_32)
books_df33 = pd.DataFrame(book_details_list_page_33)
books_df34 = pd.DataFrame(book_details_list_page_34)
books_df35 = pd.DataFrame(book_details_list_page_35)
books_df36 = pd.DataFrame(book_details_list_page_36)
books_df37 = pd.DataFrame(book_details_list_page_37)
books_df38 = pd.DataFrame(book_details_list_page_38)
books_df39 = pd.DataFrame(book_details_list_page_39)
books_df40 = pd.DataFrame(book_details_list_page_40)
books_df41 = pd.DataFrame(book_details_list_page_41)
books_df42 = pd.DataFrame(book_details_list_page_42)
books_df43 = pd.DataFrame(book_details_list_page_43)
books_df44 = pd.DataFrame(book_details_list_page_44)
books_df45 = pd.DataFrame(book_details_list_page_45)
books_df46 = pd.DataFrame(book_details_list_page_46)
books_df47 = pd.DataFrame(book_details_list_page_47)
books_df48 = pd.DataFrame(book_details_list_page_48)
books_df49 = pd.DataFrame(book_details_list_page_49)
books_df50 = pd.DataFrame(book_details_list_page_50)
books_df51 = pd.DataFrame(book_details_list_page_51)
books_df52 = pd.DataFrame(book_details_list_page_52)
books_df53 = pd.DataFrame(book_details_list_page_53)
books_df54 = pd.DataFrame(book_details_list_page_54)
books_df55 = pd.DataFrame(book_details_list_page_55)
books_df56 = pd.DataFrame(book_details_list_page_56)
books_df57 = pd.DataFrame(book_details_list_page_57)
books_df58 = pd.DataFrame(book_details_list_page_58)
books_df59 = pd.DataFrame(book_details_list_page_59)
books_df60 = pd.DataFrame(book_details_list_page_60)
books_df61 = pd.DataFrame(book_details_list_page_61)
books_df62 = pd.DataFrame(book_details_list_page_62)
books_df63 = pd.DataFrame(book_details_list_page_63)
books_df64 = pd.DataFrame(book_details_list_page_64)
books_df65 = pd.DataFrame(book_details_list_page_65)
books_df66 = pd.DataFrame(book_details_list_page_66)
books_df67 = pd.DataFrame(book_details_list_page_67)
books_df68 = pd.DataFrame(book_details_list_page_68)
books_df69 = pd.DataFrame(book_details_list_page_69)
books_df70 = pd.DataFrame(book_details_list_page_70)
books_df71 = pd.DataFrame(book_details_list_page_71)

In [1454]:
# Stack the 71 per-page DataFrames, in page order, into a single DataFrame
# with a fresh 0..n-1 index, then preview its first rows

page_frames = [
    books_df, books_df2, books_df3, books_df4, books_df5,
    books_df6, books_df7, books_df8, books_df9, books_df10,
    books_df11, books_df12, books_df13, books_df14, books_df15,
    books_df16, books_df17, books_df18, books_df19, books_df20,
    books_df21, books_df22, books_df23, books_df24, books_df25,
    books_df26, books_df27, books_df28, books_df29, books_df30,
    books_df31, books_df32, books_df33, books_df34, books_df35,
    books_df36, books_df37, books_df38, books_df39, books_df40,
    books_df41, books_df42, books_df43, books_df44, books_df45,
    books_df46, books_df47, books_df48, books_df49, books_df50,
    books_df51, books_df52, books_df53, books_df54, books_df55,
    books_df56, books_df57, books_df58, books_df59, books_df60,
    books_df61, books_df62, books_df63, books_df64, books_df65,
    books_df66, books_df67, books_df68, books_df69, books_df70,
    books_df71,
]
concatenated_df = pd.concat(page_frames, ignore_index=True)
concatenated_df.head()

Unnamed: 0,title,author,version_price,stars,ratings,reviews,sinopsis,genre,num_pages,published,num_books
0,The Hunger Games,Suzanne Collins,Kindle Unlimited $0.00,4.34,"8,601,389 ratings","216,773 reviews","Could you survive on your own in the wild, wit...",Young Adult,"374 pages, Hardcover","First published September 14, 2008",72 books99.7k followers
1,Harry Potter and the Order of the Phoenix,J.K. Rowling,Kindle Unlimited $0.00,4.5,"3,359,608 ratings","62,879 reviews",Harry Potter is about to start his fifth year ...,Young Adult,"912 pages, Paperback","First published June 21, 2003",535 books225k followers
2,Pride and Prejudice,Jane Austen,Kindle $2.99,4.29,"4,238,613 ratings","113,179 reviews","Since its immediate success in 1813, Pride and...",Classics,"279 pages, Paperback","First published January 28, 1813","4,124 books66.3k followers"
3,To Kill a Mockingbird,Harper Lee,Kindle $13.99,4.26,"6,079,471 ratings","116,454 reviews",The unforgettable novel of a childhood in a sl...,Classics,"323 pages, Paperback","First published July 11, 1960",70 books13.5k followers
4,The Book Thief,Markus Zusak,Kindle $10.99,4.39,"2,538,383 ratings","143,787 reviews",Librarian's note: An alternate cover edition c...,Historical Fiction,"592 pages, Hardcover","First published September 1, 2005",23 books39.2k followers


In [1453]:
# Save the combined DataFrame to a CSV file (index column omitted) for later work

concatenated_df.to_csv('books_goodreads.csv', index=False)