In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import pandas as pd

from tqdm.notebook import tqdm

import time

import re

### Define Functions

In [2]:
# Define all the functions for the paramenters we want to scrap from goodreads

def book_title():
    try:
        title = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, 'bookTitle'))
        )
        return title.text
    except:
        print(f'None title value for {book_url}')
        return None
    
    
def book_serie():
    try:
        serie = driver.find_elements_by_xpath('//*[@property="og:title"]')[0].get_attribute('content')
        return serie
    except:
        print(f'None serie value for {book_url}')
        return None


def book_author():
    try:
        author = driver.find_elements_by_xpath('//*[@itemprop="name"]')[1].text
        return author
    except:
        print(f'None author value for {book_url}')
        return None
    
    
def book_language():
    try:
        language = driver.find_elements_by_xpath('//div[@itemprop="inLanguage"]')[0].get_attribute('innerHTML')
        return language
    except:
        print(f'None language value for {book_url}')
        return None
    
    
    
def book_genres():
    genres_lst = []
    try:
        genres = driver.find_elements_by_xpath('//*[@class="actionLinkLite bookPageGenreLink"]')
        for genre in genres:
            if genre not in genres_lst:
                genres_lst.append(genre.text)
        return genres_lst
    except:
        print(f'None genre value for {book_url}')
        return None
    
    
def book_id():
    try:
        id = driver.find_elements_by_xpath('//*[@name="book_id"]')[0].get_attribute("value")
        return id
    except:
        print(f'None id value for {book_url}')
        return None

    
def book_pages():
    try:
        pages = driver.find_elements_by_xpath('//*[@property="books:page_count"]')[0].get_attribute('content')
        return pages
    except:
        print(f'None pages value for {book_url}')
        return None
    
    
def book_format():
    try:
        format = driver.find_elements_by_xpath('//*[@itemprop="bookFormat"]')[0].text
        return format
    except:
        print(f'None format value for {book_url}')
        return None
    
    
def book_published():
    try:
        try:
            date_raw = driver.find_elements_by_xpath('//div[@class="row"]')[1].text
            date_list = re.findall(r'[0-9]{4}', date_raw)
            return date_list[-1]

        except:
            published_raw = driver.find_elements_by_xpath('//div[@class="row"]')[1].text
            published = published_raw.split()[3]
            return published
    except:
        print(f'None published value for {book_url}')
        return None
    
    
def book_rating():
    try:
        rating = driver.find_elements_by_xpath('//*[@itemprop="ratingValue"]')[0].text
        return rating
    except:
        print(f'None rating value for {book_url}')
        return None
    
    
def book_rating_count():
    try:
        rating_count = driver.find_elements_by_xpath('//*[@itemprop="ratingCount"]')[0].get_attribute('content')
        return rating_count
    except:
        print(f'None rating_count value for {book_url}')
        return None
    
    
def book_rating_distribution():
    try:
        ratings = driver.find_elements_by_xpath('//*[@type="text/javascript+protovis"]')[0].get_property('innerHTML')
        distribution_lst = re.findall(r'\d+', ratings)
        return distribution_lst
    except:
        print(f'None rating_distribution value for {book_url}')
        return None

    
def book_review_count():
    try:
        review_count = driver.find_elements_by_xpath('//*[@itemprop="reviewCount"]')[0].get_attribute('content')
        return review_count
    except:
        print(f'None review_count value for {book_url}')
        return None
    
    
def book_awards():
    awards_lst = []
    try:
        awards = driver.find_elements_by_xpath('//*[@class="award"]')
        for award in awards:
            if award not in awards_lst:
                awards_lst.append(award.get_attribute('innerHTML'))
        return awards_lst
    except:
        print(f'None award value for {book_url}')
        return None

    
def book_isbn13():
    try:

        scripts = driver.find_elements_by_tag_name("script")

        for script in scripts:
            full_text = script.get_property('innerHTML')
            isbns13 = re.findall(r'isbn13: [0-9]{13}' , full_text, re.I)
#                 if isbns13:
            for isbn13 in isbns13:
                if isbn13:
                    return isbn13.split()[-1]
                else:
                    raise Exception('No value')
                    
        isbn13 =  WebDriverWait(driver, 2).until(
                EC.presence_of_element_located((By.XPATH, '//*[@itemprop="isbn"]'))).get_attribute('innerHTML')
        return (isbn13)

    except:
        print(f'None isbn_13 value for {book_url}')
        return None

In [3]:
def get_book(book_url):
    
    #function to scrape a single book
    driver.get(book_url)
    book_info_dict = {'title'               : book_title(),
                      'serie'               : book_serie(),
                      'author'              : book_author(),
                      'language'            : book_language(),
                      'genres'              : book_genres(),
                      'id'                  : book_id(),
                      'pages'               : book_pages(),
                      'format'              : book_format(),
                      'first_published'     : book_published(),
                      'rating'              : book_rating(),
                      'rating_count'        : book_rating_count(),
                      'five_stars_count'    : book_rating_distribution()[0],
                      'four_stars_count'    : book_rating_distribution()[1],
                      'three_stars_count'   : book_rating_distribution()[2],
                      'two_stars_count'     : book_rating_distribution()[3],
                      'one_stars_count'     : book_rating_distribution()[4],
                      'review_count'        : book_review_count(),
                      'awards'              : book_awards(),
                      'awards_count'        : len(book_awards()),
    #                           'isbn'                : book_isbn(),
                      'isbn_13'             : book_isbn13(),
                      'url'                 : driver.current_url
                     }
    return book_info_dict

### Conect to the page

In [4]:
PATH = "C:\Program Files (x86)\chromedriver.exe"

In [5]:
driver = webdriver.Chrome(PATH)

driver.get('https://www.goodreads.com/list/show/1.Best_Books_Ever') # Lists URL to be scraped

print(f'List: {driver.title}')

List: Best Books Ever (54799 books)


In [6]:
df = pd.read_csv('books.csv')

### Execute scraper

In [None]:
#Clean the list for every batch
book_lst = []

# Define batch_size (multiple of 100)
# I defined batches between 1000 and 2000 to avoid loosing to much information if the process fails
batch_size = 2000


try:
    pbar = tqdm(total=batch_size)
    n = 0
    # The scraper will work in batches of batch_sizes books
    while n < batch_size:
        
        #Get the actual url and check for all the books in this page to iterate the books
        current = driver.current_url
        lst_books_elements = driver.find_elements_by_xpath('//*[@class="bookTitle"]')
        lst_books_urls = [i.get_attribute('href') for i in lst_books_elements]
        
        #Create a dictionary for each book and append them to a list
        for book_url in lst_books_urls:
            # Check if the current URL is in the dataframe, if positive, keep with the next one
            
            if book_url in df.url.values:
                pbar.update(1)
                n += 1
                continue
            else:
                try:
                    driver.get(book_url)
                    book_info_dict = {'title'               : book_title(),
                                      'serie'               : book_serie(),
                                      'author'              : book_author(),
                                      'language'            : book_language(),
                                      'genres'              : book_genres(),
                                      'id'                  : book_id(),
                                      'pages'               : book_pages(),
                                      'format'              : book_format(),
                                      'first_published'     : book_published(),
                                      'rating'              : book_rating(),
                                      'rating_count'        : book_rating_count(),
                                      'five_stars_count'    : book_rating_distribution()[0],
                                      'four_stars_count'    : book_rating_distribution()[1],
                                      'three_stars_count'   : book_rating_distribution()[2],
                                      'two_stars_count'     : book_rating_distribution()[3],
                                      'one_stars_count'     : book_rating_distribution()[4],
                                      'review_count'        : book_review_count(),
                                      'awards'              : book_awards(),
                                      'awards_count'        : len(book_awards()),
                                      'isbn_13'             : book_isbn13(),
                                      'url'                 : driver.current_url
                                     }


                    n += 1
                    pbar.update(1)
                    book_lst.append(book_info_dict)


                except:
                    n += 1
                    print('not possible')
            
        #Need to check for only 1 'next page' (comments), if that is true, stop the iteration.
        driver.get(current)
        next_page_list = driver.find_elements_by_xpath('//a[@class="next_page"]')
        if len(next_page_list) > 1:
            driver.find_elements_by_xpath('//a[@class="next_page"]')[0].click()
        else:
            raise Exception('No more pages')
            
    # Append the list to the dataframe and finish the scraper       
    df = df.append(book_lst, ignore_index=True)
    print(f'{len(book_lst)} books were added to the dataset')
    df.to_csv('books.csv')
    pbar.close()
except:
    
    # Append the list to the dataframe and finish the scraper    
    df = df.append(book_lst, ignore_index=True)
    print(f'{len(book_lst)} books were added to the dataset')
    df.to_csv('books.csv')
    print('No more pages')
    pbar.close()

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9998 entries, 0 to 9997
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         9998 non-null   int64  
 1   title              9998 non-null   object 
 2   serie              9998 non-null   object 
 3   author             9998 non-null   object 
 4   language           9692 non-null   object 
 5   genres             9998 non-null   object 
 6   id                 9998 non-null   int64  
 7   pages              9865 non-null   float64
 8   format             9921 non-null   object 
 9   first_published    9993 non-null   object 
 10  rating             9998 non-null   float64
 11  rating_count       9998 non-null   int64  
 12  five_stars_count   9998 non-null   int64  
 13  four_stars_count   9998 non-null   int64  
 14  three_stars_count  9998 non-null   int64  
 15  two_stars_count    9998 non-null   int64  
 16  one_stars_count    9998 