In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import pandas as pd

from tqdm.notebook import tqdm

import time

import re

In [2]:
# Define the functions that extract each field we want to scrape from a Goodreads book page

def book_title():
    """Return the title text of the page currently loaded in ``driver``.

    Waits up to 10 seconds for the element with id ``bookTitle`` to be
    present in the DOM.

    Returns:
        str | None: The element's text, or ``None`` (after printing a notice
        that includes the driver's current URL) if it cannot be located.
    """
    try:
        title_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, 'bookTitle'))
        )
        return title_element.text
    except Exception:
        # `Exception` instead of a bare `except` so that Ctrl-C / a kernel
        # interrupt during the wait is not swallowed.
        # The URL comes from the driver, so no global loop variable is needed.
        print(f'None title value for {driver.current_url}')
        return None
    
    
def book_serie():
    """Return the ``content`` attribute of the page's ``og:title`` element.

    Reads the first element whose ``property`` attribute is ``og:title`` on
    the page currently loaded in ``driver``.

    Returns:
        str | None: The attribute value, or ``None`` (after printing a notice
        that includes the driver's current URL) if no such element exists.
    """
    try:
        # `find_elements(By.XPATH, ...)` is available in Selenium 3 and 4; the
        # `find_elements_by_*` helpers no longer exist in recent releases.
        og_title = driver.find_elements(By.XPATH, '//*[@property="og:title"]')[0]
        return og_title.get_attribute('content')
    except Exception:
        # Not a bare `except`: keyboard/kernel interrupts must propagate.
        print(f'None serie value for {driver.current_url}')
        return None


def book_author():
    """Return the text of the second ``itemprop="name"`` element on the page.

    The lookup uses index 1 of the matching elements on the page currently
    loaded in ``driver``.
    NOTE(review): presumably index 0 is the book title and index 1 the first
    author - confirm against the page markup.

    Returns:
        str | None: The element text, or ``None`` (after printing a notice
        that includes the driver's current URL) if fewer than two elements
        match.
    """
    try:
        name_elements = driver.find_elements(By.XPATH, '//*[@itemprop="name"]')
        return name_elements[1].text
    except Exception:
        # Not a bare `except`: keyboard/kernel interrupts must propagate.
        print(f'None author value for {driver.current_url}')
        return None
    
    
def book_language():
    """Return the inner HTML of the first ``itemprop="inLanguage"`` div.

    Operates on the page currently loaded in ``driver``.

    Returns:
        str | None: The element's ``innerHTML``, or ``None`` (after printing
        a notice that includes the driver's current URL) if no such div
        exists.
    """
    try:
        language_div = driver.find_elements(By.XPATH, '//div[@itemprop="inLanguage"]')[0]
        return language_div.get_attribute('innerHTML')
    except Exception:
        # Not a bare `except`: keyboard/kernel interrupts must propagate.
        print(f'None language value for {driver.current_url}')
        return None
    
    
    
def book_genres():
    """Return the distinct genre labels of the page loaded in ``driver``.

    Collects the text of every element whose class is exactly
    ``actionLinkLite bookPageGenreLink``, keeping the first occurrence of
    each label and preserving page order.

    Returns:
        list[str] | None: De-duplicated genre labels (possibly empty), or
        ``None`` (after printing a notice that includes the driver's current
        URL) if the lookup itself fails.
    """
    try:
        genre_links = driver.find_elements(
            By.XPATH, '//*[@class="actionLinkLite bookPageGenreLink"]'
        )
        genres_lst = []
        for link in genre_links:
            label = link.text
            # Compare the label text: comparing the element object against a
            # list of strings never matches, so duplicates were never removed.
            if label not in genres_lst:
                genres_lst.append(label)
        return genres_lst
    except Exception:
        # Not a bare `except`: keyboard/kernel interrupts must propagate.
        print(f'None genre value for {driver.current_url}')
        return None
    
    
def book_id():
    """Return the ``value`` of the first element named ``book_id``.

    Operates on the page currently loaded in ``driver``. The value is
    returned exactly as the attribute provides it (a string).

    Returns:
        str | None: The attribute value, or ``None`` (after printing a notice
        that includes the driver's current URL) if no such element exists.
    """
    try:
        # Local renamed from `id` so the builtin is not shadowed.
        id_field = driver.find_elements(By.XPATH, '//*[@name="book_id"]')[0]
        return id_field.get_attribute("value")
    except Exception:
        # Not a bare `except`: keyboard/kernel interrupts must propagate.
        print(f'None id value for {driver.current_url}')
        return None

    
def book_pages():
    """Return the ``content`` of the first ``books:page_count`` element.

    Operates on the page currently loaded in ``driver``. The value is
    returned as the raw attribute string.

    Returns:
        str | None: The attribute value, or ``None`` (after printing a notice
        that includes the driver's current URL) if no such element exists.
    """
    try:
        page_count_meta = driver.find_elements(
            By.XPATH, '//*[@property="books:page_count"]'
        )[0]
        return page_count_meta.get_attribute('content')
    except Exception:
        # Not a bare `except`: keyboard/kernel interrupts must propagate.
        print(f'None pages value for {driver.current_url}')
        return None
    
    
def book_format():
    """Return the text of the first ``itemprop="bookFormat"`` element.

    Operates on the page currently loaded in ``driver``.

    Returns:
        str | None: The element text, or ``None`` (after printing a notice
        that includes the driver's current URL) if no such element exists.
    """
    try:
        # Local renamed from `format` so the builtin is not shadowed.
        format_element = driver.find_elements(By.XPATH, '//*[@itemprop="bookFormat"]')[0]
        return format_element.text
    except Exception:
        # Not a bare `except`: keyboard/kernel interrupts must propagate.
        print(f'None format value for {driver.current_url}')
        return None
    
    
def book_published():
    """Return a publication year taken from the second ``div.row`` element.

    The text of the second ``<div class="row">`` on the page loaded in
    ``driver`` is searched for four-digit numbers and the last one is
    returned. If the text contains no four-digit number, the fourth
    whitespace-separated token is returned instead.

    Returns:
        str | None: The extracted value, or ``None`` (after printing a notice
        that includes the driver's current URL) if the row is missing or
        neither strategy yields a value.
    """
    try:
        # Query the DOM once; both strategies read the same row text.
        row_text = driver.find_elements(By.XPATH, '//div[@class="row"]')[1].text
        years = re.findall(r'[0-9]{4}', row_text)
        if years:
            return years[-1]
        # Fallback kept from the original: fourth token of the row text.
        return row_text.split()[3]
    except Exception:
        # Not a bare `except`: keyboard/kernel interrupts must propagate.
        print(f'None published value for {driver.current_url}')
        return None
    
    
def book_rating():
    """Return the text of the first ``itemprop="ratingValue"`` element.

    Operates on the page currently loaded in ``driver``.

    Returns:
        str | None: The element text, or ``None`` (after printing a notice
        that includes the driver's current URL) if no such element exists.
    """
    try:
        rating_element = driver.find_elements(By.XPATH, '//*[@itemprop="ratingValue"]')[0]
        return rating_element.text
    except Exception:
        # Not a bare `except`: keyboard/kernel interrupts must propagate.
        print(f'None rating value for {driver.current_url}')
        return None
    
    
def book_rating_count():
    """Return the ``content`` of the first ``itemprop="ratingCount"`` element.

    Operates on the page currently loaded in ``driver``. The value is
    returned as the raw attribute string.

    Returns:
        str | None: The attribute value, or ``None`` (after printing a notice
        that includes the driver's current URL) if no such element exists.
    """
    try:
        count_meta = driver.find_elements(By.XPATH, '//*[@itemprop="ratingCount"]')[0]
        return count_meta.get_attribute('content')
    except Exception:
        # Not a bare `except`: keyboard/kernel interrupts must propagate.
        print(f'None rating_count value for {driver.current_url}')
        return None
    
    
def book_rating_distribution():
    """Return every run of digits found in the page's protovis script.

    Reads the ``innerHTML`` of the first element whose ``type`` is
    ``text/javascript+protovis`` on the page loaded in ``driver`` and
    extracts all digit sequences, in order of appearance.
    NOTE(review): callers index positions 0-4 as the five- to one-star
    counts; that ordering is assumed from the caller, not verified here.

    Returns:
        list[str] | None: The digit strings (possibly empty), or ``None``
        (after printing a notice that includes the driver's current URL) if
        the script element is missing.
    """
    try:
        chart_script = driver.find_elements(
            By.XPATH, '//*[@type="text/javascript+protovis"]'
        )[0]
        return re.findall(r'\d+', chart_script.get_property('innerHTML'))
    except Exception:
        # Not a bare `except`: keyboard/kernel interrupts must propagate.
        print(f'None rating_distribution value for {driver.current_url}')
        return None

    
def book_review_count():
    """Return the ``content`` of the first ``itemprop="reviewCount"`` element.

    Operates on the page currently loaded in ``driver``. The value is
    returned as the raw attribute string.

    Returns:
        str | None: The attribute value, or ``None`` (after printing a notice
        that includes the driver's current URL) if no such element exists.
    """
    try:
        review_meta = driver.find_elements(By.XPATH, '//*[@itemprop="reviewCount"]')[0]
        return review_meta.get_attribute('content')
    except Exception:
        # Not a bare `except`: keyboard/kernel interrupts must propagate.
        print(f'None review_count value for {driver.current_url}')
        return None
    
    
def book_awards():
    """Return the distinct award entries of the page loaded in ``driver``.

    Collects the ``innerHTML`` of every element whose class is exactly
    ``award``, keeping the first occurrence of each value and preserving
    page order.

    Returns:
        list[str] | None: De-duplicated award strings (possibly empty), or
        ``None`` (after printing a notice that includes the driver's current
        URL) if the lookup itself fails.
    """
    try:
        award_elements = driver.find_elements(By.XPATH, '//*[@class="award"]')
        awards_lst = []
        for element in award_elements:
            award_html = element.get_attribute('innerHTML')
            # Compare the extracted string: comparing the element object
            # against a list of strings never matches, so duplicates were
            # never removed.
            if award_html not in awards_lst:
                awards_lst.append(award_html)
        return awards_lst
    except Exception:
        # Not a bare `except`: keyboard/kernel interrupts must propagate.
        print(f'None award value for {driver.current_url}')
        return None

    
    
def book_isbn():
    """Return an ISBN value for the page currently loaded in ``driver``.

    Every ``<script>`` element is searched for ``nisbn: `` followed by nine
    digits; the digits of the first match are returned. Only when no script
    matches does the function fall back to the first whitespace-separated
    token of the second ``div.infoBoxRowItem`` element's inner HTML.

    NOTE(review): the pattern captures nine digits although an ISBN-10 has
    ten characters, and the leading ``n`` looks like the tail of an escaped
    newline - confirm both against the real page source before relying on
    the script branch.

    Returns:
        str | None: The extracted value, or ``None`` (after printing a notice
        that includes the driver's current URL) if neither source yields one.
    """
    try:
        for script in driver.find_elements(By.TAG_NAME, "script"):
            script_source = script.get_property('innerHTML') or ''
            matches = re.findall(r'nisbn: [0-9]{9}', script_source)
            if matches:
                return matches[0].split()[-1]

        # Reached only after ALL scripts were checked. Previously the fallback
        # ran inside the loop, so only the first script was ever inspected.
        info_items = driver.find_elements(By.XPATH, '//div[@class="infoBoxRowItem"]')
        return info_items[1].get_attribute('innerHTML').split()[0]
    except Exception:
        # Not a bare `except`: keyboard/kernel interrupts must propagate.
        print(f'None isbn value for {driver.current_url}')
        return None

    
def book_isbn13():
    """Return the ISBN-13 for the page currently loaded in ``driver``.

    Every ``<script>`` element is searched case-insensitively for
    ``isbn13: `` followed by thirteen digits; the digits of the first match
    are returned. If no script matches, the function waits up to 2 seconds
    for an ``itemprop="isbn"`` element and returns its inner HTML.

    Returns:
        str | None: The extracted value, or ``None`` (after printing a notice
        that includes the driver's current URL) if neither source yields one.
    """
    try:
        for script in driver.find_elements(By.TAG_NAME, "script"):
            # A script without inner HTML must not abort the search before
            # the fallback below gets a chance to run.
            script_source = script.get_property('innerHTML') or ''
            matches = re.findall(r'isbn13: [0-9]{13}', script_source, re.I)
            if matches:
                return matches[0].split()[-1]

        isbn_element = WebDriverWait(driver, 2).until(
            EC.presence_of_element_located((By.XPATH, '//*[@itemprop="isbn"]'))
        )
        return isbn_element.get_attribute('innerHTML')
    except Exception:
        # Not a bare `except`: keyboard/kernel interrupts must propagate.
        print(f'None isbn_13 value for {driver.current_url}')
        return None

In [3]:
def get_book(book_url):
    """Load a book page and return all scraped fields as one record.

    Args:
        book_url (str): URL of the book page to open with ``driver``.

    Returns:
        dict: One entry per scraped field, in a fixed key order. A field is
        ``None`` when its scraper returned ``None``. The five star-count
        fields are ``None`` when the rating distribution is unavailable or
        has fewer than five entries, and ``awards_count`` is ``None`` when
        the awards list itself is ``None``.
    """
    driver.get(book_url)

    book_info_dict = {
        'title': book_title(),
        'serie': book_serie(),
        'author': book_author(),
        'language': book_language(),
        'genres': book_genres(),
        'id': book_id(),
        'pages': book_pages(),
        'format': book_format(),
        'first_published': book_published(),
        'rating': book_rating(),
        'rating_count': book_rating_count(),
    }

    # Scrape the distribution once. Previously it was re-scraped for each of
    # the five keys, and a `None` result raised TypeError on subscripting.
    distribution = book_rating_distribution() or []
    star_keys = ('five_stars_count', 'four_stars_count', 'three_stars_count',
                 'two_stars_count', 'one_stars_count')
    for position, key in enumerate(star_keys):
        book_info_dict[key] = distribution[position] if position < len(distribution) else None

    book_info_dict['review_count'] = book_review_count()

    # Scrape the awards once and tolerate a failed lookup (`None`).
    awards = book_awards()
    book_info_dict['awards'] = awards
    book_info_dict['awards_count'] = len(awards) if awards is not None else None

    # The ISBN-10 scraper (`book_isbn`) is intentionally not part of the record.
    book_info_dict['isbn_13'] = book_isbn13()
    book_info_dict['url'] = driver.current_url
    return book_info_dict

In [4]:
# Location of the chromedriver executable handed to `webdriver.Chrome`.
# Raw string: `\P` and `\c` are invalid escape sequences in a normal literal
# and trigger a warning on current Python versions. The value is unchanged.
# NOTE(review): machine-specific absolute path - adjust for your setup.
PATH = r"C:\Program Files (x86)\chromedriver.exe"

In [11]:
# Start a Chrome session; the chromedriver location is passed positionally.
# NOTE(review): recent Selenium releases no longer accept the executable path
# as a positional argument - confirm against the installed version.
driver = webdriver.Chrome(PATH)

# Open the list page whose book links the scraping cell iterates over.
driver.get('https://www.goodreads.com/list/show/1.Best_Books_Ever')

# Print the page title as a quick check that the list loaded.
print(f'List: {driver.title}')

List: Best Books Ever (54764 books)


In [30]:
# Records scraped by this run of the cell (reset for every batch).
book_lst = []

# Number of books to visit per batch (keep it a multiple of 100: the batch
# limit is only checked after a whole list page has been processed).
batch_size = 100

# Ids already stored in `df`, normalised to strings. Scraped ids are strings
# while `df.id` may already have been converted to numbers, so comparing the
# raw values would never detect a duplicate. On a fresh kernel `df` does not
# exist yet (or has no `id` column), which simply means "nothing stored".
try:
    known_ids = set(
        pd.to_numeric(df['id'], errors='coerce').dropna().astype('int64').astype(str)
    )
except (NameError, KeyError):
    known_ids = set()

pbar = tqdm(total=batch_size)
n = 0
try:
    # Walk the list page by page until `batch_size` books have been visited.
    while n < batch_size:

        # Remember the list page and collect the URL of every book on it.
        current = driver.current_url
        lst_books_urls = [
            link.get_attribute('href')
            for link in driver.find_elements(By.XPATH, '//*[@class="bookTitle"]')
        ]

        # NOTE: `book_url` must stay a notebook-level loop variable, because
        # scraper helpers may read it as a global in their error messages.
        for book_url in lst_books_urls:
            try:
                book_info_dict = get_book(book_url)
            except Exception as exc:
                # Best effort: report the failing book and move on.
                print(f'not possible: {book_url} ({exc!r})')
                continue

            n += 1
            pbar.update(1)

            # Keep the record unless its id is already known, either from
            # `df` or from earlier in this same batch.
            id_value = book_info_dict.get('id')
            if id_value is None:
                book_lst.append(book_info_dict)
            elif str(id_value) not in known_ids:
                known_ids.add(str(id_value))
                book_lst.append(book_info_dict)

        # Return to the list page and advance. The page shows a second
        # "next_page" link for the comments, so a single match means the
        # book list itself has no further page.
        driver.get(current)
        next_page_list = driver.find_elements(By.XPATH, '//a[@class="next_page"]')
        if len(next_page_list) > 1:
            next_page_list[0].click()
        else:
            print('No more pages')
            break
except Exception as exc:
    # Report the real cause instead of labelling every failure "No more
    # pages"; whatever was scraped so far remains available in `book_lst`.
    print(f'Batch stopped early: {exc!r}')
finally:
    pbar.close()

  0%|          | 0/100 [00:00<?, ?it/s]

None format value for https://www.goodreads.com/book/show/36576608-flowers-for-algernon
None published value for https://www.goodreads.com/book/show/646462._


In [28]:
# Build the results frame from the records collected in `book_lst`.
# NOTE(review): this rebinds `df`; running it after further batches discards
# rows that were accumulated earlier - confirm it is meant for the first
# batch only.
df = pd.DataFrame(book_lst)

In [8]:
# Add the records of the latest batch to the accumulated frame.
# `DataFrame.append` no longer exists in pandas 2.x; `pd.concat` is the
# supported equivalent and renumbers the index the same way.
df = pd.concat([df, pd.DataFrame(book_lst)], ignore_index=True)

In [29]:
# Summarise the frame: row count, columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   title              101 non-null    object
 1   serie              101 non-null    object
 2   author             101 non-null    object
 3   language           101 non-null    object
 4   genres             101 non-null    object
 5   id                 101 non-null    object
 6   pages              101 non-null    object
 7   format             101 non-null    object
 8   first_published    101 non-null    object
 9   rating             101 non-null    object
 10  rating_count       101 non-null    object
 11  five_stars_count   101 non-null    object
 12  four_stars_count   101 non-null    object
 13  three_stars_count  101 non-null    object
 14  two_stars_count    101 non-null    object
 15  one_stars_count    101 non-null    object
 16  review_count       101 non-null    object
 1

In [11]:
# Select every column except the ones listed here; the remaining columns are
# the ones converted to numeric dtypes in a later cell.
cols = df.columns.drop(['title', 'language', 'serie', 'author', 'genres', 'format', 'awards', 'url'])

In [12]:
# Display the selected column labels.
cols

Index(['id', 'pages', 'first_published', 'rating', 'rating_count',
       'five_stars_count', 'four_stars_count', 'three_stars_count',
       'two_stars_count', 'one_stars_count', 'review_count', 'awards_count',
       'isbn_13'],
      dtype='object')

In [13]:
# Convert the selected columns to numeric dtypes, column by column.
# `errors='coerce'` turns any value that cannot be parsed into NaN instead of
# raising.
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')

In [33]:
# Summarise the frame again: row count, non-null counts and dtypes per column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   title              1000 non-null   object 
 1   serie              1000 non-null   object 
 2   author             1000 non-null   object 
 3   language           989 non-null    object 
 4   genres             1000 non-null   object 
 5   id                 1000 non-null   int64  
 6   pages              999 non-null    float64
 7   format             998 non-null    object 
 8   first_published    999 non-null    float64
 9   rating             1000 non-null   float64
 10  rating_count       1000 non-null   int64  
 11  five_stars_count   1000 non-null   int64  
 12  four_stars_count   1000 non-null   int64  
 13  three_stars_count  1000 non-null   int64  
 14  two_stars_count    1000 non-null   int64  
 15  one_stars_count    1000 non-null   int64  
 16  review_count       1000 n

In [61]:
# Spot check: is this integer id present among the stored `id` values?
243705 in df.id.values

True

In [15]:
# Count the distinct values in the `id` column.
df.id.nunique()

1000

In [None]:
# End the whole WebDriver session. `quit()` closes every window and shuts
# down the driver process, whereas `close()` only closes the current window
# and leaves the chromedriver session running.
driver.quit()