In [1]:
import pandas as pd
import os
import time
import datetime
import csv
import re
import codecs
import requests
from bs4 import BeautifulSoup
    
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException, TimeoutException, ElementNotInteractableException, ElementClickInterceptedException

In [2]:
# Define a function to extract the publisher element text
def get_publisher_text(publisher_element_WebDriverWait_amount):
    """Return the text of the second "contentContainer" inside the EditionDetails panel.

    Uses the module-level Selenium ``driver``. The caller treats the returned
    text as the publisher (presumably the second details row on the page --
    TODO confirm against the live markup).

    Args:
        publisher_element_WebDriverWait_amount: Maximum number of seconds to
            wait for the ``EditionDetails`` element to be present.

    Returns:
        The text of the second matching container, or ``""`` when the panel
        never appears or holds fewer than two containers.
    """
    try:
        # Find the element with the EditionDetails class
        edition_details_element = WebDriverWait(driver, publisher_element_WebDriverWait_amount).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'EditionDetails'))
        )

        # Find the nested elements with the data-testid attribute.
        # find_elements(By..., ...) works on Selenium 3 and 4; the
        # find_elements_by_* helpers were removed in Selenium 4.3.
        content_containers = edition_details_element.find_elements(
            By.CSS_SELECTOR, 'div[data-testid="contentContainer"]'
        )

        # find_elements returns a (possibly short) list instead of raising, so
        # guard the index rather than letting an IndexError escape.
        if len(content_containers) < 2:
            print("N O   P U B L I S H E R")
            return ""

        return content_containers[1].text

    except NoSuchElementException:
        print("Element: EditionDetails element after click fail")
        return ""
    except TimeoutException:
        print("Timeout: EditionDetails element not found within the specified time")
        return ""
    except ElementClickInterceptedException:
        print("N O   P U B L I S H E R")
        return ""

In [3]:
def text_file_to_dataframe(filename):
    """Load a text file into a one-column DataFrame, one row per line.

    Args:
        filename: Path of the text file to read.

    Returns:
        A DataFrame with a single ``links`` column holding every line of the
        file with surrounding whitespace stripped, or ``None`` (after printing
        a message) when the file does not exist.
    """
    # Guard clause: nothing to load if the file is missing
    if not os.path.exists(filename):
        print(f"{filename} does not exist.")
        return None

    # Strip each raw line (this also drops the trailing newline)
    with open(filename, "r") as handle:
        stripped_lines = [raw_line.strip() for raw_line in handle.readlines()]

    return pd.DataFrame(stripped_lines, columns=['links'])

In [4]:
# Text file holding the links to scrape, one per line.
input_filename = "final_links.txt"
# Full list of links as a single-column ('links') DataFrame; None if the file is missing.
df = text_file_to_dataframe(input_filename)

In [5]:
# Restrict this run to rows 1361-2000 of the full link list.
# The batch is rebuilt from the links file rather than from `df` itself, so
# re-running this cell gives the same rows instead of slicing an
# already-sliced frame. reset_index(drop=True) returns a new frame, which also
# avoids the chained-assignment warning pandas can raise when columns are
# later written into a slice.
df = (
    text_file_to_dataframe(input_filename)
    .iloc[1361:2001]
    .reset_index(drop=True)
)
df.shape

(640, 1)

In [6]:
# S C R A P I N G   B R O W S E R
# Visits every link in `df`, scrapes the book's metadata plus up to five
# reviews, appends one row per book to remaining.csv and stores the next row
# to process in index.txt so an interrupted run can resume.
driver = webdriver.Chrome()

reviews_file_exists = os.path.isfile("remaining.csv")
index_file_exists = os.path.isfile("index.txt")

# An existing-but-empty remaining.csv (this cell creates one when a run
# scrapes nothing) still needs its header row.
csv_header_written = reviews_file_exists and os.path.getsize("remaining.csv") > 0

# checking last value of index in order to continue from last position
last_index = 0
if index_file_exists:
    with open("index.txt", "r") as index_file:
        saved_index = index_file.read().strip()
    # An empty index.txt (also created by a run that scraped nothing) means
    # "start from the first row" instead of crashing on int("").
    if saved_index:
        last_index = int(saved_index)

print(f"Output of last_index: {last_index}")
#============================================================================================================
# T I M E   W A I T   S E T T I N G S
#============================================================================================================
# Fixed sleeps (seconds)
synopsis_and_review_wait_amount = 7
find_button_wait = 0
publisher_click_wait_amount = 7

# Upper bounds (seconds) for the explicit WebDriverWait calls
button_element_WebDriverWait_amount = 30
title_element_WebDriverWait_amount = 20
synopsis_and_review_list_WebDriverWait_amount = 20
publisher_element_WebDriverWait_amount = 60


def _element_text(by, selector, default=""):
    """Return the text of the first element matching `selector`, or `default` if none is on the page."""
    try:
        # find_element(By..., ...) works on Selenium 3 and 4; the
        # find_element_by_* helpers were removed in Selenium 4.3.
        return driver.find_element(by, selector).text
    except NoSuchElementException:
        return default


try:
    # iterate over the dataframe
    for index, row in df.iloc[last_index:].iterrows():
        #============================================================================================================
        # L O A D    P A G E    O N    S C R A P I N G    B R O W S E R
        #============================================================================================================
        driver.get(row['links'])

        #============================================================================================================
        # M E A T   A N D   P O T A T O E S
        #============================================================================================================
        try:
            print(f"- Waiting {synopsis_and_review_wait_amount} Seconds for Synopsis and Reviews to load....")
            time.sleep(synopsis_and_review_wait_amount)
            synopsis_and_review_list = WebDriverWait(driver, synopsis_and_review_list_WebDriverWait_amount).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, 'Formatted'))
            )
            synopsis_length = len(synopsis_and_review_list)
        except TimeoutException:
            # The title has not been scraped yet at this point, so identify the book by its link
            print(f"Timed out while waiting for synopsis and review list for book {row['links']}, moving on to next book")
            continue
        #============================================================================================================
        # T I T L E
        #============================================================================================================
        try:
            title_element = WebDriverWait(driver, title_element_WebDriverWait_amount).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'BookPageTitleSection__title'))
            )
            title = title_element.text
        except TimeoutException:
            print(f"Timed out while waiting for title for book {row['links']}, moving on to next book")
            continue

        #============================================================================================================
        # P U B L I S H E R
        #============================================================================================================
        # The flags start as "Fail" and only flip once the step really succeeded,
        # so the log reports what actually happened with the button.
        find_button_test = "Fail"
        clicked_button_test = "Fail"

        # First, try to find and click the button
        try:
            print(f"Waiting {find_button_wait} seconds for button presense")
            time.sleep(find_button_wait)
            print(f"Soft Button Wait of {button_element_WebDriverWait_amount} seconds")
            button = WebDriverWait(driver, button_element_WebDriverWait_amount).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'button[aria-label="Book details and editions"]'))
            )
            find_button_test = "Pass"

            button.click()
            # if no exception from click() then assign "Pass"
            clicked_button_test = "Pass"

            # waiting for elements to load after clicking button
            print(f"- Waiting {publisher_click_wait_amount} seconds after clicking button")
            time.sleep(publisher_click_wait_amount)
        except (NoSuchElementException, TimeoutException):
            # WebDriverWait signals a missing button with TimeoutException
            print("Button not found")
        except (ElementNotInteractableException, ElementClickInterceptedException):
            print("Button Not Clickable or Overlay is in the way\n")

        # If the button is not found or not clickable, the publisher lookup still runs
        try:
            publisher = get_publisher_text(publisher_element_WebDriverWait_amount)
        except Exception as error:
            # Best effort: a failed publisher lookup must not abort the whole run,
            # but Ctrl-C (KeyboardInterrupt) is deliberately not swallowed.
            print(f"Publisher lookup failed: {error}")
            publisher = ""

        #============================================================================================================
        # D E F I N I T I O N    S E C T I O N
        #============================================================================================================
        page_count = _element_text(By.CSS_SELECTOR, 'p[data-testid="pagesFormat"]')
        year = _element_text(By.CSS_SELECTOR, 'p[data-testid="publicationInfo"]')

        # converting string to integer with RegEx; thousands separators are
        # accepted so that e.g. "1,234 reviews" becomes 1234 rather than 1
        review_count_text = _element_text(By.CSS_SELECTOR, 'span[data-testid="reviewsCount"]')
        review_count_match = re.search(r'\d[\d,]*', review_count_text)
        if review_count_match:
            review_count = int(review_count_match.group().replace(',', ''))
        else:
            review_count = 0

        rating_count = _element_text(By.CSS_SELECTOR, 'span[data-testid="ratingsCount"]')
        rating = _element_text(By.CLASS_NAME, 'RatingStatistics__rating')

        try:
            genre_class_element = driver.find_element(By.CLASS_NAME, 'BookPageMetadataSection__genres')
            genres_text_element = genre_class_element.find_elements(
                By.CSS_SELECTOR, '.BookPageMetadataSection__genreButton .Button__labelItem'
            )
            # find_elements returns an empty list (it does not raise) when nothing matches
            genre = genres_text_element[0].text if genres_text_element else ""
        except NoSuchElementException:
            genre = ""

        author = _element_text(By.CLASS_NAME, 'ContributorLink__name')

        # assign the current datetime to this column for each row in the loop
        current_datetime = datetime.datetime.now()

        book_reviews = [title, page_count, year, review_count, rating, rating_count, genre, author, publisher, current_datetime]

        # current row in DataFrame. create these columns
        # and add book information to current dataframe row
        df.at[index, 'title'] = title
        df.at[index, 'page_count'] = page_count
        df.at[index, 'publisher'] = publisher
        df.at[index, 'year'] = year
        df.at[index, 'review_count'] = review_count
        df.at[index, 'rating'] = rating
        df.at[index, 'rating_count'] = rating_count
        df.at[index, 'genre'] = genre
        df.at[index, 'author'] = author
        df.at[index, 'scraped_at'] = current_datetime

        # ==================================================================
        # R E V I E W S
        # ==================================================================
        # list of elements "synopsis_and_review_list" has synopsis
        # and reviews

        # set this to the maximum number of reviews you want to capture
        max_reviews = 5

        # index 0 has synopsis, index 1 & 2 are blank, 3-7 contain reviews
        first_review_index = 3

        # Always emit exactly max_reviews cells (padding with "") so every CSV
        # row has the same number of columns as the header.
        for review_index in range(first_review_index, first_review_index + max_reviews):
            if review_index < synopsis_length:
                book_reviews.append(synopsis_and_review_list[review_index].text)
            else:
                book_reviews.append("")

        # ==================================================================
        # L O G
        # ==================================================================
        # T E S T   D E F I N I T I O N S
        # ------------------------------------------------------------------
        review_5_test = 'Pass' if synopsis_length >= 8 else 'Fail'
        publisher_test = 'Fail' if publisher == '' else 'Pass'
        print("===============================================================")
        print(f"            N E W   B O O K  S T A R T I N G:   #{index+1}")
        print("===============================================================")
        print("                      L I N K")
        print("---------------------------------------------------------------")
        print(row['links'])
        print("---------------------------------------------------------------")
        print(f"{title}")
        print("---------------------------------------------------------------")
        print(f"5 Review Test:{review_5_test}")
        print("---------------------------------------------------------------")
        print(f"Clicked Button Test:{clicked_button_test}")
        print("---------------------------------------------------------------")
        print(f"Publisher Found Test:{publisher_test}")
        print("---------------------------------------------------------------")
        print(f"TIME STAMP:{current_datetime}")
        print("---------------------------------------------------------------")
        # ==================================================================
        # S A V I N G   R O W   A F T E R   E A C H   I T E R A T I O N
        # ==================================================================
        with open("remaining.csv", "a", newline='', encoding='utf-8') as reviews_file:
            csv_writer = csv.writer(reviews_file)
            if not csv_header_written:
                csv_writer.writerow(['Book Name', 'page_count', 'year', 'review_count', 'rating', 'rating_count', 'genre', 'author', 'publisher', 'scraped_at', 'Review 1', 'Review 2', 'Review 3', 'Review 4', 'Review 5'])
                csv_header_written = True
            csv_writer.writerow(book_reviews)
        reviews_file_exists = True

        # ==================================================================
        # S A V I N G   P L A C E   A F T E R   E A C H   I T E R A T I O N
        # ==================================================================
        # save place of index to file for resuming later
        with open("index.txt", "w") as index_file:
            index_file.write(str(index + 1))
finally:
    # D O N E    W I T H    S C R A P E R
    # Close the browser even when the run is interrupted or a page raises.
    driver.quit()

if not reviews_file_exists:
    open("remaining.csv", "a", newline='', encoding='utf-8').close()

if not index_file_exists:
    open("index.txt", "a").close()

Output of last_index: 43
- Waiting 7 Seconds for Synopsis and Reviews to load....
Waiting 0 seconds for button presense
Soft Button Wait of 30 seconds
- Waiting 7 seconds after clicking button
            N E W   B O O K  S T A R T I N G:   #44
                      L I N K
---------------------------------------------------------------
https://www.goodreads.com/book/show/23692271-sapiens
---------------------------------------------------------------
Sapiens: A Brief History of Humankind
---------------------------------------------------------------
5 Review Test:Fail
---------------------------------------------------------------
Clicked Button Test:Pass
---------------------------------------------------------------
Publisher Found Test:Pass
---------------------------------------------------------------
TIME STAMP:2023-03-23 11:05:24.912365
---------------------------------------------------------------
- Waiting 7 Seconds for Synopsis and Reviews to load....
Waiting 0 seconds for

KeyboardInterrupt: 