In [8]:
import os
import time
import datetime
import csv
import re
import json
import pandas as pd
    
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
#from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException, TimeoutException, ElementNotInteractableException, ElementClickInterceptedException

In [None]:
def text_file_to_dataframe(filename):
    """Read a text file of links (one per line) into a single-column DataFrame.

    Surrounding whitespace is stripped from every line. Blank lines are
    skipped so that no empty link ends up in the frame.

    Args:
        filename: path of the text file to read.

    Returns:
        A DataFrame with one column, ``links``, holding one row per non-blank
        line, or ``None`` (after printing a message) when the file is missing.
    """
    if not os.path.exists(filename):
        print(f"{filename} does not exist.")
        return None

    with open(filename, "r") as file:
        stripped_lines = (line.strip() for line in file)
        # Drop blank lines: an empty string is not a usable link.
        links = [link for link in stripped_lines if link]

    return pd.DataFrame(links, columns=['links'])

In [6]:
# Save settings to file
def save_settings(settings, filename):
    """Write ``settings`` to ``filename`` as JSON, replacing any existing file.

    Args:
        settings: JSON-serializable object (the notebook passes a dict).
        filename: path of the file to (over)write.
    """
    with open(filename, "w") as settings_file:
        settings_file.write(json.dumps(settings))

In [13]:
# Load settings from file
def load_settings(filename):
    """Load the scraper's wait settings from a JSON file.

    Values found in the file override the built-in defaults; any key the file
    does not provide keeps its default, so the returned dict always contains
    every default key.

    Args:
        filename: path of the JSON settings file.

    Returns:
        A new dict of settings. The defaults alone are returned when the file
        is missing, is not valid JSON, or does not contain a JSON object.
    """
    # Define default settings
    DEFAULT_SETTINGS = {
        "synopsis_and_review_wait_amount": 4,
        "find_button_wait": 2,
        "publisher_click_wait_amount": 2,
        "button_element_WebDriverWait_amount": 20,
        "title_element_WebDriverWait_amount": 20,
        "synopsis_and_review_list_WebDriverWait_amount": 20,
        "publisher_element_WebDriverWait_amount": 20,
    }
    try:
        with open(filename, "r") as f:
            loaded = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        return DEFAULT_SETTINGS

    # Valid JSON that is not an object (e.g. a list) cannot hold settings.
    if not isinstance(loaded, dict):
        return DEFAULT_SETTINGS

    # Loaded values win; missing keys fall back to the defaults.
    return {**DEFAULT_SETTINGS, **loaded}

In [None]:
def book_details_and_editions_button_click(find_button_wait,button_element_WebDriverWait_amount,publisher_click_wait_amount):
    """Locate and click the "Book details and editions" button on the current page.

    Uses the module-level ``driver``. Exceptions raised by the wait or by the
    click are not handled here and propagate to the caller.

    Args:
        find_button_wait: seconds to sleep before looking for the button.
        button_element_WebDriverWait_amount: maximum seconds to wait for the
            button to be present in the DOM.
        publisher_click_wait_amount: seconds to sleep after the click so the
            expanded content can load.

    Returns:
        The string "Pass" once the click has completed without raising.
    """
    print(f"- [[Hard Wait]] of {find_button_wait} seconds for button to load...")
    time.sleep(find_button_wait)

    print(f"- Waiting up to {button_element_WebDriverWait_amount} seconds for presence of Button element")
    button_wait = WebDriverWait(driver, button_element_WebDriverWait_amount)
    details_button_locator = (By.CSS_SELECTOR, 'button[aria-label="Book details and editions"]')
    details_button = button_wait.until(EC.presence_of_element_located(details_button_locator))
    print("- Found Button")

    print("- Trying Click")
    details_button.click()
    print("- Clicked Button:")

    # give the page time to render whatever the click revealed
    time.sleep(publisher_click_wait_amount)
    print(f"- [[Hard Wait]] of {publisher_click_wait_amount} seconds after clicking button")

    # reaching this point means click() raised nothing
    return "Pass"

In [None]:
def get_publisher_text(publisher_element_WebDriverWait_amount):
    """Read the publisher text from the edition-details section of the current page.

    Uses the module-level ``driver``. Every handled failure is logged and
    reported as an empty string rather than raised.

    Args:
        publisher_element_WebDriverWait_amount: maximum seconds to wait for the
            ``EditionDetails`` element to be present.

    Returns:
        The text of the second ``div[data-testid="contentContainer"]`` inside
        the ``EditionDetails`` element, or ``""`` when the element is missing,
        the wait times out, or fewer than two containers exist.
    """
    try:
        print("- Looking for Publisher - Parent Element")
        # Find the element with the EditionDetails class
        edition_details_element = WebDriverWait(driver, publisher_element_WebDriverWait_amount).until(EC.presence_of_element_located((By.CLASS_NAME, 'EditionDetails')))
        print("- Found Parent Element")
        print("- Looking for Publisher - Child Element")
        # Find the nested elements with the data-testid attribute. The
        # By-locator form is used because the find_elements_by_* helpers were
        # removed from Selenium 4, and it matches the rest of this notebook.
        content_containers = edition_details_element.find_elements(By.CSS_SELECTOR, 'div[data-testid="contentContainer"]')
        # NOTE(review): assumes the second container is the publisher row -
        # confirm against the page markup.
        publisher_text = content_containers[1].text
        # Logged only after the lookup succeeded, so the log is never misleading.
        print("- Found Child Element")

        return publisher_text

    except NoSuchElementException:
        print("Element: EditionDetails element after click fail")
        return ""

    except TimeoutException:
        print("Timeout: EditionDetails element not found within the specified time")
        return ""

    except IndexError:
        print("IndexError: The publisher_element list does not have enough elements")
        return ""

    except ElementClickInterceptedException:
        print("N O   P U B L I S H E R")
        return ""

In [None]:
def index_file_check(filename):
    """Report whether a resume-index file exists and which index it holds.

    Args:
        filename: path of the file that stores the position to resume from.

    Returns:
        A tuple ``(index_file_exists, last_index)``. ``last_index`` is the
        integer stored in the file, or 0 when the file is missing or its
        content is not an integer (for example an empty file).
    """
    # boolean value, test if file exists
    index_file_exists = os.path.isfile(filename)
    last_index = 0

    # checking last value of index in order to continue from last position
    if index_file_exists:
        with open(filename, "r") as index_file:
            stored_value = index_file.read().strip()
        try:
            last_index = int(stored_value)
        except ValueError:
            # An empty or corrupt index file must not stop the run; restart from the top.
            print(f"Could not read a resume index from {filename}; starting from 0.")

    print(f"Output of last_index: {last_index}")

    return index_file_exists, last_index

In [None]:
def scraper_logger(index,current_link,section):
    """Print the log banner for one section of a book's scrape.

    Args:
        index: zero-based position of the book; shown one-based in the banner.
        current_link: URL of the page being scraped.
        section: name of the banner to print. Only ``"title"`` produces
            output; any other value prints nothing.
    """
    if section != "title":
        # The original ended in an unfinished `else if section == "":` branch
        # with no body; other sections are therefore a no-op.
        return

    print("===============================================================")
    print(f"            N E W   B O O K  S T A R T I N G:   #{index+1}")
    print("===============================================================")
    print("                      L I N K")
    print("---------------------------------------------------------------")
    print(current_link)
    print("---------------------------------------------------------------")
    print("                   A C T I O N    L O G")
    print("---------------------------------------------------------------")

In [None]:
def publisher_test(publisher):
    """Return 'Fail' when ``publisher`` equals the empty string, else 'Pass'."""
    if publisher == '':
        return 'Fail'
    return 'Pass'

In [None]:
def review_test(synopsis_length):
    """Return 'Pass' when ``synopsis_length`` is at least 8, else 'Fail'.

    NOTE(review): 8 presumably is the 3 leading entries (synopsis plus two
    blanks) and 5 reviews that the scraper expects - confirm.
    """
    if synopsis_length >= 8:
        return 'Pass'
    return 'Fail'

In [None]:
def scraper(df, settings_dict, last_index, output_path=None, index_path="index.txt"):
    """Scrape every book page listed in ``df['links']`` and persist the results.

    For each row from position ``last_index`` onward, the page is loaded in the
    module-level ``driver``. The synopsis, title, publisher, basic metadata and
    up to five reviews are read, written into ``df`` (in place), and appended
    as one row to the output CSV. The resume position is then written to
    ``index_path``. A book whose synopsis/review list or title does not appear
    within its timeout is logged and skipped.

    Args:
        df: DataFrame with a ``links`` column of page URLs.
        settings_dict: wait settings; must contain the seven keys read below.
        last_index: position in ``df`` at which to start (0 starts at the top).
        output_path: CSV file to append to. When omitted, the notebook-level
            ``output_filename`` is used.
        index_path: file that receives ``index + 1`` after every saved book.

    Returns:
        The same ``df`` object, with the scraped columns filled in.
    """
    if output_path is None:
        # Fall back to the module-level name this function has always read.
        output_path = output_filename

    # The header row is needed when the CSV is missing or still empty.
    write_header = not os.path.isfile(output_path) or os.path.getsize(output_path) == 0

    # hard waits (time.sleep)
    synopsis_and_review_wait_amount = settings_dict["synopsis_and_review_wait_amount"]
    find_button_wait = settings_dict["find_button_wait"]
    publisher_click_wait_amount = settings_dict["publisher_click_wait_amount"]

    # soft waits (WebDriverWait timeouts)
    button_element_WebDriverWait_amount = settings_dict["button_element_WebDriverWait_amount"]
    title_element_WebDriverWait_amount = settings_dict["title_element_WebDriverWait_amount"]
    synopsis_and_review_list_WebDriverWait_amount = settings_dict["synopsis_and_review_list_WebDriverWait_amount"]
    publisher_element_WebDriverWait_amount = settings_dict["publisher_element_WebDriverWait_amount"]

    for index, row in df.iloc[last_index:].iterrows():
        # ---------------------------------------------------------------
        # Load page and start the log for this book
        # ---------------------------------------------------------------
        current_link = row['links']
        driver.get(current_link)
        scraper_logger(index, current_link, section="title")

        # ---------------------------------------------------------------
        # Synopsis & reviews
        # ---------------------------------------------------------------
        try:
            print(f"- [[Hard Wait]] of {synopsis_and_review_wait_amount} seconds for Synopsis and Reviews to load....")
            time.sleep(synopsis_and_review_wait_amount)
            print(f"- Waiting up to {synopsis_and_review_list_WebDriverWait_amount} for presence of Synopsis and Reviews element")
            synopsis_and_review_list = WebDriverWait(driver, synopsis_and_review_list_WebDriverWait_amount).until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'Formatted')))
        except TimeoutException:
            # The title has not been read yet at this point, so identify the book by its link.
            print(f"!!!Timed out while waiting for synopsis and review list for book {current_link}, moving on to next book!!!")
            continue
        synopsis = synopsis_and_review_list[0].text
        synopsis_length = len(synopsis_and_review_list)

        # ---------------------------------------------------------------
        # Title
        # ---------------------------------------------------------------
        try:
            print(f"- Waiting up to {title_element_WebDriverWait_amount} for presence of Title element")
            title_element = WebDriverWait(driver, title_element_WebDriverWait_amount).until(EC.presence_of_element_located((By.CLASS_NAME, 'BookPageTitleSection__title')))
            title = title_element.text
        except TimeoutException:
            print(f"!!! Timed out while waiting for title for book {current_link}, moving on to next book !!!")
            continue

        # ---------------------------------------------------------------
        # Publisher (click the details button, retry once on failure)
        # ---------------------------------------------------------------
        publisher, clicked_button_test = _scrape_publisher(
            find_button_wait,
            button_element_WebDriverWait_amount,
            publisher_click_wait_amount,
            publisher_element_WebDriverWait_amount,
        )

        # ---------------------------------------------------------------
        # Basic metadata
        # ---------------------------------------------------------------
        record = {
            'title': title,
            'synopsis': synopsis,
            'page_count': _element_text(By.CSS_SELECTOR, 'p[data-testid="pagesFormat"]'),
            'year': _element_text(By.CSS_SELECTOR, 'p[data-testid="publicationInfo"]'),
            'review_count': _parse_count(_element_text(By.CSS_SELECTOR, 'span[data-testid="reviewsCount"]')),
            'rating': _element_text(By.CLASS_NAME, 'RatingStatistics__rating'),
            'rating_count': _element_text(By.CSS_SELECTOR, 'span[data-testid="ratingsCount"]'),
            'genre': _first_genre(),
            'author': _element_text(By.CLASS_NAME, 'ContributorLink__name'),
            'publisher': publisher,
        }

        # add book information to the current DataFrame row
        for column, value in record.items():
            df.at[index, column] = value

        # the CSV row starts with the same values, in the same order
        book_reviews = list(record.values())

        # ---------------------------------------------------------------
        # Reviews: index 0 is the synopsis, 1 & 2 are blank, reviews follow
        # ---------------------------------------------------------------
        review_elements = synopsis_and_review_list[_FIRST_REVIEW_INDEX:_FIRST_REVIEW_INDEX + _MAX_REVIEWS]
        reviews = [element.text for element in review_elements]
        # pad so that every CSV row has exactly _MAX_REVIEWS review columns
        reviews.extend(["no_review"] * (_MAX_REVIEWS - len(reviews)))
        book_reviews.extend(reviews)

        # ---------------------------------------------------------------
        # Time stamp and per-book test results
        # ---------------------------------------------------------------
        current_datetime = datetime.datetime.now()

        # Local names differ from the helper functions on purpose: assigning
        # to `publisher_test` here would shadow the function being called.
        publisher_result = publisher_test(publisher)
        review_5_result = review_test(synopsis_length)

        print("---------------------------------------------------------------")
        print("                      T I T L E")
        print("---------------------------------------------------------------")
        print(f"{title}")
        print("---------------------------------------------------------------")
        print(f"5 Review Test:{review_5_result}")
        print("---------------------------------------------------------------")
        print(f"Clicked Button Test:{clicked_button_test}")
        print("---------------------------------------------------------------")
        print(f"Publisher Found Test:{publisher_result}")
        print("---------------------------------------------------------------")
        print(f"TIME STAMP:{current_datetime}")
        print("---------------------------------------------------------------\n")
        print("\n")

        # ---------------------------------------------------------------
        # Final output columns
        # ---------------------------------------------------------------
        df.at[index, 'link'] = current_link
        df.at[index, 'scraped_at'] = current_datetime
        book_reviews.append(current_link)
        book_reviews.append(current_datetime)

        # ---------------------------------------------------------------
        # Save data after each iteration
        # ---------------------------------------------------------------
        with open(output_path, "a", newline='', encoding='utf-8') as reviews_file:
            csv_writer = csv.writer(reviews_file)
            if write_header:
                csv_writer.writerow(_CSV_HEADER)
                write_header = False
            csv_writer.writerow(book_reviews)

        # save place of index to file for resuming later
        with open(index_path, "w") as index_file:
            index_file.write(str(index + 1))

    return df


# maximum number of reviews captured per book
_MAX_REVIEWS = 5

# position of the first review in the list of 'Formatted' elements
_FIRST_REVIEW_INDEX = 3

# header row of the output CSV; order matches the row built in scraper()
_CSV_HEADER = ['Book Name','Synopsis', 'page_count', 'year', 'review_count', 'rating', 'rating_count', 'genre', 'author', 'publisher', 'Review 1', 'Review 2', 'Review 3', 'Review 4', 'Review 5','Link','scraped_at']


def _scrape_publisher(find_button_wait, button_timeout, click_wait, publisher_timeout):
    """Click the details button and read the publisher, retrying once; returns (publisher, clicked_button_test)."""
    for attempt in (1, 2):
        retrying = attempt == 2
        try:
            if retrying:
                print("- Trying to click one last time")
            clicked_button_test = book_details_and_editions_button_click(find_button_wait, button_timeout, click_wait)
            print("- Trying to get publisher one last time" if retrying else "- Looking for publisher...")
            return get_publisher_text(publisher_timeout), clicked_button_test
        except (NoSuchElementException, TimeoutException):
            # WebDriverWait signals a missing button with TimeoutException.
            print("- Didn't Find Element" if retrying else "!!! Button not found !!!")
        except (ElementNotInteractableException, ElementClickInterceptedException):
            print("!!! Button STILL Not Clickable or Overlay is in the way !!!" if retrying else "!!! Button Not Clickable or Overlay is in the way !!!")
    return "", "Fail"


def _element_text(by, selector, default=""):
    """Return the text of the first element matching the locator, or ``default`` when none exists."""
    try:
        return driver.find_element(by, selector).text
    except NoSuchElementException:
        return default


def _parse_count(text):
    """Return the first integer in ``text`` (thousands separators allowed), or 0 when there is none."""
    match = re.search(r'\d[\d,]*', text)
    if match is None:
        return 0
    return int(match.group().replace(',', ''))


def _first_genre():
    """Return the label of the first genre button on the page, or "" when there is none."""
    try:
        genre_section = driver.find_element(By.CLASS_NAME, 'BookPageMetadataSection__genres')
    except NoSuchElementException:
        return ""
    # find_elements returns an empty list instead of raising, so check before indexing
    genre_labels = genre_section.find_elements(By.CSS_SELECTOR, '.BookPageMetadataSection__genreButton .Button__labelItem')
    return genre_labels[0].text if genre_labels else ""

In [1]:
def get_goodreads_book_dataset(df, output_filename, settings_dict, index_filename = "index.txt"):
    """Start a Chrome browser, run ``scraper`` over ``df`` and clean up afterwards.

    Args:
        df: DataFrame of links to scrape; ``None`` is returned unchanged
            without starting a browser.
        output_filename: path of the CSV the scraped rows are meant to go to.
            NOTE(review): ``scraper`` as declared takes
            ``(df, settings_dict, last_index)`` and resolves its CSV path from
            the notebook-level ``output_filename``, so this argument must
            name the same file - confirm.
        settings_dict: wait settings passed through to ``scraper``.
        index_filename: file holding the position to resume from; it is
            deleted once the scraper finishes without raising.

    Returns:
        The DataFrame returned by ``scraper``.
    """
    # The scraping helpers look `driver` up as a module-level name, so the
    # browser has to be bound globally; a local variable is invisible to them.
    global driver

    if df is None:
        return None

    # Resume position from a previous, interrupted run (0 when there is none).
    _, last_index = index_file_check(index_filename)

    # Starting scraper browser.
    driver = webdriver.Chrome()
    try:
        # Running scraper loop function with the arguments its signature declares.
        df = scraper(df, settings_dict, last_index)
    finally:
        # Always close the browser, including when scraping raised.
        driver.quit()

    # The scraper ran to completion, so there is nothing left to resume.
    # (The earlier "closing files" steps only created empty files; an empty,
    # header-less CSV would make a later run skip writing the header row.)
    if os.path.isfile(index_filename):
        os.remove(index_filename)

    return df

# Run GoodReads Scraper

#### Settings

In [None]:
# Names of the input & output files.
# input_filename:    text file of links, read by text_file_to_dataframe
# output_filename:   CSV that receives the scraped rows
# settings_filename: JSON file holding the wait settings
input_filename = "final_links.txt"
output_filename = "weekend_dataset.csv"
settings_filename = "settings.json"

# Load settings from file (load_settings falls back to its built-in defaults
# when the file is missing or is not valid JSON)
time_settings_dict = load_settings(settings_filename)

In [10]:
# Update settings
# ---------------
# All values are in seconds.
time_settings_dict.update({
    # hard waits: fixed time.sleep pauses
    "synopsis_and_review_wait_amount": 4,
    # next 2 settings could double, if failure occurs
    "find_button_wait": 2,
    "publisher_click_wait_amount": 2,

    # soft waits: WebDriverWait timeouts
    "button_element_WebDriverWait_amount": 20,
    "title_element_WebDriverWait_amount": 20,
    "synopsis_and_review_list_WebDriverWait_amount": 20,
    "publisher_element_WebDriverWait_amount": 20,
})

In [None]:
# Save the current wait settings to settings_filename as JSON, so that the
# next load_settings(settings_filename) call reads these values back.
save_settings(time_settings_dict, settings_filename)

### Actual Scraper

In [None]:
# acquire links to scrape (None when input_filename does not exist)
df = text_file_to_dataframe(input_filename)

# run goodreads scraper function with the settings loaded above;
# skipped when there is no links file to work from
if df is not None:
    df = get_goodreads_book_dataset(df, output_filename, time_settings_dict)