In [28]:
import pandas as pd
import os
import time
import csv
import re
import codecs
import requests
from bs4 import BeautifulSoup
    
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException, TimeoutException, ElementNotInteractableException, ElementClickInterceptedException

In [2]:
def text_file_to_dataframe(filename):
    """Load a text file of links (one per line) into a single-column DataFrame.

    Args:
        filename: Path of the text file to read.

    Returns:
        A DataFrame with one column, ``links``, holding each non-blank line
        with surrounding whitespace removed, or ``None`` (after printing a
        message) when ``filename`` does not exist.
    """
    # Keep the original "print and return None" contract for a missing file.
    if not os.path.exists(filename):
        print(f"{filename} does not exist.")
        return None

    # Read with an explicit encoding so the result does not depend on the
    # platform default, and skip blank lines so they do not become empty
    # "links" rows.
    with open(filename, "r", encoding="utf-8") as file:
        content = [line.strip() for line in file if line.strip()]

    return pd.DataFrame(content, columns=['links'])

# Loading Links & Creating DataFrame

In [3]:
# File holding the book links to scrape, one URL per line.
input_filename = "links.txt"
df = text_file_to_dataframe(input_filename)

# text_file_to_dataframe returns None for a missing file; stop here with a
# clear message instead of failing later when `df` is first indexed.
if df is None:
    raise FileNotFoundError(f"Cannot build the links DataFrame: {input_filename} does not exist.")

# Scraper Code

In [31]:
# S C R A P I N G   B R O W S E R
# Starts a Chrome session through Selenium; `driver` is the handle the
# exploratory cells below use to load pages and locate elements.
driver = webdriver.Chrome()

In [7]:
# Peek at the fifth link (position 4) to confirm the file was loaded.
df['links'].iat[4]

'https://www.goodreads.com/book/show/57566.Secret_Lives_of_the_First_Ladies?from_search=true&from_srp=true&qid=KL5lITbxkx&rank=1'

In [34]:
# Load a Goodreads page in the browser session for manual selector experiments.
driver.get('https://www.goodreads.com/choiceawards/best-fantasy-books-2022')

In [12]:
# Locate the "Book details and editions" button by its aria-label.
# Uses find_element(By...), which replaces the find_element_by_* helpers that
# newer Selenium releases no longer provide.
button = driver.find_element(By.CSS_SELECTOR, 'button[aria-label="Book details and editions"]')

In [14]:
# Click the element currently held in `button`.
button.click()

In [38]:
# Look up the first element carrying the gr-iconButton class and show its text.
# Uses find_element(By...), which replaces the find_element_by_* helpers that
# newer Selenium releases no longer provide.
close_button = driver.find_element(By.CLASS_NAME, 'gr-iconButton')
close_button.text

''

In [41]:
# Look up the first .gr-iconButton element and click it.
# Uses find_element(By...), which replaces the find_element_by_* helpers that
# newer Selenium releases no longer provide.
# NOTE(review): the recorded run of this cell raised
# ElementNotInteractableException, so the first match is presumably not the
# visible close button - confirm the selector before relying on this click.
close_button = driver.find_element(By.CSS_SELECTOR, '.gr-iconButton')
close_button.click()


ElementNotInteractableException: Message: element not interactable
  (Session info: chrome=111.0.5563.64)


In [None]:
# Markup of the button label, kept for reference when building selectors:
# <span class="Button__labelItem">Book details &amp; editions</span>

In [42]:
# Define a function to extract the publisher element text
def get_publisher_text(timeout=60, settle_seconds=10):
    """Return the publication line from the page's ``EditionDetails`` section.

    Works on the page currently loaded in the module-level ``driver``.

    Args:
        timeout: Maximum number of seconds to wait for the element with class
            ``EditionDetails`` to be present.
        settle_seconds: Fixed pause, in seconds, taken before the wait starts.

    Returns:
        The text of the second ``div[data-testid="contentContainer"]`` inside
        ``EditionDetails`` (in the recorded runs this reads like
        "September 1, 2007 by Rosen Central"), or ``""`` when the section does
        not appear in time or holds fewer than two such containers.
    """
    try:
        time.sleep(settle_seconds)
        edition_details_element = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'EditionDetails'))
        )

        # find_elements(By...) replaces the find_elements_by_* helpers that
        # newer Selenium releases no longer provide.
        content_containers = edition_details_element.find_elements(
            By.CSS_SELECTOR, 'div[data-testid="contentContainer"]'
        )
    except TimeoutException:
        print("Timeout: EditionDetails element not found within the specified time")
        return ""
    except (NoSuchElementException, ElementClickInterceptedException):
        return ""

    # find_elements returns a (possibly short) list instead of raising, so
    # guard the index rather than letting an IndexError escape to the caller.
    if len(content_containers) < 2:
        return ""
    return content_containers[1].text

In [None]:
# S C R A P I N G   B R O W S E R
driver = webdriver.Chrome()

# Treat an existing but empty reviews.csv as "no header yet"; otherwise the
# header row would never be written to it.
reviews_file_exists = os.path.isfile("reviews.csv") and os.path.getsize("reviews.csv") > 0
print("Does Review CSV File Exists?")
print(reviews_file_exists)
print("\n")

index_file_exists = os.path.isfile("index.txt")

# checking last value of index in order to continue from last position
last_index = 0
if index_file_exists:
    with open("index.txt", "r") as index_file:
        saved_position = index_file.read().strip()
    try:
        last_index = int(saved_position)
    except ValueError:
        # An empty or corrupt checkpoint should not abort the run; start over.
        print(f"Could not read a position from index.txt ({saved_position!r}); starting from 0")
        last_index = 0

print("Output of last_index")
print(last_index)

# iterate over the dataframe, starting at the checkpointed position


def _parse_count(text):
    """Return the first integer found in ``text`` or None when it has no digits.

    Thousands separators are honoured, so "1,234 reviews" yields 1234.
    """
    match = re.search(r'\d[\d,]*', text)
    if match is None:
        return None
    return int(match.group().replace(',', ''))


def _scrape_text(by, locator, label):
    """Return the text of the first element matching the locator, or "" if absent.

    When the element is found, the value is also echoed under the same
    "Ouput of <label>" heading used by the rest of the log output.
    """
    try:
        value = driver.find_element(by, locator).text
    except NoSuchElementException:
        return ""
    print(f"Ouput of {label}")
    print(value)
    return value


# The try/finally guarantees the browser is closed even if a book fails in a
# way the per-field handlers below do not cover.
try:
    for index, row in df.iloc[last_index:].iterrows():
        print("Ouput of index")
        print(index)

        print("Ouput of row")
        print(row)

        #============================================================================================================
        # L O A D    P A G E    O N    S C R A P I N G    B R O W S E R
        #============================================================================================================
        driver.get(row['links'])
        try:
            print("Waiting for Title Element to Load....")
            title_element = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.CLASS_NAME, 'BookPageTitleSection__title')))
            print("Assigning Title")
            title = title_element.text
        except TimeoutException:
            print(f"Timed out while waiting for title for book {row['links']}, moving on to next book")
            continue

        print("Ouput of title")
        print(title)

        #============================================================================================================
        # P U B L I S H E R   D A T A
        #============================================================================================================
        # First, try to find and click the button
        button_found = False
        try:
            time.sleep(10)
            button = driver.find_element(By.CSS_SELECTOR, 'button[aria-label="Book details and editions"]')
            button.click()
            button_found = True
            # Wait for the publisher element to load
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "EditionDetails")))
        except (NoSuchElementException, ElementNotInteractableException, ElementClickInterceptedException):
            print("Button not found or not clickable")
        except TimeoutException:
            # The click worked but the details did not show up in time; do not
            # abort the whole run, get_publisher_text below waits once more.
            print("Timed out waiting for EditionDetails after clicking the button")

        # If the button is not found or not clickable, the get_publisher_text function will still run
        publisher = get_publisher_text()

        # Print the extracted publisher text. get_publisher_text reports a
        # missing value as "", so test for emptiness rather than for None.
        if publisher:
            print("=====================")
            print(" P U B L I S H E R ")
            print("=====================")
            print(publisher)
            print("=====================")
        else:
            print("Publisher text not found")

        #============================================================================================================
        # D E F I N I T I O N    S E C T I O N
        #============================================================================================================
        try:
            print("Waiting for Synopsis and Reviews to load....")
            time.sleep(20)
            synopsis_and_review_list = WebDriverWait(driver, 20).until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'Formatted')))
            print(f"The variable length of book {index+1} = ")
            print(len(synopsis_and_review_list))
            synopsis_length = len(synopsis_and_review_list)
            print("Output of synopsis_length")
            print(synopsis_length)
        except TimeoutException:
            print(f"Timed out while waiting for synopsis and review list for book {title}, moving on to next book")
            continue

        page_count = _scrape_text(By.CSS_SELECTOR, 'p[data-testid="pagesFormat"]', "page_count")
        year = _scrape_text(By.CSS_SELECTOR, 'p[data-testid="publicationInfo"]', "year")

        try:
            review_count_text = driver.find_element(By.CSS_SELECTOR, 'span[data-testid="reviewsCount"]').text

            print("Ouput of review_count")
            print(review_count_text)

            # converting string to integer with RegEx
            parsed_review_count = _parse_count(review_count_text)
            if parsed_review_count is not None:
                review_count = parsed_review_count
                print("Ouput of RegEx: review_count")
                print(review_count)
            else:
                review_count = 0
                print("Ouput of No_RegEx: review_count")
                print(review_count)

        except NoSuchElementException:
            review_count = 0
            print("Ouput of No_Count: review_count")
            print(review_count)

        rating_count = _scrape_text(By.CSS_SELECTOR, 'span[data-testid="ratingsCount"]', "rating_count")
        rating = _scrape_text(By.CLASS_NAME, 'RatingStatistics__rating', "rating")

        try:
            genre_class_element = driver.find_element(By.CLASS_NAME, 'BookPageMetadataSection__genres')
            genres_text_element = genre_class_element.find_elements(By.CSS_SELECTOR, '.BookPageMetadataSection__genreButton .Button__labelItem')
            # find_elements returns an empty list instead of raising, so an
            # empty genre section must be handled explicitly.
            genre = genres_text_element[0].text if genres_text_element else ""
            print("Ouput of genre")
            print(genre)

        except NoSuchElementException:
            genre = ""

        author = _scrape_text(By.CLASS_NAME, 'ContributorLink__name', "author")

        book_reviews = [title, page_count, year, review_count, rating, rating_count, genre, author, publisher]

        # current row in DataFrame. create these columns
        # and add book information to current dataframe row
        df.at[index, 'page_count'] = page_count
        df.at[index, 'publisher'] = publisher
        df.at[index, 'year'] = year
        df.at[index, 'review_count'] = review_count
        df.at[index, 'rating'] = rating
        df.at[index, 'rating_count'] = rating_count
        df.at[index, 'genre'] = genre
        df.at[index, 'author'] = author

        # =====================
        # T E S T   P R I N T
        # =====================
        print(f"Book {index}:")
        print(title)
        print("==============")
        print(page_count)
        print("\n")
        print("==============")
        print(year)
        print("\n")
        print("==============")
        print("Total Reviews: ")
        print(review_count)
        print("\n")
        print("==============")
        print(rating)
        print("\n")
        print("==============")
        print(rating_count)
        print("\n")
        print("==============")
        print(genre)
        print("\n")
        print("==============")
        print(author)
        print("\n")
        print("==============")
        print(publisher)
        print("\n")
        # ==================================================================
        # R E V I E W S
        # ==================================================================
        # iterating 5 times for reviews offset by 3
        # list of elements "synopsis_and_review_list" has synopsis
        # and reviews
        review_index = 3
        for i in range(5):
            if review_index < synopsis_length:
                print(f"Review {i + 1}")
                print("==============")
                print(synopsis_and_review_list[review_index].text)

                # saving current review to temporary list
                book_reviews.append(synopsis_and_review_list[review_index].text)
                review_index += 1
            else:
                # pad so every CSV row has five review columns
                book_reviews.append("")
        # ==================================================================
        # S A V I N G   B O O K   R O W   T O   C S V
        # ==================================================================
        with open("reviews.csv", "a", newline='', encoding='utf-8') as reviews_file:
            csv_writer = csv.writer(reviews_file)
            if not reviews_file_exists:
                csv_writer.writerow(['Book Name', 'page_count', 'year', 'review_count', 'rating', 'rating_count', 'genre', 'author', 'publisher', 'Review 1', 'Review 2', 'Review 3', 'Review 4', 'Review 5'])
                reviews_file_exists = True
            csv_writer.writerow(book_reviews)

        # ==================================================================
        # S A V I N G   P L A C E   A F T E R   E A C H   I T E R A T I O N
        # ==================================================================
        # save place of index to file for resuming later
        with open("index.txt", "w") as index_file:
            index_file.write(str(index + 1))
finally:
    # D O N E    W I T H    S C R A P E R
    driver.quit()

# Make sure both bookkeeping files exist after the run, in a state the next
# run can consume.
if not reviews_file_exists:
    # No book row was written during this run. Seed the file with the header
    # row: a file that merely exists would be taken as "header already
    # written" next time and the header would never appear.
    needs_header = not os.path.isfile("reviews.csv") or os.path.getsize("reviews.csv") == 0
    with open("reviews.csv", "a", newline='', encoding='utf-8') as reviews_file:
        if needs_header:
            csv.writer(reviews_file).writerow(['Book Name', 'page_count', 'year', 'review_count', 'rating', 'rating_count', 'genre', 'author', 'publisher', 'Review 1', 'Review 2', 'Review 3', 'Review 4', 'Review 5'])

# A checkpoint written by the loop is left untouched. A missing or empty one
# is filled with the starting position, because an empty index.txt cannot be
# parsed as an integer when the scraper is started again.
if not os.path.isfile("index.txt") or os.path.getsize("index.txt") == 0:
    with open("index.txt", "w") as index_file:
        index_file.write(str(last_index))

Does Review CSV File Exists?
True


Output of last_index
26
Ouput of index
26
Ouput of row
links           https://www.goodreads.com/book/show/91698697-s...
page_count                                                    NaN
publisher                                                     NaN
year                                                          NaN
review_count                                                  NaN
rating                                                        NaN
rating_count                                                  NaN
genre                                                         NaN
author                                                        NaN
Name: 26, dtype: object
Waiting for Title Element to Load....
Assigning Title
Ouput of title
Study Guide: An Echo in the Bone by Diana Gabaldon
 P U B L I S H E R 
February 24, 2022 by Independently published
Waiting for Synopsis and Reviews to load....
The variable length of book 27 = 
3
Output of synopsis_length

Waiting for Title Element to Load....
Assigning Title
Ouput of title
Inside College Football
Football In The Big East Conference
 P U B L I S H E R 
September 1, 2007 by Rosen Central
Waiting for Synopsis and Reviews to load....
The variable length of book 29 = 
6
Output of synopsis_length
6
Ouput of page_count
48 pages, Hardcover
Ouput of year
First published September 1, 2007
Ouput of review_count
3 reviews
Ouput of RegEx: review_count
3
Ouput of rating_count
11 ratings
Ouput of rating
4.18
Ouput of author
Adam B. Hofstetter
Book 28:
Inside College Football
Football In The Big East Conference
48 pages, Hardcover


First published September 1, 2007


Total Reviews: 
3


4.18


11 ratings





Adam B. Hofstetter


September 1, 2007 by Rosen Central


Review 1
I learned that it talks about one most popular follege football conferences called the "Big East" conference. The book also talks about some of the NFL's most popular Hall of Famers named Dan Marino,Johnny Unitas ,Jim brown,Ernie 