In [9]:
import pandas as pd
import os
import time
import datetime
import csv
import re
import codecs
import requests
from bs4 import BeautifulSoup
    
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException, TimeoutException, ElementNotInteractableException, ElementClickInterceptedException

In [10]:
def text_file_to_dataframe(filename):
    # Check if the text file exists
    if os.path.exists(filename):
        
        # Read the text file and create a list of lines
        with open(filename, "r") as file:
            lines = file.readlines()
            content = [line.strip() for line in lines]

        # Create a DataFrame from the list
        df = pd.DataFrame(content, columns=['links'])
        
        return df
    else:
        print(f"{filename} does not exist.")
        return None

In [11]:
# Define a function to extract the publisher element text
def get_publisher_text(edition_details_element_WebDriverWait_amount):
    try:
        # Find the element with the EditionDetails class
        edition_details_element = WebDriverWait(test, edition_details_element_WebDriverWait_amount).until(EC.presence_of_element_located((By.CLASS_NAME, 'EditionDetails')))
        
        # Find the nested element with the data-testid attribute
        publisher_element = edition_details_element.find_elements_by_css_selector('div[data-testid="contentContainer"]')
        
        return publisher_element[1].text

    except NoSuchElementException:
        print("Element: EditionDetails element after click fail")
        return ""
    except TimeoutException:
        print("Timeout: EditionDetails element not found within the specified time")
        return ""
    except ElementClickInterceptedException:
        print("N O   P U B L I S H E R")
        return ""

### Test DataFrame

In [8]:
list_of_test_links = ['https://www.goodreads.com/book/show/20980648-waistcoats-weaponry',
                      'https://www.goodreads.com/book/show/144350.The_Battle_for_Skandia',
                      'https://www.goodreads.com/book/show/30625170-dark-age',
                      'https://www.goodreads.com/book/show/11777020-headhunters',
                      'https://www.goodreads.com/book/show/48577242-even-the-dogs',
                      'https://www.goodreads.com/book/show/24796.Zorro',
                      'https://www.goodreads.com/book/show/65112.The_End',
                      'https://www.goodreads.com/book/show/11288465-the-cracked-mirror',
                      'https://www.goodreads.com/book/show/7823038-grave-witch',
                      'https://www.goodreads.com/book/show/128759.Fatal_Voyage',
                      'https://www.goodreads.com/book/show/2726462-green-chic',
                      'https://www.goodreads.com/book/show/129510.Empire_of_Ivory',
                      'https://www.goodreads.com/book/show/20764879-a-gathering-of-shadows',
                      'https://www.goodreads.com/book/show/9814682-a-song-of-ice-and-fire',
                      'https://www.goodreads.com/book/show/8196732-sister',
                      'https://www.goodreads.com/book/show/10131648-the-greater-journey'
                     ]
df = pd.DataFrame(list_of_test_links, columns=['links'])

### Scraper

In [12]:
# S C R A P I N G   B R O W S E R
test = webdriver.Chrome()
test.get(df['links'].iloc[0])

### Time Wait Settings

In [13]:
synopsis_and_review_wait_amount = 10
publisher_click_wait_amount = 5

title_element_WebDriverWait_amount = 5
synopsis_and_review_list_WebDriverWait_amount = 30
edition_details_element_WebDriverWait_amount = 5

### Get Publisher

In [14]:
time.sleep(synopsis_and_review_wait_amount)
button = test.find_element_by_css_selector('button[aria-label="Book details and editions"]')
print(button.text)
button.click()
time.sleep(publisher_click_wait_amount)
publisher = get_publisher_text(edition_details_element_WebDriverWait_amount)
print(publisher)

### Function Desconstructing

In [21]:
edition_details_element = WebDriverWait(test, edition_details_element_WebDriverWait_amount).until(EC.presence_of_element_located((By.CLASS_NAME, 'EditionDetails')))

In [24]:
print(edition_details_element.text)

This edition
Format
298 pages, Hardcover
Published
November 4, 2014 by Little, Brown Books for Young Readers
ISBN
9780316190275 (ISBN10: 0316190276)
Language
English


In [27]:
publisher_element = edition_details_element.find_elements_by_css_selector('div[data-testid="contentContainer"]')
for i in publisher_element:
    print(i.text)

298 pages, Hardcover
November 4, 2014 by Little, Brown Books for Young Readers
9780316190275 (ISBN10: 0316190276)
English


In [28]:
print(publisher_element[1].text)

November 4, 2014 by Little, Brown Books for Young Readers


In [29]:
publisher = get_publisher_text(edition_details_element_WebDriverWait_amount)
print(publisher)

November 4, 2014 by Little, Brown Books for Young Readers


# All Together

In [31]:
# S C R A P I N G   B R O W S E R
test = webdriver.Chrome()

In [32]:
test.get(df['links'].iloc[1])

# T I M E   W A I T   S E T T I N G S
synopsis_and_review_wait_amount = 10
publisher_click_wait_amount = 5

title_element_WebDriverWait_amount = 5
synopsis_and_review_list_WebDriverWait_amount = 30
edition_details_element_WebDriverWait_amount = 5

# P U B L I S H E R   E L E M E N T   S C R A P I N G
time.sleep(synopsis_and_review_wait_amount)
button = test.find_element_by_css_selector('button[aria-label="Book details and editions"]')
button.click()
time.sleep(publisher_click_wait_amount)
publisher = get_publisher_text(edition_details_element_WebDriverWait_amount)
print(publisher)
test.quit()

March 18, 2008 by Viking Books for Young Readers


# Loop Test

In [34]:
# T I M E   W A I T   S E T T I N G S
synopsis_and_review_wait_amount = 10
publisher_click_wait_amount = 5

title_element_WebDriverWait_amount = 5
synopsis_and_review_list_WebDriverWait_amount = 30
edition_details_element_WebDriverWait_amount = 5

for i in range(df.shape[0]):
    test.get(df['links'].iloc[i])

    # P U B L I S H E R   E L E M E N T   S C R A P I N G
    print(f"Waiting {synopsis_and_review_wait_amount} seconds for Book details and editions")
    time.sleep(synopsis_and_review_wait_amount)
    
    print("Looking for Button")
    button = test.find_element_by_css_selector('button[aria-label="Book details and editions"]')
    
    print("Clicking Button")
    button.click()
    
    print(f"Waiting {publisher_click_wait_amount} seconds for get_publisher_text")
    time.sleep(publisher_click_wait_amount)
    
    print("Looking for get_publisher_text")
    publisher = get_publisher_text(edition_details_element_WebDriverWait_amount)
    
    print("Publisher: \n")
    print(publisher)

MaxRetryError: HTTPConnectionPool(host='127.0.0.1', port=56362): Max retries exceeded with url: /session/dff37f351b619b35076167f01afea7c6/url (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x16c8aac70>: Failed to establish a new connection: [Errno 61] Connection refused'))