In [1]:
import pandas as pd
import os
import time
import csv
import re
import codecs
import requests
from bs4 import BeautifulSoup
    
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException, TimeoutException, ElementClickInterceptedException

In [2]:
def text_file_to_dataframe(filename):
    """Load a newline-separated list of links into a one-column DataFrame.

    Each line of the file is stripped of surrounding whitespace. Blank lines
    are skipped so they never turn into empty URLs further down the pipeline.

    Parameters
    ----------
    filename : str
        Path of the text file to read (one link per line).

    Returns
    -------
    pandas.DataFrame or None
        A DataFrame with a single ``links`` column, or ``None`` (after
        printing a message) when ``filename`` does not exist.
    """
    if not os.path.exists(filename):
        print(f"{filename} does not exist.")
        return None

    # "utf-8-sig" reads plain UTF-8 as well, but also drops a leading BOM
    # that would otherwise be glued to the first link.
    with open(filename, "r", encoding="utf-8-sig") as file:
        stripped_lines = (line.strip() for line in file)
        content = [link for link in stripped_lines if link]

    return pd.DataFrame(content, columns=['links'])

# Loading Links & Creating DataFrame

In [3]:
# Text file holding the book page links, one per line.
input_filename = "links.txt"

# Build the working DataFrame (single 'links' column) from that file.
df = text_file_to_dataframe(filename=input_filename)

# Test Subset DataFrame

In [4]:
# Keep only the first nine links so a test run stays short.
df = df.head(9)

In [5]:
#for i in range(df.shape[0]):
#    print(df['links'].iloc[i])

# Scraper Code

In [6]:
# S C R A P I N G   B R O W S E R
#
# Visits every link in `df` (resuming from the position stored in index.txt),
# scrapes the book metadata plus five reviews when enough 'Formatted' elements
# are present, and appends exactly one row per book to reviews.csv.

REVIEWS_CSV = "reviews.csv"
INDEX_FILE = "index.txt"
CSV_HEADER = ['Book Name', 'page_count', 'year', 'review_count', 'rating', 'rating_count', 'genre', 'author',
              'Review 1', 'Review 2', 'Review 3', 'Review 4', 'Review 5']

WAIT_TIMEOUT_SECONDS = 20     # explicit-wait timeout for title / review elements
REVIEW_SETTLE_SECONDS = 20    # fixed pause before collecting the 'Formatted' elements
REVIEW_OFFSET = 3             # reviews start at this position in the 'Formatted' list
REVIEWS_PER_BOOK = 5          # number of reviews stored per book


def _read_last_index(path):
    """Return the resume position stored in `path`; 0 if missing, empty or invalid."""
    if not os.path.isfile(path):
        return 0
    with open(path, "r") as index_file:
        raw_value = index_file.read().strip()
    try:
        return max(int(raw_value), 0)
    except ValueError:
        # An empty index file is created at the end of a run that saved no
        # position, so this must not crash the next run.
        return 0


def _scrape_text(driver, by, selector, label, default=""):
    """Return the text of the first element matching `selector`, echoing it to stdout.

    Returns `default` (without printing) when no such element is on the page.
    """
    try:
        text = driver.find_element(by, selector).text
    except NoSuchElementException:
        return default
    print(f"Ouput of {label}")
    print(text)
    return text


def _csv_needs_header(path):
    """Return True when `path` is missing or empty, i.e. the header row is still due."""
    return not os.path.isfile(path) or os.path.getsize(path) == 0


driver = webdriver.Chrome()

reviews_file_exists = os.path.isfile(REVIEWS_CSV)
print("Does Review CSV File Exists?")
print(reviews_file_exists)
print("\n")

index_file_exists = os.path.isfile(INDEX_FILE)

# checking last value of index in order to continue from last position
last_index = _read_last_index(INDEX_FILE)

print("Output of last_index")
print(last_index)

try:
    # `position` is the positional offset used for resuming with iloc;
    # `index` is the row label used for printing and for writing back to df.
    for position, (index, row) in enumerate(df.iloc[last_index:].iterrows(), start=last_index):
        print("Ouput of index")
        print(index)

        print("Ouput of row")
        print(row)

        #========================================================================================================
        # L O A D    P A G E    O N    S C R A P I N G    B R O W S E R
        #========================================================================================================
        driver.get(row['links'])
        try:
            print("Waiting for Title Element to Load....")
            title_element = WebDriverWait(driver, WAIT_TIMEOUT_SECONDS).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'BookPageTitleSection__title')))
            print("Assigning Title")
            title = title_element.text
        except TimeoutException:
            print(f"Timed out while waiting for title for book {row['links']}, moving on to next book")
            continue

        print("Ouput of title")
        print(title)

        #========================================================================================================
        # D E F I N I T I O N    S E C T I O N
        #========================================================================================================
        try:
            print("Waiting for Synopsis and Reviews to load....")
            time.sleep(REVIEW_SETTLE_SECONDS)
            synopsis_and_review_list = WebDriverWait(driver, WAIT_TIMEOUT_SECONDS).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, 'Formatted')))
            print(f"The variable length of book {index+1} = ")
            print(len(synopsis_and_review_list))
            synopsis_length = len(synopsis_and_review_list)
            print("Output of synopsis_length")
            print(synopsis_length)
        except TimeoutException:
            print(f"Timed out while waiting for synopsis and review list for book {title}, moving on to next book")
            continue

        # find_element(By..., ...) replaces the find_element_by_* helpers,
        # which are no longer available in current Selenium 4 releases.
        page_count = _scrape_text(driver, By.CSS_SELECTOR, 'p[data-testid="pagesFormat"]', "page_count")
        year = _scrape_text(driver, By.CSS_SELECTOR, 'p[data-testid="publicationInfo"]', "year")

        review_count_text = _scrape_text(
            driver, By.CSS_SELECTOR, 'span[data-testid="reviewsCount"]', "review_count", default=None)
        if review_count_text is None:
            review_count = 0
            print("Ouput of No_Count: review_count")
        else:
            # converting string to integer with RegEx; thousands separators
            # are kept in the match so "1,234 reviews" becomes 1234, not 1
            review_count_match = re.search(r'\d[\d,]*', review_count_text)
            if review_count_match:
                review_count = int(review_count_match.group().replace(",", ""))
                print("Ouput of RegEx: review_count")
            else:
                review_count = 0
                print("Ouput of No_RegEx: review_count")
        print(review_count)

        rating_count = _scrape_text(driver, By.CSS_SELECTOR, 'span[data-testid="ratingsCount"]', "rating_count")
        rating = _scrape_text(driver, By.CLASS_NAME, 'RatingStatistics__rating', "rating")

        try:
            genre_class_element = driver.find_element(By.CLASS_NAME, 'BookPageMetadataSection__genres')
            genres_text_element = genre_class_element.find_elements(
                By.CSS_SELECTOR, '.BookPageMetadataSection__genreButton .Button__labelItem')
        except NoSuchElementException:
            genres_text_element = []
        # find_elements returns an empty list instead of raising, so guard
        # the [0] access explicitly.
        if genres_text_element:
            genre = genres_text_element[0].text
            print("Ouput of genre")
            print(genre)
        else:
            genre = ""

        author = _scrape_text(driver, By.CLASS_NAME, 'ContributorLink__name', "author")

        book_reviews = [title, page_count, year, review_count, rating, rating_count, genre, author]

        # Add the book information to the current DataFrame row. The writes go
        # through df.loc because the `row` yielded by iterrows is a copy.
        book_fields = {
            'page_count': page_count,
            'year': year,
            'review_count': review_count,
            'rating': rating,
            'rating_count': rating_count,
            'genre': genre,
            'author': author,
        }
        for column_name, value in book_fields.items():
            df.loc[index, column_name] = value

        # =====================
        # T E S T   P R I N T
        # =====================
        print(f"Book {index}:")
        print(title)
        print("==============")
        print(page_count)
        print("\n")
        print("==============")
        print(year)
        print("\n")
        print("==============")
        print("Total Reviews: ")
        print(review_count)
        print("\n")
        print("==============")
        print(rating)
        print("\n")
        print("==============")
        print(rating_count)
        print("\n")
        print("==============")
        print(genre)
        print("\n")
        print("==============")
        print(author)
        print("\n")

        # The "Formatted" list holds the synopsis and the reviews; the five
        # reviews are taken starting at REVIEW_OFFSET, and only when the list
        # is long enough to supply all of them.
        if synopsis_length >= REVIEW_OFFSET + REVIEWS_PER_BOOK:
            for i in range(REVIEWS_PER_BOOK):
                review_text = synopsis_and_review_list[i + REVIEW_OFFSET].text
                print(f"Review {i + 1}")
                print("==============")
                print(review_text)
                book_reviews.append(review_text)

        # Write exactly one CSV row per book (metadata plus any reviews).
        # The header is written first if the file is missing or still empty.
        write_header = _csv_needs_header(REVIEWS_CSV)
        with open(REVIEWS_CSV, "a", newline='', encoding='utf-8') as reviews_file:
            csv_writer = csv.writer(reviews_file)
            if write_header:
                csv_writer.writerow(CSV_HEADER)
            csv_writer.writerow(book_reviews)
        reviews_file_exists = True

        # save place of index to file for resuming later
        with open(INDEX_FILE, "w") as index_file:
            index_file.write(str(position + 1))
finally:
    # D O N E    W I T H    S C R A P E R
    # Always release the browser, even when scraping aborts with an error.
    driver.quit()

if not reviews_file_exists:
    open(REVIEWS_CSV, "a", newline='', encoding='utf-8').close()

if not index_file_exists:
    open(INDEX_FILE, "a").close()

Does Review CSV File Exists?
False


Output of last_index
0
Ouput of index
0
Ouput of row
links    https://www.goodreads.com/book/show/13771831-b...
Name: 0, dtype: object
Waiting for Title Element to Load....
Assigning Title
Ouput of title
The Godmothers #5
Breaking News
Waiting for Synopsis and Reviews to load....
The variable length of book 1 = 
33
Output of synopsis_length
33
Ouput of page_count
272 pages, Paperback
Ouput of year
First published January 1, 2012
Ouput of review_count
131 reviews
Ouput of RegEx: review_count
131
Ouput of rating_count
1,649 ratings
Ouput of rating
4.17
Ouput of genre
Romance
Ouput of author
Fern Michaels
Book 0:
The Godmothers #5
Breaking News
272 pages, Paperback


First published January 1, 2012


Total Reviews: 
131


4.17


1,649 ratings


Romance


Fern Michaels


Review 1
Not the greatest until about the last third of the book. Maybe my taste in authors is changing.
Review 2
This is the fifth sequel in this series. A little better than the last 

Waiting for Title Element to Load....
Assigning Title
Ouput of title
The Black Wall of Silence: A Novel
Waiting for Synopsis and Reviews to load....
The variable length of book 6 = 
6
Output of synopsis_length
6
Ouput of page_count
231 pages, Kindle Edition
Ouput of year
First published March 22, 2015
Ouput of review_count
3 reviews
Ouput of RegEx: review_count
3
Ouput of rating_count
23 ratings
Ouput of rating
4.22
Ouput of author
Paul F Morrissey
Book 5:
The Black Wall of Silence: A Novel
231 pages, Kindle Edition


First published March 22, 2015


Total Reviews: 
3


4.22


23 ratings





Paul F Morrissey


Ouput of index
6
Ouput of row
links    https://www.goodreads.com/book/show/18528428-a...
Name: 6, dtype: object
Waiting for Title Element to Load....
Assigning Title
Ouput of title
The Grangers #2
A Man's Promise
Waiting for Synopsis and Reviews to load....
The variable length of book 7 = 
33
Output of synopsis_length
33
Ouput of page_count
400 pages, Mass Market Paperback
Ouput

Waiting for Title Element to Load....
Assigning Title
Ouput of title
The Oxenburg Princes #1
The Prince Who Loved Me
Waiting for Synopsis and Reviews to load....
The variable length of book 8 = 
33
Output of synopsis_length
33
Ouput of page_count
355 pages, Mass Market Paperback
Ouput of year
First published September 23, 2014
Ouput of review_count
304 reviews
Ouput of RegEx: review_count
304
Ouput of rating_count
1,991 ratings
Ouput of rating
3.80
Ouput of genre
Historical Romance
Ouput of author
Karen Hawkins
Book 7:
The Oxenburg Princes #1
The Prince Who Loved Me
355 pages, Mass Market Paperback


First published September 23, 2014


Total Reviews: 
304


3.80


1,991 ratings


Historical Romance


Karen Hawkins


Review 1
This is a (very loose) retelling of Cinderella in a historical romance format.

Bronwyn is a minor Scottish lady. She, her eccentric inventor father, her stepmother, and her two younger stepsisters are living in genteel poverty - able to keep on a servant and a co

Waiting for Title Element to Load....
Assigning Title
Ouput of title
Secret [With Bonus Material]
Waiting for Synopsis and Reviews to load....
The variable length of book 9 = 
33
Output of synopsis_length
33
Ouput of page_count
327 pages, ebook
Ouput of year
First published June 6, 2015
Ouput of review_count
539 reviews
Ouput of RegEx: review_count
539
Ouput of rating_count
3,787 ratings
Ouput of rating
4.20
Ouput of genre
M M Romance
Ouput of author
Kindle Alexander
Book 8:
Secret [With Bonus Material]
327 pages, ebook


First published June 6, 2015


Total Reviews: 
539


4.20


3,787 ratings


M M Romance


Kindle Alexander


Review 1
Re-Read:

Sometimes I find I just need some Kindle Alexander in my life. Her books are simply guilty pleasure reads for me. I just eat them up.



Enjoyed it almost as much the second time around!

First read:



Oh man, I think this might be my favorite Kindle Alexander yet!!!!

That blurb, no lie….



….had me shaking in my boots. It screamed angst a