In [1]:
import pandas as pd
import os
import re
import codecs
import requests
from bs4 import BeautifulSoup
    
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException, TimeoutException, ElementClickInterceptedException

In [2]:
def modify_link(link):
    '''
        Drop the query string from a link and return its final path segment.

        Everything from the first '?' onward is discarded; the text after the
        last '/' of what remains is returned.
    '''
    # Keep only what precedes the first '?' (the whole link when there is none)
    without_query, _, _ = link.partition('?')

    # Text after the last '/'; the whole string when it contains no '/'
    _, _, last_segment = without_query.rpartition('/')
    return last_segment

In [3]:
def text_file_to_dataframe(filename):
    '''
        Load a text file with one link per line into a DataFrame.

        Each line is stripped of surrounding whitespace. Lines that are empty
        after stripping are skipped so they do not end up as empty-string
        links in the result.

        Parameters:
            filename: path of the text file to read.

        Returns:
            A DataFrame with a single 'links' column, or None (after printing
            a message) when the file does not exist.
    '''
    # Guard clause: report the missing file and give the caller None
    if not os.path.exists(filename):
        print(f"{filename} does not exist.")
        return None

    # Iterate the file lazily; keep only lines with content left after stripping
    with open(filename, "r") as file:
        stripped_lines = (line.strip() for line in file)
        content = [line for line in stripped_lines if line]

    return pd.DataFrame(content, columns=['links'])

In [4]:
def prepare_link_df(df):
    '''
        Replace every entry of the 'links' column with what modify_link
        returns for it.

        The column is overwritten on the DataFrame that was passed in, and
        that same DataFrame object is returned.
    '''
    # Compute the shortened values first, then write them back to the column
    shortened_links = df['links'].apply(modify_link)
    df['links'] = shortened_links
    return df

In [5]:
def save_dataframe_to_text_file(df, filename):
    '''
        Write the DataFrame to filename through DataFrame.to_csv, with no
        index column, no header row and a newline as the field separator.
    '''
    # Options collected in one place so the to_csv call stays short
    csv_options = {'index': False, 'header': False, 'sep': '\n'}
    df.to_csv(filename, **csv_options)

In [6]:
def get_reviews(df, num_loops):
    '''
        Visit book pages and print the text of the first review on each.

        On every page the first review's "show more" button is clicked when it
        can be found, then the review body is read and printed. Elements are
        looked up after each navigation, because an element located on one
        page cannot be reused once the browser has loaded another.

        Parameters:
            df: DataFrame with a 'links' column of book page URLs.
            num_loops: maximum number of links to visit, counted from the top
                of the DataFrame (as in the other get_reviews_* functions);
                None visits every link.
    '''
    review_1_show_more_button_tag = '/html/body/div[1]/div/main/div[1]/div[2]/div[3]/div/div[5]/div[2]/div[1]/article/section/section[2]/section/div/div[2]/div/button/span[1]'
    review_1_body_tag = '/html/body/div[1]/div/main/div[1]/div[2]/div[3]/div/div[5]/div[2]/div[1]/article/section/section[2]/section/div/div[1]/span'

    # Slicing never runs past the end, so an oversized num_loops is harmless
    links = df['links'] if num_loops is None else df['links'].iloc[:num_loops]

    # initiate selenium browser
    driver = webdriver.Chrome()
    try:
        for link in links:
            # go to starter link on browser
            driver.get(link)

            # wait up to 10 seconds for the "show more" button and click it
            try:
                show_more_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, review_1_show_more_button_tag)))
                show_more_button.click()
            except (TimeoutException, NoSuchElementException):
                print(f"Could not find show more button: \n{link}")
            except ElementClickInterceptedException:
                print(f"Could not click on show more button: \n{link}")

            # wait up to 10 seconds for the review body and print its text
            try:
                review_body = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, review_1_body_tag)))
                print(review_body.text)
            except (TimeoutException, NoSuchElementException):
                print(f"Could not get review: \n{link}")
    finally:
        # close the browser even when a page raised an unexpected error
        driver.quit()

In [7]:
def get_reviews_4(df, num_loops):
    '''
        Print up to five reviews for each of the first num_loops book links.

        For every review slot the function waits up to 10 seconds for the
        review body and prints its text, then waits up to 10 seconds for the
        "show more" button and clicks it. A failure in either step is
        reported and the next review slot is tried.

        Parameters:
            df: DataFrame with a 'links' column of book page URLs.
            num_loops: number of books to process, counted from the top of
                the DataFrame; capped at the number of rows.
    '''
    # XPaths of the five review bodies (constant, so built once up front)
    review_body_tags = [
        '//*[@id="ReviewsSection"]/div[5]/div[2]/div[1]/article/section/section[2]/section/div/div[1]/span',
        '//*[@id="ReviewsSection"]/div[5]/div[2]/div[3]/article/section/section[2]/section/div/div[1]/span',
        '//*[@id="ReviewsSection"]/div[5]/div[2]/div[4]/article/section/section[2]/section/div/div[1]/span',
        '//*[@id="ReviewsSection"]/div[5]/div[2]/div[5]/article/section/section[2]/section/div/div[1]/span',
        '//*[@id="ReviewsSection"]/div[5]/div[2]/div[6]/article/section/section[2]/section/div/div[1]/span',
    ]
    # No trailing '/': an XPath ending in '/' is syntactically invalid.
    # NOTE(review): this path is fixed on div[1], so the same button is
    # targeted for every review slot - confirm whether that is intended.
    review_show_more_button_tag = '/html/body/div[1]/div/main/div[1]/div[2]/div[3]/div/div[5]/div[2]/div[1]/article/section/section[2]/section/div/div[2]/div/button'

    # never index past the last row of the DataFrame
    book_count = min(num_loops, len(df))

    # initiate selenium browser
    driver = webdriver.Chrome()
    try:
        for i in range(book_count):
            link = df['links'].iloc[i]

            # go to starter link on browser
            driver.get(link)

            for j, review_body_tag in enumerate(review_body_tags):
                # wait 10 seconds for review to appear and extract it
                try:
                    review_body = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, review_body_tag)))
                    print(f"Book {i+1}: ")
                    print(f"Review {j+1}: ")
                    print(review_body.text)
                    print("\n")
                except (TimeoutException, NoSuchElementException):
                    print(f"Could not get review searching for \"more button\": {link}")
                    print("\n")
                    continue

                # wait 10 seconds for show more button to appear and click on it
                try:
                    review_show_more_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, review_show_more_button_tag)))
                    review_show_more_button.click()
                except (TimeoutException, NoSuchElementException):
                    print(f"Could not find show more button for Book {i+1} Review {j+1}: \n{link}")
                    print("\n")
                except ElementClickInterceptedException:
                    print(f"Could not click on show more button for Book {i+1} Review {j+1}: \n{link}")
                    print("\n")
    finally:
        # close the browser even when an unexpected error interrupts the run
        driver.quit()

In [8]:
def get_reviews_5(df, num_loops):
    '''
        Print up to five reviews for each of the first num_loops book links,
        clicking "show more" only when a review body cannot be read.

        For every review slot the function waits up to 10 seconds for the
        review body and prints its text. When that fails it waits up to 10
        seconds for the "show more" button, clicks it, and tries once more to
        read the review body.

        Parameters:
            df: DataFrame with a 'links' column of book page URLs.
            num_loops: number of books to process, counted from the top of
                the DataFrame; capped at the number of rows.
    '''
    # XPaths of the five review bodies (constant, so built once up front)
    review_body_tags = [
        '//*[@id="ReviewsSection"]/div[5]/div[2]/div[1]/article/section/section[2]/section/div/div[1]/span',
        '//*[@id="ReviewsSection"]/div[5]/div[2]/div[3]/article/section/section[2]/section/div/div[1]/span',
        '//*[@id="ReviewsSection"]/div[5]/div[2]/div[4]/article/section/section[2]/section/div/div[1]/span',
        '//*[@id="ReviewsSection"]/div[5]/div[2]/div[5]/article/section/section[2]/section/div/div[1]/span',
        '//*[@id="ReviewsSection"]/div[5]/div[2]/div[6]/article/section/section[2]/section/div/div[1]/span',
    ]
    # No trailing '/': an XPath ending in '/' is syntactically invalid.
    # NOTE(review): this path is fixed on div[1], so the same button is
    # targeted for every review slot - confirm whether that is intended.
    review_show_more_button_tag = '/html/body/div[1]/div/main/div[1]/div[2]/div[3]/div/div[5]/div[2]/div[1]/article/section/section[2]/section/div/div[2]/div/button'

    # never index past the last row of the DataFrame
    book_count = min(num_loops, len(df))

    # initiate selenium browser
    driver = webdriver.Chrome()
    try:
        for i in range(book_count):
            link = df['links'].iloc[i]

            # go to starter link on browser
            driver.get(link)

            # 5 reviews = 5 XPaths in the list
            for j, review_body_tag in enumerate(review_body_tags):
                # wait 10 seconds for review to appear and extract it
                try:
                    review_body = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, review_body_tag)))
                    print(f"Book {i+1}: ")
                    print(f"Review {j+1}: ")
                    print(review_body.text)
                    print("\n")
                except (TimeoutException, NoSuchElementException):
                    # one-based numbers, matching every other message here
                    print(f"Could not get review {j+1} for book {i+1} searching for \"more button\": \n{link}")
                    print("\n")

                    try:
                        # try looking for a 'show more' button
                        review_show_more_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, review_show_more_button_tag)))

                        # try clicking on "show more" button
                        review_show_more_button.click()

                        # try getting review text again
                        review_body = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, review_body_tag)))
                        print(f"Book {i+1}: ")
                        print(f"Review {j+1}: ")
                        print(review_body.text)
                        print("\n")
                    except (TimeoutException, NoSuchElementException):
                        print(f"Could not find show more button for Book {i+1} Review {j+1}: \n{link}")
                        print("\n")
                    except ElementClickInterceptedException:
                        print(f"Could not click on show more button for Book {i+1} Review {j+1}: \n{link}")
                        print("\n")
    finally:
        # close the browser even when an unexpected error interrupts the run
        driver.quit()

In [36]:
def get_reviews_6(df, num_loops):
    '''
        Print up to five reviews for each of the first num_loops book links,
        moving on to the next book as soon as one review cannot be obtained.

        Each review slot gets at most two attempts. An attempt waits up to 10
        seconds for the review body; on failure it waits up to 10 seconds for
        the "show more" button, clicks it and tries to read the review again.
        If the button cannot be found or clicked, the slot is abandoned. When
        a slot ends without a review, the remaining slots of that book are
        skipped.

        Parameters:
            df: DataFrame with a 'links' column of book page URLs.
            num_loops: number of books to process, counted from the top of
                the DataFrame; capped at the number of rows.
    '''
    # XPaths of the five review bodies (constant, so built once up front)
    review_body_tags = [
        '//*[@id="ReviewsSection"]/div[5]/div[2]/div[1]/article/section/section[2]/section/div/div[1]/span',
        '//*[@id="ReviewsSection"]/div[5]/div[2]/div[3]/article/section/section[2]/section/div/div[1]/span',
        '//*[@id="ReviewsSection"]/div[5]/div[2]/div[4]/article/section/section[2]/section/div/div[1]/span',
        '//*[@id="ReviewsSection"]/div[5]/div[2]/div[5]/article/section/section[2]/section/div/div[1]/span',
        '//*[@id="ReviewsSection"]/div[5]/div[2]/div[6]/article/section/section[2]/section/div/div[1]/span',
    ]
    # No trailing '/': an XPath ending in '/' is syntactically invalid.
    # NOTE(review): this path is fixed on div[1], so the same button is
    # targeted for every review slot - confirm whether that is intended.
    review_show_more_button_tag = '/html/body/div[1]/div/main/div[1]/div[2]/div[3]/div/div[5]/div[2]/div[1]/article/section/section[2]/section/div/div[2]/div/button'

    # never index past the last row of the DataFrame
    book_count = min(num_loops, len(df))

    # initiate selenium browser
    driver = webdriver.Chrome()
    try:
        # iterate through the book link dataframe
        for i in range(book_count):
            link = df['links'].iloc[i]

            # go to starter link on browser
            driver.get(link)

            for j, review_body_tag in enumerate(review_body_tags):
                # flag variable to keep track of success
                review_found = False

                # two attempts per review slot
                for _ in range(2):
                    # wait 10 seconds for review to appear and extract it
                    try:
                        review_body = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, review_body_tag)))
                        print(f"Book {i+1}: ")
                        print(f"Review {j+1}: ")
                        print(review_body.text)
                        print("\n")
                        review_found = True
                        break
                    except (TimeoutException, NoSuchElementException):
                        print(f"Could not get review {j+1} for book {i+1} searching for \"more button\": \n{link}")
                        print("\n")

                        try:
                            # try looking for a 'show more' button
                            review_show_more_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, review_show_more_button_tag)))

                            # try clicking on "show more" button
                            review_show_more_button.click()

                            # if the click is successful try loading book review data again
                            try:
                                review_body = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, review_body_tag)))
                                print(f"Book {i+1}: ")
                                print(f"Review {j+1}: ")
                                print(review_body.text)
                                print("\n")
                                review_found = True
                                break
                            except (TimeoutException, NoSuchElementException):
                                # falls through to the next attempt, if any
                                print(f"Could not get review {j+1} for book {i+1} after clicking \"more\" button \n{link}")
                                print("\n")
                        except (TimeoutException, NoSuchElementException):
                            print(f"Could not find show more button for Book {i+1} Review {j+1}: \n{link}")
                            print("\n")
                            break
                        except ElementClickInterceptedException:
                            print(f"Could not click on show more button for Book {i+1} Review {j+1}: \n{link}")
                            print("\n")
                            break

                # stop looking at this book once a review could not be obtained
                if not review_found:
                    print(f"Didn't find enough reviews for book {i+1}. Moving on to book {i+2}")
                    print("==========================================================\n")
                    break
    finally:
        # close the browser even when an unexpected error interrupts the run
        driver.quit()

# Main

In [29]:
# Load the book links from links.txt into a one-column DataFrame;
# text_file_to_dataframe returns None when the file does not exist.
input_filename = "links.txt"
df = text_file_to_dataframe(input_filename)

In [37]:
# Scrape reviews for the first five books. df is None when the links file was
# missing, so skip the run instead of starting a browser and then failing.
if df is not None:
    get_reviews_6(df, 5)

Book 1: 
Review 1: 
Not the greatest until about the last third of the book. Maybe my taste in authors is changing.


Book 1: 
Review 2: 
This is the fifth sequel in this series. A little better than the last couple. The Godmothers are funny but the story line needs to end. I have the sixth book on my bookshelf hope it's better.


Book 1: 
Review 3: 
This book reminded me of The Golden Girls television series. It included some of the more recent references to people and events that have occured since the series.


Book 1: 
Review 4: 
fern michaels never fails to provide a story that you just cannot put down, I cant wait for her next one


Book 1: 
Review 5: 
Another good read!


Could not get review 1 for book 2 searching for "more button": 
https://www.goodreads.com/book/show/28111953-jokes-jokes-and-more-jokes?from_search=true&from_srp=true&qid=ZpnjM8vxLv&rank=1


Could not find show more button for Book 2 Review 1: 
https://www.goodreads.com/book/show/28111953-jokes-jokes-and-more-j

# Manual Testing

In [None]:
# initiate selenium browser: a Chrome session kept in the global `driver`
driver = webdriver.Chrome()

In [None]:
# book page URL used for the manual test
link = 'https://www.goodreads.com/book/show/13771831-breaking-news?from_search=true&from_srp=true&qid=mipV5pB2vA&rank=1'

In [None]:
# go to starter link on browser (loads the URL stored in `link`)
driver.get(link)

# Click on Link

In [None]:
# XPath of the first review's "show more" button.
# It must not end with '/': a trailing slash makes the expression invalid.
review_1_show_more_button_tag = '/html/body/div[1]/div/main/div[1]/div[2]/div[3]/div/div[5]/div[2]/div[1]/article/section/section[2]/section/div/div[2]/div/button'

In [None]:
# find_element(By.XPATH, ...) replaces find_element_by_xpath, which newer Selenium releases removed
review_1_show_more_button = driver.find_element(By.XPATH, review_1_show_more_button_tag)

# If Link Doesn't Exist

In [None]:
# empty dict; no later cell visible here fills or reads it
review_body_tag_dictionary = {}

In [None]:
# XPath used to locate the body of review 1 (entry div[1] under ReviewsSection)
review_1_body_tag = '//*[@id="ReviewsSection"]/div[5]/div[2]/div[1]/article/section/section[2]/section/div/div[1]/span'

In [None]:
# XPath used to locate the body of review 2 (entry div[3] under ReviewsSection)
review_2_body_tag = '//*[@id="ReviewsSection"]/div[5]/div[2]/div[3]/article/section/section[2]/section/div/div[1]/span'

In [None]:
# XPath used to locate the body of review 3 (entry div[4] under ReviewsSection)
review_3_body_tag = '//*[@id="ReviewsSection"]/div[5]/div[2]/div[4]/article/section/section[2]/section/div/div[1]/span'

In [None]:
# XPath used to locate the body of review 4 (entry div[5] under ReviewsSection)
review_4_body_tag = '//*[@id="ReviewsSection"]/div[5]/div[2]/div[5]/article/section/section[2]/section/div/div[1]/span'

In [None]:
# XPath used to locate the body of review 5 (entry div[6] under ReviewsSection)
review_5_body_tag = '//*[@id="ReviewsSection"]/div[5]/div[2]/div[6]/article/section/section[2]/section/div/div[1]/span'

In [None]:
# find_element(By.XPATH, ...) replaces find_element_by_xpath, which newer Selenium releases removed
review_1_body = driver.find_element(By.XPATH, review_1_body_tag)

In [None]:
# find_element(By.XPATH, ...) replaces find_element_by_xpath, which newer Selenium releases removed
review_2_body = driver.find_element(By.XPATH, review_2_body_tag)

In [None]:
# find_element(By.XPATH, ...) replaces find_element_by_xpath, which newer Selenium releases removed
review_3_body = driver.find_element(By.XPATH, review_3_body_tag)

In [None]:
# find_element(By.XPATH, ...) replaces find_element_by_xpath, which newer Selenium releases removed
review_4_body = driver.find_element(By.XPATH, review_4_body_tag)

In [None]:
# find_element(By.XPATH, ...) replaces find_element_by_xpath, which newer Selenium releases removed
review_5_body = driver.find_element(By.XPATH, review_5_body_tag)

In [None]:
# print the text of the element located for review 1
print(review_1_body.text)

In [None]:
# print the text of the element located for review 2
print(review_2_body.text)

In [None]:
# print the text of the element located for review 3
print(review_3_body.text)

In [None]:
# print the text of the element located for review 4
print(review_4_body.text)

In [None]:
# print the text of the element located for review 5
print(review_5_body.text)