In [17]:
import pandas as pd
import os
import re
import codecs
import requests
from bs4 import BeautifulSoup
    
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys

In [3]:
def get_amazon_book_list(filename):
    """Read the headerless Amazon book-listing CSV into a DataFrame.

    The file is opened as UTF-8 text with undecodable bytes dropped
    (``errors='ignore'``) before pandas parses it.

    Parameters
    ----------
    filename : str
        Path to the comma-separated listing file; it has no header row.

    Returns
    -------
    pandas.DataFrame
        The parsed rows, labelled with the seven column names below.
    """
    import codecs

    # the file carries no header row, so the labels are supplied here
    listing_columns = [
        'Amazon ID (ASIN)',
        'Filename',
        'Image URL',
        'Title',
        'Author',
        'Category ID',
        'Category',
    ]

    # decode leniently first, then hand the text stream to pandas
    with codecs.open(filename, mode='r', encoding='utf-8', errors='ignore') as handle:
        return pd.read_csv(handle, delimiter=",", header=None, names=listing_columns)

In [6]:
def clean_column(df, column):
    """Return a new DataFrame with a normalised ``cleaned_<column>`` column.

    The cleaned text is the source column lower-cased, with parenthesised
    text removed, truncated at the first colon or hyphen, and with runs of
    whitespace collapsed to one space and trimmed.

    The input frame is left untouched: no scratch column is written to it,
    so a caller's frame never picks up a stray ``temp`` column and an
    existing column called ``temp`` is not destroyed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Name of the string column to clean.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``cleaned_<column>`` added (or overwritten).
    """
    cleaned = (
        df[column]
        .str.lower()
        # drop "(...)" groups, e.g. edition or format notes
        .replace(r'\([^()]*\)', '', regex=True)
        # keep only what precedes the first ':' or '-'
        .str.split(r'[:\-]').str[0]
        # collapse whitespace runs and trim the ends
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    )

    # assign() works on a copy, so the caller's frame is never mutated
    return df.assign(**{f'cleaned_{column}': cleaned})

In [13]:
def clean_amazon_book_list(df):
    """Reduce the Amazon listing to selected categories and add cleaned text.

    Keeps only the Author, Title and Category columns, retains rows whose
    Category is one of the six listed below, drops rows containing nulls,
    then runs ``clean_column`` on Title and on Author.

    Parameters
    ----------
    df : pandas.DataFrame
        Listing frame with at least 'Author', 'Title' and 'Category'.

    Returns
    -------
    pandas.DataFrame
        The filtered frame as returned by the two ``clean_column`` calls.
    """
    kept_categories = [
        'Romance',
        'Mystery, Thrillers & Suspense',
        'Teen & Young Adult',
        'Science Fiction & Fantasy',
        'Literature & Fiction',
        'Humor & Entertainment',
    ]

    # narrow to the three columns used downstream
    books = df[['Author', 'Title', 'Category']]

    # keep the wanted categories, then discard incomplete rows
    in_kept_category = books['Category'].isin(kept_categories)
    books = books[in_kept_category].dropna()

    # normalise the text columns, Title first and Author second
    for text_column in ('Title', 'Author'):
        books = clean_column(books, text_column)

    return books

# Acquire

In [14]:
# Path to the raw Amazon listing CSV, relative to the notebook's working directory
filename = 'book30-listing-train.csv'
# Parse it into a DataFrame with the column names assigned by get_amazon_book_list
df = get_amazon_book_list(filename)

# Clean

In [15]:
# Rebinds `df`: from here on it holds only the filtered categories plus the
# cleaned_Title / cleaned_Author columns; the raw listing is no longer in `df`.
df = clean_amazon_book_list(df)

# Scraper

In [7]:
# Start a Selenium-controlled Chrome session; every `driver.*` call below uses it
# (presumably needs a matching chromedriver available — TODO confirm setup)
driver = webdriver.Chrome()

In [4]:
# Goodreads search URL (query: author + title) that the browser is pointed at first
starter_link = 'https://www.goodreads.com/search?q=norman+doidge+the+brain+that+changes+itself&ref=nav_sb_noss_l_43'

In [8]:
# Load the search-results page held in `starter_link` in the Selenium browser
driver.get(starter_link)

In [30]:
def get_reviews(df, num_loops):
    """Walk a resumable slice of ``df`` and print each row's cleaned author/title.

    Two optional files in the working directory drive the resume logic:

    * ``links.txt`` - the last non-blank line is taken as the starting link.
      The value is read but not yet consumed by the loop below.
    * ``row_index.txt`` - holds the last processed positional row index; the
      walk restarts at the row after it.

    Missing or empty files fall back to the defaults. This matters because
    this function itself creates ``links.txt`` when it opens the file for
    appending, so an empty file is the normal state on a second run.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'cleaned_Title' and 'cleaned_Author' columns. An empty
        'link' column is added to it in place.
    num_loops : int
        Maximum number of rows to visit in this call.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the 'link' column.

    Notes
    -----
    The loop prints "writing index to file" but does not write
    ``row_index.txt``; no scraping happens here yet, so recording progress
    would mark rows as done that were never processed.
    """
    # if starting from scratch, this link is used as the first page
    starter_link = "https://www.goodreads.com/book/show/3450744-nudge"

    # if a previous run recorded links, continue from the last one;
    # blank lines are skipped so an empty links.txt no longer raises IndexError
    if os.path.exists("links.txt"):
        with open("links.txt", "r") as file:
            recorded_links = [line.strip() for line in file if line.strip()]
        if recorded_links:
            starter_link = recorded_links[-1]

    # create a blank column for keeping links
    df['link'] = ""

    # opening links.txt for appending (creates the file if it is missing)
    with open("links.txt", "a") as file:
        # continue after the last recorded row position, if one was saved
        start_index = 0
        if os.path.exists("row_index.txt"):
            with open("row_index.txt", "r") as index_file:
                saved_index = index_file.read().strip()
            # an empty index file means "nothing processed yet"
            if saved_index:
                start_index = int(saved_index) + 1

        # positional upper bound for this call
        end_index = start_index + num_loops

        # iterate through the selected positional slice of the DataFrame
        for index, row in df.iloc[start_index:end_index].iterrows():
            try:
                title = row['cleaned_Title']
                author = row['cleaned_Author']
                print(author)
                print(title)
            except KeyError:
                # only a missing cleaned column is expected here; anything
                # else (including KeyboardInterrupt) is allowed to propagate
                print("No Results. Moving on to next.")
                continue
            finally:
                print("\n writing index to file")
    return df

In [31]:
# Visit up to 506 rows of the cleaned listing; get_reviews returns the same
# DataFrame object it was given, so `new_df` and `df` refer to one frame.
new_df = get_reviews(df,506)

breaking news
fern michaels

 writing index to file
jokes, jokes and more jokes
joke star funny bones mr.

 writing index to file
byron's poetry and prose
george gordon byron

 writing index to file
tropical world
millie marotta

 writing index to file
legally stoned
micheal kratom

 writing index to file
secret lives of the first ladies
cormac o'brien

 writing index to file
the black wall of silence
paul morrissey

 writing index to file
a man's promise
brenda jackson

 writing index to file
the prince who loved me
karen hawkins

 writing index to file
secret
kindle alexander

 writing index to file
the poetic edda
ursula dronke

 writing index to file
the wolf man vs. dracula
philip j. riley

 writing index to file
the sound of music family scrapbook
fred bronson

 writing index to file
soulpancake
rainn wilson

 writing index to file
my guide to understanding islam
yusuf karagol

 writing index to file
ninjutsu
eric chaline

 writing index to file
lost for words
edward st. aubyn

 

# Links

In [58]:
# Collect every element with class 'bookTitle' on the current page.
# Uses the By-locator form: the find_elements_by_* helpers were removed in Selenium 4.3.
search_result_links = driver.find_elements(By.CLASS_NAME, 'bookTitle')

In [38]:
# Print the href of up to the first five search results. Bounding the range by
# the number of results avoids an IndexError when fewer than five were found.
for i in range(min(5, len(search_result_links))):
    # href <tag> link for search result
    print(search_result_links[i].get_attribute('href'))
    print()

https://www.goodreads.com/book/show/44564350-summary-of-norman-doidge-s-the-brain-that-changes-itself-by-swift-reads?from_search=true&from_srp=true&qid=moUxaZXGYV&rank=1

https://www.goodreads.com/book/show/60568411-summary-of-norman-doidge-s-book?from_search=true&from_srp=true&qid=moUxaZXGYV&rank=2

https://www.goodreads.com/book/show/570172.The_Brain_that_Changes_Itself?from_search=true&from_srp=true&qid=moUxaZXGYV&rank=3

https://www.goodreads.com/book/show/67356251-summary-of-the-brain-that-changes-itself?from_search=true&from_srp=true&qid=moUxaZXGYV&rank=4

https://www.goodreads.com/book/show/56738924-summary-and-analysis-of-the-brain-that-changes-itself?from_search=true&from_srp=true&qid=moUxaZXGYV&rank=5



# Author

In [42]:
# Absolute XPath to the author <span> in the first table row (tr[1]);
# it mirrors the page layout exactly, so any markup change will break it
author_1_tag = '/html/body/div[2]/div[3]/div[1]/div[2]/div[2]/table/tbody/tr[1]/td[2]/span[2]/div/a/span'

In [51]:
# Same path as author_1_tag but for the second table row (tr[2])
author_2_tag = '/html/body/div[2]/div[3]/div[1]/div[2]/div[2]/table/tbody/tr[2]/td[2]/span[2]/div/a/span'

In [43]:
# Locate the first row's author element.
# By-locator form: find_element_by_xpath was removed in Selenium 4.3.
author_1 = driver.find_element(By.XPATH, author_1_tag)

In [55]:
# Locate the second row's author element.
# By-locator form: find_element_by_xpath was removed in Selenium 4.3.
author_2 = driver.find_element(By.XPATH, author_2_tag)

In [56]:
# Show the visible text of the second row's author element
print(author_2.text)

Good Summaries


In [None]:
# XPath template for the author <span> of table row `i`. The row predicate is
# now closed (`tr[{i}]`); the previous text `tr[{i}/td[2]` was not a valid XPath.
# XPath positions are 1-based, so `i` must be 1 for the first row.
loop_author_tag = f'/html/body/div[2]/div[3]/div[1]/div[2]/div[2]/table/tbody/tr[{i}]/td[2]/span[2]/div/a/span'

# Reviews

In [62]:
# href of the search result at index 2 (the third element collected in search_result_links)
summary_1_link = search_result_links[2].get_attribute('href')

In [64]:
# Navigate the browser to the book page stored in summary_1_link
driver.get(summary_1_link)

In [66]:
# Absolute XPath to the <span> inside the first review's button (the element
# clicked below); layout-dependent, so it breaks if the page structure changes
review_1_show_more_button_tag = '/html/body/div[1]/div/main/div[1]/div[2]/div[3]/div/div[5]/div[2]/div[1]/article/section/section[2]/section/div/div[2]/div/button/span[1]'

In [67]:
# Locate the first review's button element.
# By-locator form: find_element_by_xpath was removed in Selenium 4.3.
review_1_show_more_button = driver.find_element(By.XPATH, review_1_show_more_button_tag)

In [68]:
# Click the button located above (presumably expands the truncated review — confirm on the page)
review_1_show_more_button.click()

In [72]:
# Absolute XPath to the <span> holding the first review's text; layout-dependent
review_1_body_tag = '/html/body/div[1]/div/main/div[1]/div[2]/div[3]/div/div[5]/div[2]/div[1]/article/section/section[2]/section/div/div[1]/span'

In [73]:
# Locate the first review's text element.
# By-locator form: find_element_by_xpath was removed in Selenium 4.3.
review_1_body = driver.find_element(By.XPATH, review_1_body_tag)

In [74]:
# Show the visible text of the first review element
print(review_1_body.text)

When I saw this book initially I thought that I would have nothing but unequivocally good things to say about it. I am very fond of ‘brain’ books and prefer to believe that the mind is ‘plastic’ – that it can change itself or re-wire itself. I haven’t got much to pin this hope on. But hope is a good thing. However, in the end my response to this book has been much less black-and-white than I thought it would be.

I’ve also just finished Fooled By Randomness. This has made me hypersensitive to any possibility that I might be getting fooled by any statistical aberrations. And, potentially, I saw statistical aberrations everywhere in this book. So much so that I was going to stop reading it and move onto something else a few times. But then he quoted an Indian doctor about the importance of individual case studies and I could see what he was doing.

My concern with statistically valid results started when he was talking about internet pornography changing the structure of people’s brains 

# Year & Publisher Element

In [79]:
# Goodreads book page used to work out the publication-info selectors
publisher_link_1 = 'https://www.goodreads.com/book/show/3860977-how-we-decide'

In [80]:
# Navigate the browser to the book page stored in publisher_link_1
driver.get(publisher_link_1)

In [81]:
# XPath (rooted at the element with id "__next") to the <span> inside the
# details/editions button clicked below; layout-dependent
book_details_editions_show_more_button_tag = '//*[@id="__next"]/div/main/div[1]/div[2]/div[1]/div[2]/div[6]/div/div/button/span[1]'

In [82]:
# Locate the details/editions button element.
# By-locator form: find_element_by_xpath was removed in Selenium 4.3.
book_details_editions_show_more_button = driver.find_element(By.XPATH, book_details_editions_show_more_button_tag)

In [83]:
# Click the button located above so the metadata element queried next is present
# (presumably it is hidden until expanded — confirm on the page)
book_details_editions_show_more_button.click()

In [84]:
# XPath (rooted at the element with id "__next") to the element whose text
# combines publication date and publisher; layout-dependent
year_published_and_publisher_tag = '//*[@id="__next"]/div/main/div[1]/div[2]/div[1]/div[2]/div[6]/div/span[2]/div[1]/span/div/dl/div[2]/dd/div/div[1]'

In [85]:
# Locate the publication-info element.
# By-locator form: find_element_by_xpath was removed in Selenium 4.3.
year_published_and_publisher = driver.find_element(By.XPATH, year_published_and_publisher_tag)

In [87]:
# Show the element's visible text (date and publisher in one string)
print(year_published_and_publisher.text)

February 9, 2009 by Houghton Mifflin Company


In [113]:
# End-to-end lookup of the publication-info text for one book page.
# Element lookups use driver.find_element(By.XPATH, ...): the
# find_element_by_xpath helper was removed in Selenium 4.3.

# saving link to variable to use with web driver
publisher_link_1 = 'https://www.goodreads.com/book/show/3860977-how-we-decide'

# go to publisher test link on browser
driver.get(publisher_link_1)

# saving xpath tag for button element to a variable
book_details_editions_show_more_button_tag = '//*[@id="__next"]/div/main/div[1]/div[2]/div[1]/div[2]/div[6]/div/div/button/span[1]'

# extracting button element using xpath variable
book_details_editions_show_more_button = driver.find_element(By.XPATH, book_details_editions_show_more_button_tag)

# clicking on element to expose hidden book metadata
book_details_editions_show_more_button.click()

# saving xpath tag for exposed book metadata element to a variable
year_published_and_publisher_tag = '//*[@id="__next"]/div/main/div[1]/div[2]/div[1]/div[2]/div[6]/div/span[2]/div[1]/span/div/dl/div[2]/dd/div/div[1]'

# extracting book metadata element
year_published_and_publisher = driver.find_element(By.XPATH, year_published_and_publisher_tag)

# printing text from book metadata element
print(year_published_and_publisher.text)

WebDriverException: Message: disconnected: not connected to DevTools
  (failed to check if window was closed: disconnected: not connected to DevTools)
  (Session info: chrome=111.0.5563.64)


# Publisher (REGEX)

In [90]:
import re

In [93]:
# Copy the element's text into a plain string so it can be parsed without the browser
publisher_info = year_published_and_publisher.text

In [98]:
# Show the raw text that the regex below is applied to
print(publisher_info)

February 9, 2009 by Houghton Mifflin Company


In [109]:
# Capture everything after the first "by" + whitespace through the end of the
# text as group 1 (the publisher); `test` is None when the text has no match
test = re.search(r'by\s+(.*)$', publisher_info)

In [110]:
# Group 1 of the match is the publisher name; raises TypeError if `test` is None
print(test[1])

Houghton Mifflin Company


# Year (REGEX)

In [114]:
# Start a fresh Selenium Chrome session and rebind `driver` to it
# (an earlier cell in this notebook failed with a disconnected session)
driver = webdriver.Chrome()

In [115]:
# Book page to load in the new browser session
publisher_link_1 = 'https://www.goodreads.com/book/show/3860977-how-we-decide'

In [117]:
# Navigate the browser to the book page stored in publisher_link_1
driver.get(publisher_link_1)

In [118]:
# Element lookups use driver.find_element(By.XPATH, ...): the
# find_element_by_xpath helper was removed in Selenium 4.3.

# saving xpath tag for button element to a variable
book_details_editions_show_more_button_tag = '//*[@id="__next"]/div/main/div[1]/div[2]/div[1]/div[2]/div[6]/div/div/button/span[1]'

# extracting button element using xpath variable
book_details_editions_show_more_button = driver.find_element(By.XPATH, book_details_editions_show_more_button_tag)

# clicking on element to expose hidden book metadata
book_details_editions_show_more_button.click()

# saving xpath tag for exposed book metadata element to a variable
year_published_and_publisher_tag = '//*[@id="__next"]/div/main/div[1]/div[2]/div[1]/div[2]/div[6]/div/span[2]/div[1]/span/div/dl/div[2]/dd/div/div[1]'

# extracting book metadata element
year_published_and_publisher = driver.find_element(By.XPATH, year_published_and_publisher_tag)

# printing text from book metadata element
print(year_published_and_publisher.text)

February 9, 2009 by Houghton Mifflin Company


In [119]:
# Copy the element's text into a plain string for the year extraction below
year_and_publisher_info = year_published_and_publisher.text

In [120]:
# Bare expression: the notebook echoes the string's repr as the cell output
year_and_publisher_info

'February 9, 2009 by Houghton Mifflin Company'

In [126]:
# Take the first run of four digits in the text as the year (kept as a string);
# raises IndexError when the text contains no four-digit run
year = re.findall(r'\d{4}', year_and_publisher_info)[0]

In [127]:
# Show the extracted year string
print(year)

2009


# RUNNING IT ALL

In [None]:
# Number of rows to visit in this run
num_loops = 506
# `get_bad_books_links` is not defined anywhere in this notebook, so the call
# raised NameError. `get_reviews` is the function defined above that takes
# (df, num_loops) and returns the frame with its 'link' column.
df_with_links = get_reviews(df, num_loops)