In [1]:
import pandas as pd
import os
import re
import codecs
import requests
from bs4 import BeautifulSoup

In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys

# Acquire

In [3]:
filename = 'book30-listing-train.csv'

In [4]:
header_names = ['Amazon ID (ASIN)', 'Filename', 'Image URL', 'Title', 'Author', 'Category ID',
                'Category']
with codecs.open(filename, mode='r', encoding='utf-8', errors='ignore') as f:
    df = pd.read_csv(f, delimiter=",", header=None, names=header_names)

# Subset Dataframe

In [5]:
df = df[['Author','Title','Category']]

In [6]:
keeper_columns = ['Romance','Mystery, Thrillers & Suspense',
                  'Teen & Young Adult','Science Fiction & Fantasy',
                  'Literature & Fiction','Humor & Entertainment',
                  ]

In [7]:
df = df[df['Category'].isin(keeper_columns)]

# Nulls

In [8]:
df = df.dropna()

In [9]:
df = df.reset_index(drop=True)

In [10]:
def clean_column(df, column):
    # Convert column to lowercase and remove text inside parentheses
    df['temp'] = df[column].str.lower().replace(r'\([^()]*\)', '', regex=True)

    # Remove text after colon or hyphen
    df['temp'] = df['temp'].str.split(r'[:\-]').str[0]

    # Remove extra whitespace
    df['temp'] = df['temp'].str.replace(r'\s+', ' ', regex=True).str.strip()

    # Modify DataFrame with new column
    new_column = f'cleaned_{column}'
    df[new_column] = df['temp']

    # Remove temporary column
    df = df.drop(columns=['temp'])

    return df

# Clean

In [11]:
df = clean_column(df,'Title')

In [12]:
df = clean_column(df,'Author')

In [13]:
df.sample(5)

Unnamed: 0,Author,Title,Category,cleaned_Title,cleaned_Author
3278,Joshua Jay,Magic: The Complete Course,Humor & Entertainment,magic,joshua jay
4175,C. S. Lewis,The Complete C. S. Lewis Signature Classics,Literature & Fiction,the complete c. s. lewis signature classics,c. s. lewis
3438,Will Tipton,Expert Heads Up No Limit Hold'em Play: Strateg...,Humor & Entertainment,expert heads up no limit hold'em play,will tipton
4721,Roman Espejo,Eco-Architecture (Opposing Viewpoints),Teen & Young Adult,eco,roman espejo
1462,Tom Shippey,The Road to Middle-Earth: How J.R.R. Tolkien C...,Science Fiction & Fantasy,the road to middle,tom shippey


In [14]:
title = df['cleaned_Title']
author = df['cleaned_Author']

In [15]:
df['Title'].iloc[0]

'Breaking News (Godmothers, Book 5) (The Godmothers)'

In [16]:
df['cleaned_Title'].iloc[0]

'breaking news'

# Does The Concept Work?

# - Selenium -

### Open Browser

In [17]:
driver = webdriver.Chrome()
driver.get("https://www.goodreads.com/book/show/3450744-nudge")

#### Clicked Search

In [18]:
search_bar = driver.find_element_by_xpath('//*[@id="Header"]/div[2]/div[2]/section/form/input[1]')
search_bar.send_keys(title.iloc[0] + " " + author.iloc[0])
search_bar.submit()

In [19]:
links = driver.find_element_by_class_name('bookTitle')

### Extracted Link from 1st Search Result using Selenium

In [20]:
the_link = links.get_attribute('href')

# It Does!

In [21]:
print(the_link)

https://www.goodreads.com/book/show/13771831-breaking-news?from_search=true&from_srp=true&qid=hgai78Yaum&rank=1


# Deliverable

- 2000 links

# Loop Function

<hr style="border:2px solid gray">

In [22]:
df.head()

Unnamed: 0,Author,Title,Category,cleaned_Title,cleaned_Author
0,Fern Michaels,"Breaking News (Godmothers, Book 5) (The Godmot...",Literature & Fiction,breaking news,fern michaels
1,Joke Star Funny Bones Mr.,"Jokes, Jokes and More Jokes: Hilarious Adult H...",Humor & Entertainment,"jokes, jokes and more jokes",joke star funny bones mr.
2,George Gordon Byron,Byron's Poetry and Prose (Norton Critical Edit...,Literature & Fiction,byron's poetry and prose,george gordon byron
3,Millie Marotta,Tropical World: A Coloring Book Adventure (A M...,Humor & Entertainment,tropical world,millie marotta
4,Micheal Kratom,Legally Stoned: The Most Effective Substances ...,Humor & Entertainment,legally stoned,micheal kratom


### Keeper Columns

In [23]:
working_list = df[['cleaned_Author','cleaned_Title','Category']]

### Rename Columns

In [24]:
working_list.columns = ['Author','Title','Category']

# The Working List

In [25]:
working_list.head()

Unnamed: 0,Author,Title,Category
0,fern michaels,breaking news,Literature & Fiction
1,joke star funny bones mr.,"jokes, jokes and more jokes",Humor & Entertainment
2,george gordon byron,byron's poetry and prose,Literature & Fiction
3,millie marotta,tropical world,Humor & Entertainment
4,micheal kratom,legally stoned,Humor & Entertainment


# Testing The Scrape Loop Function

In [31]:
def get_bad_books_links(df):
    # Check if links.txt exists, read the last link as the starter_link
    if os.path.exists("links.txt"):
        with open("links.txt", "r") as file:
            lines = file.readlines()
            starter_link = lines[-1].strip()
    else:
        starter_link = "https://www.goodreads.com/book/show/3450744-nudge"

    # Add new column called 'link' to df
    df['link'] = ""

    driver = webdriver.Chrome()

    # Open a text file for writing
    with open("links.txt", "a") as file:

        # If row_index.txt exists, read the last index and start from the next row
        if os.path.exists("row_index.txt"):
            with open("row_index.txt", "r") as index_file:
                last_index = int(index_file.read().strip())
                start_index = last_index + 1
        else:
            start_index = 0

        for index, row in df.iloc[start_index:].iterrows():
            # loading initial webpage
            driver.get(starter_link)

            # current row content to use in query
            title = row['Title']
            author = row['Author']

            try:
                # add wait for page to finish loading
                WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.XPATH, '//*[@id="Header"]/div[2]/div[2]/section/form/input[1]')))

                # search GoodReads for "title author"
                search_bar = driver.find_element_by_xpath('//*[@id="Header"]/div[2]/div[2]/section/form/input[1]')
                search_bar.send_keys(title + " " + author)
                search_bar.submit()

                # add wait for page to finish loading
                WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CLASS_NAME, 'bookTitle')))

                # extract 1st search result link from page
                links = driver.find_element_by_class_name('bookTitle')
                row_value = links.get_attribute('href')

                # append value to new column called 'link'
                df.at[index, 'link'] = row_value

                # Update starter_link
                starter_link = row_value

                # Write the row_value to the text file
                file.write(f"{row_value}\n")

            except:
                # If no search results or timeout, continue to the next row
                continue

            finally:
                # Save current row index to row_index.txt
                with open("row_index.txt", "w") as index_file:
                    index_file.write(str(index))

    driver.quit()
    return df

In [None]:
the_links = get_bad_books_links(working_list)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['link'] = ""
