In [1]:
import pandas as pd
import os
import re
import codecs
import requests
from bs4 import BeautifulSoup
from selenium import webdriver

In [2]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Acquire

In [3]:
filename = 'book30-listing-train.csv'

In [4]:
header_names = ['Amazon ID (ASIN)', 'Filename', 'Image URL', 'Title', 'Author', 'Category ID',
                'Category']
with codecs.open(filename, mode='r', encoding='utf-8', errors='ignore') as f:
    df = pd.read_csv(f, delimiter=",", header=None, names=header_names)

# Subset Dataframe

In [5]:
df = df[['Author','Title','Category']]

In [6]:
keeper_columns = ['Romance','Mystery, Thrillers & Suspense',
                  'Teen & Young Adult','Science Fiction & Fantasy',
                  'Literature & Fiction','Humor & Entertainment',
                  ]

In [7]:
df = df[df['Category'].isin(keeper_columns)]

# Nulls

In [8]:
df = df.dropna()

In [9]:
df = df.reset_index(drop=True)

In [10]:
def clean_column(df, column):
    # Convert column to lowercase and remove text inside parentheses
    df['temp'] = df[column].str.lower().replace(r'\([^()]*\)', '', regex=True)

    # Remove text after colon or hyphen
    df['temp'] = df['temp'].str.split(r'[:\-]').str[0]

    # Remove extra whitespace
    df['temp'] = df['temp'].str.replace(r'\s+', ' ', regex=True).str.strip()

    # Modify DataFrame with new column
    new_column = f'cleaned_{column}'
    df[new_column] = df['temp']

    # Remove temporary column
    df = df.drop(columns=['temp'])

    return df

# Clean

In [11]:
df = clean_column(df,'Title')

In [12]:
df = clean_column(df,'Author')

In [13]:
df.sample(5)

Unnamed: 0,Author,Title,Category,cleaned_Title,cleaned_Author
2704,Donna Grant,Darkest Flame (Dark Kings),Literature & Fiction,darkest flame,donna grant
2991,Mandy Levy,Calorie Accounting: The Foolproof Diet-by-Numb...,Humor & Entertainment,calorie accounting,mandy levy
7897,Gregory Benford,Timescape,Science Fiction & Fantasy,timescape,gregory benford
2803,Mark Lawrence,King of Thorns (The Broken Empire),Science Fiction & Fantasy,king of thorns,mark lawrence
3393,Anonymous,The Pleasures of Cruelty; Being a sequel to th...,Romance,the pleasures of cruelty; being a sequel to th...,anonymous


In [14]:
title = df['cleaned_Title']
author = df['cleaned_Author']

In [None]:
df['Title'].iloc[0]

In [None]:
df['cleaned_Title'].iloc[0]

# Selenium

### Open Browser

In [15]:
driver = webdriver.Chrome()
driver.get("https://www.goodreads.com/book/show/3450744-nudge")

#### Clicked Search

In [17]:
search_bar = driver.find_element_by_xpath('//*[@id="Header"]/div[2]/div[2]/section/form/input[1]')
search_bar.send_keys(title.iloc[0] + " " + author.iloc[0])
search_bar.submit()

In [18]:
links = driver.find_element_by_class_name('bookTitle')

### Extracted Link from 1st Search Result using Selenium

In [19]:
links.get_attribute('href')

'https://www.goodreads.com/book/show/13771831-breaking-news?from_search=true&from_srp=true&qid=3HKP8RaS3f&rank=1'

# Deliverable

- 2000 links

# Loop Function

In [23]:
df.head()

Unnamed: 0,Author,Title,Category,cleaned_Title,cleaned_Author
0,Fern Michaels,"Breaking News (Godmothers, Book 5) (The Godmot...",Literature & Fiction,breaking news,fern michaels
1,Joke Star Funny Bones Mr.,"Jokes, Jokes and More Jokes: Hilarious Adult H...",Humor & Entertainment,"jokes, jokes and more jokes",joke star funny bones mr.
2,George Gordon Byron,Byron's Poetry and Prose (Norton Critical Edit...,Literature & Fiction,byron's poetry and prose,george gordon byron
3,Millie Marotta,Tropical World: A Coloring Book Adventure (A M...,Humor & Entertainment,tropical world,millie marotta
4,Micheal Kratom,Legally Stoned: The Most Effective Substances ...,Humor & Entertainment,legally stoned,micheal kratom


In [31]:
working_list = df[['cleaned_Author','cleaned_Title','Category']]

In [32]:
working_list.columns = ['Author','Title','Category']

In [40]:
working_list.head()

Unnamed: 0,Author,Title,Category
0,fern michaels,breaking news,Literature & Fiction
1,joke star funny bones mr.,"jokes, jokes and more jokes",Humor & Entertainment
2,george gordon byron,byron's poetry and prose,Literature & Fiction
3,millie marotta,tropical world,Humor & Entertainment
4,micheal kratom,legally stoned,Humor & Entertainment


In [34]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

In [41]:
def get_bad_books_links(df):
    starter_link = "https://www.goodreads.com/book/show/3450744-nudge"
    # Add new column called 'link' to df
    df['link'] = ""

    driver = webdriver.Chrome()

    for index, row in df.iterrows():
        driver.get(starter_link)
        title = row['Title']
        author = row['Author']

        # Add wait for page to finish loading
        WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.XPATH, '//*[@id="Header"]/div[2]/div[2]/section/form/input[1]')))

        # search GoodReads for "title author"
        search_bar = driver.find_element_by_xpath('//*[@id="Header"]/div[2]/div[2]/section/form/input[1]')
        search_bar.send_keys(title + " " + author)
        search_bar.submit()

        # Add wait for page to finish loading
        WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CLASS_NAME, 'bookTitle')))

        # extract 1st search result link from page
        links = driver.find_element_by_class_name('bookTitle')
        row_value = links.get_attribute('href')
        
        # Append value to new column called 'link'
        df.at[index, 'link'] = row_value
        
        # Update starter_link
        starter_link = row_value
        
    driver.quit()
    return df

In [42]:
df = get_bad_books_links(working_list)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['link'] = ""


TimeoutException: Message: 
