# Functionalizing the acquisition steps and running them to gather more data

In [26]:
import requests
import pandas as pd
import numpy as np
import re
import os
from bs4 import BeautifulSoup 
import time
import sketch

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import warnings
warnings.filterwarnings("ignore")

### DF 1 of several needed for total data

In [27]:
# header=None makes pandas treat the first line as data and name the single column 0.
df1 = pd.read_csv('2000_book_ids.txt', header=None)

In [28]:
# Preview the first rows; the rows shown hold "<numeric id><separator><title>" slugs.
df1.head()

Unnamed: 0,0
0,136251.Harry_Potter_and_the_Deathly_Hallows
1,2767052-the-hunger-games
2,77203.The_Kite_Runner
3,19063.The_Book_Thief
4,1.Harry_Potter_and_the_Half_Blood_Prince


### DF 2 of several for total data

In [29]:
# header=None makes pandas treat the first line as data and name the single column 0.
df2 = pd.read_csv('by_books_urls.txt', header=None)

In [30]:
# Preview the first rows; the rows shown still carry a "?from_search=..." query string.
df2.head()

Unnamed: 0,0
0,17851885-i-am-malala?from_search=true&from_srp...
1,36336078-call-me-by-your-name?from_search=true...
2,27220736-shoe-dog?from_search=true&from_srp=tr...
3,138398.The_Walking_Dead_Vol_1?from_search=true...
4,3206011-crazy-love?from_search=true&from_srp=t...


### DF 3 of several for total data

In [31]:
# index_col=0 uses the first CSV column as the row index instead of a data column.
df3 = pd.read_csv('fiction-and-non-fiction-top-best-sellers.csv', index_col=0)

In [32]:
# Preview the first rows; note placeholder entries such as "No List Published"
# appear in both the Book and Author columns.
df3.head()

Unnamed: 0,Date,Book,Author,Publisher
0,October 12 1931,The Ten Commandments,Warwick Deeping,unknown
1,November 23 1931,No List Published,No List Published,unknown
2,January 4 1932,Maid in Waiting,John Galsworthy,unknown
3,January 18 1932,The Harbourmaster,William McFee,unknown
4,February 1 1932,Mr. and Mrs. Pennington,Francis Brett Young,unknown


In [33]:
def clean_column(df, column):
    """
    Return a copy of ``df`` with a normalised ``cleaned_<column>`` column added.

    The text in ``df[column]`` is normalised in this order:

    1. lower-cased;
    2. parenthesised fragments such as ``(vol. 1)`` are removed;
    3. everything from the first colon or hyphen onward is dropped;
    4. runs of whitespace are collapsed to one space and the ends are trimmed.

    The input frame is not modified: no scratch column is written to it, so an
    existing column that happens to be called ``temp`` is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Name of the string column to clean.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra column ``f'cleaned_{column}'`` (replaced in
        place if a column of that name already exists, appended otherwise).
    """
    cleaned = (
        df[column]
        .str.lower()
        # Drop parenthesised asides, e.g. "(a novel)".
        .str.replace(r'\([^()]*\)', '', regex=True)
        # Keep only the part before the first ':' or '-' (cuts subtitles, but
        # also truncates hyphenated words).
        .str.split(r'[:\-]')
        .str[0]
        # Collapse internal whitespace, then trim the ends.
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    )

    # assign() builds a new frame, so the caller's object is never mutated.
    return df.assign(**{f'cleaned_{column}': cleaned})

In [34]:
# Rename 'Book' to 'Title' so the cleaning step below produces 'cleaned_Title',
# the column name read by get_bad_books_links.
df3 = df3.rename(columns={'Book':'Title'})

In [35]:
# Adds the normalised 'cleaned_Title' column.
df3 = clean_column(df3, 'Title')

In [36]:
# Adds the normalised 'cleaned_Author' column.
df3 = clean_column(df3, 'Author')

In [37]:
# Check that both 'cleaned_Title' and 'cleaned_Author' were added.
df3.head()

Unnamed: 0,Date,Title,Author,Publisher,cleaned_Title,cleaned_Author
0,October 12 1931,The Ten Commandments,Warwick Deeping,unknown,the ten commandments,warwick deeping
1,November 23 1931,No List Published,No List Published,unknown,no list published,no list published
2,January 4 1932,Maid in Waiting,John Galsworthy,unknown,maid in waiting,john galsworthy
3,January 18 1932,The Harbourmaster,William McFee,unknown,the harbourmaster,william mcfee
4,February 1 1932,Mr. and Mrs. Pennington,Francis Brett Young,unknown,mr. and mrs. pennington,francis brett young


#### Getting the full Goodreads links for the NYT books to concat onto the main file, using Manny's function below

In [23]:
def get_bad_books_links(df):
    """
    Search Goodreads for every row of ``df`` and record the first result's URL.

    For each row the function loads ``starter_link`` in a Firefox browser,
    types ``"<cleaned_Title> <cleaned_Author>"`` into the header search box and
    stores the ``href`` of the first ``bookTitle`` element in ``df['link']``.

    The run is resumable through two files in the working directory:

    * ``links.txt`` - every link found is appended here; on start-up its last
      non-empty line becomes the first page loaded.
    * ``row_index.txt`` - the 0-based row position of the last processed row;
      on start-up processing continues with the following row.

    Note that ``df['link']`` is reset to ``""`` for all rows on every call, so
    after a resumed run the links found by earlier runs exist only in
    ``links.txt``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``cleaned_Title`` and ``cleaned_Author``. It is modified
        in place (a ``link`` column is added or overwritten).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the ``link`` column filled where a search
        result was found.

    Raises
    ------
    ValueError
        If ``row_index.txt`` exists but does not contain an integer.
    """
    # --- resume state (read before a browser is launched, so a bad
    # --- checkpoint file cannot leave a Firefox process behind) -------------
    starter_link = "https://www.goodreads.com/book/show/3450744-nudge"
    if os.path.exists("links.txt"):
        with open("links.txt", "r") as file:
            saved_links = [line.strip() for line in file if line.strip()]
        # An existing but empty links.txt keeps the default starter link
        # instead of failing on lines[-1].
        if saved_links:
            starter_link = saved_links[-1]

    start_index = 0
    if os.path.exists("row_index.txt"):
        with open("row_index.txt", "r") as index_file:
            start_index = int(index_file.read().strip()) + 1

    # Add new column called 'link' to df
    df['link'] = ""
    link_col = df.columns.get_loc('link')

    driver = webdriver.Firefox()
    try:
        with open("links.txt", "a") as file:
            # The checkpoint is consumed with .iloc, so track the row
            # *position*, not the index label: the two differ as soon as the
            # index is not 0..n-1.
            rows = enumerate(df.iloc[start_index:].iterrows(), start=start_index)
            for position, (_, row) in rows:
                # loading initial webpage
                driver.get(starter_link)
                # current row content to use in query
                title = row['cleaned_Title']
                author = row['cleaned_Author']
                print(title, author)

                try:
                    # wait for the search box to be usable
                    search_bar = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.XPATH, '//*[@id="Header"]/div[2]/div[2]/section/form/input[1]')))

                    # search GoodReads for "title author"
                    search_bar.send_keys(title + " " + author)
                    search_bar.submit()

                    # wait for the first search result to be visible
                    links = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.CLASS_NAME, 'bookTitle')))

                    # extract 1st search result link from page
                    row_value = links.get_attribute('href')

                    # A missing href must not be stored: it would be written
                    # to links.txt and then passed to driver.get().
                    if row_value:
                        print(row_value)
                        df.iat[position, link_col] = row_value

                        # Next search starts from the page just found
                        starter_link = row_value

                        # Persist the link before the checkpoint below, so the
                        # two files cannot disagree after a crash.
                        file.write(f"{row_value}\n")
                        file.flush()

                except Exception as exc:
                    # No search result, timeout or unusable title/author:
                    # skip this row. KeyboardInterrupt is deliberately NOT
                    # caught so the scrape can still be stopped by hand.
                    print(f"skipped row {position}: {type(exc).__name__}")

                # Checkpoint only rows that were actually handled; an
                # interrupted row is retried on the next run.
                with open("row_index.txt", "w") as index_file:
                    index_file.write(str(position))
    finally:
        # Always release the browser, even when the loop is aborted.
        driver.quit()

    return df

In [40]:
# Launches Firefox and scrapes one search per row. df3 is modified in place
# (a 'link' column is added); the returned frame is displayed as the cell output.
# Progress is resumed from links.txt / row_index.txt if those files exist.
get_bad_books_links(df3)

the big fisherman lloyd c. douglas
https://www.goodreads.com/book/show/778667.The_Big_Fisherman?from_search=true&from_srp=true&qid=1lVHLWzcWF&rank=1
the big fisherman lloyd douglas
point of no return john p. marquand
https://www.goodreads.com/book/show/279416.Point_of_No_Return?from_search=true&from_srp=true&qid=zzyUqwW5Id&rank=1
a rage to live john o'hara
https://www.goodreads.com/book/show/503994.A_Rage_to_Live?from_search=true&from_srp=true&qid=rrNdWArdY0&rank=1
the egyptian mika waltari
https://www.goodreads.com/book/show/10536206-sinuhe-the-egyptian?from_search=true&from_srp=true&qid=jiwo9B9rf8&rank=1
the parasites daphne du maurier
https://www.goodreads.com/book/show/18869975-the-parasites?from_search=true&from_srp=true&qid=kF1aUUFN5h&rank=1
the wall john hersey
https://www.goodreads.com/book/show/27326.The_Wall?from_search=true&from_srp=true&qid=K0UmwdFoZU&rank=1
the cardinal henry morton robinson
https://www.goodreads.com/book/show/1866494.The_Cardinal?from_search=true&from_srp

Unnamed: 0,Date,Title,Author,Publisher,cleaned_Title,cleaned_Author,link
0,October 12 1931,The Ten Commandments,Warwick Deeping,unknown,the ten commandments,warwick deeping,
1,November 23 1931,No List Published,No List Published,unknown,no list published,no list published,
2,January 4 1932,Maid in Waiting,John Galsworthy,unknown,maid in waiting,john galsworthy,
3,January 18 1932,The Harbourmaster,William McFee,unknown,the harbourmaster,william mcfee,
4,February 1 1932,Mr. and Mrs. Pennington,Francis Brett Young,unknown,mr. and mrs. pennington,francis brett young,
...,...,...,...,...,...,...,...
1405,November 1 2020,Caste,Isabel Wilkerson,Random House,caste,isabel wilkerson,https://www.goodreads.com/book/show/57461318-w...
1406,November 15 2020,Greenlights,Matthew McConaughey,Crown,greenlights,matthew mcconaughey,https://www.goodreads.com/book/show/55931352-s...
1407,November 22 2020,Clanlands,Sam Heughan and Graham McTavish,Quercus,clanlands,sam heughan and graham mctavish,https://www.goodreads.com/book/show/57424615-s...
1408,November 29 2020,Humans,Brandon Stanton,St. Martin's Press,humans,brandon stanton,https://www.goodreads.com/book/show/93683188-s...
