In [None]:
from IPython.display import clear_output, display
import pandas as pd
import requests
import time
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET

def extract_urls_from_sitemap(file_path):
    """Return the page URLs listed in a sitemap XML file.

    Args:
        file_path: Path to a sitemap file that uses the sitemaps.org 0.9
            namespace.

    Returns:
        A list with the text of the first ``<loc>`` child of every ``<url>``
        element directly under the root, in document order. Surrounding
        whitespace is stripped, and entries whose ``<loc>`` is missing or
        blank are skipped.
    """
    ns = {'sm': 'http://www.sitemaps.org/schemas/sitemap/0.9'}

    # Parse the XML file
    root = ET.parse(file_path).getroot()

    urls = []
    for url_element in root.findall('sm:url', ns):
        # findtext yields '' for a missing or empty <loc>, so .strip() is
        # always safe and blank entries can be filtered in one place.
        loc_text = url_element.findtext('sm:loc', default='', namespaces=ns).strip()
        if loc_text:
            urls.append(loc_text)

    return urls

# Load the URLs from the local sitemap file; `urls` is reused by later cells.
sitemap_file_path = './data/goodreads_genres.xml'
urls = extract_urls_from_sitemap(sitemap_file_path)
# Preview the first four URLs, then report how many were loaded in total.
for url in urls[:4]:
    print(url)
    
print(len(urls))


In [None]:
# https://www.goodreads.com/genres/fiction -> https://www.goodreads.com/genres/new_releases/fiction
def new_releases(s: str) -> str:
    """Insert ``new_releases/`` before the last path segment of a genre URL.

    Args:
        s: Genre URL such as ``https://www.goodreads.com/genres/fiction``.
            Surrounding whitespace and trailing slashes are ignored.

    Returns:
        The URL with ``new_releases/`` placed directly before the final
        segment, e.g. ``https://www.goodreads.com/genres/new_releases/fiction``.
    """
    # Without trimming, a trailing "/" makes the last segment empty and the
    # original slice s[:-0] collapses to "", discarding the whole URL.
    trimmed = s.strip().rstrip("/")
    genre = trimmed.rsplit("/", 1)[-1]
    prefix = trimmed[:len(trimmed) - len(genre)]
    return prefix + "new_releases/" + genre

# Example call: print the rewritten URL for the fiction genre.
print(new_releases("https://www.goodreads.com/genres/fiction"))

# After we've gotten all the "new_release" it would be nice to go get the "most_read"
# https://www.goodreads.com/genres/classics -> https://www.goodreads.com/genres/most_read/classics
def most_read(s: str) -> str:
    """Insert ``most_read/`` before the last path segment of a genre URL.

    Args:
        s: Genre URL such as ``https://www.goodreads.com/genres/classics``.
            Surrounding whitespace and trailing slashes are ignored.

    Returns:
        The URL with ``most_read/`` placed directly before the final
        segment, e.g. ``https://www.goodreads.com/genres/most_read/classics``.
    """
    # Without trimming, a trailing "/" makes the last segment empty and the
    # original slice s[:-0] collapses to "", discarding the whole URL.
    trimmed = s.strip().rstrip("/")
    genre = trimmed.rsplit("/", 1)[-1]
    prefix = trimmed[:len(trimmed) - len(genre)]
    return prefix + "most_read/" + genre

# Example call: print the rewritten URL for the classics genre.
print(most_read("https://www.goodreads.com/genres/classics"))

def write(books, path='./data/book_data.csv'):
    """Print the collected book rows and save them as a CSV file.

    Args:
        books: Sequence of 7-item rows in the order URL_ID, Author, Title,
            Genres, Blurb, Average Rating, My Rating.
        path: Destination CSV path. Defaults to the location the notebook
            reads its earlier results from.
    """
    df = pd.DataFrame(books, columns=["URL_ID", "Author", "Title", "Genres", "Blurb", "Average Rating", "My Rating"])
    print(df)
    # index=False keeps the row index out of the file so the CSV columns
    # match the list above exactly.
    df.to_csv(path, index=False)

In [None]:
def get_book_data(url, timeout=10):
    """Scrape a single book page into a flat record.

    Args:
        url: Full URL of the book page to fetch.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        ``[author, title, genres, blurb, rating]`` where ``genres`` is the
        genre labels joined by single spaces and ``rating`` is the average
        rating scaled by 100 and rounded to an int (so out of 500).
        Returns ``None`` when the request fails or a required element is
        missing from the page; a message is displayed and the book may be
        picked up on a future run.
    """
    time.sleep(0.1)  # Brief pause between consecutive book-page requests.
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        book_page = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        display(f"Trouble fetching {url}: {e}; skipping for now...")
        return None
    sub_soup = BeautifulSoup(book_page.content, 'html.parser')

    # Extract the author -- sometimes the author isn't present (not totally
    # sure why; maybe a load problem). Don't worry too much about it for now.
    author_span = sub_soup.find('span', {'class': 'ContributorLink__name', 'data-testid': 'name'})
    if author_span is None:
        display(f"Trouble finding author for {url}; skipping for now...")
        return None
    author = author_span.text

    # Extract the title
    title_element = sub_soup.find('meta', {'property': 'og:title'})
    title = title_element.get('content') if title_element is not None else None
    if title is None:
        display(f"Trouble finding title for {url}; skipping for now...")
        return None

    # Extract the blurb
    blurb_div = sub_soup.find('div', {'data-testid': 'description'})
    blurb_span = blurb_div.find('span', {'class': 'Formatted'}) if blurb_div is not None else None
    if blurb_span is None:
        display(f"Trouble finding blurb for {url}; skipping for now...")
        return None
    blurb = blurb_span.text

    # Extract genres
    genres_div = sub_soup.find('div', {'data-testid': 'genresList'})
    if genres_div is None:
        display(f"Trouble finding genres for {url}; skipping for now...")
        return None
    genres = []
    for genre_button in genres_div.find_all('span', {'class': 'BookPageMetadataSection__genreButton'}):
        label = genre_button.find('span', {'class': 'Button__labelItem'})
        if label is not None:  # Skip a button that carries no label text.
            genres.append(label.text)

    # Extract rating (out of 500 because I like integers). round() rather
    # than int(): float("4.35") * 100 is 434.99999999999994, which int()
    # would truncate to 434.
    rating_div = sub_soup.find('div', {'class': 'RatingStatistics__rating'})
    try:
        rating = round(float(rating_div.text) * 100)
    except (AttributeError, ValueError):
        display(f"Trouble finding rating for {url}; skipping for now...")
        return None

    return [author, title, " ".join(genres), blurb, rating]

In [None]:
gr_prefix = "https://www.goodreads.com/book/show/"

# Goal dataset should be something like
# URL_ID - Author - Genre - Blurb - Rating - My Rating

# Resume from an earlier run when a CSV already exists. Only a missing or
# empty file counts as a first run; any other read error propagates so a
# damaged file is not silently replaced by the next checkpoint write.
try:
    df = pd.read_csv('./data/book_data.csv')
    seen_books = set(df['URL_ID'])
    book_data = df.values.tolist()
except (FileNotFoundError, pd.errors.EmptyDataError):  # Only needed for a first run.
    seen_books = set()
    book_data = []

added = 0
for url in urls:
    # Checkpoint to disk after roughly every 100 newly scraped books.
    if added >= 100:
        write(book_data)
        added = 0
    cur_genre = url.split("/")[-1]
    time.sleep(1)  # Pause between genre-page requests.
    # new_url = new_releases(url)
    new_url = most_read(url)
    try:
        # The timeout stops one stalled connection from blocking the crawl.
        response = requests.get(new_url, timeout=30)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching url {new_url}: {e}")
        continue
    content = response.content

    soup = BeautifulSoup(content, 'html.parser')
    possible_book_links = soup.find_all('div', {'class': 'coverWrapper'})
    print(f'There are {len(possible_book_links)} books under the genre of {cur_genre}...')

    for i, link in enumerate(possible_book_links):
        clear_output(wait=True)
        display(f"Current Genre: {cur_genre}")
        display(f"Parsing book {i}/{len(possible_book_links)}...")
        a_bit = link.find('a')
        # Treat an anchor without an href the same as a missing anchor.
        redirect = a_bit.get('href') if a_bit is not None else None
        if redirect is None:
            display(f"No redirect associated with {link}.")
            continue
        if redirect.startswith('/book/show'):
            url_suffix = redirect.split("/")[-1]
            # The numeric ID is the part of the suffix before the first "-";
            # when that is not an int, retry with the part before the first ".".
            try:
                url_identifier = int(url_suffix.split("-")[0])
            except ValueError:
                display(f"URL Suffix is likely a classic split by a '.' : {url_suffix}")
                try:
                    url_identifier = int(url_suffix.split(".")[0])
                except ValueError:
                    display(f"Unsuccessful in second attempt to split url suffix: {url_suffix}")
                    continue

            # Skip books we may have seen from previous runs or previous genres
            if url_identifier in seen_books:
                display(f'Skipping {url_suffix}.')
                continue
            sub_url = gr_prefix+url_suffix
            # `except Exception` (not a bare except) keeps scraping
            # best-effort while still letting a kernel interrupt stop the loop.
            try:
                book = get_book_data(sub_url)
            except Exception as e:
                display(f"Trouble scraping {sub_url}: {e}")
                continue
            if book is None:
                continue
            author, title, genres, blurb, rating = book
            added += 1
            # Record the ID so the same book listed under a later genre in
            # this run is not scraped and appended a second time.
            seen_books.add(url_identifier)
            book_data.append([url_identifier, author, title, genres, blurb, rating, 0])


In [None]:
### 'Current Genre: christian-historical-fiction'

In [None]:
# Number of book rows currently held in memory.
print(len(book_data))

In [None]:
# Save all rows currently in book_data via write().
write(book_data)


### This repopulates the ratings for Goodreads books that were parsed from silly.txt

In [None]:
silly_df = pd.read_csv("./data/silly_ratings.csv")
# get_book_data("https://www.goodreads.com/book/show/19161852-the-fifth-season")
silly_books = silly_df.values.tolist()
# Compare IDs as strings: the crawl loop stores int IDs in book_data while
# the IDs below are strings, so an un-normalised membership test never
# matches those rows. `uids` stays index-aligned with `book_data`.
uids = [str(x[0]) for x in book_data]
# print(uids)
# NOTE(review): only the first 10 rows are processed -- looks like a
# debugging limit; confirm before relying on the result.
for br in silly_books[:10]:
    # presumably column 0 is the book's URL ID and column 1 is my rating;
    # verify against silly_ratings.csv.
    url_identifier = str(br[0])
    my_rating = int(br[1])
    if url_identifier not in uids:
        # `except Exception` keeps this best-effort without swallowing a
        # kernel interrupt the way a bare except would.
        try:
            book = get_book_data(gr_prefix+url_identifier)
        except Exception:
            print(f'Exception: {url_identifier}')
            continue
        if book is None:
            print(f'Exception: {url_identifier}')
            continue
        author, title, genres, blurb, rating = book
        book_data.append([url_identifier, author, title, genres, blurb, rating, my_rating])
        uids.append(url_identifier)  # Keep the lookup in sync with book_data.
    else:
        # Already scraped: record my rating on the existing row (index 6 is
        # the "My Rating" column) instead of only printing it.
        existing = book_data[uids.index(url_identifier)]
        existing[6] = my_rating
        print(existing)
