In [37]:
import requests
from bs4 import BeautifulSoup
import csv
import random
import time
from fake_useragent import UserAgent
import json
import re

In [2]:
# Load the proxy pool: the first column of every row in proxies.csv is taken
# as a proxy address. newline='' is what the csv module asks for when it is
# handed a file object.
with open('proxies.csv', 'r', newline='') as f:
    csv_reader = csv.reader(f)
    ips = list(csv_reader)

# csv.reader yields an empty list for a blank line, so indexing ip[0] blindly
# raises IndexError on e.g. a trailing empty row. Skip blank rows/cells and
# trim stray whitespace around each address.
ip_addresses = [ip[0].strip() for ip in ips if ip and ip[0].strip()]

# A single Session shared by every request made through proxy_request.
session = requests.Session()


def proxy_request(url, ip_addresses=ip_addresses, max_retries=None):
    """GET `url` through a randomly chosen proxy, retrying until it returns 200.

    Args:
        url: Address to fetch.
        ip_addresses: Pool of proxy addresses to pick from; defaults to the
            list loaded from proxies.csv.
        max_retries: Maximum number of attempts. ``None`` (the default) keeps
            retrying indefinitely.

    Returns:
        The ``requests.Response`` of the first attempt that came back with
        status code 200.

    Raises:
        ValueError: If `ip_addresses` is empty.
        RuntimeError: If `max_retries` attempts were made without a 200.
    """
    # With an empty pool no attempt can ever succeed, so fail fast instead of
    # looping forever on the error raised while picking a proxy.
    if not ip_addresses:
        raise ValueError("ip_addresses is empty; no proxy to route the request through")

    # Build the user-agent source once; only the random pick varies per attempt.
    ua = UserAgent()
    attempts = 0

    while max_retries is None or attempts < max_retries:
        attempts += 1
        try:
            headers = {"Accept-Language": "en-US,en;q=0.5",
                       'User-Agent': ua.random}
            proxy = random.choice(ip_addresses)
            # requests selects the proxy by URL scheme. Mapping only "http"
            # leaves every https:// request unproxied, so register both.
            # NOTE(review): assumes the CSV entries are in a form requests
            # accepts as a proxy URL -- confirm against proxies.csv.
            proxies = {"http": proxy, "https": proxy}
            response = session.get(url, proxies=proxies,
                                   timeout=5, headers=headers)
        except Exception as e:
            print(f"Error: {e}")
            # Back off briefly so a persistent failure does not become a
            # tight retry loop.
            time.sleep(random.uniform(1, 3))
            continue

        if response.status_code == 200:
            return response
        elif response.status_code == 429:
            print("Rate limit exceeded. Waiting before retrying...")
            time.sleep(10)
        else:
            print(
                f"Request failed with status code: {response.status_code}")
            time.sleep(random.uniform(1, 3))

    raise RuntimeError(f"No successful response for {url} after {attempts} attempts")

In [17]:
# Award years to scrape: 2011 through 2023 inclusive.
years = list(range(2011, 2024))
years

[2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]

In [3]:
# Smoke test: fetch the 2023 awards landing page via proxy_request and display
# the HTTP status code of the response.
url = 'https://www.goodreads.com/choiceawards/best-books-2023'
response = proxy_request(url)
response.status_code

200

In [21]:
# Build {year: {category name: category page URL}} from each year's awards
# landing page.
genre_ulrs = {}

for year in years:
    url = f'https://www.goodreads.com/choiceawards/best-books-{year}'
    response = proxy_request(url)
    genre_ulrs[year] = {}
    soup = BeautifulSoup(response.content, 'html.parser')
    genre_list = soup.find_all('div', class_='category clearFix')
    for genre in genre_list:
        # Look the anchor up once. A category block without a usable link is
        # skipped rather than raising AttributeError/TypeError mid-scrape.
        anchor = genre.find('a')
        href = anchor.get('href') if anchor is not None else None
        if not href:
            continue
        name = anchor.get_text().strip('\n')
        link = 'https://www.goodreads.com' + href
        genre_ulrs[year][name] = link

genre_ulrs


{2011: {'Favorite Book of 2011': 'https://www.goodreads.com/choiceawards/favorite-book-of-2011',
  'Fiction': 'https://www.goodreads.com/choiceawards/best-fiction-books-2011',
  'Mystery & Thriller': 'https://www.goodreads.com/choiceawards/best-mystery-thriller-books-2011',
  'Historical Fiction': 'https://www.goodreads.com/choiceawards/best-historical-fiction-books-2011',
  'Fantasy': 'https://www.goodreads.com/choiceawards/best-fantasy-books-2011',
  'Paranormal Fantasy': 'https://www.goodreads.com/choiceawards/best-paranormal-fantasy-books-2011',
  'Science Fiction': 'https://www.goodreads.com/choiceawards/best-science-fiction-books-2011',
  'Horror': 'https://www.goodreads.com/choiceawards/best-horror-books-2011',
  'Romance': 'https://www.goodreads.com/choiceawards/best-romance-books-2011',
  'Humor': 'https://www.goodreads.com/choiceawards/best-humor-books-2011',
  'Nonfiction': 'https://www.goodreads.com/choiceawards/best-nonfiction-books-2011',
  'History & Biography': 'https:/

In [None]:
# Build {year: {category: {cover-image alt text: book page URL}}} by visiting
# every category page collected in genre_ulrs.
book_urls = {}

for year in genre_ulrs:
    book_urls[year] = {}
    for genre in genre_ulrs[year]:
        url = genre_ulrs[year][genre]
        response = proxy_request(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        book_urls[year][genre] = {}
        # One pass over the nominee links. The previous version repeated this
        # whole-page scan once per tooltip div, re-inserting (and re-printing)
        # the same entries over and over.
        for a in soup.find_all('a', class_='pollAnswer__bookLink'):
            href = a.get('href')
            img_tag = a.find('img')
            img_alt = img_tag.get('alt') if img_tag is not None else None
            # Without both a link and a label there is nothing usable to
            # store; skipping also avoids a None key or a None + str error.
            if not href or not img_alt:
                continue
            book_urls[year][genre][img_alt] = 'https://www.goodreads.com' + href
            print(img_alt)

book_urls

In [30]:
# Persist the year -> category -> URL mapping. JSON object keys are strings,
# so the integer year keys are written out as strings.
with open('genre_ulrs.json', mode='w') as out_file:
    json.dump(genre_ulrs, out_file, indent=4)

In [31]:
# Persist the year -> category -> book URL mapping. JSON object keys are
# strings, so the integer year keys are written out as strings.
with open('book_ulrs.json', mode='w') as out_file:
    json.dump(book_urls, out_file, indent=4)

In [32]:
# Fetch a single book page through proxy_request; the result is kept in
# `response`.
response = proxy_request('https://www.goodreads.com/book/show/62047984-yellowface?from_choice=true')


In [66]:
def _text_by_testid(soup, testid):
    """Return the text of the first tag whose data-testid equals `testid`, else None."""
    tag = soup.find(attrs={'data-testid': testid})
    return tag.get_text() if tag is not None else None


def _paragraph_text(div):
    """Return the text of `div` with <br> tags replaced by newlines; None if `div` is None."""
    if div is None:
        return None
    for br in div.find_all('br'):
        br.replace_with('\n')
    return div.get_text()


def _parse_book_page(soup):
    """Extract the scraped fields from a parsed book page; absent fields become None."""
    rating_tag = soup.find('div', class_='RatingStatistics__rating')
    rating = rating_tag.get_text() if rating_tag is not None else None

    # Counts are kept as the text preceding the "\xa0ratings" / "\xa0reviews" suffix.
    ratings_text = _text_by_testid(soup, 'ratingsCount')
    ratings_count = ratings_text.split('\xa0ratings')[0] if ratings_text is not None else None

    reviews_text = _text_by_testid(soup, 'reviewsCount')
    review_count = reviews_text.split('\xa0reviews')[0] if reviews_text is not None else None

    # The page count is the first run of digits in the pagesFormat text.
    pages_text = _text_by_testid(soup, 'pagesFormat')
    pages_match = re.search(r'\d+', pages_text) if pages_text is not None else None
    page_count = int(pages_match.group()) if pages_match else None

    # The first matching paragraph block is used as the description and the
    # second as the author info.
    paragraphs = soup.find_all('div', class_='DetailsLayoutRightParagraph__widthConstrained')
    description = _paragraph_text(paragraphs[0] if len(paragraphs) > 0 else None)
    author_info = _paragraph_text(paragraphs[1] if len(paragraphs) > 1 else None)

    return {
        'rating': rating,
        'ratings_count': ratings_count,
        'page_count': page_count,
        'author': _text_by_testid(soup, 'name'),
        'review_count': review_count,
        'description': description,
        'publication_info': _text_by_testid(soup, 'publicationInfo'),
        'author_info': author_info,
    }


# Scrape every collected book page into book_data, keyed by the book label.
# Missing fields are stored as None via explicit checks instead of bare
# `except:` clauses, which would also have swallowed KeyboardInterrupt and
# genuine programming errors.
# NOTE(review): entries are keyed by book label only, so a book listed under
# more than one category keeps just the last year/genre/url seen.
book_data = {}
for year in book_urls:
    for genre in book_urls[year]:
        for book, book_url in book_urls[year][genre].items():
            # time.sleep(2)
            response = proxy_request(book_url)
            soup = BeautifulSoup(response.content, 'html.parser')

            record = _parse_book_page(soup)
            record['year'] = year
            record['url'] = book_url
            record['genre'] = genre
            book_data[book] = record

            print(book)

Divergent by Veronica Roth
Shadowfever by Karen Marie Moning
A Dance with Dragons by George R.R. Martin
City of Fallen Angels by Cassandra Clare
Bossypants by Tina Fey
Error: HTTPSConnectionPool(host='www.goodreads.com', port=443): Read timed out. (read timeout=5)
The Wise Man's Fear by Patrick Rothfuss
A Discovery of Witches by Deborah Harkness
Miss Peregrine's Home for Peculiar Children by Ransom Riggs
Delirium by Lauren Oliver
Dead Reckoning by Charlaine Harris
Forever by Maggie Stiefvater
Sing You Home by Jodi Picoult
State of Wonder by Ann Patchett
The Paris Wife by Paula McLain
Awakened by P.C. Cast
The Night Circus by Erin Morgenstern
The Son of Neptune by Rick Riordan
Bloodlines by Richelle Mead
Daughter of Smoke & Bone by Laini Taylor
Ready Player One by Ernest Cline
1Q84 by Haruki Murakami
The Peach Keeper by Sarah Addison Allen
Sing You Home by Jodi Picoult
Night Road by Kristin Hannah
The Lover's Dictionary by David Levithan
State of Wonder by Ann Patchett
The Tiger's Wife 

In [67]:
# Persist the scraped per-book records as indented JSON.
with open('book_data.json', mode='w') as out_file:
    json.dump(book_data, out_file, indent=4)

In [57]:
# response = proxy_request('https://www.goodreads.com/book/show/62047984-yellowface?from_choice=true')
# soup = BeautifulSoup(response.content, 'html.parser')
# rating = soup.find('div', class_='RatingStatistics__rating').get_text()
# ratings_count = soup.find_all(lambda tag: tag.get('data-testid') == 'ratingsCount')[0].get_text().split('\xa0ratings')[0]
# page_count = int(re.findall(r'\d+', soup.find_all(lambda tag: tag.get('data-testid') == 'pagesFormat')[0].get_text())[0])
# author = soup.find_all(lambda tag: tag.get('data-testid') == 'name')[0].get_text()
# review_count = soup.find_all(lambda tag: tag.get('data-testid') == 'reviewsCount')[0].get_text().split('\xa0reviews')[0]
# description = soup.find('div', class_='DetailsLayoutRightParagraph__widthConstrained')
# for br in description.find_all('br'):
#     br.replace_with('\n')
# description = description.get_text()
# publication_info = soup.find_all(lambda tag: tag.get('data-testid') == 'publicationInfo')[0].get_text()
# author_info = soup.find_all('div', class_='DetailsLayoutRightParagraph__widthConstrained')[1]
# for br in author_info.find_all('br'):
#     br.replace_with('\n')

# author_info = author_info.get_text()


''