# Goodreads Review Scraping

Reference: https://rakaarfi.medium.com/scrape-goodreads-book-reviews-using-python-a53252284726

In [1]:
import requests
from bs4 import BeautifulSoup as bs
import polars as pl

In [2]:
def find_review(url: str, timeout: float = 10.0) -> list:
    """
    Scrape the reviews shown on a Goodreads book page.

    Args:
    url (str): The URL of the Goodreads book page.
    timeout (float): Seconds to wait for the server before giving up. Defaults to 10.

    Returns:
    list: A list of dictionaries, one per review, with the keys 'Index',
    'Profile Info', 'Rating' and 'Content'. The list is empty when the page
    does not contain the expected reviews container.

    Raises:
    requests.HTTPError: If the server answers with an error status code.
    requests.RequestException: If the request fails or times out.
    """
    # Headers to mimic a real browser request and avoid being blocked
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36'
    }
    # Without a timeout a stalled connection would block forever
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page
    response.raise_for_status()
    soup = bs(response.content, 'html.parser')

    # The reviews live in the second 'ReviewsList' div of the page
    reviews_list = soup.find_all('div', class_='ReviewsList')
    if len(reviews_list) < 2:
        return []
    reviews_tag = reviews_list[1]

    all_reviews = []
    for idx, article in enumerate(reviews_tag.find_all('article', class_='ReviewCard')):
        all_reviews.append({
            'Index': idx + 1,
            'Profile Info': _parse_reviewer_profile(article),
            'Rating': _parse_rating(article),
            'Content': _parse_content(article),
        })

    return all_reviews


def _parse_reviewer_profile(article) -> dict:
    """Extract the reviewer's name, profile link and profile counters from one review card."""
    # Defaults are used for every piece of information the card does not provide
    profile = {
        'Name': None,
        'Link Profile': None,
        'An Author': False,
        'Books': None,
        'Reviews Amount': 'Not Found',
        'Followers Amount': 'Not Found',
    }

    profile_info = article.find('section', class_='ReviewerProfile__info')
    if profile_info is None:
        return profile

    anchor = profile_info.find('a')
    if anchor is not None:
        profile['Name'] = anchor.get_text()
        profile['Link Profile'] = anchor.get('href')

    profile_meta = profile_info.find('div', class_='ReviewerProfile__meta')
    if profile_meta is None:
        return profile

    for span in profile_meta.find_all('span'):
        span_text = span.get_text(strip=True)
        # Singular stems are matched so that "1 review" / "1 follower" / "1 book"
        # are captured as well as the plural forms
        if 'book' in span_text:
            profile['Books'] = span_text
        elif 'review' in span_text:
            profile['Reviews Amount'] = span_text
        elif 'follower' in span_text:
            profile['Followers Amount'] = span_text
        elif 'Author' in span_text:
            profile['An Author'] = True

    return profile


def _parse_rating(article) -> str:
    """Return the star rating label of one review card, or 'No Rating Given'."""
    shelf_status = article.find('div', class_='ShelfStatus')
    if shelf_status is None:
        return 'No Rating Given'
    stars = shelf_status.find('span', class_='RatingStars RatingStars__small')
    if stars is None:
        return 'No Rating Given'
    return stars.get('aria-label') or 'No Rating Given'


def _parse_content(article) -> str:
    """Return the review text of one review card ('' when the card has no text)."""
    formatted = article.find('span', class_='Formatted')
    if formatted is None:
        return ''
    return formatted.get_text(strip=True)

In [3]:
# Scrape the reviews of the Goodreads book page given by the URL below
review = find_review('https://www.goodreads.com/book/show/62047984-yellowface')

In [4]:
# Display the scraped reviews (the list of dictionaries returned by find_review)
review

[{'Index': 1,
  'Profile Info': {'Name': 'ishika',
   'Link Profile': 'https://www.goodreads.com/user/show/76759733-ishika',
   'An Author': False,
   'Books': None,
   'Reviews Amount': '64 reviews',
   'Followers Amount': '1,059 followers'},
  'Rating': 'Rating 2 out of 5',
 {'Index': 2,
  'Profile Info': {'Name': 's.penkevich',
   'Link Profile': 'https://www.goodreads.com/user/show/6431467-s-penkevich',
   'An Author': False,
   'Books': None,
   'Reviews Amount': '1,391 reviews',
   'Followers Amount': '11.6k followers'},
  'Rating': 'Rating 4 out of 5',
  'Content': '‘Don’t ghosts just want to be remembered?’A book about fucking around and finding out.The question of who should or shouldn’t tell a story has been a hotly debated subject, a discourse that must also recognize the playing field is guided by rules of capitalism in a for-profit publishing industry and a social climate that prods “culture wars” to increase clicks. Still in recent memory are the debates overAmerican Dirt

In [5]:
# Convert the list of review dictionaries to a polars DataFrame.
# Each review's nested 'Profile Info' dictionary ends up in a single column,
# which is unnested into separate columns in the following cell.
df = pl.DataFrame(review)

In [6]:
# Flatten the reviewer information, keep only the columns needed for analysis
# and turn the scraped text fields into integers.
df = df.unnest('Profile Info').drop('Link Profile', 'Index', 'Books', 'An Author').with_columns(
    # '1,391 reviews' / '1 review' -> 1391 / 1.
    # The 'Not Found' placeholder contains no digits, so it becomes null
    # instead of aborting the cast; replace_all strips every thousands separator.
    pl.col('Reviews Amount').str.extract(r'(\d[\d,]*)', group_index=1)
        .str.replace_all(',', '', literal=True).cast(pl.Int32, strict=False).alias('Reviews Amount'),
    # '1,059 followers' -> 1059 and '11.6k followers' -> 11600:
    # parse the leading number, scale it by 1000 when it carries a k/K suffix,
    # and round before the integer cast so float noise cannot truncate the result.
    (
        pl.col('Followers Amount').str.extract(r'(\d[\d,]*(?:\.\d+)?)', group_index=1)
            .str.replace_all(',', '', literal=True).cast(pl.Float64, strict=False)
        * pl.when(pl.col('Followers Amount').str.contains(r'\d[kK]\b')).then(1000).otherwise(1)
    ).round().cast(pl.Int32).alias('Followers Amount'),
    # 'Rating 4 out of 5' -> 4; 'No Rating Given' does not match and becomes null
    pl.col('Rating').str.extract(r'Rating (\d+) out of').cast(pl.Int32).alias('Rating')
    )

In [7]:
# Display the cleaned DataFrame
df

Name,Reviews Amount,Followers Amount,Rating,Content
str,i32,i32,i32,str
"""ishika""",64,1059,2,"""2.5/5i feel weird writing this…"
"""s.penkevich""",1391,11600,4,"""‘Don’t ghosts just want to be …"
"""idiomatic""",545,16,2,"""lmao.the frustration, as alway…"
"""Bookishrealm""",2915,6112,5,"""Whew child. Some of these revi…"
"""Emily May""",2109,315000,4,"""If publishing is rigged, you m…"
…,…,…,…,…
"""Chelsea Humphrey""",1487,82400,5,"""”Who has the right to write ab…"
"""Antje ❦""",163,7,5,"""I'M CRYING, THROWING UP, RIPPI…"
"""Anna Bartłomiejczyk""",179,4325,5,"""Dziś doszłam do wniosku, że Re…"
"""lulu""",287,2109,2,"""well it was definitely a book"""
