In [1]:
# !pip3 install bs4
# !pip install requests-html

In [2]:
from bs4 import BeautifulSoup
import requests
import itertools
import pandas as pd
import time
from urllib.request import urlopen
import urllib.error

# Good Soup

In [3]:
# Fetch a single book page to prototype the parsing in the cells below.
# The timeout keeps this cell from hanging indefinitely on a stalled connection,
# and raise_for_status() surfaces HTTP errors here instead of as confusing
# "NoneType has no attribute ..." failures when parsing an error page.
response = requests.get('https://www.goodreads.com/book/show/136251.Harry_Potter_and_the_Deathly_Hallows', timeout=30)
response.raise_for_status()

In [4]:
soup = BeautifulSoup(response.text, 'html.parser')

In [5]:
print(soup.prettify())

<!DOCTYPE html>
<html class="desktop withSiteHeaderTopFullImage">
 <head prefix="og: http://ogp.me/ns# fb: http://ogp.me/ns/fb# good_reads: http://ogp.me/ns/fb/good_reads#">
  <title>
   Harry Potter and the Deathly Hallows by J.K. Rowling
  </title>
  <meta content="Harry Potter and the Deathly Hallows book. Read 71,716 reviews from the world's largest community for readers. It's no longer safe for Harry at Hogwarts,..." name="description"/>
  <meta content="telephone=no" name="format-detection"/>
  <link href="https://www.goodreads.com/book/show/136251.Harry_Potter_and_the_Deathly_Hallows" rel="canonical"/>
  <meta content="2415071772" property="fb:app_id"/>
  <meta content="books.book" property="og:type"/>
  <meta content="Harry Potter and the Deathly Hallows (Harry Potter, #7)" property="og:title"/>
  <meta content="It's no longer safe for Harry at Hogwarts, so he and his best friends, Ron and Hermione, are on the run. Professor Dumbledore has given t..." property="og:description"/

# Book Metadata

In [6]:
# Locate the <h1 id="bookTitle"> element that holds the book title.
# (The keyword form soup.find("h1", id="bookTitle") is the alternative spelling of this lookup.)
book_title = soup.find("h1", attrs={"id": "bookTitle"})
# get_text() returns the raw text, including the surrounding newlines and spaces.
book_title.get_text()

'\n      Harry Potter and the Deathly Hallows\n'

In [7]:
author = soup.find("span", attrs={"itemprop": "name"})
author.get_text()

'J.K. Rowling'

In [8]:
avg_rating = soup.find("span", attrs={"itemprop": "ratingValue"})
avg_rating.get_text()

'\n  4.61\n'

In [9]:
num_ratings = soup.find("meta", attrs={"itemprop": "ratingCount"})
num_ratings['content']

'3121093'

In [10]:
num_reviews = soup.find("meta", attrs={"itemprop": "reviewCount"})
num_reviews['content']

'71716'

In [11]:
num_pages = soup.find("span", attrs={"itemprop": "numberOfPages"})
num_pages.get_text()

'759 pages'

# Book Review Data

In [12]:
review_boxes = soup.findAll("div", attrs={"class": "left bodycol"})
review_boxes

[<div class="left bodycol">
 <div class="reviewHeader uitext stacked">
 <a class="reviewDate createdAt right" href="/review/show/17719129?book_show_action=true">Mar 13, 2008</a>
 <span itemprop="author" itemscope="" itemtype="http://schema.org/Person">
 <a class="user" href="/user/show/992038-tara" itemprop="url" name="Tara" title="Tara">Tara</a>
 </span>
 
         rated it
         <span class="staticStars notranslate" title="did not like it"><span class="staticStar p10" size="15x15">did not like it</span><span class="staticStar p0" size="15x15"></span><span class="staticStar p0" size="15x15"></span><span class="staticStar p0" size="15x15"></span><span class="staticStar p0" size="15x15"></span></span>
 <div><span class="uitext greyText">Recommends it for: </span>
 <span class="uitext reviewText">desperate Harry Potter Fans</span></div>
 <div class="uitext greyText bookshelves">
             Shelves:
               <a class="actionLinkLite" href="/review/list/992038-tara?shelf=childre

In [13]:
len(review_boxes)

30

In [14]:
def get_star_rating(review_box):
    """Return the star rating (1-5) given in one review box, or None.

    Parameters
    ----------
    review_box : object exposing ``find(name, attrs=...)``
        One review container, e.g. a BeautifulSoup tag.

    Returns
    -------
    int or None
        The numeric rating mapped from the rating span's ``title`` text.
        None when the review has no rating span, or when the title is missing
        or not one of the five known phrases.
    """
    rating = review_box.find("span", attrs={"class": "staticStars notranslate"})

    # Some reviews carry no star rating at all.
    if rating is None:
        return None

    rating_dict = {"it was amazing": 5,
                   "really liked it": 4,
                   "liked it": 3,
                   "it was ok": 2,
                   "did not like it": 1}

    # .get() on both lookups: an unexpected or absent title yields None rather
    # than a KeyError that would abort a long scraping run.
    return rating_dict.get(rating.get("title"))

for review_box in review_boxes:
    print(get_star_rating(review_box))

1
5
5
5
5
5
5
None
5
5
5
1
5
5
5
5
5
5
4
5
5
5
5
5
5
5
5
5
5
5


In [15]:
def _second_child_text(container):
    """Return the text of ``container``'s second child with newlines flattened, or None if it has fewer than two children."""
    children = list(container.children)
    # The 0th child is a bare "\n", so the review text sits at index 1.
    if len(children) < 2:
        return None
    return children[1].get_text().replace("\n", " ")


def get_review_text(review_box):
    """Extract the review text from one review box as a single line.

    Lookup order:
      1. If the review is hidden behind a spoiler warning (a ``readable`` span
         with ``display:none``), search inside that span; otherwise search the
         whole review box.
      2. Prefer a nested ``display:none`` span, which holds the full text of
         reviews truncated behind a "read more" link.
      3. Otherwise use the second child of the ``readable`` span.

    Parameters
    ----------
    review_box : object exposing ``find(name, attrs=...)``
        One review container, e.g. a BeautifulSoup tag.

    Returns
    -------
    str or None
        The review text with newlines replaced by spaces, or None when no
        review text can be located.
    """
    hidden_attrs = {"style": "display:none"}

    # Is the whole review hidden because of spoilers?
    spoiler_span = review_box.find("span", attrs={"class": "readable", "style": "display:none"})
    search_root = review_box if spoiler_span is None else spoiler_span

    # Does it have a "read more" section holding the full text?
    full_text_span = search_root.find("span", attrs=hidden_attrs)
    if full_text_span is not None:
        return full_text_span.get_text().replace("\n", " ")

    # Short review: the text is directly inside the readable span.
    text_container = spoiler_span
    if text_container is None:
        text_container = review_box.find("span", attrs={"class": "readable"})
    if text_container is None:
        return None

    return _second_child_text(text_container)

# first = get_review_text(review_boxes[0])
# print("...more" in first)
# # print(get_review_text(review_boxes[1]))

for review_box in review_boxes:
    print(get_review_text(review_box))

“I’m going to keep going until I succeed — or die. Don’t think I don’t know how this might end. I’ve known it for years.” — Harry PotterMost seventeen-year olds don’t view the possibility of an early death as being, well, possible. But then again, most seventeen-year olds haven’t come face-to-face with death almost half a dozen times before their first kiss either.In Harry Potter and the Deathly Hallows, the seventh and final installment of the ridiculously popular Harry Potter series, J.K. Rowling brilliantly ties up every loose end that she has planted over the last ten years since the very first outing of the series was published in 1997. Truly, Rowling has learned exactly what her fans want and subsequently delivers a book that answers every Potterhead’s questions — and then some. Not only does Deathly Hallows revisit key places and characters from all of the previous six books, but Rowling even manages to make clever references to previous bits of dialogue from her earlier books. 

In [16]:
def get_num_likes(review_box):
    """Return the number of likes shown on one review box.

    Parameters
    ----------
    review_box : object exposing ``find(name, attrs=...)``
        One review container, e.g. a BeautifulSoup tag.

    Returns
    -------
    int
        The leading number of the ``likesCount`` span's text (format
        "### likes"). 0 when the span is absent or its text does not start
        with a number.
    """
    likesCount = review_box.find("span", attrs={"class": "likesCount"})
    if likesCount is None:
        return 0

    # split() tolerates leading/trailing whitespace and text with no space at
    # all, where str.index(" ") would raise ValueError or pick an empty prefix.
    tokens = likesCount.get_text().split()
    if not tokens:
        return 0

    # Drop thousands separators in case the count is rendered as "1,384".
    digits = tokens[0].replace(",", "")
    return int(digits) if digits.isdigit() else 0


for review_box in review_boxes:
    print(get_num_likes(review_box))

632
1384
1299
698
550
517
503
428
847
338
332
286
279
244
236
225
226
221
200
199
192
191
180
178
172
162
161
146
149
140


# How to get to the next batch of reviews?

In [17]:
# maybe just do 30 reviews per book?

# How to get the next book in the list?

In [18]:
# Goodreads lists to crawl. Each URL is annotated with the list name the author
# recorded for it, in the original order (presumably chosen to cover both
# highly rated and poorly rated books).

book_lists= ["https://www.goodreads.com/list/show/10198", # Books With a Goodreads Average Rating of 4.5 and above and With At Least 100 Ratings
            "https://www.goodreads.com/list/show/74717", # Books With a Goodreads Average Rating of 4.3 and Above
            "https://www.goodreads.com/list/show/24320", # Books With a Goodreads Average Rating of 4.2 and Above
            "https://www.goodreads.com/list/show/165313", # Books With a Goodreads Average Rating of 4.0 and above and With At Least 30,000 Ratings
            "https://www.goodreads.com/list/show/24328", # Favorite Poorly Rated Books
            "https://www.goodreads.com/list/show/23974"] # Worst Rated Books on Goodreads

In [19]:
# Fetch one list page (index 4, "Favorite Poorly Rated Books") to prototype list parsing.
# The timeout prevents an indefinite hang; raise_for_status() fails fast on an
# HTTP error instead of handing an error page to the parser.
list_response = requests.get(book_lists[4], timeout=30)
list_response.raise_for_status()
list_soup = BeautifulSoup(list_response.text, 'html.parser')

In [20]:
print(list_soup.prettify())

<!DOCTYPE html>
<html class="desktop withSiteHeaderTopFullImage">
 <head>
  <title>
   Favorite Poorly Rated Books (1480 books)
  </title>
  <meta content="1,480 books based on 255 votes: The Scarlet Letter by Nathaniel Hawthorne, The Canterbury Tales by Geoffrey Chaucer, Beowulf by Unknown, Moby-Dick or, th..." name="description"/>
  <meta content="telephone=no" name="format-detection"/>
  <link href="https://www.goodreads.com/list/show/24328.Favorite_Poorly_Rated_Books" rel="canonical"/>
  <script type="text/javascript">
   var ue_t0=window.ue_t0||+new Date();
  </script>
  <script type="text/javascript">
   var ue_mid = "A1PQBFHBHS6YH1";
    var ue_sn = "www.goodreads.com";
    var ue_furl = "fls-na.amazon.com";
    var ue_sid = "182-5705384-1303553";
    var ue_id = "0HH9PBTW39N8W4C68SH9";

    (function(e){var c=e;var a=c.ue||{};a.main_scope="mainscopecsm";a.q=[];a.t0=c.ue_t0||+new Date();a.d=g;function g(h){return +new Date()-(h?0:a.t0)}function d(h){return function(){a.q.push({n

In [21]:
book_title_a_tags = list_soup.findAll("a", attrs={"itemprop": "url", "class": "bookTitle"})
domain = "https://www.goodreads.com"
for a_tag in book_title_a_tags:
    print(domain+a_tag['href'])

https://www.goodreads.com/book/show/12296.The_Scarlet_Letter
https://www.goodreads.com/book/show/2696.The_Canterbury_Tales
https://www.goodreads.com/book/show/52357.Beowulf
https://www.goodreads.com/book/show/153747.Moby_Dick_or_the_Whale
https://www.goodreads.com/book/show/37442.Wicked
https://www.goodreads.com/book/show/7733.Gulliver_s_Travels
https://www.goodreads.com/book/show/4900.Heart_of_Darkness
https://www.goodreads.com/book/show/12898.Death_of_a_Salesman
https://www.goodreads.com/book/show/13497818-the-casual-vacancy
https://www.goodreads.com/book/show/7437.Naked_Lunch
https://www.goodreads.com/book/show/32049.Lady_Chatterley_s_Lover
https://www.goodreads.com/book/show/6101718-the-magicians
https://www.goodreads.com/book/show/18943.Confessions_of_an_Ugly_Stepsister
https://www.goodreads.com/book/show/5148.A_Separate_Peace
https://www.goodreads.com/book/show/18414.Utopia
https://www.goodreads.com/book/show/104778.The_Merry_Wives_of_Windsor
https://www.goodreads.com/book/show/5

# How to get the next page in the list?

In [22]:
list_page_number = 1 # increment
list_page_num_url_suffix = f"?page={list_page_number}"

In [23]:
for book_url in book_lists:
    print(book_url+list_page_num_url_suffix)

https://www.goodreads.com/list/show/10198?page=1
https://www.goodreads.com/list/show/74717?page=1
https://www.goodreads.com/list/show/24320?page=1
https://www.goodreads.com/list/show/165313?page=1
https://www.goodreads.com/list/show/24328?page=1
https://www.goodreads.com/list/show/23974?page=1


In [24]:
def get_num_pages(list_soup):
    """Return the number of pages in a book list, read from its pagination bar.

    The last page number is taken from the third-from-last child of the
    ``div.pagination`` element (the final children being the last page link,
    a separator, and the "next" link).

    Parameters
    ----------
    list_soup : object exposing ``find(name, class_=...)``
        Parsed list page, e.g. a BeautifulSoup document.

    Returns
    -------
    int
        The last page number, or 1 when the page has no pagination element
        (assumed to mean the list fits on a single page -- TODO confirm).

    Raises
    ------
    ValueError
        If the pagination element has fewer than three children, or the
        selected child's text is not an integer.
    """
    pagination = list_soup.find("div", class_="pagination")

    # No pagination bar: treat as a one-page list instead of crashing with
    # AttributeError on None.
    if pagination is None:
        return 1

    # Materialise once so the children can be indexed from the end.
    children = list(pagination.children)
    if len(children) < 3:
        raise ValueError("pagination element has fewer than three children")

    return int(children[-3].get_text())

print(get_num_pages(list_soup))

15


# Get the data
## Loop Structure

In [25]:
# for each book list
    # get soup of book list 1st page
    # find last_page
    # for each page in book list
        # get soup of page
        # get url of every book in the list on that page
        # for each book url
            # get soup of book url
            # get book title
            # get author name
            # get avg rating
            # get num ratings
            # get num reviews
            # get all review boxes
            # for each review box
                # get star rating
                # get review text
                # get num likes
                # validate the data
            # add new row to dataframe
        # iterate to the next page
    # iterate to the next book list


In [40]:
# "max" parameters can be -1 if you want no max
def _fetch_html(url, sleep_length, error_message, max_retries):
    """Download ``url`` and return the raw body, or None after ``max_retries`` failed attempts."""
    for attempt in range(max(1, max_retries)):
        try:
            # The context manager closes the connection even if read() fails.
            with urlopen(url) as response:
                body = response.read()
        except urllib.error.URLError:
            print(error_message)
            # Wait progressively longer before retrying instead of hammering the server.
            time.sleep(sleep_length * 2 ** attempt)
        else:
            # Politeness delay after every successful request.
            time.sleep(sleep_length)
            return body
    return None


def _over_limit(count, limit):
    """Return True when ``count`` exceeds ``limit``; a ``limit`` of -1 means no limit."""
    return limit != -1 and count > limit


def get_book_data(max_lists, max_pages_per_list, max_books, max_reviews, book_lists, sleep_length, max_retries=5):
    """Scrape book metadata and reviews from a collection of book-list URLs.

    For every list, each page is opened, every book linked from that page is
    opened, and the book's metadata plus the reviews shown on its page are
    recorded. A book whose title was already recorded is skipped.

    Parameters
    ----------
    max_lists, max_pages_per_list, max_books, max_reviews : int
        Upper bounds on how many lists / pages per list / books / reviews are
        processed. Pass -1 for "no limit". Scraping stops and the data gathered
        so far is returned as soon as the list, book or review limit is passed.
    book_lists : iterable of str
        Base URLs of the lists to crawl; ``?page=N`` is appended for each page.
    sleep_length : float
        Seconds to pause after each successful request (also the base delay
        between retries).
    max_retries : int, optional
        Attempts per URL before giving up on it and moving on (default 5).

    Returns
    -------
    (books_df, reviews_df) : tuple of pandas.DataFrame
        ``books_df`` has columns book_title, author, url, avg_rating,
        num_ratings, num_reviews. ``reviews_df`` has columns book_title,
        review_text, num_likes, star_rating (nullable integer).
    """
    print("Starting get_book_data")
    book_columns = ["book_title", "author", "url", "avg_rating", "num_ratings", "num_reviews"]
    review_columns = ["book_title", "review_text", "num_likes", "star_rating"]
    domain = "https://www.goodreads.com"

    book_lists_count, books_count, reviews_count = 0, 0, 0

    # Rows are collected in plain lists and converted once at the end; growing a
    # DataFrame row by row is quadratic and DataFrame.append no longer exists
    # in pandas 2.x.
    book_rows, review_rows = [], []
    seen_titles = set()

    def build_frames():
        books_df = pd.DataFrame(book_rows, columns=book_columns)
        reviews_df = pd.DataFrame(review_rows, columns=review_columns)
        # Reviews without a rating have star_rating None; a nullable integer
        # column keeps the rated ones as whole numbers rather than floats.
        reviews_df["star_rating"] = reviews_df["star_rating"].astype("Int64")
        return books_df, reviews_df

    for book_list in book_lists:

        book_lists_count += 1
        if _over_limit(book_lists_count, max_lists):
            return build_frames()

        print("Opening new list:", book_list)

        book_list_response = _fetch_html(book_list, sleep_length, "\tbook_list_response error", max_retries)
        if book_list_response is None:
            print("\tGiving up on list:", book_list)
            continue
        book_list_soup = BeautifulSoup(book_list_response, 'html.parser')

        max_pages = get_num_pages(book_list_soup)

        # -1 means "all pages"; only clamp when a real limit was given.
        if max_pages_per_list != -1 and max_pages_per_list < max_pages:
            max_pages = max_pages_per_list

        for page_num in range(1, max_pages + 1):

            page_url = f"{book_list}?page={page_num}"
            print("\tOpening new page:", page_url)

            page_response = _fetch_html(page_url, sleep_length, "\t\tpage_response error", max_retries)
            if page_response is None:
                print("\t\tGiving up on page:", page_url)
                continue
            page_soup = BeautifulSoup(page_response, 'html.parser')

            book_a_tags = page_soup.find_all("a", attrs={"itemprop": "url", "class": "bookTitle"})
            book_page_links = [domain + a_tag['href'] for a_tag in book_a_tags]

            for url in book_page_links:
                books_count += 1
                if _over_limit(books_count, max_books):
                    return build_frames()

                print(f"\t\tOpening book URL {books_count}: {url}")

                book_response = _fetch_html(url, sleep_length, "\t\t\tbook_response error", max_retries)
                if book_response is None:
                    print("\t\t\tGiving up on book:", url)
                    continue
                book_soup = BeautifulSoup(book_response, 'html.parser')

                book_title_tag = book_soup.find("h1", attrs={"id": "bookTitle", "itemprop": "name"})
                if book_title_tag is None:
                    print("\t\t\tSkipping book, no title found:", url)
                    continue

                book_title = book_title_tag.get_text().strip()
                if book_title in seen_titles:
                    continue

                author_tag = book_soup.find("span", attrs={"itemprop": "name"})
                avg_rating_tag = book_soup.find("span", attrs={"itemprop": "ratingValue"})
                num_ratings_tag = book_soup.find("meta", attrs={"itemprop": "ratingCount"})
                num_reviews_tag = book_soup.find("meta", attrs={"itemprop": "reviewCount"})

                # A page missing any of these would otherwise raise and abort
                # the whole crawl; skip just this book instead.
                if any(tag is None for tag in (author_tag, avg_rating_tag, num_ratings_tag, num_reviews_tag)):
                    print("\t\t\tSkipping book, incomplete metadata:", url)
                    continue

                seen_titles.add(book_title)

                author = author_tag.get_text()
                avg_rating = float(avg_rating_tag.get_text().strip())
                num_ratings = int(num_ratings_tag['content'])
                num_reviews = int(num_reviews_tag['content'])

                book_rows.append([book_title, author, url, avg_rating, num_ratings, num_reviews])

                review_boxes = book_soup.find_all("div", attrs={"class": "left bodycol"})
                for review_box in review_boxes:

                    reviews_count += 1
                    if _over_limit(reviews_count, max_reviews):
                        return build_frames()

                    star_rating = get_star_rating(review_box)
                    review_text = get_review_text(review_box)
                    num_likes = get_num_likes(review_box)

                    review_rows.append([book_title, review_text, num_likes, star_rating])

    return build_frames()

In [41]:
book_lists

['https://www.goodreads.com/list/show/10198',
 'https://www.goodreads.com/list/show/74717',
 'https://www.goodreads.com/list/show/24320',
 'https://www.goodreads.com/list/show/165313',
 'https://www.goodreads.com/list/show/24328',
 'https://www.goodreads.com/list/show/23974']

In [42]:
books, reviews = get_book_data(max_lists=-1, max_pages_per_list=5, max_books=-1, max_reviews=-1, book_lists=book_lists, sleep_length=1/16)

Starting get_book_data
Opening new list: https://www.goodreads.com/list/show/10198
	Opening new page: https://www.goodreads.com/list/show/10198?page=1
		Opening book URL 1: https://www.goodreads.com/book/show/136251.Harry_Potter_and_the_Deathly_Hallows
		Opening book URL 2: https://www.goodreads.com/book/show/186074.The_Name_of_the_Wind
		Opening book URL 3: https://www.goodreads.com/book/show/7235533-the-way-of-kings
		Opening book URL 4: https://www.goodreads.com/book/show/1215032.The_Wise_Man_s_Fear
		Opening book URL 5: https://www.goodreads.com/book/show/17332218-words-of-radiance
		Opening book URL 6: https://www.goodreads.com/book/show/18335634-clockwork-princess
		Opening book URL 7: https://www.goodreads.com/book/show/62291.A_Storm_of_Swords
		Opening book URL 8: https://www.goodreads.com/book/show/5.Harry_Potter_and_the_Prisoner_of_Azkaban
		Opening book URL 9: https://www.goodreads.com/book/show/1.Harry_Potter_and_the_Half_Blood_Prince
		Opening book URL 10: https://www.good

# See the data

In [43]:
books

Unnamed: 0,book_title,author,url,avg_rating,num_ratings,num_reviews
0,Harry Potter and the Deathly Hallows,J.K. Rowling,https://www.goodreads.com/book/show/136251.Har...,4.61,3121106,71716
1,The Name of the Wind,Patrick Rothfuss,https://www.goodreads.com/book/show/186074.The...,4.51,805869,46110
2,The Way of Kings,Brandon Sanderson,https://www.goodreads.com/book/show/7235533-th...,4.60,367613,24154
3,The Wise Man's Fear,Patrick Rothfuss,https://www.goodreads.com/book/show/1215032.Th...,4.53,477860,23603
4,Words of Radiance,Brandon Sanderson,https://www.goodreads.com/book/show/17332218-w...,4.73,251185,14905
...,...,...,...,...,...,...
2741,By a Lady: Being the Adventures of an Enlighte...,Amanda Elyot,https://www.goodreads.com/book/show/157413.By_...,2.84,330,70
2742,The Creator's Map,Emilio Calderón,https://www.goodreads.com/book/show/1086286.Th...,2.90,315,50
2743,The Glove of Darth Vader,Paul Davids,https://www.goodreads.com/book/show/1148191.Th...,2.76,833,59
2744,J,Howard Jacobson,https://www.goodreads.com/book/show/22370991-j,2.93,3467,569


In [44]:
"Columbine" in set(books["book_title"])

True

In [45]:
books["book_title"].value_counts()

Harry Potter and the Deathly Hallows    1
Zone One                                1
The Short Second Life of Bree Tanner    1
When We Were Orphans                    1
Saturn's Children                       1
                                       ..
Keeper of the Lost Cities               1
When Nietzsche Wept                     1
The Labyrinth of the Spirits            1
A Little Hatred                         1
The Twelve                              1
Name: book_title, Length: 2746, dtype: int64

In [46]:
reviews

Unnamed: 0,book_title,review_text,num_likes,star_rating
0,Harry Potter and the Deathly Hallows,"Ok, before I start a few warnings. This will c...",632,1
1,Harry Potter and the Deathly Hallows,“I’m going to keep going until I succeed — or ...,1384,5
2,Harry Potter and the Deathly Hallows,It's hard for me to believe that I finished th...,1299,5
3,Harry Potter and the Deathly Hallows,I can't believe its over.... I've FINALLY read...,698,5
4,Harry Potter and the Deathly Hallows,(A) 86% | ExtraordinaryNotes: It ends too expo...,550,5
...,...,...,...,...
82632,The Twelve,You have to accept this book as a work of fict...,0,3
82633,The Twelve,I figured how this story was going to go judgi...,0,1
82634,The Twelve,"What an incredible book!! At the core, the eve...",0,5
82635,The Twelve,"Max, this ""special human being"" has a long jo...",0,3


# Store the data

In [47]:
books.to_csv("books.csv", header=True)

In [48]:
reviews.to_csv("reviews.csv", header=True)

# How much data?
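First 5 pages of each of the 6 book lists, at 100 books per page: 5 × 100 × 6 = 3,000 books at most (a title that appears on more than one list is stored only once, so the actual count is lower).  
Roughly 30 reviews are scraped per book, so about 90,000 reviews at most.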