# Scrape Book Reviews from Goodreads

## 1. Single Pages

In [3]:
import requests
from bs4 import BeautifulSoup  
import pandas as pd
import re

def getReviews(page_url):
    """Download one Goodreads book page and return its reviews as a table.

    Parameters
    ----------
    page_url : str
        URL of the book page to fetch.

    Returns
    -------
    pandas.DataFrame
        One row per review with the columns ``name``, ``rating``, ``date``,
        ``review`` and ``like``. ``rating`` is ``None`` when the review header
        carries no star rating. ``like`` is the first run of digits found in
        the like counter, kept as a string, or NaN when there are no digits.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    """
    # Fail loudly on HTTP errors instead of parsing an error page into an
    # empty table, and never hang forever on a stalled connection.
    response = requests.get(page_url, timeout=30)
    response.raise_for_status()
    d = BeautifulSoup(response.text, 'html.parser')

    # Each field is collected document-wide and matched up by position.
    # NOTE(review): this assumes every review exposes all of these elements;
    # if the counts differ, the DataFrame constructor below is expected to
    # reject the unequal-length columns - confirm against the live markup.
    names = [tag.get_text() for tag in d.find_all('a', class_='user')]
    dates = [tag.get_text() for tag in d.find_all('a', class_='reviewDate createdAt right')]
    texts = [tag.get_text() for tag in d.find_all("div", {"class": "reviewText stacked"})]
    likes = [tag.get_text() for tag in d.find_all('span', class_='likeItContainer')]

    # A header only contains a star label when the reviewer "rated it";
    # otherwise record a missing rating so the column stays aligned.
    ratings = []
    for header in d.find_all('div', class_="reviewHeader uitext stacked"):
        stars = None
        if 'rated it' in header.get_text():
            # find() yields None instead of raising when the span is absent.
            stars = header.find('span', class_='staticStars notranslate')
        ratings.append(stars.get_text() if stars is not None else None)

    reviews = pd.DataFrame(
        {'name': names, 'rating': ratings, 'date': dates, 'review': texts, 'like': likes}
    )

    # Keep only the numeric part of the like counter (raw string avoids the
    # invalid "\d" escape) and flatten multi-line review texts.
    reviews['like'] = reviews['like'].str.extract(r'(\d+)', expand=False)
    reviews['review'] = reviews['review'].str.replace('\n', '', regex=False)

    return reviews


In [4]:
# Enter your own URL: the address of the Goodreads book page to scrape.
page_url = 'https://www.goodreads.com/book/show/36320.Tales_of_the_Cthulhu_Mythos'

# Scrape the reviews from that page and display the resulting table.
reviews=getReviews(page_url)
reviews


Unnamed: 0,name,rating,date,review,like
0,Stephen,it was amazing,"Oct 10, 2008",Iä! Iä! Cthulhu Fhtagn!…but not forever:If the...,68.0
1,Werner,it was amazing,"Jul 15, 2009","Note, May 3, 2020: When I read short story col...",25.0
2,Ashley Daviau,it was ok,"Mar 30, 2020",Before I get surprised comments at me giving a...,16.0
3,S̶e̶a̶n̶,,"Nov 08, 2019",Stories read:'The Call of Cthulhu' / H. P. Lov...,9.0
4,Marsha Altman,liked it,"Oct 15, 2011",I can't give it more than three stars because ...,4.0
5,Harris,liked it,"Oct 03, 2013",There are some very interesting stories in thi...,2.0
6,Myridian,it was amazing,"Mar 18, 2008",This is a collection of stories by H. P. Lovec...,2.0
7,John Frasene,liked it,"Nov 12, 2017",This book as a whole was a bit disappointing b...,2.0
8,Edward Taylor,it was amazing,"Apr 28, 2018",Here we have just the right mix of modern auth...,1.0
9,Félix D'Jesús,really liked it,"Nov 02, 2019",Sticks by Karl Edward Wagner 🌟 🌟 🌟 🌟 🌟,1.0


## 2. All Pages with Selenium

In [5]:
import time

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By

# define a new function to get single page data
def singe_page(d):
    """Extract the reviews contained in one parsed page.

    Parameters
    ----------
    d : bs4.BeautifulSoup
        Parsed HTML of a book page. Only ``find_all`` is used on it, and
        ``get_text`` / ``find`` on the elements it returns.

    Returns
    -------
    pandas.DataFrame
        One row per review with the columns ``name``, ``rating``, ``date``,
        ``review`` and ``like``. ``rating`` is ``None`` when the review header
        carries no star rating. ``like`` is the first run of digits found in
        the like counter, kept as a string, or NaN when there are no digits.
    """
    # Each field is collected document-wide and matched up by position.
    # NOTE(review): this assumes every review exposes all of these elements;
    # if the counts differ, the DataFrame constructor below is expected to
    # reject the unequal-length columns - confirm against the live markup.
    names = [tag.get_text() for tag in d.find_all('a', class_='user')]
    dates = [tag.get_text() for tag in d.find_all('a', class_='reviewDate createdAt right')]
    texts = [tag.get_text() for tag in d.find_all("div", {"class": "reviewText stacked"})]
    likes = [tag.get_text() for tag in d.find_all('span', class_='likeItContainer')]

    # A header only contains a star label when the reviewer "rated it";
    # otherwise record a missing rating so the column stays aligned.
    ratings = []
    for header in d.find_all('div', class_="reviewHeader uitext stacked"):
        stars = None
        if 'rated it' in header.get_text():
            # find() yields None instead of raising when the span is absent.
            stars = header.find('span', class_='staticStars notranslate')
        ratings.append(stars.get_text() if stars is not None else None)

    reviews = pd.DataFrame(
        {'name': names, 'rating': ratings, 'date': dates, 'review': texts, 'like': likes}
    )

    # Keep only the numeric part of the like counter (raw string avoids the
    # invalid "\d" escape) and flatten multi-line review texts.
    reviews['like'] = reviews['like'].str.extract(r'(\d+)', expand=False)
    reviews['review'] = reviews['review'].str.replace('\n', '', regex=False)

    return reviews

In [6]:
def getReviews_2(page_url, n_pages=5):
    """Collect the reviews from the first ``n_pages`` review pages of a book.

    The first page is downloaded with ``requests``; every further page is
    reached by loading ``page_url`` in a Safari session driven by Selenium
    and clicking the pagination link whose text is the page number.

    Parameters
    ----------
    page_url : str
        URL of the book page to scrape.
    n_pages : int, optional
        Total number of review pages to collect, first page included.
        Defaults to 5 (the first page plus pages 2 to 5).

    Returns
    -------
    pandas.DataFrame
        The per-page tables produced by ``singe_page`` concatenated in page
        order. The index is not reset, so row labels restart at 0 for each
        page.

    Raises
    ------
    requests.HTTPError
        If the first-page request answers with an error status code.
    """
    # First page: plain HTTP is enough. Fail loudly on HTTP errors and never
    # hang forever on a stalled connection.
    response = requests.get(page_url, timeout=30)
    response.raise_for_status()
    frames = [singe_page(BeautifulSoup(response.text, 'html.parser'))]

    if n_pages > 1:
        # One browser session serves all pages. No driver path is passed:
        # Selenium's Safari driver already defaults to /usr/bin/safaridriver.
        driver = webdriver.Safari()
        try:
            for page_number in range(2, n_pages + 1):
                driver.get(page_url)
                time.sleep(2)  # let the page finish loading
                try:
                    link = driver.find_element(By.LINK_TEXT, str(page_number))
                except NoSuchElementException:
                    # The book has fewer review pages than requested.
                    break
                link.click()
                time.sleep(3)  # wait for the selected page of reviews to render
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                frames.append(singe_page(soup))
        finally:
            # Always release the browser session, even if a page fails.
            driver.quit()

    return pd.concat(frames)
        

In [7]:
# Enter your own URL: the address of the Goodreads book page to scrape.
# Alternative example (disabled):
#page_url = 'https://www.goodreads.com/book/show/52578297-the-midnight-library?from_choice=true'
page_url = 'https://www.goodreads.com/book/show/36320.Tales_of_the_Cthulhu_Mythos'
# Scrape the first page plus the following review pages and display the combined table.
reviews=getReviews_2(page_url)
reviews

  driver.find_element_by_link_text(page).click()


Unnamed: 0,name,rating,date,review,like
0,Stephen,it was amazing,"Oct 10, 2008",Iä! Iä! Cthulhu Fhtagn!…but not forever:If the...,68
1,Werner,it was amazing,"Jul 15, 2009","Note, May 3, 2020: When I read short story col...",25
2,Ashley Daviau,it was ok,"Mar 30, 2020",Before I get surprised comments at me giving a...,16
3,S̶e̶a̶n̶,,"Nov 08, 2019",Stories read:'The Call of Cthulhu' / H. P. Lov...,9
4,Marsha Altman,liked it,"Oct 15, 2011",I can't give it more than three stars because ...,4
...,...,...,...,...,...
2,Kelly,liked it,"Jan 02, 2008",another book i would like to reread.,
3,Andrew Black,it was amazing,"Jan 17, 2011",Possibly one of the best introductory antholog...,
4,Dale,liked it,"Jun 13, 2011",A decent cross section of Cthulhu mythos stori...,
5,James Kinniburgh,,"Feb 13, 2013",Could not download this book.,


https://www.goodreads.com/book/show/806933.Tales_of_the_Cthulhu_Mythos#