In [37]:
from bs4 import BeautifulSoup as bs
import urllib.request
from urllib.parse import urljoin
import pandas as pd
import requests
import json

# Web scraping book titles, authors, and descriptions


In [45]:
# Goodreads shelf page whose book listing is scraped below
url = "https://www.goodreads.com/shelf/show/ebooks"

In [46]:
# Make the HTTP request for the shelf page.
# The timeout (seconds) keeps the notebook from hanging forever on a stalled
# connection, and raise_for_status() stops here on a 4xx/5xx answer instead
# of letting the following cells parse an error page.
page = requests.get(url, timeout=30)
page.raise_for_status()

In [47]:
# Keep the body of the shelf response; it is handed to the HTML parser below
html = page.content

In [48]:
# Parse the downloaded shelf page with Python's built-in HTML parser
soup = bs(markup=html, features='html.parser')

In [49]:
# Save a pretty-printed, UTF-8 encoded copy of the parsed shelf page to disk
with open('ebooksshelf.html', mode='wb') as file:
    file.write(soup.prettify(encoding='utf-8'))

In [50]:
# First layer: every <div class="elementList"> on the shelf page (one per listed entry)
divs = soup.find_all("div", class_="elementList")

In [51]:
# Second layer: the first anchor carrying an href inside each shelf entry.
# find() returns None when an entry holds no such anchor, so those entries are
# dropped here instead of raising on the link['href'] lookup below.
links = [
    anchor
    for anchor in (div.find('a', href=True) for div in divs)
    if anchor is not None
]

# Extracting the partial links
relative_url = [link['href'] for link in links]

# Computing the full url addresses, resolved against the shelf url
full_url = [urljoin(url, relative) for relative in relative_url]

# Filter only the book links (the loop variable deliberately avoids the name `url`)
book_url = [candidate for candidate in full_url if "https://www.goodreads.com/book/show" in candidate]

In [52]:
# Accumulators for the scraped fields; the scraping loop appends to each of them
book_description, book_title, book_author, book_rating, book_pages = ([] for _ in range(5))

In [53]:
# Scrape author, title, description, rating and page count from every page in
# book_url. Exactly one entry per URL is appended to each result list, so the
# lists stay the same length as book_url (which is later used as a column
# alongside them).

REQUEST_TIMEOUT = 30    # seconds; requests waits indefinitely when no timeout is given
MISSING_TEXT = "Nil"    # placeholder for text fields that cannot be extracted
MISSING_RATING = "nan"  # placeholder that still parses with float() in the processing cell
MISSING_PAGES = 0       # placeholder page count


def _nested_text(containers, name, attrs, default):
    """Return the text of the first `name` tag inside the first container.

    containers: result of a find_all() call (may be empty).
    name, attrs: tag name and attribute filter passed on to find_all().
    default: value returned when the container or the tag is missing.
    """
    if not containers:
        return default
    matches = containers[0].find_all(name, attrs)
    return matches[0].text if matches else default


def _record_book(author=MISSING_TEXT, title=MISSING_TEXT, description=MISSING_TEXT,
                 rating=MISSING_RATING, pages=MISSING_PAGES):
    """Append one book's fields to the result lists (placeholders by default)."""
    book_author.append(author)
    book_title.append(title)
    book_description.append(description)
    book_rating.append(rating)
    book_pages.append(pages)


# Start from empty lists so that re-running this cell does not append a
# second copy of every book.
for collected in (book_description, book_title, book_author, book_rating, book_pages):
    collected.clear()

# Loop through the book pages; `book_link` is used instead of `url` so the
# shelf address stored in `url` is not overwritten.
for position, book_link in enumerate(book_url, start=1):

    # Connect to the book page; a network failure skips this book only
    try:
        note_resp = requests.get(book_link, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as error:
        print('Request failed ({}): Skipping URL #{}: {}'.format(error, position, book_link))
        _record_book()
        continue

    # Checking if the request is successful
    if note_resp.status_code != 200:
        print('Status code{}: Skipping URL #{}: {}'.format(note_resp.status_code, position, book_link))
        _record_book()
        continue
    print("URL{}: {}".format(position, book_link))

    # Create a BeautifulSoup object from the page HTML
    note_soup = bs(note_resp.content, 'html.parser')

    # Extract author particulars
    author_text = _nested_text(
        note_soup.find_all("div", {"class": "authorName__container"}), 'a', {}, MISSING_TEXT)

    # Extract title particulars
    title_text = _nested_text(
        note_soup.find_all("div", {"class": "last col"}), 'h1', {}, MISSING_TEXT)

    # Extract description particulars: prefer the second <span>, fall back to
    # the first one, and use the placeholder when there is none.
    description_divs = note_soup.find_all("div", {"class": "readable stacked", "id": "description"})
    description_spans = description_divs[0].find_all("span") if description_divs else []
    if len(description_spans) > 1:
        description_text = description_spans[1].text
    elif description_spans:
        description_text = description_spans[0].text
    else:
        description_text = MISSING_TEXT

    # Extract rating particulars
    rating_text = _nested_text(
        note_soup.find_all("div", {"class": "uitext stacked", "id": "bookMeta"}),
        "span", {"itemprop": "ratingValue"}, MISSING_RATING)

    # Extract page particulars: keep the leading token of e.g. "374 pages".
    # (str.strip(' pages') removes a set of characters, not the suffix.)
    pages_words = _nested_text(
        note_soup.find_all("div", {"class": "row"}),
        "span", {"itemprop": "numberOfPages"}, "").split()
    page_text = pages_words[0] if pages_words else MISSING_PAGES

    _record_book(author_text, title_text, description_text, rating_text, page_text)

URL1: https://www.goodreads.com/book/show/2767052-the-hunger-games
URL2: https://www.goodreads.com/book/show/6148028-catching-fire
URL3: https://www.goodreads.com/book/show/7260188-mockingjay
URL4: https://www.goodreads.com/book/show/13335037-divergent
URL5: https://www.goodreads.com/book/show/1885.Pride_and_Prejudice
URL6: https://www.goodreads.com/book/show/11870085-the-fault-in-our-stars
URL7: https://www.goodreads.com/book/show/11735983-insurgent
URL8: https://www.goodreads.com/book/show/10818853-fifty-shades-of-grey
URL9: https://www.goodreads.com/book/show/3.Harry_Potter_and_the_Sorcerer_s_Stone
URL10: https://www.goodreads.com/book/show/13496.A_Game_of_Thrones
URL11: https://www.goodreads.com/book/show/19288043-gone-girl
URL12: https://www.goodreads.com/book/show/38447.The_Handmaid_s_Tale
URL13: https://www.goodreads.com/book/show/256683.City_of_Bones
URL14: https://www.goodreads.com/book/show/10507293-the-selection
URL15: https://www.goodreads.com/book/show/5297.The_Picture_of_

In [54]:
#### Tidy the scraped values ####

# Titles and descriptions lose their surrounding whitespace; ratings are
# converted to floats and page counts to ints.
revised_book_title = [raw_title.strip() for raw_title in book_title]
revised_book_description = [raw_text.strip() for raw_text in book_description]
revised_book_rating = [float(raw_rating.strip()) for raw_rating in book_rating]
revised_book_pages = [int(raw_pages) for raw_pages in book_pages]

#### Assemble the dataframe ####

# One column per scraped field, in display order
book_df = pd.DataFrame({
    "Book Title": revised_book_title,
    "Author": book_author,
    "Description": revised_book_description,
    "Rating": revised_book_rating,
    "Pages": revised_book_pages,
    "Links": book_url,
})

# Show the first rows
book_df.head()

Unnamed: 0,Book Title,Author,Description,Rating,Pages,Links
0,The Hunger Games,Suzanne Collins,"Could you survive on your own in the wild, wit...",4.32,374,https://www.goodreads.com/book/show/2767052-th...
1,Catching Fire,Suzanne Collins,Sparks are igniting.Flames are spreading.And t...,4.29,391,https://www.goodreads.com/book/show/6148028-ca...
2,Mockingjay,Suzanne Collins,My name is Katniss Everdeen.Why am I not dead?...,4.05,398,https://www.goodreads.com/book/show/7260188-mo...
3,Divergent,Veronica Roth,"In Beatrice Prior's dystopian Chicago world, s...",4.18,487,https://www.goodreads.com/book/show/13335037-d...
4,Pride and Prejudice,Jane Austen,Alternate cover edition of ISBN 9780679783268S...,4.27,279,https://www.goodreads.com/book/show/1885.Pride...


In [55]:
# Write the table to CSV with a header row and without the row index.
# NOTE(review): the path literal mixes ':' and '\' and does not look like a
# real directory path on either macOS or Windows; confirm the intended
# destination before relying on this export.
book_df.to_csv(
    path_or_buf=r'iCloud Drive: Desktop\goodreads_dataframe.csv',
    header=True,
    index=False,
)