# Web Scraping Goodreads

This notebook scrapes the required book data from the goodreads.com list "Books That Everyone Should Read At Least Once", using Beautiful Soup.
The list contains 24,529 books.

## Import Libraries

In [46]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import math
import re
import time
import random
import glob

## Fetch book information

In [None]:
# book title - Done

#book series - Done 

#book genres - Done

#book author - Done

#book description - Done

#book year published - Done

#number of pages - Done 

#language - Done 

#isbn10 - Done

#av rating - Done 

#1 star ratings, 2 star ratings .. - will not do 

#number of total ratings - Done

#book cover link - Done 

In [131]:
# Download one sample book page; the resulting `soup` is what the extractor
# functions in the following cells are tried out on.
books = requests.get("https://www.goodreads.com/book/show/32297830-recreant")
# Show the HTTP status code of the response.
print("books:", books.status_code)
# Parse the raw response body with the 'html.parser' backend.
soup = BeautifulSoup(books.content, 'html.parser')

books: 200


In [49]:
# NOTE: `prettify` is referenced here without being called, so this prints the
# bound method's repr (see the "<bound method Tag.prettify of ..." output
# below) rather than the string returned by soup.prettify().
print(soup.prettify)

<bound method Tag.prettify of <!DOCTYPE html>

<html class="desktop">
<head prefix="og: http://ogp.me/ns# fb: http://ogp.me/ns/fb# good_reads: http://ogp.me/ns/fb/good_reads#">
<title>Recreant by Geffrey Kane</title>
<meta content="Recreant book. Read 2 reviews from the world's largest community for readers. It has been three days since Father Jack McKenna killed himself; The world ..." name="description"/>
<meta content="telephone=no" name="format-detection"/>
<link href="https://www.goodreads.com/book/show/32181246-recreant" rel="canonical"/>
<meta content="2415071772" property="fb:app_id"/>
<meta content="books.book" property="og:type"/>
<meta content="Recreant" property="og:title"/>
<meta content="It has been three days since Father Jack McKenna killed himself; The world as Daniel knows it is about to change forever... As Father Dan..." property="og:description"/>
<meta content="https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1475206849i/32297830._UY630_SR1200

### Book Title

In [132]:
def get_book_title(soup):
    """Return the book title found on a parsed book page.

    Looks up the element with id ``bookTitle`` and returns its text with
    surrounding whitespace stripped. Returns an empty string when the
    lookup yields nothing.
    """
    title_node = soup.find(id="bookTitle")
    if not title_node:
        return ''
    return title_node.get_text(strip=True)

In [133]:
get_book_title(soup)

'Recreant'

### Book isbn10 

In [134]:
# This function gives back the isbn10 number by browsing the soup as string for "nisbn". 
#The first "nisbn" in the html code is the ISBN number of the chosen book[0].
def get_book_isbn10(soup):
    """Return the first ten-digit ISBN that follows ``nisbn: `` in the page.

    The page is converted with ``str(soup)`` and searched as plain text.

    Returns:
        The ten digits as a string, or the sentinel ``"isbn not found"``
        when the page contains no such marker.
    """
    # A capture group yields the digits directly, so no string splitting is
    # needed and there is no exception to swallow on a miss.
    match = re.search(r'nisbn: (\d{10})', str(soup))
    if match is None:
        return "isbn not found"
    return match.group(1)


In [135]:
get_book_isbn10(soup)

'isbn not found'

### Book Series

In [136]:
def get_book_series(soup):
    """Return the series label of a book, without its surrounding parentheses.

    Reads the first link inside the element with id ``bookSeries``. The link
    text is expected to look like ``(Series Name #1)``; the part between the
    first two parenthesis characters is returned.

    Returns:
        The series label, the unmodified link text when it contains no
        parentheses, or an empty string when the element or its link is
        missing.
    """
    series_box = soup.find(id="bookSeries")
    if not series_box:
        # Page has no series element at all: treat like "no series".
        return ''
    series_link = series_box.find('a')
    if not series_link:
        return ''
    label = series_link.get_text(strip=True)
    # Normalise both parenthesis kinds to ')' so a single split isolates the
    # text between the first pair.
    pieces = label.replace('(', ')').split(')')
    if len(pieces) < 2:
        # No parentheses to strip; the link text is already the label.
        return label
    return pieces[1]

#not a really generic solution, but works ;) Alternative would have been re.search (...)


In [137]:
get_book_series(soup)

''

### Book Description

In [138]:
def get_book_description(soup):
    """Return the book description text from a parsed book page.

    Reads the child of the element with id ``description`` whose style is
    ``display:none`` and returns its stripped text.

    Returns:
        The description, or an empty string when either the container or
        the hidden child is missing.
    """
    container = soup.find(id="description")
    if not container:
        # Without this guard a page lacking the container raised
        # AttributeError on the chained lookup.
        return ''
    # NOTE(review): only the hidden element is read; presumably it holds the
    # untruncated text - confirm how pages with short descriptions are marked up.
    hidden_text = container.find(style="display:none")
    if not hidden_text:
        return ''
    return hidden_text.get_text(strip=True)

In [139]:
get_book_description(soup)

"It has been three days since Father Jack McKenna killed himself; The world as Daniel knows it is about to change forever... As Father Daniel Whelan struggles to accept the untimely death of his friend and mentor he turns to the one person in his life he loves more than the old priest for comfort; Saoirse. But, the woman he can never give himself to has many dark desires and secrets of her own. None more so than her sinister involvement in the affairs of a decades-long active serial killer, known amongst his cult followers as 'The Master'. Refusing to let go, Daniel pushes himself deeper and deeper into a hidden world that reveals the truth about his friends' past, how and why Jack died, and ultimately forces the young Dublin priest to question the truth of his own faith."

### Book Author

In [140]:
#DONE - This function gives a list with all the authors
def get_book_authors(soup):
    """Return the names of all authors listed on a parsed book page.

    Collects the stripped text of every ``itemprop='name'`` element inside
    the element with id ``bookAuthors``.

    Returns:
        A list of author names, or an empty string when the container is
        missing or holds no name elements.
    """
    author_box = soup.find(id="bookAuthors")
    if not author_box:
        # Without this guard a page lacking the container raised
        # AttributeError on the chained lookup.
        return ''
    name_nodes = author_box.find_all(itemprop='name')
    if not name_nodes:
        return ''
    return [node.get_text(strip=True) for node in name_nodes]

In [141]:
get_book_authors(soup)

['Geffrey Kane']

### Number of pages

In [142]:
def get_book_pages(soup):
    """Return the page count of a book as an integer.

    Reads the ``itemprop='numberOfPages'`` element inside the element with
    id ``details`` and converts the first whitespace-separated token of its
    text to ``int``.

    Returns:
        The page count, or an empty string when the details container or
        the page-count element is missing.

    Raises:
        ValueError: if the first token is not an integer.
        IndexError: if the element's text is empty.
    """
    details = soup.find(id="details")
    if not details:
        # Without this guard a page lacking the container raised
        # AttributeError on the chained lookup.
        return ''
    pages_node = details.find(itemprop='numberOfPages')
    if not pages_node:
        return ''
    return int(pages_node.get_text(strip=True).split()[0])

In [143]:
get_book_pages(soup)

''

### Date first published

In [146]:
def get_book_year(soup):
    """Return the first four-digit number found in the publication info.

    Three places are tried in order and the first one that yields a
    four-digit run wins:

    1. the ``nobr`` element with class ``greyText``
    2. the ``p`` element with ``data-testid='publicationInfo'``
    3. the whole text of the element with id ``details``

    Returns:
        The year as a string, or an empty string when none of the sources
        exists or contains a four-digit number.
    """
    # Lookups are wrapped in lambdas so later sources are only queried when
    # the earlier ones did not produce a year.
    lookups = (
        lambda: soup.find('nobr', attrs={'class': 'greyText'}),
        lambda: soup.find('p', attrs={'data-testid': 'publicationInfo'}),
        lambda: soup.find(id="details"),
    )
    for lookup in lookups:
        node = lookup()
        if not node:
            continue
        match = re.search(r'\d{4}', node.get_text(strip=True))
        if match:
            return match.group(0)
        # Source exists but holds no year: fall through to the next one
        # instead of failing on a missing match.
    return ''

In [147]:
get_book_year(soup)

'2016'

# NOTE: disabled experiment. The header was already commented out; the body
# is commented out as well so the fragment is consistently inactive.
# def changeyear(soup):
#     try:
#         date = re.findall(r'[Pp]ublished ([\w]{1,10} \d{1,4})?(, )?\d{1,4}' , str(soup))
#         print(date)
#         return date
#     except:
#         return "None"

### Language

In [148]:
def get_book_language(soup):
    """Return the text of the ``itemprop='inLanguage'`` element.

    The text is returned with surrounding whitespace stripped; an empty
    string is returned when the lookup yields nothing.
    """
    language_node = soup.find(itemprop="inLanguage")
    if not language_node:
        return ''
    return language_node.get_text(strip=True)

In [149]:
get_book_language(soup)

''

### Book Cover Link

In [150]:
def get_book_cover(soup):
    """Return the ``src`` attribute of the element with id ``coverImage``.

    Returns an empty string when the lookup yields nothing.
    """
    cover_node = soup.find(id="coverImage")
    if not cover_node:
        return ''
    return cover_node.get('src')

In [151]:
get_book_cover(soup)

'https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1475206849l/32297830.jpg'

### Number of total ratings

In [152]:
def get_total_ratings(soup):
    """Return the total number of ratings as shown on the page.

    Reads the ``itemprop='ratingCount'`` element and returns the first
    whitespace-separated token of its text.

    Returns:
        The count as a string (not converted to a number), or an empty
        string when the element is missing or its text is empty. Returning
        an empty string matches what the other extractors in this file
        return for a missing value.
    """
    count_node = soup.find(itemprop='ratingCount')
    if not count_node:
        return ''
    tokens = count_node.get_text(strip=True).split()
    # Empty text yields no tokens; report it as "missing" instead of raising.
    return tokens[0] if tokens else ''

In [153]:
get_total_ratings(soup)

'8'

### Average rating of book

In [154]:
def get_bookavrating(soup):
    """Return the average rating of a book as a float.

    Reads the ``span`` element with ``itemprop='ratingValue'`` and converts
    its stripped text with ``float``. Returns an empty string when the
    lookup yields nothing.
    """
    rating_node = soup.find("span", itemprop="ratingValue")
    if not rating_node:
        return ''
    rating_text = rating_node.get_text(strip=True)
    return float(rating_text)

In [155]:
get_bookavrating(soup)

4.38

### Number of different star ratings - skip due to time reasons

### Book Genres

In [156]:
#This function gets all genres listed under more, can be various amounts
def get_all_book_genres(soup):
    """Return the distinct genre labels of a book in first-seen order.

    Walks every ``div`` with class ``left`` and collects the stripped text
    of each link carrying the class string
    ``actionLinkLite bookPageGenreLink``. Repeated labels are kept once.
    An empty list is returned when nothing matches.
    """
    labels = (
        link.get_text(strip=True)
        for container in soup.find_all('div', class_="left")
        for link in container.find_all('a', class_="actionLinkLite bookPageGenreLink")
    )
    # dict keys keep insertion order, so duplicates drop out without
    # disturbing the order in which labels first appear.
    return list(dict.fromkeys(labels))
    

In [157]:
get_all_book_genres(soup)

[]

In [None]:
# NOTE(review): `gs` and `gs1` are not defined anywhere in the visible part of
# this file - presumably leftover experiments; confirm before running this cell.
gs(soup)

gs1(soup)

In [None]:
# Call prettify() so the indented HTML is printed; without the parentheses
# only the bound method's repr would be shown.
print(soup.prettify())

## Scrape Data

### From one book page

In [None]:
#Done - trying to scrape from one book
title=[]
series=[]
authors=[]
isbn=[]
genres=[]
descriptions=[]
years=[]
pages=[]
languages=[]
setting=[]
av_ratings=[]
total_ratings=[]
cover_image_link=[]



title.append(get_book_title(soup))
series.append(get_book_series(soup))
authors.append(get_book_authors(soup))
isbn.append(get_book_isbn10(soup))
genres.append(get_all_book_genres(soup))
descriptions.append(get_book_description(soup))
pages.append(get_book_isbn10(soup))
years.append(get_book_year(soup))
languages.append(get_book_language(soup))
cover_image_link.append(get_book_cover(soup))
total_ratings.append(get_total_ratings(soup))
av_ratings.append(get_bookavrating(soup))





print((title,series, authors,isbn,descriptions,pages,genres,years,languages,cover_image_link, total_ratings,av_ratings)) 


### Extract all book URLs from List

In [None]:
#Extract all urls of books
def geturls(startpage,endpage):
    """Collect the book URLs from a range of list pages and save them as CSV.

    Requests every list page from ``startpage`` up to, but not including,
    ``endpage``, gathers the ``href`` of each ``a`` element with class
    ``bookTitle`` and writes the absolute URLs to a one-column CSV
    (column ``url``). The file name carries the number of the last page in
    the range.

    Args:
        startpage: first list page to request.
        endpage: stop value of the page range (exclusive).

    Returns:
        None. The result is only written to disk.
    """
    urls_all=[]
    for pagenr in range(startpage,endpage):
        req = requests.get(f'https://www.goodreads.com/list/show/264.Books_That_Everyone_Should_Read_At_Least_Once?page={pagenr}')
        soup = BeautifulSoup(req.content, 'html.parser')
        for link in soup.find_all('a', class_='bookTitle'):
            href = link.get('href')
            if href:
                # Anchors without an href would otherwise break the
                # string concatenation.
                urls_all.append('https://www.goodreads.com' + href)
        # Random pause between page requests.
        time.sleep(random.randint(1,4))
    df = pd.DataFrame({"url": urls_all})
    # Equals the last `pagenr` of a non-empty range, but is also defined
    # when the range is empty.
    last_page = endpage - 1
    # The path needs the f prefix for the page number to be substituted;
    # it stays raw so the backslashes are taken literally.
    df.to_csv(rf'c:\Users\anton\Desktop\df_urls{last_page}.csv',index=None, header=True)
    
    

In [None]:
geturls(100,101)

In [None]:
#concat url lists
main_df= pd.DataFrame([])

for file_name in glob.glob(r'C:\Users\anton\Ironhack\Final Project\URLsFinal\*.csv'):
    df = pd.read_csv(file_name)
    main_df = pd.concat([main_df,df])
main_df

## Scrape all book sites

In [None]:
#scrape all links on link list

In [None]:
def getfinaldata(url_list, output_path=r'c:\Users\anton\Desktop\df_alldetails.csv'):
    """Scrape every book page listed in a CSV and save the details as CSV.

    Reads the ``url`` column of the CSV at ``url_list``, downloads each
    page, runs all field extractors on it and collects the results into a
    DataFrame with one row per URL. Each processed URL is printed, and a
    random pause of 1-4 seconds follows every request.

    An extractor that fails on a page (missing element, unparsable text)
    contributes the string ``'None'`` for that field, so a single bad page
    does not abort the run.

    Args:
        url_list: path of a CSV file that has a ``url`` column.
        output_path: where the resulting CSV is written. Defaults to the
            location that was previously hard-coded.

    Returns:
        The DataFrame that was written to ``output_path``.
    """
    # Output column name -> extractor, in the column order of the result.
    extractors = (
        ("title", get_book_title),
        ("series", get_book_series),
        ("authors", get_book_authors),
        ("isbn", get_book_isbn10),
        ("genres", get_all_book_genres),
        ("description", get_book_description),
        ("pages", get_book_pages),
        ("year", get_book_year),
        ("language", get_book_language),
        ("cover_image", get_book_cover),
        ("total_number_ratings", get_total_ratings),
        ("average_rating", get_bookavrating),
    )
    # What the extractors raise on absent elements or malformed text;
    # anything else (e.g. KeyboardInterrupt) is deliberately not caught.
    recoverable = (AttributeError, IndexError, TypeError, ValueError)

    columns = {name: [] for name, _ in extractors}
    link_list = pd.read_csv(url_list)['url'].tolist()

    for url in link_list:
        req = requests.get(url)
        soup = BeautifulSoup(req.content, 'html.parser')
        for name, extract in extractors:
            try:
                value = extract(soup)
            except recoverable:
                value = 'None'
            columns[name].append(value)
        print(url)
        time.sleep(random.randint(1,4))

    df = pd.DataFrame(columns)
    df.to_csv(output_path, index=None, header=True)
    return df

In [None]:
# Raw string: the backslash in the Windows path is literal rather than the
# start of an (invalid) escape sequence. The runtime value is unchanged.
getfinaldata(r'URLsFinal\df_urls{100}.csv')

In [None]:
# Read df_alldetails.csv back from the Desktop folder and display it
# (presumably the file written by getfinaldata - verify the path matches).
dfgh =pd.read_csv(r'C:\Users\anton\Desktop\df_alldetails.csv')
dfgh#