# Webscarping Goodreads

This file is used to webscrape the required book data from the goodread.com List of "Books that everyone should have read at least once", using beautiful soup.
The list contains 24,529 books.

## Import Libraries

In [6]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import math
import re
import time
import random
import glob

## Fetch book information

In [8]:
books = requests.get("https://www.goodreads.com/book/show/32000545-the-dragon-and-the-princess")
print("books:", books.status_code)
soup = BeautifulSoup(books.content, 'html.parser')

books: 200


In [None]:
print(soup.prettify)

### Book Title

In [9]:
def get_book_title(soup):
    if soup.find(id='bookTitle'):
        btitle = soup.find(id="bookTitle").get_text(strip=True)
        return btitle
    return ''

In [10]:
get_book_title(soup)

'The Dragon and the Princess'

### Book isbn13 

In [11]:
# This function gives back the isbn13 number by browsing the soup as string for "nisbn". 
#The first "nisbn" in the html code is the ISBN number of the chosen book[0].
def get_book_isbn13(soup):
    if soup.find('span', attrs={'itemprop':"isbn"}):
        isbn1 = soup.find('span', attrs={'itemprop':"isbn"}).get_text(strip=True)
        #print(1)
        return isbn1
    if not soup.find('span', attrs={'itemprop':"isbn"}):
        try:
            try:
                isbn10 = re.findall(r'nisbn: \d{10}' , str(soup))[0] #get first nisbn number in str(soup)
                #print(2)
                return isbn10.split()[1] # only show isbn number not "nisbn
            except:
                try:
                    isbn2 =soup.find(id="bookDataBox").find('div', class_="infoBoxRowItem").get_text(strip=True)
                    #print(3)
                    return re.search(r'\d{10}',isbn2).group(0)
                except:
                    return ""
        except:
            return ""
                
        #except:
            #continue      
    #else:
       # try:
           # isbn10 = re.findall(r'nisbn: \d{10}' , str(soup))[0] #get first nisbn number in str(soup)
            #print(2)
           # return isbn10.split()[1] # only show isbn number not "nisbn
       # except:
         #   return ""


In [14]:
get_book_isbn13(soup)

''

In [13]:
print(soup.prettify)

<bound method Tag.prettify of <!DOCTYPE html>

<html class="desktop">
<head prefix="og: http://ogp.me/ns# fb: http://ogp.me/ns/fb# good_reads: http://ogp.me/ns/fb/good_reads#">
<title>The Dragon and the Princess by Andrew P.M. Yiallouros</title>
<meta content="The Dragon and the Princess book. Read 2 reviews from the world's largest community for readers. A tale of magic, love, spirituality and adventure...it's..." name="description"/>
<meta content="telephone=no" name="format-detection"/>
<link href="https://www.goodreads.com/book/show/32000545-the-dragon-and-the-princess" rel="canonical"/>
<meta content="2415071772" property="fb:app_id"/>
<meta content="books.book" property="og:type"/>
<meta content="The Dragon and the Princess" property="og:title"/>
<meta content="A tale of magic, love, spirituality and adventure...it's not really about the princess but more a spiritual work inspired by my years of ..." property="og:description"/>
<meta content="https://i.gr-assets.com/images/S/comp

### Book Series

In [15]:
def get_book_series(soup):
    if soup.find(id="bookSeries").find('a'):
        book_series=soup.find(id="bookSeries").find('a').get_text(strip=True)
        book_series=book_series.replace('(',')')
        return (book_series.split(')')[1]) 
    return ''

#not a really generic solution, but works ;) Alternative would have been re.search (...)


In [16]:
get_book_series(soup)

''

### Book Description

In [17]:
def get_book_description(soup):
    bdescription= ''
    if soup.find(id="description").find(style="display:none"):
        bdescription +=soup.find(id="description").find(style="display:none").get_text(strip=True)
        return bdescription
    return ''

In [18]:
get_book_description(soup)

'A tale of magic, love, spirituality and adventure...it\'s not really about the princess but more a spiritual work inspired by my years of spiritual enquiry, I\'ve been a Buddhist, a Yogi, a Muslim, I even lived as a Christian monk on Mount Athos for a short while. It\'s not biographical but life inspires art, and it is an allegorical philosophical tale with some interesting insights into life. It\'s a story of friendship, addiction, festivals, love, demons, angels, wizards and magic, poetry, and much more. It\'s also the beginning of a series of books centred around the dragon\'s world, with the next title being the second part to this introduction. Future titles are "The Sorcerer\'s Kingdom"; " The Wizard and The Dragon"; "The Secret Garden"; "Where The Man Burns"; "The Mysterious Island and the World of the Sea People" and lots more! Please support a new writer with a lot to offer!'

### Book Author

In [19]:
#DONE - This function gives a list with all the authors
def get_book_authors(soup):
    if soup.find(id="bookAuthors").find_all(itemprop='name'):  #find('a', attrs={'class': 'authorName'}).find(itemprop='name'):
        authors=soup.find(id="bookAuthors").find_all(itemprop='name')#.find(itemprop='name')
        #print(authors)
        authors_all=[]
        for i in authors:
            authors_all.append(i.get_text(strip=True))
        return(authors_all)
    return ''

In [20]:
get_book_authors(soup)

['Andrew P.M. Yiallouros']

### Number of pages

In [21]:
def get_book_pages(soup):
    if soup.find(id="details").find(itemprop='numberOfPages'):  #find('a', attrs={'class': 'authorName'}).find(itemprop='name'):
        pages=soup.find(id="details").find(itemprop='numberOfPages').get_text(strip=True)#.find(itemprop='name')
        return int(pages.split()[0])
    return '0'

In [22]:
get_book_pages(soup)

94

### Date first published

In [23]:
def get_book_year(soup):
    if soup.find('nobr', attrs={'class':'greyText'}):
        year_published1=soup.find('nobr', attrs={'class':'greyText'}).get_text(strip=True)
        #print(1)
        return re.search('(\d{4})', year_published1).group(1) #search for year: number with 4 digits
    elif soup.find('p',attrs={'data-testid':"publicationInfo"}):
        year_published2=soup.find('p',attrs={'data-testid':'publicationInfo'}).get_text(strip=True)
        #print(2)
        return re.search('(\d{4})', year_published2).group(1)
    elif soup.find(id="details"):#.findall('div', attrs={'class':'row'}):
        year_published3=soup.find(id="details").get_text(strip=True)#.split()#findall('div', attrs={'class':'row'})
        #print(3)
        return re.search('\d{4}', year_published3).group(0)   ##.split()[3]       
    return ''      

In [24]:
get_book_year(soup)

'2016'

### Language

In [25]:
def get_book_language(soup):
    if soup.find(itemprop="inLanguage"):
        language=soup.find(itemprop="inLanguage").get_text(strip=True)
        return language
    return ''

In [26]:
get_book_language(soup)

'English'

### Book Cover Link

In [27]:
def get_book_cover(soup):
    if soup.find(id="coverImage"):
        cover=soup.find(id="coverImage")
       # print(cover)
        return cover.get('src') #img.get
    return''

In [28]:
get_book_cover(soup)

'https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1473747540l/32000545._SY475_.jpg'

### Number of total ratings

In [29]:
def get_total_ratings(soup):
    if soup.find(itemprop='ratingCount'):  #find('a', attrs={'class': 'authorName'}).find(itemprop='name'):
        reviews=soup.find(itemprop='ratingCount').get_text(strip=True)#.find(itemprop='name')
        return reviews.split()[0]#float(pages.split()[0])

In [30]:
get_total_ratings(soup)

'4'

### Average rating of book

In [31]:
def get_bookavrating(soup):
    if soup.find("span", itemprop="ratingValue"):#soup.find("div", class_="RatingStatistics__rating"):
        av_rating = soup.find("span", itemprop="ratingValue").get_text(strip=True)#soup.find("div", class_="RatingStatistics__rating")
        return float(av_rating)
    return ''

In [32]:
get_bookavrating(soup)

5.0

### Book Genres

In [33]:
#This function gets all genres listed under more, can be various amounts
def get_all_book_genres(soup):
    genres=[]
    for i in soup.find_all('div', class_="left"):
        genre=i.find_all('a', class_="actionLinkLite bookPageGenreLink")
        for j in genre:
            texts=j.get_text(strip=True)
            if texts not in genres:
                genres.append(texts)
    return genres
    

In [34]:
get_all_book_genres(soup)

[]

gs1(soup)

## Scrape Data

### From one book page

In [35]:
#Checking functions for one book
title=[]
series=[]
authors=[]
isbn=[]
genres=[]
descriptions=[]
years=[]
pages=[]
languages=[]
setting=[]
av_ratings=[]
total_ratings=[]
cover_image_link=[]



title.append(get_book_title(soup))
series.append(get_book_series(soup))
authors.append(get_book_authors(soup))
isbn.append(get_book_isbn13(soup))
genres.append(get_all_book_genres(soup))
descriptions.append(get_book_description(soup))
pages.append(get_book_isbn13(soup))
years.append(get_book_year(soup))
languages.append(get_book_language(soup))
cover_image_link.append(get_book_cover(soup))
total_ratings.append(get_total_ratings(soup))
av_ratings.append(get_bookavrating(soup))





print((title,series, authors,isbn,descriptions,pages,genres,years,languages,cover_image_link, total_ratings,av_ratings)) 


(['The Dragon and the Princess'], [''], [['Andrew P.M. Yiallouros']], [''], ['A tale of magic, love, spirituality and adventure...it\'s not really about the princess but more a spiritual work inspired by my years of spiritual enquiry, I\'ve been a Buddhist, a Yogi, a Muslim, I even lived as a Christian monk on Mount Athos for a short while. It\'s not biographical but life inspires art, and it is an allegorical philosophical tale with some interesting insights into life. It\'s a story of friendship, addiction, festivals, love, demons, angels, wizards and magic, poetry, and much more. It\'s also the beginning of a series of books centred around the dragon\'s world, with the next title being the second part to this introduction. Future titles are "The Sorcerer\'s Kingdom"; " The Wizard and The Dragon"; "The Secret Garden"; "Where The Man Burns"; "The Mysterious Island and the World of the Sea People" and lots more! Please support a new writer with a lot to offer!'], [''], [[]], ['2016'], 

### Extract all book URLs from List

In [36]:
#Extract all urls of books
def geturls(startpage,endpage):
    urls_all=[]
    for pagenr in range(startpage,endpage):    
        req = requests.get(f'https://www.goodreads.com/list/show/264.Books_That_Everyone_Should_Read_At_Least_Once?page={pagenr}')
        soup = BeautifulSoup(req.content, 'html.parser')
        #print(soup)
        urls=soup.find_all('a', class_='bookTitle')#.get('href')
        for i in urls:
            urls_all.append('https://www.goodreads.com'+i.get('href'))
        time.sleep(random.randint(1,4))
    #print(urls_all)
    df = pd.DataFrame({"url": urls_all})
    df.to_csv(r'c:\Users\anton\Desktop\df_urls{pagenr}.csv',index=None, header=True)
    
    

In [58]:
geturls(90,95)

### Optional - Concat all URL dataframes

In [None]:
url_list= pd.DataFrame([])

for file_name in glob.glob(r'C:\Users\anton\Ironhack\Final Project\URLsFinal\*.csv'):
    df = pd.read_csv(file_name)
    url_list= pd.concat([url_list],df)
url_list

## Scrape all book sites

In [None]:
#scrape all links on link list

In [83]:
def getfinaldata(url_list):
    title=[]
    series=[]
    authors=[]
    isbn=[]
    genres=[]
    descriptions=[]
    years=[]
    pages=[]
    languages=[]
    setting=[]
    av_ratings=[]
    total_ratings=[]
    cover_image_link=[]
    link_list=pd.read_csv(url_list)
    link_list=link_list['url'].tolist()
        
    for url in link_list:
        req = requests.get(url)
        soup = BeautifulSoup(req.content, 'html.parser')
        #FETCH DATA, append lists:
        title.append(get_book_title(soup))
        
        try:
            series.append(get_book_series(soup))
        except:
            series.append('None')
        try:  
            authors.append(get_book_authors(soup))
        except:
            authors.append('None')
            
        isbn.append(get_book_isbn13(soup))
        genres.append(get_all_book_genres(soup))
        try:
            descriptions.append(get_book_description(soup))
        except:
            descriptions.append('None')
        try:    
            pages.append(get_book_pages(soup))
        except:
            pages.append('None')
        try:
            years.append(get_book_year(soup))
        except:
            years.append('None')
            
        languages.append(get_book_language(soup))
        cover_image_link.append(get_book_cover(soup))
        total_ratings.append(get_total_ratings(soup))
        av_ratings.append(get_bookavrating(soup))
        print(url)
        time.sleep(random.randint(1,4))
        
    #CREATE DATAFRAME FROM LISTS
    df = pd.DataFrame(
    {
     "title": title, 
     "series": series,
     "authors": authors,
     "isbn": isbn,
     "genres": genres,
     "description": descriptions,
     "pages": pages,
     "year": years, 
     "language": languages, 
     "cover_image": cover_image_link,
     "total_number_ratings": total_ratings, 
     "average_rating": av_ratings,
     }
    )
    df.head()
    df.to_csv(r'c:\Users\anton\Desktop\df_alldetails.csv',index=None, header=True)
    return df    

In [59]:
getfinaldata('URLsFinal\df_urls{90-94}.csv')

https://www.goodreads.com/book/show/18364031-wake-up-call
https://www.goodreads.com/book/show/20745973-runaway-bullet
https://www.goodreads.com/book/show/20902330-night-is-falling
https://www.goodreads.com/book/show/3166997-petits-meurtres-entre-moines
https://www.goodreads.com/book/show/19716566-once-upon-a-zombie
https://www.goodreads.com/book/show/15773896-thin-skin-soul-pinned
https://www.goodreads.com/book/show/13589129-the-party-is-over
https://www.goodreads.com/book/show/17824064-mythbusting-the-cult-of-confucius
https://www.goodreads.com/book/show/18775517-stand-tough
https://www.goodreads.com/book/show/19140427-rose-colored-glasses
https://www.goodreads.com/book/show/20809246-seed-of-evil
https://www.goodreads.com/book/show/13801447-heart-waves
https://www.goodreads.com/book/show/19613562-the-return-of-the-soldier
https://www.goodreads.com/book/show/19513363-the-unseen-terrorist
https://www.goodreads.com/book/show/20507301-crazy-maybe
https://www.goodreads.com/book/show/206264

https://www.goodreads.com/book/show/22395655-home-by-the-sea
https://www.goodreads.com/book/show/22113407-the-hungry-wolves-of-van-diemen-s-land
https://www.goodreads.com/book/show/22440552-are-we-normal-funny-true-stories-from-an-everyday-family
https://www.goodreads.com/book/show/22360232-stella-and-moonface
https://www.goodreads.com/book/show/22027000-learning-to-love-again
https://www.goodreads.com/book/show/444330.Kneeknock_Rise
https://www.goodreads.com/book/show/22022846-bred-to-rule
https://www.goodreads.com/book/show/22319292-the-chronicles-of-nightwolf
https://www.goodreads.com/book/show/7581.The_Book_of_Awakening
https://www.goodreads.com/book/show/17838475-the-taking
https://www.goodreads.com/book/show/10083616-ashes-for-the-elephant-god
https://www.goodreads.com/book/show/20630471-plebs
https://www.goodreads.com/book/show/16109612-my-prison-without-bars
https://www.goodreads.com/book/show/22475955-warrior-king-legacy
https://www.goodreads.com/book/show/22396204-the-bookmak

https://www.goodreads.com/book/show/7568425-sparks
https://www.goodreads.com/book/show/23206659-find-me
https://www.goodreads.com/book/show/23203724-the-pacific-depths
https://www.goodreads.com/book/show/123106.Twisted
https://www.goodreads.com/book/show/228250.Four_Arguments_for_the_Elimination_of_Television
https://www.goodreads.com/book/show/19982425-the-outsourcing-guide-for-appraisers
https://www.goodreads.com/book/show/21618998-sid-s-place
https://www.goodreads.com/book/show/17670941-a-vampire-s-saving-embrace
https://www.goodreads.com/book/show/920412.The_Secret_of_Childhood
https://www.goodreads.com/book/show/23291155-highland-darkness
https://www.goodreads.com/book/show/23151687-they-re-rugby-boys-don-t-you-know
https://www.goodreads.com/book/show/22274921-effortless-savings
https://www.goodreads.com/book/show/22593820-autobiography-of-charles-r-barefoot-jr-the-world-imperial-wizard-for-t
https://www.goodreads.com/book/show/21950219-letters-to-my-daughter-s-killer
https://www.

https://www.goodreads.com/book/show/10000191-yellow-crocus
https://www.goodreads.com/book/show/25147457-young-nixon
https://www.goodreads.com/book/show/20892195-henry-s-re-entry
https://www.goodreads.com/book/show/166434.Empire
https://www.goodreads.com/book/show/18884274-all-the-dancing-birds
https://www.goodreads.com/book/show/24610957-tenacity
https://www.goodreads.com/book/show/18513604
https://www.goodreads.com/book/show/22678976-the-moment-before-an-injury
https://www.goodreads.com/book/show/18774020-california
https://www.goodreads.com/book/show/24640244-the-world-at-large
https://www.goodreads.com/book/show/20464904-kill-thy-neighbor
https://www.goodreads.com/book/show/23641381-the-end
https://www.goodreads.com/book/show/24381375-cowboys-last-all-night
https://www.goodreads.com/book/show/24001777-her-lifeline
https://www.goodreads.com/book/show/24661503-who-i-am
https://www.goodreads.com/book/show/23305079-hassle-free-bedtime
https://www.goodreads.com/book/show/23494599-libellu

Unnamed: 0,title,series,authors,isbn,genres,description,pages,year,language,cover_image,total_number_ratings,average_rating,book_url
0,Wake Up Call,,[Victoria Ashley],9781490906133,"[Romance, New Adult, Contemporary, Sociology, ...",My heart stopped and suddenly it was hard to b...,248,2013,English,https://i.gr-assets.com/images/S/compressed.ph...,2679,3.8,https://www.goodreads.com/book/show/18373208-t...
1,Runaway Bullet,Runaway #3,[Nicole Clark],,[],"Strong, silent, and the one most likely to kil...",0,2014,English,https://i.gr-assets.com/images/S/compressed.ph...,15,3.8,https://www.goodreads.com/book/show/18373208-t...
2,Night is Falling,,[Mariam Arif],9789948158,[],In a place far from the cruelty and injustice ...,0,1397,English,https://i.gr-assets.com/images/S/compressed.ph...,3,4.67,https://www.goodreads.com/book/show/18373208-t...
3,Petits Meurtres Entre Moines,Les Nouvelles Enquêtes du juge Ti #4,[Frédéric Lenormand],9782213622484,[Mystery],A peine arrivé dans le monastère taoïste où l’...,223,2004,French,https://i.gr-assets.com/images/S/compressed.ph...,28,3.82,https://www.goodreads.com/book/show/18373208-t...
4,Once Upon A Zombie,,[Storm Stoker],1611605385,[],What would you do if you turned on the news to...,158,2012,English,https://i.gr-assets.com/images/S/compressed.ph...,4,4.0,https://www.goodreads.com/book/show/18373208-t...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,The Stove-Junker,,[S.K. Kalsi],9780990779063,[],"In the winter of 2012, 79-year-old Somerset Ga...",338,2015,English,https://i.gr-assets.com/images/S/compressed.ph...,23,3.65,https://www.goodreads.com/book/show/18373208-t...
496,The Treemakers,The Treemakers Trilogy #1,[Christina L. Rozelle],1508840466,"[Science Fiction, Dystopia, Fantasy, Young Adu...",This is a previously-published alternate cover...,288,2014,English,https://i.gr-assets.com/images/S/compressed.ph...,535,4.23,https://www.goodreads.com/book/show/18373208-t...
497,100 Years: A Journey to End a Vicious Cycle,,[Mark L. Baynard],9780986138003,[],"100 Years, A Journey to End a Vicious Cycle is...",290,2015,English,https://i.gr-assets.com/images/S/compressed.ph...,13,4.0,https://www.goodreads.com/book/show/18373208-t...
498,Thrive: 30 Inspirational Rags-to-Riches Stories,,[Jason Navallo],1511503882,"[Nonfiction, Self Help, Inspirational, Busines...",Thrive: 30 Inspirational Rags-to-Riches Storie...,100,2015,English,https://i.gr-assets.com/images/S/compressed.ph...,335,3.89,https://www.goodreads.com/book/show/18373208-t...


## Concat Dataframes

In [61]:
scraped_concat= pd.DataFrame([])

for file_name in glob.glob(r'C:\Users\anton\Ironhack\Final Project\DataScraped\*.csv'):
    df = pd.read_csv(file_name)
    print(file_name)
    scraped_concat= pd.concat([scraped_concat,df],ignore_index=True)

scraped_concat.to_csv(r'c:\Users\anton\Desktop\scraped_concat.csv',index=None, header=True)


C:\Users\anton\Ironhack\Final Project\DataScraped\df_alldetails{1-4}.csv
C:\Users\anton\Ironhack\Final Project\DataScraped\df_alldetails{10-14}.csv
C:\Users\anton\Ironhack\Final Project\DataScraped\df_alldetails{15-19}.csv
C:\Users\anton\Ironhack\Final Project\DataScraped\df_alldetails{20-24}.csv
C:\Users\anton\Ironhack\Final Project\DataScraped\df_alldetails{25-29}.csv
C:\Users\anton\Ironhack\Final Project\DataScraped\df_alldetails{30-34}.csv
C:\Users\anton\Ironhack\Final Project\DataScraped\df_alldetails{35-39}.csv
C:\Users\anton\Ironhack\Final Project\DataScraped\df_alldetails{40-44}.csv
C:\Users\anton\Ironhack\Final Project\DataScraped\df_alldetails{45-49}.csv
C:\Users\anton\Ironhack\Final Project\DataScraped\df_alldetails{5-9}.csv
C:\Users\anton\Ironhack\Final Project\DataScraped\df_alldetails{50-54}.csv
C:\Users\anton\Ironhack\Final Project\DataScraped\df_alldetails{55-59}.csv
C:\Users\anton\Ironhack\Final Project\DataScraped\df_alldetails{60-64}.csv
C:\Users\anton\Ironhack\Final

 ## Check results

### Check null values in title column

In [97]:
rawdata =pd.read_csv(r'DataFinal\scraped_concat.csv')
rawdata.columns

Index(['title', 'series', 'authors', 'isbn', 'genres', 'description', 'pages',
       'year', 'language', 'cover_image', 'total_number_ratings',
       'average_rating'],
      dtype='object')

In [98]:
rawdata['title'].isna().sum()

969

### Remove rows with empty data, as title is empty then all the rest of the row is emtpy

In [100]:
emptytitle = pd.DataFrame(rawdata[rawdata['title'].isna()==True])
#emptytitle['url'].to_csv(r'URLsFinal\df_emptytitles.csv',index=None, header=True)
emptytitle

Unnamed: 0,title,series,authors,isbn,genres,description,pages,year,language,cover_image,total_number_ratings,average_rating
2,,,,,[],,,1813,,,,
17,,,,,[],,,1964,,,,
28,,,,,[],,,1871,,,,
29,,,,,[],,,1847,,,,
47,,,,,[],,,1950,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
9358,,,,,[],,,2014,,,,
9362,,,,,[],,,2010,,,,
9369,,,,,[],,,2005,,,,
9371,,,,,[],,,2015,,,,


In [107]:
rawdata = rawdata[rawdata['title'].notna()]
rawdata.to_csv(r'DataFinal\scraped_concat.csv')
rawdata.shape

(8431, 12)