# Homework 5

## REST API

In [80]:
import requests
import json
import matplotlib as plt
from wordcloud import WordCloud, STOPWORDS

We want to create a word cloud of the most common words across the motivations of all the Nobel Prizes in Physics. We can collect this data by sending a GET request to the appropriate URL below.

In [82]:
# Request the physics prizes from the Nobel Prize API.
# A timeout is set because `requests` otherwise waits indefinitely on a stalled connection.
# NOTE(review): limit=124 is presumably meant to cover every physics prize so far - confirm
# and raise it as new prizes are awarded.
nobelPrize_physics = requests.get(
    'http://api.nobelprize.org/2.1/nobelPrizes?limit=124&nobelPrizeCategory=phy',
    timeout=30,
)
# Fail here on an HTTP error instead of later with a confusing JSON/KeyError.
nobelPrize_physics.raise_for_status()

We can then keep just the English motivation (taken from the first listed laureate) for every year in which a Nobel Prize was awarded, and write all of this text to a text file.

In [84]:
# Write one English motivation per line, taken from the first listed laureate
# of each prize entry. Entries missing any of the expected keys are skipped.
with open('nobelPrizes_physics.txt', 'w') as motivations_file:
    prizes = nobelPrize_physics.json()['nobelPrizes']
    for prize in prizes:
        try:
            motivation = prize['laureates'][0]['motivation']['en']
        except KeyError:
            # Nothing to write for this entry.
            continue
        print(motivation, file=motivations_file)

We can then use this text file to create an image (saved as a separate .png file) of the most common words, using WordCloud with appropriate stop words.

In [86]:
# Use the stop-word set shipped with the wordcloud package, unchanged.
stopwords = STOPWORDS

# Portrait canvas (400 wide, 600 tall) on a white background.
wc = WordCloud(width=400, height=600, stopwords=stopwords, background_color='white')

In [87]:
# Read the collected motivations back in; the `with` block guarantees the
# file handle is closed once the text has been loaded.
with open('nobelPrizes_physics.txt', 'r') as motivations_file:
    text = motivations_file.read()

In [88]:
# Build the cloud from the motivations text and save it as a PNG.
# generate() hands back the same WordCloud object, so the export is chained onto it.
wc.generate(text).to_file('wordcloud_nobelPrizes_physics.png')

<wordcloud.wordcloud.WordCloud at 0x19716a60bf0>

## Web Scraping

In [90]:
from bs4 import BeautifulSoup
import pandas as pd

Here we want to scrape data from the first 3 pages of a website that lists books.

In [92]:
# Addresses of the first three listing pages of the book catalogue.
url1, url2, url3 = (
    'https://books.toscrape.com/',
    'https://books.toscrape.com/catalogue/page-2.html',
    'https://books.toscrape.com/catalogue/page-3.html',
)

In [93]:
# Download the three listing pages in one loop instead of three copy-pasted cells.
# Each request has a timeout (requests otherwise waits indefinitely) and is checked
# for an HTTP error status so a failed download is noticed immediately.
listing_pages = []
for listing_url in (url1, url2, url3):
    listing_response = requests.get(listing_url, timeout=30)
    listing_response.raise_for_status()
    listing_pages.append(listing_response)

# Keep the original per-page names available to the cells that follow.
page1, page2, page3 = listing_pages

In [96]:
# Parse the three listing pages. The parser is named explicitly: without it
# BeautifulSoup picks whichever parser happens to be installed (and warns about it),
# so the resulting tree could differ between environments.
html1, html2, html3 = (
    BeautifulSoup(listing_page.content, 'html.parser')
    for listing_page in (page1, page2, page3)
)

We also want to analyse more data on the first 5 books.

In [100]:
# Detail-page addresses of the first five books on the site.
url_book1, url_book2, url_book3, url_book4, url_book5 = (
    'https://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html',
    'https://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html',
    'https://books.toscrape.com/catalogue/soumission_998/index.html',
    'https://books.toscrape.com/catalogue/sharp-objects_997/index.html',
    'https://books.toscrape.com/catalogue/sapiens-a-brief-history-of-humankind_996/index.html',
)

In [101]:
# Download the five book detail pages in one loop instead of five copy-pasted cells.
# Each request has a timeout (requests otherwise waits indefinitely) and is checked
# for an HTTP error status so a failed download is noticed immediately.
book_pages = []
for book_url in (url_book1, url_book2, url_book3, url_book4, url_book5):
    book_response = requests.get(book_url, timeout=30)
    book_response.raise_for_status()
    book_pages.append(book_response)

# Keep the original per-book names available to the cells that follow.
page_book1, page_book2, page_book3, page_book4, page_book5 = book_pages

In [106]:
# Parse every downloaded book page. The parser is named explicitly: without it
# BeautifulSoup picks whichever parser happens to be installed (and warns about it),
# so the resulting tree could differ between environments.
html_page_books = [
    BeautifulSoup(book_page.content, 'html.parser')
    for book_page in (page_book1, page_book2, page_book3, page_book4, page_book5)
]

# Keep the individual names as well, in case a cell refers to a single parsed page.
html_page_book1, html_page_book2, html_page_book3, html_page_book4, html_page_book5 = html_page_books

To create a DataFrame of the first 5 books with their UPC, title, price and rating, we first extract each field into its own list.

In [113]:
# Collect the UPC of each book by navigating the parsed tree rather than slicing
# the serialised HTML: str.find() returns -1 when a marker is missing, which would
# silently produce a wrong slice, whereas a missing cell now raises immediately.
upc = []

for page_book in html_page_books:
    # The value sits in the <td> next to the <th> whose text is exactly "UPC".
    upc_header = page_book.find('th', string='UPC')
    upc.append(upc_header.find_next_sibling('td').get_text())

upc

['a897fe39b1053632',
 '90fa61229261140a',
 '6957f44c3847a760',
 'e00eb4fd7b871a48',
 '4165285e1663650f']

In [114]:
# Each book on the listing page has an <h3> wrapping a link whose `title`
# attribute carries the full, untruncated book title.
h3_list = html1.find_all('h3')

# Read the attribute from the parsed tag instead of slicing str(tag): the slice
# depended on `title` being the last attribute and returned HTML-escaped text
# (e.g. "&amp;" for "&").
titles = [h3.a['title'] for h3 in h3_list]

In [115]:
# Only the five leading titles are needed; they are matched with the five book pages later.
first_5_titles = titles[:5]
first_5_titles

['A Light in the Attic',
 'Tipping the Velvet',
 'Soumission',
 'Sharp Objects',
 'Sapiens: A Brief History of Humankind']

In [116]:
# Collect the pre-tax price of each book by navigating the parsed tree rather than
# slicing the serialised HTML: str.find() returns -1 when a marker is missing, which
# would silently produce a wrong slice, whereas a missing cell now raises immediately.
# Prices are kept as the displayed strings, currency symbol included.
price = []

for page_book in html_page_books:
    # The value sits in the <td> next to the <th> whose text is "Price (excl. tax)".
    price_header = page_book.find('th', string='Price (excl. tax)')
    price.append(price_header.find_next_sibling('td').get_text())

price

['£51.77', '£53.74', '£50.10', '£47.82', '£54.23']

In [117]:
# Collect the star rating of each book. The rating word is stored as an extra CSS
# class next to "star-rating" (e.g. class="star-rating Three"), so it is read from the
# tag's class list instead of slicing the serialised HTML, which silently returned
# garbage whenever the marker was missing.
rating = []

for page_book in html_page_books:
    # First <p> carrying the "star-rating" class, in document order.
    rating_tag = page_book.find('p', class_='star-rating')
    # Keep the class that is not the "star-rating" marker itself.
    rating.append(next(css_class for css_class in rating_tag['class'] if css_class != 'star-rating'))

rating

['Three', 'One', 'One', 'Four', 'Five']

Here we combine the lists into a nested list and use it as the cell values of the DataFrame. These values, together with the column headers, give us the final table of the first 5 books.

In [119]:
# The four lists are parallel (one entry per book), so they must be equally long;
# check this up front because zip() would otherwise silently drop trailing entries.
assert len(upc) == len(first_5_titles) == len(price) == len(rating), 'column lists differ in length'

# Build one [upc, title, price, rating] row per book without hard-coding the book count.
data = [list(book_row) for book_row in zip(upc, first_5_titles, price, rating)]

In [120]:
# Column labels, listed in the same order as the values inside each row of `data`.
column_headers = ['upc', 'title', 'price', 'rating']

# Assemble the table from the row lists.
df = pd.DataFrame(data=data, columns=column_headers)

# Show the table indexed by UPC. set_index returns a new frame, so `df` itself
# keeps its default integer index.
df.set_index(keys='upc')

Unnamed: 0_level_0,title,price,rating
upc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a897fe39b1053632,A Light in the Attic,£51.77,Three
90fa61229261140a,Tipping the Velvet,£53.74,One
6957f44c3847a760,Soumission,£50.10,One
e00eb4fd7b871a48,Sharp Objects,£47.82,Four
4165285e1663650f,Sapiens: A Brief History of Humankind,£54.23,Five
