# Task Web Scraping

## Web Scraping with Beautiful Soup

In [None]:
import requests as rq
import pandas as pd
from bs4 import BeautifulSoup

### Scraping Quotes

To collect 25 quotes we have to go through several pages, because a single page does not list that many.

### Iterating over the pages

In [None]:
# Gather the text of every quote found on listing pages 1 to 3.
quotes_list = []
for page_number in range(1, 4):
    print('page ', page_number)
    page_response = rq.get(f'http://quotes.toscrape.com/page/{page_number}')
    page_soup = BeautifulSoup(page_response.content, 'html.parser')
    # The quote text is read from the <span class="text"> elements.
    quotes_list.extend(
        span.get_text() for span in page_soup.find_all("span", class_="text")
    )

# Number of quotes collected across the three pages.
len(quotes_list)

In [None]:
# Print at most the first 25 quotes, numbered from 1.
for position, quote_text in enumerate(quotes_list[:25], start=1):
    print(f'{position} - {quote_text}')

### Scraping Books

In [None]:
# Gather the book titles shown on catalogue pages 1 to 3.
books_list = []
for page_number in range(1, 4):
    print('page ', page_number)
    listing = rq.get(f'http://books.toscrape.com/catalogue/page-{page_number}.html')
    listing_soup = BeautifulSoup(listing.content, 'html.parser')
    # Every <h3> wraps a link whose `title` attribute carries the book title.
    for heading in listing_soup.find_all("h3"):
        books_list.append(heading.find('a')['title'])

# Number of titles collected across the three pages.
len(books_list)

In [None]:
# Print at most the first 50 book titles, numbered from 1.
for position, book_title in enumerate(books_list[:50], start=1):
    print(f'{position} - {book_title}')

### PLUS: Retrieving information from 50 books and saving into a CSV

In [None]:
def get_book_info(route:str) -> dict:
    """Fetch one book's detail page and extract its main fields.

    Args:
        route: Path of the detail page, relative to
            ``http://books.toscrape.com/catalogue/``.

    Returns:
        A dict with the keys ``title``, ``description``, ``image``, ``UPC``
        and ``price``, all strings. ``description`` is ``""`` when the page
        has no ``<p>`` following a ``div#product_description`` block.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    response = rq.get(f'http://books.toscrape.com/catalogue/{route}')
    # Fail loudly on an error response instead of tripping over an unrelated
    # AttributeError/IndexError while parsing an error page.
    response.raise_for_status()
    soups = BeautifulSoup(response.content, 'html.parser')

    # Anchor the description on its marker <div> rather than on "the fourth
    # <p> of the page": a positional lookup silently returns some other
    # paragraph whenever the page layout differs.
    description_anchor = soups.find("div", id="product_description")
    description_tag = (
        description_anchor.find_next_sibling("p")
        if description_anchor is not None
        else None
    )

    # Look the product table up once; cell 0 feeds "UPC" and cell 2 "price".
    table_cells = soups.find("table", class_ = "table table-striped").find_all("td")

    # The image src is relative ('../../media/...'); strip the '../..' prefix
    # and prepend the site root to obtain an absolute URL.
    image_src = soups.find("img")['src'].replace('../..','')

    info_dict = {
        "title":soups.find("h1").get_text(),
        "description":description_tag.get_text() if description_tag is not None else "",
        "image":'http://books.toscrape.com'+image_src,
        "UPC":table_cells[0].get_text(),
        "price":table_cells[2].get_text()
    }
    return info_dict

# Collect the relative detail-page links from catalogue pages 1 to 3.
# NOTE: despite its name, `books_list_titles` holds hrefs, not titles.
books_list_titles = []
for page_number in range(1, 4):
    print('page ', page_number)
    resp = rq.get(f'http://books.toscrape.com/catalogue/page-{page_number}.html')
    soup = BeautifulSoup(resp.content, 'html.parser')
    books_list_titles.extend(heading.find('a')['href'] for heading in soup.find_all("h3"))

# Quick look at the shape of one link.
print(books_list_titles[0])

# Column-oriented container: one list per field, one entry per scraped book.
dict_info = {
        "title":[],
        "description":[],
        "image":[],
        "UPC":[],
        "price":[]
}

# Only the first 50 books are wanted. Slicing guarantees that exactly; a
# counter that starts at 0 and stops at 51 would fetch one book too many.
MAX_BOOKS = 50
for route in books_list_titles[:MAX_BOOKS]:
    data = get_book_info(route=route)
    for key in dict_info:
        dict_info[key].append(data[key])

dict_info['title']

In [None]:
# Build a DataFrame whose columns are the keys of `dict_info`.
df_books = pd.DataFrame(dict_info)
# Sanity check: image URL stored under row label 9.
df_books.loc[9, 'image']

In [None]:
# Persist the scraped books. index=False keeps the auto-generated row numbers
# out of the file; otherwise they are written as an unnamed first column.
df_books.to_csv('books_scrapped.csv', index=False)

## Web Scraping with XPath

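The same scraping tasks as above, this time selecting elements with XPath expressions (via lxml) instead of Beautiful Soup's search methods.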

In [1]:
from random import randint

import pandas as pd
import requests as rq
from bs4 import BeautifulSoup
from IPython.display import Image
from lxml import etree

### Scraping Quotes (XPath version)

In [2]:
# Quote texts from listing pages 1 to 3, selected with XPath.
# Markup targeted by the expression:
#   <div class="quote">
#       <span class="text">Text</span>
#   </div>
QUOTE_TEXT_XPATH = '//div[@class="quote"]/span[@class="text"]/text()'

quotes_list = []
for i in range(1, 4):
    print('page ', i)
    page_response = rq.get(f'http://quotes.toscrape.com/page/{i}')
    page_soup = BeautifulSoup(page_response.content, 'html.parser')
    # Re-serialise the soup and hand it to lxml so XPath can be evaluated.
    tree = etree.HTML(str(page_soup))
    quotes_list.extend(tree.xpath(QUOTE_TEXT_XPATH))

page  1
page  2
page  3


In [8]:
# Show one quote picked at random from positions 14 to 29 (both inclusive).
random_position = randint(14, 29)
print(quotes_list[random_position])

“For every minute you are angry you lose sixty seconds of happiness.”


### Scraping Books (XPath Version)

In [16]:
# Scratch parse of a single catalogue page.
# The page number is spelled out here instead of being read from a loop
# variable left over from an earlier cell, so the cell runs on a fresh kernel.
sample_page = 3
resp = rq.get(f'http://books.toscrape.com/catalogue/page-{sample_page}.html')
soup = BeautifulSoup(resp.content, 'html.parser')
x = etree.HTML(str(soup))

In [9]:
# Book titles from catalogue pages 1 to 3, selected with XPath.
BOOK_TITLE_XPATH = '//h3/a/@title'

books_list = []
for i in range(1, 4):
    print('page ', i)
    page_response = rq.get(f'http://books.toscrape.com/catalogue/page-{i}.html')
    page_soup = BeautifulSoup(page_response.content, 'html.parser')
    tree = etree.HTML(str(page_soup))
    # The `title` attribute of each <h3><a> link holds the book title.
    books_list += tree.xpath(BOOK_TITLE_XPATH)

# Spot-check: one title picked at random from positions 0 to 30 (inclusive).
random_position = randint(0,30)
books_list[random_position]

page  1
page  2
page  3


'Foolproof Preserving: A Guide to Small Batch Jams, Jellies, Pickles, Condiments, and More: A Foolproof Guide to Making Small Batch Jams, Jellies, Pickles, Condiments, and More'

In [14]:
# Another spot-check: a random title from positions 0 to 29 (inclusive).
sample_position = randint(0, 29)
books_list[sample_position]

"Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)"

### PLUS: Retrieving information from 50 books and saving into a CSV (XPath)

In [22]:
def get_book_info_xpath(route:str) -> dict:
    """Fetch one book's detail page and extract its main fields with XPath.

    Args:
        route: Path of the detail page, relative to
            ``http://books.toscrape.com/catalogue/``.

    Returns:
        A dict with the keys ``title``, ``description``, ``image``, ``UPC``
        and ``price``, all strings. ``description`` is ``""`` when the
        description XPath matches nothing.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    response = rq.get(f'http://books.toscrape.com/catalogue/{route}')
    # Fail loudly on an error response instead of hitting an IndexError on the
    # empty XPath results of an error page.
    response.raise_for_status()
    soups = BeautifulSoup(response.content, 'html.parser')
    x = etree.HTML(str(soups))

    # A page without description text yields an empty list here; fall back to
    # an empty string rather than raising IndexError.
    description_nodes = x.xpath("//div[@id='product_description']/following-sibling::p/text()")

    # Evaluate the table expression once; entry 0 feeds "UPC", entry 2 "price".
    table_cells = x.xpath('//table[@class="table table-striped"]//tr/td/text()')

    # The image src is relative ('../../media/...'); strip the '../..' prefix
    # and prepend the site root to obtain an absolute URL.
    image_src = x.xpath('//div[@class="item active"]/img/@src')[0].replace('../..','')

    info_dict = {
        "title":x.xpath('//h1/text()')[0],
        "description":description_nodes[0] if description_nodes else "",
        "image":'http://books.toscrape.com'+image_src,
        "UPC":table_cells[0],
        "price":table_cells[2]
    }
    return info_dict

In [23]:
# Collect the relative detail-page links (hrefs) from catalogue pages 1 to 3.
books_list = []
for page_number in range(1, 4):
    resp = rq.get(f'http://books.toscrape.com/catalogue/page-{page_number}.html')
    soup = BeautifulSoup(resp.content, 'html.parser')
    x = etree.HTML(str(soup))
    books_list.extend(x.xpath('//h3/a/@href'))

# Column-oriented container: one list per field, one entry per scraped book.
dict_info = {
        "title":[],
        "description":[],
        "image":[],
        "UPC":[],
        "price":[]
}

# Only the first 50 books are wanted. Slicing guarantees that exactly; a
# counter that starts at 0 and stops at 51 would fetch one book too many.
MAX_BOOKS = 50
for route in books_list[:MAX_BOOKS]:
    data = get_book_info_xpath(route=route)
    for key in dict_info:
        dict_info[key].append(data[key])

dict_info['title']

page  1
page  2
page  3


['A Light in the Attic',
 'Tipping the Velvet',
 'Soumission',
 'Sharp Objects',
 'Sapiens: A Brief History of Humankind',
 'The Requiem Red',
 'The Dirty Little Secrets of Getting Your Dream Job',
 'The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull',
 'The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics',
 'The Black Maria',
 'Starving Hearts (Triangular Trade Trilogy, #1)',
 "Shakespeare's Sonnets",
 'Set Me Free',
 "Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)",
 'Rip it Up and Start Again',
 'Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991',
 'Olio',
 'Mesaerion: The Best Science Fiction Stories 1800-1849',
 'Libertarianism for Beginners',
 "It's Only the Himalayas",
 'In Her Wake',
 'How Music Works',
 'Foolproof Preserving: A Guide to Small Batch Jams, Jellies, Pickles, Condiments, and More: A Foolproof Guide to Making Small Batch Jams, Jellies, Pickles, 

In [None]:
# Scratch cell: download one known book page and parse it into an lxml tree
# (presumably for trying out XPath expressions against `x`).
url_prove = 'http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html'
response = rq.get(url=url_prove)
soups = BeautifulSoup(markup=response.content, features='html.parser')
x = etree.HTML(str(soups))

In [24]:
# Build a DataFrame whose columns are the keys of `dict_info`.
df_books = pd.DataFrame(dict_info)

In [25]:
# Preview the first five rows.
df_books.head(n=5)

Unnamed: 0,title,description,image,UPC,price
0,A Light in the Attic,It's hard to imagine a world without A Light i...,http://books.toscrape.com/media/cache/fe/72/fe...,a897fe39b1053632,£51.77
1,Tipping the Velvet,"""Erotic and absorbing...Written with starling ...",http://books.toscrape.com/media/cache/08/e9/08...,90fa61229261140a,£53.74
2,Soumission,"Dans une France assez proche de la nôtre, un h...",http://books.toscrape.com/media/cache/ee/cf/ee...,6957f44c3847a760,£50.10
3,Sharp Objects,"WICKED above her hipbone, GIRL across her hear...",http://books.toscrape.com/media/cache/c0/59/c0...,e00eb4fd7b871a48,£47.82
4,Sapiens: A Brief History of Humankind,From a renowned historian comes a groundbreaki...,http://books.toscrape.com/media/cache/ce/5f/ce...,4165285e1663650f,£54.23


In [26]:
# Persist the scraped books. index=False keeps the auto-generated row numbers
# out of the file; otherwise they are written as an unnamed first column.
df_books.to_csv('books_scrapped_xpath.csv', index=False)

In [None]:
# Display the cover image stored under a random row label between 13 and 38
# (both inclusive).
random_row = randint(13, 38)
Image(url=df_books.loc[random_row, 'image'], width=250, height=250)