In [33]:
import requests
from lxml import etree
from time import sleep
from tqdm.notebook import tqdm
from itertools import count
import random
from IPython.display import clear_output

In [2]:
# https://github.com/scrapinghub/spidyquotes
url = 'https://quotes.toscrape.com/'
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
headers = {'user-agent': user_agent}
response = requests.get(url, headers=headers)

In [3]:
tree = etree.HTML(response.content)

In [4]:
quotes_block = tree.xpath('//div[contains(@class, "quote")]')

In [5]:
block = quotes_block[0]

In [6]:
quote_xpath = 'string(.//span[contains(@class, "text")]/text())'
author_xpath = 'string(.//small[contains(@class, "author")]/text())'
keywords_xpath = './/div[contains(@class, "tags")]//a/text()'
print(block.xpath(quote_xpath).replace('“','').replace('”',''))
print(block.xpath(author_xpath))
print(block.xpath(keywords_xpath))

The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.
Albert Einstein
['change', 'deep-thoughts', 'thinking', 'world']


In [7]:
def clean_char_quotes_in_string(string):
    if isinstance(string, str):
        return string.replace('“', '').replace('”', '')
    return string

In [8]:
xpaths = {
    'quote': quote_xpath,
    'author': author_xpath,
    'keywords': keywords_xpath
}

In [9]:
quotes_list = []
for block in quotes_block:
    quote = {}
    for key, path in xpaths.items():
        quote[key] = clean_char_quotes_in_string(block.xpath(path))
    quotes_list.append(quote)

In [49]:
print(quotes_list)

[{'quote': 'The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.', 'author': 'Albert Einstein', 'keywords': ['change', 'deep-thoughts', 'thinking', 'world']}, {'quote': 'It is our choices, Harry, that show what we truly are, far more than our abilities.', 'author': 'J.K. Rowling', 'keywords': ['abilities', 'choices']}, {'quote': 'There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.', 'author': 'Albert Einstein', 'keywords': ['inspirational', 'life', 'live', 'miracle', 'miracles']}, {'quote': 'The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.', 'author': 'Jane Austen', 'keywords': ['aliteracy', 'books', 'classic', 'humor']}, {'quote': "Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.", 'author': 'Marilyn Monroe', 'keywords': ['be-yourself'

In [11]:
tree.xpath('.//meta[contains(@class, "keywords")]/@content')

['change,deep-thoughts,thinking,world',
 'abilities,choices',
 'inspirational,life,live,miracle,miracles',
 'aliteracy,books,classic,humor',
 'be-yourself,inspirational',
 'adulthood,success,value',
 'life,love',
 'edison,failure,inspirational,paraphrased',
 'misattributed-eleanor-roosevelt',
 'humor,obvious,simile']

In [12]:
def get_quotes_info(tree):
    quotes_list2 = []
    quotes = tree.xpath('.//span[contains(@class, "text")]/text()')
    authors = tree.xpath('.//small[contains(@class, "author")]/text()')
    keywords = tree.xpath('.//meta[contains(@class, "keywords")]/@content')
    quotes_info_zip = zip(quotes, authors, keywords)
    for quote, author, keywords in quotes_info_zip:
        quote = {
            'quote': clean_char_quotes_in_string(quote),
            'author': author,
            'keywords': keywords.split(','),
        }
        quotes_list2.append(quote)
    return quotes_list2

In [13]:
[print(quote, '\n------------') for quote in get_quotes_info(tree)]

{'quote': 'The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.', 'author': 'Albert Einstein', 'keywords': ['change', 'deep-thoughts', 'thinking', 'world']} 
------------
{'quote': 'It is our choices, Harry, that show what we truly are, far more than our abilities.', 'author': 'J.K. Rowling', 'keywords': ['abilities', 'choices']} 
------------
{'quote': 'There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.', 'author': 'Albert Einstein', 'keywords': ['inspirational', 'life', 'live', 'miracle', 'miracles']} 
------------
{'quote': 'The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.', 'author': 'Jane Austen', 'keywords': ['aliteracy', 'books', 'classic', 'humor']} 
------------
{'quote': "Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.", 'au

[None, None, None, None, None, None, None, None, None, None]

In [37]:
def have_quotes(body):
    return "No quotes found!" not in body

In [44]:
def get_all_quotes_to_page(last_page=10):
    counter = count(1)
    all_quotes_info = []
    for page in counter:
        print(f'Scraping page {page}')
        url = f'https://quotes.toscrape.com/page/{page}/'
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            continue
        if not have_quotes(response.text):
            break
        tree = etree.HTML(response.content)
        all_quotes_info.extend(get_quotes_info(tree))
        sleep(random.randrange(100, 200)/1000)
        clear_output(wait=True)
    return all_quotes_info

In [45]:
quotes = get_all_quotes_to_page(20)

Scraping page 11


In [46]:
len(quotes)

100

In [47]:
[item for item in quotes if 'books' in item['keywords']]

[{'quote': 'The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.',
  'author': 'Jane Austen',
  'keywords': ['aliteracy', 'books', 'classic', 'humor']},
 {'quote': 'Good friends, good books, and a sleepy conscience: this is the ideal life.',
  'author': 'Mark Twain',
  'keywords': ['books', 'contentment', 'friends', 'friendship', 'life']},
 {'quote': 'I have always imagined that Paradise will be a kind of library.',
  'author': 'Jorge Luis Borges',
  'keywords': ['books', 'library']},
 {'quote': 'You can never get a cup of tea large enough or a book long enough to suit me.',
  'author': 'C.S. Lewis',
  'keywords': ['books', 'inspirational', 'reading', 'tea']},
 {'quote': 'If you only read the books that everyone else is reading, you can only think what everyone else is thinking.',
  'author': 'Haruki Murakami',
  'keywords': ['books', 'thought']},
 {'quote': 'There is no friend as loyal as a book.',
  'author': 'Ernest Hemingway',
  'ke

## Desafio

scrap do site https://quotes.toscrape.com//tableful