In [1]:
import random
from time import sleep

import requests
from lxml import etree
from tqdm.notebook import trange, tqdm

In [2]:
# Target site and a browser-like User-Agent that is sent with every request.
url = 'https://quotes.toscrape.com/'
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
headers = {'user-agent': user_agent}
# Always pass a timeout: without one, requests waits indefinitely on a stalled server.
response = requests.get(url, headers=headers, timeout=10)

In [3]:
# Check the HTTP status of the response before parsing it.
response.status_code

200

In [4]:
# Parse the raw response bytes into an lxml element tree that can be queried with XPath.
tree = etree.HTML(response.content)

In [13]:
# Select every <div> whose class attribute contains the substring "quote".
quotes_block = tree.xpath('.//div[contains(@class, "quote")]')

In [14]:
# Take the first matched block to try out the per-quote XPath expressions on it.
block = quotes_block[0]

In [38]:
# XPath expressions evaluated relative to a single quote block. Wrapping each one
# in string() makes xpath() return a single string rather than a list of nodes.
quote_xpath = "string(.//span[contains(@class, 'text')]/text())"
author_xpath = "string(.//span//small/text())"
keywords_xpath = "string(.//div//meta/@content)"
# NOTE: remove_char_quotes_in_string is defined in a later cell of this notebook;
# that cell must have been run before this one or this line raises NameError.
print(remove_char_quotes_in_string(block.xpath(quote_xpath)))
print(block.xpath(author_xpath))
# The keywords come back as one comma-separated string, so split them into a list.
print(block.xpath(keywords_xpath).split(','))

A day without sunshine is like, you know, night.
Steve Martin
['humor', 'obvious', 'simile']


In [39]:
# Raw extraction result: the text still carries the surrounding curly quotation marks.
block.xpath(quote_xpath)

'“A day without sunshine is like, you know, night.”'

In [24]:
# keywords_xpath is a string(...) expression, so xpath() already returns a single
# string; indexing it with [0] would keep only its first character.
block.xpath(keywords_xpath).split(',')

['change', 'deep-thoughts', 'thinking', 'world']

In [42]:
def remove_char_quotes_in_string(string):
    """Strip the curly quotation marks “ and ” from a string.

    Values that are not ``str`` instances are returned unchanged.
    """
    if not isinstance(string, str):
        return string
    # A translation table that maps both curly quote characters to "deleted".
    return string.translate(str.maketrans('', '', '“”'))

In [None]:
# Re-selects the quote blocks; this is the same query as the earlier quotes_block cell.
quotes_block = tree.xpath('.//div[contains(@class, "quote")]')

In [27]:
# Field name -> XPath expression, evaluated against each quote block.
xpaths = dict(
    quote=quote_xpath,
    author=author_xpath,
    keywords=keywords_xpath,
)

In [48]:
# Build one dict per quote block by evaluating every field's XPath against it.
quotes_list = []
for block in quotes_block:
    quote = {}
    for key, path in xpaths.items():
        extracted = remove_char_quotes_in_string(block.xpath(path))
        # Keywords arrive as one comma-separated string; other fields stay as text.
        quote[key] = extracted.split(',') if key == 'keywords' else extracted
    quotes_list.append(quote)

In [49]:
# Print each scraped record followed by a separator. A plain loop is used because
# a list comprehension run only for print() builds and displays a list of None.
for quote_info in quotes_list:
    print(quote_info, '\n------------')

{'quote': 'The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.', 'author': 'Albert Einstein', 'keywords': ['change', 'deep-thoughts', 'thinking', 'world']} 
------------
{'quote': 'It is our choices, Harry, that show what we truly are, far more than our abilities.', 'author': 'J.K. Rowling', 'keywords': ['abilities', 'choices']} 
------------
{'quote': 'There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.', 'author': 'Albert Einstein', 'keywords': ['inspirational', 'life', 'live', 'miracle', 'miracles']} 
------------
{'quote': 'The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.', 'author': 'Jane Austen', 'keywords': ['aliteracy', 'books', 'classic', 'humor']} 
------------
{'quote': "Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.", 'au

[None, None, None, None, None, None, None, None, None, None]

In [55]:
def get_quotes_info(tree):
    """Extract every quote on a parsed page as a list of dicts.

    Args:
        tree: Parsed page exposing an lxml-style ``xpath()`` method.

    Returns:
        A list of dicts with the keys ``'quote'`` (text without curly
        quotation marks), ``'author'`` and ``'keywords'`` (list of tags;
        empty when the quote has none).
    """
    texts = tree.xpath('.//span[contains(@class, "text")]/text()')
    authors = tree.xpath('.//small[contains(@class, "author")]/text()')
    tag_strings = tree.xpath('.//meta[contains(@class, "keywords")]/@content')
    # The three lists are matched purely by position, so this relies on every
    # quote contributing exactly one entry to each; zip() stops at the shortest.
    return [
        {
            'quote': remove_char_quotes_in_string(text),
            'author': author,
            # ''.split(',') yields [''], so an untagged quote gets [] instead.
            'keywords': tag_string.split(',') if tag_string else [],
        }
        for text, author, tag_string in zip(texts, authors, tag_strings)
    ]

In [52]:
# Print each record returned for the parsed page, followed by a separator. A plain
# loop is used because a list comprehension run only for print() builds and
# displays a list of None.
for quote_info in get_quotes_info(tree):
    print(quote_info, '\n------------')

{'quote': 'The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.', 'author': 'Albert Einstein', 'keywords': ['change', 'deep-thoughts', 'thinking', 'world']} 
------------
{'quote': 'It is our choices, Harry, that show what we truly are, far more than our abilities.', 'author': 'J.K. Rowling', 'keywords': ['abilities', 'choices']} 
------------
{'quote': 'There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.', 'author': 'Albert Einstein', 'keywords': ['inspirational', 'life', 'live', 'miracle', 'miracles']} 
------------
{'quote': 'The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.', 'author': 'Jane Austen', 'keywords': ['aliteracy', 'books', 'classic', 'humor']} 
------------
{'quote': "Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.", 'au

[None, None, None, None, None, None, None, None, None, None]

In [56]:
def get_all_quotes_to_page(last_page=10):
    """Scrape the quotes from pages 1..last_page of quotes.toscrape.com.

    Args:
        last_page: Number of the last page to fetch (inclusive). Values
            below 1 fetch nothing.

    Returns:
        A list with the quote dicts of every page that answered with HTTP
        200, in page order. Pages answering with any other status are skipped.

    Raises:
        requests.exceptions.RequestException: If a request fails outright or
            exceeds the 10-second timeout.
    """
    all_quotes_info = []
    for page in range(1, last_page + 1):
        url = f'https://quotes.toscrape.com/page/{page}/'
        # Pass a timeout so a stalled server cannot block the scrape forever.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            # Back off briefly, then move on to the next page (no retry).
            sleep(2)
            continue
        tree = etree.HTML(response.content)
        all_quotes_info.extend(get_quotes_info(tree))
        # Random 100-199 ms pause between pages to avoid hammering the server.
        sleep(random.randrange(100, 200) / 1000)
    return all_quotes_info

In [None]:
# Scrape pages 1 through 10 and keep the combined list of quote dicts.
quotes = get_all_quotes_to_page(10)

In [67]:
# Scratch check of zip(): pairs the two lists element by element into a lazy iterator.
a = zip([1,2,3],['a','b','c'])

In [71]:
def inf_func():
    """Generator that yields 0, 1, 2, ... without ever finishing."""
    current = -1
    while True:
        current += 1
        yield current

In [72]:
# Rebinds `a` (previously the zip iterator) to a fresh counter generator.
a = inf_func()

In [114]:
# Each call advances the generator by one, so the value shown depends on how many
# times this cell has been executed since `a` was created.
next(a)

41

Scraping do site https://quotes.toscrape.com/tableful