In [26]:
import codecs
import gzip
import json
import os
import re
import sys
from multiprocessing.dummy import Pool, Queue
from queue import Empty
from time import sleep
from typing import Generator, Dict, Any

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [2]:
def get_page(url, n_attempts=5, t_sleep=1, **kwargs):
    """Fetch ``url`` with ``requests.get``, retrying when an attempt fails.

    Args:
        url: Address to request.
        n_attempts: Maximum number of GET attempts.
        t_sleep: Seconds to wait between two consecutive attempts.
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``requests.get`` (for example ``timeout`` or ``headers``).

    Returns:
        The first response whose ``ok`` flag is true, or ``None`` when every
        attempt failed (or ``n_attempts`` is not positive).
    """
    for attempt in range(n_attempts):
        try:
            response = requests.get(url, **kwargs)
        except requests.RequestException:
            # A network-level failure counts as one failed attempt and is
            # retried, instead of propagating and aborting the whole crawl.
            response = None
        if response is not None and response.ok:
            return response
        # Waiting is only useful when another attempt follows.
        if attempt + 1 < n_attempts:
            sleep(t_sleep)
    return None

In [3]:
# Collect the list of books from one catalogue page
def get_books(url):
    """Return the book identifiers linked from the catalogue page at ``url``.

    The page is downloaded with ``get_page``. Every ``div`` with class
    ``bxr-element-name`` and an id of the form ``bxr-element-name-<digits>``
    is searched for a ``/catalog-products/<identifier>/`` link, and the
    ``<identifier>`` part is collected.

    Args:
        url: Address of the catalogue listing page.

    Returns:
        A list of identifier strings in page order. The list is empty when
        the page could not be downloaded (a message is written to stderr).
    """
    r_get = get_page(url)
    if r_get is None:
        print("\nPage", url, "is not available", file=sys.stderr)
        return []
    soup = BeautifulSoup(r_get.text, "lxml")
    tags = soup.find_all('div', class_='bxr-element-name', id=re.compile('bxr-element-name-[0-9]+'))
    books = []
    for tag in tags:
        # The capture group extracts the identifier directly, with no need
        # to slice a fixed-length prefix off the matched text.
        link = re.search(r'/catalog-products/([^/]+)/', str(tag))
        if link is None:
            # A matching div without a product link is skipped rather than
            # crashing on ``None.group``.
            continue
        books.append(link.group(1))
    return books

In [4]:
# Helper: report that a book page did not have the expected structure
def _report_parsing_failure(url):
    """Write a "parsing failed" message for ``url`` to stderr and return ``None``."""
    print("\nPage", url, "parsing failed", file=sys.stderr)
    return None


# Parse a single page with information about a book
def process_page(url):
    """Download the book page at ``url`` and extract its attributes.

    Args:
        url: Address of the book page.

    Returns:
        A dict describing the book, or ``None`` when the page could not be
        downloaded or a required element is missing (a message is written to
        stderr in both cases). Keys, in insertion order: ``"url"``,
        ``"Название"`` (title), ``"Иллюстрации"`` (image ``data-src`` values),
        ``"Метки"`` (marker labels), ``"Оценка"`` / ``"Число голосов"`` (rating
        value / vote count, ``0`` when the page has no rating), ``"Наличие"``
        (availability text), ``"Цена"`` (price), optionally ``"Цена (скидка)"``
        (discounted price, only when the price script block is found), one
        key per row of the properties table, and ``"Описание"`` (description).
    """
    r_get = get_page(url)
    if r_get is None:
        print("\nPage", url, "is not available", file=sys.stderr)
        return None
    soup = BeautifulSoup(r_get.text, "lxml")
    main = soup.find("div", itemtype="http://schema.org/Product")
    if main is None:
        return _report_parsing_failure(url)
    left_part = main.find("div", attrs={"class": "bxr-element-slider"})
    right_part = main.find("div", attrs={"class": "bxr-preview-detail-col"})
    price_part = main.find("div", attrs={"class": "bxr-prices-detail-col"})
    if left_part is None or right_part is None or price_part is None:
        return _report_parsing_failure(url)

    # Look up every required element before dereferencing any of them, so a
    # missing one yields a "parsing failed" report instead of AttributeError.
    title_tag = main.find("h1", itemprop="name")
    img_part = left_part.find("div", attrs={"class": "bxr-element-slider-main"})
    marks_part = left_part.find("div", attrs={"class": "bxr-ribbon-marker-vertical"})
    rating_part = right_part.find("div", attrs={"class": "bxr-rating-detail"})
    availability = price_part.find("div", attrs={"itemprop": "availability"})
    description = soup.find("div", attrs={"data-scroll": "DETAIL"})
    required = (title_tag, img_part, marks_part, rating_part, availability, description)
    if any(tag is None for tag in required):
        return _report_parsing_failure(url)

    book_info = dict()
    book_info["url"] = url
    book_info["Название"] = title_tag.text

    # Only images whose title equals the book title are collected; images
    # without a data-src attribute are skipped.
    book_info["Иллюстрации"] = [
        img.attrs["data-src"]
        for img in img_part.find_all("img", title=book_info["Название"])
        if "data-src" in img.attrs
    ]

    # The first and last pieces of the split are dropped, as in the text
    # layout "\n<label>\n...\n".
    book_info["Метки"] = marks_part.text.split("\n")[1:-1]

    value = rating_part.find("meta", attrs={"itemprop": "ratingValue"})
    count = rating_part.find("meta", attrs={"itemprop": "ratingCount"})
    if value is not None:
        book_info["Оценка"] = value.attrs["content"]
        book_info["Число голосов"] = count.attrs["content"] if count is not None else 0
    else:
        book_info["Оценка"] = 0
        book_info["Число голосов"] = 0

    book_info["Наличие"] = availability.text

    # The price pair is read from the inline script text. The pattern below
    # embeds the exact line breaks and indentation of that script fragment
    # (the trailing backslashes are part of the raw string), so it must not
    # be reformatted.
    text = ' '.join(str(script) for script in soup.find_all("script"))
    res = re.search(r'\"PRICE\" : \d+(\.\d+)?,\
                \"MEASURE\": \'<span class=\"bxr-detail-measure\">/ шт.</span>\',\
                \"PRICE_FORMAT\": \"\d+(\.\d+)? <span class=\'bxr-detail-currency\'>руб</span>\",\
                \"PRICE_ORIGIN\" : \d+(\.\d+)?,', text)
    if res is not None:
        # After whitespace splitting, token 2 is the PRICE number and token
        # 13 is the PRICE_ORIGIN number; [:-1] strips the trailing comma.
        tokens = res.group(0).split()
        book_info["Цена"] = tokens[13][:-1]
        book_info["Цена (скидка)"] = tokens[2][:-1]
    else:
        price_tag = price_part.find("meta", attrs={"itemprop": "price"})
        if price_tag is None:
            return _report_parsing_failure(url)
        book_info["Цена"] = price_tag.attrs["content"]

    # The table cells alternate name, value. Pairing them with zip ignores a
    # trailing unpaired cell instead of raising IndexError.
    cells = right_part.find_all("td")
    for name_cell, value_cell in zip(cells[0::2], cells[1::2]):
        cell_value = value_cell.text[1:-1]
        if name_cell.text == "Издатель":
            # The publisher cell has 20 trailing characters that are not
            # kept in the record.
            cell_value = cell_value[:-20]
        book_info[name_cell.text] = cell_value

    # Keep runs of non-whitespace and single spaces; drop other whitespace.
    book_info["Описание"] = "".join(re.findall(r"([^\s]+| )", description.text))

    return book_info

In [115]:
def records_reader(filename: str) -> Generator[Dict[str, Any], None, None]:
    """Lazily yield the decoded JSON value stored on each line of a gzip file.

    Args:
        filename: Path of a gzip-compressed file holding one JSON document
            per line.

    Yields:
        The result of ``json.loads`` for every line, in file order. The file
        stays open until the generator is exhausted or closed.
    """
    with gzip.open(filename, "r") as compressed_lines:
        yield from map(json.loads, compressed_lines)

# Load every scraped record into a DataFrame and export it as CSV without
# the index column.
df = pd.DataFrame(data=records_reader("parse.json.gz"))
df.to_csv(path_or_buf='hw_3.csv', index=False)

In [7]:
%%time
# main
base_address = 'https://shop.relod.ru/catalog-products/4577/?sort=PROPERTY_RATING&order=desc&PAGEN_1='
books = []
for page_num in tqdm(range(1, 251)):
    page_address = base_address + str(page_num)
    books.extend(get_books(page_address))

info = []
base_address = "https://shop.relod.ru/catalog-products/"
with gzip.open('parse.json.gz', mode='wb') as f_json:
    f_json = codecs.getwriter('utf8')(f_json)
    for book in tqdm(books):
        book_url = base_address + book + "/"
        record = process_page(book_url)
        if record is None:
            continue
        record_str = json.dumps(record, ensure_ascii=False)
        print(record_str, file=f_json)

 24%|███████████████████▍                                                             | 60/250 [00:22<01:11,  2.67it/s]


KeyboardInterrupt: 

In [10]:
%%time
# main
base_address = 'https://shop.relod.ru/catalog-products/4577/?sort=PROPERTY_RATING&order=desc&PAGEN_1='
books = []
for page_num in tqdm(range(1, 251)):
    page_address = base_address + str(page_num)
    books.extend(get_books(page_address))

base_address = "https://shop.relod.ru/catalog-products/"
queue = Queue()   
for book in books:
    queue.put(base_address + book + "/")

def process_page_wrapper(i):
    with gzip.open('parse.json.gz', mode='wb') as f_json:
        f_json = codecs.getwriter('utf8')(f_json)

        while not queue.empty():
            record = process_page(queue.get())
            record_str = json.dumps(record, ensure_ascii=False)
            print(record_str, file=f_json)

            # счетчик должен атомарно обновиться
            with lock:
                pbar.update(1)


with Pool(processes=8) as pool, tqdm(total=queue.qsize()) as pbar:
    lock = pbar.get_lock()
    pool.map(process_page_wrapper, range(pool._processes))

  0%|                                                                                         | 0/5000 [00:00<?, ?it/s]

False


100%|██████████████████████████████████████████████████████████████████████████████| 5000/5000 [37:03<00:00,  2.25it/s]

Wall time: 37min 3s





In [6]:
def func(p):
    """Multiply the integers 1..999 together and discard the product.

    The argument ``p`` is not used and nothing is returned (the call
    evaluates to ``None``).
    """
    product = 1
    factor = 1
    while factor < 1000:
        product *= factor
        factor += 1

In [48]:
#%%timeit
def doubler(number):
    """Print the id of the current process and return ``number * 2``."""
    current_pid = os.getpid()
    print(current_pid)
    doubled = number * 2
    return doubled
 
# Double a few sample numbers on a pool of three workers.
numbers = [5, 10, 20]
# The context manager shuts the pool down once the work is done, so its
# workers are not left running in the kernel.
with Pool(processes=3) as pool:
    doubled = pool.map(doubler, numbers)
# A bare last expression keeps the result visible as the cell output.
doubled

10416
10416
10416


[10, 20, 40]