In [2]:
import pandas as pd
import requests
from requests import request
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool as ThreadPool
import functools
import time
from tqdm.notebook import tqdm
import sys
from itertools import chain
import re
import functools

In [10]:
# Load the author slugs, one per line. The context manager closes the file
# handle deterministically instead of leaving it to the garbage collector.
with open('authors.txt', 'r') as authors_file:
    stripped_lines = (line.strip() for line in authors_file)
    # Blank lines are skipped: an empty slug would otherwise be appended to
    # the '/authors/' base URL and fetched as if it were an author page.
    authors = [slug for slug in stripped_lines if slug]

In [22]:
def safe(func):
    """Wrap ``func`` so that ordinary exceptions are logged instead of raised.

    The wrapper forwards all positional and keyword arguments to ``func`` and
    returns its result. If the call raises an ``Exception``, a message naming
    the function and the error is printed to stderr and an empty list is
    returned instead.

    Only ``Exception`` subclasses are intercepted; ``KeyboardInterrupt``,
    ``SystemExit`` and other ``BaseException`` subclasses propagate, so a
    long-running job can still be interrupted.

    Args:
        func: The callable to protect.

    Returns:
        A wrapper that preserves the metadata of ``func`` (via
        ``functools.wraps``).
    """
    @functools.wraps(func)
    def newfunc(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as exc:
            # Not every callable has __name__ (e.g. functools.partial objects).
            func_name = getattr(func, '__name__', repr(func))
            print("Error in {}: {!r}".format(func_name, exc), file=sys.stderr)
            return []
    return newfunc

@safe
def get_page(url, n_attempts=5, t_sleep=1, *, timeout=30, **kwargs):
    """GET ``url``, retrying until a successful response is received.

    One initial request is made, followed by up to ``n_attempts`` retries.
    Before each retry the function sleeps for ``t_sleep`` seconds. A retry
    happens both when the response status is not OK and when the request
    itself raises a ``requests.RequestException`` (connection error,
    timeout, ...).

    Args:
        url: Address to fetch.
        n_attempts: Maximum number of retries after the first request.
        t_sleep: Pause in seconds before each retry.
        timeout: Seconds ``requests`` waits for the server before giving up
            on a single request (keyword-only). Without it a stalled
            connection could block a worker indefinitely.
        **kwargs: Sent as query-string parameters (``params=``).

    Returns:
        The last ``requests.Response`` obtained. It may still be a non-OK
        response if no attempt succeeded, so callers must check ``.ok``.

    Raises:
        requests.RequestException: If every attempt failed before producing
            any response. Because the function is decorated with ``@safe``,
            callers see this as a logged error and an empty-list return value.
    """
    response = None
    last_error = None
    # max() guards against a negative n_attempts: one request is always made.
    for attempt in range(max(n_attempts, 0) + 1):
        if attempt:
            time.sleep(t_sleep)
        try:
            response = requests.get(url, params=kwargs, timeout=timeout)
        except requests.RequestException as exc:
            last_error = exc
            continue
        if response.ok:
            break
    if response is None:
        # No attempt produced a response; surface the last network error.
        raise last_error
    return response
@safe
def get_cur_list(url):
    """Return the product links found on one listing page.

    Args:
        url: Full URL of the listing page to fetch.

    Returns:
        list: The ``href`` value of every ``<a>`` tag carrying the
        ``rd-listing-product-item__image-wrapper`` class. An empty list is
        returned when the page could not be fetched (the URL is printed to
        stderr) or, through ``@safe``, when parsing raised.
    """
    page = get_page(url, n_attempts=5, t_sleep=1)
    # get_page returns an empty list (not a Response) when the request raised,
    # so read `.ok` defensively instead of triggering an AttributeError.
    if not getattr(page, 'ok', False):
        print(url, file=sys.stderr)
        return []
    # Parse the document once, with lxml; a second parse with another parser
    # would only be thrown away.
    soup = BeautifulSoup(page.text, 'lxml')
    anchors = soup.find_all('a', class_="rd-listing-product-item__image-wrapper")
    return [anchor.attrs['href'] for anchor in anchors]

@safe
def get_list(url):
    """Collect the product links from every listing page of one author.

    The author page is fetched first to read the total number of products
    from the ``rd-listing-count__total`` counter. The paginated listing
    (``?page=1``, ``?page=2``, ...) is then walked through ``get_cur_list``
    until that many links have been gathered.

    Args:
        url: Author slug, appended to ``https://www.respublica.ru/authors/``.

    Returns:
        list: Links gathered from the listing pages. The list may be shorter
        than the advertised total if a page yields nothing. An empty list is
        returned when the author page cannot be fetched or has no counter
        (the URL is printed to stderr) or, through ``@safe``, when an
        exception is raised.
    """
    url = 'https://www.respublica.ru/authors/' + url
    page = get_page(url, n_attempts=5, t_sleep=1)
    # get_page returns an empty list (not a Response) when the request raised.
    if not getattr(page, 'ok', False):
        print(url, file=sys.stderr)
        return []

    soup = BeautifulSoup(page.text, 'html.parser')
    total_tag = soup.find('span', class_="rd-listing-count__total")
    if total_tag is None:
        # Without the counter there is no way to know how many pages to read.
        print(url, file=sys.stderr)
        return []
    num = int(total_tag.text)

    links = []
    cur_page = 1
    while len(links) < num:
        page_links = get_cur_list(str(url) + '?page=' + str(cur_page))
        if not page_links:
            # An empty page means a failed request or the end of the listing.
            # Stop here, otherwise the loop would request pages forever when
            # the counter overstates the number of reachable items.
            break
        links += page_links
        cur_page += 1
    return links

In [23]:
# Fetch the listing of every author concurrently. ThreadPool is the
# thread-based pool from multiprocessing.dummy; results arrive in completion
# order, so `pages` is not aligned with `authors`.
with ThreadPool(10) as pool:
    listings = pool.imap_unordered(get_list, authors)
    # tqdm advances the bar as each result is consumed.
    pages = list(tqdm(listings, total=len(authors)))
    # All results are in; shut the pool down cleanly before the context
    # manager terminates it.
    pool.close()
    pool.join()

HBox(children=(FloatProgress(value=0.0, max=35.0), HTML(value='')))




In [24]:
@safe
def process_page(url):
    """Scrape one product page into a flat dict of book attributes.

    ``url`` is a site-relative path; it is appended to the respublica.ru root,
    fetched through ``get_page`` and parsed with lxml.

    Returns:
        dict: Always-present keys (ID, URL, title, authors, image,
        description, price, availability, category) plus optional ones
        (preview, old price, rating figures) and one entry per row of the
        product parameter table. ``{}`` is returned when the response is not
        OK (the URL is printed to stderr). Because of ``@safe``, any exception
        raised while fetching or parsing results in ``[]`` instead.
    """
    site_root = 'https://www.respublica.ru'
    url = site_root + url

    response = get_page(url, n_attempts=5, t_sleep=1)
    if not response.ok:
        print(url, file=sys.stderr)
        return {}
    soup = BeautifulSoup(response.text, 'lxml')

    # Mandatory fields: a missing tag raises here and @safe takes over.
    sku_box = soup.find('div', class_="rd-page-product__article")
    brand_links = soup.find_all('a', itemprop="brand")
    book_info = {
        "ID": sku_box.find('span', itemprop="sku").text,
        "URL": url,
        "Название": soup.find('h1', class_="rd-page-product__title").text,
        "Автор": ';'.join(link.text for link in brand_links),
    }

    # Optional PDF preview.
    preview_box = soup.find('div', class_="rd-page-product__pages-preview-container hidden")
    if preview_box:
        pdf_href = preview_box.find('a', class_="download-pdf").attrs['href']
        book_info["Превью"] = site_root + pdf_href

    zoom_src = soup.find('img', class_="rd-page-product__img").attrs['data-zoom-image']
    book_info["Изображение"] = site_root + zoom_src

    book_info["Описание"] = soup.find('div', class_="rd-page-product__desc-body").text
    book_info["Цена"] = int(soup.find('meta', itemprop="price").attrs['content'])

    # Optional previous price; only the text before the first space is parsed.
    old_price_box = soup.find('div', class_="rd-page-product__price-old")
    if old_price_box:
        old_price_text = old_price_box.find('span', class_="prev").text
        book_info["Цена (старая)"] = int(re.split(r' ', old_price_text)[0])

    # Availability is read from the CSS classes of the first link in the
    # buttons container.
    buy_link_classes = soup.find('div', class_="rd-page-product__buttons").find('a').attrs['class']
    book_info["В наличии"] = 'rd-page-product__buy_status_available' in buy_link_classes

    crumbs = soup.find_all('span', class_="rd-page-breadcrumbs-item")
    book_info["Категория"] = ';'.join(
        crumb.find('span', itemprop="name").text for crumb in crumbs
    )

    # Optional aggregate rating: three <meta> values, all stored as floats.
    rating = soup.find('span', itemprop="aggregateRating")
    if rating:
        for column, prop in (
            ("Число отзывов", 'reviewCount'),
            ("Число оценок", 'ratingCount'),
            ("Оценка", 'ratingValue'),
        ):
            book_info[column] = float(rating.find('meta', itemprop=prop).attrs['content'])

    # Every row of the parameter table becomes its own key/value entry.
    params = soup.find('div', class_="rd-page-product__desc-params")
    for param in params.find_all('p', class_="rd-page-product__desc-param"):
        name = param.find(itemprop='name').text
        book_info[name] = param.find(itemprop='value').text
    return book_info

In [25]:
# Flatten the per-author link lists into one list of product links.
# dict.fromkeys drops repeated links while keeping first-seen order, so a
# product that appears in more than one author's listing is scraped only once.
# NOTE(review): assumes such a product is listed under the same href each
# time; confirm against the site.
urls = list(dict.fromkeys(chain.from_iterable(pages)))
len(urls)

2456

In [15]:
# Scrape every product page concurrently; results arrive in completion order.
result = []
with ThreadPool(10) as pool:
    records = pool.imap_unordered(process_page, urls)
    # tqdm advances the bar as each result is consumed.
    for book_inf in tqdm(records, total=len(urls)):
        # process_page returns {} for a non-OK response and, through @safe,
        # [] when parsing raised. Neither is a usable record, so keep only
        # non-empty dicts.
        if book_inf:
            result.append(book_inf)
    # All results are in; shut the pool down cleanly before the context
    # manager terminates it.
    pool.close()
    pool.join()

HBox(children=(FloatProgress(value=0.0, max=2456.0), HTML(value='')))




In [8]:
%%time
# Build the table from the scraped records and order it by product ID.
# sort_values returns a new frame, so re-running the cell is idempotent.
df = pd.DataFrame(result).sort_values(by=['ID'])

# Pass the path to pandas instead of a text-mode handle: pandas then opens
# the file with newline='' itself, which avoids doubled line endings on
# platforms that translate '\n' on write.
df.to_csv('data/hw_3.csv', index=False, encoding='utf-8')

CPU times: user 115 ms, sys: 12.4 ms, total: 127 ms
Wall time: 125 ms


In [11]:
# Small hand-made mapping of integer keys to space-joined tokens.
# NOTE(review): this rebinds `result`, which an earlier cell uses for the list
# of scraped book records. Consider a distinct name.
result = {
    1: ' '.join(['ixgnu', 'xbnie', 'wndi']),
    2: ' '.join(['ixgnu2', 'xbni2e', 'wn2di', 'nxferbi']),
}

In [5]:
# Write the frame, index column included. The path is passed to pandas
# directly so that it opens the file with newline='' itself; a text-mode
# handle opened without it can produce doubled line endings on platforms
# that translate '\n' on write.
df.to_csv('data/hw_311111111111.csv', encoding='utf-8')

In [13]:
# Dump every entry of `result` to out.csv as one "key,value" line.
with open('out.csv','w') as out:
    out.writelines('{},{}\n'.format(*pair) for pair in result.items())