In [1]:
# IMPORTS
from skimage.io import imread
from bs4 import BeautifulSoup
import pandas as pd
import requests

In [3]:
# GLOBALS

# Base address of the scraped site; relative links found in pages are appended to it.
URL_BOOXMIX = "https://bookmix.ru"
# Path and query string of the review listing; ``%d`` is filled with the ``begin`` offset.
OPTIONS_BOOXMIX = "/reviews.phtml?option=all&begin=%d&num_point=10&num_points=10"

# Column order of every scraped row and of the exported CSV file.
CSV_COLUMNS = [
    "name",
    "author",
    "description",
    "marks",
    "review",
    "image",
    "height",
    "url_to_book",
    "label",
]

In [7]:
def extract_news(parser):
    """Scrape every review listed on one listing page into ``BOOKS_CSV``.

    For each ``<li>`` inside the page's ``ul.books_list`` element, the linked
    review page and book page are downloaded, the fields named in
    ``CSV_COLUMNS`` are collected (``label`` is left as ``None``), and the
    resulting rows are appended to the module-level ``BOOKS_CSV`` frame.

    Args:
        parser: Parsed listing page exposing the BeautifulSoup ``find`` API.

    Returns:
        None. The result is delivered by rebinding the global ``BOOKS_CSV``.

    Entries that cannot be scraped (missing tags, failed download, unreadable
    cover image, ...) are reported and skipped, so a single bad entry does not
    abort the rest of the page.
    """
    global BOOKS_CSV

    books_list = parser.find("ul", {"class": "books_list"})
    if books_list is None:
        # Not a listing page (or the markup changed): nothing to extract.
        return

    rows = []
    for position, item in enumerate(books_list.find_all("li")):
        try:
            inner = item.find("div", {"class": "inner"})
            title = inner.find("h4")
            url_to_review = title.a["href"]
            url_to_book = inner.find("div", {"class": "authors"}).span.a["href"]

            review_soup = get_soup(URL_BOOXMIX + url_to_review)
            book_soup = get_soup(URL_BOOXMIX + url_to_book)

            name = title.text
            author = item.find_all("div", {"class": "authors"})[-1].span.text.replace("\n", "")
            description = extract_description(book_soup)
            marks = extract_marks(book_soup)
            review = extract_review(review_soup).replace("\r", "").replace("\n", "")
            image = extract_image(book_soup)

            # First dimension of the decoded cover image (its pixel height).
            height = imread(image).shape[0]
        except Exception as exc:
            # Best-effort scraping: skip the entry, but say why. Catching
            # ``Exception`` (not a bare ``except``) keeps Ctrl-C / kernel
            # interrupts working during a long run.
            print(f"extract_news: skipping entry {position}: {exc!r}")
            continue

        rows.append([name, author, description, marks, review, image, height,
                     URL_BOOXMIX + url_to_book, None])

    if rows:
        # One concat per page instead of one per row; ``DataFrame.append``
        # no longer exists in pandas 2.x.
        page_frame = pd.DataFrame(rows, columns=CSV_COLUMNS)
        BOOKS_CSV = pd.concat([BOOKS_CSV, page_frame], ignore_index=True)
            

        
def get_soup(url, timeout=30):
    """Download ``url`` and return its body parsed as a ``BeautifulSoup`` tree.

    Args:
        url: Absolute address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on a stalled connection.

    Returns:
        BeautifulSoup: The page parsed with the built-in ``html.parser``.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    response = requests.get(url, timeout=timeout)
    # Decode with the encoding detected from the body rather than the one
    # requests picked from the response headers.
    response.encoding = response.apparent_encoding
    return BeautifulSoup(response.text, "html.parser")


def extract_review(soup):
    """Return the review text of a parsed review page.

    The text of the first ``div.comment-text`` element comes first, followed
    by the text of every ``p.br`` element on the page, joined without any
    separator.
    """
    pieces = [soup.find("div", {"class": "comment-text"}).text]
    pieces.extend(paragraph.text for paragraph in soup.find_all("p", {"class": "br"}))

    return "".join(pieces)
    

def extract_image(soup):
    """Return the ``src`` attribute of the image inside the page's first ``div.thumb``."""
    thumb = soup.find("div", {"class": "thumb"})

    return thumb.img["src"]


def extract_description(soup):
    """Return the text of the page's first ``<p itemprop="about">`` element."""
    about = soup.find("p", {"itemprop": "about"})

    return about.text


def extract_marks(soup):
    """Return the page's keyword tags as one ``;``-terminated string.

    Every ``<a>`` inside every ``<div itemprop="keywords">`` contributes its
    text followed by ``;``. The result is ``""`` when there are no such links.
    """
    return "".join(
        link.text + ";"
        for block in soup.find_all("div", {"itemprop": "keywords"})
        for link in block.find_all("a")
    )


def get_news_from_bookmix(n_pages=1):
    """Scrape ``n_pages`` review listing pages of bookmix.ru into ``BOOKS_CSV``.

    Listing pages are requested with ``begin`` offsets ``10 * n_pages`` down
    to ``10`` (highest offset first) and each one is handed to
    ``extract_news``, which appends the scraped rows to the global frame.

    Args:
        n_pages: Number of listing pages to fetch. Zero or a negative value
            fetches nothing.

    Returns:
        None.
    """
    # Matches ``num_point=10`` in OPTIONS_BOOXMIX; presumably the number of
    # reviews per listing page. TODO confirm against the site.
    offset_step = 10

    # NOTE(review): offset 0 is never requested, so if ``begin`` is zero-based
    # the first listing page is skipped. Confirm before changing.
    # A bounded ``range`` replaces ``while n_pages:``, which never terminated
    # for negative or fractional input.
    for page in range(n_pages, 0, -1):
        soup = get_soup(URL_BOOXMIX + OPTIONS_BOOXMIX % (offset_step * page))
        extract_news(soup)

In [8]:
# Start from an empty frame that already carries the CSV schema, so the
# exported file has a header row even when no review could be scraped.
BOOKS_CSV = pd.DataFrame(columns=CSV_COLUMNS)
get_news_from_bookmix(100)  # number of listing pages to scrape

BOOKS_CSV.to_csv("books.csv", index=False)