In [101]:
import asyncio
import aiohttp
import curses
from bs4 import BeautifulSoup

In [102]:
import re

def format_film(soup):
    """Extract film attributes from a parsed film detail page.

    Every field is looked up independently, so a page that lacks some of the
    expected elements still produces a result instead of raising.

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find`` / ``find_all``
            API.

    Returns:
        dict with the keys:
            'description' - stripped text of the third child node of the
                element with id ``event-description``, or None.
            'genres'      - list of link texts inside ``td.genre``, or None
                when that cell is absent.
            'year'        - text of ``td.year``, or None.
            'country'     - stripped text of ``td.author``, or None.
            'duration'    - first run of digits in ``td.duration`` (str), or
                None.
            'producer'    - text following 'Режиссер: ', or None.
            'actors'      - list of names following 'В ролях: ' in the first
                matching ``<p>``; empty list when there is none.
            'rating'      - text of the ``rating-big__value`` element, or None.
    """
    description = None
    description_tag = soup.find(id='event-description')
    if description_tag:
        children = description_tag.contents
        # Only a plain text node in the third slot is usable; a shorter node
        # list or a nested tag in that position yields None rather than an
        # IndexError / TypeError.
        if len(children) > 2 and isinstance(children[2], str):
            description = children[2].strip()

    genres = soup.find('td', class_='genre')
    if genres:
        # select() takes a CSS selector; a class_ keyword is not a filter there.
        genres = [link.text for link in genres.select('a')]

    year = soup.find('td', class_='year')
    if year:
        year = year.text

    country = soup.find('td', class_='author')
    if country:
        country = country.text.strip()

    duration = None
    duration_tag = soup.find('td', class_='duration')
    if duration_tag:
        digits = re.search('([0-9]+)', duration_tag.text.strip())
        # A duration cell without any digits gives None instead of crashing.
        duration = digits.group(1) if digits else None

    producer_re = re.compile('Режиссер: (.*)')
    producer = soup.find(string=producer_re)
    if producer:
        producer_match = producer_re.search(producer)
        producer = producer_match.group(1) if producer_match else None

    actors_re = re.compile('В ролях: (.*)')
    actors = []
    for paragraph in soup.find_all('p'):
        if not paragraph.find(string=actors_re):
            continue
        # Search the paragraph's full text (not just the matching string node)
        # so names that continue in sibling nodes are included.
        actors_match = actors_re.search(paragraph.text)
        if actors_match:
            actors = [name.strip() for name in actors_match.group(1).split(',')]
        break  # only the first paragraph mentioning the cast is used

    rating = soup.find(class_='rating-big__value')
    if rating:
        rating = rating.text

    return {
        'description': description,
        'genres': genres,
        'year': year,
        'country': country,
        'duration': duration,
        'producer': producer,
        'actors': actors,
        'rating': rating,
    }

In [103]:
from pipetools import pipe
from IPython.display import clear_output

def flatten(list_):
    """Merge the items of every sub-iterable in ``list_`` into one flat list."""
    merged = []
    for sublist in list_:
        merged.extend(sublist)
    return merged
def _list_items(ul):
    """Return the ``<li class="lists__li">`` elements found under ``ul``."""
    return ul.find_all('li', class_='lists__li')


# Pipeline: look up the list items of every <ul>, materialise the per-<ul>
# results, then merge them into a single flat list.
get_li_list = pipe | (map, _list_items) | list | flatten

async def fetch_html(url, session):
    """Download ``url`` with ``session`` and return the response body as text.

    The response is used as an async context manager so it is released as soon
    as the body has been read, instead of lingering until garbage collection.

    Args:
        url: Address to request with HTTP GET.
        session: Client session whose ``get(url)`` returns an async context
            manager yielding a response with an awaitable ``text()``.

    Returns:
        The decoded response body.
    """
    async with session.get(url) as response:
        return await response.text()

async def get_soup_from_url(url, session):
    """Fetch ``url`` through ``session`` and parse the body with 'html.parser'."""
    return BeautifulSoup(await fetch_html(url, session), 'html.parser')

async def get_films_list(url, session):
    """Return the film list items found on the listing page at ``url``.

    The page is fetched and parsed, every ``<ul class="list_afisha">`` is
    collected, and the result of ``get_li_list`` on those lists is returned.
    """
    page = await get_soup_from_url(url, session)
    return get_li_list(page.find_all('ul', class_='list_afisha'))

async def fetch_film_data(url, session):
    """Fetch the film page at ``url`` and return ``format_film`` of its parsed soup."""
    return format_film(await get_soup_from_url(url, session))

async def get_film_data(film_soup, index, session):
    """Build the full record for one entry of the films listing.

    Args:
        film_soup: Parsed list item for a single film.
        index: Position of the item in the listing; not used here, kept so
            existing callers can keep passing it.
        session: HTTP client session forwarded to ``fetch_film_data``.

    Returns:
        dict with 'name', 'img_url' and 'url' taken from the list item, merged
        with the fields returned by ``fetch_film_data`` for the film's page;
        None when the item has no ``<a class="name">`` link. 'img_url' is None
        when the item has no ``<img>`` or the image has no ``src``.
    """
    a_name = film_soup.find('a', class_='name')
    if not a_name:
        return None

    img = film_soup.find('img')
    link_to_film = a_name['href']

    film_data = await fetch_film_data(link_to_film, session)

    result = {
        'name': a_name.text.strip(),
        # A list item without a poster must not abort the whole scrape.
        'img_url': img.get('src') if img else None,
        'url': link_to_film,
    }
    result.update(film_data)
    return result

In [104]:
from aiohttp import ClientSession

async with ClientSession() as session:
    films_list = await get_films_list('https://afisha.tut.by/day/film/2020-11-01/2020-11-28/', session)
    
    films_list_len = len(films_list)
    processed = []

    async def get_film_data_with_logging(film_soup, index, session):
        global films_list_len

        film = await get_film_data(film_soup, index, session)
        processed.append(index)

        clear_output(wait=True)
        print(f'Finished processing {len(processed)} of {films_list_len} films')

        return film
    
    films = await asyncio.gather(
        *[get_film_data_with_logging(film_soup, index, session) for index, film_soup in enumerate(films_list)]
    )
    

Finished processing 93 of 93 films


In [105]:
import pandas as pd

# get_film_data returns None for list items without a title link; drop those
# so the DataFrame constructor only receives dict records.
df_films = pd.DataFrame([film for film in films if film is not None])

In [106]:
# Display the assembled films table as the cell output.
df_films

Unnamed: 0,name,img_url,url,description,genres,year,country,duration,producer,actors,rating
0,Взаперти (SUB),https://img.afisha.tut.by/static/media/176x0ec...,https://afisha.tut.by/film/vzaperti-sub/,Гиперзаботливая мать Диана Шерман растит дочь ...,"[Детектив, Триллер, Ужасы]",2020,США,90,Аниш Чаганти,"[Сара Полсон, Кира Аллен, Онали Эймс, Пэт Хили...",
1,TheatreHD: Золотая маска: Иранская конференция,https://img.afisha.tut.by/static/media/176x0ec...,https://afisha.tut.by/film/theatrehd_zolotaya_...,Эта постановка по пьесе знаменитого драматурга...,,2020,,120,,[],
2,Непосредственно Каха,https://img.afisha.tut.by/static/media/176x0ec...,https://afisha.tut.by/film/neposredstvenno_kakha/,"Каха запал на прекрасную Софу, но, чтобы подоб...",[Комедия],2020,Россия,117,,"[Артем Карокозян, Артем Калайджян, Марина Кале...",63
3,Афера по-голливудски,https://img.afisha.tut.by/static/media/176x0ec...,https://afisha.tut.by/film/afera-po-gollivudski/,"Макс Барбер – типичный голливудский продюсер, ...","[Боевик, Комедия]",2020,США,110,Джордж Галло,"[Роберт Де Ниро, Морган Фриман, Томми Ли Джонс...",63
4,Взаперти,https://img.afisha.tut.by/static/media/176x0ec...,https://afisha.tut.by/film/vzaperti/,Гиперзаботливая мать Диана Шерман растит дочь ...,"[Детектив, Триллер, Ужасы]",2020,США,90,Аниш Чаганти,"[Сара Полсон, Кира Аллен, Онали Эймс, Пэт Хили...",67
...,...,...,...,...,...,...,...,...,...,...,...
88,TheatreHD: Золотая маска: Гамлет,https://img.afisha.tut.by/static/media/176x0ec...,https://afisha.tut.by/film/theatrehd-zolotaya-...,Театр: Малый драматический театр — Театр Европы,,2020,Россия,120,Лев Додин,[],
89,TheatreHD: Волшебная флейта,https://img.afisha.tut.by/static/media/176x0ec...,https://afisha.tut.by/film/theatrehd_volshebna...,,,,,,,[],67
90,TheatreHD: Щелкунчик,https://img.afisha.tut.by/static/media/176x0ec...,https://afisha.tut.by/film/theatrehd-shchelkun...,Балет в 2-х действиях,,2018,,144,,[],100
91,TheatreHD: Дрянь,https://img.afisha.tut.by/static/media/176x0ec...,https://afisha.tut.by/film/theatrehd-dryan/,Страна: Великобритания,,,Великобритания,84,,[],50
