# Task

The purpose of this notebook is to find all the possible field titles that appear in the movie info section, so that we can parse all the needed information.

# Importing needed packages

In [1]:
import os
import pprint
import re
import unicodedata
from pathlib import Path
from typing import List

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

# Functions

In [2]:
def get_page_type(page_name: str) -> str:
    """Classify a saved page as a review page or a front page.

    Args:
        page_name: Page name (or path) to inspect.

    Returns:
        "review" when ``page_name`` contains the substring
        "отзывы и рецензии", otherwise "front".
    """
    is_review = "отзывы и рецензии" in page_name
    return "review" if is_review else "front"

In [6]:
def get_global_titles(data_path: str) -> List[str]:
    """Collect the info-section titles from every front page under ``data_path``.

    The directory is searched recursively for ``*.html`` files. Files that
    ``get_page_type`` classifies as "front" are parsed, and the text of every
    element whose class matches ``styles_title`` inside the first element
    carrying ``data-test-id="encyclopedic-table"`` is collected.

    Args:
        data_path: Root directory containing the saved HTML pages.

    Returns:
        A flat list of title strings in the order they were encountered.
        Duplicates are kept; de-duplication is left to the caller. The list
        is empty when no matching pages are found.
    """
    titles_global: List[str] = []

    pathlist = Path(data_path).rglob("*.html")
    for path in tqdm(pathlist, desc="Parsing movie pages", unit="page"):
        str_path = str(path)
        # The whole path (not only the file name) is checked, as before.
        if get_page_type(str_path) != "front":
            continue

        with open(str_path, "r", encoding="utf-8") as f:
            soup_front = BeautifulSoup(f.read(), "html.parser")

        movie_info = soup_front.find(attrs={"data-test-id": "encyclopedic-table"})
        if movie_info is None:
            # A page without the info table contributes no titles; skipping it
            # keeps one odd page from aborting the whole scan.
            continue

        # Extend the flat result directly instead of building a list of lists
        # and flattening it with sum(..., []), which is quadratic.
        titles_global.extend(
            div.get_text()
            for div in movie_info.find_all(class_=re.compile("styles_title"))
        )

    return titles_global

# Parsing movie info section

In [7]:
# Root directory that get_global_titles searches recursively for saved movie pages (*.html).
movies_path = "data/movies"

In [8]:
# Gather the info-section titles from every front page under movies_path (duplicates included).
movie_titles_global = get_global_titles(movies_path)

Parsing movie pages: 0page [00:00, ?page/s]

In [9]:
# Drop duplicate titles; dict.fromkeys keeps the first occurrence of each key,
# so the order in which titles were first seen is preserved.
movie_titles_global = list(dict.fromkeys(movie_titles_global))
# Bare last expression so the notebook displays the de-duplicated list.
movie_titles_global

['Год производства',
 'Страна',
 'Жанр',
 'Слоган',
 'Режиссер',
 'Сценарий',
 'Продюсер',
 'Оператор',
 'Композитор',
 'Художник',
 'Монтаж',
 'Бюджет',
 'Сборы в США',
 'Сборы в мире',
 'Зрители',
 'Сборы в России',
 'Премьера в Росcии',
 'Премьера в мире',
 'Релиз на DVD',
 'Релиз на Blu-ray',
 'Возраст',
 'Рейтинг MPAA',
 'Время',
 'Цифровой релиз',
 'Маркетинг',
 'Платформа',
 'Ре-релиз (РФ)',
 'Директор фильма']

This is the whole set of titles that were seen in the movie info section for the list of films

# Parsing series info section

In [10]:
# Root directory that get_global_titles searches recursively for saved series pages (*.html).
series_path = "data/series"

In [11]:
# Gather the info-section titles from every front page under series_path (duplicates included).
series_titles_global = get_global_titles(series_path)

Parsing movie pages: 0page [00:00, ?page/s]

In [12]:
# Drop duplicate titles; dict.fromkeys keeps the first occurrence of each key,
# so the order in which titles were first seen is preserved.
series_titles_global = list(dict.fromkeys(series_titles_global))
# Bare last expression so the notebook displays the de-duplicated list.
series_titles_global

['Год производства',
 'Платформа',
 'Страна',
 'Жанр',
 'Слоган',
 'Режиссер',
 'Сценарий',
 'Продюсер',
 'Оператор',
 'Композитор',
 'Художник',
 'Монтаж',
 'Премьера в Росcии',
 'Премьера в мире',
 'Цифровой релиз',
 'Возраст',
 'Время',
 'Бюджет',
 'Релиз на DVD',
 'Сборы в США',
 'Директор фильма',
 'Рейтинг MPAA',
 'Сборы в мире',
 'Релиз на Blu-ray',
 'Сборы в России']

This is the whole set of titles that were seen in the movie info section for the list of series

In [13]:
# Titles that occur on movie pages but never on series pages.
set(movie_titles_global).difference(set(series_titles_global))

{'Зрители', 'Маркетинг', 'Ре-релиз (РФ)'}

We can see that there are a few titles that appear for movies but are not in the list for series - that's something