In [2]:
import pandas as pd
import numpy as np
from selenium import webdriver
import requests
from bs4 import BeautifulSoup
import lxml
import json
import time
import csv
import random

In [3]:
# Библиотеки для веб-драйвера Selenium

from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service

# ChromeDriverManager().install() supplies the chromedriver executable path;
# the resulting Service object is what webdriver.Chrome is started with later.
service = Service(executable_path=ChromeDriverManager().install())

In [11]:
# Merge the per-year link files into a single dataframe

lnk_csv = 'https://raw.githubusercontent.com/ArtemDorofeev/Kinopoisk/main/csv/df_{}.csv'

# Collect the yearly frames first and concatenate once at the end:
# calling pd.concat inside the loop re-copies everything accumulated so far
# on every iteration.
yearly_frames = []
for i in range(2015, 2023):
    data = pd.read_csv(lnk_csv.format(i))
    data['year'] = i  # remember which yearly file each row came from
    yearly_frames.append(data)

df_full = pd.concat(yearly_frames, ignore_index=True)

In [12]:
# Keep only the columns used downstream; this also drops the stray index
# column that came from the CSV files.
df_full = df_full.loc[:, ['name', 'links', 'year']]
df_full

Unnamed: 0,name,links,year
0,Игра престолов,https://www.kinopoisk.ru/series/464963/,2015
1,Рик и Морти,https://www.kinopoisk.ru/series/685246/,2015
2,Легенда,https://www.kinopoisk.ru/film/839954/,2015
3,Марсианин,https://www.kinopoisk.ru/film/841700/,2015
4,Головоломка,https://www.kinopoisk.ru/film/645118/,2015
...,...,...,...
377994,Музыка летней ночи,https://www.kinopoisk.ru/film/5073920/,2022
377995,Пингвины,https://www.kinopoisk.ru/film/5074010/,2022
377996,Прогресс,https://www.kinopoisk.ru/film/5074011/,2022
377997,Прятки,https://www.kinopoisk.ru/film/5074012/,2022


In [13]:
# Remove duplicated links (the first occurrence of each link is kept)

duplicate = df_full['links'].duplicated()
# Not the last expression of the cell, so this count is computed but not shown.
duplicate.value_counts()

# Drop by index label so that `df` is a fresh frame rather than a slice of df_full.
df = df_full.drop(index=df_full.index[duplicate.to_numpy()])
df

In [15]:
# Classify each link as a film or a series

def split_serials(row):
    """Return 'film' when `row` contains 'film', otherwise 'series'.

    `row` is a single value of the `links` column (a URL string); the check is
    a plain membership test.
    """
    return 'film' if 'film' in row else 'series'

In [21]:
# Label every link as 'film' or 'series' based on its URL.
df['type'] = df['links'].map(split_serials)
df

Unnamed: 0,name,links,year,type
0,Игра престолов,https://www.kinopoisk.ru/series/464963/,2015,series
1,Рик и Морти,https://www.kinopoisk.ru/series/685246/,2015,series
2,Легенда,https://www.kinopoisk.ru/film/839954/,2015,film
3,Марсианин,https://www.kinopoisk.ru/film/841700/,2015,film
4,Головоломка,https://www.kinopoisk.ru/film/645118/,2015,film
...,...,...,...,...
377994,Музыка летней ночи,https://www.kinopoisk.ru/film/5073920/,2022,film
377995,Пингвины,https://www.kinopoisk.ru/film/5074010/,2022,film
377996,Прогресс,https://www.kinopoisk.ru/film/5074011/,2022,film
377997,Прятки,https://www.kinopoisk.ru/film/5074012/,2022,film


In [22]:
# Split the dataset by year

# .copy() makes every yearly frame independent of `df`, so adding a column to
# one of them later does not raise SettingWithCopyWarning.
df_2015 = df[df['year'] == 2015].copy()
df_2016 = df[df['year'] == 2016].copy()
df_2017 = df[df['year'] == 2017].copy()
df_2018 = df[df['year'] == 2018].copy()
df_2019 = df[df['year'] == 2019].copy()
df_2020 = df[df['year'] == 2020].copy()
df_2021 = df[df['year'] == 2021].copy()
df_2022 = df[df['year'] == 2022].copy()

In [23]:
# Display the 2016 subset to check the split.
df_2016

Unnamed: 0,name,links,year,type
36950,Зверополис,https://www.kinopoisk.ru/film/775276/,2016,film
36952,Мажор,https://www.kinopoisk.ru/series/820638/,2016,series
36953,Доктор Стрэндж,https://www.kinopoisk.ru/film/409600/,2016,film
36954,Дэдпул,https://www.kinopoisk.ru/film/462360/,2016,film
36955,Шерлок,https://www.kinopoisk.ru/series/502838/,2016,series
...,...,...,...,...
79495,Eiland,https://www.kinopoisk.ru/film/996604/,2016,film
79496,K.E.R.O.S.E.N.E poems from the planet,https://www.kinopoisk.ru/film/996606/,2016,film
79497,Cooker,https://www.kinopoisk.ru/film/996607/,2016,film
79498,"Devils & Angels, Ritual Feasts in Europe",https://www.kinopoisk.ru/film/996608/,2016,film


In [24]:
# Build the soup object

def get_soup(url):
    """Open `url` in the global Selenium `driver` and parse the resulting page.

    Returns a BeautifulSoup object built from `driver.page_source` with the
    "lxml" parser. `driver` must already exist when this is called.
    """
    driver.get(url)
    return BeautifulSoup(driver.page_source, "lxml")

In [25]:
# Collect the film characteristics

def get_top(row):
    """Extract the header fields of a title page.

    Parameters
    ----------
    row : parsed page exposing ``select_one`` (the soup returned by get_soup).

    Returns
    -------
    list
        A one-element list holding a dict with the keys 'name', 'description',
        'rating' and 'grades'. Each value is the element's text, or an empty
        list when the element is not present on the page.
    """
    def _text_or_empty(selector):
        # A missing element makes select_one return None, whose .get_text()
        # raises AttributeError. Only that case is treated as "no value", so
        # KeyboardInterrupt and genuine errors are no longer swallowed.
        try:
            return row.select_one(selector).get_text()
        except AttributeError:
            return []

    dic = {
        'name': _text_or_empty('h1[class*="styles_title"]'),
        'description': _text_or_empty('div[class*="styles_topText"]'),
        'rating': _text_or_empty('span[class*="film-rating-value"]'),
        'grades': _text_or_empty('div[class*="styles_countBlock"]'),
    }
    return [dic]

def get_about(row):
    """Collect the rows of the "encyclopedic table" block as a list of dicts.

    Parameters
    ----------
    row : parsed page exposing ``select_one`` (the soup returned by get_soup).

    Returns
    -------
    list of dict
        One entry per matched table row, shaped ``{title_text: value_elements}``.
        ``value_elements`` is the raw result of ``select`` for the value cells
        (elements, not text). A row without a title cell contributes ``{}``.
        An empty list is returned when the page has no such table.
    """
    about_block = row.select_one('div[data-test-id*="encyclopedic-table"]')
    if about_block is None:
        # No table on this page: report "nothing found" instead of raising
        # AttributeError and aborting the whole scraping run.
        return []

    list_about = []
    # Iterate the matched row elements, not the block's raw children.
    for entry in about_block.select('div[class*="styles_row"]'):
        title_node = entry.select_one('div[class*="styles_title"]')
        if title_node is None:
            list_about.append({})
            continue
        # NOTE(review): values are kept as elements, as before; converting them
        # to text would change what ends up in the saved CSV - confirm with
        # whoever consumes that file before changing it.
        values = entry.select('div[class*="styles_value"]')
        list_about.append({title_node.get_text(): values})
    return list_about


def get_actor(row):
    """Collect the entries of the actors list on a title page.

    Parameters
    ----------
    row : parsed page exposing ``select`` (the soup returned by get_soup).

    Returns
    -------
    list of dict
        One ``{'actor_name': text, 'link': href}`` dict per list item, in page
        order. ``link`` is None when the item contains no anchor. The list is
        empty when the page has no actors block.
    """
    list_actor = []
    for item in row.select('div[class*="styles_actors"] ul li'):
        anchor = item.select_one('a')
        # An item without an anchor used to raise inside a blanket `except`,
        # silently discarding every actor after it; record it with no link.
        link = anchor.get('href') if anchor is not None else None
        list_actor.append({'actor_name': item.get_text(), 'link': link})
    return list_actor

def get_low(row):
    """Extract the critics' rating block of a title page.

    Parameters
    ----------
    row : parsed page exposing ``select_one`` (the soup returned by get_soup).

    Returns
    -------
    list
        ``[critic_positive, critic_value, critic_star]``: the text of the
        rating value, the count block and the star value inside the critic
        section. A missing element (or a missing section) yields an empty list
        in that position. ``[]`` is returned if `row` itself cannot be queried.
    """
    try:
        critic_section = row.select_one('div[class*="styles_criticRatingSection"]')
    except AttributeError:
        return []

    def _text_or_empty(selector):
        # select_one returns None for a missing element (and critic_section is
        # None when the page has no critic block); both surface as
        # AttributeError. Nothing else is swallowed, so Ctrl-C still works.
        try:
            return critic_section.select_one(selector).get_text()
        except AttributeError:
            return []

    return [
        _text_or_empty('span[class*="film-rating-value"]'),
        _text_or_empty('div[class*="styles_countBlock"]'),
        _text_or_empty('div[class*="styles_starValue"]'),
    ]
    

In [26]:
# Build a single dictionary of characteristics

def full_parse(row, tm=0):
    """Load one title page and gather everything the helper parsers extract.

    Parameters
    ----------
    row : str
        URL of the page (one value of the `links` column).
    tm : float, optional
        Seconds to wait before requesting the page. Defaults to 0 (no pause),
        which is the previous behaviour; pass a positive value, e.g.
        ``series.apply(full_parse, tm=2)``, to throttle the scraper.

    Returns
    -------
    dict
        Keys 'name_rate', 'about_film', 'actors' and 'critic_rate', holding the
        results of get_top, get_about, get_actor and get_low for the page.
    """
    if tm:
        time.sleep(tm)
    soup = get_soup(row)
    return {
        'name_rate': get_top(soup),
        'about_film': get_about(soup),
        'actors': get_actor(soup),
        'critic_rate': get_low(soup),
    }

In [24]:
driver = webdriver.Chrome(service=service) # Starts a new, separate Chrome browser session controlled by Selenium

In [None]:
# RUN THE PARSER!!!
# full_parse is called once per link and loads each page through the global
# `driver`, so `driver` must be created before this cell is run.

df_2016['char'] = df_2016['links'].apply(full_parse)

In [16]:
# NOTE(review): the output saved under this cell lists rows for 2015-2022 and
# has no `char` column, which suggests the cell was last run out of order -
# re-run the notebook top to bottom before relying on it.
df_2016

Unnamed: 0,name,links,year,type
0,Игра престолов,https://www.kinopoisk.ru/series/464963/,2015,series
1,Рик и Морти,https://www.kinopoisk.ru/series/685246/,2015,series
2,Легенда,https://www.kinopoisk.ru/film/839954/,2015,film
3,Марсианин,https://www.kinopoisk.ru/film/841700/,2015,film
4,Головоломка,https://www.kinopoisk.ru/film/645118/,2015,film
...,...,...,...,...
377994,Музыка летней ночи,https://www.kinopoisk.ru/film/5073920/,2022,film
377995,Пингвины,https://www.kinopoisk.ru/film/5074010/,2022,film
377996,Прогресс,https://www.kinopoisk.ru/film/5074011/,2022,film
377997,Прятки,https://www.kinopoisk.ru/film/5074012/,2022,film


In [304]:
#df_2015['char'][46179]

In [17]:
df_2016.to_csv('D:\\Data\\Kinopoisk\\csv\\df_full_2016.csv') # set your own local path for the output file here

In [270]:
# Shut down the Selenium-controlled browser once scraping is finished.
driver.quit()