## Годзун Егор
### ДЗ №1

#### Устанавливаем нужные библиотеки

In [None]:
!pip install clickhouse-connect -q
!pip install pandas

Defaulting to user installation because normal site-packages is not writeable
Collecting tabulate
  Using cached tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Using cached tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.9.0


#### Загрузка данных и миграция

In [26]:
import os

import clickhouse_connect
import pandas as pd


# Render frames in full: no truncation of rows or columns in cell output.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Credentials can be supplied through the environment so that real secrets
# never have to be stored in the notebook. The fallbacks are the
# local-development values this notebook has always used.
CLICKHOUSE_USER = os.environ.get('CLICKHOUSE_USER', 'default')
CLICKHOUSE_PASSWORD = os.environ.get('CLICKHOUSE_PASSWORD', '12345')


def _connect(port):
    """Open a client for the ``imdb`` database on ``localhost:<port>``."""
    return clickhouse_connect.get_client(
        host='localhost',
        port=port,
        username=CLICKHOUSE_USER,
        password=CLICKHOUSE_PASSWORD,
        database='imdb'
    )


# First server: the tables are created and filled here.
client_1 = _connect(8123)
# Second server: receives the migrated copy and runs the timed queries.
client_2 = _connect(8124)

# SQL scripts executed by the cells below.
CREATE_TABLES_SQL_FILE = 'create_tables.sql'
INSERT_IN_TABLES_SQL_FILE = 'insert_in_tables.sql'
MIGRATION_TABLES_SQL_FILE = 'migration.sql'
# Number of rows execute_query keeps in the frame it returns.
LIMIT = 10

In [4]:
def execute_sql_file(client, file_path):
    """Run every statement of a SQL script through ``client.command``.

    The script is split on ``;``, so a semicolon inside a string literal or
    a comment would cut a statement in two; the scripts must avoid that.
    Statements are attempted independently: a failure is reported and the
    remaining statements still run.

    Args:
        client: Object exposing ``command(sql)``.
        file_path: Path of the SQL script to execute.

    Returns:
        None. Progress is printed instead: ``Completed: <first line>`` for
        each statement that succeeded, ``Error: <exception>`` for each one
        that raised.
    """
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(file_path, 'r', encoding='utf-8') as file:
        sql_queries = file.read().split(';')

    for query in sql_queries:
        query = query.strip()
        if not query:
            # Blank fragments appear after the final ';' and between ';;'.
            continue

        # Computed outside the f-string: a backslash inside an f-string
        # expression is a SyntaxError before Python 3.12.
        first_line = query.splitlines()[0]
        try:
            client.command(query)
        except Exception as e:
            # Best effort by design: report and carry on with the next one.
            print(f"Error: {e}")
        else:
            print(f"Completed: {first_line}")

In [None]:
def execute_query(client, query, limit=None):
    """Run ``query`` once, print its execution statistics, return a preview.

    The query is sent a single time, so the printed statistics describe the
    same execution that produced the returned rows and the server is not
    loaded twice.

    Args:
        client: Object exposing ``query(sql)`` whose result provides
            ``result_rows``, ``column_names`` and a ``summary`` mapping.
        query: SQL text to execute.
        limit: Maximum number of rows kept in the returned frame. ``None``
            (the default) uses the notebook-level ``LIMIT`` constant.

    Returns:
        A DataFrame with at most ``limit`` leading rows of the result, or
        ``None`` when execution failed (the error is printed, not raised).
        Columns are built from the row values, so dtypes are whatever
        pandas infers from them.
    """
    try:
        if limit is None:
            limit = LIMIT

        # Single round trip: rows and server summary from one execution.
        result = client.query(query)
        rows = result.result_rows
        result_df = pd.DataFrame(rows, columns=list(result.column_names))

        summary = result.summary
        elapsed_s = int(summary.get('elapsed_ns', 0)) / 1_000_000_000
        read_rows = int(summary.get('read_rows', 0))
        # Fall back to the fetched row count if the summary omits the field.
        result_rows = int(summary.get('result_rows', len(rows)))

        print(f"Время выполнения: {elapsed_s:.3f} сек")
        print(f"Обработано строк: {read_rows:_}")
        print(f"Возвращено строк: {result_rows:_}")

        return result_df.iloc[:limit]

    except Exception as e:
        # Best effort by design: the notebook keeps running after a failure.
        print(f"Ошибка при выполнении запроса: {str(e)}")

##### Создаем таблицы 

In [4]:
# Run the table-creation script on the first server.
execute_sql_file(client=client_1, file_path=CREATE_TABLES_SQL_FILE)

Completed: CREATE DATABASE IF NOT EXISTS imdb
Completed: DROP TABLE IF EXISTS imdb.name_basics
Completed: DROP TABLE IF EXISTS imdb.title_basics
Completed: DROP TABLE IF EXISTS imdb.title_akas
Completed: DROP TABLE IF EXISTS imdb.title_crew
Completed: DROP TABLE IF EXISTS imdb.title_episode
Completed: DROP TABLE IF EXISTS imdb.title_principals
Completed: DROP TABLE IF EXISTS imdb.title_ratings
Completed: CREATE TABLE IF NOT EXISTS imdb.name_basics
Completed: CREATE TABLE IF NOT EXISTS imdb.title_basics
Completed: CREATE TABLE IF NOT EXISTS imdb.title_akas
Completed: CREATE TABLE IF NOT EXISTS imdb.title_crew
Completed: CREATE TABLE IF NOT EXISTS imdb.title_episode
Completed: CREATE TABLE IF NOT EXISTS imdb.title_principals
Completed: CREATE TABLE IF NOT EXISTS imdb.title_ratings


##### Наполняем таблицы 

In [None]:
# Run the data-loading script on the first server.
execute_sql_file(client=client_1, file_path=INSERT_IN_TABLES_SQL_FILE)

Completed: INSERT INTO imdb.name_basics
Completed: INSERT INTO imdb.title_basics SELECT
Completed: INSERT INTO imdb.title_akas SELECT
Completed: INSERT INTO imdb.title_crew
Completed: INSERT INTO imdb.title_episode
Completed: INSERT INTO imdb.title_principals
Completed: INSERT INTO imdb.title_ratings


##### Смотрим таблицы

In [6]:
# Print the name of every table visible on the first server, one per line.
result = client_1.query('SHOW TABLES')
for row in result.result_rows:
    table_name = row[0]
    print(table_name)

name_basics
title_akas
title_basics
title_crew
title_episode
title_principals
title_ratings


In [53]:
# Preview five rows of title_episode on the first server.
client_1.query_df('SELECT * FROM imdb.title_episode LIMIT 5')

Unnamed: 0,tconst,parentTconst,seasonNumber,episodeNumber
0,tt0031458,tt32857063,,
1,tt0041951,tt0041038,1.0,9.0
2,tt0042816,tt0989125,1.0,17.0
3,tt0042889,tt0989125,,
4,tt0043426,tt0040051,3.0,42.0


In [54]:
# NOTE(review): this cell repeats the previous one verbatim (same query,
# same server) — consider removing one of the two.
client_1.query_df('SELECT * FROM imdb.title_episode LIMIT 5')

Unnamed: 0,tconst,parentTconst,seasonNumber,episodeNumber
0,tt0031458,tt32857063,,
1,tt0041951,tt0041038,1.0,9.0
2,tt0042816,tt0989125,1.0,17.0
3,tt0042889,tt0989125,,
4,tt0043426,tt0040051,3.0,42.0


##### Создаем таблицы в другом ClickHouse

In [None]:
# Run the same table-creation script on the second server.
execute_sql_file(client=client_2, file_path=CREATE_TABLES_SQL_FILE)

Completed: CREATE DATABASE IF NOT EXISTS imdb
Completed: DROP TABLE IF EXISTS imdb.name_basics
Completed: DROP TABLE IF EXISTS imdb.title_basics
Completed: DROP TABLE IF EXISTS imdb.title_akas
Completed: DROP TABLE IF EXISTS imdb.title_crew
Completed: DROP TABLE IF EXISTS imdb.title_episode
Completed: DROP TABLE IF EXISTS imdb.title_principals
Completed: DROP TABLE IF EXISTS imdb.title_ratings
Completed: CREATE TABLE IF NOT EXISTS imdb.name_basics
Completed: CREATE TABLE IF NOT EXISTS imdb.title_basics
Completed: CREATE TABLE IF NOT EXISTS imdb.title_akas
Completed: CREATE TABLE IF NOT EXISTS imdb.title_crew
Completed: CREATE TABLE IF NOT EXISTS imdb.title_episode
Completed: CREATE TABLE IF NOT EXISTS imdb.title_principals
Completed: CREATE TABLE IF NOT EXISTS imdb.title_ratings


##### Переносим данные в другой ClickHouse

In [None]:
# Run the migration script on the second server to fill its tables.
execute_sql_file(client=client_2, file_path=MIGRATION_TABLES_SQL_FILE)

Completed: INSERT INTO imdb.name_basics
Completed: INSERT INTO imdb.title_basics
Completed: INSERT INTO imdb.title_akas
Completed: INSERT INTO imdb.title_crew
Completed: INSERT INTO imdb.title_episode
Completed: INSERT INTO imdb.title_principals
Completed: INSERT INTO imdb.title_ratings


##### Смотрим таблицы

In [55]:
# List the tables that now exist on the second server.
execute_query(client=client_2, query='SHOW TABLES')

Время выполнения: 0.029 сек
Обработано строк: 7
Возвращено строк: 7


Unnamed: 0,name
0,name_basics
1,title_akas
2,title_basics
3,title_crew
4,title_episode
5,title_principals
6,title_ratings


In [56]:
# Preview five rows of title_episode on the second server.
execute_query(client=client_2, query='SELECT * FROM imdb.title_episode LIMIT 5')

Время выполнения: 0.065 сек
Обработано строк: 10
Возвращено строк: 5


Unnamed: 0,tconst,parentTconst,seasonNumber,episodeNumber
0,tt0031458,tt32857063,,
1,tt0041951,tt0041038,1.0,9.0
2,tt0042816,tt0989125,1.0,17.0
3,tt0042889,tt0989125,,
4,tt0043426,tt0040051,3.0,42.0


Видим, что данные успешно перенесены: список таблиц и первые строки `title_episode` на втором сервере совпадают с исходными.

##### Выполняем запросы с замером времени

PREWHERE

In [57]:
# PREWHERE demo: titles of type 'movie' with startYear after 2020,
# filtered in the PREWHERE stage of the read from title_basics.
execute_query(
    client=client_2,
    query="""
            SELECT tconst, primaryTitle, startYear, titleType
            FROM imdb.title_basics
            PREWHERE startYear > 2020 AND titleType = 'movie'
          """
)


Время выполнения: 0.827 сек
Обработано строк: 11_542_593
Возвращено строк: 86_230


Unnamed: 0,tconst,primaryTitle,startYear,titleType
0,tt0762118,Moe,2023,movie
1,tt0780471,American Surfrider,2024,movie
2,tt0781524,Vendetta,2022,movie
3,tt0783848,"L'homme au bâton, une légende créole",2024,movie
4,tt24227018,Kather Basha Endra Muthuramalingam,2023,movie
5,tt24227370,Uninformed Consent,2022,movie
6,tt24227752,Raa Raa Penimiti,2023,movie
7,tt24227796,Lost in the Lou,2023,movie
8,tt24227898,11103,2022,movie
9,tt24227952,Wicked Season,2024,movie


LEFT JOIN

In [58]:
# LEFT JOIN demo: movie titles with their rating and vote count,
# best-rated first.
# NOTE(review): the WHERE filter on r.numVotes cannot be satisfied by a
# title that has no matching rating row, so every unmatched title is
# dropped and the result presumably equals an INNER JOIN — confirm whether
# the filter was meant to sit in the ON clause instead.
execute_query(
    client=client_2,
    query="""
            SELECT
                b.primaryTitle,
                r.averageRating,
                r.numVotes
            FROM imdb.title_basics b
            LEFT JOIN imdb.title_ratings r ON b.tconst = r.tconst
            WHERE b.titleType = 'movie' AND r.numVotes > 1000
            ORDER BY r.averageRating DESC
          """
)

Время выполнения: 0.901 сек
Обработано строк: 13_091_640
Возвращено строк: 45_239


Unnamed: 0,primaryTitle,averageRating,numVotes
0,The Suspect,10.0,1011
1,Pratyartha,9.6,1581
2,Strawberry Melancholy,9.6,4531
3,Interval,9.5,2039
4,Golden Opulence: 500 Years of Luxury in Anatolia,9.5,1298
5,Jithender Reddy,9.5,2352
6,Nidurinchu jahapana,9.5,2054
7,Anamadheya Ashok Kumar,9.5,1120
8,Guard: Revenge for Love,9.4,2047
9,Jibon Theke Neya,9.3,2301


FULL OUTER JOIN

In [61]:
# FULL OUTER JOIN demo: every title paired with its rating, if any.
# NOTE(review): there is no filter or LIMIT, so the whole join result
# (11_542_593 rows in the recorded run) is sent to the client although
# only the first LIMIT rows are displayed.
# NOTE(review): the recorded output shows averageRating 0.0 for many
# episodes — presumably the column default for titles without a rating
# row rather than a real rating; TODO confirm (see join_use_nulls).
execute_query(
    client=client_2,
    query="""
            SELECT
                b.tconst,
                b.primaryTitle,
                r.averageRating
            FROM imdb.title_basics b
            FULL OUTER JOIN imdb.title_ratings r ON b.tconst = r.tconst
          """
)

Время выполнения: 11.161 сек
Обработано строк: 13_091_640
Возвращено строк: 11_542_593


Unnamed: 0,tconst,primaryTitle,averageRating
0,tt30442451,Bomont,8.1
1,tt30442452,Episode #1.75,0.0
2,tt30442453,Episode #1.76,0.0
3,tt30442457,Episode #1.77,0.0
4,tt3044246,Welt der Melodie,0.0
5,tt30442469,Episode #1.78,0.0
6,tt30442474,Episode #1.79,0.0
7,tt30442475,Brigade,0.0
8,tt3044248,SDU: Sex Duties Unit,5.8
9,tt30442483,Episode #1.80,0.0


OVER QUALIFY

In [62]:
# Window functions + QUALIFY demo: each rated movie with the overall
# average rating and its rank by rating (highest rating = rank 1).
# NOTE(review): QUALIFY rating_rank >= 10 keeps every row except ranks
# 1-9 (327_248 rows in the recorded run); if a "top 10" was intended the
# comparison should be <= — confirm the intent.
execute_query(
    client=client_2,
    query="""
            SELECT
                b.primaryTitle,
                r.averageRating,
                AVG(r.averageRating) OVER() AS global_avg,
                RANK() OVER(ORDER BY r.averageRating DESC) AS rating_rank
            FROM imdb.title_basics b
            JOIN imdb.title_ratings r ON b.tconst = r.tconst
            WHERE b.titleType = 'movie'
            QUALIFY rating_rank >= 10
            ORDER BY r.averageRating DESC
          """
)

Время выполнения: 1.799 сек
Обработано строк: 13_091_640
Возвращено строк: 327_248


Unnamed: 0,primaryTitle,averageRating,global_avg,rating_rank
0,Robert Shields: My Life as a Robot,9.9,6.156128,67
1,The Truth on Trial,9.9,6.156128,67
2,The Paternal Bond: Barbary Macaques,9.9,6.156128,67
3,Independent Roads,9.9,6.156128,67
4,Project Pivot,9.9,6.156128,67
5,Chitram Cheppina Katha,9.9,6.156128,67
6,The Burmese Python,9.9,6.156128,67
7,Azotes de Barrio 2,9.9,6.156128,67
8,Siege at Nune High,9.9,6.156128,67
9,Fly,9.9,6.156128,67


GROUP BY HAVING

In [66]:
# GROUP BY / HAVING demo: title types whose average primaryTitle length
# exceeds 20, longest first.
# NOTE(review): ClickHouse's length() counts bytes, so titles with
# non-ASCII characters weigh more than their character count; lengthUTF8()
# counts code points — confirm which measure is intended.
execute_query(
    client=client_2,
    query="""
            SELECT
                titleType,
                AVG(LENGTH(primaryTitle)) AS avg_title_length
            FROM imdb.title_basics
            GROUP BY titleType
            HAVING avg_title_length > 20
            ORDER BY avg_title_length DESC
          """
)

Время выполнения: 0.901 сек
Обработано строк: 11_542_593
Возвращено строк: 6


Unnamed: 0,titleType,avg_title_length
0,tvSpecial,29.995127
1,video,25.623876
2,tvMovie,22.749348
3,tvShort,22.297481
4,tvEpisode,20.282706
5,videoGame,20.035408


EXCEPT

In [64]:
# EXCEPT demo: tconst of movies with startYear after 2020, minus every
# tconst whose averageRating is below 5.0.
execute_query(
    client=client_2,
    query="""
            SELECT tconst FROM imdb.title_basics
            WHERE titleType = 'movie' AND startYear > 2020
            EXCEPT
            SELECT tconst FROM imdb.title_ratings
            WHERE averageRating < 5.0
          """
)

Время выполнения: 0.719 сек
Обработано строк: 13_091_640
Возвращено строк: 78_555


Unnamed: 0,tconst
0,tt0780471
1,tt0781524
2,tt0783848
3,tt13266270
4,tt13266998
5,tt13267122
6,tt13267124
7,tt13267142
8,tt13267306
9,tt13267336


CUBE

In [65]:
# CUBE demo: average runtime and number of titles for every combination
# of isAdult and titleType, including the subtotal and grand-total rows
# that CUBE adds.
execute_query(
    client=client_2,
    query="""
            SELECT
                isAdult,
                titleType,
                AVG(runtimeMinutes) AS avg_runtime,
                COUNT() AS count
            FROM imdb.title_basics
            GROUP BY CUBE(isAdult, titleType)
          """
)

Время выполнения: 0.685 сек
Обработано строк: 11_542_593
Возвращено строк: 35


Unnamed: 0,isAdult,titleType,avg_runtime,count
0,0,video,41.569105,198370
1,0,tvMovie,71.786097,150230
2,0,movie,89.839313,700972
3,0,tvMiniSeries,74.942724,60283
4,1,tvShort,18.333333,3
5,0,short,13.086183,1048608
6,0,tvSeries,81.270279,275996
7,0,videoGame,104.861364,41827
8,1,tvSpecial,86.166667,26
9,0,tvEpisode,38.620649,8631156


FORMAT

![JSONF](./photo/jsonf.png)

![VERTICALF](./photo/verticalf.png)