## Годзун Егор
### ДЗ №1

#### Устанавливаем нужные библиотеки

In [1]:
!pip install clickhouse-connect -q
!pip install tabulate



#### Загрузка данных и миграция

In [39]:
import os

import clickhouse_connect
from tabulate import tabulate

# Settings shared by both ClickHouse connections. Credentials are taken from
# the environment when available; the fallbacks reproduce the values this
# notebook used before, so it still runs unchanged on the local setup.
_CONNECTION_KWARGS = dict(
    host='localhost',
    username=os.environ.get('CLICKHOUSE_USER', 'default'),
    password=os.environ.get('CLICKHOUSE_PASSWORD', '12345'),
    database='imdb',
)

# The two instances differ only by HTTP port: client_1 is the source that is
# loaded first, client_2 is the target of the migration.
client_1 = clickhouse_connect.get_client(port=8123, **_CONNECTION_KWARGS)
client_2 = clickhouse_connect.get_client(port=8124, **_CONNECTION_KWARGS)

# SQL scripts executed by execute_sql_file (paths relative to the notebook).
CREATE_TABLES_SQL_FILE = 'create_tables.sql'
INSERT_IN_TABLES_SQL_FILE = 'insert_in_tables.sql'
MIGRATION_TABLES_SQL_FILE = 'migration.sql'

# Maximum number of result rows printed by execute_query.
LIMIT = 10

In [None]:
def execute_sql_file(client, file_path):
    """Run every statement of a SQL script through ``client.command``.

    The file content is split on ``;``. Each chunk is stripped and, if not
    empty, sent as a separate command. For every statement one line is
    printed: ``Completed: <first line of the statement>`` on success or
    ``Error: <exception>`` on failure. A failing statement does not stop the
    remaining ones from running.

    Note: the split is purely textual, so a ``;`` inside a string literal or
    a comment of the script would cut that statement in two.

    Args:
        client: Object exposing a ``command(sql)`` method.
        file_path: Path to the SQL script.

    Returns:
        None.
    """
    with open(file_path, 'r') as file:
        statements = file.read().split(';')

    for statement in statements:
        statement = statement.strip()
        if not statement:
            # Skip blank chunks, e.g. the text after the final ';'.
            continue
        try:
            client.command(statement)
        except Exception as e:
            # Best effort: report the failure and continue with the script.
            print(f"Error: {e}")
        else:
            # Computed outside the f-string: a backslash inside an f-string
            # expression is a SyntaxError on Python versions before 3.12.
            first_line = statement.split('\n')[0]
            print(f"Completed: {first_line}")

In [None]:
def execute_query(client, query, limit=None):
    """Run a query, print a preview of its result and the execution statistics.

    At most ``limit`` rows are printed as a grid table, followed by the
    elapsed time, the number of rows read and the number of rows returned,
    all taken from ``result.summary``. Any exception is caught and reported
    with a printed message instead of being raised.

    Note: the complete result is fetched before it is truncated for display,
    so ``limit`` only affects what is printed, not what is transferred.

    Args:
        client: Object exposing ``query(sql)`` whose result provides
            ``column_names``, ``result_rows`` and a dict-like ``summary``.
        query: SQL text to execute.
        limit: Maximum number of rows to print. ``None`` (the default) uses
            the module-level ``LIMIT`` constant.

    Returns:
        None.
    """
    try:
        if limit is None:
            limit = LIMIT

        result = client.query(query)
        rows = result.result_rows
        summary = result.summary

        # Missing summary values fall back to zero, except the returned-row
        # count, which falls back to the number of rows actually received.
        elapsed_s = int(summary.get('elapsed_ns', 0)) / 1_000_000_000
        read_rows = int(summary.get('read_rows', 0))
        result_rows = int(summary.get('result_rows', len(rows)))

        print(tabulate(rows[:limit], headers=result.column_names, tablefmt='grid'))
        print(f"Время выполнения: {elapsed_s:.3f} сек")
        print(f"Обработано строк: {read_rows:_}")
        print(f"Возвращено строк: {result_rows:_}")

    except Exception as e:
        # Keep the notebook running: report the failure instead of raising.
        print(f"Ошибка при выполнении запроса: {e}")

##### Создаем таблицы 

In [4]:
# Run the DDL script through client_1: it creates the imdb database and
# drops/recreates its tables (see the recorded output below).
execute_sql_file(client=client_1, file_path=CREATE_TABLES_SQL_FILE)

Completed: CREATE DATABASE IF NOT EXISTS imdb
Completed: DROP TABLE IF EXISTS imdb.name_basics
Completed: DROP TABLE IF EXISTS imdb.title_basics
Completed: DROP TABLE IF EXISTS imdb.title_akas
Completed: DROP TABLE IF EXISTS imdb.title_crew
Completed: DROP TABLE IF EXISTS imdb.title_episode
Completed: DROP TABLE IF EXISTS imdb.title_principals
Completed: DROP TABLE IF EXISTS imdb.title_ratings
Completed: CREATE TABLE IF NOT EXISTS imdb.name_basics
Completed: CREATE TABLE IF NOT EXISTS imdb.title_basics
Completed: CREATE TABLE IF NOT EXISTS imdb.title_akas
Completed: CREATE TABLE IF NOT EXISTS imdb.title_crew
Completed: CREATE TABLE IF NOT EXISTS imdb.title_episode
Completed: CREATE TABLE IF NOT EXISTS imdb.title_principals
Completed: CREATE TABLE IF NOT EXISTS imdb.title_ratings


##### Наполняем таблицы 

In [None]:
# Fill the imdb tables through client_1 by running the INSERT script.
execute_sql_file(client=client_1, file_path=INSERT_IN_TABLES_SQL_FILE)

Completed: INSERT INTO imdb.name_basics
Completed: INSERT INTO imdb.title_basics SELECT
Completed: INSERT INTO imdb.title_akas SELECT
Completed: INSERT INTO imdb.title_crew
Completed: INSERT INTO imdb.title_episode
Completed: INSERT INTO imdb.title_principals
Completed: INSERT INTO imdb.title_ratings


##### Смотрим таблицы

In [6]:
# Print the name of every table returned by SHOW TABLES on client_1, one per line.
result = client_1.query('SHOW TABLES')
for table_name in (row[0] for row in result.result_rows):
    print(table_name)

name_basics
title_akas
title_basics
title_crew
title_episode
title_principals
title_ratings


In [7]:
# Preview the first five rows of title_episode on client_1 as a grid table.
result = client_1.query('SELECT * FROM imdb.title_episode LIMIT 5')
data, column_names = result.result_rows, list(result.column_names)
print(tabulate(data, headers=column_names, tablefmt="grid"))

+-----------+----------------+----------------+-----------------+
| tconst    | parentTconst   |   seasonNumber |   episodeNumber |
| tt0031458 | tt32857063     |                |                 |
+-----------+----------------+----------------+-----------------+
| tt0041951 | tt0041038      |              1 |               9 |
+-----------+----------------+----------------+-----------------+
| tt0042816 | tt0989125      |              1 |              17 |
+-----------+----------------+----------------+-----------------+
| tt0042889 | tt0989125      |                |                 |
+-----------+----------------+----------------+-----------------+
| tt0043426 | tt0040051      |              3 |              42 |
+-----------+----------------+----------------+-----------------+


##### Создаем таблицы в другом ClickHouse

In [None]:
# Run the same DDL script through client_2 so the second instance gets the
# identical database and table definitions.
execute_sql_file(client=client_2, file_path=CREATE_TABLES_SQL_FILE)

Completed: CREATE DATABASE IF NOT EXISTS imdb
Completed: DROP TABLE IF EXISTS imdb.name_basics
Completed: DROP TABLE IF EXISTS imdb.title_basics
Completed: DROP TABLE IF EXISTS imdb.title_akas
Completed: DROP TABLE IF EXISTS imdb.title_crew
Completed: DROP TABLE IF EXISTS imdb.title_episode
Completed: DROP TABLE IF EXISTS imdb.title_principals
Completed: DROP TABLE IF EXISTS imdb.title_ratings
Completed: CREATE TABLE IF NOT EXISTS imdb.name_basics
Completed: CREATE TABLE IF NOT EXISTS imdb.title_basics
Completed: CREATE TABLE IF NOT EXISTS imdb.title_akas
Completed: CREATE TABLE IF NOT EXISTS imdb.title_crew
Completed: CREATE TABLE IF NOT EXISTS imdb.title_episode
Completed: CREATE TABLE IF NOT EXISTS imdb.title_principals
Completed: CREATE TABLE IF NOT EXISTS imdb.title_ratings


##### Переносим данные в другой ClickHouse

In [None]:
# Run the migration script through client_2; per the recorded output it
# issues one INSERT INTO statement for each imdb table.
execute_sql_file(client=client_2, file_path=MIGRATION_TABLES_SQL_FILE)

Completed: INSERT INTO imdb.name_basics
Completed: INSERT INTO imdb.title_basics
Completed: INSERT INTO imdb.title_akas
Completed: INSERT INTO imdb.title_crew
Completed: INSERT INTO imdb.title_episode
Completed: INSERT INTO imdb.title_principals
Completed: INSERT INTO imdb.title_ratings


##### Смотрим таблицы

In [182]:
# Print the name of every table returned by SHOW TABLES on client_2, one per line.
result = client_2.query('SHOW TABLES')
for table_name in (row[0] for row in result.result_rows):
    print(table_name)

name_basics
title_akas
title_basics
title_crew
title_episode
title_principals
title_ratings


In [10]:
# Preview the first five rows of title_episode on client_2 as a grid table.
result = client_2.query('SELECT * FROM imdb.title_episode LIMIT 5')
data, column_names = result.result_rows, list(result.column_names)
print(tabulate(data, headers=column_names, tablefmt="grid"))

+-----------+----------------+----------------+-----------------+
| tconst    | parentTconst   |   seasonNumber |   episodeNumber |
| tt0031458 | tt32857063     |                |                 |
+-----------+----------------+----------------+-----------------+
| tt0041951 | tt0041038      |              1 |               9 |
+-----------+----------------+----------------+-----------------+
| tt0042816 | tt0989125      |              1 |              17 |
+-----------+----------------+----------------+-----------------+
| tt0042889 | tt0989125      |                |                 |
+-----------+----------------+----------------+-----------------+
| tt0043426 | tt0040051      |              3 |              42 |
+-----------+----------------+----------------+-----------------+


Видим, что данные успешно перенесены: первые пять строк `title_episode` на втором сервере совпадают с исходными.

##### Выполняем запросы с замером времени

PREWHERE

In [None]:
# PREWHERE demo: titles of type 'movie' with startYear after 2020,
# with both conditions placed in the PREWHERE clause.
prewhere_sql = """
            SELECT tconst, primaryTitle, startYear, titleType
            FROM imdb.title_basics
            PREWHERE startYear > 2020 AND titleType = 'movie'
          """
execute_query(client_2, prewhere_sql)


+------------+----------------------------------------------------------+-------------+-------------+
| tconst     | primaryTitle                                             |   startYear | titleType   |
| tt0762118  | Moe                                                      |        2023 | movie       |
+------------+----------------------------------------------------------+-------------+-------------+
| tt0780471  | American Surfrider                                       |        2024 | movie       |
+------------+----------------------------------------------------------+-------------+-------------+
| tt0781524  | Vendetta                                                 |        2022 | movie       |
+------------+----------------------------------------------------------+-------------+-------------+
| tt0783848  | L'homme au bâton, une légende créole                     |        2024 | movie       |
+------------+----------------------------------------------------------+---------

LEFT JOIN

In [None]:
# LEFT JOIN demo: movies joined to their ratings, keeping only rows with
# more than 1000 votes, ordered from the highest average rating down.
left_join_sql = """
            SELECT
                b.primaryTitle,
                r.averageRating,
                r.numVotes
            FROM imdb.title_basics b
            LEFT JOIN imdb.title_ratings r ON b.tconst = r.tconst
            WHERE b.titleType = 'movie' AND r.numVotes > 1000
            ORDER BY r.averageRating DESC
          """
execute_query(client_2, left_join_sql)

+--------------------------------------------------+-----------------+------------+
| primaryTitle                                     |   averageRating |   numVotes |
| The Suspect                                      |            10   |       1011 |
+--------------------------------------------------+-----------------+------------+
| Pratyartha                                       |             9.6 |       1581 |
+--------------------------------------------------+-----------------+------------+
| Strawberry Melancholy                            |             9.6 |       4531 |
+--------------------------------------------------+-----------------+------------+
| Interval                                         |             9.5 |       2039 |
+--------------------------------------------------+-----------------+------------+
| Jithender Reddy                                  |             9.5 |       2352 |
+--------------------------------------------------+-----------------+------

FULL OUTER JOIN

In [None]:
# FULL OUTER JOIN demo: every title paired with its rating on tconst,
# keeping rows that have no match on either side.
full_outer_join_sql = """
            SELECT
                b.tconst,
                b.primaryTitle,
                r.averageRating
            FROM imdb.title_basics b
            FULL OUTER JOIN imdb.title_ratings r ON b.tconst = r.tconst
          """
execute_query(client_2, full_outer_join_sql)

+------------+-------------------------------------------------------------------------------------------+-----------------+
| tconst     | primaryTitle                                                                              |   averageRating |
| tt14175250 | Misfit Bits: Innuendos: When Animals Attack!                                              |               0 |
+------------+-------------------------------------------------------------------------------------------+-----------------+
| tt14175252 | Goodbye                                                                                   |               0 |
+------------+-------------------------------------------------------------------------------------------+-----------------+
| tt14175254 | It's time you tied your hot MILF wife up tight                                            |               0 |
+------------+-------------------------------------------------------------------------------------------+-----------------+


OVER QUALIFY

In [None]:
# Window functions + QUALIFY demo: each rated movie with the overall average
# rating and its rank by rating; QUALIFY keeps only rows with rank >= 10.
over_qualify_sql = """
            SELECT
                b.primaryTitle,
                r.averageRating,
                AVG(r.averageRating) OVER() AS global_avg,
                RANK() OVER(ORDER BY r.averageRating DESC) AS rating_rank
            FROM imdb.title_basics b
            JOIN imdb.title_ratings r ON b.tconst = r.tconst
            WHERE b.titleType = 'movie'
            QUALIFY rating_rank >= 10
            ORDER BY r.averageRating DESC
          """
execute_query(client_2, over_qualify_sql)

+-------------------------------------+-----------------+--------------+---------------+
| primaryTitle                        |   averageRating |   global_avg |   rating_rank |
| Independent Roads                   |             9.9 |      6.15613 |            67 |
+-------------------------------------+-----------------+--------------+---------------+
| Chitram Cheppina Katha              |             9.9 |      6.15613 |            67 |
+-------------------------------------+-----------------+--------------+---------------+
| Apple CUT                           |             9.9 |      6.15613 |            67 |
+-------------------------------------+-----------------+--------------+---------------+
| The College                         |             9.9 |      6.15613 |            67 |
+-------------------------------------+-----------------+--------------+---------------+
| Der Bote                            |             9.9 |      6.15613 |            67 |
+--------------------

GROUP BY HAVING

In [None]:
# GROUP BY / HAVING demo: average primaryTitle length per titleType,
# keeping only the types whose average exceeds 20, longest first.
group_by_having_sql = """
            SELECT
                titleType,
                AVG(LENGTH(primaryTitle)) AS avg_title_length
            FROM imdb.title_basics
            GROUP BY titleType
            HAVING avg_title_length > 20
            ORDER BY avg_title_length DESC
          """
execute_query(client_2, group_by_having_sql)

+-------------+--------------------+
| titleType   |   avg_title_length |
| tvSpecial   |            29.9951 |
+-------------+--------------------+
| video       |            25.6239 |
+-------------+--------------------+
| tvMovie     |            22.7493 |
+-------------+--------------------+
| tvShort     |            22.2975 |
+-------------+--------------------+
| tvEpisode   |            20.2827 |
+-------------+--------------------+
| videoGame   |            20.0354 |
+-------------+--------------------+
Время выполнения: 0.182 сек
Обработано строк: 11_542_593
Возвращено строк: 6


EXCEPT

In [None]:
# EXCEPT demo: ids of movies with startYear after 2020, minus the ids
# whose average rating is below 5.0.
except_sql = """
            SELECT tconst FROM imdb.title_basics
            WHERE titleType = 'movie' AND startYear > 2020
            EXCEPT
            SELECT tconst FROM imdb.title_ratings
            WHERE averageRating < 5.0
          """
execute_query(client_2, except_sql)

+------------+
| tconst     |
| tt0780471  |
+------------+
| tt0781524  |
+------------+
| tt0783848  |
+------------+
| tt0870154  |
+------------+
| tt0875612  |
+------------+
| tt0887261  |
+------------+
| tt19781790 |
+------------+
| tt19781894 |
+------------+
| tt19782002 |
+------------+
| tt19782334 |
+------------+
Время выполнения: 0.191 сек
Обработано строк: 13_091_640
Возвращено строк: 78_555


CUBE

In [None]:
# CUBE demo: average runtime and row count for every combination of
# isAdult and titleType, including the subtotals produced by GROUP BY CUBE.
cube_sql = """
            SELECT
                isAdult,
                titleType,
                AVG(runtimeMinutes) AS avg_runtime,
                COUNT() AS count
            FROM imdb.title_basics
            GROUP BY CUBE(isAdult, titleType)
          """
execute_query(client_2, cube_sql)

+-----------+--------------+---------------+---------+
|   isAdult | titleType    |   avg_runtime |   count |
|         0 | video        |       41.5691 |  198370 |
+-----------+--------------+---------------+---------+
|         0 | tvMovie      |       71.7861 |  150230 |
+-----------+--------------+---------------+---------+
|         0 | movie        |       89.8393 |  700972 |
+-----------+--------------+---------------+---------+
|         0 | tvMiniSeries |       74.9427 |   60283 |
+-----------+--------------+---------------+---------+
|         1 | tvShort      |       18.3333 |       3 |
+-----------+--------------+---------------+---------+
|         0 | short        |       13.0862 | 1048608 |
+-----------+--------------+---------------+---------+
|         0 | tvSeries     |       81.2703 |  275996 |
+-----------+--------------+---------------+---------+
|         0 | videoGame    |      104.861  |   41827 |
+-----------+--------------+---------------+---------+
|         

FORMAT

![JSONF](./photo/jsonf.png)

![VERTICALF](./photo/verticalf.png)