Запускается либо локально, либо в контейнере с подключённым spark master/worker

# Общая настройка

In [1]:
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars ./jars/postgresql-42.6.0.jar,./jars/clickhouse-jdbc-0.4.6.jar pyspark-shell'

In [2]:
USER = os.getenv("USER")
PASSWORD = os.getenv("PASSWORD")
HOST = os.getenv("HOST")

PG_NAME = os.getenv("POSTGRES_DB")
PG_PORT = os.getenv("POSTGRES_PORT")

CH_NAME = os.getenv("CLICKHOUSE_DB")
CH_PORT = os.getenv("CLICKHOUSE_PORT")

# Настройка подключения pyspark

In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Spark SQL with PostgreSQL") \
    .getOrCreate()

25/05/25 19:10:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
pg_jdbc_url = f"jdbc:postgresql://{HOST}:{PG_PORT}/{PG_NAME}"
pg_properties = {
    "user": USER,
    "password": PASSWORD,
    "driver": "org.postgresql.Driver"
}

In [5]:
ch_jdbc_url = f"jdbc:clickhouse://{HOST}:{CH_PORT}/{CH_NAME}"
ch_properties = {
    "user": USER,
    "password": PASSWORD,
    "driver": "com.clickhouse.jdbc.ClickHouseDriver"
}

# Витрины

In [6]:
from pyspark.sql.functions import *

In [7]:
def load_table(table_name):
    return spark.read.jdbc(url=pg_jdbc_url, table=table_name, properties=pg_properties)

In [8]:
d_customer = load_table("d_customer")
d_product = load_table("d_product")
d_seller = load_table("d_seller")
d_store = load_table("d_store")
d_supplier = load_table("d_supplier")

f_sales = load_table("f_sales")

In [9]:
def save_report(df, name: str, order: str):
    df.write \
    .mode("overwrite") \
    .option("createTableOptions", f"""
        ENGINE = MergeTree()
        ORDER BY ({order})
    """) \
    .jdbc(url=ch_jdbc_url, table=name, properties=ch_properties)

In [10]:
def read_show(table_name:str):
    ch_df = spark.read.jdbc(url=ch_jdbc_url, table=table_name, properties=ch_properties)
    ch_df.show()

In [11]:
def save_show(df, name, order):
    save_report(df, name, order)
    read_show(name)

## 1. Витрина продаж по продуктам

### Топ-10 самых продаваемых продуктов

In [12]:
top_products = f_sales \
    .join(d_product, f_sales.sale_product_id == d_product.product_id) \
    .groupBy(
        "product_id",
    ) \
    .agg(
        sum("sale_quantity").alias("total_quantity_sold"),
        sum("sale_total_price").alias("total_revenue")
    ) \
    .orderBy(desc("total_revenue")) \
    .limit(10)

In [13]:
save_show(top_products, "1_top10products", "product_id")

25/05/25 19:10:16 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:10:16 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:10:16 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [5195e361-ebb8-419d-9998-f0a1a0eaaa9f] (2 queries & 0 savepoints) is committed.
25/05/25 19:10:16 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [964a4038-c913-4e04-98ce-46ce33a06293] (0 queries & 0 savepoints) is committed.


+----------+-------------------+--------------------+
|product_id|total_quantity_sold|       total_revenue|
+----------+-------------------+--------------------+
|       124|                  4|499.6900000000000...|
|       126|                  9|499.6200000000000...|
|       383|                 10|499.7300000000000...|
|      2481|                  8|499.7600000000000...|
|      2810|                  2|499.7600000000000...|
|      3826|                  7|499.8500000000000...|
|      5001|                  8|499.7100000000000...|
|      5523|                  9|499.8000000000000...|
|      7488|                 10|499.5900000000000...|
|      9481|                  3|499.6200000000000...|
+----------+-------------------+--------------------+



### Общая выручка по категориям продуктов

In [14]:
revenue_by_category = f_sales \
    .join(d_product, f_sales.sale_product_id == d_product.product_id) \
    .groupBy(
        "product_category",
        "pet_category"
    ) \
    .agg(
        sum("sale_total_price").alias("total_revenue")
    ) \
    .orderBy(desc("total_revenue"))

In [15]:
save_show(revenue_by_category, "1_bycategory", "product_category")

25/05/25 19:10:17 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:10:17 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:10:17 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [474d4d53-5399-4f4b-807e-f51f8784bd40] (2 queries & 0 savepoints) is committed.
25/05/25 19:10:17 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [19b2deb9-cb5b-4666-b977-93d5f8219c30] (0 queries & 0 savepoints) is committed.


+----------------+------------+--------------------+
|product_category|pet_category|       total_revenue|
+----------------+------------+--------------------+
|            Cage|       Birds|181830.9300000000...|
|            Cage|    Reptiles|172473.9200000000...|
|            Cage|        Dogs|166057.7100000000...|
|            Cage|        Fish|158665.9600000000...|
|            Cage|        Cats|152089.4200000000...|
|            Food|       Birds|177085.2300000000...|
|            Food|        Fish|170664.5800000000...|
|            Food|        Cats|165220.8100000000...|
|            Food|        Dogs|162632.2800000000...|
|            Food|    Reptiles|155029.6500000000...|
|             Toy|        Dogs|182909.8500000000...|
|             Toy|        Fish|181426.5000000000...|
|             Toy|       Birds|176069.2100000000...|
|             Toy|        Cats|163869.8900000000...|
|             Toy|    Reptiles|163826.1800000000...|
+----------------+------------+---------------

### Средний рейтинг и количество отзывов для каждого продукта

In [16]:
product_ratings = d_product.select(
    "product_id",
    "product_rating",
    "product_reviews"
)

In [17]:
save_show(product_ratings, "1_ratings", "product_id")

25/05/25 19:10:18 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:10:18 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:10:18 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [71ea7e65-dae8-4e11-9f5c-3d46bd0d6f91] (11 queries & 0 savepoints) is committed.
25/05/25 19:10:18 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [37a77ef8-a548-48f7-ac81-6513630ac8bc] (0 queries & 0 savepoints) is committed.


+----------+--------------------+---------------+
|product_id|      product_rating|product_reviews|
+----------+--------------------+---------------+
|      7664|1.100000000000000000|            460|
|      7728|3.300000000000000000|            460|
|      7730|2.700000000000000000|            178|
|      7731|1.800000000000000000|             38|
|      7733|2.600000000000000000|             96|
|      7735|1.600000000000000000|            382|
|      7736|1.200000000000000000|            650|
|      7738|2.000000000000000000|            179|
|      7740|1.000000000000000000|            909|
|      7742|1.300000000000000000|            766|
|      7744|3.900000000000000000|            205|
|      7746|2.100000000000000000|            335|
|      7748|3.500000000000000000|            309|
|      7750|2.200000000000000000|             34|
|      7752|4.800000000000000000|            673|
|      7754|1.000000000000000000|            210|
|      7756|3.800000000000000000|            663|


## 2. Витрина продаж по клиентам

### Топ-10 клиентов с наибольшей общей суммой покупок

In [18]:
top_customers = f_sales \
    .join(d_customer, f_sales.sale_customer_id == d_customer.customer_id) \
    .groupBy(
        "customer_id",
    ) \
    .agg(
        sum("sale_total_price").alias("total_spent")
    ) \
    .orderBy(desc("total_spent")) \
    .limit(10)

In [19]:
save_show(top_customers, "2_top10customers", "customer_id")

25/05/25 19:10:19 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:10:19 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:10:19 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [ffbcac96-5453-4a58-a547-60b3ec22fecf] (2 queries & 0 savepoints) is committed.
25/05/25 19:10:19 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [8b9f4f3b-2a9f-41f6-92ff-5d61bf1771df] (0 queries & 0 savepoints) is committed.


+-----------+--------------------+
|customer_id|         total_spent|
+-----------+--------------------+
|       1494|499.8000000000000...|
|       1527|499.6900000000000...|
|       1795|499.7300000000000...|
|       4466|499.7600000000000...|
|       4470|499.6200000000000...|
|       4508|499.5900000000000...|
|       5982|499.7100000000000...|
|       5991|499.6200000000000...|
|       6885|499.8500000000000...|
|       7992|499.7600000000000...|
+-----------+--------------------+



### Распределение клиентов по странам

In [20]:
customers_by_country = f_sales \
    .join(d_customer, f_sales.sale_customer_id == d_customer.customer_id) \
    .groupBy("customer_country") \
    .agg(
        count("customer_id").alias("customer_count")
    )

In [21]:
save_show(customers_by_country, "2_contryspread", "customer_country")

25/05/25 19:10:20 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:10:20 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:10:20 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [68435d53-2114-41e2-8314-62fc10f340dc] (2 queries & 0 savepoints) is committed.
25/05/25 19:10:20 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [4d24a1e8-9a45-427b-bca4-6642b4cb02aa] (0 queries & 0 savepoints) is committed.


+-------------------+--------------+
|   customer_country|customer_count|
+-------------------+--------------+
|        Afghanistan|            31|
|      Aland Islands|             2|
|            Albania|            46|
|     American Samoa|             3|
|            Andorra|             2|
|             Angola|             6|
|Antigua and Barbuda|             3|
|          Argentina|           113|
|            Armenia|            34|
|          Australia|             3|
|            Austria|             6|
|         Azerbaijan|            21|
|            Bahamas|             6|
|            Bahrain|             3|
|         Bangladesh|            17|
|           Barbados|             1|
|            Belarus|            30|
|            Belgium|             5|
|             Belize|             1|
|              Benin|             4|
+-------------------+--------------+
only showing top 20 rows


### Средний чек для каждого клиента

In [22]:
customer_avg_check = f_sales \
    .join(d_customer, f_sales.sale_customer_id == d_customer.customer_id) \
    .groupBy(
        "customer_id",
    ) \
    .agg(
        (sum("sale_total_price") / count("sale_id")).alias("avg_price"),
        avg("sale_quantity").alias("avg_items_count")
    )

In [23]:
save_show(customer_avg_check, "2_customeravgcheck", "customer_id")

25/05/25 19:10:21 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:10:21 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:10:21 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [785ebecf-1612-4801-a2ec-dbb1011c1c55] (11 queries & 0 savepoints) is committed.
25/05/25 19:10:21 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [15badcde-10d1-4a53-ae56-29f7e61c9e8f] (0 queries & 0 savepoints) is committed.


+-----------+--------------------+---------------+
|customer_id|           avg_price|avg_items_count|
+-----------+--------------------+---------------+
|         18|419.1000000000000...|            6.0|
|         30|79.18000000000000...|           10.0|
|         36|493.6100000000000...|            5.0|
|         46|43.96000000000000...|            4.0|
|         66|375.7400000000000...|            9.0|
|         67|417.7300000000000...|            7.0|
|         74|111.2400000000000...|            7.0|
|         89|280.3700000000000...|            4.0|
|         99|343.0800000000000...|            9.0|
|        104|450.2500000000000...|            5.0|
|        118|398.7000000000000...|            7.0|
|        124|359.7000000000000...|            1.0|
|        134|65.93000000000000...|            5.0|
|        138|145.9100000000000...|            7.0|
|        144|398.4400000000000...|            4.0|
|        153|158.5100000000000...|            8.0|
|        172|141.1500000000000.

## 3. Витрина продаж по времени

### Месячные и годовые тренды продаж + Сравнение выручки за разные периоды

In [24]:
sales_trends = f_sales \
    .withColumn("year", year("sale_date")) \
    .withColumn("month", month("sale_date")) \
    .groupBy("year", "month") \
    .agg(
        sum("sale_total_price").alias("monthly_revenue"),
        count("sale_id").alias("order_count"),
        sum("sale_quantity").alias("total_items_sold"),
        (sum("sale_total_price") / count("sale_id")).alias("avg_order_value"),
        approx_count_distinct("sale_customer_id").alias("unique_customers")
    ) \
    .orderBy("year", "month")

In [25]:
save_show(sales_trends, "3_salestrends", "year, month")

25/05/25 19:10:21 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
25/05/25 19:10:22 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:10:22 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:10:22 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [379eeebc-e29d-408a-870c-e03e04f885f7] (2 queries & 0 savepoints) is committed.
25/05/25 19:10:22 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [948d2972-4b64-4c52-b8b3-b7b5cde19080] (0 queries & 0 savepoints) is committed.


+----+-----+--------------------+-----------+----------------+--------------------+----------------+
|year|month|     monthly_revenue|order_count|total_items_sold|     avg_order_value|unique_customers|
+----+-----+--------------------+-----------+----------------+--------------------+----------------+
|2010|    1|203842.2000000000...|        807|            4478|252.5925650557620...|             802|
|2011|    1|197096.2200000000...|        775|            4171|254.3177032258064...|             811|
|2012|    1|202942.4800000000...|        800|            4534|253.6781000000000...|             821|
|2013|    1|199344.9800000000...|        788|            4257|252.9758629441624...|             765|
|2014|    1|191980.7200000000...|        773|            4277|248.3579818887451...|             765|
|2015|    1|197908.7900000000...|        760|            4043|260.4063026315789...|             734|
|2016|    1|192888.2600000000...|        766|            4213|251.8123498694516...|        

### Средний размер заказа по месяцам

In [26]:
avg_order_size = f_sales \
    .withColumn("year_month", date_format("sale_date", "yyyy-MM")) \
    .groupBy("year_month") \
    .agg(
        avg("sale_quantity").alias("avg_order_size"),
         avg("sale_total_price").alias("avg_order_price")
    ) \
    .orderBy("year_month")

In [27]:
save_show(avg_order_size, "3_avgordersize", "year_month")

+----------+------------------+--------------------+
|year_month|    avg_order_size|     avg_order_price|
+----------+------------------+--------------------+
|   2010-01| 5.548946716232962|252.5925650557620...|
|   2011-01| 5.381935483870968|254.3177032258064...|
|   2012-01|            5.6675|253.6781000000000...|
|   2013-01| 5.402284263959391|252.9758629441624...|
|   2014-01| 5.532988357050453|248.3579818887451...|
|   2015-01|5.3197368421052635|260.4063026315789...|
|   2016-01|               5.5|251.8123498694516...|
|   2017-01| 5.491525423728813|253.6779400260756...|
|   2018-01| 5.316770186335404|258.3427826086956...|
|   2019-01| 5.447988904299584|246.8088072122052...|
|   2020-01| 5.415977961432507|250.6467630853994...|
|   2021-01|5.4910941475826975|249.6330152671755...|
|   2022-01| 5.484848484848484|255.0706611570247...|
+----------+------------------+--------------------+



25/05/25 19:10:23 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:10:23 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:10:23 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [5936c5a7-5265-4317-871b-7a8c70722ddb] (2 queries & 0 savepoints) is committed.
25/05/25 19:10:23 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [150eb898-1aa6-48d6-ae8f-95165f05c93c] (0 queries & 0 savepoints) is committed.


## 4. Витрина продаж по магазинам

### Топ-5 магазинов с наибольшей выручкой

In [28]:
top_stores = f_sales \
    .join(d_store, f_sales.sale_store_id == d_store.store_id) \
    .groupBy(
        "store_id"
    ) \
    .agg(
        sum("sale_total_price").alias("total_revenue")
    ) \
    .orderBy(desc("total_revenue")) \
    .limit(5)

In [29]:
save_show(top_stores, "4_topstores", "store_id")

+--------+--------------------+
|store_id|       total_revenue|
+--------+--------------------+
|    2136|499.7600000000000...|
|    3268|499.8000000000000...|
|    4337|499.7600000000000...|
|    4590|499.7300000000000...|
|    6004|499.8500000000000...|
+--------+--------------------+



25/05/25 19:10:23 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:10:23 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:10:23 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [485f13bd-a4a3-4744-9236-2d4f94066350] (2 queries & 0 savepoints) is committed.
25/05/25 19:10:23 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [b59b302a-d99e-4288-9ea8-ce0d9ccf44c6] (0 queries & 0 savepoints) is committed.


### Распределение продаж по городам и странам

In [30]:
sales_by_countries = f_sales \
    .join(d_customer, f_sales.sale_customer_id == d_customer.customer_id) \
    .groupBy(d_customer.customer_country) \
    .agg(
        sum("sale_total_price").alias("total_revenue"),
        sum("sale_quantity").alias("total_items_sold")
    ) \
    .orderBy(desc("total_revenue"))

In [31]:
save_show(sales_by_countries, "4_salesbycountries", "customer_country")

+-------------------+--------------------+----------------+
|   customer_country|       total_revenue|total_items_sold|
+-------------------+--------------------+----------------+
|        Afghanistan|8532.720000000000...|             171|
|      Aland Islands|571.4600000000000...|               7|
|            Albania|11821.24000000000...|             227|
|     American Samoa|385.7500000000000...|              22|
|            Andorra|579.8500000000000...|              15|
|             Angola|1914.940000000000...|              30|
|Antigua and Barbuda|775.3700000000000...|              19|
|          Argentina|26739.31000000000...|             614|
|            Armenia|8847.560000000000...|             185|
|          Australia|726.0700000000000...|              19|
|            Austria|1441.180000000000...|              39|
|         Azerbaijan|5853.520000000000...|             126|
|            Bahamas|1658.530000000000...|              26|
|            Bahrain|596.4100000000000..

25/05/25 19:10:24 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:10:24 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:10:24 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [775d894b-20f2-423e-ad31-621a5773d550] (2 queries & 0 savepoints) is committed.
25/05/25 19:10:24 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [5977c8ad-0cfa-473f-85be-49d945fc1754] (0 queries & 0 savepoints) is committed.


### Средний чек для каждого магазина

In [32]:
avg_store_check = f_sales \
    .join(d_store, f_sales.sale_store_id == d_store.store_id) \
    .groupBy("store_id") \
    .agg(
        avg("sale_total_price").alias("avg_check_price"),
        avg("sale_quantity").alias("avg_items_sold")
    )

In [33]:
save_show(avg_store_check, "4_avgstorecheck", "store_id")

25/05/25 19:10:25 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:10:25 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:10:25 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [e2c9ec4c-920c-476b-b16e-1c2818578311] (11 queries & 0 savepoints) is committed.
25/05/25 19:10:25 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [8a658e2a-2af8-41e4-8721-26b607357d9f] (0 queries & 0 savepoints) is committed.


+--------+--------------------+--------------+
|store_id|     avg_check_price|avg_items_sold|
+--------+--------------------+--------------+
|       2|99.28000000000000...|           1.0|
|      11|388.1500000000000...|           9.0|
|      14|108.2800000000000...|           4.0|
|      33|90.46000000000000...|           4.0|
|      42|396.0100000000000...|           8.0|
|      58|233.1300000000000...|           6.0|
|      68|19.98000000000000...|           6.0|
|      71|100.0600000000000...|           9.0|
|      79|49.17000000000000...|           9.0|
|      83|123.4800000000000...|          10.0|
|     105|101.2900000000000...|           8.0|
|     106|459.9200000000000...|           5.0|
|     110|16.67000000000000...|           6.0|
|     116|323.1000000000000...|           6.0|
|     118|188.5900000000000...|           7.0|
|     119|138.4600000000000...|           5.0|
|     123|236.8500000000000...|           5.0|
|     131|359.3800000000000...|          10.0|
|     135|233

## 5. Витрина продаж по поставщикам

### Топ-5 поставщиков с наибольшей выручкой

In [34]:
supplier_revenue = f_sales \
    .join(d_product, f_sales.sale_product_id == d_product.product_id) \
    .join(d_supplier, d_product.supplier_id == d_supplier.supplier_id) \
    .groupBy(
        d_supplier.supplier_id
    ) \
    .agg(
        sum("sale_total_price").alias("total_revenue")
    ) \
    .orderBy(desc("total_revenue")) \
    .limit(5)

In [35]:
save_show(supplier_revenue, "5_top5suppliers", "supplier_id")

+-----------+--------------------+
|supplier_id|       total_revenue|
+-----------+--------------------+
|        718|499.7600000000000...|
|        796|499.7600000000000...|
|       1577|499.8000000000000...|
|       5598|499.7300000000000...|
|       6266|499.8500000000000...|
+-----------+--------------------+



25/05/25 19:10:26 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:10:26 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:10:26 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [646f8876-302c-4b05-986a-f48580127c8b] (2 queries & 0 savepoints) is committed.
25/05/25 19:10:26 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [5a1dc35c-1f89-4e75-a7b6-c23473b4e44c] (0 queries & 0 savepoints) is committed.


### Средняя цена товаров от каждого поставщика

In [36]:
supplier_avg_prices = d_product \
    .join(d_supplier, d_product.supplier_id == d_supplier.supplier_id) \
    .groupBy(
        d_supplier.supplier_id
    ) \
    .agg(
        avg("product_price").alias("avg_product_price")
    ) \
    .orderBy(desc("avg_product_price"))

In [37]:
save_show(supplier_avg_prices, "5_avgsupplierprice", "supplier_id")

25/05/25 19:10:27 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:10:27 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:10:27 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [3f8ad18c-cffd-47c2-aa72-19fe0b61b127] (11 queries & 0 savepoints) is committed.
25/05/25 19:10:27 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [6f0f7fdb-b395-4883-8e91-ffec67f5bec5] (0 queries & 0 savepoints) is committed.


+-----------+--------------------+
|supplier_id|   avg_product_price|
+-----------+--------------------+
|         10|16.56000000000000...|
|         15|12.16000000000000...|
|         16|16.64000000000000...|
|         23|18.78000000000000...|
|         32|12.82000000000000...|
|         49|11.77000000000000...|
|         50|13.92000000000000...|
|         58|19.56000000000000...|
|         69|13.45000000000000...|
|         91|14.50000000000000...|
|         93|21.11000000000000...|
|         97|13.33000000000000...|
|        111|17.87000000000000...|
|        120|20.77000000000000...|
|        122|16.32000000000000...|
|        139|16.43000000000000...|
|        143|12.75000000000000...|
|        144|13.52000000000000...|
|        157|15.34000000000000...|
|        162|13.62000000000000...|
+-----------+--------------------+
only showing top 20 rows


### Распределение продаж по странам поставщиков

In [38]:
sales_by_supplier_country = f_sales \
    .join(d_product, f_sales.sale_product_id == d_product.product_id) \
    .join(d_supplier, d_product.supplier_id == d_supplier.supplier_id) \
    .groupBy("supplier_country") \
    .agg(
        sum("sale_total_price").alias("total_revenue"),
        count("sale_id").alias("order_count"),
        sum("sale_quantity").alias("total_items_sold")
    )

In [39]:
save_show(sales_by_supplier_country, "5_salesbysuppliercountry", "supplier_country")

25/05/25 19:10:28 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:10:28 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:10:28 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [d92c474c-f7c0-4c09-b175-aad0ac93eb25] (2 queries & 0 savepoints) is committed.
25/05/25 19:10:28 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [63788394-8c83-4145-8f72-2ffb9feb0ea4] (0 queries & 0 savepoints) is committed.


+-------------------+--------------------+-----------+----------------+
|   supplier_country|       total_revenue|order_count|total_items_sold|
+-------------------+--------------------+-----------+----------------+
|        Afghanistan|9068.100000000000...|         41|             201|
|            Albania|11172.62000000000...|         46|             250|
|            Algeria|230.2500000000000...|          1|               8|
|     American Samoa|185.1000000000000...|          2|              11|
|            Andorra|344.8200000000000...|          2|               9|
|             Angola|1951.180000000000...|         11|              57|
|Antigua and Barbuda|835.6400000000000...|          4|              19|
|          Argentina|35606.32000000000...|        131|             691|
|            Armenia|7467.310000000000...|         29|             189|
|              Aruba|90.49000000000000...|          1|               1|
|          Australia|2946.940000000000...|         13|          

## 6. Качество продукции

### Продукты с наивысшим и наименьшим рейтингом

In [40]:
rated_products = d_product \
    .select(
        "product_id",
        "product_rating"
    )

In [41]:
save_show(rated_products, "6_productrating_asc", "product_id")
# Можно сортировать при выводе


25/05/25 19:10:28 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:10:28 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:10:28 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [3d674a21-182b-478c-ae83-cbaa11b63280] (11 queries & 0 savepoints) is committed.
25/05/25 19:10:28 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [3093e3cf-e752-4527-99f0-11d11dab2a42] (0 queries & 0 savepoints) is committed.


+----------+--------------------+
|product_id|      product_rating|
+----------+--------------------+
|      7664|1.100000000000000000|
|      7728|3.300000000000000000|
|      7730|2.700000000000000000|
|      7731|1.800000000000000000|
|      7733|2.600000000000000000|
|      7735|1.600000000000000000|
|      7736|1.200000000000000000|
|      7738|2.000000000000000000|
|      7740|1.000000000000000000|
|      7742|1.300000000000000000|
|      7744|3.900000000000000000|
|      7746|2.100000000000000000|
|      7748|3.500000000000000000|
|      7750|2.200000000000000000|
|      7752|4.800000000000000000|
|      7754|1.000000000000000000|
|      7756|3.800000000000000000|
|      7758|3.700000000000000000|
|      7760|4.600000000000000000|
|      7762|4.300000000000000000|
+----------+--------------------+
only showing top 20 rows


### Корреляция между рейтингом и объемом продаж

In [42]:
rating_sales_correlation = d_product \
    .join(
        f_sales.groupBy("sale_product_id")
            .agg(
                sum("sale_quantity").alias("total_quantity_sold"),
                sum("sale_total_price").alias("total_revenue"),
                count("sale_id").alias("order_count")
            ),
        d_product.product_id == f_sales.sale_product_id
    ) \
    .select(
        "product_id",
        "product_rating",
        "total_quantity_sold",
        "total_revenue",
        "order_count"
    )

In [43]:
save_show(rating_sales_correlation, "6_ratingsalescorrelation", "product_id")

25/05/25 19:10:29 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:10:29 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:10:29 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [c2886f31-16dc-4ee3-ab61-6f6bec10a599] (11 queries & 0 savepoints) is committed.
25/05/25 19:10:29 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [ceab4aa8-b7a1-41e9-9f8c-0fb0ebe182c3] (0 queries & 0 savepoints) is committed.


+----------+--------------------+-------------------+--------------------+-----------+
|product_id|      product_rating|total_quantity_sold|       total_revenue|order_count|
+----------+--------------------+-------------------+--------------------+-----------+
|        10|4.400000000000000000|                  4|479.3100000000000...|          1|
|        24|3.500000000000000000|                  9|380.8400000000000...|          1|
|        25|1.100000000000000000|                  6|225.2600000000000...|          1|
|        38|3.600000000000000000|                 10|87.81000000000000...|          1|
|        45|3.100000000000000000|                  2|390.8300000000000...|          1|
|        50|2.700000000000000000|                  3|262.7200000000000...|          1|
|        62|4.300000000000000000|                  5|350.2600000000000...|          1|
|        63|2.900000000000000000|                  9|391.4400000000000...|          1|
|        70|2.900000000000000000|          

### Продукты с наибольшим количеством отзывов

In [44]:
most_reviewed_products = d_product \
    .select(
        "product_id",
        "product_reviews"
    )

In [45]:
save_show(most_reviewed_products, "6_productreviewcount", "product_id")

25/05/25 19:10:29 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:10:29 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:10:29 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [c70f8ddd-1610-4034-9738-6c2cb140cd86] (11 queries & 0 savepoints) is committed.
25/05/25 19:10:29 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [e959d753-d9e8-4f43-9a77-5cfed8af8fb8] (0 queries & 0 savepoints) is committed.


+----------+---------------+
|product_id|product_reviews|
+----------+---------------+
|         1|             19|
|         2|            721|
|         3|            616|
|         4|            532|
|         5|             36|
|         6|            182|
|         7|            499|
|         8|            969|
|         9|            445|
|        10|            824|
|        11|            124|
|        12|            209|
|        13|            125|
|        14|            712|
|        15|             77|
|        16|            514|
|        17|            306|
|        18|            174|
|        19|            204|
|        20|            400|
+----------+---------------+
only showing top 20 rows


# Закрытие сессии

In [47]:
spark.stop()