Запускается либо локально, либо в контейнере с подключённым spark master/worker

# Общая настройка

In [1]:
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars ./jars/postgresql-42.6.0.jar,./jars/clickhouse-jdbc-0.4.6.jar pyspark-shell'

In [2]:
USER = os.getenv("USER")
PASSWORD = os.getenv("PASSWORD")
HOST = os.getenv("HOST")

PG_NAME = os.getenv("POSTGRES_DB")
PG_PORT = os.getenv("POSTGRES_PORT")

CH_NAME = os.getenv("CLICKHOUSE_DB")
CH_PORT = os.getenv("CLICKHOUSE_PORT")

# Настройка подключения pyspark

In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Spark SQL with PostgreSQL") \
    .getOrCreate()

25/05/25 19:18:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
pg_jdbc_url = f"jdbc:postgresql://{HOST}:{PG_PORT}/{PG_NAME}"
pg_properties = {
    "user": USER,
    "password": PASSWORD,
    "driver": "org.postgresql.Driver"
}

In [5]:
ch_jdbc_url = f"jdbc:clickhouse://{HOST}:{CH_PORT}/{CH_NAME}"
ch_properties = {
    "user": USER,
    "password": PASSWORD,
    "driver": "com.clickhouse.jdbc.ClickHouseDriver"
}

# Витрины

In [6]:
from pyspark.sql.functions import *

In [7]:
def load_table(table_name):
    return spark.read.jdbc(url=pg_jdbc_url, table=table_name, properties=pg_properties)

In [8]:
d_customer = load_table("d_customer")
d_product = load_table("d_product")
d_seller = load_table("d_seller")
d_store = load_table("d_store")
d_supplier = load_table("d_supplier")

f_sales = load_table("f_sales")

In [9]:
def save_report(df, name: str, order: str):
    df.write \
    .mode("overwrite") \
    .option("createTableOptions", f"""
        ENGINE = MergeTree()
        ORDER BY ({order})
    """) \
    .jdbc(url=ch_jdbc_url, table=name, properties=ch_properties)

In [10]:
def read_show(table_name:str):
    ch_df = spark.read.jdbc(url=ch_jdbc_url, table=table_name, properties=ch_properties)
    ch_df.show()

In [11]:
def save_show(df, name, order):
    save_report(df, name, order)
    read_show(name)

## 1. Витрина продаж по продуктам

### Топ-10 самых продаваемых продуктов

In [12]:
top_products = f_sales \
    .join(d_product, f_sales.sale_product_id == d_product.product_id) \
    .groupBy(
        "product_id",
    ) \
    .agg(
        sum("sale_quantity").alias("total_quantity_sold"),
        sum("sale_total_price").alias("total_revenue")
    ) \
    .orderBy(desc("total_revenue")) \
    .limit(10)

In [13]:
save_show(top_products, "1_top10products", "product_id")

25/05/25 19:18:10 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:18:10 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:18:10 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [91e908a8-7ec1-408c-9df7-b6ad1ba5273a] (2 queries & 0 savepoints) is committed.
25/05/25 19:18:10 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [0e7e4ff6-524a-423e-a115-417b8b660816] (0 queries & 0 savepoints) is committed.


+----------+-------------------+--------------------+
|product_id|total_quantity_sold|       total_revenue|
+----------+-------------------+--------------------+
|       124|                  4|499.6900000000000...|
|       126|                  9|499.6200000000000...|
|       383|                 10|499.7300000000000...|
|      2481|                  8|499.7600000000000...|
|      2810|                  2|499.7600000000000...|
|      3826|                  7|499.8500000000000...|
|      5001|                  8|499.7100000000000...|
|      5523|                  9|499.8000000000000...|
|      7488|                 10|499.5900000000000...|
|      9481|                  3|499.6200000000000...|
+----------+-------------------+--------------------+



### Общая выручка по категориям продуктов

In [14]:
revenue_by_category = f_sales \
    .join(d_product, f_sales.sale_product_id == d_product.product_id) \
    .groupBy(
        "product_category",
        "pet_category"
    ) \
    .agg(
        sum("sale_total_price").alias("total_revenue")
    ) \
    .orderBy(desc("total_revenue"))

In [15]:
save_show(revenue_by_category, "1_bycategory", "product_category")

25/05/25 19:18:12 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:18:12 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:18:12 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [752dca19-9b8e-489e-b8c1-0a4cedce617b] (2 queries & 0 savepoints) is committed.
25/05/25 19:18:12 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [8cebe1e8-c40b-4a43-bfbf-712a7583e61c] (0 queries & 0 savepoints) is committed.


+----------------+------------+--------------------+
|product_category|pet_category|       total_revenue|
+----------------+------------+--------------------+
|            Cage|       Birds|181830.9300000000...|
|            Cage|    Reptiles|172473.9200000000...|
|            Cage|        Dogs|166057.7100000000...|
|            Cage|        Fish|158665.9600000000...|
|            Cage|        Cats|152089.4200000000...|
|            Food|       Birds|177085.2300000000...|
|            Food|        Fish|170664.5800000000...|
|            Food|        Cats|165220.8100000000...|
|            Food|        Dogs|162632.2800000000...|
|            Food|    Reptiles|155029.6500000000...|
|             Toy|        Dogs|182909.8500000000...|
|             Toy|        Fish|181426.5000000000...|
|             Toy|       Birds|176069.2100000000...|
|             Toy|        Cats|163869.8900000000...|
|             Toy|    Reptiles|163826.1800000000...|
+----------------+------------+---------------

### Средний рейтинг и количество отзывов для каждого продукта

In [16]:
product_ratings = d_product.select(
    "product_id",
    "product_rating",
    "product_reviews"
)

In [17]:
save_show(product_ratings, "1_ratings", "product_id")

25/05/25 19:18:12 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:18:12 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:18:12 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [bf76de60-a1c8-4249-a452-f08069d1328a] (11 queries & 0 savepoints) is committed.
25/05/25 19:18:12 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [6698460d-dd2f-4094-b5a7-3c0411dce765] (0 queries & 0 savepoints) is committed.


+----------+--------------------+---------------+
|product_id|      product_rating|product_reviews|
+----------+--------------------+---------------+
|      8996|2.500000000000000000|            660|
|      8997|1.200000000000000000|            974|
|      8998|2.500000000000000000|            733|
|      8999|4.900000000000000000|            228|
|      9000|3.800000000000000000|            307|
|      9001|3.000000000000000000|            539|
|      9002|1.800000000000000000|            261|
|      9003|4.500000000000000000|            983|
|      9004|1.100000000000000000|            369|
|      9005|4.000000000000000000|            110|
|      9006|2.400000000000000000|            698|
|      9007|2.600000000000000000|            999|
|      9008|3.300000000000000000|             36|
|      9009|4.600000000000000000|            685|
|      9010|3.300000000000000000|            691|
|      9011|4.100000000000000000|            646|
|      9012|2.400000000000000000|            226|


## 2. Витрина продаж по клиентам

### Топ-10 клиентов с наибольшей общей суммой покупок

In [18]:
top_customers = f_sales \
    .join(d_customer, f_sales.sale_customer_id == d_customer.customer_id) \
    .groupBy(
        "customer_id",
    ) \
    .agg(
        sum("sale_total_price").alias("total_spent")
    ) \
    .orderBy(desc("total_spent")) \
    .limit(10)

In [19]:
save_show(top_customers, "2_top10customers", "customer_id")

+-----------+--------------------+
|customer_id|         total_spent|
+-----------+--------------------+
|       1494|499.8000000000000...|
|       1527|499.6900000000000...|
|       1795|499.7300000000000...|
|       4466|499.7600000000000...|
|       4470|499.6200000000000...|
|       4508|499.5900000000000...|
|       5982|499.7100000000000...|
|       5991|499.6200000000000...|
|       6885|499.8500000000000...|
|       7992|499.7600000000000...|
+-----------+--------------------+



25/05/25 19:18:13 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:18:13 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:18:13 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [7696dd62-a283-472f-b99d-3d433cff0825] (2 queries & 0 savepoints) is committed.
25/05/25 19:18:13 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [b4a608c4-4f90-40c3-8048-4b098bbe4bac] (0 queries & 0 savepoints) is committed.


### Распределение клиентов по странам

In [20]:
customers_by_country = f_sales \
    .join(d_customer, f_sales.sale_customer_id == d_customer.customer_id) \
    .groupBy("customer_country") \
    .agg(
        count("customer_id").alias("customer_count")
    )

In [21]:
save_show(customers_by_country, "2_contryspread", "customer_country")

25/05/25 19:18:14 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:18:14 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:18:14 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [38afd24c-1878-46fa-a795-8ecef6bf811f] (2 queries & 0 savepoints) is committed.
25/05/25 19:18:14 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [89f646a2-0f25-4d9d-95ce-556e35e9315c] (0 queries & 0 savepoints) is committed.


+-------------------+--------------+
|   customer_country|customer_count|
+-------------------+--------------+
|        Afghanistan|            31|
|      Aland Islands|             2|
|            Albania|            46|
|     American Samoa|             3|
|            Andorra|             2|
|             Angola|             6|
|Antigua and Barbuda|             3|
|          Argentina|           113|
|            Armenia|            34|
|          Australia|             3|
|            Austria|             6|
|         Azerbaijan|            21|
|            Bahamas|             6|
|            Bahrain|             3|
|         Bangladesh|            17|
|           Barbados|             1|
|            Belarus|            30|
|            Belgium|             5|
|             Belize|             1|
|              Benin|             4|
+-------------------+--------------+
only showing top 20 rows


### Средний чек для каждого клиента

In [22]:
customer_avg_check = f_sales \
    .join(d_customer, f_sales.sale_customer_id == d_customer.customer_id) \
    .groupBy(
        "customer_id",
    ) \
    .agg(
        (sum("sale_total_price") / count("sale_id")).alias("avg_price"),
        avg("sale_quantity").alias("avg_items_count")
    )

In [23]:
save_show(customer_avg_check, "2_customeravgcheck", "customer_id")

25/05/25 19:18:15 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:18:15 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:18:15 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [9b2e6e2d-32dd-42b1-b17e-2d24a2b14def] (11 queries & 0 savepoints) is committed.
25/05/25 19:18:15 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [a015a0bf-20ef-4fda-ba5b-7d103d824e86] (0 queries & 0 savepoints) is committed.


+-----------+--------------------+---------------+
|customer_id|           avg_price|avg_items_count|
+-----------+--------------------+---------------+
|         21|224.5900000000000...|            6.0|
|         29|70.07000000000000...|            2.0|
|         32|46.50000000000000...|            6.0|
|         56|439.7500000000000...|            3.0|
|         60|238.9100000000000...|           10.0|
|         75|491.3400000000000...|            8.0|
|         90|379.9200000000000...|            3.0|
|         95|221.3900000000000...|            2.0|
|         98|238.9100000000000...|            8.0|
|        109|88.86000000000000...|            3.0|
|        141|110.0000000000000...|            7.0|
|        143|468.2200000000000...|           10.0|
|        145|181.6600000000000...|            4.0|
|        151|182.1000000000000...|            1.0|
|        195|229.6400000000000...|            6.0|
|        200|271.6200000000000...|            2.0|
|        203|325.0600000000000.

## 3. Витрина продаж по времени

### Месячные и годовые тренды продаж + Сравнение выручки за разные периоды

In [24]:
sales_trends = f_sales \
    .withColumn("year", year("sale_date")) \
    .withColumn("month", month("sale_date")) \
    .groupBy("year", "month") \
    .agg(
        sum("sale_total_price").alias("monthly_revenue"),
        count("sale_id").alias("order_count"),
        sum("sale_quantity").alias("total_items_sold"),
        (sum("sale_total_price") / count("sale_id")).alias("avg_order_value"),
        approx_count_distinct("sale_customer_id").alias("unique_customers")
    ) \
    .orderBy("year", "month")

In [25]:
save_show(sales_trends, "3_salestrends", "year, month")

25/05/25 19:18:15 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
25/05/25 19:18:16 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:18:16 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:18:16 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [8ffea3b0-5d10-4628-ae7f-8cd189184505] (2 queries & 0 savepoints) is committed.
25/05/25 19:18:16 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [770ba917-65a3-4b8f-98aa-8564cce5fc50] (0 queries & 0 savepoints) is committed.


+----+-----+--------------------+-----------+----------------+--------------------+----------------+
|year|month|     monthly_revenue|order_count|total_items_sold|     avg_order_value|unique_customers|
+----+-----+--------------------+-----------+----------------+--------------------+----------------+
|2010|    1|203842.2000000000...|        807|            4478|252.5925650557620...|             802|
|2011|    1|197096.2200000000...|        775|            4171|254.3177032258064...|             811|
|2012|    1|202942.4800000000...|        800|            4534|253.6781000000000...|             821|
|2013|    1|199344.9800000000...|        788|            4257|252.9758629441624...|             765|
|2014|    1|191980.7200000000...|        773|            4277|248.3579818887451...|             765|
|2015|    1|197908.7900000000...|        760|            4043|260.4063026315789...|             734|
|2016|    1|192888.2600000000...|        766|            4213|251.8123498694516...|        

### Средний размер заказа по месяцам

In [26]:
avg_order_size = f_sales \
    .withColumn("year_month", date_format("sale_date", "yyyy-MM")) \
    .groupBy("year_month") \
    .agg(
        avg("sale_quantity").alias("avg_order_size"),
         avg("sale_total_price").alias("avg_order_price")
    ) \
    .orderBy("year_month")

In [27]:
save_show(avg_order_size, "3_avgordersize", "year_month")

25/05/25 19:18:17 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:18:17 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:18:17 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [1ac7f863-6b8e-48a2-958c-473c4cea3417] (2 queries & 0 savepoints) is committed.
25/05/25 19:18:17 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [42fd2761-92f1-4e49-a6c7-85f813ce775e] (0 queries & 0 savepoints) is committed.


+----------+------------------+--------------------+
|year_month|    avg_order_size|     avg_order_price|
+----------+------------------+--------------------+
|   2010-01| 5.548946716232962|252.5925650557620...|
|   2011-01| 5.381935483870968|254.3177032258064...|
|   2012-01|            5.6675|253.6781000000000...|
|   2013-01| 5.402284263959391|252.9758629441624...|
|   2014-01| 5.532988357050453|248.3579818887451...|
|   2015-01|5.3197368421052635|260.4063026315789...|
|   2016-01|               5.5|251.8123498694516...|
|   2017-01| 5.491525423728813|253.6779400260756...|
|   2018-01| 5.316770186335404|258.3427826086956...|
|   2019-01| 5.447988904299584|246.8088072122052...|
|   2020-01| 5.415977961432507|250.6467630853994...|
|   2021-01|5.4910941475826975|249.6330152671755...|
|   2022-01| 5.484848484848484|255.0706611570247...|
+----------+------------------+--------------------+



## 4. Витрина продаж по магазинам

### Топ-5 магазинов с наибольшей выручкой

In [28]:
top_stores = f_sales \
    .join(d_store, f_sales.sale_store_id == d_store.store_id) \
    .groupBy(
        "store_id"
    ) \
    .agg(
        sum("sale_total_price").alias("total_revenue")
    ) \
    .orderBy(desc("total_revenue")) \
    .limit(5)

In [29]:
save_show(top_stores, "4_topstores", "store_id")

25/05/25 19:18:18 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:18:18 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:18:18 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [fbca9742-db40-48b6-8499-b8cae95cb0ea] (2 queries & 0 savepoints) is committed.
25/05/25 19:18:18 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [41e2327e-905b-47e2-9434-5ee291d63e72] (0 queries & 0 savepoints) is committed.


+--------+--------------------+
|store_id|       total_revenue|
+--------+--------------------+
|    2136|499.7600000000000...|
|    3268|499.8000000000000...|
|    4337|499.7600000000000...|
|    4590|499.7300000000000...|
|    6004|499.8500000000000...|
+--------+--------------------+



### Распределение продаж по городам и странам

In [30]:
sales_by_countries = f_sales \
    .join(d_customer, f_sales.sale_customer_id == d_customer.customer_id) \
    .groupBy(d_customer.customer_country) \
    .agg(
        sum("sale_total_price").alias("total_revenue"),
        sum("sale_quantity").alias("total_items_sold")
    ) \
    .orderBy(desc("total_revenue"))

In [31]:
save_show(sales_by_countries, "4_salesbycountries", "customer_country")

+-------------------+--------------------+----------------+
|   customer_country|       total_revenue|total_items_sold|
+-------------------+--------------------+----------------+
|        Afghanistan|8532.720000000000...|             171|
|      Aland Islands|571.4600000000000...|               7|
|            Albania|11821.24000000000...|             227|
|     American Samoa|385.7500000000000...|              22|
|            Andorra|579.8500000000000...|              15|
|             Angola|1914.940000000000...|              30|
|Antigua and Barbuda|775.3700000000000...|              19|
|          Argentina|26739.31000000000...|             614|
|            Armenia|8847.560000000000...|             185|
|          Australia|726.0700000000000...|              19|
|            Austria|1441.180000000000...|              39|
|         Azerbaijan|5853.520000000000...|             126|
|            Bahamas|1658.530000000000...|              26|
|            Bahrain|596.4100000000000..

25/05/25 19:18:18 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:18:18 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:18:18 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [f262a3bb-4113-476c-83a5-482760277cd7] (2 queries & 0 savepoints) is committed.
25/05/25 19:18:18 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [ed57404a-ca9e-4003-afdd-567aae46542a] (0 queries & 0 savepoints) is committed.


### Средний чек для каждого магазина

In [32]:
avg_store_check = f_sales \
    .join(d_store, f_sales.sale_store_id == d_store.store_id) \
    .groupBy("store_id") \
    .agg(
        avg("sale_total_price").alias("avg_check_price"),
        avg("sale_quantity").alias("avg_items_sold")
    )

In [33]:
save_show(avg_store_check, "4_avgstorecheck", "store_id")

25/05/25 19:18:19 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:18:19 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:18:19 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [ab203e17-4e30-4f04-853b-86a50a918efc] (11 queries & 0 savepoints) is committed.
25/05/25 19:18:19 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [dfb35be4-560b-4ca4-ba8f-d72319d3b1e9] (0 queries & 0 savepoints) is committed.


+--------+--------------------+--------------+
|store_id|     avg_check_price|avg_items_sold|
+--------+--------------------+--------------+
|      18|234.7500000000000...|           1.0|
|      30|138.2800000000000...|           2.0|
|      36|492.2300000000000...|           7.0|
|      46|408.3200000000000...|           6.0|
|      66|419.1000000000000...|           1.0|
|      67|450.1700000000000...|           3.0|
|      74|157.4300000000000...|           3.0|
|      89|253.0100000000000...|           7.0|
|      99|253.9800000000000...|          10.0|
|     104|274.9100000000000...|           9.0|
|     124|465.7700000000000...|           5.0|
|     134|438.0400000000000...|           8.0|
|     138|51.98000000000000...|           7.0|
|     144|69.73000000000000...|           7.0|
|     153|465.9700000000000...|           8.0|
|     172|480.4600000000000...|           1.0|
|     174|314.8800000000000...|          10.0|
|     180|361.2700000000000...|           1.0|
|     184|321

## 5. Витрина продаж по поставщикам

### Топ-5 поставщиков с наибольшей выручкой

In [34]:
supplier_revenue = f_sales \
    .join(d_product, f_sales.sale_product_id == d_product.product_id) \
    .join(d_supplier, d_product.supplier_id == d_supplier.supplier_id) \
    .groupBy(
        d_supplier.supplier_id
    ) \
    .agg(
        sum("sale_total_price").alias("total_revenue")
    ) \
    .orderBy(desc("total_revenue")) \
    .limit(5)

In [35]:
save_show(supplier_revenue, "5_top5suppliers", "supplier_id")

+-----------+--------------------+
|supplier_id|       total_revenue|
+-----------+--------------------+
|        718|499.7600000000000...|
|        796|499.7600000000000...|
|       1577|499.8000000000000...|
|       5598|499.7300000000000...|
|       6266|499.8500000000000...|
+-----------+--------------------+



25/05/25 19:18:20 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:18:20 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:18:20 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [8354d714-003c-40b0-850e-a1238d2c76c9] (2 queries & 0 savepoints) is committed.
25/05/25 19:18:20 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [2466eb3d-6217-42f4-9b90-dff924dc2fb6] (0 queries & 0 savepoints) is committed.


### Средняя цена товаров от каждого поставщика

In [36]:
supplier_avg_prices = d_product \
    .join(d_supplier, d_product.supplier_id == d_supplier.supplier_id) \
    .groupBy(
        d_supplier.supplier_id
    ) \
    .agg(
        avg("product_price").alias("avg_product_price")
    ) \
    .orderBy(desc("avg_product_price"))

In [37]:
save_show(supplier_avg_prices, "5_avgsupplierprice", "supplier_id")

25/05/25 19:18:21 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:18:21 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:18:21 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [3037b15b-ec7e-4a8a-8163-92b27ddb7841] (11 queries & 0 savepoints) is committed.
25/05/25 19:18:21 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [f2ba38fd-7ee3-4a25-a0b2-0a8e8ecf3cb3] (0 queries & 0 savepoints) is committed.


+-----------+--------------------+
|supplier_id|   avg_product_price|
+-----------+--------------------+
|          4|40.54000000000000...|
|          6|37.49000000000000...|
|         11|34.73000000000000...|
|         22|33.15000000000000...|
|         39|40.69000000000000...|
|         53|38.97000000000000...|
|         71|34.09000000000000...|
|         75|32.66000000000000...|
|        106|38.62000000000000...|
|        114|36.38000000000000...|
|        129|38.58000000000000...|
|        142|32.78000000000000...|
|        145|40.24000000000000...|
|        177|34.98000000000000...|
|        196|36.98000000000000...|
|        234|40.63000000000000...|
|        240|31.70000000000000...|
|        243|33.93000000000000...|
|        248|32.87000000000000...|
|        250|32.01000000000000...|
+-----------+--------------------+
only showing top 20 rows


### Распределение продаж по странам поставщиков

In [38]:
sales_by_supplier_country = f_sales \
    .join(d_product, f_sales.sale_product_id == d_product.product_id) \
    .join(d_supplier, d_product.supplier_id == d_supplier.supplier_id) \
    .groupBy("supplier_country") \
    .agg(
        sum("sale_total_price").alias("total_revenue"),
        count("sale_id").alias("order_count"),
        sum("sale_quantity").alias("total_items_sold")
    )

In [39]:
save_show(sales_by_supplier_country, "5_salesbysuppliercountry", "supplier_country")

+-------------------+--------------------+-----------+----------------+
|   supplier_country|       total_revenue|order_count|total_items_sold|
+-------------------+--------------------+-----------+----------------+
|        Afghanistan|9068.100000000000...|         41|             201|
|            Albania|11172.62000000000...|         46|             250|
|            Algeria|230.2500000000000...|          1|               8|
|     American Samoa|185.1000000000000...|          2|              11|
|            Andorra|344.8200000000000...|          2|               9|
|             Angola|1951.180000000000...|         11|              57|
|Antigua and Barbuda|835.6400000000000...|          4|              19|
|          Argentina|35606.32000000000...|        131|             691|
|            Armenia|7467.310000000000...|         29|             189|
|              Aruba|90.49000000000000...|          1|               1|
|          Australia|2946.940000000000...|         13|          

25/05/25 19:18:22 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:18:22 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:18:22 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [4daeee0a-c94c-43bd-83bc-11baa7a46d8b] (2 queries & 0 savepoints) is committed.
25/05/25 19:18:22 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [b10b6618-661b-4692-a1ff-6cd7ec91fe5e] (0 queries & 0 savepoints) is committed.


## 6. Качество продукции

### Продукты с наивысшим и наименьшим рейтингом

In [40]:
rated_products = d_product \
    .select(
        "product_id",
        "product_rating"
    )

In [41]:
save_show(rated_products, "6_productrating_asc", "product_id")
# Можно сортировать при выводе


25/05/25 19:18:22 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:18:22 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:18:22 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [c1f38c8c-94d1-4c00-8a4f-31a8f977db74] (11 queries & 0 savepoints) is committed.
25/05/25 19:18:22 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [5d8e5415-bbcf-4664-8141-25faac69117d] (0 queries & 0 savepoints) is committed.


+----------+--------------------+
|product_id|      product_rating|
+----------+--------------------+
|      8996|2.500000000000000000|
|      8997|1.200000000000000000|
|      8998|2.500000000000000000|
|      8999|4.900000000000000000|
|      9000|3.800000000000000000|
|      9001|3.000000000000000000|
|      9002|1.800000000000000000|
|      9003|4.500000000000000000|
|      9004|1.100000000000000000|
|      9005|4.000000000000000000|
|      9006|2.400000000000000000|
|      9007|2.600000000000000000|
|      9008|3.300000000000000000|
|      9009|4.600000000000000000|
|      9010|3.300000000000000000|
|      9011|4.100000000000000000|
|      9012|2.400000000000000000|
|      9013|3.700000000000000000|
|      9014|1.100000000000000000|
|      9015|2.800000000000000000|
+----------+--------------------+
only showing top 20 rows


### Корреляция между рейтингом и объемом продаж

In [42]:
rating_sales_correlation = d_product \
    .join(
        f_sales.groupBy("sale_product_id")
            .agg(
                sum("sale_quantity").alias("total_quantity_sold"),
                sum("sale_total_price").alias("total_revenue"),
                count("sale_id").alias("order_count")
            ),
        d_product.product_id == f_sales.sale_product_id
    ) \
    .select(
        "product_id",
        "product_rating",
        "total_quantity_sold",
        "total_revenue",
        "order_count"
    )

In [43]:
save_show(rating_sales_correlation, "6_ratingsalescorrelation", "product_id")

25/05/25 19:18:23 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:18:23 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:18:23 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [a8352aa6-c130-4ab2-a8cb-41bde128a056] (11 queries & 0 savepoints) is committed.
25/05/25 19:18:23 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [6d4c42e0-02e7-4a24-bca4-aec19f446873] (0 queries & 0 savepoints) is committed.


+----------+--------------------+-------------------+--------------------+-----------+
|product_id|      product_rating|total_quantity_sold|       total_revenue|order_count|
+----------+--------------------+-------------------+--------------------+-----------+
|        21|1.000000000000000000|                  8|143.1700000000000...|          1|
|        29|2.400000000000000000|                 10|484.6100000000000...|          1|
|        32|3.700000000000000000|                  5|436.6700000000000...|          1|
|        56|5.000000000000000000|                  1|410.7700000000000...|          1|
|        60|3.600000000000000000|                  5|449.5200000000000...|          1|
|        75|2.100000000000000000|                  5|266.1500000000000...|          1|
|        90|2.400000000000000000|                  5|424.6400000000000...|          1|
|        95|1.500000000000000000|                  6|498.8400000000000...|          1|
|        98|2.900000000000000000|          

### Продукты с наибольшим количеством отзывов

In [44]:
most_reviewed_products = d_product \
    .select(
        "product_id",
        "product_reviews"
    )

In [45]:
save_show(most_reviewed_products, "6_productreviewcount", "product_id")

25/05/25 19:18:23 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:18:23 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction is not supported. You may change jdbcCompliant to false to throw SQLException instead.
25/05/25 19:18:23 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [4e761645-97f5-4ed6-8a55-84063ad5263c] (11 queries & 0 savepoints) is committed.
25/05/25 19:18:23 WARN ClickHouseConnectionImpl: [JDBC Compliant Mode] Transaction [deae9055-5e0f-423a-ad98-3a4e7702825b] (0 queries & 0 savepoints) is committed.


+----------+---------------+
|product_id|product_reviews|
+----------+---------------+
|         1|             19|
|         2|            721|
|         3|            616|
|         4|            532|
|         5|             36|
|         6|            182|
|         7|            499|
|         8|            969|
|         9|            445|
|        10|            824|
|        11|            124|
|        12|            209|
|        13|            125|
|        14|            712|
|        15|             77|
|        16|            514|
|        17|            306|
|        18|            174|
|        19|            204|
|        20|            400|
+----------+---------------+
only showing top 20 rows


# Закрытие сессии

In [46]:
spark.stop()