In [3]:
from pyspark.sql.functions import *
import math
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .master("spark://spark-master:7077") \
    .appName("ETL to Star") \
    .getOrCreate()

# ===============================
# Настройки подключения к ClickHouse
# ===============================
ch_jdbc_url = "jdbc:clickhouse://clickhouse:8123/default"
ch_properties = {
    "user": "default",
    "password": "",
    "driver": "com.clickhouse.jdbc.ClickHouseDriver"
}

# Батчинг для больших данных
batch_size = 100000

# ===============================
# Функции для создания таблиц в ClickHouse
# ===============================

def execute_clickhouse_query(query):
    """Выполнение SQL запроса в ClickHouse"""
    try:
        spark.sql(query)
        print(f"✓ Запрос выполнен: {query[:50]}...")
    except Exception as e:
        print(f"✗ Ошибка выполнения запроса: {e}")

def create_clickhouse_tables():
    """Создание всех витрин в ClickHouse с оптимальной структурой"""
    
    queries = [
        # 1. Витрина продаж по продуктам
        """
        CREATE TABLE IF NOT EXISTS vitrina_product_sales (
            product_id UInt32,
            name String,
            category String,
            total_quantity UInt64,
            total_revenue Decimal(15,2),
            avg_rating Float32,
            review_count UInt32
        ) ENGINE = MergeTree()
        ORDER BY (category, product_id)
        """,
        
        # 2. Витрина продаж по клиентам
        """
        CREATE TABLE IF NOT EXISTS vitrina_customer_sales (
            customer_id UInt32,
            customer_name String,
            country String,
            total_spent Decimal(15,2),
            order_count UInt32,
            avg_check Decimal(15,2)
        ) ENGINE = MergeTree()
        ORDER BY (country, customer_id)
        """,
        
        # 3. Витрина продаж по времени
        """
        CREATE TABLE IF NOT EXISTS vitrina_time_sales (
            year UInt16,
            month UInt8,
            total_revenue Decimal(15,2),
            total_quantity UInt64,
            order_count UInt32,
            avg_check Decimal(15,2),
            avg_order_size Float32
        ) ENGINE = MergeTree()
        ORDER BY (year, month)
        """,
        
        # 4. Витрина продаж по магазинам
        """
        CREATE TABLE IF NOT EXISTS vitrina_store_sales (
            store_id UInt32,
            store_name String,
            city String,
            country String,
            total_revenue Decimal(15,2),
            order_count UInt32,
            avg_check Decimal(15,2)
        ) ENGINE = MergeTree()
        ORDER BY (country, store_id)
        """,
        
        # 5. Витрина продаж по поставщикам
        """
        CREATE TABLE IF NOT EXISTS vitrina_supplier_sales (
            supplier_id UInt32,
            supplier_name String,
            country String,
            total_revenue Decimal(15,2),
            avg_price Decimal(15,2)
        ) ENGINE = MergeTree()
        ORDER BY (country, supplier_id)
        """,
        
        # 6. Витрина качества продукции
        """
        CREATE TABLE IF NOT EXISTS vitrina_product_quality (
            product_id UInt32,
            name String,
            rating Float32,
            review_count UInt32,
            total_quantity UInt64,
            total_revenue Decimal(15,2)
        ) ENGINE = MergeTree()
        ORDER BY product_id
        """
    ]
    
    for query in queries:
        execute_clickhouse_query(query)

# Создаем таблицы
create_clickhouse_tables()

✗ Ошибка выполнения запроса: 
[PARSE_SYNTAX_ERROR] Syntax error at or near 'ENGINE'.(line 10, pos 10)

== SQL ==

        CREATE TABLE IF NOT EXISTS vitrina_product_sales (
            product_id UInt32,
            name String,
            category String,
            total_quantity UInt64,
            total_revenue Decimal(15,2),
            avg_rating Float32,
            review_count UInt32
        ) ENGINE = MergeTree()
----------^^^
        ORDER BY (category, product_id)
        

✗ Ошибка выполнения запроса: 
[PARSE_SYNTAX_ERROR] Syntax error at or near 'ENGINE'.(line 9, pos 10)

== SQL ==

        CREATE TABLE IF NOT EXISTS vitrina_customer_sales (
            customer_id UInt32,
            customer_name String,
            country String,
            total_spent Decimal(15,2),
            order_count UInt32,
            avg_check Decimal(15,2)
        ) ENGINE = MergeTree()
----------^^^
        ORDER BY (country, customer_id)
        

✗ Ошибка выполнения запроса: 
[PARSE_S

25/11/18 13:51:42 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
