In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [6]:
spark = SparkSession.builder \
        .appName("CassandraDataMartLoader") \
        .config("spark.sql.adaptive.enabled", "true") \
        .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
        .config("spark.cassandra.connection.host", "cassandra") \
        .config("spark.cassandra.connection.port", "9042") \
        .config("spark.cassandra.auth.username", "") \
        .config("spark.cassandra.auth.password", "") \
        .getOrCreate()

In [3]:
pg_url = "jdbc:postgresql://postgres:5432/bober_db"
pg_properties = {"user": "bober", "password": "bober", "driver": "org.postgresql.Driver"}
df = spark.read.jdbc(url=pg_url, table="mock_data", properties=pg_properties)

# Загружаем все таблицы звезды
fact = spark.read.jdbc(url=pg_url, table="fact_sales", properties=pg_properties)
dim_product = spark.read.jdbc(url=pg_url, table="dim_product", properties=pg_properties)
dim_customer = spark.read.jdbc(url=pg_url, table="dim_customer", properties=pg_properties)
dim_store = spark.read.jdbc(url=pg_url, table="dim_store", properties=pg_properties)
dim_supplier = spark.read.jdbc(url=pg_url, table="dim_supplier", properties=pg_properties)
dim_date = spark.read.jdbc(url=pg_url, table="dim_date", properties=pg_properties)
dim_date.head(1)

[Row(full_date=datetime.date(2021, 1, 1), date_id=1, year=2021, month=1, day=1, quarter=1)]

In [7]:
def create_cassandra_keyspace(spark):
    """Создание ключевого пространства в Cassandra"""
    try:
        # Правильный способ через CQL
        spark.sql("""
            CREATE KEYSPACE IF NOT EXISTS sales_dwh 
            WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1}
        """)
        print("✓ Keyspace 'sales_dwh' создан/проверен")
    except Exception as e:
        print(f"Ошибка при создании keyspace: {e}")

create_cassandra_keyspace(spark)

Ошибка при создании keyspace: 
[PARSE_SYNTAX_ERROR] Syntax error at or near 'KEYSPACE'.(line 2, pos 19)

== SQL ==

            CREATE KEYSPACE IF NOT EXISTS sales_dwh 
-------------------^^^
            WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1}
        



In [5]:
# Витрина 1: Продажи по продуктам
def create_product_sales_mart(spark, fact, dim_product):
    print("Создание витрины: Продажи по продуктам...")
    
    product_sales = fact.alias("f") \
        .join(dim_product.alias("p"), "product_id") \
        .groupBy(
            "p.product_id",
            "p.name",
            "p.category", 
            "p.rating",
            "p.reviews"
        ) \
        .agg(
            sum("f.sale_quantity").alias("total_quantity_sold"),
            sum("f.sale_total_price").alias("total_revenue"),
            avg("f.sale_total_price").alias("avg_sale_price"),
            count("f.sale_id").alias("number_of_sales")
        ) \
        .withColumn("rank", row_number().over(Window.orderBy(desc("total_quantity_sold")))) \
        .filter(col("rank") <= 10) \
        .drop("rank")
    
    # Запись в Cassandra
    product_sales.write \
        .format("org.apache.spark.sql.cassandra") \
        .mode("append") \
        .options(table="product_sales_mart", keyspace="sales_dwh") \
        .save()
    
    print("✓ Витрина 'product_sales_mart' создана")

create_product_sales_mart(spark, fact, dim_product)

Создание витрины: Продажи по продуктам...


Py4JJavaError: An error occurred while calling o109.save.
: com.datastax.spark.connector.util.ConfigCheck$ConnectorConfigurationException: Invalid Config Variables
Only known spark.cassandra.* variables are allowed when using the Spark Cassandra Connector.
spark.cassandra.connection.keyspace is not a valid Spark Cassandra Connector variable.
No likely matches found.
	at com.datastax.spark.connector.util.ConfigCheck$.checkConfig(ConfigCheck.scala:62)
	at com.datastax.spark.connector.cql.CassandraConnectorConf$.fromSparkConf(CassandraConnectorConf.scala:413)
	at com.datastax.spark.connector.cql.CassandraConnectorConf$.apply(CassandraConnectorConf.scala:358)
	at com.datastax.spark.connector.cql.CassandraConnector$.apply(CassandraConnector.scala:225)
	at org.apache.spark.sql.cassandra.DefaultSource.getTable(DefaultSource.scala:61)
	at org.apache.spark.sql.cassandra.DefaultSource.inferSchema(DefaultSource.scala:67)
	at org.apache.spark.sql.execution.datasources.v2.DataSourceV2Utils$.getTableFromProvider(DataSourceV2Utils.scala:90)
	at org.apache.spark.sql.DataFrameWriter.getTable$1(DataFrameWriter.scala:284)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:300)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:251)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:569)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:840)
