### Create Spark Session

In [1]:
from pyspark.sql import SparkSession
from delta import *

# Build a local Spark session with Delta Lake enabled.
#   - master "local[*]" runs Spark inside this process.
#   - The two `spark.sql.*` settings register Delta's SQL extension and catalog.
builder = (
    SparkSession.builder
    .master("local[*]")
    .appName("Delta Table Example")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
    .config(
        "spark.sql.extensions",
        "io.delta.sql.DeltaSparkSessionExtension",
    )
)

# configure_spark_with_delta_pip wraps the builder so the Delta package is
# resolved as a dependency (see the Ivy resolution log this cell prints).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

:: loading settings :: url = jar:file:/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/anastasiiatrofymova/.ivy2/cache
The jars for the packages stored in: /Users/anastasiiatrofymova/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-91a8b7f6-08c0-423d-a4f2-7b9aab22dbc8;1.0
	confs: [default]
	found io.delta#delta-spark_2.12;3.2.0 in central
	found io.delta#delta-storage;3.2.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 176ms :: artifacts dl 7ms
	:: modules in use:
	io.delta#delta-spark_2.12;3.2.0 from central in [default]
	io.delta#delta-storage;3.2.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|    

### Create a Delta Table or Load a CSV

In [2]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType

In [3]:
# Five hand-written sales rows: (id, date, product, quantity, price)
data = [
    (1, "2024-09-01", "Product A", 100, 20.0),
    (2, "2024-09-01", "Product B", 200, 30.0),
    (3, "2024-09-01", "Product A", 150, 15.0),
    (4, "2024-09-02", "Product B", 300, 25.0),
    (5, "2024-09-02", "Product A", 200, 20.0),
]

# Explicit schema: every column is nullable; `date` is kept as a plain string.
schema = StructType(
    [
        StructField(column_name, column_type, True)
        for column_name, column_type in (
            ("id", IntegerType()),
            ("date", StringType()),
            ("product", StringType()),
            ("quantity", IntegerType()),
            ("price", FloatType()),
        )
    ]
)

# Materialise the rows as a Spark DataFrame using the schema above.
df = spark.createDataFrame(data, schema=schema)

In [4]:
# Persist the sample rows in Delta format. "overwrite" replaces whatever a
# previous run left at this path, so the cell can be re-run safely.
df.write.save("data/sales_delta_table", format="delta", mode="overwrite")

# Read the table back from disk and print it. Note that show() does not
# return the rows in insertion order (see the output below).
delta_df = spark.read.load("data/sales_delta_table", format="delta")
delta_df.show()

24/09/02 19:39:01 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
24/09/02 19:39:06 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                

+---+----------+---------+--------+-----+
| id|      date|  product|quantity|price|
+---+----------+---------+--------+-----+
|  2|2024-09-01|Product B|     200| 30.0|
|  5|2024-09-02|Product A|     200| 20.0|
|  1|2024-09-01|Product A|     100| 20.0|
|  3|2024-09-01|Product A|     150| 15.0|
|  4|2024-09-02|Product B|     300| 25.0|
+---+----------+---------+--------+-----+



### Data Transformation Pipeline

In [5]:
from pyspark.sql import functions as F

# 1. Total units sold for each product
total_quantity_df = (
    delta_df
    .groupBy("product")
    .agg(F.sum("quantity").alias("total_quantity"))
)
total_quantity_df.show()

# 2. Mean unit price for each product
avg_price_df = (
    delta_df
    .groupBy("product")
    .agg(F.avg("price").alias("avg_price"))
)
avg_price_df.show()

# 3. Total revenue for each product: first derive a per-row revenue
#    (quantity * price), then sum it within each product.
revenue_df = delta_df.withColumn(
    "revenue",
    F.col("quantity") * F.col("price"),
)
total_revenue_df = (
    revenue_df
    .groupBy("product")
    .agg(F.sum("revenue").alias("total_revenue"))
)
total_revenue_df.show()

# 4. Keep only products whose total quantity exceeds 200.
#    With the sample data both products pass this threshold.
filtered_df = total_quantity_df.where(F.col("total_quantity") > 200)
filtered_df.show()

+---------+--------------+
|  product|total_quantity|
+---------+--------------+
|Product B|           500|
|Product A|           450|
+---------+--------------+

+---------+------------------+
|  product|         avg_price|
+---------+------------------+
|Product B|              27.5|
|Product A|18.333333333333332|
+---------+------------------+

+---------+-------------+
|  product|total_revenue|
+---------+-------------+
|Product B|      13500.0|
|Product A|       8250.0|
+---------+-------------+

+---------+--------------+
|  product|total_quantity|
+---------+--------------+
|Product B|           500|
|Product A|           450|
+---------+--------------+



### Write the Transformed Data

In [9]:
# Persist the filtered per-product totals as a Delta table, replacing any
# contents left at this path by an earlier run.
(
    filtered_df.write
    .format("delta")
    .mode("overwrite")
    .save("data/filteres_sales_delta_table")
)


### Read the Final Output

In [10]:
# Read the persisted Delta table back from disk and display its rows
final_df = spark.read.load("data/filteres_sales_delta_table", format="delta")
final_df.show()

+---------+--------------+
|  product|total_quantity|
+---------+--------------+
|Product B|           500|
|Product A|           450|
+---------+--------------+

