In [1]:
!pip install pyspark delta-spark notebook pandas numpy

Collecting delta-spark
  Downloading delta_spark-4.0.0-py3-none-any.whl.metadata (1.9 kB)
Collecting jedi>=0.16 (from ipython>=7.23.1->ipykernel->notebook)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading delta_spark-4.0.0-py3-none-any.whl (39 kB)
Downloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.6/1.6 MB 30.4 MB/s eta 0:00:00
Installing collected packages: jedi, delta-spark
Successfully installed delta-spark-4.0.0 jedi-0.19.2


In [2]:
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip

# Settings that register the Delta SQL extension and the Delta catalog on the
# session; they are applied to the builder in the order listed.
delta_settings = {
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

# Local session that uses every available core ("local[*]").
builder = SparkSession.builder.appName("DeltaFeatureStore").master("local[*]")
for setting_name, setting_value in delta_settings.items():
    builder = builder.config(setting_name, setting_value)

# Hand the builder to the delta-spark helper, then start (or reuse) the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

# Bare expression so the cell displays the session summary.
spark

In [3]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Seed NumPy's global RNG so the randomly drawn values are the same on every run.
np.random.seed(42)

# Ten synthetic users: consecutive ids, one signup per day starting 2024-01-01,
# and a randomly chosen plan type.
users = pd.DataFrame({
    "user_id": range(1, 11),
    "signup_date": pd.date_range("2024-01-01", periods=10),
    "user_type": np.random.choice(["free", "premium"], size=10)
})

# Capture the wall clock once so every transaction is offset from the same
# reference instant. Calling datetime.now() for each row made the timestamps
# drift by a few microseconds from one row to the next.
reference_time = datetime.now()

# 200 synthetic transactions spread over the last week (day offsets 0-6).
# The random draws inside each record happen in key order, top to bottom,
# which keeps the seeded sequence of values stable.
transactions = pd.DataFrame([
    {
        "transaction_id": np.random.randint(10000, 99999),
        "user_id": np.random.randint(1, 11),
        "product_id": np.random.randint(1, 6),
        "amount": round(np.random.uniform(10, 500), 2),
        "timestamp": reference_time - timedelta(days=np.random.randint(0, 7)),
    }
    for _ in range(200)
])

# Preview both frames as the cell output.
users.head(), transactions.head()


(   user_id signup_date user_type
 0        1  2024-01-01      free
 1        2  2024-01-02   premium
 2        3  2024-01-03      free
 3        4  2024-01-04      free
 4        5  2024-01-05      free,
    transaction_id  user_id  product_id  amount                  timestamp
 0           92386        7           3  235.03 2025-12-28 16:25:13.279194
 1           54131        8           3   20.09 2025-12-31 16:25:13.279251
 2           69735        6           2   99.09 2025-12-28 16:25:13.279286
 3           93104       10           5  221.65 2026-01-01 16:25:13.279318
 4           35658       10           4  487.14 2025-12-26 16:25:13.279351)

In [4]:
# Convert both pandas frames into Spark DataFrames (users first, then transactions).
users_df, transactions_df = (
    spark.createDataFrame(pandas_frame) for pandas_frame in (users, transactions)
)

# Print the first five rows of each Spark DataFrame, in the same order.
for spark_frame in (users_df, transactions_df):
    spark_frame.show(5)

+-------+-------------------+---------+
|user_id|        signup_date|user_type|
+-------+-------------------+---------+
|      1|2024-01-01 00:00:00|     free|
|      2|2024-01-02 00:00:00|  premium|
|      3|2024-01-03 00:00:00|     free|
|      4|2024-01-04 00:00:00|     free|
|      5|2024-01-05 00:00:00|     free|
+-------+-------------------+---------+
only showing top 5 rows
+--------------+-------+----------+------+--------------------+
|transaction_id|user_id|product_id|amount|           timestamp|
+--------------+-------+----------+------+--------------------+
|         92386|      7|         3|235.03|2025-12-28 16:25:...|
|         54131|      8|         3| 20.09|2025-12-31 16:25:...|
|         69735|      6|         2| 99.09|2025-12-28 16:25:...|
|         93104|     10|         5|221.65|2026-01-01 16:25:...|
|         35658|     10|         4|487.14|2025-12-26 16:25:...|
+--------------+-------+----------+------+--------------------+
only showing top 5 rows


In [5]:
# Persist the raw inputs in Delta format, one target path per DataFrame.
# "overwrite" mode replaces whatever an earlier run left at the same path.
raw_targets = {
    "delta/raw/users": users_df,
    "delta/raw/transactions": transactions_df,
}
for raw_path, raw_frame in raw_targets.items():
    raw_frame.write.format("delta").mode("overwrite").save(raw_path)

In [6]:
from pyspark.sql.functions import sum, count, datediff, current_date, col

# Column expressions for the per-user aggregates. `sum` and `count` here are
# the pyspark.sql.functions versions imported above, not the Python builtins.
spent_expr = sum("amount").alias("total_spent")
txn_count_expr = count("transaction_id").alias("transaction_count")

# One row per user_id with total spend and number of transactions.
user_agg = transactions_df.groupBy("user_id").agg(spent_expr, txn_count_expr)

# Days elapsed between signup_date and the current date at execution time.
signup_age_expr = datediff(current_date(), col("signup_date"))

# The left join keeps every user; fillna(0) then zero-fills any nulls left in
# numeric columns (e.g. aggregates for a user with no transactions).
user_features = (
    users_df.join(user_agg, on="user_id", how="left")
    .withColumn("days_since_signup", signup_age_expr)
    .fillna(0)
)

user_features.show()


+-------+-------------------+---------+------------------+-----------------+-----------------+
|user_id|        signup_date|user_type|       total_spent|transaction_count|days_since_signup|
+-------+-------------------+---------+------------------+-----------------+-----------------+
|      5|2024-01-05 00:00:00|     free|           5076.52|               21|              727|
|      1|2024-01-01 00:00:00|     free|5019.9400000000005|               24|              731|
|      3|2024-01-03 00:00:00|     free| 5654.050000000001|               23|              729|
|      2|2024-01-02 00:00:00|  premium|           4262.67|               17|              730|
|      4|2024-01-04 00:00:00|     free|           4544.67|               18|              728|
|      7|2024-01-07 00:00:00|     free|           5518.48|               16|              725|
|      6|2024-01-06 00:00:00|  premium|           4519.67|               18|              726|
|      9|2024-01-09 00:00:00|     free|           

In [7]:
# Save the feature table in Delta format, replacing any existing contents at the path.
features_writer = user_features.write.format("delta").mode("overwrite")
features_writer.save("delta/features/user_features")

In [9]:
# loyalty_score is ten points for every recorded transaction.
loyalty_expr = col("transaction_count") * 10
updated_user_features = user_features.withColumn("loyalty_score", loyalty_expr)

# Overwrite the stored feature table with the widened frame; the mergeSchema
# option is set because this write carries a column the earlier write did not.
(
    updated_user_features.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .save("delta/features/user_features")
)

In [10]:
# Time travel: load the feature table as it was at version 0 (the first write).
old_features = (
    spark.read
    .format("delta")
    .option("versionAsOf", 0)
    .load("delta/features/user_features")
)

old_features.show()

+-------+-------------------+---------+------------------+-----------------+-----------------+
|user_id|        signup_date|user_type|       total_spent|transaction_count|days_since_signup|
+-------+-------------------+---------+------------------+-----------------+-----------------+
|      5|2024-01-05 00:00:00|     free|           5076.52|               21|              727|
|      1|2024-01-01 00:00:00|     free|5019.9400000000005|               24|              731|
|      3|2024-01-03 00:00:00|     free| 5654.050000000001|               23|              729|
|      2|2024-01-02 00:00:00|  premium|           4262.67|               17|              730|
|      4|2024-01-04 00:00:00|     free|           4544.67|               18|              728|
|      7|2024-01-07 00:00:00|     free|           5518.48|               16|              725|
|      6|2024-01-06 00:00:00|  premium|           4519.67|               18|              726|
|      9|2024-01-09 00:00:00|     free|           

In [11]:
# Load the feature table without a version option (i.e. its latest state) and print it.
current_features = spark.read.format("delta").load("delta/features/user_features")
current_features.show()


+-------+-------------------+---------+------------------+-----------------+-----------------+-------------+
|user_id|        signup_date|user_type|       total_spent|transaction_count|days_since_signup|loyalty_score|
+-------+-------------------+---------+------------------+-----------------+-----------------+-------------+
|      5|2024-01-05 00:00:00|     free|           5076.52|               21|              727|          210|
|      1|2024-01-01 00:00:00|     free|5019.9400000000005|               24|              731|          240|
|      3|2024-01-03 00:00:00|     free| 5654.050000000001|               23|              729|          230|
|      2|2024-01-02 00:00:00|  premium|           4262.67|               17|              730|          170|
|      4|2024-01-04 00:00:00|     free|           4544.67|               18|              728|          180|
|      7|2024-01-07 00:00:00|     free|           5518.48|               16|              725|          160|
|      6|2024-01-06

In [12]:
from delta.tables import DeltaTable

# Open the feature table through the DeltaTable API and print its commit
# history with untruncated column values.
delta_table = DeltaTable.forPath(spark, "delta/features/user_features")
table_history = delta_table.history()
table_history.show(truncate=False)

+-------+-----------------------+------+--------+---------+--------------------------------------+----+--------+---------+-----------+--------------+-------------+-----------------------------------------------------------------------------------------------------------+------------+-----------------------------------+
|version|timestamp              |userId|userName|operation|operationParameters                   |job |notebook|clusterId|readVersion|isolationLevel|isBlindAppend|operationMetrics                                                                                           |userMetadata|engineInfo                         |
+-------+-----------------------+------+--------+---------+--------------------------------------+----+--------+---------+-----------+--------------+-------------+-----------------------------------------------------------------------------------------------------------+------------+-----------------------------------+
|1      |2026-01-01 16:27:29.303|NULL