# Real-World Simulation: Subscription-Based SaaS Platform

Loading the dataset

In [0]:
subscription_df = spark.read.format("csv") \
                            .option("header", "true") \
                            .option("inferSchema", "true") \
                            .load("/Volumes/workspace/default/saas_exercise/subscriptions.csv")

subscription_df.show()

+--------------+------+--------+----------+----------+--------+--------+----------+
|SubscriptionID|UserID|PlanType| StartDate|   EndDate|PriceUSD|IsActive|AutoRenew |
+--------------+------+--------+----------+----------+--------+--------+----------+
|        SUB001|  U001|   Basic|2024-01-01|2024-04-01|    30.0|    true|     true |
|        SUB002|  U002|     Pro|2024-02-15|2024-05-15|    90.0|    true|    false |
|        SUB003|  U003|     Pro|2024-03-10|2024-06-10|    90.0|   false|    false |
|        SUB004|  U001| Premium|2024-04-05|2024-07-05|   120.0|    true|     true |
|        SUB005|  U004|   Basic|2024-01-20|2024-04-20|    30.0|   false|     false|
+--------------+------+--------+----------+----------+--------+--------+----------+



In [0]:
user_activity_df = spark.read.format("csv") \
                            .option("header", "true") \
                            .option("inferSchema", "true") \
                            .load("/Volumes/workspace/default/saas_exercise/user_activity.csv")
user_activity_df.show()

+------+-------------------+---------+------------+
|UserID|          EventTime|EventType|FeatureUsed |
+------+-------------------+---------+------------+
|  U001|2024-04-07 10:22:00|    login|  Dashboard |
|  U002|2024-04-08 11:10:00|   upload|    Reports |
|  U003|2024-04-09 09:45:00| download|  Analytics |
|  U001|2024-04-10 16:00:00|   logout|  Dashboard |
|  U004|2024-04-11 12:00:00|    login|   Dashboard|
+------+-------------------+---------+------------+



#### A. Subscription Engagement Score (Real Metric Modeling)

Combine both datasets.

In [0]:
joined_df = user_activity_df.join(subscription_df, on=user_activity_df.UserID == subscription_df.UserID, how='inner') \
                            .drop(subscription_df.UserID)

joined_df.show()

+------+-------------------+---------+------------+--------------+--------+----------+----------+--------+--------+----------+
|UserID|          EventTime|EventType|FeatureUsed |SubscriptionID|PlanType| StartDate|   EndDate|PriceUSD|IsActive|AutoRenew |
+------+-------------------+---------+------------+--------------+--------+----------+----------+--------+--------+----------+
|  U001|2024-04-10 16:00:00|   logout|  Dashboard |        SUB001|   Basic|2024-01-01|2024-04-01|    30.0|    true|     true |
|  U002|2024-04-08 11:10:00|   upload|    Reports |        SUB002|     Pro|2024-02-15|2024-05-15|    90.0|    true|    false |
|  U003|2024-04-09 09:45:00| download|  Analytics |        SUB003|     Pro|2024-03-10|2024-06-10|    90.0|   false|    false |
|  U001|2024-04-10 16:00:00|   logout|  Dashboard |        SUB004| Premium|2024-04-05|2024-07-05|   120.0|    true|     true |
|  U004|2024-04-11 12:00:00|    login|   Dashboard|        SUB005|   Basic|2024-01-20|2024-04-20|    30.0|   fa

Trimming spaces in the column

In [0]:
for col_name in joined_df.columns:
    joined_df = joined_df.withColumnRenamed(col_name, col_name.strip())

 Calculate: \
 active_days = EndDate - StartDate \
 events_per_user = count(EventType) grouped by UserID

In [0]:

# Calculation for active_days

from pyspark.sql.functions import datediff
joined_df = joined_df.withColumn('activeDays', datediff('EndDate', 'StartDate'))

joined_df.select(
                    'UserID',
                    'SubscriptionID',
                    'PlanType',
                    'StartDate',
                    'EndDate',  
                    'activeDays',
                    'PriceUSD',
                    'IsActive'
                    
).show()


+------+--------------+--------+----------+----------+----------+--------+--------+
|UserID|SubscriptionID|PlanType| StartDate|   EndDate|activeDays|PriceUSD|IsActive|
+------+--------------+--------+----------+----------+----------+--------+--------+
|  U001|        SUB001|   Basic|2024-01-01|2024-04-01|        91|    30.0|    true|
|  U002|        SUB002|     Pro|2024-02-15|2024-05-15|        90|    90.0|    true|
|  U003|        SUB003|     Pro|2024-03-10|2024-06-10|        92|    90.0|   false|
|  U001|        SUB004| Premium|2024-04-05|2024-07-05|        91|   120.0|    true|
|  U004|        SUB005|   Basic|2024-01-20|2024-04-20|        91|    30.0|   false|
|  U001|        SUB001|   Basic|2024-01-01|2024-04-01|        91|    30.0|    true|
|  U001|        SUB004| Premium|2024-04-05|2024-07-05|        91|   120.0|    true|
+------+--------------+--------+----------+----------+----------+--------+--------+



In [0]:
from pyspark.sql.functions import count
#  Calculating events_per_user = count(EventType) grouped by UserID

eventsPerUser = joined_df.groupby('UserID') \
         .agg(count('EventType').alias('eventsPerUser'))
         
eventsPerUser.show()

+------+-------------+
|UserID|eventsPerUser|
+------+-------------+
|  U002|            1|
|  U001|            4|
|  U004|            1|
|  U003|            1|
+------+-------------+



Create a score: \
engagement_score = (events_per_user / active_days) * PriceUS

In [0]:
from pyspark.sql.functions import round
engagement_score_df = joined_df.join(eventsPerUser, on="UserID", how="inner") \
                               .select(
                                        'UserID', 
                                        'eventsPerUser', 
                                        'activeDays', 
                                        'PriceUSD'
                                      )
                               

engagement_score = engagement_score_df.withColumn('engagementScore',
                                                    round(((engagement_score_df.eventsPerUser / engagement_score_df.activeDays) * engagement_score_df.PriceUSD), 2)) \
                                       .select(
                                                'UserID', 
                                                'engagementScore'
                                               )
                                       


joined_df = joined_df.join(engagement_score, on="UserID", how="inner")
joined_df.select(
                    'FeatureUsed',
                    'PlanType',
                    'engagementScore'    
).show()

+-----------+--------+---------------+
|FeatureUsed|PlanType|engagementScore|
+-----------+--------+---------------+
|   Reports |     Pro|            1.0|
| Dashboard | Premium|           5.27|
|  Dashboard|   Basic|           0.33|
| Analytics |     Pro|           0.98|
| Dashboard | Premium|           1.32|
| Dashboard | Premium|           5.27|
| Dashboard | Premium|           1.32|
| Dashboard |   Basic|           5.27|
| Dashboard |   Basic|           1.32|
| Dashboard |   Basic|           5.27|
| Dashboard |   Basic|           1.32|
| Dashboard | Premium|           5.27|
| Dashboard | Premium|           1.32|
| Dashboard | Premium|           5.27|
| Dashboard | Premium|           1.32|
| Dashboard |   Basic|           5.27|
| Dashboard |   Basic|           1.32|
| Dashboard |   Basic|           5.27|
| Dashboard |   Basic|           1.32|
+-----------+--------+---------------+



#### Anomaly Detection via SQL

 Identify users with: \
 Subscription inactive but recent activity

In [0]:
joined_df.createOrReplaceTempView("joinedDF")

In [0]:
spark.sql("""
            CREATE OR REPLACE TEMP VIEW inactive_subscription AS
            SELECT 
                UserID,
                SubscriptionID,
                MAX(EventTime) AS LastEventTime,
                IsActive
            FROM joinedDF
            WHERE IsActive = false
            GROUP BY UserID, SubscriptionID, IsActive
            HAVING MAX(EventTime) >= date_sub(current_date(), 30)
""")

spark.sql("SELECT * FROM inactive_subscription").show()

+------+--------------+-------------+--------+
|UserID|SubscriptionID|LastEventTime|IsActive|
+------+--------------+-------------+--------+
+------+--------------+-------------+--------+



Identify users with: \
AutoRenew is true but no events in 30 days


In [0]:
spark.sql("""
            CREATE OR REPLACE TEMP VIEW no_recent_events AS
            WITH last_events AS (
            SELECT
                UserID,
                SubscriptionID,
                AutoRenew,
                EventTime,
                MAX(EventTime) OVER (PARTITION BY UserID) AS LastEventTime
            FROM joinedDf
            )

            SELECT DISTINCT
                UserID,
                SubscriptionID,
                AutoRenew,
                LastEventTime,
                DATEDIFF(current_date(), LastEventTime) AS DaysSinceLastEvent
            FROM last_events
            WHERE AutoRenew = true
            AND (LastEventTime IS NULL OR LastEventTime < date_sub(current_date(), 30))
""")
spark.sql("SELECT * FROM no_recent_events").show()


+------+--------------+---------+-------------------+------------------+
|UserID|SubscriptionID|AutoRenew|      LastEventTime|DaysSinceLastEvent|
+------+--------------+---------+-------------------+------------------+
|  U001|        SUB004|    true |2024-04-10 16:00:00|               432|
|  U001|        SUB001|    true |2024-04-10 16:00:00|               432|
+------+--------------+---------+-------------------+------------------+



#### Delta Lake + Merge Simulation

Imagine a billing fix needs to be applied: \
For all Pro plans in March, increase price by $5 retroactively.

In [0]:
joined_df.write.format("delta") \
    .mode("overwrite") \
    .save('/Volumes/workspace/default/saas_exercise/subscriptions') 


In [0]:
spark.sql("""
            MERGE INTO DELTA.`/Volumes/workspace/default/saas_exercise/subscriptions` AS TARGET
            USING (
                  SELECT 
                    SubscriptionID
                  FROM delta.`/Volumes/workspace/default/saas_exercise/subscriptions`
                  WHERE PlanType = 'Pro' AND (month(StartDate) = 3 OR month(EndDate) = 3)
            ) AS src
            ON target.SubscriptionID = src.SubscriptionID
            WHEN MATCHED THEN
            UPDATE SET PriceUSD = PriceUSD + 5
""").show()


+-----------------+----------------+----------------+-----------------+
|num_affected_rows|num_updated_rows|num_deleted_rows|num_inserted_rows|
+-----------------+----------------+----------------+-----------------+
|                1|               1|               0|                0|
+-----------------+----------------+----------------+-----------------+



####  Time Travel Debugging

Show describe history of the table before and after the billing fix.

In [0]:

spark.sql("""
            DESCRIBE HISTORY DELTA. `/Volumes/workspace/default/saas_exercise/subscriptions`
        """).display()


version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
5,2025-06-16T11:29:04.000Z,5039529456276268,ashwinharishp@gmail.com,MERGE,"Map(predicate -> [""(SubscriptionID#38761 = SubscriptionID#38787)""], clusterBy -> [], matchedPredicates -> [{""actionType"":""update""}], statsOnLoad -> false, notMatchedBySourcePredicates -> [], notMatchedPredicates -> [])",,,0616-095236-6zs8oj7f-v2n,4.0,WriteSerializable,False,"Map(numTargetRowsCopied -> 0, numTargetRowsDeleted -> 0, numTargetFilesAdded -> 1, numTargetBytesAdded -> 3096, numTargetBytesRemoved -> 0, numTargetDeletionVectorsAdded -> 1, numTargetRowsMatchedUpdated -> 1, executionTimeMs -> 1395, materializeSourceTimeMs -> 6, numTargetRowsInserted -> 0, numTargetRowsMatchedDeleted -> 0, numTargetDeletionVectorsUpdated -> 0, scanTimeMs -> 519, numTargetRowsUpdated -> 1, numOutputRows -> 1, numTargetDeletionVectorsRemoved -> 0, numTargetRowsNotMatchedBySourceUpdated -> 0, numTargetChangeFilesAdded -> 0, numSourceRows -> 1, numTargetFilesRemoved -> 0, numTargetRowsNotMatchedBySourceDeleted -> 0, rewriteTimeMs -> 805)",,Databricks-Runtime/16.4.x-photon-scala2.12
4,2025-06-16T11:29:02.000Z,5039529456276268,ashwinharishp@gmail.com,WRITE,"Map(mode -> Overwrite, statsOnLoad -> false, partitionBy -> [])",,,0616-095236-6zs8oj7f-v2n,3.0,WriteSerializable,False,"Map(numFiles -> 1, numRemovedFiles -> 1, numRemovedBytes -> 3845, numOutputRows -> 19, numOutputBytes -> 3845)",,Databricks-Runtime/16.4.x-photon-scala2.12
3,2025-06-16T11:14:48.000Z,5039529456276268,ashwinharishp@gmail.com,WRITE,"Map(mode -> Overwrite, statsOnLoad -> false, partitionBy -> [])",,,0616-095236-6zs8oj7f-v2n,2.0,WriteSerializable,False,"Map(numFiles -> 1, numRemovedFiles -> 2, numRemovedBytes -> 6941, numOutputRows -> 19, numOutputBytes -> 3845)",,Databricks-Runtime/16.4.x-photon-scala2.12
2,2025-06-16T11:13:16.000Z,5039529456276268,ashwinharishp@gmail.com,MERGE,"Map(predicate -> [""(SubscriptionID#34902 = SubscriptionID#34928)""], clusterBy -> [], matchedPredicates -> [{""actionType"":""update""}], statsOnLoad -> false, notMatchedBySourcePredicates -> [], notMatchedPredicates -> [])",,,0616-095236-6zs8oj7f-v2n,1.0,WriteSerializable,False,"Map(numTargetRowsCopied -> 0, numTargetRowsDeleted -> 0, numTargetFilesAdded -> 1, numTargetBytesAdded -> 3096, numTargetBytesRemoved -> 3096, numTargetDeletionVectorsAdded -> 0, numTargetRowsMatchedUpdated -> 1, executionTimeMs -> 1236, materializeSourceTimeMs -> 7, numTargetRowsInserted -> 0, numTargetRowsMatchedDeleted -> 0, numTargetDeletionVectorsUpdated -> 0, scanTimeMs -> 531, numTargetRowsUpdated -> 1, numOutputRows -> 1, numTargetDeletionVectorsRemoved -> 0, numTargetRowsNotMatchedBySourceUpdated -> 0, numTargetChangeFilesAdded -> 0, numSourceRows -> 1, numTargetFilesRemoved -> 1, numTargetRowsNotMatchedBySourceDeleted -> 0, rewriteTimeMs -> 637)",,Databricks-Runtime/16.4.x-photon-scala2.12
1,2025-06-16T11:10:12.000Z,5039529456276268,ashwinharishp@gmail.com,MERGE,"Map(predicate -> [""(SubscriptionID#34052 = SubscriptionID#34065)""], clusterBy -> [], matchedPredicates -> [{""actionType"":""update""}], statsOnLoad -> false, notMatchedBySourcePredicates -> [], notMatchedPredicates -> [])",,,0616-095236-6zs8oj7f-v2n,0.0,WriteSerializable,False,"Map(numTargetRowsCopied -> 0, numTargetRowsDeleted -> 0, numTargetFilesAdded -> 1, numTargetBytesAdded -> 3096, numTargetBytesRemoved -> 0, numTargetDeletionVectorsAdded -> 1, numTargetRowsMatchedUpdated -> 1, executionTimeMs -> 1419, materializeSourceTimeMs -> 6, numTargetRowsInserted -> 0, numTargetRowsMatchedDeleted -> 0, numTargetDeletionVectorsUpdated -> 0, scanTimeMs -> 509, numTargetRowsUpdated -> 1, numOutputRows -> 1, numTargetDeletionVectorsRemoved -> 0, numTargetRowsNotMatchedBySourceUpdated -> 0, numTargetChangeFilesAdded -> 0, numSourceRows -> 1, numTargetFilesRemoved -> 0, numTargetRowsNotMatchedBySourceDeleted -> 0, rewriteTimeMs -> 840)",,Databricks-Runtime/16.4.x-photon-scala2.12
0,2025-06-16T11:10:07.000Z,5039529456276268,ashwinharishp@gmail.com,WRITE,"Map(mode -> Overwrite, statsOnLoad -> false, partitionBy -> [])",,,0616-095236-6zs8oj7f-v2n,,WriteSerializable,False,"Map(numFiles -> 1, numRemovedFiles -> 0, numRemovedBytes -> 0, numOutputRows -> 19, numOutputBytes -> 3845)",,Databricks-Runtime/16.4.x-photon-scala2.12


Query using VERSION AS OF to prove the issue existed.

In [0]:

prev_version = spark.read.format("delta") \
                         .option("versionAsOf", 0).load("/Volumes/workspace/default/saas_exercise/subscriptions")

prev_version.filter(col("PlanType") == "Pro").display()

UserID,EventTime,EventType,FeatureUsed,SubscriptionID,PlanType,StartDate,EndDate,PriceUSD,IsActive,AutoRenew,activeDays,engagementScore
U002,2024-04-08T11:10:00.000Z,upload,Reports,SUB002,Pro,2024-02-15,2024-05-15,90.0,True,False,90,1.0
U003,2024-04-09T09:45:00.000Z,download,Analytics,SUB003,Pro,2024-03-10,2024-06-10,90.0,False,False,92,0.98


#### Build Tier Migration Table

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col, lag

user_window = Window.partitionBy("UserID").orderBy("StartDate")


user_with_lags = joined_df.withColumn("prev_plan", lag("PlanType", 1).over(user_window)) \
                          .withColumn("prev_prev_plan", lag("PlanType", 2).over(user_window))

user_with_lags.filter((col("PlanType") == "Premium") &
                      (col("prev_plan") == "Pro") &
                      (col("prev_prev_plan") == "Basic")) \
                    .select(
                            "UserID", 
                            "PlanType", 
                            "StartDate", 
                            "prev_plan", 
                            "prev_prev_plan"
                            ).show()



+------+--------+---------+---------+--------------+
|UserID|PlanType|StartDate|prev_plan|prev_prev_plan|
+------+--------+---------+---------+--------------+
+------+--------+---------+---------+--------------+



#### Power Users Detection


Used ≥ 2 features

In [0]:
feature_count = joined_df.groupBy("UserID") \
        .agg(count("FeatureUsed").alias("FeatureCount"))

feature_count.show()

+------+------------+
|UserID|FeatureCount|
+------+------------+
|  U002|           1|
|  U001|          16|
|  U004|           1|
|  U003|           1|
+------+------------+



Logged in ≥ 3 times

In [0]:
login_count = joined_df.filter(col("EventType") == "login") \
                     .groupBy("UserID") \
                     .count().withColumnRenamed("count", "loginCount")
login_count.select(
                    'UserID',
                    'loginCount'
).show()

+------+----------+
|UserID|loginCount|
+------+----------+
|  U001|         8|
|  U004|         1|
+------+----------+



Create a separate Delta table 
power_users

In [0]:
power_users = feature_count.join(login_count, on="UserID", how="inner") \
                           .filter((col("FeatureCount") >= 2) & (col("LoginCount") >= 3))

power_users.write.format("delta") \
    .mode("overwrite") \
    .save("/Volumes/workspace/default/saas_exercise/power_users") 

#### Session Replay View

Build a user session trace table using: \
 Window.partitionBy("UserID").orderBy("EventTime") \
 Show how long each user spent between login and logout events


In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col, lag, unix_timestamp, when

login_logout_df = joined_df.filter(col("EventType").isin("login", "logout"))

window_df = Window.partitionBy("UserID").orderBy("EventTime")

sessions_df = login_logout_df.withColumn("PrevEvent", lag("EventType").over(window_df)) \
                             .withColumn("PrevTime", lag("EventTime").over(window_df)) \
                             .withColumn("SessionDuration",
                                        when(
                                            (col("PrevEvent") == "login") & (col("EventType") == "logout"),
                                            unix_timestamp("EventTime") - unix_timestamp("PrevTime")))

sessions_df = sessions_df.filter(col("SessionDuration").isNotNull()) \
    .select(
                "UserID", 
                "PrevTime", 
                "EventTime",
                "SessionDuration"
            )

sessions_df.show()


+------+-------------------+-------------------+---------------+
|UserID|           PrevTime|          EventTime|SessionDuration|
+------+-------------------+-------------------+---------------+
|  U001|2024-04-07 10:22:00|2024-04-10 16:00:00|         279480|
+------+-------------------+-------------------+---------------+

