In [0]:
from pyspark.sql import SparkSession
spark=SparkSession.builder.getOrCreate()
spark 

Ingestion & Time Fields

In [0]:
# Load into PySpark with inferred schema
from pyspark.sql.functions import to_date, datediff, coalesce, col
df=spark.read.option("header", True).option("inferSchema", True) \
    .csv("file:/Workspace/Shared/course_enrollment.csv")

In [0]:
# Convert EnrollDate and CompletionDate to date type
df = df.withColumn("EnrollDate", to_date("EnrollDate", "yyyy-MM-dd")) \
       .withColumn("CompletionDate", to_date("CompletionDate", "yyyy-MM-dd"))

In [0]:
# Add DaysToComplete column if completed
df=df.withColumn("DaysToComplete",
           datediff("CompletionDate", "EnrollDate"))
df.show()

+--------+------+--------+-----------------+------------+----------+--------------+---------------+----+--------------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|   R|DaysToComplete|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+----+--------------+
|    E001|  U001|    C001|    Python Basics| Programming|2024-04-01|    2024-04-10|            100|   4|             9|
|    E002|  U002|    C002|Excel for Finance|Productivity|2024-04-02|          NULL|             45|NULL|          NULL|
|    E003|  U001|    C003|  ML with PySpark|Data Science|2024-04-03|          NULL|             30|NULL|          NULL|
|    E004|  U003|    C001|    Python Basics| Programming|2024-04-04|    2024-04-20|            100|   5|            16|
|    E005|  U004|    C004|Digital Marketing|   Marketing|2024-04-05|    2024-04-16|            100|   4|            11|
+--------+------+--------+--------------

User Learning Path Progress

In [0]:
# Group by UserID : count of courses enrolled
#Avg progress % across all enrollments
# Flag IsCompleted = ProgressPercent = 100
from pyspark.sql.functions import *
progress = df.groupBy("UserID").agg(
      count("*").alias("CoursesEnrolled"),
      avg("ProgressPercent").alias("AvgProgress"),
      sum(when(col("ProgressPercent") == 100, 1).otherwise(0)).alias("CompletedCourses"))
progress.show()

+------+---------------+-----------+----------------+
|UserID|CoursesEnrolled|AvgProgress|CompletedCourses|
+------+---------------+-----------+----------------+
|  U004|              1|      100.0|               1|
|  U002|              1|       45.0|               0|
|  U003|              1|      100.0|               1|
|  U001|              2|       65.0|               1|
+------+---------------+-----------+----------------+



Engagement Scoring

In [0]:
#Create a score: ProgressPercent * Rating (if not null)
# Replace null Rating with 0 before computing
from pyspark.sql.functions import *
df = df.withColumn("Rating", coalesce(col("Rating"), lit(0)))
print("EngagementScore:")
df = df.withColumn("EngagementScore", col("ProgressPercent") * col("Rating"))
df.show()

EngagementScore:
+--------+------+--------+-----------------+------------+----------+--------------+---------------+----+--------------+------+---------------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|   R|DaysToComplete|Rating|EngagementScore|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+----+--------------+------+---------------+
|    E001|  U001|    C001|    Python Basics| Programming|2024-04-01|    2024-04-10|            100|   4|             9|     4|            400|
|    E002|  U002|    C002|Excel for Finance|Productivity|2024-04-02|          NULL|             45|NULL|          NULL|     0|              0|
|    E003|  U001|    C003|  ML with PySpark|Data Science|2024-04-03|          NULL|             30|NULL|          NULL|     0|              0|
|    E004|  U003|    C001|    Python Basics| Programming|2024-04-04|    2024-04-20|            100|   5|            16|     5

Identify Drop-offs

In [0]:
#Filter all records with ProgressPercent < 50 and CompletionDate is null
# Create a view called Dropouts
df.createOrReplaceTempView("enrollments")
spark.sql("""
  CREATE OR REPLACE TEMP VIEW Dropouts AS
  SELECT *
  FROM enrollments
  WHERE ProgressPercent < 50
    AND CompletionDate IS NULL
""")

DataFrame[]

Joins with Metadata

In [0]:
# Join to find average progress per instructor
catalog = spark.read.option("header", True).option("inferSchema", True).csv("file:/Workspace/Shared/course_catalog.csv")
joined = df.join(catalog, "CourseID", "left")
joined.groupBy("Instructor").agg(avg("ProgressPercent").alias("AvgProgress")).orderBy(col("AvgProgress").desc()).show()


+-------------+-----------+
|   Instructor|AvgProgress|
+-------------+-----------+
|Abdullah Khan|      100.0|
|  Zoya Sheikh|      100.0|
|   Sana Gupta|       45.0|
| Ibrahim Khan|       30.0|
+-------------+-----------+



In [0]:
# Show who teaches the most enrolled course
joined.groupBy("CourseID","CourseName") .agg(count("*").alias("EnrollCount")).orderBy(col("EnrollCount").desc()).limit(1).show()

+--------+-------------+-----------+
|CourseID|   CourseName|EnrollCount|
+--------+-------------+-----------+
|    C001|Python Basics|          2|
+--------+-------------+-----------+



Delta Lake Practice

In [0]:
# Save as Delta Table enrollments_delta
joined.write.format("delta").mode("overwrite").save("file:/Workspace/Shared/enrollments_delta")

In [0]:
# Apply:
# Update: Set all ratings to 5 where Course = 'Python Basics'
# Delete: All rows where ProgressPercent = 0
from delta.tables import DeltaTable
dt = DeltaTable.forPath(spark, "file:/Workspace/Shared/enrollments_delta")
dt.update(condition="CourseName = 'Python Basics'", set={"Rating":"5"})
dt.delete(condition="ProgressPercent = 0")

In [0]:
# Show DESCRIBE HISTORY
spark.sql("DESCRIBE HISTORY delta.`file:/Workspace/Shared/enrollments_delta`").show()

+-------+--------------------+----------------+--------------------+---------+--------------------+----+------------------+--------------------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|version|           timestamp|          userId|            userName|operation| operationParameters| job|          notebook|           clusterId|readVersion|   isolationLevel|isBlindAppend|    operationMetrics|userMetadata|          engineInfo|
+-------+--------------------+----------------+--------------------+---------+--------------------+----+------------------+--------------------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|      2|2025-06-19 09:11:...|4042796083082360|azuser3548_mml.lo...|   DELETE|{predicate -> ["(...|NULL|{4419187724732099}|0612-043650-nhuexwr6|          1|WriteSerializable|        false|{numRemovedFiles ...|        NULL|Databricks-Runtim...|
|      1|2025-06-19 09:1

Window Functions

In [0]:
# Use dense_rank() to rank courses by number of enrollments
from pyspark.sql.window import Window
from pyspark.sql.functions import dense_rank, lead
w1 = Window.orderBy(col("EnrollCount").desc())
course_rank = joined.groupBy("CourseID","CourseName") .agg(count("*").alias("EnrollCount")).withColumn("Rank", dense_rank().over(w1))
course_rank.show()

+--------+-----------------+-----------+----+
|CourseID|       CourseName|EnrollCount|Rank|
+--------+-----------------+-----------+----+
|    C001|    Python Basics|          2|   1|
|    C004|Digital Marketing|          1|   2|
|    C002|Excel for Finance|          1|   2|
|    C003|  ML with PySpark|          1|   2|
+--------+-----------------+-----------+----+



In [0]:
# lead() to find next course by each user (sorted by EnrollDate)
w2 = Window.partitionBy("UserID").orderBy("EnrollDate")
joined = joined.withColumn("NextCourse", lead("CourseName").over(w2))
joined.show()

+--------+--------+------+-----------------+------------+----------+--------------+---------------+----+--------------+------+---------------+-------------+-------------+------------+---------------+
|CourseID|EnrollID|UserID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|   R|DaysToComplete|Rating|EngagementScore|   Instructor|DurationHours|       Level|     NextCourse|
+--------+--------+------+-----------------+------------+----------+--------------+---------------+----+--------------+------+---------------+-------------+-------------+------------+---------------+
|    C001|    E001|  U001|    Python Basics| Programming|2024-04-01|    2024-04-10|            100|   4|             9|     4|            400|Abdullah Khan|            8|    Beginner|ML with PySpark|
|    C003|    E003|  U001|  ML with PySpark|Data Science|2024-04-03|          NULL|             30|NULL|          NULL|     0|              0| Ibrahim Khan|           10|Intermediate|           NULL|


SQL Logic for Dashboard Views

In [0]:
# Create views:
# daily_enrollments
spark.sql("""CREATE OR REPLACE TEMP VIEW daily_enrollments AS
  SELECT EnrollDate, COUNT(*) AS EnrollCount
  FROM enrollments
  GROUP BY EnrollDate""")

DataFrame[]

In [0]:
# category_performance (avg rating by category)
spark.sql("""CREATE OR REPLACE TEMP VIEW category_performance AS
  SELECT Category, AVG(Rating) AS AvgRating
  FROM enrollments
  GROUP BY Category""")

DataFrame[]

In [0]:
# top_3_courses
spark.sql("""CREATE OR REPLACE TEMP VIEW top_3_courses AS
  SELECT CourseID, CourseName, COUNT(*) AS EnrollCount
  FROM enrollments
  GROUP BY CourseID, CourseName
  ORDER BY EnrollCount DESC
  LIMIT 3""")

DataFrame[]

Time Travel

In [0]:
# View previous version before update/delete
spark.read.format("delta").option("versionAsOf", 0).load("file:/Workspace/Shared/enrollments_delta").show()

+--------+--------+------+-----------------+------------+----------+--------------+---------------+----+--------------+------+---------------+-------------+-------------+------------+
|CourseID|EnrollID|UserID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|   R|DaysToComplete|Rating|EngagementScore|   Instructor|DurationHours|       Level|
+--------+--------+------+-----------------+------------+----------+--------------+---------------+----+--------------+------+---------------+-------------+-------------+------------+
|    C001|    E001|  U001|    Python Basics| Programming|2024-04-01|    2024-04-10|            100|   4|             9|     4|            400|Abdullah Khan|            8|    Beginner|
|    C002|    E002|  U002|Excel for Finance|Productivity|2024-04-02|          NULL|             45|NULL|          NULL|     0|              0|   Sana Gupta|            5|    Beginner|
|    C003|    E003|  U001|  ML with PySpark|Data Science|2024-04-03|          NU

Export Reporting

In [0]:
# Write to JSON, partitioned by Category
df.write.mode("overwrite").partitionBy("Category").json("file:/Workspace/Shared/enrollments_json")

In [0]:
# Create summary DataFrame:
# CourseName, TotalEnrollments, AvgRating, AvgProgress
summary_df = joined.groupBy("Category","CourseName") .agg(count("*").alias("TotalEnrollments"),avg("Rating").alias("AvgRating"),avg("ProgressPercent").alias("AvgProgress"))

In [0]:
# Save as Parquet
summary_df.write.mode("overwrite").parquet("file:/Workspace/Shared/enrollments_summary")