In [0]:
# 1
# Ingestion & Time Fields
# Load into PySpark with inferred schema
# Convert EnrollDate and CompletionDate to date type
# Add DaysToComplete column if completed
df = spark.read.csv("file:/Workspace/Shared/course_enrollments19.csv",header=True,inferSchema=True)

df = df.withColumn("EnrollDate", to_date("EnrollDate")).withColumn("CompletionDate", to_date("CompletionDate")).withColumn("DaysToComplete", when(col("CompletionDate").isNotNull(),datediff(col("CompletionDate"), col("EnrollDate"))))

df.show()


+--------+------+--------+-----------------+------------+----------+--------------+---------------+-------+--------------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|Ratings|DaysToComplete|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+-------+--------------+
|    E001|  U001|    C001|    Python Basics| Programming|2024-04-01|    2024-04-10|            100|      4|             9|
|    E002|  U002|    C002|Excel for Finance|Productivity|2024-04-02|          NULL|             45|   NULL|          NULL|
|    E003|  U001|    C003|  ML with PySpark|Data Science|2024-04-03|          NULL|             30|   NULL|          NULL|
|    E004|  U003|    C001|    Python Basics| Programming|2024-04-04|    2024-04-20|            100|      5|            16|
|    E005|  U004|    C004|Digital Marketing|   Marketing|2024-04-05|    2024-04-16|            100|      4|            11|
+--------+------

In [0]:

# 2
# User Learning Path Progress
# Group by UserID : count of courses enrolled
# Avg progress % across all enrollments
# Flag IsCompleted = ProgressPercent = 100

user_progress = df.groupBy("UserID").agg(count("*").alias("CoursesEnrolled"),avg("ProgressPercent").alias("AvgProgress")).withColumn("IsCompleted", col("AvgProgress") == 100)

user_progress.show()


+------+---------------+-----------+-----------+
|UserID|CoursesEnrolled|AvgProgress|IsCompleted|
+------+---------------+-----------+-----------+
|  U004|              1|      100.0|       true|
|  U002|              1|       45.0|      false|
|  U003|              1|      100.0|       true|
|  U001|              2|       65.0|      false|
+------+---------------+-----------+-----------+



In [0]:

# 3
# Engagement Scoring
# Create a score: ProgressPercent * Rating (if not null)
# Replace null Rating with 0 before computing
from pyspark.sql.functions import *

df = df.withColumn("RatingFilled", coalesce(col("Ratings"), col("Ratings")*0)).withColumn("EngagementScore", col("ProgressPercent") * col("RatingFilled"))

df.select("EnrollID", "ProgressPercent", "RatingFilled", "EngagementScore").show()


+--------+---------------+------------+---------------+
|EnrollID|ProgressPercent|RatingFilled|EngagementScore|
+--------+---------------+------------+---------------+
|    E001|            100|           4|            400|
|    E002|             45|        NULL|           NULL|
|    E003|             30|        NULL|           NULL|
|    E004|            100|           5|            500|
|    E005|            100|           4|            400|
+--------+---------------+------------+---------------+



In [0]:

# 4
# Identify Drop-offs
# Filter all records with ProgressPercent < 50 and CompletionDate is null
# Create a view called Dropouts
df.createOrReplaceTempView("enrollments")

spark.sql("""
CREATE OR REPLACE TEMP VIEW Dropouts AS
SELECT *
FROM enrollments
WHERE ProgressPercent < 50
  AND CompletionDate IS NULL
""")

spark.sql("SELECT * FROM Dropouts").show()


+--------+------+--------+-----------------+------------+----------+--------------+---------------+-------+--------------+------------+---------------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|Ratings|DaysToComplete|RatingFilled|EngagementScore|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+-------+--------------+------------+---------------+
|    E002|  U002|    C002|Excel for Finance|Productivity|2024-04-02|          NULL|             45|   NULL|          NULL|        NULL|           NULL|
|    E003|  U001|    C003|  ML with PySpark|Data Science|2024-04-03|          NULL|             30|   NULL|          NULL|        NULL|           NULL|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+-------+--------------+------------+---------------+



In [0]:

# 5
# Joins with Metadata
# Create course_catalog.csv :
# CourseID,Instructor,DurationHours,Level
# C001,Abdullah Khan,8,Beginner
# C002,Sana Gupta,5,Beginner
# C003,Ibrahim Khan,10,Intermediate
# C004,Zoya Sheikh,6,Beginner
# Join to find average progress per instructor
# Show who teaches the most enrolled course
catalog = spark.read.csv("file:/Workspace/Shared/course_catalog19.csv",header=True)
catalog.createOrReplaceTempView("catalog")

spark.sql("""
SELECT c.Instructor,
       ROUND(AVG(e.ProgressPercent),1) AS AvgProgress,
       COUNT(*) AS EnrollCount
FROM enrollments e
JOIN catalog c ON e.CourseID = c.CourseID
GROUP BY c.Instructor
""").show()

spark.sql("""
SELECT c.CourseID, COUNT(*) AS TotalEnroll
FROM enrollments e
JOIN catalog c ON e.CourseID = c.CourseID
GROUP BY c.CourseID
ORDER BY TotalEnroll DESC
LIMIT 1
""").show()



+-------------+-----------+-----------+
|   Instructor|AvgProgress|EnrollCount|
+-------------+-----------+-----------+
|  Zoya Sheikh|      100.0|          1|
|   Sana Gupta|       45.0|          1|
| Ibrahim Khan|       30.0|          1|
|Abdullah Khan|      100.0|          2|
+-------------+-----------+-----------+

+--------+-----------+
|CourseID|TotalEnroll|
+--------+-----------+
|    C001|          2|
+--------+-----------+



In [0]:

# 6
# Delta Lake Practice
# Save as Delta Table enrollments_delta
# Apply:
# Update: Set all ratings to 5 where Course = 'Python Basics'
# Delete: All rows where ProgressPercent = 0
# Show DESCRIBE HISTORY
from delta.tables import DeltaTable

df.write.format("delta").mode("overwrite").save("file:/Workspace/Shared/enrollments")
delta = DeltaTable.forPath(spark, "file:/Workspace/Shared/enrollments")

delta.update(
    condition="CourseName = 'Python Basics'",
    set={"Ratings": "5"}
)

delta.delete("ProgressPercent = 0")

delta.history().show()

# 7
# Window Functions
# Use dense_rank() to rank courses by number of enrollments
# lead() to find next course by each user (sorted by EnrollDate)
from pyspark.sql.window import Window
from pyspark.sql.functions import *

window1 = Window.orderBy(col("EnrollCount").desc())
enrollment_counts = spark.sql("""
SELECT CourseName, COUNT(*) AS EnrollCount
FROM enrollments
GROUP BY CourseName
""")
enrollment_counts.withColumn("Rank", dense_rank().over(window1)).show()

window2 = Window.partitionBy("UserID").orderBy("EnrollDate")
df.withColumn("NextCourse", lead("CourseName").over(window2)).show()


+-------+--------------------+----------------+--------------------+---------+--------------------+----+------------------+--------------------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|version|           timestamp|          userId|            userName|operation| operationParameters| job|          notebook|           clusterId|readVersion|   isolationLevel|isBlindAppend|    operationMetrics|userMetadata|          engineInfo|
+-------+--------------------+----------------+--------------------+---------+--------------------+----+------------------+--------------------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|      9|2025-06-19 05:50:...|1679761755594499|azuser3546_mml.lo...| OPTIMIZE|{predicate -> [],...|NULL|{2977741827703190}|0612-091342-i15khidz|          7|SnapshotIsolation|        false|{numRemovedFiles ...|        NULL|Databricks-Runtim...|
|      8|2025-06-19 05:5

In [0]:
from pyspark.sql.functions import round
# 8
# SQL Logic for Dashboard Views
# Create views:
# daily_enrollments
# category_performance (avg rating by category)
# top_3_courses
spark.sql("""
CREATE OR REPLACE TEMP VIEW daily_enrollments AS
SELECT EnrollDate, COUNT(*) AS TotalEnrolled
FROM enrollments
GROUP BY EnrollDate
""")

spark.sql("""
CREATE OR REPLACE TEMP VIEW category_performance AS
SELECT Category, ROUND(AVG(Ratings),2) AS AvgRating
FROM enrollments
GROUP BY Category
""")

spark.sql("""
CREATE OR REPLACE TEMP VIEW top_3_courses AS
SELECT CourseName, COUNT(*) AS EnrollCount
FROM enrollments
GROUP BY CourseName
ORDER BY EnrollCount DESC
LIMIT 3
""")

# 9
# Time Travel
# View previous version before update/delete
# Use VERSION AS OF and TIMESTAMP AS OF
# Export Reporting
# Write to JSON, partitioned by Category
# Create summary DataFrame:
# CourseName, TotalEnrollments, AvgRating, AvgProgress
# Save as Parquet

spark.read.format("delta").option("versionAsOf", 0).load("file:/Workspace/Shared/enrollments").show()

final_df = spark.table("enrollments")

final_df.write.mode("overwrite").partitionBy("Category").parquet("file:/Workspace/Shared/enrollments_by_category")
summary_df = final_df.groupBy("CourseName").agg(
    count("*").alias("TotalEnrollments"),
    round(avg("RatingFilled"),2).alias("AvgRating"),
    round(avg("ProgressPercent"),2).alias("AvgProgress")
)
summary_df.write.mode("overwrite").parquet("file:/Workspace/Shared/course_summary_parquet")


+--------+------+--------+-----------------+------------+----------+--------------+---------------+-------+--------------+------------+---------------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|Ratings|DaysToComplete|RatingFilled|EngagementScore|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+-------+--------------+------------+---------------+
|    E001|  U001|    C001|    Python Basics| Programming|2024-04-01|    2024-04-10|            100|      4|             9|           4|            400|
|    E002|  U002|    C002|Excel for Finance|Productivity|2024-04-02|          NULL|             45|   NULL|          NULL|        NULL|           NULL|
|    E003|  U001|    C003|  ML with PySpark|Data Science|2024-04-03|          NULL|             30|   NULL|          NULL|        NULL|           NULL|
|    E004|  U003|    C001|    Python Basics| Programming|2024-04-04|    2024-04-20|     