In [1]:
# Install required packages
!pip install -q pyspark==3.5.1 delta-spark==3.1.0

# Set environment variables
import os
os.environ["PYSPARK_PYTHON"] = "/usr/bin/python3"

# Create SparkSession with Delta support
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip

builder = SparkSession.builder \
    .appName("OnlineCourseAnalytics") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

spark = configure_spark_with_delta_pip(builder).getOrCreate()


In [11]:
# Read both inputs as CSV with a header row, letting Spark infer column types.

# Enrollment records
enrollments_df = spark.read.csv(
    "/content/course_enrollments.csv",
    header=True,
    inferSchema=True,
)

# Course catalog
catalog_df = spark.read.csv(
    "/content/course_catalog.csv",
    header=True,
    inferSchema=True,
)

# Optional: preview the data
for preview_df in (enrollments_df, catalog_df):
    preview_df.show()


+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|Rating|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+
|    E001|  U001|    C001|    Python Basics| Programming|2024-04-01|    2024-04-10|            100|   4.0|
|    E002|  U002|    C002|Excel for Finance|Productivity|2024-04-02|          NULL|             45|  NULL|
|    E003|  U001|    C003|  ML with PySpark|Data Science|2024-04-03|          NULL|             30|  NULL|
|    E004|  U003|    C001|    Python Basics| Programming|2024-04-04|    2024-04-20|            100|   5.0|
|    E005|  U004|    C004|Digital Marketing|   Marketing|2024-04-05|    2024-04-16|            100|   4.0|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+

+--------+-------------+------------

In [12]:
# `col` is used below and in later cells; it must be imported here or the
# notebook fails with NameError on a fresh kernel.
from pyspark.sql.functions import avg, col, count, when

# Flag completed enrollments: True only when ProgressPercent is exactly 100.
# Because of .otherwise(False), every other row (including a NULL progress)
# gets False rather than NULL.
enrollments_df = enrollments_df.withColumn(
    "IsCompleted",
    when(col("ProgressPercent") == 100, True).otherwise(False),
)

# Per-user summary: number of enrollment rows and mean progress across them.
user_progress_df = enrollments_df.groupBy("UserID").agg(
    count("*").alias("CoursesEnrolled"),
    avg("ProgressPercent").alias("AvgProgressPercent"),
)

user_progress_df.show()


+------+---------------+------------------+
|UserID|CoursesEnrolled|AvgProgressPercent|
+------+---------------+------------------+
|  U004|              1|             100.0|
|  U002|              1|              45.0|
|  U003|              1|             100.0|
|  U001|              2|              65.0|
+------+---------------+------------------+



In [14]:
# A dropout is an enrollment with progress below 50 and no completion date.
low_progress = col("ProgressPercent") < 50
not_finished = col("CompletionDate").isNull()
dropouts_df = enrollments_df.where(low_progress & not_finished)

# Make the result queryable from Spark SQL under the name "Dropouts".
dropouts_df.createOrReplaceTempView("Dropouts")

# Sanity check: read the view back through SQL.
spark.sql("SELECT * FROM Dropouts").show()


+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+-----------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|Rating|IsCompleted|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+-----------+
|    E002|  U002|    C002|Excel for Finance|Productivity|2024-04-02|          NULL|             45|  NULL|      false|
|    E003|  U001|    C003|  ML with PySpark|Data Science|2024-04-03|          NULL|             30|  NULL|      false|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+-----------+



In [15]:
# Imported here so this cell does not depend on names leaked from other cells.
from pyspark.sql.functions import avg, col

# Join enrollments with catalog (inner join: enrollments whose CourseID has no
# catalog row are dropped).
joined_df = enrollments_df.join(catalog_df, on="CourseID", how="inner")

# Average progress per instructor
avg_progress = joined_df.groupBy("Instructor").agg(avg("ProgressPercent").alias("AvgProgress"))
avg_progress.show()

# Most enrolled course. CourseName is a secondary sort key so that ties on the
# count resolve the same way on every run.
most_enrolled = (
    enrollments_df.groupBy("CourseName")
    .count()
    .orderBy(col("count").desc(), col("CourseName").asc())
    .limit(1)
)

# first() yields None on an empty frame, unlike collect()[0], which raises
# IndexError.
top_row = most_enrolled.first()

if top_row is None:
    # No enrollment rows at all: keep the downstream names defined but empty.
    most_enrolled_course_name = None
    instructor = joined_df.select("Instructor").limit(0)
    print("No enrollments found; cannot determine the most enrolled course.")
else:
    most_enrolled_course_name = top_row["CourseName"]

    # Who teaches it?
    instructor = joined_df.filter(col("CourseName") == most_enrolled_course_name).select("Instructor").distinct()
    print(f"👨‍🏫 Instructor for most enrolled course ({most_enrolled_course_name}):")
    instructor.show()


+-------------+-----------+
|   Instructor|AvgProgress|
+-------------+-----------+
|  Zoya Sheikh|      100.0|
|   Sana Gupta|       45.0|
| Ibrahim Khan|       30.0|
|Abdullah Khan|      100.0|
+-------------+-----------+

👨‍🏫 Instructor for most enrolled course (Python Basics):
+-------------+
|   Instructor|
+-------------+
|Abdullah Khan|
+-------------+



In [16]:
from delta.tables import DeltaTable

# Persist the enrollments in Delta format, replacing whatever is at the path.
(
    enrollments_df.write
    .format("delta")
    .mode("overwrite")
    .save("/content/enrollments_delta")
)

# Handle on the table just written, used for the in-place update and delete.
delta_table = DeltaTable.forPath(spark, "/content/enrollments_delta")

# Set Rating to 5 on every "Python Basics" row.
is_python_basics = col("CourseName") == "Python Basics"
delta_table.update(condition=is_python_basics, set={"Rating": "5"})

# Remove enrollments that show no progress at all.
has_no_progress = col("ProgressPercent") == 0
delta_table.delete(has_no_progress)

# Re-register the path under a catalog name, then print its commit history.
for statement in (
    "DROP TABLE IF EXISTS enrollments_delta",
    "CREATE TABLE enrollments_delta USING DELTA LOCATION '/content/enrollments_delta'",
):
    spark.sql(statement)

spark.sql("DESCRIBE HISTORY enrollments_delta").show(truncate=False)


+-------+-----------------------+------+--------+---------+----------------------------------------------------+----+--------+---------+-----------+--------------+-------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+-----------------------------------+
|version|timestamp              |userId|userName|operation|operationParameters                                 |job |notebook|clusterId|readVersion|isolationLevel|isBlindAppend|operationMetrics                                                                                                                                                                                                                                                                     

In [17]:
from pyspark.sql.window import Window
from pyspark.sql.functions import dense_rank, lead

# Rank courses by enrollment count. dense_rank gives tied courses the same
# rank with no gaps after a tie. The window has no partition, so the ranking
# is computed over all rows together.
rank_window = Window.orderBy(col("count").desc())
ranked = (
    enrollments_df
    .groupBy("CourseName")
    .count()
    .withColumn("Rank", dense_rank().over(rank_window))
)
ranked.show()

# For each user, ordered by EnrollDate, attach the name of the course they
# enrolled in next (NULL on the user's last enrollment).
lead_window = Window.partitionBy("UserID").orderBy("EnrollDate")
next_course = lead("CourseName").over(lead_window)
next_course_df = enrollments_df.withColumn("NextCourse", next_course)
next_course_df.select("UserID", "CourseName", "NextCourse").show()


+-----------------+-----+----+
|       CourseName|count|Rank|
+-----------------+-----+----+
|    Python Basics|    2|   1|
|Digital Marketing|    1|   2|
|Excel for Finance|    1|   2|
|  ML with PySpark|    1|   2|
+-----------------+-----+----+

+------+-----------------+---------------+
|UserID|       CourseName|     NextCourse|
+------+-----------------+---------------+
|  U001|    Python Basics|ML with PySpark|
|  U001|  ML with PySpark|           NULL|
|  U002|Excel for Finance|           NULL|
|  U003|    Python Basics|           NULL|
|  U004|Digital Marketing|           NULL|
+------+-----------------+---------------+



In [18]:

# Expose the enrollments DataFrame to Spark SQL; the views below select from it.
enrollments_df.createOrReplaceTempView("Enrollments")

# One statement per reporting view:
#   daily_enrollments    - enrollment count per EnrollDate
#   category_performance - mean Rating per Category
#   top_3_courses        - the three courses with the most enrollment rows
view_statements = [
    """
    CREATE OR REPLACE TEMP VIEW daily_enrollments AS
    SELECT EnrollDate, COUNT(*) AS TotalEnrollments
    FROM Enrollments
    GROUP BY EnrollDate
""",
    """
    CREATE OR REPLACE TEMP VIEW category_performance AS
    SELECT Category, AVG(Rating) AS AvgRating
    FROM Enrollments
    GROUP BY Category
""",
    """
    CREATE OR REPLACE TEMP VIEW top_3_courses AS
    SELECT CourseName, COUNT(*) AS Enrollments
    FROM Enrollments
    GROUP BY CourseName
    ORDER BY Enrollments DESC
    LIMIT 3
""",
]

for statement in view_statements:
    spark.sql(statement)

# Show the top-3 view as a check that the definitions registered.
spark.sql("SELECT * FROM top_3_courses").show()


+-----------------+-----------+
|       CourseName|Enrollments|
+-----------------+-----------+
|    Python Basics|          2|
|Digital Marketing|          1|
|Excel for Finance|          1|
+-----------------+-----------+



In [19]:

# Imports are repeated here so this cell does not depend on names leaked from
# other cells. `datetime`/`timedelta` are no longer used by this cell; the
# import is retained in case later cells rely on those names.
from datetime import datetime, timedelta
from delta.tables import DeltaTable
from pyspark.sql.functions import avg, count

# Time travel by version: the table exactly as first written (version 0).
spark.read.format("delta").option("versionAsOf", 0).load("/content/enrollments_delta").show()

# Time travel by timestamp. The timestamp is taken from the table's own commit
# history rather than from "now minus one minute": when the notebook is run
# top to bottom the table is only seconds old, so a wall-clock guess falls
# before the first commit and the read fails. Casting inside Spark keeps the
# sub-second part of the commit time and the session time zone, so the string
# cannot be rounded to a moment before the commit.
latest_commit = (
    DeltaTable.forPath(spark, "/content/enrollments_delta")
    .history(1)  # most recent commit only
    .selectExpr("CAST(timestamp AS STRING) AS commit_ts")
    .first()
)
timestamp = latest_commit["commit_ts"]

spark.read.format("delta").option("timestampAsOf", timestamp).load("/content/enrollments_delta").show()

# Per-course summary: enrollment count, mean rating, mean progress.
summary_df = enrollments_df.groupBy("CourseName").agg(
    count("*").alias("TotalEnrollments"),
    avg("Rating").alias("AvgRating"),
    avg("ProgressPercent").alias("AvgProgress"),
)
summary_df.show()

# Export: enrollments as JSON partitioned by Category, summary as Parquet.
# Both writes replace any existing output at their paths.
enrollments_df.write.partitionBy("Category").mode("overwrite").json("/content/enrollment_json")
summary_df.write.mode("overwrite").parquet("/content/summary_report.parquet")


+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+-----------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|Rating|IsCompleted|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+-----------+
|    E001|  U001|    C001|    Python Basics| Programming|2024-04-01|    2024-04-10|            100|   4.0|       true|
|    E002|  U002|    C002|Excel for Finance|Productivity|2024-04-02|          NULL|             45|  NULL|      false|
|    E003|  U001|    C003|  ML with PySpark|Data Science|2024-04-03|          NULL|             30|  NULL|      false|
|    E004|  U003|    C001|    Python Basics| Programming|2024-04-04|    2024-04-20|            100|   5.0|       true|
|    E005|  U004|    C004|Digital Marketing|   Marketing|2024-04-05|    2024-04-16|            100|   4.0|       true|
+--------+------+--------+-----------------+----