# Online Course Learning Platform Analytics

Creating Pyspark Session

In [128]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("OnlineCourseLearning").getOrCreate()
spark

#### 1. Ingestion & Time Fields

Load into PySpark with inferred schema


In [129]:
from google.colab import drive

drive.mount('/content/drive')

course_df = spark.read.format('csv') \
                       .option('header', True) \
                       .option('inferSchema', True) \
                       .load('/content/drive/MyDrive/Assessment/course_enrollments.csv')

course_df.printSchema()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
root
 |-- EnrollID: string (nullable = true)
 |-- UserID: string (nullable = true)
 |-- CourseID: string (nullable = true)
 |-- CourseName: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- EnrollDate: date (nullable = true)
 |-- CompletionDate: date (nullable = true)
 |-- ProgressPercent: integer (nullable = true)
 |-- Rating: integer (nullable = true)



Removing Leading and tailing spaces in the column

In [130]:
course_df = course_df.toDF(*[col_name.strip() for col_name in course_df.columns])

convert EnrollDate and completionDate to date type

In [131]:
from pyspark.sql.functions import to_date
course_df = course_df.withColumn('EnrollDate', to_date('EnrollDate', 'yyyy-MM-dd')) \
                     .withColumn('CompletionDate', to_date('CompletionDate', 'yyyy-MM-dd'))

course_df.show()

+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|Rating|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+
|    E001|  U001|    C001|    Python Basics| Programming|2024-04-01|    2024-04-10|            100|     4|
|    E002|  U002|    C002|Excel for Finance|Productivity|2024-04-02|          NULL|             45|  NULL|
|    E003|  U001|    C003|  ML with PySpark|Data Science|2024-04-03|          NULL|             30|  NULL|
|    E004|  U003|    C001|    Python Basics| Programming|2024-04-04|    2024-04-20|            100|     5|
|    E005|  U004|    C004|Digital Marketing|   Marketing|2024-04-05|    2024-04-16|            100|     4|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+



Add DaysToComplete column if completed

In [132]:
from pyspark.sql.functions import when, datediff, col

course_df = course_df.withColumn('DaysToComplete',
                                 when(col('CompletionDate').isNotNull(), datediff(col('CompletionDate'), col('EnrollDate'))))

course_df.select(
                  'EnrollID',
                  'UserID',
                  'CourseName',
                  'EnrollDate',
                  'CompletionDate',
                  'DaysToComplete'
                  ).show()


+--------+------+-----------------+----------+--------------+--------------+
|EnrollID|UserID|       CourseName|EnrollDate|CompletionDate|DaysToComplete|
+--------+------+-----------------+----------+--------------+--------------+
|    E001|  U001|    Python Basics|2024-04-01|    2024-04-10|             9|
|    E002|  U002|Excel for Finance|2024-04-02|          NULL|          NULL|
|    E003|  U001|  ML with PySpark|2024-04-03|          NULL|          NULL|
|    E004|  U003|    Python Basics|2024-04-04|    2024-04-20|            16|
|    E005|  U004|Digital Marketing|2024-04-05|    2024-04-16|            11|
+--------+------+-----------------+----------+--------------+--------------+



#### 2. User Learning Path Progress

Group by UserID : count of courses enrolled

In [133]:
total_course_enrolled = course_df.groupBy('UserID') \
                                .count() \
                                .withColumnRenamed('count','TotalCourseEnrolled') \
                                .orderBy('TotalCourseEnrolled', ascending=False)

total_course_enrolled.show()

+------+-------------------+
|UserID|TotalCourseEnrolled|
+------+-------------------+
|  U001|                  2|
|  U004|                  1|
|  U002|                  1|
|  U003|                  1|
+------+-------------------+



Avg progress % across all enrollments

In [134]:
from pyspark.sql.functions import avg

avg_enrollments_df = course_df.select(avg("ProgressPercent").alias("AvgProgressPercent"))
avg_enrollments_df.show()


+------------------+
|AvgProgressPercent|
+------------------+
|              75.0|
+------------------+



Flag
IsCompleted = ProgressPercent = 100

In [135]:
course_df = course_df.withColumn('IsCompleted', col('ProgressPercent') == 100)

course_df.select(
                  'EnrollID',
                  'UserID',
                  'CourseID',
                  'CourseName',
                  'Category',
                  'ProgressPercent',
                  'IsCompleted'
                  ).show()


+--------+------+--------+-----------------+------------+---------------+-----------+
|EnrollID|UserID|CourseID|       CourseName|    Category|ProgressPercent|IsCompleted|
+--------+------+--------+-----------------+------------+---------------+-----------+
|    E001|  U001|    C001|    Python Basics| Programming|            100|       true|
|    E002|  U002|    C002|Excel for Finance|Productivity|             45|      false|
|    E003|  U001|    C003|  ML with PySpark|Data Science|             30|      false|
|    E004|  U003|    C001|    Python Basics| Programming|            100|       true|
|    E005|  U004|    C004|Digital Marketing|   Marketing|            100|       true|
+--------+------+--------+-----------------+------------+---------------+-----------+



####  3 Engagement Scoring


Replace null
Rating with 0 before computing

In [136]:
from pyspark.sql.functions import coalesce, lit
course_df = course_df.withColumn('Rating', coalesce(col('Rating'), lit(0)))

course_df.select(
                  'EnrollID',
                  'UserID',
                  'CourseID',
                  'CourseName',
                  'Category',
                  'Rating',
                  'IsCompleted'
).show()

+--------+------+--------+-----------------+------------+------+-----------+
|EnrollID|UserID|CourseID|       CourseName|    Category|Rating|IsCompleted|
+--------+------+--------+-----------------+------------+------+-----------+
|    E001|  U001|    C001|    Python Basics| Programming|     4|       true|
|    E002|  U002|    C002|Excel for Finance|Productivity|     0|      false|
|    E003|  U001|    C003|  ML with PySpark|Data Science|     0|      false|
|    E004|  U003|    C001|    Python Basics| Programming|     5|       true|
|    E005|  U004|    C004|Digital Marketing|   Marketing|     4|       true|
+--------+------+--------+-----------------+------------+------+-----------+



Create a score:  ProgressPercent * Rating (if not null)

In [137]:
course_df = course_df.withColumn('ProgressPercent',
                                  when(col('Rating') != 0 , col('ProgressPercent') * col('Rating'))
                                  .otherwise(col('ProgressPercent'))
)

course_df.select(
                  'EnrollID',
                  'UserID',
                  'CourseID',
                  'CourseName',
                  'Rating',
                  'ProgressPercent',
                  'IsCompleted'
).show()

+--------+------+--------+-----------------+------+---------------+-----------+
|EnrollID|UserID|CourseID|       CourseName|Rating|ProgressPercent|IsCompleted|
+--------+------+--------+-----------------+------+---------------+-----------+
|    E001|  U001|    C001|    Python Basics|     4|            400|       true|
|    E002|  U002|    C002|Excel for Finance|     0|             45|      false|
|    E003|  U001|    C003|  ML with PySpark|     0|             30|      false|
|    E004|  U003|    C001|    Python Basics|     5|            500|       true|
|    E005|  U004|    C004|Digital Marketing|     4|            400|       true|
+--------+------+--------+-----------------+------+---------------+-----------+



#### 4. Identify Drop-offs


Filter all records with progressPercent < 50 and CompletionDate is null

In [138]:
progress_filtered_df = course_df.filter((col('ProgressPercent') < 50) & (col('CompletionDate').isNull())) \
                                .select(
                                          'EnrollID',
                                          'UserID',
                                          'CourseID',
                                          'CourseName',
                                          'Category',
                                          'EnrollDate',
                                          'ProgressPercent'
                                       )

progress_filtered_df.show()

+--------+------+--------+-----------------+------------+----------+---------------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|ProgressPercent|
+--------+------+--------+-----------------+------------+----------+---------------+
|    E002|  U002|    C002|Excel for Finance|Productivity|2024-04-02|             45|
|    E003|  U001|    C003|  ML with PySpark|Data Science|2024-04-03|             30|
+--------+------+--------+-----------------+------------+----------+---------------+



Create a view called
Dropouts

In [139]:
progress_filtered_df.createOrReplaceTempView("Dropouts")

spark.sql("SELECT * FROM Dropouts").show()


+--------+------+--------+-----------------+------------+----------+---------------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|ProgressPercent|
+--------+------+--------+-----------------+------------+----------+---------------+
|    E002|  U002|    C002|Excel for Finance|Productivity|2024-04-02|             45|
|    E003|  U001|    C003|  ML with PySpark|Data Science|2024-04-03|             30|
+--------+------+--------+-----------------+------------+----------+---------------+



#### 5. Joins with Metadata

Create
course_catalog.csv

In [140]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

course_data = [
    ("C001", "Abdullah Khan", 8, "Beginner"),
    ("C002", "Sana Gupta", 5, "Beginner"),
    ("C003", "Ibrahim Khan", 10, "Intermediate"),
    ("C004", "Zoya Sheikh", 6, "Beginner")
]


course_schema = StructType([
    StructField("CourseID", StringType(), True),
    StructField("Instructor", StringType(), True),
    StructField("DurationHours", IntegerType(), True),
    StructField("Level", StringType(), True)
])

catalog_df = spark.createDataFrame(course_data, course_schema)
catalog_df.printSchema()

root
 |-- CourseID: string (nullable = true)
 |-- Instructor: string (nullable = true)
 |-- DurationHours: integer (nullable = true)
 |-- Level: string (nullable = true)



Join to find
average progress per instructor

In [141]:
from pyspark.sql.functions import avg

joined_df = course_df.join(catalog_df, on='CourseID', how='left')

average_progress = joined_df.groupBy('Instructor') \
                            .agg(avg('ProgressPercent').alias('AvgProgressPercent'))


average_progress.show()

+-------------+------------------+
|   Instructor|AvgProgressPercent|
+-------------+------------------+
|  Zoya Sheikh|             400.0|
|   Sana Gupta|              45.0|
| Ibrahim Khan|              30.0|
|Abdullah Khan|             450.0|
+-------------+------------------+



Show who teaches the most enrolled course

In [142]:
from pyspark.sql.functions import first, count, desc

most_enrolled_course = joined_df.groupBy("CourseID") \
                                .agg(
                                    count("EnrollID").alias("EnrollmentCount"),
                                    first("CourseName").alias("CourseName"),
                                    first("Instructor").alias("Instructor")) \
                                .orderBy(desc("EnrollmentCount")) \
                                .limit(1)

most_enrolled_course.show()

+--------+---------------+-------------+-------------+
|CourseID|EnrollmentCount|   CourseName|   Instructor|
+--------+---------------+-------------+-------------+
|    C001|              2|Python Basics|Abdullah Khan|
+--------+---------------+-------------+-------------+



#### 6. Delta Lake Practice

Save as Delta Table
enrollments_delta


In [None]:
joined_df.write.mode('overwrite') \
               .format('delta') \
               .save('/content/drive/MyDrive/Assessment/course/enrollments_delta')

Update: Set all ratings to 5 where Course = 'Python Basics

In [145]:
joined_df = joined_df.withColumn('Rating',
                                 when(col('CourseName') == 'Python Basics', lit(5))
                                 .otherwise(col('Rating')))

joined_df.select(
                  'EnrollID',
                  'UserID',
                  'CourseID',
                  'CourseName',
                  'Rating',
).show()

+--------+------+--------+-----------------+------+
|EnrollID|UserID|CourseID|       CourseName|Rating|
+--------+------+--------+-----------------+------+
|    E003|  U001|    C003|  ML with PySpark|     0|
|    E005|  U004|    C004|Digital Marketing|     4|
|    E001|  U001|    C001|    Python Basics|     5|
|    E004|  U003|    C001|    Python Basics|     5|
|    E002|  U002|    C002|Excel for Finance|     0|
+--------+------+--------+-----------------+------+



Delete: All rows where ProgressPercent = 0

In [148]:
joined_df = joined_df.filter(col('ProgressPercent') != 0)

joined_df.select(
                  'EnrollID',
                  'UserID',
                  'CourseID',
                  'CourseName',
                  'ProgressPercent'
                 ).show()

+--------+------+--------+-----------------+---------------+
|EnrollID|UserID|CourseID|       CourseName|ProgressPercent|
+--------+------+--------+-----------------+---------------+
|    E003|  U001|    C003|  ML with PySpark|             30|
|    E005|  U004|    C004|Digital Marketing|            400|
|    E001|  U001|    C001|    Python Basics|            400|
|    E004|  U003|    C001|    Python Basics|            500|
|    E002|  U002|    C002|Excel for Finance|             45|
+--------+------+--------+-----------------+---------------+



 Show
DESCRIBE HISTORY

In [None]:
spark.sql("DESCRIBE HISTORY delta.`/content/drive/MyDrive/Assessment/enrollments_delta`").show()


#### 7. Window Functions

use dense_rank() to rank courses by number of enrollments


In [150]:
from pyspark.sql.functions import dense_rank, count
from pyspark.sql.window import Window


course_rank_df = joined_df.groupBy('CourseID') \
                          .agg(count('EnrollID').alias('EnrollmentCount'))

window_rank = Window.orderBy(course_rank_df['EnrollmentCount'].desc())
course_rank_df = course_rank_df.withColumn('CourseRank', dense_rank().over(window_rank))

course_rank_df.show()

+--------+---------------+----------+
|CourseID|EnrollmentCount|CourseRank|
+--------+---------------+----------+
|    C001|              2|         1|
|    C002|              1|         2|
|    C003|              1|         2|
|    C004|              1|         2|
+--------+---------------+----------+



lead() to find next course by each user (sorted by EnrollDate)

In [160]:
from pyspark.sql.functions import lead

user_window = Window.partitionBy('UserID').orderBy('EnrollDate')

next_course_df = joined_df.withColumn('NextCourse', lead('CourseName').over(user_window)) \
                          .withColumn('NextCourseCategory', lead('Category').over(user_window))

next_course_df.select(
                        'CourseID',
                        'UserID',
                         col('CourseName').alias('CurrentCourse'),
                        'Category',
                        'NextCourse',
                        'NextCourseCategory'
).show()

+--------+------+-----------------+------------+---------------+------------------+
|CourseID|UserID|    CurrentCourse|    Category|     NextCourse|NextCourseCategory|
+--------+------+-----------------+------------+---------------+------------------+
|    C001|  U001|    Python Basics| Programming|ML with PySpark|      Data Science|
|    C003|  U001|  ML with PySpark|Data Science|           NULL|              NULL|
|    C002|  U002|Excel for Finance|Productivity|           NULL|              NULL|
|    C001|  U003|    Python Basics| Programming|           NULL|              NULL|
|    C004|  U004|Digital Marketing|   Marketing|           NULL|              NULL|
+--------+------+-----------------+------------+---------------+------------------+



#### 8. SQL Logic for Dashboard Views

View for Daily Enrollments

In [165]:
daily_enrollments = joined_df.withColumn('EnrollDateOnly', to_date('EnrollDate')) \
                             .groupBy('EnrollDateOnly') \
                             .agg(count('*').alias('TotalEnrollments'))

daily_enrollments.createOrReplaceTempView('daily_enrollments')

spark.sql("SELECT * FROM daily_enrollments").show()

+--------------+----------------+
|EnrollDateOnly|TotalEnrollments|
+--------------+----------------+
|    2024-04-02|               1|
|    2024-04-01|               1|
|    2024-04-04|               1|
|    2024-04-05|               1|
|    2024-04-03|               1|
+--------------+----------------+



View for Category Performance

In [164]:
category_performance = joined_df.groupBy('Category') \
                                .agg(avg('Rating').alias('AverageRating'))

category_performance.createOrReplaceTempView('category_performance')

spark.sql("SELECT * FROM category_performance").show()

+------------+-------------+
|    Category|AverageRating|
+------------+-------------+
| Programming|          5.0|
|Productivity|          0.0|
|   Marketing|          4.0|
|Data Science|          0.0|
+------------+-------------+



View for Top 3 Course

In [163]:
course_count_df = joined_df.groupBy('CourseID', 'CourseName') \
                           .agg(count('*').alias('EnrollCount'))

window_rank = Window.orderBy(course_count_df['EnrollCount'].desc())

top_3_courses = course_count_df.withColumn('Rank', dense_rank().over(window_rank)) \
                               .filter('Rank <= 3')

top_3_courses.createOrReplaceTempView('top_3_courses')

spark.sql("SELECT * FROM top_3_courses").show()

+--------+-----------------+-----------+----+
|CourseID|       CourseName|EnrollCount|Rank|
+--------+-----------------+-----------+----+
|    C001|    Python Basics|          2|   1|
|    C003|  ML with PySpark|          1|   2|
|    C004|Digital Marketing|          1|   2|
|    C002|Excel for Finance|          1|   2|
+--------+-----------------+-----------+----+



#### 9. Time Travel

View previous version before update/delete

In [None]:
df_before_update = spark.read.format('delta') \
                             .option('versionAsOf', 0) \
                             .load('/content/drive/MyDrive/Assessment/course/enrollments_delta')

df_before_update.show()

VERSION AS OF

In [None]:
df_version = spark.read.format('delta') \
                        .option('versionAsOf', 0) \
                        .load('/content/drive/MyDrive/Assessment/course/enrollments_delta')

df_version.show()

TIMESTAMP AS OF

In [None]:
df_time_stamp = spark.read.format('delta') \
                          .option('timestampAsOf', '2024-06-18T10:00:00') \
                          .load('/content/drive/MyDrive/Assessment/course/enrollments_delta')

df_time_stamp.show()

#### 10. Export Reportin

write to JSON, partitioned by Category

In [171]:
joined_df.write.mode('overwrite') \
               .partitionBy('Category') \
               .format('json') \
               .save('/content/drive/MyDrive/Assessment/course/report_json')

Create summary DataFrame

In [168]:
summary_df = course_df.groupBy('CourseName') \
                     .agg(
                        count('EnrollID').alias('TotalEnrollments'),
                        avg('Rating').alias('AvgRating'),
                        avg('ProgressPercent').alias('AvgProgress')
                      )

summary_df.createTempView('course_summary')

spark.sql("SELECT * FROM course_summary").show()

+-----------------+----------------+---------+-----------+
|       CourseName|TotalEnrollments|AvgRating|AvgProgress|
+-----------------+----------------+---------+-----------+
|Digital Marketing|               1|      4.0|      400.0|
|    Python Basics|               2|      4.5|      450.0|
|Excel for Finance|               1|      0.0|       45.0|
|  ML with PySpark|               1|      0.0|       30.0|
+-----------------+----------------+---------+-----------+



 Save as Parquet

In [172]:
joined_df.write.mode('overwrite') \
               .format('parquet') \
               .save('/content/drive/MyDrive/Assessment/course/course_details_parquet')

summary_df.write.mode('overwrite') \
               .format('parquet') \
               .save('/content/drive/MyDrive/Assessment/course/summary_parquet')

Viewing saved file

In [173]:
!ls -l /content/drive/MyDrive/Assessment/course

total 12
drwx------ 2 root root 4096 Jun 19 06:31 course_details_parquet
drwx------ 6 root root 4096 Jun 19 06:31 report_json
drwx------ 2 root root 4096 Jun 19 06:31 summary_parquet
