In [0]:
from pyspark.sql import SparkSession
# Get (or create) the Spark session that every later cell reads through,
# registered under this notebook's application name.
spark = (
    SparkSession.builder
    .appName("OnlineCourseEnrollments")
    .getOrCreate()
)

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window
# Data Loading
# 1. Load the data with schema inference enabled.
# The first line of the file is treated as the header row, and the column
# types are left for Spark to infer from the data.
enrollmentdf = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("dbfs:/FileStore/tables/enrollments.csv")
)
enrollmentdf.show()



+------------+-----------+--------------------+-----------+----------+---------------+------+---------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollDate|ProgressPercent|Rating|   Status|
+------------+-----------+--------------------+-----------+----------+---------------+------+---------+
|      ENR001|     Aditya|Python for Beginners|Programming|2024-05-10|             80|   4.5|   Active|
|      ENR002|     Simran|Data Analysis wit...|  Analytics|2024-05-12|            100|   4.7|Completed|
|      ENR003|     Aakash| Power BI Essentials|  Analytics|2024-05-13|             30|   3.8|   Active|
|      ENR004|       Neha|         Java Basics|Programming|2024-05-15|              0|  NULL| Inactive|
|      ENR005|       Zara|Machine Learning 101|         AI|2024-05-17|             60|   4.2|   Active|
|      ENR006|    Ibrahim|Python for Beginners|Programming|2024-05-18|             90|   4.6|Completed|
+------------+-----------+--------------------+-----------+-----

In [0]:
# 2. Manually define schema and compare both approaches.
# Explicit column-by-column schema for enrollments.csv; every field is nullable.
schema = StructType([
    StructField("EnrollmentID", StringType(), True),
    StructField("StudentName", StringType(), True),
    StructField("CourseName", StringType(), True),
    StructField("Category", StringType(), True),
    StructField("EnrollDate", DateType(), True),
    StructField("ProgressPercent", IntegerType(), True),
    StructField("Rating", DoubleType(), True),
    StructField("Status", StringType(), True)
])

# Read with the hand-written schema rather than inferSchema=True: otherwise the
# schema defined above is never used and this cell just repeats the inferred
# load from step 1 instead of exercising the manual approach.
manual = spark.read.csv("dbfs:/FileStore/tables/enrollments.csv", header=True, schema=schema)
manual.printSchema()

root
 |-- EnrollmentID: string (nullable = true)
 |-- StudentName: string (nullable = true)
 |-- CourseName: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- EnrollDate: date (nullable = true)
 |-- ProgressPercent: integer (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Status: string (nullable = true)



In [0]:
# Filtering and Transformation
# 3. Filter records where ProgressPercent < 50 .
low_progress = col("ProgressPercent") < 50
percentage = manual.where(low_progress)
percentage.show()

# 4. Replace null ratings with average rating.
# The mean is pulled back to the driver as a plain Python value so it can be
# used as a constant replacement in the column expression below.
avg_rating = manual.agg(avg("Rating")).first()[0]
# Keep the existing rating when there is one; otherwise fall back to the mean.
rating_or_avg = when(col("Rating").isNotNull(), col("Rating")).otherwise(avg_rating)
filled = manual.withColumn("Rating", rating_or_avg)
filled.show()

# 5. Add column IsActive → 1 if Status is Active, else 0.
# Any status other than exactly "Active" (including a missing one) maps to 0.
active_flag = when(col("Status") == "Active", 1).otherwise(0)
isActive = filled.withColumn("IsActive", active_flag)
isActive.show()

+------------+-----------+-------------------+-----------+----------+---------------+------+--------+
|EnrollmentID|StudentName|         CourseName|   Category|EnrollDate|ProgressPercent|Rating|  Status|
+------------+-----------+-------------------+-----------+----------+---------------+------+--------+
|      ENR003|     Aakash|Power BI Essentials|  Analytics|2024-05-13|             30|   3.8|  Active|
|      ENR004|       Neha|        Java Basics|Programming|2024-05-15|              0|  NULL|Inactive|
+------------+-----------+-------------------+-----------+----------+---------------+------+--------+

+------------+-----------+--------------------+-----------+----------+---------------+-----------------+---------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|
+------------+-----------+--------------------+-----------+----------+---------------+-----------------+---------+
|      ENR001|     Aditya|Python for Begin

In [0]:

# Aggregations & Metrics
# 6. Find average progress by course.
isActive.groupBy("CourseName").agg(avg("ProgressPercent").alias("AvgProgress")).show()
# 7. Get count of students in each course category.
# alias() must be applied to the aggregate column itself. Calling it on the
# DataFrame returned by agg() only aliases the DataFrame, which leaves the
# column with its generated "count(1)" header.
isActive.groupBy("Category").agg(count("*").alias("count")).show()
# 8. Identify the most enrolled course.
# Sort courses by enrollment count, highest first, and display only the top row.
isActive.groupBy("CourseName").count().orderBy(col("count").desc()).show(1)

+--------------------+-----------+
|          CourseName|AvgProgress|
+--------------------+-----------+
| Power BI Essentials|       30.0|
|Machine Learning 101|       60.0|
|Python for Beginners|       85.0|
|Data Analysis wit...|      100.0|
|         Java Basics|        0.0|
+--------------------+-----------+

+-----------+--------+
|   Category|count(1)|
+-----------+--------+
|Programming|       3|
|  Analytics|       2|
|         AI|       1|
+-----------+--------+

+--------------------+-----+
|          CourseName|count|
+--------------------+-----+
|Python for Beginners|    2|
+--------------------+-----+
only showing top 1 row


In [0]:
# Joins
# 9. Create second CSV: course_details.csv
# Course metadata is loaded the same way as the enrollments: header row for
# column names, types inferred.
coursedf = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("dbfs:/FileStore/tables/course_details.csv")
)
coursedf.show()
# 10. Join course_enrollments with course_details to include duration and instructor.
# Left join on the shared CourseName column, with the enrollments on the left side.
joined = isActive.join(coursedf, "CourseName", "left")
joined.show()


+--------------------+-------------+----------+
|          CourseName|DurationWeeks|Instructor|
+--------------------+-------------+----------+
|Python for Beginners|            4|    Rakesh|
|Data Analysis wit...|            3|    Anjali|
| Power BI Essentials|            5|     Rekha|
|         Java Basics|            6|     Manoj|
|Machine Learning 101|            8|     Samir|
+--------------------+-------------+----------+

+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+
|          CourseName|EnrollmentID|StudentName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|IsActive|DurationWeeks|Instructor|
+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+
|Python for Beginners|      ENR001|     Aditya|Programming|2024-05-10|             80|              4.5|   Active|      

In [0]:
# Window Functions
# 11. Rank students in each course based on ProgressPercent .
# One window per course, highest progress first, so rank 1 is the top student.
window_rank = Window.partitionBy("CourseName").orderBy(desc("ProgressPercent"))
ranked = joined.withColumn("Rank", rank().over(window_rank))
rank_columns = ["StudentName", "CourseName", "ProgressPercent", "Rank"]
ranked.select(*rank_columns).show()
# 12. Get lead and lag of EnrollDate by Category.
# One window per category in chronological order: lead() looks one row ahead
# and lag() one row back within that ordering.
window_category = Window.partitionBy("Category").orderBy("EnrollDate")

lead_lag = (
    ranked
    .withColumn("NextEnrollDate", lead("EnrollDate").over(window_category))
    .withColumn("PrevEnrollDate", lag("EnrollDate").over(window_category))
)
neighbour_columns = ["EnrollmentID", "StudentName", "Category", "EnrollDate", "PrevEnrollDate", "NextEnrollDate"]
lead_lag.select(*neighbour_columns).show()

+-----------+--------------------+---------------+----+
|StudentName|          CourseName|ProgressPercent|Rank|
+-----------+--------------------+---------------+----+
|     Simran|Data Analysis wit...|            100|   1|
|       Neha|         Java Basics|              0|   1|
|       Zara|Machine Learning 101|             60|   1|
|     Aakash| Power BI Essentials|             30|   1|
|    Ibrahim|Python for Beginners|             90|   1|
|     Aditya|Python for Beginners|             80|   2|
+-----------+--------------------+---------------+----+

+------------+-----------+-----------+----------+--------------+--------------+
|EnrollmentID|StudentName|   Category|EnrollDate|PrevEnrollDate|NextEnrollDate|
+------------+-----------+-----------+----------+--------------+--------------+
|      ENR005|       Zara|         AI|2024-05-17|          NULL|          NULL|
|      ENR002|     Simran|  Analytics|2024-05-12|          NULL|    2024-05-13|
|      ENR003|     Aakash|  Analytics|2

In [0]:
# Pivoting & Formatting
# 13. Pivot data to show total enrollments by Category and Status.
# One row per Category and one column per distinct Status value, each cell
# holding the row count for that pair.
by_category = lead_lag.groupBy("Category")
pivotdf = by_category.pivot("Status").count()
pivotdf.show()
# 14. Extract year and month from EnrollDate .
# The two derived columns are appended to lead_lag itself, which later cells
# continue to use under the same name.
lead_lag = (
    lead_lag
    .withColumn("Year", year("EnrollDate"))
    .withColumn("Month", month("EnrollDate"))
)
lead_lag.show()

+-----------+------+---------+--------+
|   Category|Active|Completed|Inactive|
+-----------+------+---------+--------+
|Programming|     1|        1|       1|
|  Analytics|     1|        1|    NULL|
|         AI|     1|     NULL|    NULL|
+-----------+------+---------+--------+

+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+----+--------------+--------------+----+-----+
|          CourseName|EnrollmentID|StudentName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|IsActive|DurationWeeks|Instructor|Rank|NextEnrollDate|PrevEnrollDate|Year|Month|
+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+----+--------------+--------------+----+-----+
|Machine Learning 101|      ENR005|       Zara|         AI|2024-05-17|             60|              4.2|   Active|       1|           

In [0]:
# Cleaning and Deduplication
# 15. Drop rows where Status is null or empty.
cleaned = lead_lag.filter((col("Status").isNotNull()) & (col("Status") != ""))
cleaned.show()
# 16. Remove duplicate enrollments using dropDuplicates() .
# Deduplicate on the enrollment key. A bare dropDuplicates() compares every
# column, including the per-row NextEnrollDate / PrevEnrollDate values added
# by the window step, so repeated enrollments whose neighbours differ would
# not be collapsed.
# NOTE(review): which of the repeated rows is kept is not controlled here;
# add an explicit ordering first if a specific one must win.
dd = cleaned.dropDuplicates(["EnrollmentID"])
dd.show()

+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+----+--------------+--------------+----+-----+
|          CourseName|EnrollmentID|StudentName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|IsActive|DurationWeeks|Instructor|Rank|NextEnrollDate|PrevEnrollDate|Year|Month|
+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+----+--------------+--------------+----+-----+
|Machine Learning 101|      ENR005|       Zara|         AI|2024-05-17|             60|              4.2|   Active|       1|            8|     Samir|   1|          NULL|          NULL|2024|    5|
|Data Analysis wit...|      ENR002|     Simran|  Analytics|2024-05-12|            100|              4.7|Completed|       0|            3|    Anjali|   1|    2024-05-13|          NULL|2024|    5|
| Power BI Essentials|   

In [0]:
# Export
# 17. Write the final cleaned DataFrame to:
# CSV (overwrite mode)
# JSON (overwrite mode)
# Parquet (snappy compression)
# Each writer call replaces whatever already exists at its target path.
dd.write.csv("dbfs:/FileStore/tables/enrollments_csv", mode="overwrite", header=True)
dd.write.json("dbfs:/FileStore/tables/enrollments_json", mode="overwrite")
dd.write.parquet("dbfs:/FileStore/tables/enrollments_parquet", mode="overwrite", compression="snappy")