In [0]:
from pyspark.sql import SparkSession
spark=SparkSession.builder.getOrCreate()
spark

Data Loading

In [0]:
# 1. Load the data with schema inference enabled.
df_auto = spark.read.option("header", True).option("inferSchema", True).csv("file:/Workspace/Shared/course_enrollments.csv")
df_auto.printSchema()
df_auto.show()

root
 |-- EnrollmentID: string (nullable = true)
 |-- StudentName: string (nullable = true)
 |-- CourseName: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- EnrollDate: date (nullable = true)
 |-- ProgressPercent: integer (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Status: string (nullable = true)

+------------+-----------+--------------------+-----------+----------+---------------+------+---------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollDate|ProgressPercent|Rating|   Status|
+------------+-----------+--------------------+-----------+----------+---------------+------+---------+
|      ENR001|     Aditya|Python for Beginners|Programming|2024-05-10|             80|   4.5|   Active|
|      ENR002|     Simran|Data Analysis wit...|  Analytics|2024-05-12|            100|   4.7|Completed|
|      ENR003|     Aakash| Power BI Essentials|  Analytics|2024-05-13|             30|   3.8|   Active|
|      ENR004|       Neha|         Ja

In [0]:
# 2. Manually define schema and compare both approaches.
from pyspark.sql.types import *
schema = StructType([
    StructField("EnrollmentID", StringType(), True),
    StructField("StudentName", StringType(), True),
    StructField("CourseName", StringType(), True),
    StructField("Category", StringType(), True),
    StructField("EnrollDate", DateType(), True),
    StructField("ProgressPercent", IntegerType(), True),
    StructField("Rating", DoubleType(), True),
    StructField("Status", StringType(), True)
])
ce_df= spark.read.option("header", True).schema(schema).csv("file:/Workspace/Shared/course_enrollments.csv")
ce_df.printSchema()
ce_df.show()


root
 |-- EnrollmentID: string (nullable = true)
 |-- StudentName: string (nullable = true)
 |-- CourseName: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- EnrollDate: date (nullable = true)
 |-- ProgressPercent: integer (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Status: string (nullable = true)

+------------+-----------+--------------------+-----------+----------+---------------+------+---------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollDate|ProgressPercent|Rating|   Status|
+------------+-----------+--------------------+-----------+----------+---------------+------+---------+
|      ENR001|     Aditya|Python for Beginners|Programming|2024-05-10|             80|   4.5|   Active|
|      ENR002|     Simran|Data Analysis wit...|  Analytics|2024-05-12|            100|   4.7|Completed|
|      ENR003|     Aakash| Power BI Essentials|  Analytics|2024-05-13|             30|   3.8|   Active|
|      ENR004|       Neha|         Ja

Filtering and Transformation

In [0]:
#3. Filter records where ProgressPercent < 50 .
ce_df.filter("ProgressPercent < 50").show()

+------------+-----------+-------------------+-----------+----------+---------------+------+--------+
|EnrollmentID|StudentName|         CourseName|   Category|EnrollDate|ProgressPercent|Rating|  Status|
+------------+-----------+-------------------+-----------+----------+---------------+------+--------+
|      ENR003|     Aakash|Power BI Essentials|  Analytics|2024-05-13|             30|   3.8|  Active|
|      ENR004|       Neha|        Java Basics|Programming|2024-05-15|              0|  NULL|Inactive|
+------------+-----------+-------------------+-----------+----------+---------------+------+--------+



In [0]:
# 4. Replace null ratings with average rating.
from pyspark.sql.functions import col, avg, when
avg_rating = ce_df.select(avg(col("Rating"))).first()[0]
df_filled =ce_df.withColumn("Rating", when(col("Rating").isNull(), avg_rating).otherwise(col("Rating")))
df_filled.show()

+------------+-----------+--------------------+-----------+----------+---------------+-----------------+---------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|
+------------+-----------+--------------------+-----------+----------+---------------+-----------------+---------+
|      ENR001|     Aditya|Python for Beginners|Programming|2024-05-10|             80|              4.5|   Active|
|      ENR002|     Simran|Data Analysis wit...|  Analytics|2024-05-12|            100|              4.7|Completed|
|      ENR003|     Aakash| Power BI Essentials|  Analytics|2024-05-13|             30|              3.8|   Active|
|      ENR004|       Neha|         Java Basics|Programming|2024-05-15|              0|4.359999999999999| Inactive|
|      ENR005|       Zara|Machine Learning 101|         AI|2024-05-17|             60|              4.2|   Active|
|      ENR006|    Ibrahim|Python for Beginners|Programming|2024-05-18|          

In [0]:
# 5. Add column IsActive â†’ 1 if Status is Active, else 0.
df_active = df_filled.withColumn("IsActive", when(col("Status") == "Active", 1).otherwise(0))
df_active.show()

+------------+-----------+--------------------+-----------+----------+---------------+-----------------+---------+--------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|IsActive|
+------------+-----------+--------------------+-----------+----------+---------------+-----------------+---------+--------+
|      ENR001|     Aditya|Python for Beginners|Programming|2024-05-10|             80|              4.5|   Active|       1|
|      ENR002|     Simran|Data Analysis wit...|  Analytics|2024-05-12|            100|              4.7|Completed|       0|
|      ENR003|     Aakash| Power BI Essentials|  Analytics|2024-05-13|             30|              3.8|   Active|       1|
|      ENR004|       Neha|         Java Basics|Programming|2024-05-15|              0|4.359999999999999| Inactive|       0|
|      ENR005|       Zara|Machine Learning 101|         AI|2024-05-17|             60|              4.2|   Active|       1|
|      E

Aggregations & Metrics

In [0]:
# 6. Find average progress by course.
df_active.groupBy("CourseName").agg(avg("ProgressPercent").alias("AvgProgress")).show()

+--------------------+-----------+
|          CourseName|AvgProgress|
+--------------------+-----------+
|Data Analysis wit...|      100.0|
|         Java Basics|        0.0|
|Machine Learning 101|       60.0|
|Python for Beginners|       85.0|
| Power BI Essentials|       30.0|
+--------------------+-----------+



In [0]:
# 7. Get count of students in each course category.
df_active.groupBy("Category").count().withColumnRenamed("count", "StudentCount").show()

+-----------+------------+
|   Category|StudentCount|
+-----------+------------+
|Programming|           3|
|         AI|           1|
|  Analytics|           2|
+-----------+------------+



In [0]:
# 8. Identify the most enrolled course.
from pyspark.sql.functions import count
df_active.groupBy("CourseName").agg(count("*").alias("Enrollments")) \
    .orderBy(col("Enrollments").desc()).limit(1).show()

+--------------------+-----------+
|          CourseName|Enrollments|
+--------------------+-----------+
|Python for Beginners|          2|
+--------------------+-----------+



Joins

In [0]:
# 9. Create second CSV: course_details.csv
df_course = spark.read.option("header", True).option("inferSchema", True).csv("file:/Workspace/Shared/course_details.csv")
df_course.show()

+--------------------+-------------+----------+
|          CourseName|DurationWeeks|Instructor|
+--------------------+-------------+----------+
|Python for Beginners|            4|    Rakesh|
|Data Analysis wit...|            3|    Anjali|
| Power BI Essentials|            5|     Rekha|
|         Java Basics|            6|     Manoj|
|Machine Learning 101|            8|     Samir|
+--------------------+-------------+----------+



In [0]:
# 10. Join course_enrollments with course_details to include duration and instructor.
df_joined = df_active.join(df_course, on="CourseName", how="left")
df_joined.show()

+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+
|          CourseName|EnrollmentID|StudentName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|IsActive|DurationWeeks|Instructor|
+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+
|Python for Beginners|      ENR001|     Aditya|Programming|2024-05-10|             80|              4.5|   Active|       1|            4|    Rakesh|
|Data Analysis wit...|      ENR002|     Simran|  Analytics|2024-05-12|            100|              4.7|Completed|       0|            3|    Anjali|
| Power BI Essentials|      ENR003|     Aakash|  Analytics|2024-05-13|             30|              3.8|   Active|       1|            5|     Rekha|
|         Java Basics|      ENR004|       Neha|Programming|2024-05-15|              0|4.359999999999999| I

Window Functions

In [0]:
# 11. Rank students in each course based on ProgressPercent .
from pyspark.sql.window import Window
from pyspark.sql.functions import rank
window_spec = Window.partitionBy("CourseName").orderBy(col("ProgressPercent").desc())
df_ranked = df_joined.withColumn("Rank", rank().over(window_spec))
df_ranked.show()

+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+----+
|          CourseName|EnrollmentID|StudentName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|IsActive|DurationWeeks|Instructor|Rank|
+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+----+
|Data Analysis wit...|      ENR002|     Simran|  Analytics|2024-05-12|            100|              4.7|Completed|       0|            3|    Anjali|   1|
|         Java Basics|      ENR004|       Neha|Programming|2024-05-15|              0|4.359999999999999| Inactive|       0|            6|     Manoj|   1|
|Machine Learning 101|      ENR005|       Zara|         AI|2024-05-17|             60|              4.2|   Active|       1|            8|     Samir|   1|
| Power BI Essentials|      ENR003|     Aakash|  Analytics|2024-05-13|      

In [0]:
# 12. Get lead and lag of EnrollDate by Category.
from pyspark.sql.functions import lead, lag
window_cat = Window.partitionBy("Category").orderBy("EnrollDate")
df_lead_lag = df_ranked.withColumn("NextEnrollDate", lead("EnrollDate", 1).over(window_cat)).withColumn("PrevEnrollDate", lag("EnrollDate", 1).over(window_cat))
df_lead_lag.show()

+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+----+--------------+--------------+
|          CourseName|EnrollmentID|StudentName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|IsActive|DurationWeeks|Instructor|Rank|NextEnrollDate|PrevEnrollDate|
+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+----+--------------+--------------+
|Machine Learning 101|      ENR005|       Zara|         AI|2024-05-17|             60|              4.2|   Active|       1|            8|     Samir|   1|          NULL|          NULL|
|Data Analysis wit...|      ENR002|     Simran|  Analytics|2024-05-12|            100|              4.7|Completed|       0|            3|    Anjali|   1|    2024-05-13|          NULL|
| Power BI Essentials|      ENR003|     Aakash|  Analytics|2024-05-13|          

Pivoting & Formatting

In [0]:
# 13. Pivot data to show total enrollments by Category and Status.
df_active.groupBy("Category").pivot("Status").count().show()

+-----------+------+---------+--------+
|   Category|Active|Completed|Inactive|
+-----------+------+---------+--------+
|Programming|     1|        1|       1|
|         AI|     1|     NULL|    NULL|
|  Analytics|     1|        1|    NULL|
+-----------+------+---------+--------+



In [0]:
# 14. Extract year and month from EnrollDate .
from pyspark.sql.functions import year, month
df_date = df_active.withColumn("EnrollYear", year("EnrollDate")) \
                   .withColumn("EnrollMonth", month("EnrollDate"))
df_date.show()

+------------+-----------+--------------------+-----------+----------+---------------+-----------------+---------+--------+----------+-----------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|IsActive|EnrollYear|EnrollMonth|
+------------+-----------+--------------------+-----------+----------+---------------+-----------------+---------+--------+----------+-----------+
|      ENR001|     Aditya|Python for Beginners|Programming|2024-05-10|             80|              4.5|   Active|       1|      2024|          5|
|      ENR002|     Simran|Data Analysis wit...|  Analytics|2024-05-12|            100|              4.7|Completed|       0|      2024|          5|
|      ENR003|     Aakash| Power BI Essentials|  Analytics|2024-05-13|             30|              3.8|   Active|       1|      2024|          5|
|      ENR004|       Neha|         Java Basics|Programming|2024-05-15|              0|4.359999999999999| Inactive|    

Cleaning and Deduplication

In [0]:
# 15. Drop rows where Status is null or empty.
df_cleaned = df_date.filter(col("Status").isNotNull() & (col("Status") != ""))
df_cleaned.show()

+------------+-----------+--------------------+-----------+----------+---------------+-----------------+---------+--------+----------+-----------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|IsActive|EnrollYear|EnrollMonth|
+------------+-----------+--------------------+-----------+----------+---------------+-----------------+---------+--------+----------+-----------+
|      ENR001|     Aditya|Python for Beginners|Programming|2024-05-10|             80|              4.5|   Active|       1|      2024|          5|
|      ENR002|     Simran|Data Analysis wit...|  Analytics|2024-05-12|            100|              4.7|Completed|       0|      2024|          5|
|      ENR003|     Aakash| Power BI Essentials|  Analytics|2024-05-13|             30|              3.8|   Active|       1|      2024|          5|
|      ENR004|       Neha|         Java Basics|Programming|2024-05-15|              0|4.359999999999999| Inactive|    

In [0]:
# 16. Remove duplicate enrollments using dropDuplicates() .
df_cleaned.dropDuplicates().show()

+------------+-----------+--------------------+-----------+----------+---------------+-----------------+---------+--------+----------+-----------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|IsActive|EnrollYear|EnrollMonth|
+------------+-----------+--------------------+-----------+----------+---------------+-----------------+---------+--------+----------+-----------+
|      ENR003|     Aakash| Power BI Essentials|  Analytics|2024-05-13|             30|              3.8|   Active|       1|      2024|          5|
|      ENR001|     Aditya|Python for Beginners|Programming|2024-05-10|             80|              4.5|   Active|       1|      2024|          5|
|      ENR006|    Ibrahim|Python for Beginners|Programming|2024-05-18|             90|              4.6|Completed|       0|      2024|          5|
|      ENR002|     Simran|Data Analysis wit...|  Analytics|2024-05-12|            100|              4.7|Completed|    

Export

In [0]:
# 17. Write the final cleaned DataFrame to:
# CSV (overwrite mode)
# JSON (overwrite mode)
# Parquet (snappy compression)
df_cleaned.write.mode("overwrite").csv("file:/Workspace/Shared/course_enrollments_cleaned.csv")
df_cleaned.write.mode("overwrite").json("file:/Workspace/Shared/course_enrollments_cleaned.json")
df_cleaned.write.mode("overwrite").option("compression", "snappy").parquet("file:/Workspace/Shared/course_enrollments_cleaned.parquet")