**Intialize the SparkSession**

In [0]:
from pyspark.sql import SparkSession
spark=SparkSession.builder\
      .appName("Course-Enrollement")\
      .getOrCreate()
spark
      

**Data Loading**

**Load the Dataset**

In [0]:
df= spark.read.option("header", True).option("inferSchema", True).csv("file:/Workspace/Shared/course_enrollement.csv")
df.printSchema()
df.show(5)

root
 |-- EnrollmentID: string (nullable = true)
 |-- StudentName: string (nullable = true)
 |-- CourseName: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- EnrollDate: date (nullable = true)
 |-- ProgressPercent: integer (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Status : string (nullable = true)

+------------+-----------+--------------------+-----------+----------+---------------+------+----------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollDate|ProgressPercent|Rating|   Status |
+------------+-----------+--------------------+-----------+----------+---------------+------+----------+
|      ENR001|     Aditya|Python for Beginners|Programming|2024-05-10|             80|   4.5|   Active |
|      ENR002|     Simran|Data Analysis wit...|  Analytics|2024-05-12|            100|   4.7|Completed |
|      ENR003|     Aakash| Power BI Essentials|  Analytics|2024-05-13|             30|   3.8|   Active |
|      ENR004|       Neha|    

In [0]:
#manual
from pyspark.sql.types import StructType, StructField, ShortType, StringType, DoubleType, IntegerType, DateType
schema = StructType([
    StructField("EnrollmentID", StringType(), True),
    StructField("StudentName", StringType(), True),
    StructField("CourseName", StringType(), True),
    StructField("Category", StringType(), True),
    StructField("EnrollDate", DateType(), True),
    StructField("ProgressPercent", IntegerType(), True),
    StructField("Rating", DoubleType(), True),
    StructField("Status", StringType(), True),
])

manual = spark.read.option("header", True).schema(schema).csv("file:/Workspace/Shared/course_enrollement.csv")
manual.printSchema()
#compare
print(df.count(),manual.count())

root
 |-- EnrollmentID: string (nullable = true)
 |-- StudentName: string (nullable = true)
 |-- CourseName: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- EnrollDate: date (nullable = true)
 |-- ProgressPercent: integer (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Status: string (nullable = true)

6 6


**Filtering and Transformation**

In [0]:
from pyspark.sql.functions import col, avg, when, isnan, coalesce, year, month, lead, lag,count
#3. Filter records where ProgressPercent < 50
print("records where ProgressPercent < 50")
df.filter(df.ProgressPercent < 50).show()
#4.Replace null ratings with average rating
print("Replace null ratings with average rating")
avg = df.select(avg("Rating")).first()[0]
ratings = df.withColumn("Rating", coalesce(col("Rating"), when(col("Rating").isNull(),avg)))
#5. Add column IsActive → 1 if Status is Active, else 0
status =ratings.withColumn("IsActive", when(col("Status ") == "Active", 1).otherwise(0))
status.show()

records where ProgressPercent < 50
+------------+-----------+-------------------+-----------+----------+---------------+------+---------+
|EnrollmentID|StudentName|         CourseName|   Category|EnrollDate|ProgressPercent|Rating|  Status |
+------------+-----------+-------------------+-----------+----------+---------------+------+---------+
|      ENR003|     Aakash|Power BI Essentials|  Analytics|2024-05-13|             30|   3.8|  Active |
|      ENR004|       Neha|        Java Basics|Programming|2024-05-15|              0|  NULL|Inactive |
+------------+-----------+-------------------+-----------+----------+---------------+------+---------+

Replace null ratings with average rating
+------------+-----------+--------------------+-----------+----------+---------------+-----------------+----------+--------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollDate|ProgressPercent|           Rating|   Status |IsActive|
+------------+-----------+--------------------+--------

**Aggregations & Metrics**

In [0]:
from pyspark.sql.functions import avg as spark_avg, count
# 6. Find average progress by course
print("average progress by course")
avg_progress = status.groupBy("CourseName").agg(spark_avg("ProgressPercent").alias("AvgProgress")).show()
# 7. Get count of students in each course category
print("count of students in each course category")
student_count = status.groupBy("Category").agg(count("StudentName").alias("StudentCount")).show()
# 8. Identify the most enrolled course
print("most enrolled course")
most_enrolled = status.groupBy("CourseName").agg(count("StudentName").alias("count")).orderBy("count", ascending=False).limit(1)
most_enrolled.show()

average progress by course
+--------------------+-----------+
|          CourseName|AvgProgress|
+--------------------+-----------+
|Data Analysis wit...|      100.0|
|         Java Basics|        0.0|
|Machine Learning 101|       60.0|
|Python for Beginners|       85.0|
| Power BI Essentials|       30.0|
+--------------------+-----------+

count of students in each course category
+-----------+------------+
|   Category|StudentCount|
+-----------+------------+
|Programming|           3|
|         AI|           1|
|  Analytics|           2|
+-----------+------------+

most enrolled course
+--------------------+-----+
|          CourseName|count|
+--------------------+-----+
|Python for Beginners|    2|
+--------------------+-----+



**Joins**

In [0]:
#10.Join course_enrollments with course_details to include duration and intructor
course_details = spark.read.option("header", True).option("inferSchema", True).csv("file:/Workspace/Shared/course_details.csv")
joined = status.join(course_details, on="CourseName", how="left")
joined.show()

+--------------------+------------+-----------+-----------+----------+---------------+-----------------+----------+--------+-------------+-----------+
|          CourseName|EnrollmentID|StudentName|   Category|EnrollDate|ProgressPercent|           Rating|   Status |IsActive|DurationWeeks|Instructor |
+--------------------+------------+-----------+-----------+----------+---------------+-----------------+----------+--------+-------------+-----------+
|Python for Beginners|      ENR001|     Aditya|Programming|2024-05-10|             80|              4.5|   Active |       0|            4|    Rakesh |
|Data Analysis wit...|      ENR002|     Simran|  Analytics|2024-05-12|            100|              4.7|Completed |       0|            3|    Anjali |
| Power BI Essentials|      ENR003|     Aakash|  Analytics|2024-05-13|             30|              3.8|   Active |       0|            5|     Rekha |
|         Java Basics|      ENR004|       Neha|Programming|2024-05-15|              0|4.359999

**Window Functions**

In [0]:
#11.Rank students in each course based on ProgressPercent
from pyspark.sql.window import Window
w = Window.partitionBy("CourseName").orderBy(col("ProgressPercent").desc())
#12.Get lead and lag of EnrollDate by Category
rank = joined.withColumn("RankInCourse", lead("ProgressPercent").over(w))
rank = rank.withColumn("PrevProgress", lag("ProgressPercent", 1).over(w))
rank.select("EnrollmentID", "CourseName", "ProgressPercent", "RankInCourse", "PrevProgress").show()

+------------+--------------------+---------------+------------+------------+
|EnrollmentID|          CourseName|ProgressPercent|RankInCourse|PrevProgress|
+------------+--------------------+---------------+------------+------------+
|      ENR002|Data Analysis wit...|            100|        NULL|        NULL|
|      ENR004|         Java Basics|              0|        NULL|        NULL|
|      ENR005|Machine Learning 101|             60|        NULL|        NULL|
|      ENR003| Power BI Essentials|             30|        NULL|        NULL|
|      ENR006|Python for Beginners|             90|          80|        NULL|
|      ENR001|Python for Beginners|             80|        NULL|          90|
+------------+--------------------+---------------+------------+------------+



**Pivoting & Formatting**

In [0]:
#13.Pivot data to show total enrollments by Category and Status
p =joined.groupBy("Category").pivot("Status ").count()
p.show()
#14. Extract year and month from EnrollDate
from pyspark.sql.functions import year, month
b=joined.withColumn("Year", year("EnrollDate")).withColumn("Month", month("EnrollDate"))
b.select("EnrollmentID", "EnrollDate", "Year", "Month").show()

+-----------+-------+---------+----------+---------+
|   Category|Active |Completed|Completed |Inactive |
+-----------+-------+---------+----------+---------+
|Programming|      1|        1|      NULL|        1|
|         AI|      1|     NULL|      NULL|     NULL|
|  Analytics|      1|     NULL|         1|     NULL|
+-----------+-------+---------+----------+---------+

+------------+----------+----+-----+
|EnrollmentID|EnrollDate|Year|Month|
+------------+----------+----+-----+
|      ENR001|2024-05-10|2024|    5|
|      ENR002|2024-05-12|2024|    5|
|      ENR003|2024-05-13|2024|    5|
|      ENR004|2024-05-15|2024|    5|
|      ENR005|2024-05-17|2024|    5|
|      ENR006|2024-05-18|2024|    5|
+------------+----------+----+-----+



**Cleaning and Deduplication**

In [0]:
#15.Drop rows with null/empty Status
print("Droped rows with null/empty Status")
c=b.filter(b['Status '].isNotNull())
b.show()
#16.Remove duplicate enrollments
d = c.dropDuplicates(["EnrollmentID"])
d.show()

Droped rows with null/empty Status
+--------------------+------------+-----------+-----------+----------+---------------+-----------------+----------+--------+-------------+-----------+----+-----+
|          CourseName|EnrollmentID|StudentName|   Category|EnrollDate|ProgressPercent|           Rating|   Status |IsActive|DurationWeeks|Instructor |Year|Month|
+--------------------+------------+-----------+-----------+----------+---------------+-----------------+----------+--------+-------------+-----------+----+-----+
|Python for Beginners|      ENR001|     Aditya|Programming|2024-05-10|             80|              4.5|   Active |       0|            4|    Rakesh |2024|    5|
|Data Analysis wit...|      ENR002|     Simran|  Analytics|2024-05-12|            100|              4.7|Completed |       0|            3|    Anjali |2024|    5|
| Power BI Essentials|      ENR003|     Aakash|  Analytics|2024-05-13|             30|              3.8|   Active |       0|            5|     Rekha |2024|

**Export**

In [0]:
# CSV
c.write.mode("overwrite").option("header", True).csv("file:/Workspace/Shared/course_cleaned_csv")
# JSON
c.write.mode("overwrite").json("file:/Workspace/Shared/course_cleaned_json")
# Parquet/snappy
c.write.mode("overwrite").option("compression", "snappy").parquet("file:/Workspace/Shared/course_cleaned_parquet")