#### Data Loading

1. Load the data with schema inference enabled.

In [0]:
course_df = spark.read.option("header", True) \
                      .option("inferSchema", True) \
                      .format("csv") \
                      .load('/Volumes/shared/default/courseenrollments/course_enrollments.csv')
course_df.show()

+------------+-----------+--------------------+-----------+----------+---------------+------+----------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollDate|ProgressPercent|Rating|   Status |
+------------+-----------+--------------------+-----------+----------+---------------+------+----------+
|      ENR001|     Aditya|Python for Beginners|Programming|2024-05-10|             80|   4.5|   Active |
|      ENR002|     Simran|Data Analysis wit...|  Analytics|2024-05-12|            100|   4.7|Completed |
|      ENR003|     Aakash| Power BI Essentials|  Analytics|2024-05-13|             30|   3.8|   Active |
|      ENR004|       Neha|         Java Basics|Programming|2024-05-15|              0|  NULL| Inactive |
|      ENR005|       Zara|Machine Learning 101|         AI|2024-05-17|             60|   4.2|   Active |
|      ENR006|    Ibrahim|Python for Beginners|Programming|2024-05-18|             90|   4.6| Completed|
+------------+-----------+--------------------+--------

2. Manually define schema and compare both approaches

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType
manual_schema = StructType([
    StructField('EnrollmentID', StringType(), True),
    StructField('StudentName', StringType(), True),
    StructField('CourseName', StringType(), True),
    StructField('Category', StringType(), True),
    StructField('EnrollDate', StringType(), True),
    StructField('ProgressPercent', IntegerType(), True),
    StructField('Rating', FloatType(), True),
    StructField('Status', StringType(), True)
])

course_manual_df = spark.read.option("header", True) \
                             .schema(manual_schema) \
                             .format("csv") \
                             .load('/Volumes/shared/default/courseenrollments/course_enrollments.csv')

course_manual_df.show()

+------------+-----------+--------------------+-----------+----------+---------------+------+----------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollDate|ProgressPercent|Rating|    Status|
+------------+-----------+--------------------+-----------+----------+---------------+------+----------+
|      ENR001|     Aditya|Python for Beginners|Programming|2024-05-10|             80|   4.5|   Active |
|      ENR002|     Simran|Data Analysis wit...|  Analytics|2024-05-12|            100|   4.7|Completed |
|      ENR003|     Aakash| Power BI Essentials|  Analytics|2024-05-13|             30|   3.8|   Active |
|      ENR004|       Neha|         Java Basics|Programming|2024-05-15|              0|  NULL| Inactive |
|      ENR005|       Zara|Machine Learning 101|         AI|2024-05-17|             60|   4.2|   Active |
|      ENR006|    Ibrahim|Python for Beginners|Programming|2024-05-18|             90|   4.6| Completed|
+------------+-----------+--------------------+--------

Comparing Both Approaches

In [0]:
print("schema inference enabled")
course_df.printSchema()

print("Manually define schema")
course_manual_df.printSchema()

schema inference enabled
root
 |-- EnrollmentID: string (nullable = true)
 |-- StudentName: string (nullable = true)
 |-- CourseName: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- EnrollDate: date (nullable = true)
 |-- ProgressPercent: integer (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Status : string (nullable = true)

Manually define schema
root
 |-- EnrollmentID: string (nullable = true)
 |-- StudentName: string (nullable = true)
 |-- CourseName: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- EnrollDate: string (nullable = true)
 |-- ProgressPercent: integer (nullable = true)
 |-- Rating: float (nullable = true)
 |-- Status: string (nullable = true)



#### Filtering and Transformation

3. Filter records where ProgressPercent < 50 .

In [0]:
from pyspark.sql.functions import col
course_df.filter(col('ProgressPercent') < 50) \
         .select(
                    'EnrollmentID',
                    'StudentName',
                    'CourseName',
                    'ProgressPercent'
                 ).show()

+------------+-----------+-------------------+---------------+
|EnrollmentID|StudentName|         CourseName|ProgressPercent|
+------------+-----------+-------------------+---------------+
|      ENR003|     Aakash|Power BI Essentials|             30|
|      ENR004|       Neha|        Java Basics|              0|
+------------+-----------+-------------------+---------------+



4. Replace null ratings with average rating.

In [0]:
from pyspark.sql.functions import avg, when
average_rating_df = course_df.groupBy(col('Category')) \
                             .agg(avg(col('Rating')).alias('AverageRating'))

course_df = course_df.join(average_rating_df, on='Category', how='left')

course_df = course_df.withColumn("Rating",
                                  when(col("Rating").isNull(), col("AverageRating")).otherwise(col("Rating"))
                                )

course_df = course_df.drop('AverageRating')
course_df.select(
                  'EnrollmentID',
                  'StudentName',
                  'Category',
                  'CourseName',
                  'Rating'
).show()

+------------+-----------+-----------+--------------------+------+
|EnrollmentID|StudentName|   Category|          CourseName|Rating|
+------------+-----------+-----------+--------------------+------+
|      ENR001|     Aditya|Programming|Python for Beginners|   4.5|
|      ENR002|     Simran|  Analytics|Data Analysis wit...|   4.7|
|      ENR003|     Aakash|  Analytics| Power BI Essentials|   3.8|
|      ENR004|       Neha|Programming|         Java Basics|  4.55|
|      ENR005|       Zara|         AI|Machine Learning 101|   4.2|
|      ENR006|    Ibrahim|Programming|Python for Beginners|   4.6|
+------------+-----------+-----------+--------------------+------+



5.  Add column IsActive → 1 if Status is Active, else 0

In [0]:
from pyspark.sql.functions import trim


course_df = course_df.withColumn('Status', trim(col('Status ')))

course_df = course_df.withColumn('IsActive',
                                 when(col('Status') == 'Active', 1).otherwise(0)
                                )

course_df.select(
                'EnrollmentID',
                'StudentName',
                'CourseName',
                'EnrollDate',
                'ProgressPercent',
                'Status',
                'IsActive'
).show()


+------------+-----------+--------------------+----------+---------------+---------+--------+
|EnrollmentID|StudentName|          CourseName|EnrollDate|ProgressPercent|   Status|IsActive|
+------------+-----------+--------------------+----------+---------------+---------+--------+
|      ENR001|     Aditya|Python for Beginners|2024-05-10|             80|   Active|       1|
|      ENR002|     Simran|Data Analysis wit...|2024-05-12|            100|Completed|       0|
|      ENR003|     Aakash| Power BI Essentials|2024-05-13|             30|   Active|       1|
|      ENR004|       Neha|         Java Basics|2024-05-15|              0| Inactive|       0|
|      ENR005|       Zara|Machine Learning 101|2024-05-17|             60|   Active|       1|
|      ENR006|    Ibrahim|Python for Beginners|2024-05-18|             90|Completed|       0|
+------------+-----------+--------------------+----------+---------------+---------+--------+



####  Aggregations & Metrics

6. Find average progress by course

In [0]:
course_df.groupBy(col('courseName')) \
         .agg(avg(col('ProgressPercent')).alias('AverageProgress')) \
         .show()

+--------------------+---------------+
|          courseName|AverageProgress|
+--------------------+---------------+
| Power BI Essentials|           30.0|
|Machine Learning 101|           60.0|
|Python for Beginners|           85.0|
|Data Analysis wit...|          100.0|
|         Java Basics|            0.0|
+--------------------+---------------+



7.  Get count of students in each course category.

In [0]:
course_df.groupBy(col('Category')) \
         .count() \
         .withColumnRenamed('count', 'TotalStudentCount') \
         .show()

+-----------+-----------------+
|   Category|TotalStudentCount|
+-----------+-----------------+
|Programming|                3|
|  Analytics|                2|
|         AI|                1|
+-----------+-----------------+



8. Identify the most enrolled course.

In [0]:
from pyspark.sql.functions import max
course_count_df = course_df.groupBy(col('courseName')) \
         .count() \
         .withColumnRenamed('count', 'TotalStudentCount') 

max_count = course_count_df.agg(max('TotalStudentCount')).collect()[0][0]

course_count_df.filter(col('TotalStudentCount') == max_count).show()

+--------------------+-----------------+
|          courseName|TotalStudentCount|
+--------------------+-----------------+
|Python for Beginners|                2|
+--------------------+-----------------+



#### Joins

9. Creating second csv: 
course_details.csv

In [0]:
course_detail_schema = StructType([
    StructField('CourseName', StringType(), True),
    StructField('DurationWeeks', IntegerType(), True),
    StructField('Instructor', StringType(), True)
])

course_details_data = [
    ("Python for Beginners", 4, "Rakesh"),
    ("Data Analysis with Excel", 3, "Anjali"),
    ("Power BI Essentials", 5, "Rekha"),
    ("Java Basics", 6, "Manoj"),
    ("Machine Learning 101", 8, "Samir")
]

course_detail_df = spark.createDataFrame(course_details_data, course_detail_schema)

course_detail_df.show()

+--------------------+-------------+----------+
|          CourseName|DurationWeeks|Instructor|
+--------------------+-------------+----------+
|Python for Beginners|            4|    Rakesh|
|Data Analysis wit...|            3|    Anjali|
| Power BI Essentials|            5|     Rekha|
|         Java Basics|            6|     Manoj|
|Machine Learning 101|            8|     Samir|
+--------------------+-------------+----------+



10.  Join course_enrollments with instructor.

In [0]:
course_df = course_df.join(course_detail_df, on='CourseName', how='left')
course_df.show()

+--------------------+------------+-----------+-----------+----------+---------------+------+----------+---------+--------+-------------+----------+
|          CourseName|EnrollmentID|StudentName|   Category|EnrollDate|ProgressPercent|Rating|   Status |   Status|IsActive|DurationWeeks|Instructor|
+--------------------+------------+-----------+-----------+----------+---------------+------+----------+---------+--------+-------------+----------+
|Python for Beginners|      ENR001|     Aditya|Programming|2024-05-10|             80|   4.5|   Active |   Active|       1|            4|    Rakesh|
|Data Analysis wit...|      ENR002|     Simran|  Analytics|2024-05-12|            100|   4.7|Completed |Completed|       0|            3|    Anjali|
| Power BI Essentials|      ENR003|     Aakash|  Analytics|2024-05-13|             30|   3.8|   Active |   Active|       1|            5|     Rekha|
|         Java Basics|      ENR004|       Neha|Programming|2024-05-15|              0|  NULL| Inactive | I

####  Window Functions


11. Rank students in each course based on ProgressPercent

In [0]:
from pyspark.sql.functions import rank
from pyspark.sql.window import Window

rank_window = Window.partitionBy(col('CourseName')) \
                    .orderBy(col('ProgressPercent').desc())

rank_df = course_df.withColumn('StudentRank', rank().over(rank_window))
rank_df.select(
                'StudentName',
                'Category',
                'CourseName',
                'ProgressPercent',
                'StudentRank'
).show()

+-----------+-----------+--------------------+---------------+-----------+
|StudentName|   Category|          CourseName|ProgressPercent|StudentRank|
+-----------+-----------+--------------------+---------------+-----------+
|     Simran|  Analytics|Data Analysis wit...|            100|          1|
|       Neha|Programming|         Java Basics|              0|          1|
|       Zara|         AI|Machine Learning 101|             60|          1|
|     Aakash|  Analytics| Power BI Essentials|             30|          1|
|    Ibrahim|Programming|Python for Beginners|             90|          1|
|     Aditya|Programming|Python for Beginners|             80|          2|
+-----------+-----------+--------------------+---------------+-----------+



12. Get lead and lag Of EnrollDate by category.

In [0]:
from pyspark.sql.functions import lead, lag

enrollment_window = Window.partitionBy("Category").orderBy("EnrollDate")

lead_lag_df = course_df.withColumn("PreviousDate", lag("EnrollDate").over(enrollment_window)) \
                       .withColumn("NextDate", lead("EnrollDate").over(enrollment_window)) \

lead_lag_df.select( 
                    'Category',
                    'EnrollDate',
                    'PreviousDate',
                    'NextDate'
).show()



+-----------+----------+------------+----------+
|   Category|EnrollDate|PreviousDate|  NextDate|
+-----------+----------+------------+----------+
|         AI|2024-05-17|        NULL|      NULL|
|  Analytics|2024-05-12|        NULL|2024-05-13|
|  Analytics|2024-05-13|  2024-05-12|      NULL|
|Programming|2024-05-10|        NULL|2024-05-15|
|Programming|2024-05-15|  2024-05-10|2024-05-18|
|Programming|2024-05-18|  2024-05-15|      NULL|
+-----------+----------+------------+----------+



####  Pivoting & Formatting

13. Pivot data to show total enrollments by Category and Status

In [0]:
course_df.groupBy("Category") \
         .pivot("Status") \
         .count() \
         .show()



+-----------+------+---------+--------+
|   Category|Active|Completed|Inactive|
+-----------+------+---------+--------+
|Programming|     1|        1|       1|
|  Analytics|     1|        1|    NULL|
|         AI|     1|     NULL|    NULL|
+-----------+------+---------+--------+



14. Extract year and month from EnrollDate 

In [0]:
from pyspark.sql.functions import year, month, monthname
course_df = course_df.withColumn('Year', year('EnrollDate')) \
                     .withColumn('Month', month('EnrollDate')) \
                     .withColumn('MonthName', monthname('EnrollDate'))

course_df.select(
                    'EnrollmentID',
                    'StudentName',
                    'Category',
                    'CourseName',
                    'Year',
                    'Month',
                    'MonthName',
                    'EnrollDate'
).show()

+------------+-----------+-----------+--------------------+----+-----+---------+----------+
|EnrollmentID|StudentName|   Category|          CourseName|Year|Month|MonthName|EnrollDate|
+------------+-----------+-----------+--------------------+----+-----+---------+----------+
|      ENR001|     Aditya|Programming|Python for Beginners|2024|    5|      May|2024-05-10|
|      ENR002|     Simran|  Analytics|Data Analysis wit...|2024|    5|      May|2024-05-12|
|      ENR003|     Aakash|  Analytics| Power BI Essentials|2024|    5|      May|2024-05-13|
|      ENR004|       Neha|Programming|         Java Basics|2024|    5|      May|2024-05-15|
|      ENR005|       Zara|         AI|Machine Learning 101|2024|    5|      May|2024-05-17|
|      ENR006|    Ibrahim|Programming|Python for Beginners|2024|    5|      May|2024-05-18|
+------------+-----------+-----------+--------------------+----+-----+---------+----------+



#### Cleaning and Deduplication

15.  Drop rows where Status is null or empty.

In [0]:
course_df = course_df.filter((col('Status').isNotNull()) & (trim(col('Status')) != ''))

course_df.select(
                    'EnrollmentID',
                    'StudentName',
                    'CourseName',
                    'Category',
                    'Status'
).show()

+------------+-----------+--------------------+-----------+---------+
|EnrollmentID|StudentName|          CourseName|   Category|   Status|
+------------+-----------+--------------------+-----------+---------+
|      ENR002|     Simran|Data Analysis wit...|  Analytics|Completed|
|      ENR004|       Neha|         Java Basics|Programming| Inactive|
|      ENR005|       Zara|Machine Learning 101|         AI|   Active|
|      ENR001|     Aditya|Python for Beginners|Programming|   Active|
|      ENR003|     Aakash| Power BI Essentials|  Analytics|   Active|
|      ENR006|    Ibrahim|Python for Beginners|Programming|Completed|
+------------+-----------+--------------------+-----------+---------+



16. Remove duplicate enrollments using 
dropDuplicates() .

In [0]:
course_df = course_df.dropDuplicates(['EnrollmentID'])
course_df.select(
                    'EnrollmentID',
                    'StudentName',
                    'CourseName',
                    'Category'
).show()

+------------+-----------+--------------------+-----------+
|EnrollmentID|StudentName|          CourseName|   Category|
+------------+-----------+--------------------+-----------+
|      ENR002|     Simran|Data Analysis wit...|  Analytics|
|      ENR004|       Neha|         Java Basics|Programming|
|      ENR005|       Zara|Machine Learning 101|         AI|
|      ENR001|     Aditya|Python for Beginners|Programming|
|      ENR003|     Aakash| Power BI Essentials|  Analytics|
|      ENR006|    Ibrahim|Python for Beginners|Programming|
+------------+-----------+--------------------+-----------+



####  Export


17. Write the final cleaned DataFrame to: \
 CSV (overwrite mode) \
 JSON (overwrite mode) \
 Parquet (snappy compression)

Writting in CSV Format

In [0]:
course_df.write.mode('overwrite') \
               .format('csv') \
               .option('header', 'true') \
               .save('/Volumes/shared/default/courseenrollments/cleaned_course.csv')

Writting as a Json format

In [0]:
course_df.write.mode('overwrite') \
               .format('json') \
               .save('/Volumes/shared/default/courseenrollments/cleaned_course.json')

Writting as a Parquet format

In [0]:
course_df.write.mode('overwrite') \
               .format('parquet') \
               .save('/Volumes/shared/default/courseenrollments/cleaned_course_parquet')

Displaying all the files

In [0]:
files = dbutils.fs.ls('/Volumes/shared/default/courseenrollments/')
for row in files:
    print(row.path)

dbfs:/Volumes/shared/default/courseenrollments/cleaned_course.csv/
dbfs:/Volumes/shared/default/courseenrollments/cleaned_course.json/
dbfs:/Volumes/shared/default/courseenrollments/cleaned_course_parquet/
dbfs:/Volumes/shared/default/courseenrollments/course_enrollments.csv
