In [0]:
from pyspark.sql import SparkSession

# getOrCreate() returns the already-active session when there is one, so
# re-running this cell does not start a second session.
spark = (
    SparkSession.builder
    .appName("course")
    .getOrCreate()
)

# Load Cleaned Data

In [0]:

# Week 4 - Task 1: Load cleaned data


def _read_csv_with_header(path):
    """Read the CSV at ``path``, taking column names from its first row.

    No schema is supplied or inferred, so every column is loaded as a string.
    """
    return spark.read.csv(path, header=True)


# Replace with your actual path if different
students_df = _read_csv_with_header("file:/Workspace/Shared/students.csv")
courses_df = _read_csv_with_header("file:/Workspace/Shared/courses.csv")
enrollments_df = _read_csv_with_header("file:/Workspace/Shared/enrollments.csv")
progress_df = _read_csv_with_header("file:/Workspace/Shared/progress.csv")


# Check Schemas and Preview

In [0]:

# Print the schema followed by the first five rows, students first, then courses.
for preview_df in (students_df, courses_df):
    preview_df.printSchema()
    preview_df.show(5)


root
 |-- student_id: string (nullable = true)
 |-- student_name: string (nullable = true)

+----------+------------+
|student_id|student_name|
+----------+------------+
|      S008|       Alice|
|      S017|         Bob|
|      S009|       Carol|
|      S018|       David|
|      S002|         Eve|
+----------+------------+

root
 |-- course_id: string (nullable = true)
 |-- course_name: string (nullable = true)

+---------+---------------+
|course_id|    course_name|
+---------+---------------+
|     C001|         Python|
|     C002|   Data Science|
|     C004|      AI Basics|
|     C005|Cloud Computing|
+---------+---------------+



# Join DataFrames


In [0]:
 
# Enrich enrollments with student and course attributes in a single chain.
# Both joins are inner, so an enrollment row is kept only when its student_id
# and its course_id each find a match.
joined_df = (
    enrollments_df
    .join(students_df, on="student_id", how="inner")
    .join(courses_df, on="course_id", how="inner")
)

# Attach progress per (student_id, course_id). The left join keeps
# enrollments that have no matching progress row.
final_df = joined_df.join(
    progress_df,
    on=["student_id", "course_id"],
    how="left",
)


# Select Required Columns

In [0]:
from pyspark.sql.functions import col

# Project the joined data down to these four columns, in this order.
selected_columns = ["student_name", "course_name", "enrollment_date", "progress"]
result_df = final_df.select(*[col(column_name) for column_name in selected_columns])



# Save Final Table as Delta & CSV

In [0]:

# Persist the result as a Delta table, replacing anything already at the path.
(
    result_df.write
    .format("delta")
    .mode("overwrite")
    .save("dbfs:/FileStore/final_course_progress_delta")
)

# Persist the same result as CSV with a header row, also replacing prior output.
(
    result_df.write
    .mode("overwrite")
    .option("header", True)
    .csv("dbfs:/FileStore/final_course_progress_csv")
)
