#DATASET 1 — USER MASTER (CORRUPTED SCHEMA)

In [None]:
raw_users = [
("U001","Amit","28","Hyderabad","AI,ML,Cloud"),
("U002","Neha","Thirty","Delhi","Testing"),
("U003","Ravi",None,"Bangalore",["Data","Spark"]),
("U004","Pooja","29","Mumbai","AI|ML"),
("U005","", "31","Chennai",None)
]

#DATASET 2 — COURSE CATALOG

In [13]:
raw_courses = [
("C001","PySpark Mastery","Data Engineering","Advanced","₹9999"),
("C002","AI for Testers","QA","Beginner","8999"),
("C003","ML Foundations","AI","Intermediate",None),
("C004","Data Engineering Bootcamp","Data","Advanced","₹14999")
]

#DATASET 3 — USER ENROLLMENTS

In [None]:
raw_enrollments = [
("U001","C001","2024-01-05"),
("U002","C002","05/01/2024"),
("U003","C001","2024/01/06"),
("U004","C003","invalid_date"),
("U001","C004","2024-01-10"),
("U005","C002","2024-01-12")
]

#DATASET 4 — USER ACTIVITY LOGS

In [None]:
raw_activity = [
("U001","login,watch,logout","{'device':'mobile'}",120),
("U002",["login","watch"],"device=laptop",90),
("U003","login|logout",None,30),
("U004",None,"{'device':'tablet'}",60),
("U005","login","{'device':'mobile'}",15)
]

PART A — DATA CLEANING & STRUCTURING

1. Design explicit schemas for all datasets
2. Normalize data types (age, price, dates)
3. Convert skills and actions into arrays
4. Handle missing and invalid records gracefully
5. Produce clean DataFrames:

users_df

courses_df

enrollments_df

activity_df

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window

spark = SparkSession.builder.appName("LearninPlatform").getOrCreate()

In [None]:
#1

In [9]:
raw_users = [
("U001","Amit","28","Hyderabad","AI,ML,Cloud"),
("U002","Neha","Thirty","Delhi","Testing"),
("U003","Ravi",None,"Bangalore",["Data","Spark"]),
("U004","Pooja","29","Mumbai","AI|ML"),
("U005","", "31","Chennai",None)
]
user_schema = StructType([
StructField("user_id", StringType(), False),
StructField("name", StringType(), True),
StructField("age_raw", StringType(), True),
StructField("city", StringType(), True),
StructField("skills",StringType(), True)
])
#users_df = spark.createDataFrame(raw_users, user_schema)


In [15]:
raw_courses = [
("C001","PySpark Mastery","Data Engineering","Advanced","₹9999"),
("C002","AI for Testers","QA","Beginner","8999"),
("C003","ML Foundations","AI","Intermediate",None),
("C004","Data Engineering Bootcamp","Data","Advanced","₹14999")
]


course_schema = StructType([
    StructField("course_id",StringType(),True),
    StructField("course_name",StringType(),True),
    StructField("domain",StringType(),True),
    StructField("level",StringType(),True),
    StructField("price_raw",StringType(),True)
])

In [16]:
raw_enrollments = [
("U001","C001","2024-01-05"),
("U002","C002","05/01/2024"),
("U003","C001","2024/01/06"),
("U004","C003","invalid_date"),
("U001","C004","2024-01-10"),
("U005","C002","2024-01-12")
]


enroll_schema = StructType([
    StructField("user_id",StringType(),True),
    StructField("course_id",StringType(),True),
    StructField("enroll_date_raw",StringType(),True)
])

In [17]:
raw_activity = [
("U001","login,watch,logout","{'device':'mobile'}",120),
("U002",["login","watch"],"device=laptop",90),
("U003","login|logout",None,30),
("U004",None,"{'device':'tablet'}",60),
("U005","login","{'device':'mobile'}",15)
]


activity_schema = StructType([
    StructField("user_id",StringType(),True),
    StructField("activity_raw",StringType(),True),
    StructField("metadata_raw",StringType(),True),
    StructField("time_spent",IntegerType(),True)
])

In [None]:
#2

In [None]:
"""users_df = spark.createDataFrame(raw_users, user_schema)
users_df = users_df\
.withColumn("age",
            when(col("age").rlike("^[0-9]+$"),
                 col("age").cast(IntegerType()))
            .otherwise(None)
           )\
.withColumn("name",
            when(trim(col("name")) == "", None)
            .otherwise(col("name")))"""

In [22]:
users_df = spark.createDataFrame(raw_users, user_schema) \
.withColumn("age",
    when(col("age_raw").rlike("^[0-9]+$"), col("age_raw").cast("int"))
    .otherwise(lit(None).cast(IntegerType())) # Handle non-numeric age_raw gracefully
) \
.withColumn("skills",
    when(col("skills").isNotNull(),
         # Clean up skills string: remove leading/trailing brackets and replace '|' with ','
         # Then split by ',' to form an array
         split(
            regexp_replace(
                regexp_replace(col("skills"), r"^\[|\]$", ""), # Remove potential leading/trailing brackets using raw string
                "[|]", ","
            ),
            ","
         )
    ).otherwise(array().cast(ArrayType(StringType()))) # Handle None skills by creating an empty array of string type
) \
.withColumn("name",
    when(col("name") == "", None).otherwise(col("name"))
) \
.drop("age_raw") # Drop age_raw, skills_raw does not exist
users_df.printSchema()
users_df.show()


root
 |-- user_id: string (nullable = false)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- skills: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- age: integer (nullable = true)

+-------+-----+---------+---------------+----+
|user_id| name|     city|         skills| age|
+-------+-----+---------+---------------+----+
|   U001| Amit|Hyderabad|[AI, ML, Cloud]|  28|
|   U002| Neha|    Delhi|      [Testing]|NULL|
|   U003| Ravi|Bangalore| [Data,  Spark]|NULL|
|   U004|Pooja|   Mumbai|       [AI, ML]|  29|
|   U005| NULL|  Chennai|             []|  31|
+-------+-----+---------+---------------+----+



In [None]:
#3

In [21]:
# The 'skills' column is already an ArrayType(StringType()) from previous transformations.
# This cell's logic is redundant and attempts to apply string operations on an array column.
# No action is needed here as the 'skills' column is already in the correct format.

"""users_df = users_df.withColumn(
    "skills",
    when(col("skills").isNull(), array())
   .when(col("skills").contains(","), split(col("skills"), ","))
   .when(col("skills").contains("|"), split(col("skills"), "\|"))
   .otherwise(array(col("skills")))
)"""

In [None]:
#4

In [23]:
courses_df = spark.createDataFrame(raw_courses, course_schema)


courses_df = courses_df.withColumn(
    "price_raw",
    regexp_replace(col("price_raw"), "₹", "")
)

In [24]:
courses_df = courses_df.withColumn(
    "price_raw",
    when(col("price_raw").rlike("^[0-9]+$"), col("price_raw").cast(IntegerType()))
    .otherwise(None)
)

In [25]:
enrollments_df = spark.createDataFrame(raw_enrollments, enroll_schema)


enroll_df=enrollments_df.withColumn("enroll_date",
                                    coalesce(
                                        to_date(try_to_timestamp(col("enroll_date_raw"),lit("yyyy-MM-dd"))),
                                        to_date(try_to_timestamp(col("enroll_date_raw"),lit("dd/MM/yyyy"))),
                                        to_date(try_to_timestamp(col("enroll_date_raw"),lit("yyyy/MM/dd")))
                                    )
                                   )
enrollments_df.show()

+-------+---------+---------------+
|user_id|course_id|enroll_date_raw|
+-------+---------+---------------+
|   U001|     C001|     2024-01-05|
|   U002|     C002|     05/01/2024|
|   U003|     C001|     2024/01/06|
|   U004|     C003|   invalid_date|
|   U001|     C004|     2024-01-10|
|   U005|     C002|     2024-01-12|
+-------+---------+---------------+



In [26]:
activity_raw_df = spark.createDataFrame(raw_activity, activity_schema)
activity_df = activity_raw_df\
.withColumn("actions",
            when(col("activity_raw").isNull(), array())
            .when(col("activity_raw").contains(","), split(col("activity_raw"), ","))
            .when(col("activity_raw").contains("|"), split(col("activity_raw"), "\|"))
            .otherwise(array(col("activity_raw"))))\
            .drop("activity_raw")
activity_df.show()

  .when(col("activity_raw").contains("|"), split(col("activity_raw"), "\|"))


+-------+-------------------+----------+--------------------+
|user_id|       metadata_raw|time_spent|             actions|
+-------+-------------------+----------+--------------------+
|   U001|{'device':'mobile'}|       120|[login, watch, lo...|
|   U002|      device=laptop|        90|   [[login,  watch]]|
|   U003|               NULL|        30|     [login, logout]|
|   U004|{'device':'tablet'}|        60|                  []|
|   U005|{'device':'mobile'}|        15|             [login]|
+-------+-------------------+----------+--------------------+



In [None]:
#5

In [27]:
users_df.printSchema()
users_df.show()

courses_df.printSchema()
courses_df.show()

enrollments_df.printSchema()
enrollments_df.show()

activity_df.printSchema()
activity_df.show()

root
 |-- user_id: string (nullable = false)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- skills: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- age: integer (nullable = true)

+-------+-----+---------+---------------+----+
|user_id| name|     city|         skills| age|
+-------+-----+---------+---------------+----+
|   U001| Amit|Hyderabad|[AI, ML, Cloud]|  28|
|   U002| Neha|    Delhi|      [Testing]|NULL|
|   U003| Ravi|Bangalore| [Data,  Spark]|NULL|
|   U004|Pooja|   Mumbai|       [AI, ML]|  29|
|   U005| NULL|  Chennai|             []|  31|
+-------+-----+---------+---------------+----+

root
 |-- course_id: string (nullable = true)
 |-- course_name: string (nullable = true)
 |-- domain: string (nullable = true)
 |-- level: string (nullable = true)
 |-- price_raw: integer (nullable = true)

+---------+--------------------+----------------+------------+---------+
|course_id|         course_name|          domain|       l

PART B — DATA INTEGRATION (JOINS)

6. Join users with enrollments
7. Join enrollments with courses
8. Decide which table(s) should be broadcast
9. Justify your decision using explain(True)
10. Eliminate orphan records

In [28]:
#6
user_enroll_df  = enrollments_df.join(users_df, "user_id", "left")
user_enroll_df.show()

+-------+---------+---------------+-----+---------+---------------+----+
|user_id|course_id|enroll_date_raw| name|     city|         skills| age|
+-------+---------+---------------+-----+---------+---------------+----+
|   U002|     C002|     05/01/2024| Neha|    Delhi|      [Testing]|NULL|
|   U003|     C001|     2024/01/06| Ravi|Bangalore| [Data,  Spark]|NULL|
|   U001|     C001|     2024-01-05| Amit|Hyderabad|[AI, ML, Cloud]|  28|
|   U004|     C003|   invalid_date|Pooja|   Mumbai|       [AI, ML]|  29|
|   U005|     C002|     2024-01-12| NULL|  Chennai|             []|  31|
|   U001|     C004|     2024-01-10| Amit|Hyderabad|[AI, ML, Cloud]|  28|
+-------+---------+---------------+-----+---------+---------------+----+



In [29]:
#7
full_df = user_enroll_df.join(broadcast(courses_df), "course_id", "inner")
full_df.show()

+---------+-------+---------------+-----+---------+---------------+----+--------------------+----------------+------------+---------+
|course_id|user_id|enroll_date_raw| name|     city|         skills| age|         course_name|          domain|       level|price_raw|
+---------+-------+---------------+-----+---------+---------------+----+--------------------+----------------+------------+---------+
|     C002|   U002|     05/01/2024| Neha|    Delhi|      [Testing]|NULL|      AI for Testers|              QA|    Beginner|     8999|
|     C001|   U003|     2024/01/06| Ravi|Bangalore| [Data,  Spark]|NULL|     PySpark Mastery|Data Engineering|    Advanced|     9999|
|     C001|   U001|     2024-01-05| Amit|Hyderabad|[AI, ML, Cloud]|  28|     PySpark Mastery|Data Engineering|    Advanced|     9999|
|     C003|   U004|   invalid_date|Pooja|   Mumbai|       [AI, ML]|  29|      ML Foundations|              AI|Intermediate|     NULL|
|     C002|   U005|     2024-01-12| NULL|  Chennai|           

In [30]:
#8
# Based on the previous analysis, `users_df` is a small table and a good candidate for broadcasting.
# Explicitly broadcast users_df when joining with enrollments_df.
user_enroll_df  = enrollments_df.join(broadcast(users_df), "user_id", "left")
user_enroll_df.show()

+-------+---------+---------------+-----+---------+---------------+----+
|user_id|course_id|enroll_date_raw| name|     city|         skills| age|
+-------+---------+---------------+-----+---------+---------------+----+
|   U001|     C001|     2024-01-05| Amit|Hyderabad|[AI, ML, Cloud]|  28|
|   U002|     C002|     05/01/2024| Neha|    Delhi|      [Testing]|NULL|
|   U003|     C001|     2024/01/06| Ravi|Bangalore| [Data,  Spark]|NULL|
|   U004|     C003|   invalid_date|Pooja|   Mumbai|       [AI, ML]|  29|
|   U001|     C004|     2024-01-10| Amit|Hyderabad|[AI, ML, Cloud]|  28|
|   U005|     C002|     2024-01-12| NULL|  Chennai|             []|  31|
+-------+---------+---------------+-----+---------+---------------+----+



In [31]:
#9
full_df.explain(True)

== Parsed Logical Plan ==
'Join UsingJoin(Inner, [course_id])
:- Project [user_id#87, course_id#88, enroll_date_raw#89, name#63, city#59, skills#62, age#61]
:  +- Join LeftOuter, (user_id#87 = user_id#56)
:     :- LogicalRDD [user_id#87, course_id#88, enroll_date_raw#89], false
:     +- Project [user_id#56, name#63, city#59, skills#62, age#61]
:        +- Project [user_id#56, CASE WHEN (name#57 = ) THEN cast(null as string) ELSE name#57 END AS name#63, age_raw#58, city#59, skills#62, age#61]
:           +- Project [user_id#56, name#57, age_raw#58, city#59, CASE WHEN isnotnull(skills#60) THEN split(regexp_replace(regexp_replace(skills#60, ^\[|\]$, , 1), [|], ,, 1), ,, -1) ELSE cast(array() as array<string>) END AS skills#62, age#61]
:              +- Project [user_id#56, name#57, age_raw#58, city#59, skills#60, CASE WHEN RLIKE(age_raw#58, ^[0-9]+$) THEN cast(age_raw#58 as int) ELSE cast(null as int) END AS age#61]
:                 +- LogicalRDD [user_id#56, name#57, age_raw#58, city#59

In [32]:
#10
full_df = full_df.withColumn("enroll_date",
                                    coalesce(
                                        to_date(try_to_timestamp(col("enroll_date_raw"),lit("yyyy-MM-dd"))),
                                        to_date(try_to_timestamp(col("enroll_date_raw"),lit("dd/MM/yyyy"))),
                                        to_date(try_to_timestamp(col("enroll_date_raw"),lit("yyyy/MM/dd")))
                                    )
                                   )
full_df = full_df.drop("enroll_date_raw")
full_df.show()
full_df.printSchema()

+---------+-------+-----+---------+---------------+----+--------------------+----------------+------------+---------+-----------+
|course_id|user_id| name|     city|         skills| age|         course_name|          domain|       level|price_raw|enroll_date|
+---------+-------+-----+---------+---------------+----+--------------------+----------------+------------+---------+-----------+
|     C002|   U002| Neha|    Delhi|      [Testing]|NULL|      AI for Testers|              QA|    Beginner|     8999| 2024-01-05|
|     C001|   U003| Ravi|Bangalore| [Data,  Spark]|NULL|     PySpark Mastery|Data Engineering|    Advanced|     9999| 2024-01-06|
|     C001|   U001| Amit|Hyderabad|[AI, ML, Cloud]|  28|     PySpark Mastery|Data Engineering|    Advanced|     9999| 2024-01-05|
|     C003|   U004|Pooja|   Mumbai|       [AI, ML]|  29|      ML Foundations|              AI|Intermediate|     NULL|       NULL|
|     C002|   U005| NULL|  Chennai|             []|  31|      AI for Testers|             

In [33]:
full_df = full_df.filter(col("name").isNotNull() & col("enroll_date").isNotNull())
full_df.show()
full_df.printSchema()

+---------+-------+----+---------+---------------+----+--------------------+----------------+--------+---------+-----------+
|course_id|user_id|name|     city|         skills| age|         course_name|          domain|   level|price_raw|enroll_date|
+---------+-------+----+---------+---------------+----+--------------------+----------------+--------+---------+-----------+
|     C001|   U001|Amit|Hyderabad|[AI, ML, Cloud]|  28|     PySpark Mastery|Data Engineering|Advanced|     9999| 2024-01-05|
|     C004|   U001|Amit|Hyderabad|[AI, ML, Cloud]|  28|Data Engineering ...|            Data|Advanced|    14999| 2024-01-10|
|     C002|   U002|Neha|    Delhi|      [Testing]|NULL|      AI for Testers|              QA|Beginner|     8999| 2024-01-05|
|     C001|   U003|Ravi|Bangalore| [Data,  Spark]|NULL|     PySpark Mastery|Data Engineering|Advanced|     9999| 2024-01-06|
+---------+-------+----+---------+---------------+----+--------------------+----------------+--------+---------+-----------+


PART C — ANALYTICS & AGGREGATIONS

11. Total enrollments per course
12. Total revenue per course
13. Average engagement time per course
14. Total courses enrolled per user
15. Identify users with zero activity

In [34]:
#11
enroll_count = full_df.groupBy("course_id", "course_name").count()
enroll_count.show()

#12
revenue_df = full_df.groupBy("course_id", "course_name").agg(sum("price_raw").alias("total_revenue"))
revenue_df.show()

#13
course_activity = full_df.join(activity_df, "user_id", "left")
avg_time_df = course_activity.groupBy("course_id").agg(avg("time_spent").alias("avg_engagement_time"))
avg_time_df.show()

#14
course_per_user = full_df.groupBy("user_id").count()
course_per_user.show()

#15
inactive_users = users_df.join(activity_df, "user_id", "left_anti")
inactive_users.show()

+---------+--------------------+-----+
|course_id|         course_name|count|
+---------+--------------------+-----+
|     C004|Data Engineering ...|    1|
|     C001|     PySpark Mastery|    2|
|     C002|      AI for Testers|    1|
+---------+--------------------+-----+

+---------+--------------------+-------------+
|course_id|         course_name|total_revenue|
+---------+--------------------+-------------+
|     C004|Data Engineering ...|        14999|
|     C001|     PySpark Mastery|        19998|
|     C002|      AI for Testers|         8999|
+---------+--------------------+-------------+

+---------+-------------------+
|course_id|avg_engagement_time|
+---------+-------------------+
|     C004|              120.0|
|     C001|               75.0|
|     C002|               90.0|
+---------+-------------------+

+-------+-----+
|user_id|count|
+-------+-----+
|   U001|    2|
|   U002|    1|
|   U003|    1|
+-------+-----+

+-------+----+----+------+---+
|user_id|name|city|skills|a

PART D — WINDOW FUNCTIONS

16. Rank users by total time spent
17. Calculate running revenue per course by enrollment date
18. Identify top 2 users per course by engagement
19. Compare GroupBy vs Window results for at least one metric

In [35]:
#16
from pyspark.sql.window import Window

user_time = activity_df.groupBy("user_id").agg(sum("time_spent").alias("total_time"))
window_rank = Window.orderBy(col("total_time").desc())
ranked_users = user_time.withColumn("rank", rank().over(window_rank)).withColumn("dense_rank", dense_rank().over(window_rank))
ranked_users.show()

+-------+----------+----+----------+
|user_id|total_time|rank|dense_rank|
+-------+----------+----+----------+
|   U001|       120|   1|         1|
|   U002|        90|   2|         2|
|   U004|        60|   3|         3|
|   U003|        30|   4|         4|
|   U005|        15|   5|         5|
+-------+----------+----+----------+



In [36]:
#17
window_course = Window.partitionBy("course_id").orderBy("enroll_date")\
.rowsBetween(Window.unboundedPreceding, Window.currentRow)
running_revenue = full_df.withColumn("running_revenue", sum("price_raw")\
                                     .over(window_course))
running_revenue.show()

+---------+-------+----+---------+---------------+----+--------------------+----------------+--------+---------+-----------+---------------+
|course_id|user_id|name|     city|         skills| age|         course_name|          domain|   level|price_raw|enroll_date|running_revenue|
+---------+-------+----+---------+---------------+----+--------------------+----------------+--------+---------+-----------+---------------+
|     C001|   U001|Amit|Hyderabad|[AI, ML, Cloud]|  28|     PySpark Mastery|Data Engineering|Advanced|     9999| 2024-01-05|           9999|
|     C001|   U003|Ravi|Bangalore| [Data,  Spark]|NULL|     PySpark Mastery|Data Engineering|Advanced|     9999| 2024-01-06|          19998|
|     C002|   U002|Neha|    Delhi|      [Testing]|NULL|      AI for Testers|              QA|Beginner|     8999| 2024-01-05|           8999|
|     C004|   U001|Amit|Hyderabad|[AI, ML, Cloud]|  28|Data Engineering ...|            Data|Advanced|    14999| 2024-01-10|          14999|
+---------+--

In [37]:
#18
window_eng = Window.partitionBy("course_id").orderBy(desc("time_spent"))
top_users = course_activity.withColumn(
    "rank",
    dense_rank().over(window_eng)
).filter(col("rank") <= 2)
top_users.show()

+-------+---------+----+---------+---------------+----+--------------------+----------------+--------+---------+-----------+-------------------+----------+--------------------+----+
|user_id|course_id|name|     city|         skills| age|         course_name|          domain|   level|price_raw|enroll_date|       metadata_raw|time_spent|             actions|rank|
+-------+---------+----+---------+---------------+----+--------------------+----------------+--------+---------+-----------+-------------------+----------+--------------------+----+
|   U001|     C001|Amit|Hyderabad|[AI, ML, Cloud]|  28|     PySpark Mastery|Data Engineering|Advanced|     9999| 2024-01-05|{'device':'mobile'}|       120|[login, watch, lo...|   1|
|   U003|     C001|Ravi|Bangalore| [Data,  Spark]|NULL|     PySpark Mastery|Data Engineering|Advanced|     9999| 2024-01-06|               NULL|        30|     [login, logout]|   2|
|   U002|     C002|Neha|    Delhi|      [Testing]|NULL|      AI for Testers|              

In [38]:
#19
print("Comparison of GroupBy vs Window Function for Total Time Spent per User:")

user_time.show()

window_spec_user = Window.partitionBy("user_id")
total_time_window = activity_df.withColumn(
    "total_time_window", sum("time_spent").over(window_spec_user)
).select("user_id", "total_time_window").distinct()
total_time_window.show()

comparison_df = user_time.join(total_time_window, "user_id", "inner")
print("Results should be identical for total_time and total_time_window:")
comparison_df.show()

Comparison of GroupBy vs Window Function for Total Time Spent per User:
+-------+----------+
|user_id|total_time|
+-------+----------+
|   U002|        90|
|   U001|       120|
|   U004|        60|
|   U005|        15|
|   U003|        30|
+-------+----------+

+-------+-----------------+
|user_id|total_time_window|
+-------+-----------------+
|   U001|              120|
|   U002|               90|
|   U003|               30|
|   U004|               60|
|   U005|               15|
+-------+-----------------+

Results should be identical for total_time and total_time_window:
+-------+----------+-----------------+
|user_id|total_time|total_time_window|
+-------+----------+-----------------+
|   U001|       120|              120|
|   U002|        90|               90|
|   U003|        30|               30|
|   U004|        60|               60|
|   U005|        15|               15|
+-------+----------+-----------------+



PART E — UDF (ONLY IF REQUIRED)

20. Classify users into engagement levels:

High
Medium
Low

Rules:

Use built-in functions where possible
Use UDF only if unavoidable
Explain why UDF was needed (or avoided)

In [40]:
df = user_time.withColumn("engagement_levels", when(col("total_time") >= 80, "High")
              .when(col("total_time") >= 50, "Medium")
              .otherwise("Low"))
df.show()

+-------+----------+-----------------+
|user_id|total_time|engagement_levels|
+-------+----------+-----------------+
|   U002|        90|             High|
|   U001|       120|             High|
|   U004|        60|           Medium|
|   U005|        15|              Low|
|   U003|        30|              Low|
+-------+----------+-----------------+



UDF was avoided for better performance and catalyst optimization.

PART F — SORTING & ORDERING

21. Sort courses by total revenue (descending)
22. Sort users by engagement within each city
23. Explain why sorting caused a shuffle

In [42]:
#21
revenue_df.orderBy(desc("total_revenue")).show()

#22
window_city = Window.partitionBy("city").orderBy(desc("total_time"))
city_rank = users_df.join(user_time, "user_id", "left")\
.withColumn("rank", dense_rank().over(window_city))
city_rank.show()

+---------+--------------------+-------------+
|course_id|         course_name|total_revenue|
+---------+--------------------+-------------+
|     C001|     PySpark Mastery|        19998|
|     C004|Data Engineering ...|        14999|
|     C002|      AI for Testers|         8999|
+---------+--------------------+-------------+

+-------+-----+---------+---------------+----+----------+----+
|user_id| name|     city|         skills| age|total_time|rank|
+-------+-----+---------+---------------+----+----------+----+
|   U003| Ravi|Bangalore| [Data,  Spark]|NULL|        30|   1|
|   U005| NULL|  Chennai|             []|  31|        15|   1|
|   U002| Neha|    Delhi|      [Testing]|NULL|        90|   1|
|   U001| Amit|Hyderabad|[AI, ML, Cloud]|  28|       120|   1|
|   U004|Pooja|   Mumbai|       [AI, ML]|  29|        60|   1|
+-------+-----+---------+---------------+----+----------+----+



In [43]:
#23
print("Explaining why sorting causes a shuffle:")
revenue_df.orderBy(desc("total_revenue")).explain(True)

# Explanation:
# In the output of explain(True), look for a stage like 'Exchange' (e.g., 'Exchange rangepartitioning').
# This 'Exchange' operation signifies a shuffle. When you perform a global sort (like orderBy without a partitionBy),
# Spark needs to ensure that the entire dataset is ordered. To do this, it might need to redistribute data
# across all its partitions so that records with smaller values of the sort key come before records with larger values.
# This redistribution of data across the network to different executors is precisely what a shuffle is.
# For example, if you have data spread across multiple machines and you want to sort it globally, each machine
# needs to know the range of values held by other machines to decide where its own records should go.
# This coordination and data movement is a shuffle.

Explaining why sorting causes a shuffle:
== Parsed Logical Plan ==
'Sort ['total_revenue DESC NULLS LAST], true
+- Aggregate [course_id#88, course_name#81], [course_id#88, course_name#81, sum(price_raw#86) AS total_revenue#346L]
   +- Filter (isnotnull(name#63) AND isnotnull(enroll_date#252))
      +- Project [course_id#88, user_id#87, name#63, city#59, skills#62, age#61, course_name#81, domain#82, level#83, price_raw#86, enroll_date#252]
         +- Project [course_id#88, user_id#87, enroll_date_raw#89, name#63, city#59, skills#62, age#61, course_name#81, domain#82, level#83, price_raw#86, coalesce(to_date(try_to_timestamp(enroll_date_raw#89, Some(yyyy-MM-dd), TimestampType, Some(Etc/UTC), false), None, Some(Etc/UTC), true), to_date(try_to_timestamp(enroll_date_raw#89, Some(dd/MM/yyyy), TimestampType, Some(Etc/UTC), false), None, Some(Etc/UTC), true), to_date(try_to_timestamp(enroll_date_raw#89, Some(yyyy/MM/dd), TimestampType, Some(Etc/UTC), false), None, Some(Etc/UTC), true)) AS enr

PART G — SET OPERATIONS

Create two DataFrames:

Users who enrolled

Users who completed activity

24. Find users who enrolled but never became active
25. Find users who are both enrolled and active
26. Explain why set operations are different from joins

In [44]:
enrolled_users = enrollments_df.select("user_id").distinct()
activity_users = activity_df.select("user_id").distinct()

In [45]:
#24
never_active = enrolled_users.subtract(activity_users)
never_active.show()

#25
active_and_enrolled = enrolled_users.intersect(activity_users)
active_and_enrolled.show()

+-------+
|user_id|
+-------+
+-------+

+-------+
|user_id|
+-------+
|   U004|
|   U005|
|   U002|
|   U003|
|   U001|
+-------+



26.
Key Differences

- Set operations combine rows based on content and schema similarity (like merging lists). They discard non-matching columns.
- Join operations combine columns from different DataFrames based on a common key, enriching rows with related information. They preserve all columns from matched records.
- Set operations generally require schemas to be identical, while joins can work with different schemas as long as there are common keys.

PART H — DAG & PERFORMANCE ANALYSIS

27. For at least three operations, run explain(True)
28. Identify:

Shuffles

Broadcast joins

Sort operations

29. Suggest one performance improvement

In [46]:
print("\n--- Explanation for users_df after transformations ---")
users_df.explain(True)

print("\n--- Explanation for full_df after joins and cleaning ---")
full_df.explain(True)

print("\n--- Explanation for running_revenue (Window Function) ---")
running_revenue.explain(True)


--- Explanation for users_df after transformations ---
== Parsed Logical Plan ==
Project [user_id#56, name#63, city#59, skills#62, age#61]
+- Project [user_id#56, CASE WHEN (name#57 = ) THEN cast(null as string) ELSE name#57 END AS name#63, age_raw#58, city#59, skills#62, age#61]
   +- Project [user_id#56, name#57, age_raw#58, city#59, CASE WHEN isnotnull(skills#60) THEN split(regexp_replace(regexp_replace(skills#60, ^\[|\]$, , 1), [|], ,, 1), ,, -1) ELSE cast(array() as array<string>) END AS skills#62, age#61]
      +- Project [user_id#56, name#57, age_raw#58, city#59, skills#60, CASE WHEN RLIKE(age_raw#58, ^[0-9]+$) THEN cast(age_raw#58 as int) ELSE cast(null as int) END AS age#61]
         +- LogicalRDD [user_id#56, name#57, age_raw#58, city#59, skills#60], false

== Analyzed Logical Plan ==
user_id: string, name: string, city: string, skills: array<string>, age: int
Project [user_id#56, name#63, city#59, skills#62, age#61]
+- Project [user_id#56, CASE WHEN (name#57 = ) THEN cast(n

28

Shuffle: groupBy, sort

Broadcast: courses join

Sort: Window, OrderBy

In [47]:
#29
activity_df.cache()
#cache activity_df as it was used multiple times

DataFrame[user_id: string, metadata_raw: string, time_spent: int, actions: array<string>]