In [3]:

from pyspark.sql import SparkSession
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

spark = (SparkSession.builder
         .appName("LearningPlatform-Analytics")
         .config("spark.sql.shuffle.partitions", "8")  # small for Colab
         .config("spark.driver.memory", "2g")
         .getOrCreate())


In [8]:

# RAW datasets (as in the assignment)
raw_users = [
    ("U001","Amit","28","Hyderabad","AI,ML,Cloud"),
    ("U002","Neha","Thirty","Delhi","Testing"),
    ("U003","Ravi",None,"Bangalore",["Data","Spark"]),
    ("U004","Pooja","29","Mumbai","AI\nML"),
    ("U005","", "31","Chennai",None)
]

raw_courses = [
    ("C001","PySpark Mastery","Data Engineering","Advanced","₹9999"),
    ("C002","AI for Testers","QA","Beginner","8999"),
    ("C003","ML Foundations","AI","Intermediate",None),
    ("C004","Data Engineering Bootcamp","Data","Advanced","₹14999")
]

raw_enrollments = [
    ("U001","C001","2024-01-05"),
    ("U002","C002","05/01/2024"),
    ("U003","C001","2024/01/06"),
    ("U004","C003","invalid_date"),
    ("U001","C004","2024-01-10"),
    ("U005","C002","2024-01-12")
]

raw_activity = [
    ("U001","login,watch,logout","{'device':'mobile'}",120),
    ("U002",["login","watch"],"device=laptop",90),
    ("U003","login\nlogout",None,30),
    ("U004",None,"{'device':'tablet'}",60),
    ("U005","login","{'device':'mobile'}",15)
]

# EXPLICIT STAGING SCHEMAS (string-friendly)
users_staging_schema = StructType([
    StructField('user_id', StringType(), False),
    StructField('name', StringType(), True),
    StructField('age_raw', StringType(), True),
    StructField('city', StringType(), True),
    StructField('skills_raw', StringType(), True),
])

courses_staging_schema = StructType([
    StructField('course_id', StringType(), False),
    StructField('title', StringType(), True),
    StructField('category', StringType(), True),
    StructField('level', StringType(), True),
    StructField('price_raw', StringType(), True),
])

enrollments_staging_schema = StructType([
    StructField('user_id', StringType(), False),
    StructField('course_id', StringType(), False),
    StructField('enroll_date_raw', StringType(), True),
])

activity_staging_schema = StructType([
    StructField('user_id', StringType(), False),
    StructField('actions_raw', StringType(), True),
    StructField('metadata_raw', StringType(), True),
    StructField('time_spent', IntegerType(), True),
])

# Normalize list to string for staging where needed
raw_users_norm = []
for u in raw_users:
    skills = u[4]
    if isinstance(skills, list):
        skills = ",".join(skills)
    raw_users_norm.append((u[0], u[1], u[2] if u[2] is not None else None, u[3], skills if skills is not None else None))

raw_activity_norm = []
for a in raw_activity:
    actions = a[1]
    if isinstance(actions, list):
        actions = ",".join(actions)
    raw_activity_norm.append((a[0], actions if actions is not None else None, a[2] if a[2] is not None else None, a[3]))

# Create staging DataFrames
users_staging_df       = spark.createDataFrame(raw_users_norm, users_staging_schema)
courses_staging_df     = spark.createDataFrame(raw_courses, courses_staging_schema)
enrollments_staging_df = spark.createDataFrame(raw_enrollments, enrollments_staging_schema)
activity_staging_df = spark.createDataFrame(raw_activity_norm,activity_staging_schema  )


In [9]:

# Age normalization (avoid UDF)
age_num = when(lower(col('age_raw')) == 'thirty', lit(30)).otherwise(col('age_raw').cast('int'))

# Skills to arrays: unify separators to comma, split, trim, filter empties
skills_clean = split(
    regexp_replace(regexp_replace(regexp_replace(col('skills_raw'), r"[\n]+", ","), r"[\[\]'\"]", ''), r"[\s/]+", ","),
    ","
)
skills_arr = expr("filter(transform(skills_clean, x -> trim(x)), x -> x <> '')")

users_df = (users_staging_df
            .withColumn('age', age_num)
            .withColumn('skills_clean', skills_clean)
            .withColumn('skills', skills_arr)
            .drop('age_raw', 'skills_raw', 'skills_clean'))

# Price normalization: strip currency/non-digits
courses_df = (courses_staging_df
              .withColumn('price', regexp_replace(col('price_raw'), r"[^0-9]", '').cast('int'))
              .drop('price_raw'))

# Dates: parse multiple formats, keep NULL for invalid
fmt1 = to_date(col('enroll_date_raw'), 'yyyy-MM-dd')
fmt2 = to_date(col('enroll_date_raw'), 'dd/MM/yyyy')
fmt3 = to_date(col('enroll_date_raw'), 'yyyy/MM/dd')

enrollments_df = (enrollments_staging_df
                  .withColumn('enroll_date', coalesce(fmt1, fmt2, fmt3))
                  .drop('enroll_date_raw'))

# Actions to arrays; metadata -> device
actions_clean = split(regexp_replace(regexp_replace(col('actions_raw'), r"[\n]+", ","), r"[\s]+", ","), ",")
actions_arr   = expr("filter(transform(actions_clean, x -> trim(x)), x -> x <> '')")

json_meta = regexp_replace(col('metadata_raw'), "'", '"')
meta_schema = StructType([StructField('device', StringType(), True)])
meta_device_json = from_json(json_meta, meta_schema).getField('device')
meta_device_kv   = when(col('metadata_raw').contains('='), split(col('metadata_raw'), '=').getItem(1)).otherwise(None)

activity_df = (activity_staging_df
               .withColumn('actions', actions_arr)
               .withColumn('device', coalesce(meta_device_json, meta_device_kv))
               .drop('actions_raw','metadata_raw'))


{"ts": "2025-12-19 06:45:02.074", "level": "ERROR", "logger": "SQLQueryContextLogger", "msg": "[UNRESOLVED_COLUMN.WITH_SUGGESTION] A column, variable, or function parameter with name `actions_clean` cannot be resolved. Did you mean one of the following? [`actions_raw`, `time_spent`, `user_id`, `metadata_raw`]. SQLSTATE: 42703", "context": {"errorClass": "UNRESOLVED_COLUMN.WITH_SUGGESTION"}, "exception": {"class": "Py4JJavaError", "msg": "An error occurred while calling o295.withColumn.\n: org.apache.spark.sql.AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column, variable, or function parameter with name `actions_clean` cannot be resolved. Did you mean one of the following? [`actions_raw`, `time_spent`, `user_id`, `metadata_raw`]. SQLSTATE: 42703; line 1 pos 17;\n'Project [user_id#46, actions_raw#47, metadata_raw#48, time_spent#49, 'filter('transform('actions_clean, lambdafunction('trim(lambda 'x), lambda 'x, false)), lambdafunction(NOT (lambda 'x = ), lambda 'x, false)) AS a

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column, variable, or function parameter with name `actions_clean` cannot be resolved. Did you mean one of the following? [`actions_raw`, `time_spent`, `user_id`, `metadata_raw`]. SQLSTATE: 42703; line 1 pos 17;
'Project [user_id#46, actions_raw#47, metadata_raw#48, time_spent#49, 'filter('transform('actions_clean, lambdafunction('trim(lambda 'x), lambda 'x, false)), lambdafunction(NOT (lambda 'x = ), lambda 'x, false)) AS actions#57]
+- LogicalRDD [user_id#46, actions_raw#47, metadata_raw#48, time_spent#49], false
