In [17]:
# Import Required Libraries
import os
import pandas as pd
from datetime import datetime, timedelta
import random
import json
import findspark
from pyspark.sql import functions as F

findspark.init()

# PySpark imports
try:
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import *
    from pyspark.sql.types import *
    pyspark_available = True
    print("PySpark is available!")
except ImportError:
    print("PySpark not found. Please install with: pip install pyspark")
    pyspark_available = False

PySpark is available!


In [5]:
if pyspark_available:
    # Create SparkSession with custom configuration
    spark = SparkSession.builder \
        .appName("Lab2-PySpark-Basics") \
        .config("spark.sql.adaptive.enabled", "true") \
        .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
        .config("spark.driver.memory", "2g") \
        .config("spark.executor.memory", "1g") \
        .getOrCreate()
    
    # Set log level to reduce verbose output
    spark.sparkContext.setLogLevel("WARN")
    
    print("✓ SparkSession created successfully!")
    print(f"Spark Version: {spark.version}")
    print(f"Application Name: {spark.sparkContext.appName}")
    print(f"Master: {spark.sparkContext.master}")
    
    # Check available cores and memory
    print(f"Default Parallelism: {spark.sparkContext.defaultParallelism}")
    
else:
    print("Cannot proceed without PySpark. Please install PySpark first.")

✓ SparkSession created successfully!
Spark Version: 3.5.0
Application Name: Lab2-PySpark-Basics
Master: local[*]
Default Parallelism: 4


In [33]:
import time

if pyspark_available:
    print("=== Bronze Layer: Raw Data Ingestion ===")

    # File paths
    file_path = "./students_unstructured.json"

    # Convert to JSON array
    with open(file_path, "r") as f:
        lines = f.read().splitlines()

    # Create RDD from raw JSON strings
    raw_rdd = spark.sparkContext.parallelize(lines)

    # Parse JSON and handle errors (Bronze layer pattern)
    def parse_json_safe(json_str):
        try:
            data = json.loads(json_str)
            data['_ingestion_timestamp'] = time.time()
            data['_source'] = 'file'
            data['_status'] = 'valid'
            return data
        except:
            return {
                '_raw_data': json_str,
                '_ingestion_timestamp': time.time(),
                '_source': 'file',
                '_status': 'parse_error'
            }
        
    # Apply parsing
    bronze_rdd = raw_rdd.map(parse_json_safe)
    bronze_data = bronze_rdd.collect()
    
    # Convert to DataFrame for easier analysis
    bronze_df = spark.createDataFrame(bronze_data)
    
    print("Bronze Layer Data (Raw with Metadata):")
    bronze_df.show(truncate=False)
    
    # Show data quality metrics
    total_records = bronze_df.count()
    valid_records = bronze_df.filter(col("_status") == "valid").count()
    error_records = bronze_df.filter(col("_status") == "parse_error").count()
    
    print(f"\nData Quality Metrics:")
    print(f"Total records: {total_records}")
    print(f"Valid records: {valid_records}")
    print(f"Parse errors: {error_records}")
    print(f"Success rate: {(valid_records/total_records)*100:.1f}%")

=== Bronze Layer: Raw Data Ingestion ===


25/11/12 13:16:57 WARN TaskSetManager: Stage 85 contains a task of very large size (1086 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

Bronze Layer Data (Raw with Metadata):
+-------------------+----------+------------------+----------+--------------------------+-------------+------+-------------+---------------+---------------------+----------------+------------------------+--------------------+--------------+-----------------+---------------+-----------+-----------+---------------+-----------------+--------------------+-------+-------+---------+
|Access_to_Resources|Attendance|Distance_from_Home|Exam_Score|Extracurricular_Activities|Family_Income|Gender|Hours_Studied|Internet_Access|Learning_Disabilities|Motivation_Level|Parental_Education_Level|Parental_Involvement|Peer_Influence|Physical_Activity|Previous_Scores|School_Type|Sleep_Hours|Teacher_Quality|Tutoring_Sessions|_ingestion_timestamp|_source|_status|_raw_data|
+-------------------+----------+------------------+----------+--------------------------+-------------+------+-------------+---------------+---------------------+----------------+----------------------

In [46]:
if pyspark_available:
    print("=== Silver Layer: Cleaned and Standardized Data ===")
    
    # Start with valid Bronze layer data
    valid_bronze_df = bronze_df.rdd.filter(lambda row: row['_status'] == 'valid')

    # Removes duplicated rows
    dedup_rdd = valid_bronze_df.map(lambda r: (tuple(sorted(r.items())), r)) \
        .reduceByKey(lambda a, b: a) \
        .map(lambda kv: kv[1])
    print(dedup_rdd)

    # Silver layer transformations
    def clean_and_cast(row):
        row_dict = dict(row)  

        # Numeric fields: 
        numeric_fields = ["Hours_Studied", "Attendance", "Sleep_Hours", "Previous_Scores",
                        "Tutoring_Sessions", "Physical_Activity", "Exam_Score"]
        
        # Categorical fields: cast to string, replace missing with 'Unknown'
        categorical_fields = ["Parental_Involvement", "Access_to_Resources", "Extracurricular_Activities",
                            "Motivation_Level", "Internet_Access", "Family_Income", "Teacher_Quality",
                            "School_Type", "Peer_Influence", "Learning_Disabilities", 
                            "Parental_Education_Level", "Distance_from_Home", "Gender"]
        
        # Cast to int and replace missing with 0
        for field in numeric_fields:
            val = row_dict.get(field)
            row_dict[field] = int(val) if val not in [None, ""] else 0

        # Cast to string, replace missing with 'Unknown'\
        for field in categorical_fields:
            val = row_dict.get(field)
            row_dict[field] = str(val).strip() if val not in [None, ""] else "Unknown"

        return row_dict

    # Apply cleaning + type casting
    cleaned_rdd = dedup_rdd.map(clean_and_cast)

    # Collect results
    #cleaned_data = cleaned_rdd.collect()
    print(cleaned_rdd)

    # ---- Map clean function over RDD ----
    silver_rdd = bronze_rdd.map(lambda r: clean_row(r))

    # ---- Example aggregation: average Exam_Score by School_Type ----
    school_scores = silver_rdd.map(lambda r: (r["School_Type"], r["Exam_Score"]))
    school_totals = school_scores.mapValues(lambda s: (s, 1)) \
                                .reduceByKey(lambda a, b: (a[0]+b[0], a[1]+b[1]))
    school_avg = school_totals.mapValues(lambda x: x[0]/x[1])

    print("\nAverage Exam_Score by School_Type:")
    for school, avg_score in school_avg.collect():
        print(f"{school}: {avg_score:.2f}")

    # ---- Optional: convert Silver RDD to DataFrame for downstream use ----
    silver_df = spark.createDataFrame(silver_rdd)
    silver_df.show(5, truncate=False)
    
    # Data validation and quality checks
    print("=== Silver Layer Data Quality ===")
    
    # Check for null values in critical fields
    null_checks = silver_df.select([
        F.count(when(col(c).isNull(), c)).alias(f"{c}_nulls") 
        for c in ["Exam_Score"]
    ])
    null_checks.show()
    

=== Silver Layer: Cleaned and Standardized Data ===
PythonRDD[260] at RDD at PythonRDD.scala:53
PythonRDD[261] at RDD at PythonRDD.scala:53

Average Exam_Score by School_Type:


25/11/12 14:12:03 WARN TaskSetManager: Stage 114 contains a task of very large size (1086 KiB). The maximum recommended task size is 1000 KiB.
25/11/12 14:12:04 WARN TaskSetManager: Stage 116 contains a task of very large size (1086 KiB). The maximum recommended task size is 1000 KiB.


Private: 67.26
Public: 67.21
Unknown: 0.00


25/11/12 14:12:04 WARN TaskSetManager: Stage 117 contains a task of very large size (1086 KiB). The maximum recommended task size is 1000 KiB.
25/11/12 14:12:04 WARN TaskSetManager: Stage 118 contains a task of very large size (1086 KiB). The maximum recommended task size is 1000 KiB.


+-------------------+----------+------------------+----------+--------------------------+-------------+------+-------------+---------------+---------------------+----------------+------------------------+--------------------+--------------+-----------------+---------------+-----------+-----------+---------------+-----------------+--------------------+-------+-------+
|Access_to_Resources|Attendance|Distance_from_Home|Exam_Score|Extracurricular_Activities|Family_Income|Gender|Hours_Studied|Internet_Access|Learning_Disabilities|Motivation_Level|Parental_Education_Level|Parental_Involvement|Peer_Influence|Physical_Activity|Previous_Scores|School_Type|Sleep_Hours|Teacher_Quality|Tutoring_Sessions|_ingestion_timestamp|_source|_status|
+-------------------+----------+------------------+----------+--------------------------+-------------+------+-------------+---------------+---------------------+----------------+------------------------+--------------------+--------------+-----------------+--

In [None]:
if pyspark_available:
    print("=== Gold Layer: Business Metrics and Analytics ===")
    
    # User activity summary (Gold layer aggregation)
    user_activity_gold = silver_df.groupBy("user_id").agg(
        F.count("*").alias("total_events"),
        countDistinct("event").alias("unique_events"),
        min("timestamp").alias("first_activity"),
        max("timestamp").alias("last_activity"),
        sum("amount").alias("total_spent"),
        countDistinct("device").alias("devices_used")
    ).withColumn("session_duration_minutes", 
                 (unix_timestamp("last_activity") - unix_timestamp("first_activity")) / 60
    )
    
    print("User Activity Summary (Gold Layer):")
    user_activity_gold.show()
    
    # Daily metrics rollup
    daily_metrics_gold = silver_df.groupBy("event_date").agg(
        F.count("*").alias("total_events"),
        countDistinct("user_id").alias("unique_users"),
        sum("amount").alias("daily_revenue"),
        avg("amount").alias("avg_transaction"),
        F.count(when(col("event") == "purchase", 1)).alias("purchases"),
        F.count(when(col("event") == "login", 1)).alias("logins"),
        F.count(when(col("event") == "signup", 1)).alias("signups")
    ).withColumn("conversion_rate", 
                 round(col("purchases") / col("unique_users") * 100, 2)
    )
    
    print("Daily Metrics Summary (Gold Layer):")
    daily_metrics_gold.show()
    
    # Hourly activity pattern
    hourly_pattern_gold = silver_df.groupBy("event_hour").agg(
        F.count("*").alias("events_count"),
        countDistinct("user_id").alias("active_users")
    ).orderBy("event_hour")
    
    print("Hourly Activity Pattern (Gold Layer):")
    hourly_pattern_gold.show()
    
    # Device preference analysis
    device_analysis_gold = silver_df.groupBy("device", "event").agg(
        F.count("*").alias("event_count")
    ).groupBy("device").pivot("event").sum("event_count").fillna(0)
    
    print("Device Preference Analysis (Gold Layer):")
    device_analysis_gold.show()