In [1]:
# Import Required Libraries
import os
import pandas as pd
from datetime import datetime, timedelta
import random
import json
import findspark

findspark.init()

# PySpark imports
try:
    # Probe for PySpark; the wildcard imports expose the SQL functions and
    # type classes under their bare names for the rest of the notebook.
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import *
    from pyspark.sql.types import *
except ImportError:
    # Record the failure so later cells can skip their Spark work.
    pyspark_available = False
    print("PySpark not found. Please install with: pip install pyspark")
else:
    pyspark_available = True
    print("PySpark is available!")

PySpark is available!


In [2]:
if not pyspark_available:
    print("Cannot proceed without PySpark. Please install PySpark first.")
else:
    # Build (or reuse) the session with adaptive query execution switched on
    # and explicit driver/executor memory settings.
    spark = (
        SparkSession.builder
        .appName("PySpark")
        .config("spark.sql.adaptive.enabled", "true")
        .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
        .config("spark.driver.memory", "2g")
        .config("spark.executor.memory", "1g")
        .getOrCreate()
    )

    # Only warnings and errors from here on, to keep cell output readable.
    spark.sparkContext.setLogLevel("WARN")

    # Report the runtime the notebook is attached to.
    print("✓ SparkSession created successfully!")
    print(f"Spark Version: {spark.version}")
    print(f"Application Name: {spark.sparkContext.appName}")
    print(f"Master: {spark.sparkContext.master}")
    print(f"Default Parallelism: {spark.sparkContext.defaultParallelism}")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/27 16:14:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


✓ SparkSession created successfully!
Spark Version: 3.5.0
Application Name: PySpark
Master: local[*]
Default Parallelism: 4


In [3]:
import time

if pyspark_available:
    print("=== Bronze Layer: Raw Data Ingestion ===")

    # Read the JSON Lines file one record per physical line.
    #  - The encoding is pinned to UTF-8 so the result does not depend on the
    #    platform's default locale.
    #  - Iterating the file splits on newline characters only.
    #    str.splitlines() also splits on U+2028, U+2029 and U+0085, which may
    #    legally appear unescaped inside a JSON string and would cut one record
    #    into two unparsable halves.
    with open("./data/students_no_scores.json/part-00000-3615aed5-2901-4b90-81af-4caacb7dba9a-c000.json", "r", encoding="utf-8") as f:
        lines = [line.rstrip("\n") for line in f]

    # Distribute the raw text lines as an RDD of strings.
    raw_rdd = spark.sparkContext.parallelize(lines)

    def parse_json_safe(json_str):
        """Parse one raw line into a Bronze record without ever raising on bad data.

        Args:
            json_str: The raw text of one input line.

        Returns:
            dict: For a line holding a JSON object, the parsed object extended
            with ``_ingestion_timestamp`` (``time.time()``), ``_source``
            (``'file'``) and ``_status`` (``'valid'``). For anything else
            (malformed JSON, or valid JSON that is not an object such as a
            list, number, string or ``null``) a record holding the original
            text under ``_raw_data`` with ``_status`` set to ``'parse_error'``.
        """
        try:
            data = json.loads(json_str)
        except (TypeError, ValueError, RecursionError):
            # ValueError covers json.JSONDecodeError; TypeError covers non-text
            # input; RecursionError covers pathologically nested documents.
            # Interrupts and other unrelated errors are deliberately not caught.
            data = None

        # Only JSON objects can carry the metadata keys; everything else is
        # quarantined as a parse error together with its raw text.
        if not isinstance(data, dict):
            return {
                '_raw_data': json_str,
                '_ingestion_timestamp': time.time(),
                '_source': 'file',
                '_status': 'parse_error'
            }

        data['_ingestion_timestamp'] = time.time()
        data['_source'] = 'file'
        data['_status'] = 'valid'
        return data
        
    # Turn every raw line into a Bronze record. Lines that cannot be parsed are
    # kept (tagged with _status == 'parse_error') rather than dropped.
    bronze_rdd = raw_rdd.map(parse_json_safe)
    # Bring all Bronze records back to the driver as a Python list.
    bronze_data = bronze_rdd.collect()

=== Bronze Layer: Raw Data Ingestion ===


25/11/27 16:14:49 WARN TaskSetManager: Stage 0 contains a task of very large size (1011 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [4]:
if pyspark_available:
    print("=== Silver Layer: Cleaned and Standardized Data ===")

    # Start with valid Bronze layer data.
    # NOTE: despite the `_df` suffix this is an RDD of dicts, not a DataFrame.
    valid_bronze_df = bronze_rdd.filter(lambda row: row['_status'] == 'valid')
    total_valid = valid_bronze_df.count()

    # Remove duplicated rows. Two rows are duplicates when all of their
    # business fields match; the ingestion metadata columns are ignored.
    exclude_cols = {"_ingestion_timestamp", "_source", "_status", "_raw_data"}

    dedup_rdd = valid_bronze_df.map(
        lambda r: (
            # Order-independent, hashable fingerprint of the business fields.
            tuple(sorted((k, v) for k, v in r.items() if k not in exclude_cols)),
            r
        )
    ).reduceByKey(lambda a, b: a) \
    .map(lambda kv: kv[1])

    total_dedup = dedup_rdd.count()
    print(f"Rows after deduplication: {total_dedup} (removed {total_valid - total_dedup})")

    # Drop rows whose Student_ID is missing or not made up of digits only.
    dedup_rdd_2 = dedup_rdd.filter(
        lambda r: r.get("Student_ID") is not None and str(r.get("Student_ID")).isdigit()
    )
    total_after_student_id_removal = dedup_rdd_2.count()
    # Report the number of rows that remain after the filter (the original
    # message repeated the pre-filter count here).
    print(f"Rows after removing invalid student id: {total_after_student_id_removal} (removed {total_dedup - total_after_student_id_removal})")

    # Numeric fields: cast to float, replace missing/invalid with the field mean
    numeric_fields = ["Student_ID", "Hours_Studied", "Attendance", "Sleep_Hours", "Previous_Scores",
                    "Tutoring_Sessions", "Physical_Activity"]

    # Categorical fields: normalised as strings, missing/invalid replaced with
    # the most frequent allowed value
    categorical_fields = ["Parental_Involvement", "Access_to_Resources", "Extracurricular_Activities",
                        "Motivation_Level", "Internet_Access", "Family_Income", "Teacher_Quality",
                        "School_Type", "Peer_Influence", "Learning_Disabilities",
                        "Parental_Education_Level", "Distance_from_Home", "Gender"]

    # Define allowed values
    allowed_values = {
        "Gender": {"Male", "Female"},
        "Parental_Involvement": {"Low", "Medium", "High"},
        "Access_to_Resources": {"Low", "Medium", "High"},
        "Extracurricular_Activities": {"Yes", "No"},
        "Motivation_Level": {"Low", "Medium", "High"},
        "Internet_Access": {"Yes", "No"},
        "Family_Income": {"Low", "Medium", "High"},
        "Teacher_Quality": {"Low", "Medium", "High"},
        "School_Type": {"Public", "Private"},
        "Peer_Influence": {"Positive", "Neutral", "Negative"},
        "Learning_Disabilities": {"Yes", "No"},
        "Parental_Education_Level": {"High School", "College", "Postgraduate"},
        "Distance_from_Home": {"Near", "Moderate", "Far"}
    }

    # Per-field profile used to derive the default value for each field:
    # categorical fields map to {value: count}, numeric fields to their mean.
    field_summary = {}

    # Categorical counts. The lambdas capture the loop variables, which is safe
    # here because collectAsMap() runs the job inside the same iteration.
    for field, allowed_values2 in allowed_values.items():
        counts = (
            dedup_rdd_2
            .map(lambda r: r.get(field))
            .filter(lambda v: v in allowed_values2)  # only allowed values
            .map(lambda v: (v, 1))
            .reduceByKey(lambda a, b: a + b)
            .collectAsMap()
        )
        field_summary[field] = counts

    def _to_float_or_none(value):
        """Return ``value`` as a float, or None when it is missing or not numeric."""
        if value in [None, ""]:
            return None
        try:
            return float(value)
        except (TypeError, ValueError, OverflowError):
            return None

    # Numeric means. Values that cannot be converted are skipped instead of
    # aborting the job, so one malformed cell cannot stop the whole layer;
    # clean_and_cast later replaces such cells with the mean computed here.
    for field in numeric_fields:
        values_rdd = (
            dedup_rdd_2
            .map(lambda r: _to_float_or_none(r.get(field)))
            .filter(lambda v: v is not None)
        )

        # Compute sum and count
        sum_val = values_rdd.sum()
        count_val = values_rdd.count()

        # No usable value at all -> no default available for this field.
        avg = sum_val / count_val if count_val > 0 else None

        field_summary[field] = avg

    print(field_summary)

    # Default per field: the most frequent allowed category for categorical
    # fields (first one wins on ties, None when nothing was counted) and the
    # mean for numeric fields.
    default_field_values = {}

    for field, value in field_summary.items():
        if isinstance(value, dict):
            default_field_values[field] = max(value, key=value.get, default=None)
        else:  # numeric
            default_field_values[field] = value

    print(default_field_values)


    # Initialize counters: each field maps to [total, valid, invalid]
    field_stats = {f: [0,0,0] for f in numeric_fields + categorical_fields}
    
    
    # Silver layer transformations
    # Cleaning function
    def clean_and_cast(row):
        """Return a cleaned copy of one record; the input row is not modified.

        Reads the enclosing-scope ``numeric_fields``, ``categorical_fields``,
        ``allowed_values`` and ``default_field_values``.

        * Numeric fields are converted to ``float``. Missing values (``None``
          or ``""``), values that cannot be converted, and ``Attendance`` /
          ``Previous_Scores`` values outside ``0..100`` are replaced with the
          field's default.
        * Categorical fields are stripped and title-cased. Missing values and
          values outside the field's allowed set are replaced with the field's
          default.
        * ``_silver_processed_timestamp`` is set to ``time.time()``.

        Args:
            row: A mapping (or anything ``dict()`` accepts) holding one record.

        Returns:
            dict: The cleaned record.
        """
        row_dict = dict(row)

        # Numeric fields
        for field in numeric_fields:
            val = row_dict.get(field)
            if val in [None, ""]:
                row_dict[field] = default_field_values.get(field)
                continue

            try:
                number = float(val)
            except (TypeError, ValueError, OverflowError):
                # Only conversion failures fall back to the default; unrelated
                # errors (including interrupts) are no longer swallowed.
                row_dict[field] = default_field_values.get(field)
                continue

            # Validate ranges for the numeric fields that have an allowed range
            if field in ["Attendance","Previous_Scores"] and not (0 <= number <= 100):
                row_dict[field] = default_field_values.get(field)
            else:
                row_dict[field] = number

        # Categorical fields
        for field in categorical_fields:
            val = row_dict.get(field)
            if val in [None, ""]:
                row_dict[field] = default_field_values.get(field)
                continue

            # Normalise spacing/casing before checking against the allowed set.
            val_str = str(val).strip().title()
            if val_str in allowed_values.get(field, set()):
                row_dict[field] = val_str
            else:
                row_dict[field] = default_field_values.get(field)

        # Add Silver processing timestamp
        row_dict["_silver_processed_timestamp"] = time.time()

        return row_dict
    

    # Apply cleaning + type casting.
    # The result is persisted because it feeds many later actions (row count,
    # collect, per-field statistics, one value distribution per categorical
    # field). Without cache() every action would re-run the lineage and
    # clean_and_cast would stamp a fresh _silver_processed_timestamp each time.
    cleaned_rdd = dedup_rdd_2.map(clean_and_cast).cache()

    def safe_debug(row):
        """Probe whether `clean_and_cast` accepts `row`.

        Returns 1 when cleaning succeeds; otherwise prints the offending row
        together with the error and returns 0.
        """
        try:
            clean_and_cast(row)
        except Exception as e:
            print(f"Error row: {row}, {e}")
            return 0
        return 1

    # Debug probe: run the cleaner over a handful of rows so failures are
    # printed with the offending row. The returned flags are not used.
    cleaned_rdd.map(safe_debug).take(10)

    # Collect results once and derive the row count from the collected list,
    # instead of running a separate count() job over the same data first.
    cleaned_data = cleaned_rdd.collect()
    total_cleaned = len(cleaned_data)
    print(f"Rows after cleaning: {total_cleaned}")

    # --- Compute field validation stats ---
    def row_field_stats(row):
        """Emit one ``(field, (total, valid, invalid))`` pair per tracked field.

        A field counts as valid when the row holds a value other than ``None``
        or ``""`` for it. Numeric fields are listed first, then categorical
        fields.
        """
        def _triple(name):
            present = 0 if row.get(name) in [None, ""] else 1
            return (name, (1, present, 1 - present))

        numeric_part = [_triple(name) for name in numeric_fields]
        categorical_part = [_triple(name) for name in categorical_fields]
        return numeric_part + categorical_part
    
    def _add_triples(left, right):
        """Element-wise sum of two (total, valid, invalid) triples."""
        return (left[0] + right[0], left[1] + right[1], left[2] + right[2])

    # One triple per (row, field), folded into a single triple per field.
    # NOTE: this rebinds `field_summary`, previously a dict, to an RDD.
    stats_rdd = cleaned_rdd.flatMap(row_field_stats)
    field_summary = stats_rdd.reduceByKey(_add_triples)

    # Report the totals per field.
    print("=== Field Validation Summary ===")
    for field, (total, valid, invalid) in field_summary.collect():
        print(f"{field:<25} | Total: {total:<5} | Valid: {valid:<5} | Invalid: {invalid:<5}")

    # Quality checks on the cleaned rows.
    print("=== Silver Layer Data Quality ===")

    # Fields that must never be empty.
    critical_fields = ["Student_ID"]

    # Number of cleaned rows where each critical field is None or "".
    null_counts = {}
    for field in critical_fields:
        null_counts[field] = (
            cleaned_rdd
            .filter(lambda r: r.get(field) in [None, ""])
            .count()
        )


    # Lazy tally of the "event" key, most frequent first; rows without the key
    # fall under "Unknown". No action is triggered on it in this block.
    event_counts = (
        cleaned_rdd
        .map(lambda r: (r.get("event", "Unknown"), 1))
        .reduceByKey(lambda a, b: a + b)
        .sortBy(lambda x: -x[1])  # sort descending by count
    )


    # Function to count values for a single field
    def value_counts(rdd, field):
        """Tally how often each value of `field` occurs in `rdd`.

        Rows lacking the key are tallied under "Unknown". The result is an RDD
        of (value, count) pairs ordered by descending count.
        """
        def _pair(row):
            return (row.get(field, "Unknown"), 1)

        def _descending(item):
            return -item[1]

        paired = rdd.map(_pair)
        totals = paired.reduceByKey(lambda left, right: left + right)
        return totals.sortBy(_descending)

    # Iterate over all categorical fields and print how often each value
    # occurs in the cleaned data (one Spark job per field).
    for field in categorical_fields:
        counts = value_counts(cleaned_rdd, field)
        print(f"\nValue distribution for {field}:")
        for val, cnt in counts.collect():
            print(f"{val}: {cnt}")

    

    # Convert Silver RDD to DataFrame for downstream use.
    # The DataFrame is built from the driver-side list `cleaned_data`, and no
    # explicit schema is passed to createDataFrame.
    silver_df = spark.createDataFrame(cleaned_data)
    # Preview the first five rows without truncating long cell values.
    silver_df.show(5, truncate=False)

    # Persist the Silver layer as Parquet, replacing any previous output.
    silver_df.write.mode("overwrite").parquet("./data/silverResultForStudentsWithoutExam")
    # Stop the session; `spark` cannot be used after this line, so re-running
    # this cell requires creating the session again first.
    spark.stop()

25/11/27 16:14:50 WARN TaskSetManager: Stage 1 contains a task of very large size (1011 KiB). The maximum recommended task size is 1000 KiB.


=== Silver Layer: Cleaned and Standardized Data ===


25/11/27 16:14:51 WARN TaskSetManager: Stage 2 contains a task of very large size (1011 KiB). The maximum recommended task size is 1000 KiB.


Rows after deduplication: 8783 (removed 0)
Rows after removing invalid student id: 8783 (removed 0)
{'Gender': {'Male': 4623, 'Female': 3888}, 'Parental_Involvement': {'Medium': 4581, 'Low': 1641, 'High': 2289}, 'Access_to_Resources': {'Medium': 4014, 'Low': 2097, 'High': 2412}, 'Extracurricular_Activities': {'Yes': 5250, 'No': 3273}, 'Motivation_Level': {'Medium': 4549, 'Low': 2329, 'High': 1645}, 'Internet_Access': {'Yes': 7896, 'No': 615}, 'Family_Income': {'Medium': 3697, 'Low': 3230, 'High': 1584}, 'Teacher_Quality': {'Medium': 4767, 'Low': 771, 'High': 2872}, 'School_Type': {'Private': 2927, 'Public': 5584}, 'Peer_Influence': {'Neutral': 3591, 'Negative': 1665, 'Positive': 3255}, 'Learning_Disabilities': {'No': 7672, 'Yes': 839}, 'Parental_Education_Level': {'Postgraduate': 1577, 'High School': 4417, 'College': 2409}, 'Distance_from_Home': {'Near': 5234, 'Moderate': 2418, 'Far': 770}, 'Student_ID': 910538582.4747808, 'Hours_Studied': 19.51718878329227, 'Attendance': 81.0944503109

                                                                                