In [1]:
# Import Required Libraries
import os
import pandas as pd
from datetime import datetime, timedelta
import random
import json
import findspark

findspark.init()

# PySpark imports
try:
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import *
    from pyspark.sql.types import *
    pyspark_available = True
    print("PySpark is available!")
except ImportError:
    print("PySpark not found. Please install with: pip install pyspark")
    pyspark_available = False

PySpark is available!


In [2]:
if pyspark_available:
    # Create SparkSession with custom configuration
    spark = SparkSession.builder \
        .appName("PySpark") \
        .config("spark.sql.adaptive.enabled", "true") \
        .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
        .config("spark.driver.memory", "2g") \
        .config("spark.executor.memory", "1g") \
        .getOrCreate()
    
    # Set log level to reduce verbose output
    spark.sparkContext.setLogLevel("WARN")
    
    print("✓ SparkSession created successfully!")
    print(f"Spark Version: {spark.version}")
    print(f"Application Name: {spark.sparkContext.appName}")
    print(f"Master: {spark.sparkContext.master}")
    
    # Check available cores and memory
    print(f"Default Parallelism: {spark.sparkContext.defaultParallelism}")
    
else:
    print("Cannot proceed without PySpark. Please install PySpark first.")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/24 12:03:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


✓ SparkSession created successfully!
Spark Version: 3.5.0
Application Name: PySpark
Master: local[*]
Default Parallelism: 4


In [None]:
import time

if pyspark_available:
    print("=== Bronze Layer: Raw Data Ingestion ===")

    # File paths
    file_path = "./data/raw/students_unstructured.json"

    # Convert to JSON array
    with open(file_path, "r") as f:
        lines = f.read().splitlines()

    # Create RDD from raw JSON strings
    raw_rdd = spark.sparkContext.parallelize(lines)

    # Parse JSON and handle errors (Bronze layer pattern)
    def parse_json_safe(json_str):
        try:
            data = json.loads(json_str)
            data['_ingestion_timestamp'] = time.time()
            data['_source'] = 'file'
            data['_status'] = 'valid'
            return data
        except:
            return {
                '_raw_data': json_str,
                '_ingestion_timestamp': time.time(),
                '_source': 'file',
                '_status': 'parse_error'
            }
        
    # Apply parsing
    bronze_rdd = raw_rdd.map(parse_json_safe)
    bronze_data = bronze_rdd.collect()
    
    # Convert to DataFrame for easier analysis
    bronze_df = spark.createDataFrame(bronze_data)
    
    print("Bronze Layer Data (Raw with Metadata):")
    bronze_df.show(truncate=False)
    
    # Show data quality metrics
    total_records = bronze_df.count()
    valid_records = bronze_df.filter(col("_status") == "valid").count()
    error_records = bronze_df.filter(col("_status") == "parse_error").count()
    
    print(f"\nData Quality Metrics:")
    print(f"Total records: {total_records}")
    print(f"Valid records: {valid_records}")
    print(f"Parse errors: {error_records}")
    print(f"Success rate: {(valid_records/total_records)*100:.1f}%")

=== Bronze Layer: Raw Data Ingestion ===


25/11/24 12:03:31 WARN TaskSetManager: Stage 0 contains a task of very large size (1086 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

Bronze Layer Data (Raw with Metadata):
+-------------------+----------+------------------+----------+--------------------------+-------------+------+-------------+---------------+---------------------+----------------+------------------------+--------------------+--------------+-----------------+---------------+-----------+-----------+---------------+-----------------+--------------------+-------+-------+---------+
|Access_to_Resources|Attendance|Distance_from_Home|Exam_Score|Extracurricular_Activities|Family_Income|Gender|Hours_Studied|Internet_Access|Learning_Disabilities|Motivation_Level|Parental_Education_Level|Parental_Involvement|Peer_Influence|Physical_Activity|Previous_Scores|School_Type|Sleep_Hours|Teacher_Quality|Tutoring_Sessions|_ingestion_timestamp|_source|_status|_raw_data|
+-------------------+----------+------------------+----------+--------------------------+-------------+------+-------------+---------------+---------------------+----------------+----------------------

In [None]:
if pyspark_available:
    print("=== Silver Layer: Cleaned and Standardized Data ===")
    
    # Start with valid Bronze layer data
    valid_bronze_df = bronze_rdd.filter(lambda row: row['_status'] == 'valid')
    total_valid = valid_bronze_df.count()

    # Removes duplicated rows
    exclude_cols = {"_ingestion_timestamp", "_source", "_status", "_raw_data"}

    dedup_rdd = valid_bronze_df.map(
        lambda r: (
            tuple(sorted((k, v) for k, v in r.items() if k not in exclude_cols)),
            r
        )
    ).reduceByKey(lambda a, b: a) \
    .map(lambda kv: kv[1])
        
    total_dedup = dedup_rdd.count()
    print(f"Rows after deduplication: {total_dedup} (removed {total_valid - total_dedup})")

    # Drop rows with invalid Exam_Score
    dedup_rdd_2 = dedup_rdd.filter(
        lambda r: r.get("Exam_Score") is not None and 0 <= r["Exam_Score"] <= 100
    )
    total_after_exam_removal = dedup_rdd_2.count()
    print(f"Rows after removing invalid exam scores: {total_dedup} (removed {total_dedup - total_after_exam_removal})")

    # Numeric fields: 
    numeric_fields = ["Hours_Studied", "Attendance", "Sleep_Hours", "Previous_Scores",
                    "Tutoring_Sessions", "Physical_Activity", "Exam_Score"]
        
    # Categorical fields: cast to string, replace missing with 'Unknown'
    categorical_fields = ["Parental_Involvement", "Access_to_Resources", "Extracurricular_Activities",
                        "Motivation_Level", "Internet_Access", "Family_Income", "Teacher_Quality",
                        "School_Type", "Peer_Influence", "Learning_Disabilities", 
                        "Parental_Education_Level", "Distance_from_Home", "Gender"]

    # Define allowed values
    allowed_values = {
        "Gender": {"Male", "Female"},
        "Parental_Involvement": {"Low", "Medium", "High"},
        "Access_to_Resources": {"Low", "Medium", "High"},
        "Extracurricular_Activities": {"Yes", "No"},
        "Motivation_Level": {"Low", "Medium", "High"},
        "Internet_Access": {"Yes", "No"},
        "Family_Income": {"Low", "Medium", "High"},
        "Teacher_Quality": {"Low", "Medium", "High"},
        "School_Type": {"Public", "Private"},
        "Peer_Influence": {"Positive", "Neutral", "Negative"},
        "Learning_Disabilities": {"Yes", "No"},
        "Parental_Education_Level": {"High School", "College", "Postgraduate"},
        "Distance_from_Home": {"Near", "Moderate", "Far"}
    }

    

    # Initialize counters: each field maps to [total, valid, invalid]
    field_stats = {f: [0,0,0] for f in numeric_fields + categorical_fields}

    # Silver layer transformations
    # Cleaning function
    def clean_and_cast(row):
        row_dict = dict(row)

        # Numeric fields
        for field in numeric_fields:
            val = row_dict.get(field)
            if val in [None, ""]:
                row_dict[field] = None
            else:
                try:
                    val = int(val)
                    # Validate ranges for certain numeric fields
                    if field in ["Attendance","Previous_Scores"] and not (0 <= val <= 100):
                        row_dict[field] = None
                    else:
                        row_dict[field] = val
                except:
                    row_dict[field] = None

        # Categorical fields
        for field in categorical_fields:
            val = row_dict.get(field)
            if val in [None, ""]:
                row_dict[field] = None
            else:
                val_str = str(val).strip().title()
                if val_str in allowed_values.get(field, set()):
                    row_dict[field] = val_str
                else:
                    row_dict[field] = None

        # Add Silver processing timestamp
        row_dict["_silver_processed_timestamp"] = time.time()

        return row_dict
    

    # Apply cleaning + type casting
    cleaned_rdd = dedup_rdd_2.map(clean_and_cast)

    total_cleaned = cleaned_rdd.count()
    print(f"Rows after cleaning: {total_cleaned}")
    
    # Collect results
    cleaned_data = cleaned_rdd.collect()

    # --- Compute field validation stats ---
    def row_field_stats(row):
        stats = []
        for f in numeric_fields:
            val = row.get(f)
            total = 1
            valid = 1 if val not in [None, ""] else 0
            stats.append((f, (total, valid, 1-valid)))

        for f in categorical_fields:
            val = row.get(f)
            total = 1
            valid = 1 if val not in [None, ""] else 0
            stats.append((f, (total, valid, 1-valid)))
        return stats
    
    stats_rdd = cleaned_rdd.flatMap(row_field_stats)
    field_summary = stats_rdd.reduceByKey(lambda a,b: (a[0]+b[0], a[1]+b[1], a[2]+b[2]))

    # Print summary
    print("=== Field Validation Summary ===")
    for field, (total, valid, invalid) in field_summary.collect():
        print(f"{field:<25} | Total: {total:<5} | Valid: {valid:<5} | Invalid: {invalid:<5}")

    # Data validation and quality checks
    print("=== Silver Layer Data Quality ===")
    
    # Check for null values in critical fields
    critical_fields = ["Exam_scores"]
    
    # Count nulls per field
    null_counts = {}
    for field in critical_fields:
        null_counts[field] = cleaned_rdd.filter(lambda r: r.get(field) in [None, ""]).count()


    event_counts = cleaned_rdd.map(lambda r: (r.get("event", "Unknown"), 1)) \
                            .reduceByKey(lambda a, b: a + b) \
                            .sortBy(lambda x: -x[1])  # sort descending by count


    categorical_fields = ["Parental_Involvement", "Access_to_Resources", "Extracurricular_Activities",
                            "Motivation_Level", "Internet_Access", "Family_Income", "Teacher_Quality",
                            "School_Type", "Peer_Influence", "Learning_Disabilities", 
                            "Parental_Education_Level", "Distance_from_Home", "Gender"]
    

    # Function to count values for a single field
    def value_counts(rdd, field):
        return (rdd
                .map(lambda row: (row.get(field, "Unknown"), 1))  # get value, default "Unknown"
                .reduceByKey(lambda a, b: a + b)                 # sum counts
                .sortBy(lambda x: -x[1]))                        # sort descending

    # Iterate over all categorical fields
    for field in categorical_fields:
        counts = value_counts(cleaned_rdd, field)
        print(f"\nValue distribution for {field}:")
        for val, cnt in counts.collect():
            print(f"{val}: {cnt}")

    

    # Convert Silver RDD to DataFrame for downstream use
    silver_df = spark.createDataFrame(cleaned_data)
    silver_df.show(5, truncate=False)

    silver_df.write.mode("overwrite").parquet("./data/silver/silverResult")
    spark.stop()

=== Silver Layer: Cleaned and Standardized Data ===


25/11/24 12:03:42 WARN TaskSetManager: Stage 11 contains a task of very large size (1086 KiB). The maximum recommended task size is 1000 KiB.
25/11/24 12:03:43 WARN TaskSetManager: Stage 12 contains a task of very large size (1086 KiB). The maximum recommended task size is 1000 KiB.


Rows after deduplication: 6620 (removed 407)
Rows after removing invalid exam scores: 6620 (removed 2)
Rows after cleaning: 6618
=== Field Validation Summary ===
Attendance                | Total: 6618  | Valid: 6617  | Invalid: 1    
Sleep_Hours               | Total: 6618  | Valid: 6618  | Invalid: 0    
Exam_Score                | Total: 6618  | Valid: 6618  | Invalid: 0    
Motivation_Level          | Total: 6618  | Valid: 6618  | Invalid: 0    
School_Type               | Total: 6618  | Valid: 6617  | Invalid: 1    
Peer_Influence            | Total: 6618  | Valid: 6617  | Invalid: 1    
Distance_from_Home        | Total: 6618  | Valid: 6550  | Invalid: 68   
Physical_Activity         | Total: 6618  | Valid: 6618  | Invalid: 0    
Family_Income             | Total: 6618  | Valid: 6617  | Invalid: 1    
Teacher_Quality           | Total: 6618  | Valid: 6539  | Invalid: 79   
Learning_Disabilities     | Total: 6618  | Valid: 6617  | Invalid: 1    
Parental_Education_Level  | Total: 

25/11/24 12:03:53 ERROR FileOutputCommitter: Mkdirs failed to create file:/data/silverResult/_temporary/0
25/11/24 12:03:53 ERROR Executor: Exception in task 0.0 in stage 162.0 (TID 329)
java.io.IOException: Mkdirs failed to create file:/data/silverResult/_temporary/0/_temporary/attempt_202511241203538974794080402976090_0162_m_000000_329 (exists=false, cwd=file:/home/ubuntu/spark-notebooks)
	at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:515)
	at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:500)
	at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:1195)
	at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:1175)
	at org.apache.parquet.hadoop.util.HadoopOutputFile.create(HadoopOutputFile.java:74)
	at org.apache.parquet.hadoop.ParquetFileWriter.<init>(ParquetFileWriter.java:347)
	at org.apache.parquet.hadoop.ParquetFileWriter.<init>(ParquetFileWriter.java:314)
	at org.apache.parquet.hadoop.ParquetOutputFormat.getRecordWriter

Py4JJavaError: An error occurred while calling o971.parquet.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 162.0 failed 1 times, most recent failure: Lost task 1.0 in stage 162.0 (TID 330) (sparkvm03 executor driver): java.io.IOException: Mkdirs failed to create file:/data/silverResult/_temporary/0/_temporary/attempt_202511241203538974794080402976090_0162_m_000001_330 (exists=false, cwd=file:/home/ubuntu/spark-notebooks)
	at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:515)
	at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:500)
	at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:1195)
	at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:1175)
	at org.apache.parquet.hadoop.util.HadoopOutputFile.create(HadoopOutputFile.java:74)
	at org.apache.parquet.hadoop.ParquetFileWriter.<init>(ParquetFileWriter.java:347)
	at org.apache.parquet.hadoop.ParquetFileWriter.<init>(ParquetFileWriter.java:314)
	at org.apache.parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:484)
	at org.apache.parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:422)
	at org.apache.parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:411)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetOutputWriter.<init>(ParquetOutputWriter.scala:36)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetUtils$$anon$1.newInstance(ParquetUtils.scala:490)
	at org.apache.spark.sql.execution.datasources.SingleDirectoryDataWriter.newOutputWriter(FileFormatDataWriter.scala:161)
	at org.apache.spark.sql.execution.datasources.SingleDirectoryDataWriter.<init>(FileFormatDataWriter.scala:146)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:389)
	at org.apache.spark.sql.execution.datasources.WriteFilesExec.$anonfun$doExecuteWrite$1(WriteFiles.scala:100)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:890)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:890)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2844)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2780)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2779)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2779)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1242)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1242)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1242)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3048)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2982)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2971)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:984)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2398)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$executeWrite$4(FileFormatWriter.scala:307)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.writeAndCommit(FileFormatWriter.scala:271)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeWrite(FileFormatWriter.scala:304)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:190)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:190)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:113)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:111)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.executeCollect(commands.scala:125)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:437)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:85)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:83)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:142)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:859)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:388)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:361)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:240)
	at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:792)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:750)
Caused by: java.io.IOException: Mkdirs failed to create file:/data/silverResult/_temporary/0/_temporary/attempt_202511241203538974794080402976090_0162_m_000001_330 (exists=false, cwd=file:/home/ubuntu/spark-notebooks)
	at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:515)
	at org.apache.hadoop.fs.ChecksumFileSystem.create(ChecksumFileSystem.java:500)
	at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:1195)
	at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:1175)
	at org.apache.parquet.hadoop.util.HadoopOutputFile.create(HadoopOutputFile.java:74)
	at org.apache.parquet.hadoop.ParquetFileWriter.<init>(ParquetFileWriter.java:347)
	at org.apache.parquet.hadoop.ParquetFileWriter.<init>(ParquetFileWriter.java:314)
	at org.apache.parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:484)
	at org.apache.parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:422)
	at org.apache.parquet.hadoop.ParquetOutputFormat.getRecordWriter(ParquetOutputFormat.java:411)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetOutputWriter.<init>(ParquetOutputWriter.scala:36)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetUtils$$anon$1.newInstance(ParquetUtils.scala:490)
	at org.apache.spark.sql.execution.datasources.SingleDirectoryDataWriter.newOutputWriter(FileFormatDataWriter.scala:161)
	at org.apache.spark.sql.execution.datasources.SingleDirectoryDataWriter.<init>(FileFormatDataWriter.scala:146)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:389)
	at org.apache.spark.sql.execution.datasources.WriteFilesExec.$anonfun$doExecuteWrite$1(WriteFiles.scala:100)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:890)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:890)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:364)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:328)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:161)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
