# 📊 Apache Spark DataFrames & SQL

**Phase 5: Big Data Processing - DataFrame Module 2**

**Prerequisites**: Phase 5.1 RDD Fundamentals, SQL knowledge

**Industry Level**: Production ETL pipelines, data lake processing, BI analytics

---

## 🏗️ DataFrame Architecture & Catalyst Optimizer

Understanding how Spark's DataFrame API leverages the Catalyst optimizer to deliver production-grade performance.

In [None]:
// Industry-standard imports for production Spark applications
import org.apache.spark.sql.{SparkSession, DataFrame, Dataset}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.expressions.Window
import scala.util.Try
import com.typesafe.config.ConfigFactory

// Production DataFrame configuration
/** Top-level configuration for one pipeline: Spark settings, the input source,
  * the output sink, and the list of transformations to apply.
  *
  * NOTE(review): `DataSourceConfig`, `DataSinkConfig` and `TransformConfig` are
  * not declared in this section — confirm where they are defined.
  *
  * @param spark           Spark session settings (see [[SparkConfig]])
  * @param input           description of the data source
  * @param output          description of the data sink
  * @param transformations transformation configurations, held as a list
  */
case class DataPipelineConfig(
  spark: SparkConfig,
  input: DataSourceConfig,
  output: DataSinkConfig,
  transformations: List[TransformConfig]
)

/** Settings consumed by `SparkSessionFactory.createProductionSession` when it
  * builds a `SparkSession`. Every field has a default, so `SparkConfig()` is valid.
  *
  * @param master                Spark master URL passed to `builder.master`
  * @param appName               application name passed to `builder.appName`
  * @param executorMemory        value written to `spark.executor.memory`
  * @param driverMemory          value written to `spark.driver.memory`
  * @param shufflePartitions     value written to `spark.sql.shuffle.partitions`
  * @param catalogImplementation value written to `spark.sql.catalogImplementation`
  */
case class SparkConfig(
  master: String = "local[*]",
  appName: String = "IndustryDataPipeline",
  executorMemory: String = "2g",
  driverMemory: String = "1g",
  shufflePartitions: Int = 200,
  catalogImplementation: String = "hive"
)

// Production SparkSession factory
object SparkSessionFactory {

  /** Builds (or reuses, via `getOrCreate`) a `SparkSession` from `config`.
    *
    * Besides the values taken from `config`, the session enables adaptive
    * query execution with partition coalescing, Kryo serialization, and the
    * Delta Lake SQL extension and catalog.
    *
    * Hive support is enabled only when `config.catalogImplementation` is
    * `"hive"`. Calling `enableHiveSupport()` unconditionally would force the
    * Hive catalog and silently override any other configured value.
    *
    * @param config session settings
    * @return the active or newly created session
    */
  def createProductionSession(config: SparkConfig): SparkSession = {
    val baseBuilder = SparkSession.builder()
      .master(config.master)
      .appName(config.appName)
      .config("spark.executor.memory", config.executorMemory)
      // NOTE(review): per the Spark configuration docs, in client mode the driver JVM
      // is already running when this is applied — prefer --driver-memory at submit time.
      .config("spark.driver.memory", config.driverMemory)
      .config("spark.sql.shuffle.partitions", config.shufflePartitions.toString)
      // Production optimizations
      .config("spark.sql.adaptive.enabled", "true")
      .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
      .config("spark.sql.adaptive.coalescePartitions.minPartitionSize", "1MB")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      // Delta Lake integration for production
      .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
      .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

    // Honour the configured catalog instead of always forcing Hive.
    val catalogBuilder =
      if (config.catalogImplementation.equalsIgnoreCase("hive")) baseBuilder.enableHiveSupport()
      else baseBuilder.config("spark.sql.catalogImplementation", config.catalogImplementation)

    catalogBuilder.getOrCreate()
  }
}

// Demonstrate type-safe DataFrame operations
/** Typed view of one employee record.
  *
  * NOTE(review): `hireDate` is a `String` here while `DataFrameProcessor.employeeSchema`
  * declares the `hireDate` column as `DateType`, and that schema also carries a
  * `partition_date` column with no counterpart in this class — confirm how (or whether)
  * rows are converted to `Employee`.
  */
case class Employee(
  id: Int,
  name: String,
  department: String,
  salary: Double,
  hireDate: String,
  skills: Seq[String]
)

/** Per-department summary record.
  *
  * NOTE(review): `DataFrameProcessor.calculateDepartmentStats` emits snake_case
  * columns (`employee_count`, `avg_salary`, `total_salary`, ...) and has no
  * `skillDiversity` equivalent, so its output cannot be mapped to this class with
  * `.as[EmployeeStats]` as written — confirm the intended mapping.
  */
case class EmployeeStats(
  department: String,
  employeeCount: Int,
  averageSalary: Double,
  totalSalary: Double,
  skillDiversity: Int
)

object DataFrameProcessor {

  /** Target schema of the employee DataFrame. `partition_date` is intended for
    * time-based partitioning.
    */
  val employeeSchema = StructType(Seq(
    StructField("id", IntegerType, false),
    StructField("name", StringType, false),
    StructField("department", StringType, true),
    StructField("salary", DoubleType, true),
    StructField("hireDate", DateType, true),
    StructField("skills", ArrayType(StringType), true),
    StructField("partition_date", DateType, true) // Time-based partitioning
  ))

  /** Reads employee CSV data with an explicit schema, adds `hire_year`, and caches
    * the result.
    *
    * Spark's CSV source rejects array-typed columns, so every array field of
    * [[employeeSchema]] is read as plain text and then split on `skillsDelimiter`
    * to produce the declared `array<string>` column.
    *
    * @param spark           active session
    * @param path            CSV location (a header row is expected)
    * @param skillsDelimiter literal separator between array elements inside a CSV
    *                        cell — NOTE(review): `";"` is an assumed default, confirm
    *                        against the real files
    * @return cached DataFrame with the columns of [[employeeSchema]] plus `hire_year`
    */
  def loadEmployeeData(
    spark: SparkSession,
    path: String,
    skillsDelimiter: String = ";"
  ): DataFrame = {
    val arrayColumns = employeeSchema.fields.collect {
      case StructField(fieldName, _: ArrayType, _, _) => fieldName
    }
    val csvSchema = StructType(employeeSchema.fields.map {
      case field @ StructField(_, _: ArrayType, _, _) => field.copy(dataType = StringType)
      case field => field
    })

    val rawRows = spark.read
      .option("header", "true")
      .option("inferSchema", "false")
      .schema(csvSchema)
      .csv(path)

    // `split` takes a regex, so quote the delimiter to treat it literally.
    val delimiterPattern = java.util.regex.Pattern.quote(skillsDelimiter)
    val withArrays = arrayColumns.foldLeft(rawRows) { (acc, column) =>
      acc.withColumn(column, split(col(column), delimiterPattern))
    }

    withArrays
      .withColumn("hire_year", year(col("hireDate")))
      .cache() // Cache for repeated operations
  }

  /** Aggregates salary statistics per department, ordered by total salary descending.
    *
    * Rows are first labelled with a `salary_bucket` ("entry" below 50000, "mid" below
    * 100000, otherwise "senior"). A null salary yields a null bucket rather than
    * falling through to "senior"; `collect_set` ignores nulls.
    *
    * @param df input with `id`, `department` and `salary` columns
    * @return one row per department with count, average, total, max, min, the set of
    *         buckets present and its size
    */
  def calculateDepartmentStats(df: DataFrame): DataFrame = {
    val salary = col("salary")
    val salaryBucket =
      when(salary.isNull, lit(null).cast(StringType))
        .when(salary < 50000, "entry")
        .when(salary < 100000, "mid")
        .otherwise("senior")

    df.withColumn("salary_bucket", salaryBucket)
      .groupBy("department")
      .agg(
        count("id").as("employee_count"),
        round(avg("salary"), 2).as("avg_salary"),
        sum("salary").as("total_salary"),
        max("salary").as("max_salary"),
        min("salary").as("min_salary"),
        collect_set("salary_bucket").as("salary_buckets"),
        size(collect_set("salary_bucket")).as("salary_bucket_count")
      )
      .orderBy(desc("total_salary"))
  }

  /** Adds `years_experience` (whole years since `hireDate`, using 365.25-day years)
    * and a derived `experience_band`.
    *
    * A null `hireDate` produces a null band instead of being reported as "Expert".
    *
    * @param df input with a date-typed `hireDate` column
    * @return `df` with the two extra columns
    */
  def calculateExperienceBands(df: DataFrame): DataFrame = {
    val years = col("years_experience")
    val band =
      when(years.isNull, lit(null).cast(StringType))
        .when(years < 2, "Junior")
        .when(years < 5, "Mid-level")
        .when(years < 8, "Senior")
        .otherwise("Expert")

    df.withColumn("years_experience",
        floor(datediff(current_date(), col("hireDate")) / 365.25)
      )
      .withColumn("experience_band", band)
  }

  /** Reads multi-line JSON project records and summarises them per technology.
    *
    * Each record's `technologies_used` array is exploded into one row per technology;
    * rows with non-positive `hours_worked` are dropped before aggregating.
    *
    * `contributors` is built with `collect_list`, so an employee appears once per
    * matching record (duplicates are possible), unlike `unique_contributors`.
    *
    * @param spark    active session
    * @param jsonPath location of the JSON input
    * @return one row per technology ordered by `total_hours` descending
    */
  def processJsonData(spark: SparkSession, jsonPath: String): DataFrame = {
    spark.read
      .option("multiline", "true")
      .option("dateFormat", "yyyy-MM-dd")
      .json(jsonPath)
      .filter("hours_worked > 0")
      // `select` cannot mix String and Column arguments, and `explode` needs a Column.
      .select(
        col("employee_id"),
        col("project_name"),
        col("hours_worked"),
        col("start_date"),
        explode(col("technologies_used")).as("technology"),
        col("performance_rating")
      )
      .groupBy("technology")
      .agg(
        countDistinct("employee_id").as("unique_contributors"),
        sum("hours_worked").as("total_hours"),
        round(avg("performance_rating"), 2).as("avg_rating"),
        collect_list("employee_id").as("contributors")
      )
      .orderBy(desc("total_hours"))
  }
}

// Cell-completion marker: prints a confirmation line followed by an empty line.
println("Industry DataFrame Configuration Loaded")
println()

## 🚀 Advanced SQL Transformations

Production-grade SQL queries with window functions, CTEs, and analytical operations.

In [None]:
// Advanced SQL operations with window functions
object AdvancedSQLTransformer {

  /** Returns the CTAS statement that creates the Delta table `employee_stats`
    * from the JSON files under `/data/employees/`.
    *
    * The statement is only returned, not executed. The source relation is aliased
    * as `e` so that the `e.`-qualified column references resolve.
    *
    * @param spark unused; kept for signature compatibility
    * @return the SQL text
    */
  def createEmployeeTableDDL(spark: SparkSession): String =
    """
    |CREATE TABLE IF NOT EXISTS employee_stats USING DELTA AS
    |SELECT
    |  e.id,
    |  e.name,
    |  e.department,
    |  e.salary,
    |  e.hire_date,
    |  YEAR(e.hire_date) as hire_year,
    |  FLOOR(DATEDIFF(CURRENT_DATE, e.hire_date) / 365.25) as years_experience,
    |  CASE
    |    WHEN e.salary < 50000 THEN 'Entry'
    |    WHEN e.salary < 100000 THEN 'Mid'
    |    WHEN e.salary < 150000 THEN 'Senior'
    |    ELSE 'Executive'
    |  END as salary_tier,
    |  CURRENT_DATE as processed_date
    |FROM json.`/data/employees/*.json` AS e
    |""".stripMargin

  /** Ranks employees of `employee_stats` within their department and company-wide,
    * then prints the two best-paid employees of every department.
    *
    * DataFrames are lazy, so the result is materialised with `show`; without an
    * action the method would do nothing observable.
    *
    * @param spark active session with an `employee_stats` table
    */
  def employeeRankingQueries(spark: SparkSession): Unit = {
    val byDepartment = Window.partitionBy("department")
    // No partitioning: Spark moves all rows to a single partition for this window.
    val companyWide = Window.orderBy(desc("salary"))

    val rankingsDF = spark.table("employee_stats")
      .withColumn("dept_rank", row_number().over(byDepartment.orderBy(desc("salary"))))
      .withColumn("dept_percentile", percent_rank().over(byDepartment.orderBy("salary")))
      .withColumn("company_rank", dense_rank().over(companyWide))

    val topPerformers = rankingsDF
      .filter("dept_rank <= 2")
      .select("department", "name", "salary", "dept_rank", "salary_tier")
      .orderBy("department", "dept_rank")

    topPerformers.show(truncate = false)
  }

  /** Computes per-department metrics with CTEs and compares each department with the
    * one ranked just above it.
    *
    * All three window expressions share one ordering
    * (`employee_count DESC, avg_salary DESC`), so `prev_dept_avg_salary` and
    * `running_avg` refer to the same row sequence as `dept_popularity_rank`.
    * `NULLIF` keeps `growth_percent` from dividing by a zero previous average.
    *
    * @param spark active session with an `employee_stats` table
    * @return one row per department ordered by `dept_popularity_rank`
    */
  def departmentAnalytics(spark: SparkSession): DataFrame = {
    spark.sql("""
      WITH department_metrics AS (
        SELECT
          department,
          COUNT(*) as employee_count,
          ROUND(AVG(salary), 2) as avg_salary,
          ROUND(STDDEV(salary), 2) as salary_variation,
          MIN(years_experience) as min_experience,
          MAX(years_experience) as max_experience,
          COUNT(DISTINCT salary_tier) as tier_diversity
        FROM employee_stats
        GROUP BY department
      ),
      ranked_departments AS (
        SELECT
          *,
          ROW_NUMBER() OVER (ORDER BY employee_count DESC, avg_salary DESC) as dept_popularity_rank,
          LAG(avg_salary) OVER (ORDER BY employee_count DESC, avg_salary DESC) as prev_dept_avg_salary,
          AVG(avg_salary) OVER (ORDER BY employee_count DESC, avg_salary DESC
                                ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) as running_avg
        FROM department_metrics
      )
      SELECT
        *,
        ROUND(CASE WHEN prev_dept_avg_salary IS NOT NULL
                   THEN ((avg_salary - prev_dept_avg_salary) / NULLIF(prev_dept_avg_salary, 0)) * 100
                   ELSE 0 END, 2) as growth_percent
      FROM ranked_departments
      ORDER BY dept_popularity_rank
    """)
  }

  /** Illustrates four optimisation patterns. Only the bucketed-table DDL has a
    * lasting effect; the DataFrames are built but never executed.
    *
    * NOTE(review): the tables `departments`, `large_table` and `skewed_table` are
    * looked up by name and must exist in the catalog.
    *
    * @param spark active session
    */
  def queryOptimizationPatterns(spark: SparkSession): Unit = {

    // 1. Predicate pushdown example
    val filteredDF = spark.table("employee_stats")
      .filter("department = 'Engineering'")
      .filter("salary > 50000")
      .filter("years_experience > 3")

    // 2. Broadcast join for small tables
    val departments = spark.table("departments")
      .select("dept_code", "dept_name", "budget")
      .hint("broadcast")

    // 3. Bucketing for frequent joins (IF NOT EXISTS keeps the cell re-runnable)
    spark.sql("""
      CREATE TABLE IF NOT EXISTS employee_buckets (
        id INT,
        name STRING,
        department STRING,
        salary DOUBLE
      )
      CLUSTERED BY (department) INTO 16 BUCKETS
      STORED AS PARQUET
    """)

    // 4. Skew hint for unbalanced data
    // NOTE(review): the original referenced an undefined `skewedTable`; the table name
    // below is a placeholder — confirm the intended right-hand side.
    val skewedTable = spark.table("skewed_table")
    // NOTE(review): "skew" is not among the join hints documented for open-source Spark
    // (BROADCAST, MERGE, SHUFFLE_HASH, SHUFFLE_REPLICATE_NL); there, skew is handled by
    // adaptive execution (spark.sql.adaptive.skewJoin.enabled). Verify for your runtime.
    val skewedJoin = spark.table("large_table")
      .hint("skew", "join", Map("skewedColumn" -> "popular_value"))
      .join(skewedTable, "common_column")
  }
}

// Cell-completion marker: prints a confirmation line followed by an empty line.
println("Advanced SQL Transformations Configured")
println()

## 🏭 ETL Pipeline Design Pattern

Production-ready ETL pipeline with error handling, monitoring, and data quality checks.

In [None]:
// Enterprise ETL Pipeline Framework
/** Outcome of an ETL step or of a whole pipeline run. Sealed, so matches over it
  * are checked for exhaustiveness.
  */
sealed trait ETLResult
/** A step that finished: rows it processed and how long it took (end minus start timestamp). */
case class ETLStepResult(rowsProcessed: Long, duration: Long) extends ETLResult
/** A failed step: the step name, the cause, and a retry counter. */
case class ETLError(step: String, error: Throwable, retryCount: Int) extends ETLResult
/** Summary of a pipeline run: total rows, total duration and the per-step results. */
case class ETLComplete(totalRows: Long, totalDuration: Long, steps: List[ETLStepResult]) extends ETLResult

/** Bookkeeping record for one pipeline execution.
  *
  * @param pipelineId  identifier of the pipeline
  * @param executionId identifier of this execution
  * @param startTime   start timestamp — presumably epoch milliseconds, since
  *                    `ETLPipeline` writes `System.currentTimeMillis()` into it; confirm
  * @param steps       step names
  * @param status      current lifecycle state
  */
case class PipelineMetadata(
  pipelineId: String,
  executionId: String,
  startTime: Long,
  steps: List[String],
  status: PipelineStatus
)

/** Lifecycle state stored in `PipelineMetadata.status`. */
sealed trait PipelineStatus
case object Running extends PipelineStatus
case object Completed extends PipelineStatus
case object Failed extends PipelineStatus
// NOTE(review): `Skipped` is not assigned anywhere in this section — confirm its use.
case object Skipped extends PipelineStatus

// Type-safe ETL step trait
/** One step of an ETL pipeline whose work is expressed in the effect type `F`.
  *
  * Implementations provide identifying metadata (`name`, `description`), the
  * names of the datasets they read and write (`inputs`, `outputs`), and the
  * `execute` action itself.
  *
  * @tparam F effect type in which the step runs
  */
trait ETLStep[F[_]] {
  def name: String
  def description: String
  def inputs: Set[String]
  def outputs: Set[String]

  /** Runs the step for the given pipeline execution and reports its outcome. */
  def execute(metadata: PipelineMetadata): F[ETLResult]

  // Pre/post execution hooks
  // NOTE(review): `unit` is not defined in this trait or anywhere in the visible code,
  // and `F` carries no type-class constraint here, so these defaults presumably do not
  // compile as written — confirm where `unit` is expected to come from.
  def preExecute(metadata: PipelineMetadata): F[Unit] = unit
  def postExecute(result: ETLResult, metadata: PipelineMetadata): F[Unit] = unit
}

// Concrete ETL steps
/** Extraction step: loads the configured source, validates it and stores it in a
  * temporary table.
  *
  * NOTE(review): `ExtractionConfig`, `getSparkSession`, `validateData` and
  * `saveToTempTable` are not defined in this section and are used as the original
  * code used them — confirm their signatures.
  *
  * @param config extraction settings (`source`, `validations`, `tempTableName`)
  * @tparam F effect type; needs a `Sync` instance
  */
class ExtractionStep[F[_]: Sync](config: ExtractionConfig) extends ETLStep[F] {
  val name = "extraction"
  val description = s"Extract data from ${config.source}"
  val inputs = Set.empty[String]
  val outputs = Set("raw_data")

  /** Wall-clock time in milliseconds, suspended in `F`. Used instead of a `Timer`,
    * which this class has no instance of.
    */
  private def now: F[Long] = Sync[F].delay(System.currentTimeMillis())

  /** Loads, counts, validates and stages the source data, timing the whole sequence.
    *
    * @return an `ETLStepResult` with the row count and elapsed milliseconds
    */
  override def execute(metadata: PipelineMetadata): F[ETLResult] = {
    for {
      startTime <- now
      df <- loadSourceData(config.source)
      // `count()` runs a Spark job, so it is suspended rather than evaluated eagerly.
      rowCount <- Sync[F].delay(df.count())
      _ <- validateData(df, config.validations)
      _ <- saveToTempTable(df, config.tempTableName)
      endTime <- now
    } yield (ETLStepResult(rowCount, endTime - startTime): ETLResult)
  }

  /** Opens the source described by a `kind::location` string.
    *
    * Supported forms: `csv::path`, `parquet::path`, `delta::path` and
    * `jdbc::connectionString::table`. Anything else fails the effect with an
    * `IllegalArgumentException`.
    */
  private def loadSourceData(source: String): F[DataFrame] =
    source.split("::") match {
      case Array("csv", path) =>
        Sync[F].delay(getSparkSession().read.csv(path))
      case Array("parquet", path) =>
        Sync[F].delay(getSparkSession().read.parquet(path))
      case Array("delta", path) =>
        Sync[F].delay(getSparkSession().read.format("delta").load(path))
      case Array("jdbc", connString, table) =>
        // DataFrameReader.jdbc takes java.util.Properties, not a Scala Map.
        Sync[F].delay(getSparkSession().read.jdbc(connString, table, new java.util.Properties()))
      case _ =>
        Sync[F].raiseError(new IllegalArgumentException(s"Unsupported source: $source"))
    }
}

/** Transformation step: applies the configured transforms to the staged raw data,
  * runs quality checks and stores the result.
  *
  * NOTE(review): `TransformationConfig`, the `Transform` subtypes, `loadFromTempTable`,
  * `runQualityChecks` and `saveToTransformedTable` are not defined in this section —
  * confirm their signatures.
  *
  * @param config transformation settings (`inputTable`, `transforms`, `qualityChecks`,
  *               `outputTable`)
  * @tparam F effect type; needs a `Sync` instance
  */
class TransformationStep[F[_]: Sync](config: TransformationConfig) extends ETLStep[F] {
  val name = "transformation"
  val description = "Apply business logic transformations"
  val inputs = Set("raw_data")
  val outputs = Set("transformed_data")

  /** Wall-clock time in milliseconds, suspended in `F`. Used instead of a `Timer`,
    * which this class has no instance of.
    */
  private def now: F[Long] = Sync[F].delay(System.currentTimeMillis())

  /** Loads, transforms, validates and saves the data, timing the whole sequence.
    *
    * @return an `ETLStepResult` with the validated row count and elapsed milliseconds
    */
  override def execute(metadata: PipelineMetadata): F[ETLResult] = {
    for {
      startTime <- now
      rawDF <- loadFromTempTable(config.inputTable)
      transformedDF <- applyTransforms(rawDF, config.transforms)
      validatedDF <- runQualityChecks(transformedDF, config.qualityChecks)
      _ <- saveToTransformedTable(validatedDF, config.outputTable)
      rowCount <- Sync[F].delay(validatedDF.count())
      endTime <- now
    } yield (ETLStepResult(rowCount, endTime - startTime): ETLResult)
  }

  /** Applies `transforms` to `df` in list order.
    *
    * Building a DataFrame can already throw (for example on an unknown column), so
    * the whole fold is suspended in `F`.
    *
    * Aggregations are parsed with `expr` (e.g. `"sum(salary) AS total"`), because a
    * bare column reference is not a valid aggregate.
    * NOTE(review): this assumes `aggCols` holds SQL expression strings — confirm.
    */
  private def applyTransforms(df: DataFrame, transforms: List[Transform]): F[DataFrame] =
    Sync[F].delay {
      transforms.foldLeft(df) { (currentDF, transform) =>
        transform match {
          case FilterTransform(condition) =>
            currentDF.filter(condition)
          case SelectTransform(columns) =>
            currentDF.select(columns.map(col): _*)
          case GroupByTransform(groupCols, aggCols) =>
            val groupKeys = groupCols.map(col)
            val aggregates = aggCols.map(expr)
            // `agg` needs at least one expression; with none, return the distinct keys.
            if (aggregates.isEmpty) currentDF.select(groupKeys: _*).distinct()
            else currentDF.groupBy(groupKeys: _*).agg(aggregates.head, aggregates.tail: _*)
          case JoinTransform(otherTable, joinCols, joinType) =>
            // Use the session of the DataFrame itself; no `spark` value exists in this class.
            val otherDF = currentDF.sparkSession.table(otherTable)
            currentDF.join(otherDF, joinCols, joinType)
          case RenameTransform(renames) =>
            renames.foldLeft(currentDF) { case (renamed, (oldName, newName)) =>
              renamed.withColumnRenamed(oldName, newName)
            }
        }
      }
    }
}

/** Loading step: writes the transformed data to its destination and updates the
  * metadata tables.
  *
  * NOTE(review): `LoadingConfig`, `loadFromTransformedTable` and `updateMetadataTables`
  * are not defined in this section — confirm their signatures.
  *
  * @param config loading settings (`inputTable`, `destination`)
  * @tparam F effect type; needs a `Sync` instance
  */
class LoadingStep[F[_]: Sync](config: LoadingConfig) extends ETLStep[F] {
  val name = "loading"
  val description = s"Load data to ${config.destination}"
  val inputs = Set("transformed_data")
  val outputs = Set.empty[String]

  /** Wall-clock time in milliseconds, suspended in `F`. Used instead of a `Timer`,
    * which this class has no instance of.
    */
  private def now: F[Long] = Sync[F].delay(System.currentTimeMillis())

  /** Reads the transformed table, writes it out and records the row count.
    *
    * @return an `ETLStepResult` with the written row count and elapsed milliseconds
    */
  override def execute(metadata: PipelineMetadata): F[ETLResult] = {
    for {
      startTime <- now
      transformedDF <- loadFromTransformedTable(config.inputTable)
      rowCount <- writeDestination(transformedDF, config.destination)
      _ <- updateMetadataTables(metadata, rowCount)
      endTime <- now
    } yield (ETLStepResult(rowCount, endTime - startTime): ETLResult)
  }

  /** Runs `write` and then counts the rows of `df`, all suspended in `F`. */
  private def writeThenCount(df: DataFrame)(write: => Unit): F[Long] =
    Sync[F].delay {
      write
      df.count()
    }

  /** Writes `df` to the destination described by a `kind::location` string and
    * returns the number of rows in `df`.
    *
    * Supported forms: `delta::path` and `parquet::path` (overwrite, partitioned by
    * `partition_date`), `hive::database::table` (append) and
    * `jdbc::connectionString::table`. Each branch performs its own write against the
    * parsed location. An unsupported destination fails the effect with an
    * `IllegalArgumentException`.
    */
  private def writeDestination(df: DataFrame, destination: String): F[Long] =
    destination.split("::") match {
      case Array("delta", path) =>
        writeThenCount(df) {
          df.write.format("delta").mode("overwrite").partitionBy("partition_date").save(path)
        }
      case Array("parquet", path) =>
        writeThenCount(df) {
          df.write.mode("overwrite").partitionBy("partition_date").parquet(path)
        }
      case Array("hive", database, table) =>
        writeThenCount(df) {
          df.write.mode("append").saveAsTable(s"$database.$table")
        }
      case Array("jdbc", connString, table) =>
        writeThenCount(df) {
          // DataFrameWriter.jdbc takes java.util.Properties, not a Scala Map.
          df.write.jdbc(connString, table, new java.util.Properties())
        }
      case _ =>
        Sync[F].raiseError(new IllegalArgumentException(s"Unsupported destination: $destination"))
    }
}

// Pipeline orchestration
/** Runs a list of ETL steps in order and records progress in a metadata store.
  *
  * NOTE(review): `MetadataStore`, `ErrorHandler` and `createPipelineMetadata` are not
  * defined in this section — confirm their signatures. The `Parallel` bound and the
  * implicit `Timer` are kept for interface compatibility; the visible code uses neither.
  *
  * @param steps         steps to run, in order
  * @param metadataStore persistence for pipeline and step status
  * @param errorHandler  invoked for every failed step
  * @tparam F effect type
  */
class ETLPipeline[F[_]: Async: Parallel](
  steps: List[ETLStep[F]],
  metadataStore: MetadataStore[F],
  errorHandler: ErrorHandler[F]
)(implicit timer: Timer[F]) {

  /** Executes every step sequentially, even after an earlier step failed.
    *
    * A failing step is recorded, passed to the error handler and represented in the
    * results as an `ETLError`, so every element is an `ETLResult`. The final metadata
    * keeps the original `startTime` and is marked `Failed` when at least one step
    * failed, `Completed` otherwise.
    *
    * @return totals over the successful steps
    */
  def execute(): F[ETLComplete] = {
    for {
      metadata <- createPipelineMetadata()
      _ <- metadataStore.save(metadata)

      results <- steps.zipWithIndex.traverse { case (step, index) =>
        val stepMetadata = metadata.copy(
          steps = metadata.steps.take(index + 1),
          status = Running
        )

        step.preExecute(stepMetadata) >>
        step.execute(stepMetadata).attempt.flatMap {
          case Right(result) =>
            metadataStore.updateStepSuccess(metadata, step.name, result) >>
            step.postExecute(result, stepMetadata) >>
            result.pure[F]
          case Left(error) =>
            // NOTE(review): no retry information is visible here, so retryCount is 0.
            val failure: ETLResult = ETLError(step.name, error, 0)
            metadataStore.updateStepFailure(metadata, step.name, error) >>
            errorHandler.handleETLError(error, step, stepMetadata) >>
            failure.pure[F]
        }
      }

      successResults = results.collect { case r: ETLStepResult => r }
      anyFailure = results.exists {
        case _: ETLError => true
        case _ => false
      }
      totalRows = successResults.map(_.rowsProcessed).sum
      totalDuration = successResults.map(_.duration).sum
      finalStatus = if (anyFailure) Failed else Completed

      // Only the status changes; overwriting startTime here would lose the real start.
      _ <- metadataStore.save(metadata.copy(status = finalStatus))

    } yield ETLComplete(totalRows, totalDuration, successResults)
  }
}

// Cell-completion marker: prints a confirmation line, a feature summary and an empty line.
println("Industry ETL Pipeline Framework Loaded")
println("Features: Type Safety, Error Handling, Parallel Processing, Monitoring")
println()

## 📊 Performance Tuning & Optimization

Advanced techniques used by production Spark teams for maximum performance.

In [None]:
// Advanced Performance Tuning for Enterprise Spark
/** Collection of tuning helpers.
  *
  * NOTE(review): `ClusterSpec`, `WorkloadType`, `AccessPattern` (and their cases) and
  * `logger` are not defined in this section and are used as the original code used
  * them — confirm their definitions.
  */
object SparkPerformanceTuning {

  /** Resource and format settings produced by [[calculateOptimalConfig]]. */
  case class TuningConfig(
    executorCores: Int = 4,
    executorMemoryGB: Double = 8.0,
    driverMemoryGB: Double = 2.0,
    parallelism: Int = 200,
    compressionCodec: String = "snappy",
    fileFormat: String = "parquet"
  )

  /** Derives a [[TuningConfig]] from the cluster shape and the workload type.
    *
    * Cores per executor and shuffle partitions are picked per workload type; memory
    * per executor is 75% of node memory for memory-intensive work and otherwise
    * one-eighth of node memory per core. Driver memory is 10% of node memory.
    *
    * @param clusterSpec  cluster description (`totalCores`, `numExecutors`, `memoryPerNode`)
    * @param workloadType kind of workload being tuned for
    */
  def calculateOptimalConfig(clusterSpec: ClusterSpec, workloadType: WorkloadType): TuningConfig = {
    val coresPerExecutor = workloadType match {
      case CPUIntensive => 1 // Maximize parallelism
      case MemoryIntensive => math.max(2, clusterSpec.totalCores / clusterSpec.numExecutors)
      case IOIntensive => 4 // Balance CPU and I/O
      case BalancedWorkload => 2
    }

    val memoryPerExecutor = workloadType match {
      case MemoryIntensive => clusterSpec.memoryPerNode * 0.75
      case _ => (clusterSpec.memoryPerNode / 8) * coresPerExecutor
    }

    val shufflePartitions = workloadType match {
      case CPUIntensive | BalancedWorkload => clusterSpec.totalCores * 3
      case MemoryIntensive | IOIntensive => clusterSpec.totalCores * 2
    }

    TuningConfig(
      executorCores = coresPerExecutor,
      executorMemoryGB = memoryPerExecutor,
      driverMemoryGB = clusterSpec.memoryPerNode * 0.1,
      parallelism = shufflePartitions,
      compressionCodec = workloadType match {
        case IOIntensive => "lz4" // Fast compression
        case _ => "snappy" // Good balance
      },
      fileFormat = "parquet" // Columnar format for analytics
    )
  }

  /** Turns on adaptive query execution features for the session and returns `df`
    * behind a repartition, so that the resulting shuffle can be coalesced adaptively.
    *
    * Adaptive execution is controlled purely through configuration, so no query hint
    * is attached.
    */
  def optimizeQueryAdaptive(spark: SparkSession, df: DataFrame): DataFrame = {
    // Enable Adaptive Query Execution
    spark.conf.set("spark.sql.adaptive.enabled", "true")
    spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", "true")
    spark.conf.set("spark.sql.adaptive.skewJoin.enabled", "true")
    spark.conf.set("spark.sql.adaptive.localShuffleReader.enabled", "true")

    df.repartition() // Will adaptively coalesce based on sizes
  }

  /** Dynamic-allocation settings. These are application-level options read when the
    * `SparkContext` starts, so they must be supplied before the session is created.
    * NOTE(review): dynamic allocation also needs an external shuffle service or
    * shuffle tracking enabled — confirm the deployment provides one.
    */
  val dynamicAllocationSettings: Seq[(String, String)] = Seq(
    "spark.dynamicAllocation.enabled" -> "true",
    "spark.dynamicAllocation.minExecutors" -> "2",
    "spark.dynamicAllocation.maxExecutors" -> "10",
    "spark.dynamicAllocation.executorIdleTimeout" -> "60s",
    "spark.dynamicAllocation.cachedExecutorIdleTimeout" -> "300s"
  )

  /** Applies [[dynamicAllocationSettings]] to a session builder (the supported place). */
  def withDynamicAllocation(builder: SparkSession.Builder): SparkSession.Builder =
    dynamicAllocationSettings.foldLeft(builder) { case (acc, (key, value)) =>
      acc.config(key, value)
    }

  /** Checks that the running application was started with [[dynamicAllocationSettings]]
    * and logs a warning for every setting that differs.
    *
    * It does not call `spark.conf.set`: these are core Spark options, which the runtime
    * config cannot change once the context is running. Use [[withDynamicAllocation]]
    * (or `spark-submit --conf`) to actually apply them.
    */
  def configureDynamicAllocation(spark: SparkSession): Unit = {
    val activeConf = spark.sparkContext.getConf
    dynamicAllocationSettings.foreach { case (key, wanted) =>
      val actual = activeConf.getOption(key)
      if (!actual.contains(wanted)) {
        logger.warn(
          s"$key is ${actual.getOrElse("unset")} but $wanted is expected; " +
            "set it before the SparkSession is created"
        )
      }
    }
  }

  /** Persists `df` with a storage level chosen from `accessPattern` and returns the
    * persisted DataFrame.
    *
    * For `FrequentlyAccessed` data the cache is materialised immediately with a
    * `count()`; persisting alone is lazy.
    */
  def optimizeCaching(spark: SparkSession, df: DataFrame, accessPattern: AccessPattern): DataFrame = {
    import org.apache.spark.storage.StorageLevel

    val storageLevel = accessPattern match {
      case FrequentlyAccessed => StorageLevel.MEMORY_AND_DISK
      // MEMORY_ONLY never spills: partitions that do not fit are recomputed on access.
      case ReadOnce => StorageLevel.MEMORY_ONLY
      case ComputeIntensive => StorageLevel.MEMORY_AND_DISK_SER
      // MEMORY_ONLY already stores deserialized objects.
      case SmallDataset => StorageLevel.MEMORY_ONLY
    }

    val persisted = df.persist(storageLevel)

    if (accessPattern == FrequentlyAccessed) {
      persisted.count() // Action that fills the cache up front.
    }

    persisted
  }

  /** Reads `spark.sql.autoBroadcastJoinThreshold` as a byte count.
    *
    * The value may be a plain number or carry a size suffix; anything that cannot be
    * parsed, including the `-1` "disabled" value, yields a negative result.
    */
  private def broadcastThresholdBytes(spark: SparkSession): Long = {
    val raw = spark.conf.get("spark.sql.autoBroadcastJoinThreshold").trim
    scala.util.Try(raw.toLong)
      .orElse(scala.util.Try(org.apache.spark.network.util.JavaUtils.byteStringAsBytes(raw)))
      .getOrElse(-1L)
  }

  /** Left-joins `largeDF` with `smallDF`, broadcasting `smallDF` only when the
    * optimizer's size estimate fits within the session's broadcast threshold.
    *
    * The estimate is in bytes and comes from plan statistics, so no Spark job is run
    * to obtain it. When the table is too large (or broadcasting is disabled) a
    * warning is logged and a sort-merge join is requested instead.
    *
    * @param joinKeys column names present on both sides
    */
  def optimizeBroadcastJoin(
    spark: SparkSession,
    largeDF: DataFrame,
    smallDF: DataFrame,
    joinKeys: Seq[String]
  ): DataFrame = {
    val smallSize = smallDF.queryExecution.optimizedPlan.stats.sizeInBytes
    val threshold = broadcastThresholdBytes(spark)

    if (threshold >= 0 && smallSize <= BigInt(threshold)) {
      largeDF.join(broadcast(smallDF), joinKeys, "left")
    } else {
      logger.warn(s"Table size $smallSize exceeds broadcast threshold $threshold")
      // Consider bucketing, salting, or other optimizations
      largeDF.hint("merge").join(smallDF, joinKeys, "left")
    }
  }

  /** Adds a `salt_key` column that spreads hot keys over several sub-keys.
    *
    * Rows whose `key` is listed in `skewColumns` get `key_<n>` with a random integer
    * `n` in `[0, 4)`; all other rows keep `key` unchanged.
    *
    * NOTE(review): as in the original, `skewColumns` is matched against the values of
    * a column literally named `key` — confirm that this is the intended meaning.
    *
    * @param spark unused; kept for signature compatibility
    */
  def handleDataSkew(spark: SparkSession, df: DataFrame, skewColumns: Seq[String]): DataFrame = {
    val saltFactor = 4
    // rand() is in [0, 1): scale and floor to get an integer salt in [0, saltFactor).
    val salt = floor(rand() * saltFactor).cast("int").cast("string")

    df.withColumn("salt_key",
      when(col("key").isin(skewColumns: _*), concat(col("key"), lit("_"), salt))
        .otherwise(col("key"))
    )
  }

  /** Pre-aggregates the rows of the hot keys: one row per key with all of its `value`s
    * collected into `aggregated_values`. Nothing is persisted; the caller decides.
    */
  def preAggregateSkewedKeys(df: DataFrame, skewColumns: Seq[String]): DataFrame =
    df.filter(col("key").isin(skewColumns: _*))
      .groupBy("key")
      .agg(collect_list("value").as("aggregated_values"))

  /** Filters on the partition column, repartitions by the join column and then keeps
    * only the needed columns.
    *
    * The projection comes last because the filter and repartition columns are not
    * among the selected ones; selecting first would remove the repartition column
    * from the DataFrame.
    *
    * @param spark unused; kept for signature compatibility
    */
  def optimizeColumnSelection(spark: SparkSession, df: DataFrame): DataFrame = {
    df.filter("filter_on_partitioned_column = 'specific_value'")  // Partition pruning
      .repartition(col("frequently_joined_column"))  // Partition for join
      .select("needed_col1", "needed_col2")  // Column pruning
  }
}

// Performance monitoring and alerting
/** Collects job, stage and query-plan information from a running session.
  *
  * Job and stage figures come from `SparkContext.statusTracker`, which exposes task
  * counts and submission times only. Byte-level I/O metrics are not available there
  * and would need a `SparkListener`.
  *
  * NOTE(review): `logger` is not defined in this section — confirm where it comes from.
  *
  * @param spark session to inspect
  */
class SparkMetricsCollector(spark: SparkSession) {

  private def tracker = spark.sparkContext.statusTracker

  /** Stage infos still known to the tracker for `jobId`; empty for an unknown job. */
  private def stagesOf(jobId: Int) =
    tracker.getJobInfo(jobId).toSeq
      .flatMap(_.stageIds().toSeq)
      .flatMap(stageId => tracker.getStageInfo(stageId))

  /** Summarises a job by aggregating over its stages.
    *
    * `active_stages` counts stages with running tasks and `failed_stages` counts
    * stages with at least one failed task. An unknown job, or an id outside the `Int`
    * range used by Spark, yields only `job_id` and a `status` of `"UNKNOWN"`.
    */
  def collectJobMetrics(jobId: Long): Map[String, Any] = {
    val jobInfo = if (jobId.isValidInt) tracker.getJobInfo(jobId.toInt) else None

    jobInfo.fold(Map[String, Any]("job_id" -> jobId, "status" -> "UNKNOWN")) { job =>
      val stages = stagesOf(job.jobId())
      Map(
        "job_id" -> jobId,
        "status" -> job.status().toString,
        "stages_count" -> job.stageIds().length,
        "active_stages" -> stages.count(_.numActiveTasks() > 0),
        "failed_stages" -> stages.count(_.numFailedTasks() > 0),
        "completed_tasks" -> stages.map(_.numCompletedTasks()).sum,
        "active_tasks" -> stages.map(_.numActiveTasks()).sum,
        "failed_tasks" -> stages.map(_.numFailedTasks()).sum
      )
    }
  }

  /** Task counts and elapsed time for one stage.
    *
    * `duration` is the time since submission in milliseconds, or 0 when the stage has
    * not been submitted. An unknown stage yields only `stage_id`.
    */
  def collectStageMetrics(stageId: Int): Map[String, Any] =
    tracker.getStageInfo(stageId).fold(Map[String, Any]("stage_id" -> stageId)) { stage =>
      val submittedAt = stage.submissionTime()
      Map(
        "stage_id" -> stageId,
        "tasks_total" -> stage.numTasks(),
        "tasks_completed" -> stage.numCompletedTasks(),
        "tasks_active" -> stage.numActiveTasks(),
        "tasks_failed" -> stage.numFailedTasks(),
        "duration" -> (if (submittedAt > 0) System.currentTimeMillis() - submittedAt else 0L)
      )
    }

  /** Logs a warning for every active job that has been running longer than
    * `thresholdMs`. A job's age is measured from the earliest submission time among
    * its stages, since the tracker reports no job-level submission time.
    */
  def alertIfSlow(thresholdMs: Long): Unit = {
    val now = System.currentTimeMillis()
    tracker.getActiveJobIds().foreach { jobId =>
      val submissions = stagesOf(jobId).map(_.submissionTime()).filter(_ > 0)
      val duration = if (submissions.isEmpty) 0L else now - submissions.min

      if (duration > thresholdMs) {
        logger.warn(s"Job $jobId is running slow (${duration}ms), check query plan")
        // Send alerts, log query plan, etc.
      }
    }
  }

  /** Plans `sql` and reports which physical operators appear in the plan.
    *
    * The checks look for operator names as Spark prints them (`BroadcastHashJoin`,
    * `SortMergeJoin`, `Exchange hashpartitioning`). `estimated_size` is the
    * optimizer's size estimate in bytes.
    * NOTE(review): `spark.sql` runs commands such as DDL eagerly — pass queries only.
    */
  def analyzeQueryPlan(sql: String): Map[String, String] = {
    val execution = spark.sql(sql).queryExecution
    val physicalPlan = execution.executedPlan.toString
    val usesBroadcast =
      physicalPlan.contains("BroadcastHashJoin") || physicalPlan.contains("BroadcastNestedLoopJoin")

    Map(
      "has_broadcast_join" -> usesBroadcast.toString,
      "has_sort_merge_join" -> physicalPlan.contains("SortMergeJoin").toString,
      "has_shuffle" -> physicalPlan.contains("Exchange hashpartitioning").toString,
      "estimated_size" -> execution.optimizedPlan.stats.sizeInBytes.toString,
      "query_plan" -> execution.toString
    )
  }
}

// Cell-completion marker: prints three informational lines followed by an empty line.
println("Industry Performance Tuning Framework Loaded")
println("Includes: Adaptive Execution, Dynamic Allocation, Skew Handling, Monitoring")
println("Used by Netflix, Airbnb, and other major Spark deployments")
println()

## 🎯 Industry DataFrame Best Practices

### **Catalyst Optimizer Understanding**
- **Logical Plan**: Initial parsed plan with unresolved references, which the analyzer resolves and the optimizer then rewrites
- **Physical Plans**: Multiple execution strategies
- **Cost-Based Optimization**: Statistics-based plan selection
- **Whole-Stage Code Generation**: JIT compilation for performance

### **Tungsten Execution Engine**
- **Memory Management**: Custom off-heap memory manager
- **Cache-Aware Computation**: Algorithms and data layouts designed to exploit the CPU cache hierarchy (L1/L2/L3)
- **Vectorization**: Columnar batch processing, e.g. the vectorized Parquet and ORC readers
- **Code Generation**: Dynamic bytecode generation

### **Delta Lake Integration**
- **ACID Transactions**: Serializable transactions scoped to a single table, backed by the Delta transaction log
- **Schema Enforcement**: Prevent corrupt data writes
- **Time Travel**: Query historical table states
- **Optimized Layouts**: Data skipping and Z-Ordering

### **Structured Streaming**
- **Micro-Batch vs Continuous**: Processing mode selection
- **Event Time Processing**: Out-of-order event handling
- **Watermarking**: Late data dropping strategy
- **State Store**: Checkpointing and fault tolerance

**Next: Real production data lake patterns and architectures**