[Reference](https://medium.com/@anchitgupt2012/what-developers-need-to-know-about-apache-spark-4-0-10200a9000bf)

# VARIANT: The 8x JSON Performance Boost
```
CREATE TABLE events (
  event_id STRING,
  payload VARIANT  -- semi-structured event body
) USING DELTA;

-- Field access by JSON path. A VARIANT column is not a STRUCT, so dot
-- notation (payload.user.id) is not used here; variant_get performs the path
-- lookup. Its optional third argument casts the extracted value to a SQL type;
-- without it the result stays VARIANT.
SELECT
  variant_get(payload, '$.user.id') AS user_id,
  variant_get(payload, '$.metadata.source') AS source
FROM events
WHERE variant_get(payload, '$.type', 'string') = 'click';
```

# Python Data Source API

In [1]:
from pyspark.sql.datasource import DataSource, DataSourceReader
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

class MyCustomSource(DataSource):
    """Custom Python data source that identifies itself as "mycustom"."""

    @classmethod
    def name(cls):
        """Return the short name of this data source."""
        return "mycustom"

    def schema(self):
        """Return the schema: a string column `name` and an integer column `value`."""
        columns = [
            StructField("name", StringType()),
            StructField("value", IntegerType()),
        ]
        return StructType(columns)

    def reader(self, schema):
        """Return a new MyCustomReader; the `schema` argument is not forwarded to it."""
        return MyCustomReader()

# Native Plotting

In [2]:
# `col` must be imported explicitly; nothing else in this file brings it into scope.
from pyspark.sql.functions import col

# Ten rows with ids 0..9 plus a derived column holding id squared.
# NOTE(review): `spark` is presumably the notebook-provided SparkSession — confirm.
df = spark.range(10).withColumn("square", col("id") ** 2)
df.plot.scatter(x="id", y="square")

# Polymorphic UDTFs

In [3]:
class FlexibleParser:
    """Handler whose output row width depends on the `include_metadata` flag.

    NOTE(review): to be usable as a polymorphic UDTF this class presumably also
    needs to be registered with Spark and to describe its output schema (for
    example through a static `analyze` method) — confirm against the PySpark
    UDTF documentation.
    """

    def eval(self, json_str: str, include_metadata: bool = False):
        """Parse one JSON object and yield a single output row.

        Args:
            json_str: JSON text of an object containing 'id' and 'value' keys.
            include_metadata: when true, the row gets a third field holding
                the object's 'timestamp' value (None if the key is absent).

        Yields:
            ``(id, value)`` or ``(id, value, timestamp)``.

        Raises:
            json.JSONDecodeError: if ``json_str`` is not valid JSON.
            KeyError: if 'id' or 'value' is missing from the object.
        """
        # Function-scope import: nothing else in this snippet brings `json`
        # into scope, and without it the first call fails with NameError.
        import json

        data = json.loads(json_str)
        if include_metadata:
            yield (data['id'], data['value'], data.get('timestamp'))
        else:
            yield (data['id'], data['value'])

# PIPE Syntax
```
-- Old style: nested and hard to follow
-- Inner query counts purchase events per user; outer query keeps the users
-- with more than 10 of them.
SELECT purchase_counts.user_id
FROM (
  SELECT
    user_id,
    COUNT(*) AS purchases
  FROM events
  WHERE event_type = 'purchase'
  GROUP BY user_id
) AS purchase_counts
WHERE purchase_counts.purchases > 10;

-- New PIPE syntax: linear and clear
-- Every step after FROM is introduced by |>. Aggregation uses the AGGREGATE
-- operator (aggregate functions are not allowed in a pipe SELECT), and the
-- final SELECT keeps only user_id.
FROM events
|> WHERE event_type = 'purchase'
|> AGGREGATE COUNT(*) AS purchases GROUP BY user_id
|> WHERE purchases > 10
|> SELECT user_id;
```

# SQL UDFs

```
-- Returns the text between the first and second '@' of `email` (the domain
-- for a normal address), or NULL when `email` contains no '@'.
-- TRY_ELEMENT_AT is 1-based and returns NULL for an out-of-range index,
-- whereas a bracket index raises an error under ANSI mode.
CREATE FUNCTION extract_domain(email STRING)
  RETURNS STRING
  RETURN TRY_ELEMENT_AT(SPLIT(email, '@'), 2);

-- Now use it everywhere
SELECT extract_domain(user_email) AS domain
FROM users;
```

# SQL Scripting
```
-- Cut-off that events.count must exceed for a row to be kept.
-- NOTE(review): written at top level, this DECLARE looks like a session
-- variable; a SQL script proper would presumably wrap these statements in
-- BEGIN ... END — confirm intent.
DECLARE threshold INT DEFAULT 1000;

-- Temporary view over the events rows whose count exceeds the threshold.
CREATE TEMPORARY VIEW filtered AS
  SELECT *
  FROM events
  WHERE count > threshold;

-- Append one row per date with the number of filtered events on that date.
INSERT INTO summary_table
  SELECT
    date,
    COUNT(*)
  FROM filtered
  GROUP BY date;
```

# String Collation
```
-- The collation is given as an identifier, not a quoted POSIX locale string.
-- NOTE(review): en_USA assumes Spark's ICU locale naming (language code plus
-- 3-letter country code) — confirm against the collation reference.
CREATE TABLE names (
  name STRING COLLATE en_USA
);
```

# State Data Source
```
# Load from the checkpoint location using the "statestore" format, then print rows.
state_reader = spark.read.format("statestore")
df = state_reader.load("<checkpointLocation>")
df.show()
```

# Structured Logging
```
{
  "timestamp": "2025-05-28T12:34:56Z",
  "level": "INFO",
  "component": "TaskSetManager",
  "message": "Finished task",
  "taskId": 200,
  "stageId": 12,
  "durationMs": 314,
  "host": "10.1.2.3"
}
```