In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_json, struct
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Initialize SparkSession
spark = SparkSession.builder.appName("JSONFunctionsExample").getOrCreate()

# Create a DataFrame with a nested structure
data = [
    ("Alice", "New York", 30),
    ("Bob", "Los Angeles", 35),
    ("Charlie", "Chicago", 25)
]
schema = StructType([
    StructField("name", StringType(), True),
    StructField("city", StringType(), True),
    StructField("age", IntegerType(), True)
])
df = spark.createDataFrame(data, schema)

# Convert the struct to a JSON string
df_json = df.withColumn("json_data", to_json(struct("name", "city", "age")))

df_json.show(truncate=False)


+-------+-----------+---+--------------------------------------------+
|name   |city       |age|json_data                                   |
+-------+-----------+---+--------------------------------------------+
|Alice  |New York   |30 |{"name":"Alice","city":"New York","age":30} |
|Bob    |Los Angeles|35 |{"name":"Bob","city":"Los Angeles","age":35}|
|Charlie|Chicago    |25 |{"name":"Charlie","city":"Chicago","age":25}|
+-------+-----------+---+--------------------------------------------+



In [2]:
df_json.printSchema()

root
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- json_data: string (nullable = true)



In [3]:
from pyspark.sql.functions import from_json

# Define the schema of the JSON string
json_schema = StructType([
    StructField("name", StringType(), True),
    StructField("city", StringType(), True),
    StructField("age", IntegerType(), True)
])

# Parse the JSON string column into a struct
df_parsed = df_json.withColumn("parsed_data", from_json("json_data", json_schema))

df_parsed.select("json_data", "parsed_data.*").show(truncate=False)
df_parsed.printSchema()


+--------------------------------------------+-------+-----------+---+
|json_data                                   |name   |city       |age|
+--------------------------------------------+-------+-----------+---+
|{"name":"Alice","city":"New York","age":30} |Alice  |New York   |30 |
|{"name":"Bob","city":"Los Angeles","age":35}|Bob    |Los Angeles|35 |
|{"name":"Charlie","city":"Chicago","age":25}|Charlie|Chicago    |25 |
+--------------------------------------------+-------+-----------+---+

root
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- json_data: string (nullable = true)
 |-- parsed_data: struct (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- age: integer (nullable = true)



In [4]:
from pyspark.sql.functions import json_tuple

# Extract 'name' and 'city' from the JSON string
df_tuple = df_json.select("json_data", json_tuple("json_data", "name", "city").alias("name", "city"))

df_tuple.show(truncate=False)
df_tuple.printSchema()

+--------------------------------------------+-------+-----------+
|json_data                                   |name   |city       |
+--------------------------------------------+-------+-----------+
|{"name":"Alice","city":"New York","age":30} |Alice  |New York   |
|{"name":"Bob","city":"Los Angeles","age":35}|Bob    |Los Angeles|
|{"name":"Charlie","city":"Chicago","age":25}|Charlie|Chicago    |
+--------------------------------------------+-------+-----------+

root
 |-- json_data: string (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)



In [5]:
from pyspark.sql.functions import get_json_object

# Extract the 'age' from the JSON string
df_get_json = df_json.withColumn("age_extracted", get_json_object("json_data", "$.age"))

df_get_json.show(truncate=False)
df_tuple.printSchema()


+-------+-----------+---+--------------------------------------------+-------------+
|name   |city       |age|json_data                                   |age_extracted|
+-------+-----------+---+--------------------------------------------+-------------+
|Alice  |New York   |30 |{"name":"Alice","city":"New York","age":30} |30           |
|Bob    |Los Angeles|35 |{"name":"Bob","city":"Los Angeles","age":35}|35           |
|Charlie|Chicago    |25 |{"name":"Charlie","city":"Chicago","age":25}|25           |
+-------+-----------+---+--------------------------------------------+-------------+

root
 |-- json_data: string (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)

