In [5]:
import sys
import os

# Make the project root importable so the local `utils` module can be found.
# PYTHONPATH may contain several entries joined by os.pathsep, so each entry
# is added on its own rather than appending the raw string as one bogus path.
# Entries already on sys.path are skipped, which keeps the cell idempotent
# when it is re-run. An unset or empty PYTHONPATH falls back to "/app".
for _path in (os.getenv("PYTHONPATH") or "/app").split(os.pathsep):
    if _path and _path not in sys.path:
        sys.path.append(_path)
from utils import Utils

In [6]:
# Instantiate the project helper and obtain the Spark session from it; `spark`
# is the entry point used by every read in the cells below.
# NOTE(review): the session configuration lives in utils.Utils, which is not
# visible from this notebook.
utils = Utils()
spark = utils.get_spark_session()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/11 18:06:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [7]:
# Location of the sample order file stored as JSON Lines (one JSON object per line).
singleline_json_path = "/app/inputs/order_singleline.json"

In [None]:
# JSON Lines input needs no extra options: the plain json format reader
# is enough to load it.
single_json = (
    spark.read
    .format("json")
    .load(singleline_json_path)
)

In [None]:
# No schema was supplied, so Spark inferred it from the data during the load
single_json.printSchema()

root
 |-- contact: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- customer_id: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_line_items: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- amount: double (nullable = true)
 |    |    |-- item_id: string (nullable = true)
 |    |    |-- qty: long (nullable = true)



In [5]:
# Preview the parsed rows; long cell values are shortened in the default show() output
single_json.show()

+--------------------+-----------+--------+--------------------+
|             contact|customer_id|order_id|    order_line_items|
+--------------------+-----------+--------+--------------------+
|[9000010000, 9000...|       C001|    O101|[{102.45, I001, 6...|
+--------------------+-----------+--------+--------------------+



In [None]:
# Spark doesn't read multiline json out of the box: the file is parsed line by
# line, so a pretty-printed document is read as corrupted records.
# The result is cached before show() because, since Spark 2.3, querying raw
# JSON when only the internal _corrupt_record column is referenced raises an
# AnalysisException unless the parsed result is cached or saved first.
spark.read.format("json").load("/app/inputs/order_multiline.json").cache().show()

In [None]:
# With the multiline option set to true, a JSON record is allowed to span
# several lines of the file, so the pretty-printed document parses correctly.
multi_json = (
    spark.read
    .format("json")
    .option("multiline", True)
    .load("/app/inputs/order_multiline.json")
)

In [12]:
# Print the schema Spark inferred for the multiline read
multi_json.printSchema()

root
 |-- contact: array (nullable = true)
 |    |-- element: long (containsNull = true)
 |-- customer_id: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_line_items: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- amount: double (nullable = true)
 |    |    |-- item_id: string (nullable = true)
 |    |    |-- qty: long (nullable = true)



In [None]:
# Spark inferred the schema and turned each top-level JSON key into a column
multi_json.show()

+--------------------+-----------+--------+--------------------+
|             contact|customer_id|order_id|    order_line_items|
+--------------------+-----------+--------+--------------------+
|[9000010000, 9000...|       C001|    O101|[{102.45, I001, 6...|
+--------------------+-----------+--------+--------------------+



In [None]:
# Loading the same file with the text format leaves the JSON unparsed:
# every line is kept as a plain string instead of being expanded into fields.
df = (
    spark.read
    .format("text")
    .load(singleline_json_path)
)
df.printSchema()

root
 |-- value: string (nullable = true)



In [19]:
# truncate=False prints the full raw line instead of a shortened preview
df.show(truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|value                                                                                                                                                                              |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|{"order_id":"O101","customer_id":"C001","order_line_items":[{"item_id":"I001","qty":6,"amount":102.45},{"item_id":"I003","qty":2,"amount":2.01}],"contact":[9000010000,9000010001]}|
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+



In [None]:
# Instead of relying on inference, an explicit schema (written as a DDL
# string) can be handed to the json reader:
json_schema = "customer_id string, order_id string, contact array<long>"
single_json_schema = (
    spark.read
    .format("json")
    .schema(json_schema)
    .load(singleline_json_path)
)
single_json_schema.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- contact: array (nullable = true)
 |    |-- element: long (containsNull = true)



In [None]:
# JSON keys that are absent from the supplied schema (here order_line_items) are not read
single_json_schema.show()

+-----------+--------+--------------------+
|customer_id|order_id|             contact|
+-----------+--------+--------------------+
|       C001|    O101|[9000010000, 9000...|
+-----------+--------+--------------------+



In [9]:
# Nested types are declared inside <> (for example array<struct<...>>) when a
# complex schema is written as a DDL string:
complex_schema = (
    "contact array<string>, "
    "customer_id string, "
    "order_id string, "
    "order_line_items array<struct<amount double, item_id string, qty long>>"
)
complex_schema_json = (
    spark.read
    .format("json")
    .schema(complex_schema)
    .load(singleline_json_path)
)

In [None]:
# The contact elements are now of type string (inference produced long),
# because the supplied schema is applied instead of an inferred one
complex_schema_json.printSchema()

root
 |-- contact: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- customer_id: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_line_items: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- amount: double (nullable = true)
 |    |    |-- item_id: string (nullable = true)
 |    |    |-- qty: long (nullable = true)



In [11]:
# Preview the rows read with the explicit complex schema
complex_schema_json.show()

+--------------------+-----------+--------+--------------------+
|             contact|customer_id|order_id|    order_line_items|
+--------------------+-----------+--------+--------------------+
|[9000010000, 9000...|       C001|    O101|[{102.45, I001, 6...|
+--------------------+-----------+--------+--------------------+

