### Working with JSON files

In [1]:
# List the JSON files in the working directory (long format, block sizes, newest first).
!ls -lst *.json

4 -rw-r--r-- 1 atif atif 401 Nov 15 21:24 input_1.json


In [2]:
from pyspark.sql import SparkSession
# Start (or reuse) the Spark session for this notebook and keep a handle
# to its underlying SparkContext.
spark = (
    SparkSession.builder
    .appName("Handling JSON")
    .getOrCreate()
)
sc = spark.sparkContext

In [9]:
# First attempt: read the file with the reader's default settings
# (no multiLine option is passed here).
df_json = spark.read.json(path='input_1.json')

In [10]:
# The DataFrame read with default settings only carries the internal
# `_corrupt_record` column, so this action raises AnalysisException.
# The failure is what this cell demonstrates: catch it and print the message
# so the notebook still runs from top to bottom.
from pyspark.sql.utils import AnalysisException

try:
    df_json.show(truncate=0)
except AnalysisException as exc:
    print("AnalysisException: " + str(exc))

AnalysisException: Since Spark 2.3, the queries from raw JSON/CSV files are disallowed when the
referenced columns only include the internal corrupt record column
(named _corrupt_record by default). For example:
spark.read.schema(schema).json(file).filter($"_corrupt_record".isNotNull).count()
and spark.read.schema(schema).json(file).select("_corrupt_record").show().
Instead, you can cache or save the parsed results and then send the same query.
For example, val df = spark.read.schema(schema).json(file).cache() and then
df.filter($"_corrupt_record".isNotNull).count().;

### This fails because each JSON record in the file is spread across multiple lines, whereas Spark by default expects every line to contain one complete JSON record. Passing `multiLine=True` to the reader fixes this.

In [11]:
# Re-read the same file, this time telling the JSON reader that a single
# record may span several lines.
df_json = (
    spark.read
    .option("multiLine", True)
    .json('input_1.json')
)

In [13]:
# Display the parsed rows without truncating long cell values.
df_json.show(truncate=False)

+---------------------------------+-------+---------+
|Delivery                         |name   |product  |
+---------------------------------+-------+---------+
|[Chennai, 1234567, Azarudeen]    |AZAR   |Headphone|
|[Bangalore, 5738612, Bharathiraj]|Bharath|T-shirt  |
+---------------------------------+-------+---------+



In [14]:
# Inspect the inferred schema; `Delivery` is a nested struct column.
df_json.printSchema()

root
 |-- Delivery: struct (nullable = true)
 |    |-- address: string (nullable = true)
 |    |-- mob: string (nullable = true)
 |    |-- name: string (nullable = true)
 |-- name: string (nullable = true)
 |-- product: string (nullable = true)



In [20]:
from pyspark.sql.functions import explode, explode_outer,col

In [27]:
# Pull each field of the nested `Delivery` struct up into its own top-level
# column, keeping all existing columns (including `Delivery`) in place.
df_json_flattened = df_json.select(
    "*",
    col("Delivery.address").alias("Delivery_address"),
    col("Delivery.mob").alias("Phone"),
    col("Delivery.name").alias("Delivery_name"),
)


In [28]:
# Preview the flattened frame with show()'s default settings.
df_json_flattened.show()

+--------------------+-------+---------+----------------+-------+-------------+
|            Delivery|   name|  product|Delivery_address|  Phone|Delivery_name|
+--------------------+-------+---------+----------------+-------+-------------+
|[Chennai, 1234567...|   AZAR|Headphone|         Chennai|1234567|    Azarudeen|
|[Bangalore, 57386...|Bharath|  T-shirt|       Bangalore|5738612|  Bharathiraj|
+--------------------+-------+---------+----------------+-------+-------------+



In [32]:
# The struct's fields now exist as top-level columns, so drop the original
# nested `Delivery` column.
df_json_flattened_final = df_json_flattened.drop("Delivery")

In [33]:
# Confirm the final schema after dropping `Delivery`.
df_json_flattened_final.printSchema()

root
 |-- name: string (nullable = true)
 |-- product: string (nullable = true)
 |-- Delivery_address: string (nullable = true)
 |-- Phone: string (nullable = true)
 |-- Delivery_name: string (nullable = true)



In [34]:
# Display the final flattened rows without truncating cell values.
df_json_flattened_final.show(truncate=False)

+-------+---------+----------------+-------+-------------+
|name   |product  |Delivery_address|Phone  |Delivery_name|
+-------+---------+----------------+-------+-------------+
|AZAR   |Headphone|Chennai         |1234567|Azarudeen    |
|Bharath|T-shirt  |Bangalore       |5738612|Bharathiraj  |
+-------+---------+----------------+-------+-------------+

