## 02-pyspark-read-json.py

In [None]:
# pyspark-read-json.py
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType,BooleanType,DoubleType
# Build the Spark session used by every cell below; getOrCreate() returns an
# already-running session if one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("PySparkExamples")
    .getOrCreate()
)

# Load the iris JSON file into a DataFrame. The multiLine option is enabled so
# the reader accepts JSON records that span more than one line.
df = spark.read.option("multiLine", True).json("dbfs:/FileStore/tables/iris.json")
# No schema is supplied, so the one printed here is the schema Spark inferred.
df.printSchema()
df.show()

# Read a multi-line JSON file, switching the multiline behaviour on through
# the reader's option() call before invoking json().
multiline_df = (
    spark.read
    .option("multiline", "true")
    .json("dbfs:/FileStore/tables/multiline_zipcode.json")
)
multiline_df.show()

root
 |-- petalLength: double (nullable = true)
 |-- petalWidth: double (nullable = true)
 |-- sepalLength: double (nullable = true)
 |-- sepalWidth: double (nullable = true)
 |-- species: string (nullable = true)

+-----------+----------+-----------+----------+-------+
|petalLength|petalWidth|sepalLength|sepalWidth|species|
+-----------+----------+-----------+----------+-------+
|        1.4|       0.2|        5.1|       3.5| setosa|
|        1.4|       0.2|        4.9|       3.0| setosa|
|        1.3|       0.2|        4.7|       3.2| setosa|
|        1.5|       0.2|        4.6|       3.1| setosa|
|        1.4|       0.2|        5.0|       3.6| setosa|
|        1.7|       0.4|        5.4|       3.9| setosa|
|        1.4|       0.3|        4.6|       3.4| setosa|
|        1.5|       0.2|        5.0|       3.4| setosa|
|        1.4|       0.2|        4.4|       2.9| setosa|
|        1.5|       0.1|        4.9|       3.1| setosa|
|        1.5|       0.2|        5.4|       3.7| setosa|
|

In [None]:
# Read several JSON files in a single call by handing json() a list of paths;
# the rows of all listed files end up in one DataFrame.
df2 = spark.read.json(
    [
        "dbfs:/FileStore/tables/iris_setosa.json",
        "dbfs:/FileStore/tables/iris_versicolor.json",
        "dbfs:/FileStore/tables/iris_virginica.json",
    ],
    multiLine=True,
)
df2.show()

+-----------+----------+-----------+----------+----------+
|petalLength|petalWidth|sepalLength|sepalWidth|   species|
+-----------+----------+-----------+----------+----------+
|        4.7|       1.4|        7.0|       3.2|versicolor|
|        4.5|       1.5|        6.4|       3.2|versicolor|
|        4.9|       1.5|        6.9|       3.1|versicolor|
|        4.0|       1.3|        5.5|       2.3|versicolor|
|        4.1|       1.4|        5.4|       2.4|versicolor|
|        6.0|       2.5|        6.3|       3.3| virginica|
|        5.1|       1.9|        5.8|       2.7| virginica|
|        5.9|       2.1|        7.1|       3.0| virginica|
|        5.6|       1.8|        6.3|       2.9| virginica|
|        5.8|       2.2|        6.5|       3.0| virginica|
|        1.4|       0.2|        5.1|       3.5|    setosa|
|        1.4|       0.2|        4.9|       3.0|    setosa|
|        1.3|       0.2|        4.7|       3.2|    setosa|
|        1.5|       0.2|        4.6|       3.1|    setos

In [None]:
# Read every file whose path matches the iris_*.json wildcard pattern.
df3 = spark.read.json(
    "dbfs:/FileStore/tables/iris_*.json",
    multiLine=True,
)
df3.show()
# Select a subset of columns: the names can be passed as separate arguments ...
df3.select("petalLength", "petalWidth", "species").show()
# ... or as one list of names.
df3.select(["petalLength", "petalWidth", "species"]).show()

+-----------+----------+-----------+----------+----------+
|petalLength|petalWidth|sepalLength|sepalWidth|   species|
+-----------+----------+-----------+----------+----------+
|        4.7|       1.4|        7.0|       3.2|versicolor|
|        4.5|       1.5|        6.4|       3.2|versicolor|
|        4.9|       1.5|        6.9|       3.1|versicolor|
|        4.0|       1.3|        5.5|       2.3|versicolor|
|        4.1|       1.4|        5.4|       2.4|versicolor|
|        6.0|       2.5|        6.3|       3.3| virginica|
|        5.1|       1.9|        5.8|       2.7| virginica|
|        5.9|       2.1|        7.1|       3.0| virginica|
|        5.6|       1.8|        6.3|       2.9| virginica|
|        5.8|       2.2|        6.5|       3.0| virginica|
|        1.4|       0.2|        5.1|       3.5|    setosa|
|        1.4|       0.2|        4.9|       3.0|    setosa|
|        1.3|       0.2|        4.7|       3.2|    setosa|
|        1.5|       0.2|        4.6|       3.1|    setos

In [None]:
# Explicit schema for the iris records: four nullable double-valued
# measurements followed by a nullable string species label.
iris_schema = StructType([
    StructField("sepalLength", DoubleType(), True),
    StructField("sepalWidth", DoubleType(), True),
    StructField("petalLength", DoubleType(), True),
    StructField("petalWidth", DoubleType(), True),
    StructField("species", StringType(), True),
])

# Hand the schema to the reader so the column types and column order come
# from iris_schema rather than from inference over the file.
df_with_schema = spark.read.json(
    "dbfs:/FileStore/tables/iris.json",
    schema=iris_schema,
    multiLine=True,
)
df_with_schema.printSchema()
df_with_schema.show(5)  # limit the display to the first five rows

root
 |-- sepalLength: double (nullable = true)
 |-- sepalWidth: double (nullable = true)
 |-- petalLength: double (nullable = true)
 |-- petalWidth: double (nullable = true)
 |-- species: string (nullable = true)

+-----------+----------+-----------+----------+-------+
|sepalLength|sepalWidth|petalLength|petalWidth|species|
+-----------+----------+-----------+----------+-------+
|        5.1|       3.5|        1.4|       0.2| setosa|
|        4.9|       3.0|        1.4|       0.2| setosa|
|        4.7|       3.2|        1.3|       0.2| setosa|
|        4.6|       3.1|        1.5|       0.2| setosa|
|        5.0|       3.6|        1.4|       0.2| setosa|
+-----------+----------+-----------+----------+-------+
only showing top 5 rows



In [None]:
# Register a temporary SQL view backed directly by a JSON file.
# NOTE: despite the "_parquet" suffix in the view name, the data source is JSON
# (USING json on a .json path); no Parquet file is read or written here. The
# name is left unchanged so any existing query against the view keeps working.
# (Parquet is a separate, columnar storage format and is not used in this cell.)
# multiLine is passed as a data-source option inside the OPTIONS clause.
spark.sql(
    "CREATE OR REPLACE TEMPORARY VIEW iris_setosa_parquet USING json OPTIONS"
    " (path 'dbfs:/FileStore/tables/iris_setosa.json', multiLine = True)"
)
# Query the view with plain SQL and display the result.
spark.sql("select * from iris_setosa_parquet").show()

+-----------+----------+-----------+----------+-------+
|petalLength|petalWidth|sepalLength|sepalWidth|species|
+-----------+----------+-----------+----------+-------+
|        1.4|       0.2|        5.1|       3.5| setosa|
|        1.4|       0.2|        4.9|       3.0| setosa|
|        1.3|       0.2|        4.7|       3.2| setosa|
|        1.5|       0.2|        4.6|       3.1| setosa|
|        1.4|       0.2|        5.0|       3.6| setosa|
+-----------+----------+-----------+----------+-------+



In [None]:
# Write the DataFrame out in JSON format (not Parquet), using overwrite mode.
# The target path names a folder: Spark creates it at
# dbfs:/FileStore/tables/iris_writezipcodes.json and places part files inside.
# NOTE(review): this folder name also matches the "iris_*.json" wildcard used
# by the wildcard read earlier in this notebook; re-running that read after
# this write may pick the folder up as well - confirm or choose another name.
df2.write.mode('Overwrite').json("dbfs:/FileStore/tables/iris_writezipcodes.json")

### Databricks File System (DBFS) commands:
**To list filesystem**<br>
%fs ls<br>
%fs ls dbfs:/FileStore/tables<br>

**To rename a file**<br>
%fs mv dbfs:/FileStore/tables/old_file_name dbfs:/FileStore/tables/new_file_name

**To delete a file from DBFS**<br>
dbutils.fs.rm("dbfs:/FileStore/tables/your_file_name")<br>
%fs rm dbfs:/FileStore/tables/your_file_name

**To delete a folder from DBFS**<br>
%fs rm -r dbfs:/FileStore/tables/your_folder_name

**To copy a file in DBFS**<br>
%fs cp dbfs:/FileStore/tables/old_file_name dbfs:/FileStore/tables/new_file_name

**To move a file to another folder in DBFS**<br>
%fs mv dbfs:/FileStore/tables/old_file_name dbfs:/FileStore/tables/target_folder

**To create a folder in DBFS**<br>
%fs mkdirs dbfs:/FileStore/tables/mydir