# Mod 05 - JSON objects

### Lab 01: 'get_json_object'

In [0]:
# Lab 01a: Build a small (key, jstring) DataFrame whose second column holds raw JSON text

from pyspark.sql.functions import get_json_object

# Two sample records; the second JSON document has no "f2" key.
data = [
    ("1", '''{"f1": "Mark", "f2": "Ott"}'''),
    ("2", '''{"f1": "Jarrod"}'''),
]

df = spark.createDataFrame(data, ("key", "jstring"))

display(df)

In [0]:
# Lab 01b: Extract the JSON fields f1 and f2 from the jstring column as columns c0 and c1

df.select(
    df.key,
    get_json_object(df.jstring, '$.f1').alias("c0"),
    get_json_object(df.jstring, '$.f2').alias("c1"),
).show()

In [0]:
%scala
// Lab 01c: Build a DataFrame with an integer "id" column and a "json" column of raw JSON text

val eventsFromJSONDF = Seq(
  (0, """{"device_id": 0, "device_type": "sensor-ipad", "ip": "68.161.225.1", "cca3": "USA", "cn": "United States", "temp": 25, "signal": 23, "battery_level": 8, "c02_level": 917, "timestamp" :1475600496 }"""),
  (1, """{"device_id": 1, "device_type": "sensor-igauge", "ip": "213.161.254.1", "cca3": "NOR", "cn": "Norway", "temp": 30, "signal": 18, "battery_level": 6, "c02_level": 1413, "timestamp" :1475600498 }"""),
  (2, """{"device_id": 2, "device_type": "sensor-ipad", "ip": "88.36.5.1", "cca3": "ITA", "cn": "Italy", "temp": 18, "signal": 25, "battery_level": 5, "c02_level": 1372, "timestamp" :1475600500 }"""),
  (3, """{"device_id": 3, "device_type": "sensor-inest", "ip": "66.39.173.154", "cca3": "USA", "cn": "United States", "temp": 47, "signal": 12, "battery_level": 1, "c02_level": 1447, "timestamp" :1475600502 }"""),
  (4, """{"device_id": 4, "device_type": "sensor-ipad", "ip": "203.82.41.9", "cca3": "PHL", "cn": "Philippines", "temp": 29, "signal": 11, "battery_level": 0, "c02_level": 983, "timestamp" :1475600504 }"""),
  (5, """{"device_id": 5, "device_type": "sensor-istick", "ip": "204.116.105.67", "cca3": "USA", "cn": "United States", "temp": 50, "signal": 16, "battery_level": 8, "c02_level": 1574, "timestamp" :1475600506 }"""),
  (6, """{"device_id": 6, "device_type": "sensor-ipad", "ip": "220.173.179.1", "cca3": "CHN", "cn": "China", "temp": 21, "signal": 18, "battery_level": 9, "c02_level": 1249, "timestamp" :1475600508 }"""),
  (7, """{"device_id": 7, "device_type": "sensor-ipad", "ip": "118.23.68.227", "cca3": "JPN", "cn": "Japan", "temp": 27, "signal": 15, "battery_level": 0, "c02_level": 1531, "timestamp" :1475600512 }"""),
  (8, """ {"device_id": 8, "device_type": "sensor-inest", "ip": "208.109.163.218", "cca3": "USA", "cn": "United States", "temp": 40, "signal": 16, "battery_level": 9, "c02_level": 1208, "timestamp" :1475600514 }"""),
  (9, """{"device_id": 9, "device_type": "sensor-ipad", "ip": "88.213.191.34", "cca3": "ITA", "cn": "Italy", "temp": 19, "signal": 11, "battery_level": 0, "c02_level": 1171, "timestamp" :1475600516 }""")
).toDF("id", "json")

display(eventsFromJSONDF)

In [0]:
%scala
// Lab 01d: Pull individual fields out of the "json" column by JSON path with 'get_json_object'

import org.apache.spark.sql.functions.get_json_object

// Keep the id and expose device_type, ip and cca3 as their own columns.
val jsDF = eventsFromJSONDF.select(
  $"id",
  get_json_object($"json", "$.device_type").alias("device_type"),
  get_json_object($"json", "$.ip").alias("ip"),
  get_json_object($"json", "$.cca3").alias("cca3")
)

display(jsDF)

### Lab 02: Create JSON schema using STRUCT

In [0]:
%scala
// Lab 02a: Declare the schema of the device JSON documents as a StructType

import org.apache.spark.sql.types._       // Spark SQL data types used to declare the schema
import org.apache.spark.sql.functions._   // Spark SQL helper functions

// One field per JSON key, added in the same order as before.
val jsonSchema = new StructType()
  .add("battery_level", LongType)
  .add("c02_level", LongType)
  .add("cca3", StringType)
  .add("cn", StringType)
  .add("device_id", LongType)
  .add("device_type", StringType)
  .add("signal", LongType)
  .add("ip", StringType)
  .add("temp", LongType)
  .add("timestamp", TimestampType)

In [0]:
%scala
// Lab 02b: Create DataFrame using Schema from previous Cell

import org.apache.spark.sql.types._                         // include the Spark Types to define our schema
import org.apache.spark.sql.functions._                     // include the Spark helper functions

// Row type for the typed Dataset built below: an integer id plus the raw JSON text of the device
case class DeviceData(
  id: Int,
  device: String
)

// Sample data: twenty (id, JSON text) pairs, named "id" / "device" and converted to Dataset[DeviceData]
val eventsDS = Seq(
  (0, """{"device_id": 0, "device_type": "sensor-ipad", "ip": "68.161.225.1", "cca3": "USA", "cn": "United States", "temp": 25, "signal": 23, "battery_level": 8, "c02_level": 917, "timestamp" :1475600496 }"""),
  (1, """{"device_id": 1, "device_type": "sensor-igauge", "ip": "213.161.254.1", "cca3": "NOR", "cn": "Norway", "temp": 30, "signal": 18, "battery_level": 6, "c02_level": 1413, "timestamp" :1475600498 }"""),
  (2, """{"device_id": 2, "device_type": "sensor-ipad", "ip": "88.36.5.1", "cca3": "ITA", "cn": "Italy", "temp": 18, "signal": 25, "battery_level": 5, "c02_level": 1372, "timestamp" :1475600500 }"""),
  (3, """{"device_id": 3, "device_type": "sensor-inest", "ip": "66.39.173.154", "cca3": "USA", "cn": "United States", "temp": 47, "signal": 12, "battery_level": 1, "c02_level": 1447, "timestamp" :1475600502 }"""),
  (4, """{"device_id": 4, "device_type": "sensor-ipad", "ip": "203.82.41.9", "cca3": "PHL", "cn": "Philippines", "temp": 29, "signal": 11, "battery_level": 0, "c02_level": 983, "timestamp" :1475600504 }"""),
  (5, """{"device_id": 5, "device_type": "sensor-istick", "ip": "204.116.105.67", "cca3": "USA", "cn": "United States", "temp": 50, "signal": 16, "battery_level": 8, "c02_level": 1574, "timestamp" :1475600506 }"""),
  (6, """{"device_id": 6, "device_type": "sensor-ipad", "ip": "220.173.179.1", "cca3": "CHN", "cn": "China", "temp": 21, "signal": 18, "battery_level": 9, "c02_level": 1249, "timestamp" :1475600508 }"""),
  (7, """{"device_id": 7, "device_type": "sensor-ipad", "ip": "118.23.68.227", "cca3": "JPN", "cn": "Japan", "temp": 27, "signal": 15, "battery_level": 0, "c02_level": 1531, "timestamp" :1475600512 }"""),
  (8, """ {"device_id": 8, "device_type": "sensor-inest", "ip": "208.109.163.218", "cca3": "USA", "cn": "United States", "temp": 40, "signal": 16, "battery_level": 9, "c02_level": 1208, "timestamp" :1475600514 }"""),
  (9, """{"device_id": 9, "device_type": "sensor-ipad", "ip": "88.213.191.34", "cca3": "ITA", "cn": "Italy", "temp": 19, "signal": 11, "battery_level": 0, "c02_level": 1171, "timestamp" :1475600516 }"""),
  (10, """{"device_id": 10, "device_type": "sensor-igauge", "ip": "68.28.91.22", "cca3": "USA", "cn": "United States", "temp": 32, "signal": 26, "battery_level": 7, "c02_level": 886, "timestamp" :1475600518 }"""),
  (11, """{"device_id": 11, "device_type": "sensor-ipad", "ip": "59.144.114.250", "cca3": "IND", "cn": "India", "temp": 46, "signal": 25, "battery_level": 4, "c02_level": 863, "timestamp" :1475600520 }"""),
  (12, """{"device_id": 12, "device_type": "sensor-igauge", "ip": "193.156.90.200", "cca3": "NOR", "cn": "Norway", "temp": 18, "signal": 26, "battery_level": 8, "c02_level": 1220, "timestamp" :1475600522 }"""),
  (13, """{"device_id": 13, "device_type": "sensor-ipad", "ip": "67.185.72.1", "cca3": "USA", "cn": "United States", "temp": 34, "signal": 20, "battery_level": 8, "c02_level": 1504, "timestamp" :1475600524 }"""),
  (14, """{"device_id": 14, "device_type": "sensor-inest", "ip": "68.85.85.106", "cca3": "USA", "cn": "United States", "temp": 39, "signal": 17, "battery_level": 8, "c02_level": 831, "timestamp" :1475600526 }"""),
  (15, """{"device_id": 15, "device_type": "sensor-ipad", "ip": "161.188.212.254", "cca3": "USA", "cn": "United States", "temp": 27, "signal": 26, "battery_level": 5, "c02_level": 1378, "timestamp" :1475600528 }"""),
  (16, """{"device_id": 16, "device_type": "sensor-igauge", "ip": "221.3.128.242", "cca3": "CHN", "cn": "China", "temp": 10, "signal": 24, "battery_level": 6, "c02_level": 1423, "timestamp" :1475600530 }"""),
  (17, """{"device_id": 17, "device_type": "sensor-ipad", "ip": "64.124.180.215", "cca3": "USA", "cn": "United States", "temp": 38, "signal": 17, "battery_level": 9, "c02_level": 1304, "timestamp" :1475600532 }"""),
  (18, """{"device_id": 18, "device_type": "sensor-igauge", "ip": "66.153.162.66", "cca3": "USA", "cn": "United States", "temp": 26, "signal": 10, "battery_level": 0, "c02_level": 902, "timestamp" :1475600534 }"""),
  (19, """{"device_id": 19, "device_type": "sensor-ipad", "ip": "193.200.142.254", "cca3": "AUT", "cn": "Austria", "temp": 32, "signal": 27, "battery_level": 5, "c02_level": 1282, "timestamp" :1475600536 }""")
).toDF("id", "device").as[DeviceData]

display(eventsDS)

In [0]:
%scala
// Lab 02c: Parse the JSON text with 'from_json' (JSON string -> struct described by jsonSchema)

// Step 1: parse the "device" column into a struct column called "devices".
// Step 2: expand the struct's fields into top-level columns.
// Step 3: keep rows with temp above 10 and signal above 15.
val devicesDF = eventsDS
  .select(from_json($"device", jsonSchema).as("devices"))
  .select($"devices.*")
  .filter(($"devices.temp" > 10) and ($"devices.signal" > 15))

display(devicesDF)

In [0]:
%scala
// Lab 02d: Serialise rows with 'to_json' (struct -> JSON string)

// Wrap every column of eventsDS in one struct, render it as JSON text, and name the column "devices".
val stringJsonDF = eventsDS
  .select(to_json(struct($"*")))
  .toDF("devices")

display(stringJsonDF)

### Lab 03: Query DataFrame using 'selectExpr()'

In [0]:
%scala
// Lab 03: Query DataFrames with SQL expression strings via 'selectExpr'

// Cast both columns of eventsDS explicitly using SQL CAST expressions.
val stringsDF = eventsDS.selectExpr(
  "CAST(id AS INT)",
  "CAST(device AS STRING)"
)

stringsDF.printSchema()

stringsDF.show(4, false)

// Ratio of c02_level to temp (rounded), largest first; uses devicesDF from Lab 02c.
devicesDF
  .selectExpr("c02_level", "round(c02_level/temp) as ratio_c02_temperature")
  .orderBy($"ratio_c02_temperature".desc)
  .show(4, false)

### Lab 04: Create Complex Data Type Schema from JSON Data

In [0]:
%scala
// Lab 04a: Create a complex data type schema (struct -> map -> struct -> struct) for JSON data

import org.apache.spark.sql.types._

// Innermost struct: the "geo" coordinates of a source.
val geoSchema = new StructType()
  .add("lat", DoubleType)
  .add("long", DoubleType)

// Struct stored as each map value: the attributes of one alarm source.
val sourceSchema = new StructType()
  .add("description", StringType)
  .add("ip", StringType)
  .add("id", LongType)
  .add("temp", LongType)
  .add("c02_level", LongType)
  .add("geo", geoSchema)

// Top level: dc_id plus "source", a map from a string key to the struct above.
val schema1 = new StructType()
  .add("dc_id", StringType)
  .add("source", MapType(StringType, sourceSchema))

In [0]:
%scala
// Lab 04b: Create a single-element Dataset[String] holding one nested JSON document.
// The document has a "dc_id" string and a "source" object with four entries
// (sensor-igauge, sensor-ipad, sensor-inest, sensor-istick); each entry carries
// id, ip, description, temp, c02_level and a nested "geo" object with lat/long.

val dataDS = Seq("""
{
"dc_id": "dc-101",
"source": {
    "sensor-igauge": {
      "id": 10,
      "ip": "68.28.91.22",
      "description": "Sensor attached to the container ceilings",
      "temp":35,
      "c02_level": 1475,
      "geo": {"lat":38.00, "long":97.00}                        
    },
    "sensor-ipad": {
      "id": 13,
      "ip": "67.185.72.1",
      "description": "Sensor ipad attached to carbon cylinders",
      "temp": 34,
      "c02_level": 1370,
      "geo": {"lat":47.41, "long":-122.00}
    },
    "sensor-inest": {
      "id": 8,
      "ip": "208.109.163.218",
      "description": "Sensor attached to the factory ceilings",
      "temp": 40,
      "c02_level": 1346,
      "geo": {"lat":33.61, "long":-111.89}
    },
     "sensor-istick": {
      "id": 5,
      "ip": "204.116.105.67",
      "description": "Sensor embedded in exhaust pipes in the ceilings",
      "temp": 40,
      "c02_level": 1574,
      "geo": {"lat":35.93, "long":-85.46}
    }
  }
}""").toDS()

// Show the raw JSON text as stored in the Dataset.
display(dataDS)

In [0]:
%scala
// Lab 04c: Parse the JSON text in dataDS into a DataFrame using the schema from Lab 04a

val df = spark.read
  .schema(schema1)
  .json(dataDS)

df.printSchema()

In [0]:
%scala
// Lab 04d: Display the DataFrame `df` that was read from the JSON Dataset in Lab 04c

display(df)

In [0]:
%scala
// Lab 04e: Apply 'explode' to the "source" map column of df, keeping dc_id alongside it

val explodedDF = df.select(
  $"dc_id",
  explode($"source")
)

explodedDF.printSchema()

explodedDF.show(4, false)

### Lab 05: Create JSON schema and DataFrame using 'getItem()' for next Lab

In [0]:
%scala
// Lab 05a: Create Schema and DF using 'getItem' for next lab

// Case class describing the Scala object each row of the flattened result is converted to
case class DeviceAlert(
  dcId: String,
  deviceType: String,
  ip: String,
  deviceId: Long,
  temp: Long,
  c02_level: Long,
  lat: Double,
  lon: Double
)

// Read each attribute out of the exploded "value" column with getItem(<attribute name>)
// and alias it to the matching DeviceAlert field name.
val notifydevicesDS = explodedDF
  .select(
    $"dc_id".as("dcId"),
    $"key".as("deviceType"),
    $"value".getItem("ip").as("ip"),
    $"value".getItem("id").as("deviceId"),
    $"value".getItem("c02_level").as("c02_level"),
    $"value".getItem("temp").as("temp"),
    $"value".getItem("geo").getItem("lat").as("lat"),   // nested attribute: one getItem per level
    $"value".getItem("geo").getItem("long").as("lon")
  )
  .as[DeviceAlert]                                      // return a typed Dataset

In [0]:
%scala
// Lab 05b: Inspect the Dataset built in Lab 05a (schema first, then the first 4 rows untruncated)

notifydevicesDS.printSchema()

notifydevicesDS.show(4, false)

# End of Mod 5: JSON
## Ignore past here

### Create Dataframe using Row function

In [0]:
%py

# Lab 01a: Build nested sample data (departments with employees) out of Row objects.
# WARNING: still WIP - the temperature rows at the bottom of this cell are not used by later cells yet.

# Import only the Row class; it is the only pyspark.sql name this cell needs.
from pyspark.sql import Row

# Create Example Data - Departments and Employees

# Create the Departments
department1 = Row(id='123456', name='Computer Science')
department2 = Row(id='789012', name='Mechanical Engineering')
department3 = Row(id='345678', name='Theater and Drama')
department4 = Row(id='901234', name='Indoor Recreation')

# Create the Employees (Notice nesting data type in a data type).
# Employee is a Row "template": calling it fills the four named fields positionally.
Employee = Row("firstName", "lastName", "email", "salary")
employee1 = Employee('mark', 'ott', 'no-reply@berkeley.edu', 100000)
employee2 = Employee('jarrod', 'johnson', 'no-reply@stanford.edu', 120000)
employee3 = Employee('karen', None, 'no-reply@waterloo.edu', 140000)      # missing lastName
employee4 = Employee(None, 'nimitz', 'no-reply@berkeley.edu', 160000)     # missing firstName
employee5 = Employee('mark', 'maryott', 'no-reply@neverla.nd', 80000)

# Create the DepartmentWithEmployees instances from Departments and Employees
departmentWithEmployees1 = Row(department=department1, employees=[employee1, employee2])
departmentWithEmployees2 = Row(department=department2, employees=[employee3, employee4])
departmentWithEmployees3 = Row(department=department3, employees=[employee5, employee4])
departmentWithEmployees4 = Row(department=department4, employees=[employee2, employee3])

##############################################

# Create the temperature rows (a different Row layout: dt, city, email, mytemp).
# They are kept under their own names. Reassigning employee1/employee2 and rebuilding
# departmentWithEmployees1/2 from them would give the "employees" column fields that do not
# match the firstName/lastName/email/salary fields later cells select by name.
header1 = Row("dt", "city", "email", "mytemp")
temperature1 = header1('2019-07-23', 'Cincinnati', 'no-reply@berkeley.edu', 100000)
temperature2 = header1('jarrod', 'johnson', 'no-reply@stanford.edu', 120000)

In [0]:
%py

# Lab 01b: Turn two lists of department rows into DataFrames with 'createDataFrame'

# Departments 1-2 feed df1, departments 3-4 feed df2.
departmentsWithEmployeesSeq1 = [departmentWithEmployees1, departmentWithEmployees2]
departmentsWithEmployeesSeq2 = [departmentWithEmployees3, departmentWithEmployees4]

df1 = spark.createDataFrame(departmentsWithEmployeesSeq1)
df2 = spark.createDataFrame(departmentsWithEmployeesSeq2)

# Print up to 20 rows of each DataFrame without truncating long values.
for frame in (df1, df2):
    frame.show(20, False)

In [0]:
# Lab 01c:  View the data types: print the schema of df1

df1.printSchema()

In [0]:
%py

# Lab 01d:  Use 'display' to view the contents of df1 in the notebook

display(df1)

### Union 2 Dataframes

In [0]:
# Combine the rows of df1 and df2 into one DataFrame, then show the result
unionDF = df1.union(df2)
display(unionDF)

### 'explode' the Employees column

In [0]:
%py
from pyspark.sql.functions import explode

# explode takes a single row of values and generates multiple rows (1-M).
# Used frequently with complex data types (more on that later).

# One row per element of the "employees" array, exposed as column "e".
explodeDF = unionDF.select(explode("employees").alias("e"))

# Promote the fields of "e" to top-level columns.
employee_fields = ["e.firstName", "e.lastName", "e.email", "e.salary"]
flattenDF = explodeDF.selectExpr(*employee_fields)

display(flattenDF)

### 'filter' to return rows that match a Predicate

In [0]:
from pyspark.sql.functions import col, asc

# Column predicates are combined with `|` instead of Python's `or`.
is_mark = col("firstName") == "mark"
is_jarrod = col("firstName") == "jarrod"

# Keep rows matching either first name, ordered by lastName ascending.
filterDF = flattenDF.filter(is_mark | is_jarrod).sort(asc("lastName"))
display(filterDF)

### Using 'fillna' to replace 'NULL' values

In [0]:
# Replace NULL values with the placeholder "--" (na.fill is the alias of fillna).
nonNullDF = flattenDF.na.fill("--")
display(nonNullDF)

### Using 'isNull', Retrieve only rows with missing firstName or lastName

In [0]:
# Note: despite its name, filterNonNullDF holds the rows where firstName OR lastName is NULL.
# `col` comes from the earlier import of pyspark.sql.functions.
missing_first_name = col("firstName").isNull()
missing_last_name = col("lastName").isNull()

filterNonNullDF = flattenDF.filter(missing_first_name | missing_last_name).sort("email")
display(filterNonNullDF)

### Example aggregations using 'agg' and 'countDistinct'

In [0]:
from pyspark.sql.functions import countDistinct

# For every firstName, count how many different lastName values occur.
countDistinctDF = (
    nonNullDF
    .select("firstName", "lastName")
    .groupBy("firstName")
    .agg(countDistinct("lastName").alias("distinct_last_names"))
)

display(countDistinctDF)

In [0]:
%py

# To run SQL-like syntax in Python, pass the query text to spark.sql().

# The query reads the temp view "databricks_df_example", so register nonNullDF under that
# name first; otherwise this cell fails when the notebook is run top to bottom.
# createOrReplaceTempView replaces an existing view of the same name, so repeating the
# registration in a later cell is harmless.
nonNullDF.createOrReplaceTempView("databricks_df_example")

countDistinctDF_sql = spark.sql('''
  SELECT firstName, count(distinct lastName) AS distinct_last_names
  FROM databricks_df_example
  GROUP BY firstName
''')

display(countDistinctDF_sql)

### 'Explain'

In [0]:
# Print the query plan of the countDistinctDF aggregation
countDistinctDF.explain()

### 'createOrReplaceTempView' to register a DataFrame as a Temp View so it can be queried via SQL

In [0]:
# Register the DataFrame nonNullDF as the Temp View "databricks_df_example"
# so that we can query it by that name using SQL
nonNullDF.createOrReplaceTempView("databricks_df_example")

In [0]:
%sql

-- Pure SQL form of the aggregation: distinct last names per first name,
-- read from the temp view databricks_df_example

SELECT
  firstName,
  COUNT(DISTINCT lastName) AS distinct_last_names
FROM databricks_df_example
GROUP BY firstName;