# Chapter 3: DataFrames, Datasets, and Spark SQL

In [1]:
import $ivy.`org.apache.spark::spark-sql:3.0.0`
import $ivy.`org.apache.spark::spark-hive:3.0.0`

[32mimport [39m[36m$ivy.$                                  
[39m
[32mimport [39m[36m$ivy.$                                   [39m

In [2]:
import org.apache.log4j.{Level, Logger}
// Switch off log output for every logger under the "org" namespace
// (this includes the org.apache.spark loggers used by the imports in this notebook).
Logger.getLogger("org").setLevel(Level.OFF);

[32mimport [39m[36morg.apache.log4j.{Level, Logger}
[39m

In [3]:
import org.apache.spark.sql.{Dataset, DataFrame, SparkSession, Row}
import org.apache.spark.sql.catalyst.expressions.aggregate._
import org.apache.spark.sql.expressions._
import org.apache.spark.sql.functions._

[32mimport [39m[36morg.apache.spark.sql.{Dataset, DataFrame, SparkSession, Row}
[39m
[32mimport [39m[36morg.apache.spark.sql.catalyst.expressions.aggregate._
[39m
[32mimport [39m[36morg.apache.spark.sql.expressions._
[39m
[32mimport [39m[36morg.apache.spark.sql.functions._[39m

In [4]:
// Session shared by the rest of the notebook: application "chap03-01", run against the
// master at spark://spark-master:7077, with 512 MB per executor and Hive support enabled.
val spark = {
    val configuredBuilder = SparkSession.builder()
        .appName("chap03-01")
        .master("spark://spark-master:7077")
        .config("spark.executor.memory", "512m")

    // getOrCreate() reuses an already-running session when one exists.
    configuredBuilder.enableHiveSupport().getOrCreate()
}

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties


[36mspark[39m: [32mSparkSession[39m = org.apache.spark.sql.SparkSession@29f5863b

In [5]:
import spark.implicits._

[32mimport [39m[36mspark.implicits._[39m

In [6]:
// Load the raw panda data from JSON and let Spark infer the schema.
// NOTE(review): the recorded output shows only a `_corrupt_record` column, i.e. no line of the
// file parsed as a standalone JSON object. If the file is pretty-printed over several lines,
// reading it with `.option("multiLine", true)` should yield the nested schema instead —
// confirm against the actual data file before changing.
val df1 = spark.read.json("data/rawpanda.json")

[36mdf1[39m: [32mDataFrame[39m = [_corrupt_record: string]

In [7]:
// Print the schema Spark inferred for df1 as a tree.
df1.printSchema()

root
 |-- _corrupt_record: string (nullable = true)



**JSON Example**

```
{
    "name":"mission",
    "pandas":[
        {
            "id":1,
            "zip":"94110",
            "pt":"giant",
            "happy":true,
            "attributes":[0.4,0.5]
        }
    ]
}
```

**Sample Schema Information for nested structure**

```
root
 |-- name: string (nullable = true)
 |-- pandas: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: long (nullable = false)
 |    |    |-- zip: string (nullable = true)
 |    |    |-- pt: string (nullable = true)
 |    |    |-- happy: boolean (nullable = false)
 |    |    |-- attributes: array (nullable = true)
 |    |    |    |-- element: double (containsNull = false)
```

In [8]:
/** One panda record; used to build a DataFrame and print the schema derived from it.
  *
  * @param id         numeric identifier of the panda
  * @param zip        postal code, kept as a string (e.g. "94110")
  * @param pt         short type label (e.g. "giant" in the sample data)
  * @param happy      happiness flag
  * @param attributes numeric attribute values
  */
case class RawPanda(id: Long, zip: String, pt: String, happy: Boolean, attributes: Array[Double])

/** A named place together with its pandas; `pandas` becomes an array-of-struct column.
  *
  * @param name   name of the place
  * @param pandas the panda records belonging to this place
  */
case class PandaPlace(name: String, pandas: Array[RawPanda])

defined [32mclass[39m [36mRawPanda[39m
defined [32mclass[39m [36mPandaPlace[39m

**Create a DataFrame from the case classes**

In [9]:
/** Builds a one-row DataFrame from a sample `PandaPlace` and prints the schema
  * Spark derives from the `PandaPlace` / `RawPanda` case classes.
  */
def createAndPrintSchema(): Unit = {
    val samplePanda = RawPanda(
        id = 1,
        zip = "M1B 5K7",
        pt = "giant",
        happy = true,
        attributes = Array(0.1, 0.1)
    )
    val places = Seq(PandaPlace(name = "toronro", pandas = Array(samplePanda)))
    spark.createDataFrame(places).printSchema()
}

defined [32mfunction[39m [36mcreateAndPrintSchema[39m

In [10]:
// Run the helper to print the schema derived from the case classes.
createAndPrintSchema()

root
 |-- name: string (nullable = true)
 |-- pandas: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: long (nullable = false)
 |    |    |-- zip: string (nullable = true)
 |    |    |-- pt: string (nullable = true)
 |    |    |-- happy: boolean (nullable = false)
 |    |    |-- attributes: array (nullable = true)
 |    |    |    |-- element: double (containsNull = false)



**Sample schema information for nested structure (`.schema`)—manually formatted**

```
org.apache.spark.sql.types.StructType = StructType(
    StructField(name,StringType,true),
    StructField(pandas,
        ArrayType(
            StructType(StructField(id,LongType,false),
                       StructField(zip,StringType,true),
                       StructField(pt,StringType,true),
                       StructField(happy,BooleanType,false),
                       StructField(attributes,ArrayType(DoubleType,false),true)
                      ),
            true),
        true)
)
```

In [11]:
// Stop the Spark session now that the notebook is finished.
spark.stop()