In [0]:
# If you need to repeat the labs, you may have to run this first
# display(dbutils.fs.rm("dbfs:/user/hive/warehouse/", True))

# Mod 08: Catalog, Catalyst, Tungsten

## Lab 01: Catalog - List Hive Databases, Tables, TempViews

In [0]:
%scala
// Lab 01a: List the databases known to the Spark catalog (Scala)

val catalogDatabases = spark.catalog.listDatabases
display(catalogDatabases)

In [0]:
# Lab 01b: List the databases known to the Spark catalog (Python)
catalog_databases = spark.catalog.listDatabases()
display(catalog_databases)

In [0]:
%scala
// Lab 01c: Build a small DataFrame, register it as the TempView "temp_view1",
//          then list the catalog's tables to see the view
val sampleRows = Seq((1, "andy"), (2, "bob"), (2, "andy"))
val df = sampleRows.toDF("count", "name")
df.createOrReplaceTempView("temp_view1")

val tablesAfterTempView = spark.catalog.listTables
display(tablesAfterTempView)

In [0]:
# List the Hive warehouse directory at this point in the lab (before the CREATE TABLE cells below)
warehouse_entries = dbutils.fs.ls("dbfs:/user/hive/warehouse/")
warehouse_entries

In [0]:
%sql
-- Lab 01c (cont.): Persist the rows of the TempView as a permanent table
--                  (no LOCATION clause is given for this one)
CREATE TABLE IF NOT EXISTS perm_mng_table1
AS
SELECT * FROM temp_view1;

In [0]:
%sql
-- Lab 01c (cont.): Create a second permanent table from the TempView,
--                  this time with its data stored at an explicit LOCATION
CREATE TABLE IF NOT EXISTS perm_ext_table1
LOCATION '/tmp/ext2/'
AS
SELECT * FROM temp_view1;

In [0]:
%scala
// Lab 01d: List the catalog's tables again, after the two CREATE TABLE statements above
val tablesAfterCreate = spark.catalog.listTables
display(tablesAfterCreate)

In [0]:
# List the Hive warehouse directory again, now that the tables have been created
warehouse_after_create = dbutils.fs.ls("dbfs:/user/hive/warehouse/")
display(warehouse_after_create)

In [0]:
%sql
-- Lab 01e: HCAT allows you to view details about your table.
--          DESCRIBE TABLE returns the table's basic metadata for perm_mng_table1.
DESCRIBE TABLE perm_mng_table1

In [0]:
%scala
// Lab 01f: Show the table's columns (names, data types and other attributes) via the catalog API
val managedTableColumns = spark.catalog.listColumns("perm_mng_table1")
display(managedTableColumns)

In [0]:
%sql
-- Lab 01g: HCAT shows the DDL (CREATE TABLE) statement for your table
SHOW CREATE TABLE perm_ext_table1;

In [0]:
%scala
// Lab 01h: Drop the TempView and confirm it is removed from the catalog.
// The view was registered as "temp_view1" in Lab 01c, so that exact name must be used here:
// dropTempView returns a Boolean rather than raising when the name does not match,
// so a misspelled name leaves the view in place without any error.
spark.catalog.dropTempView("temp_view1")
display(spark.catalog.listTables)

In [0]:
%scala
// Lab 01i: List the functions registered in the Spark catalog
val catalogFunctions = spark.catalog.listFunctions()
display(catalogFunctions)

In [0]:
# Lab 01j: The Spark catalog can tell whether an object is cached or not.
# Register the JSON file as TempView 'temp_view1', then ask the catalog about it.
names_df = spark.read.format("json").load("dbfs:/FileStore/tables/names1.json")
names_df.createOrReplaceTempView("temp_view1")

view_is_cached = spark.catalog.isCached('temp_view1')
print(view_is_cached)

In [0]:
# Lab 01k: Next, cache the view with a SQL statement and ask the catalog again
#          whether it is cached.
spark.sql("cache table temp_view1")

view_is_cached_now = spark.catalog.isCached('temp_view1')
print(view_is_cached_now)

## Lab 02: Catalyst Optimizer

In [0]:
# Lab 02a: First, disable features that would otherwise influence the plans shown below
#          (IO cache, adaptive query execution, automatic broadcast joins)
side_effect_settings = {
    "spark.databricks.io.cache.enabled": False,
    "spark.sql.adaptive.enabled": False,
    "spark.sql.autoBroadcastJoinThreshold": -1,
}
for conf_key, conf_value in side_effect_settings.items():
    spark.conf.set(conf_key, conf_value)

In [0]:
# Lab 02b: First create some DataFrames and TempViews
empDF = spark.read.parquet("dbfs:/FileStore/tables/emp_snappy.parquet/")
deptDF = spark.read.parquet("dbfs:/FileStore/tables/dept_snappy.parquet/")

# Register the DataFrames loaded above as TempViews instead of reading each
# parquet path a second time; the views and the DataFrames share one definition.
empDF.createOrReplaceTempView("emp_view")
deptDF.createOrReplaceTempView("dept_view")

display(empDF)
display(deptDF)

In [0]:
# Lab 02c: Print all plans for a JOIN followed by a WHERE filter.
#          In the first two plans Catalyst does the JOIN and then the FILTER;
#          in the Optimized Plan the FILTER is moved before the JOIN for better performance.
emp_dept_join_sql = "SELECT e.last_name, d.dept FROM emp_view e INNER JOIN dept_view d ON e.dept=d.dept WHERE e.dept = 301"
emp_dept_join_df = spark.sql(emp_dept_join_sql)
emp_dept_join_df.explain(True)

In [0]:
# Lab 02d: Notice the size = 46 MB for both files
# NOTE(review): this lists "dbfs:/FileStore/tables/cops", while Lab 02e loads
# "dbfs:/FileStore/tables/cops_02_snappy.parquet" / "cops_03_snappy.parquet" -- confirm the path.
cops_listing = dbutils.fs.ls("dbfs:/FileStore/tables/cops")
display(cops_listing)

In [0]:
# Lab 02e: Create TempViews over the two parquet files
cops_02_df = spark.read.format("parquet").load("dbfs:/FileStore/tables/cops_02_snappy.parquet")
cops_02_df.createOrReplaceTempView("cops_view1")

cops_03_df = spark.read.format("parquet").load("dbfs:/FileStore/tables/cops_03_snappy.parquet")
cops_03_df.createOrReplaceTempView("cops_view2")

# Use the 'explain' method so there is no need to open the Spark UI 'SQL' tab.
# Without a hint, Catalyst uses Join Strategy = 'SortMergeJoin'
cops_join_sql = "SELECT a.Category, b.PdDistrict FROM cops_view1 a JOIN cops_view2 b ON a.Time = b.Time"
spark.sql(cops_join_sql).explain(True)

In [0]:
# Lab 02f: Run the join, then open the Spark UI 'SQL' tab to confirm SortMergeJoin
cops_join_df = spark.sql("SELECT a.Category, b.PdDistrict FROM cops_view1 a JOIN cops_view2 b ON a.Time = b.Time")
display(cops_join_df)

In [0]:
# Lab 02g: Tell Catalyst it's OK to do a BroadcastHashJoin on tables up to 50 MB (52428800 bytes)
spark.sql("SET spark.sql.autoBroadcastJoinThreshold = 52428800")

# Lab 02h: Add a hint telling Catalyst to broadcast the 'cops_view2' side of the join.
# 'cops_view2' is aliased as 'b' in the FROM clause, so the hint must name the alias:
# Spark matches join hints against the relation's alias when one is present, and a hint
# that names the underlying view would not be matched to the aliased relation.
spark.sql("SELECT /*+ BROADCAST(b) */ * FROM cops_view1 a JOIN cops_view2 b ON a.Time = b.Time").explain(True)

In [0]:
%sql
-- Lab 02i: Run the join, then in the Spark UI open the SQL tab
--          and confirm Join Strategy = BroadcastHashJoin
SELECT
  e.last_name,
  e.first_name,
  d.dept,
  e.dept,
  d.dept
FROM emp_view e
JOIN dept_view d
  ON e.dept = d.dept
WHERE e.dept = 301

## Lab 03: Catalyst Column Pruning

In [0]:
# Lab 03b: Create 2 DataFrames over the same data (one read from CSV, one from Delta)
from pyspark.sql.types import StructType, StructField, StringType

# Every column is read as a nullable string; only the column names differ.
police_columns = [
    'IncidentNum', 'Category', 'Description', 'DayOfWeek', 'Date', 'Time', 'PdDistrict',
    'Resolution', 'Address', 'X', 'Y', 'Loc', 'PdId',
]
policeSchema = StructType([StructField(column_name, StringType(), True) for column_name in police_columns])

csv_reader = spark.read.schema(policeSchema)
CSVColPruneDF = csv_reader.csv("dbfs:/FileStore/tables/sfpd1/sf101")
display(CSVColPruneDF)

# Write the CSV data out as Delta (replacing any previous copy) and read it back
delta_writer = CSVColPruneDF.write.format("delta").mode("overwrite")
delta_writer.save("/tmp/delta_colPrune/")
DeltaColPruneDF = spark.read.format("delta").load("/tmp/delta_colPrune/")

In [0]:
# Lab 03c: Attempt column pruning on the CSV-backed DataFrame (1 of 2)
# After running, go to the Spark UI, SQL tab, to inspect the scan.

csv_two_columns = CSVColPruneDF.select("Category", "Description")
display(csv_two_columns)

In [0]:
# Lab 03d: Attempt column pruning on the Delta-backed DataFrame (2 of 2)
# After running, go to the Spark UI, SQL tab, and compare the scan with Lab 03c.

delta_two_columns = DeltaColPruneDF.select("Category", "Description")
display(delta_two_columns)

## Lab 04: Catalyst - Predicate Pushed Filters (WHERE clause)

In [0]:
%scala
// JDBC driver, credentials and connection details used to log into PostgreSQL
import org.apache.spark.sql.functions._

// Load the PostgreSQL JDBC driver class so this app can connect to the database
Class.forName("org.postgresql.Driver")

// NOTE(review): the credentials are hard-coded; they look like a shared read-only
// training login -- confirm, and keep real credentials out of the notebook.
val connectionProperties = new java.util.Properties()
connectionProperties.setProperty("user", "readonly")
connectionProperties.setProperty("password", "readonly")

val jdbcUrl = "jdbc:postgresql://server1.databricks.training:5432/training"
val tableName = "training.people_1m"

In [0]:
%scala
// Query the PostgreSQL table over JDBC with a filter on "id".
// Goal: have PostgreSQL return only the filtered rows instead of the entire table,
//       so Spark only puts into memory what it needs.

val peopleOverJdbc = spark.read.jdbc(jdbcUrl, tableName, connectionProperties) // open the JDBC connection
val df = peopleOverJdbc.where($"id" > 343517)
display(df)

## Lab 05: Partition Pruning

In [0]:
%sql
-- Lab 05a: Before we begin, (re)create the partitioned table 'cust_part2'
--          and insert one row into each of three (state, city) partitions

DROP TABLE IF EXISTS cust_part2;

CREATE TABLE cust_part2 (
  id   INT,
  name STRING
)
PARTITIONED BY (state STRING, city STRING);

INSERT INTO cust_part2 PARTITION (state = 'CA', city = 'Fremont')  VALUES (100, 'Al');
INSERT INTO cust_part2 PARTITION (state = 'CA', city = 'San Jose') VALUES (200, 'Bo');
INSERT INTO cust_part2 PARTITION (state = 'AZ', city = 'Peoria')   VALUES (300, 'Cy');

In [0]:
# Lab 05b: Inspect the partition directories of table 'cust_part2'
#          (one listing per 'state' value inserted in Lab 05a)

for state_dir in ("state=AZ", "state=CA"):
    display(dbutils.fs.ls("dbfs:/user/hive/warehouse/cust_part2/" + state_dir + "/"))

In [0]:
# Lab 05c: Load the partitioned table into a DataFrame and display its rows
#          (the 3 rows inserted in Lab 05a, one per partition directory)

partitionDF = spark.read.table("cust_part2")
display(partitionDF)

In [0]:
%sql
-- Lab 05d: Describe the partitioned table 'cust_part2' (created with partition columns 'state' and 'city')
DESCRIBE cust_part2;

In [0]:
%sql
-- Lab 05e: Query without a WHERE clause on the partition column(s)
-- After running, go to the Spark UI and drill down to DETAILS to see how many files were read

SELECT * FROM cust_part2;

In [0]:
%sql
-- Lab 05f: Query with a WHERE clause on both partitioning columns, 'city' and 'state'.
--  In the Spark UI, check that only the partition
--  /user/hive/warehouse/cust_part2/state=CA/city=Fremont was scanned.

SELECT city
FROM cust_part2
WHERE city = 'Fremont'
  AND state = 'CA';

In [0]:
# Lab 05g: Another partitioning lab, using a DataFrame instead of a table

csv_with_header = spark.read.option("header", True).option("inferSchema", True)
autoDF = csv_with_header.csv("dbfs:/FileStore/tables/autos.csv")

# Write the data as Delta, partitioned by 'yearOfRegistration', then read it back
(
    autoDF.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("yearOfRegistration")
    .save("/tmp/autoDelta")
)
PartAutoDF = spark.read.format("delta").load("/tmp/autoDelta")

# Check out the partition directories (one per year)
auto_delta_listing = dbutils.fs.ls("dbfs:/tmp/autoDelta/")
display(auto_delta_listing)

In [0]:
from pyspark.sql.functions import *
# Lab 05h: Query the partitioned DataFrame
# Partitioning column = 'yearOfRegistration'
registration_years = [2015, 2016]
auto_columns = PartAutoDF.select(PartAutoDF["brand"], PartAutoDF["price"], PartAutoDF["yearOfRegistration"])
autos_in_years = auto_columns.filter(PartAutoDF["yearOfRegistration"].isin(registration_years))
display(autos_in_years)

## Lab 06: Tungsten - Improved Memory Usage

In [0]:
# Do this first to prevent side effects: turn off the Databricks IO cache
# (the same setting Lab 02a applies) before comparing cached sizes below
spark.conf.set("spark.databricks.io.cache.enabled", False)

In [0]:
%scala
// Lab 06a: Improved memory usage in cache
// Persist the same CSV file once as an RDD and once as a DataFrame, then open the
// 'Storage' tab in the Spark UI to compare the RAM size consumed by each.
import org.apache.spark.storage.StorageLevel

val autosCsvPath = "dbfs:/FileStore/tables/autos.csv"

// RDD version: split every line on commas, persist, and run count() as an action
val rdd1 = sc.textFile(autosCsvPath).map(line => line.split(","))
val rdd2 = rdd1.persist(StorageLevel.MEMORY_ONLY_SER)
rdd2.count()

// DataFrame version: same file and same storage level, again followed by count()
val df1 = spark.read
  .option("header" , "true")
  .option("inferSchema", "true")
  .csv(autosCsvPath)
val df2 = df1.persist(StorageLevel.MEMORY_ONLY_SER)
df2.count()

In [0]:
# Lab 06c: Sum the ids above 100 from a 1000-row range; after it executes,
#          go to the 'SQL' tab in the Spark UI to view the query
id_sum_df = spark.range(1000).filter("id > 100").selectExpr("sum(id)")
id_sum_df.show()

# End of Module 08: Catalog, Catalyst, Tungsten