In [0]:
# Setup: Before we begin, confirm all lab files are loaded.
# The listing should have 56 rows if you loaded everything correctly.
display(dbutils.fs.ls("dbfs:/FileStore/tables/"))

# Mod 10: Performance Tuning

In [0]:
# Lab 00: First, disable features whose side effects would distort the timings:
#   - the Databricks IO cache
#   - Adaptive Query Execution (AQE)
spark.conf.set("spark.databricks.io.cache.enabled", False)
spark.conf.set("spark.sql.adaptive.enabled", False)

## Lab 01: Spark Cache

In [0]:
# Lab 01a: Viewing 'Storage' tab and 'SQL' tab after Caching
# cache() is lazy for DataFrames, so issue count() to actually populate the cache.
# Subsequent queries using this DataFrame should then read from the cache
# instead of from the files.
df1 = spark.read.format("delta").load("/tmp/delta_colPrune/")
display(df1)

# Best to name the DataFrame so users know it's cached
c_DF = df1.cache()
c_DF.count()

In [0]:
# Lab 01b: The subsequent query should be faster since it reads from the cache, not the files
c_DF.count()

In [0]:
# Lab 01c: Getting Cache wrong. First, cache the projection + filter below.
# count() is the action that materializes the (lazy) cache.
copsCache_DF = (
    df1.select("Category", "Description")
       .filter("IncidentNum > 150146449")
       .cache()
)
copsCache_DF.count()

In [0]:
# Lab 01d: Getting Cache wrong (Cache not being used)
# Moved 'filter' in front of the 'select', so the query no longer has the same
# shape as the one cached in Lab 01c.
# NOTE(review): this starts from copsCache_DF rather than df1 - confirm that is intended.
display(copsCache_DF.filter("IncidentNum > 150146449").select("Category", "Description"))

In [0]:
# Lab 01e: Getting Cache wrong again (Cache not being used)
# The filter literal (150146500) is slightly greater than the one used when caching (150146449)
display(copsCache_DF.select("Category", "Description").filter("IncidentNum > 150146500"))

In [0]:
# Lab 01f: Getting Cache wrong again (Cache not being used)
# Only 'Category' is in the 'select' ('Description' was part of the cached query)
display(copsCache_DF.select("Category").filter("IncidentNum > 150146449"))

In [0]:
# Lab 01g: Getting Cache right (Cache being used)
# Use the exact cached object created in Lab 01c, with no extra transformations
display(copsCache_DF)

In [0]:
# Lab 01h: Persist with StorageLevel.DISK_ONLY
# Go to Spark UI and view DAG and 'Storage' tab
from pyspark import StorageLevel

# Expose the parquet file as a temp view, then persist the DataFrame built on that view
spark.read.parquet("dbfs:/FileStore/tables/dept_snappy.parquet/").createOrReplaceTempView("dept_view")
deptDF = spark.table("dept_view")
cachedDeptDF = deptDF.persist(StorageLevel.DISK_ONLY)
# count() is the action that triggers the persist
cachedDeptDF.count()
display(cachedDeptDF)

In [0]:
# Lab 01i: Caching Tables (here: the 'dept_view' temp view) through SQL
# Go to Spark UI and view DAG and 'Storage' tab

spark.sql("cache table dept_view")

In [0]:
# Lab 01j: Unpersist a DataFrame/Table
# Go to Spark UI and view DAG 'Storage' tab.  The objects should be removed

c_DF.unpersist()
copsCache_DF.unpersist()
spark.sql("uncache table dept_view")

In [0]:
# Lab 01k: Use the catalog to remove all remaining data from the cache in one call
spark.catalog.clearCache()

## Lab 02: Delta Cache

In [0]:
# Lab 02a:  Go to Spark UI > Storage tab and confirm Delta Cache is empty

In [0]:
# Lab 02b: Now, enable Delta Cache (it was switched off in the Lab 00 cell)
spark.conf.set("spark.databricks.io.cache.enabled", True)

In [0]:
# Lab 02c: Read the data so that it gets Delta Cached
#          Then go to Spark UI > Storage tab and confirm the cache has been activated
df1 = spark.read.format("delta").load("/tmp/delta_colPrune/")
display(df1)

# 5 Most common Performance issues

## Lab 01: Spill

In [0]:
# 01a: Here's the data set we'll be using for the Spill lab.  It's rather large
display(dbutils.fs.ls("dbfs:/databricks-datasets/asa/airlines/"))

In [0]:
# 01b: Hard-code the schema as a DDL string.
# One "<column> <type>" entry per column, joined with commas (no spaces after the commas).
DDL_Schema = ",".join([
    "Year integer",
    "Month integer",
    "DayofMonth integer",
    "DayOfWeek integer",
    "DepTime string",
    "CRSDepTime integer",
    "ArrTime string",
    "CRSArrTime integer",
    "UniqueCarrier string",
    "FlightNum integer",
    "TailNum string",
    "ActualElapsedTime string",
    "CRSElapsedTime integer",
    "AirTime string",
    "ArrDelay string",
    "DepDelay integer",
    "Origin string",
    "Dest string",
    "Distance integer",
    "TaxiIn integer",
    "TaxiOut integer",
    "Cancelled integer",
    "CancellationCode string",
    "Diverted integer",
    "CarrierDelay string",
    "WeatherDelay string",
    "NASDelay string",
    "SecurityDelay string",
    "LateAircraftDelay string",
])

In [0]:
# 01c: Force the file-read partition size to ~2GB (2,005,000,000 bytes) in an attempt to force Spill
spark.conf.set("spark.sql.files.maxPartitionBytes", 2005000000)

In [0]:
# 01d: Force # of Partitions = 8 via repartition() method
# Let it run for a minute, then from Spark UI, go to 'Stages' tab and look for Spill
# Then 'CANCEL' query and move on to next Cell
flightsDF = (
    spark.read.option("header", True)
    .schema(DDL_Schema)
    .csv("dbfs:/databricks-datasets/asa/airlines/")
    .repartition(8)
)
flightsDF.createOrReplaceTempView("flights_view")

# UNION (not UNION ALL) de-duplicates, which forces a shuffle over the whole data set.
# display() is called exactly once: nesting it would hand the inner call's return
# value (not the DataFrame) to the outer call.
display(spark.sql("SELECT * FROM flights_view v1 UNION SELECT * FROM flights_view"))

In [0]:
# 01e: Configure settings back to Default (128MB = 134,217,728 bytes) and run again,
#      this time without 'repartition(8)'.
#      Query now runs WITHOUT Spill

spark.conf.set("spark.sql.files.maxPartitionBytes", 134217728)
flightsDF = (
    spark.read.option("header", True)
    .schema(DDL_Schema)
    .csv("dbfs:/databricks-datasets/asa/airlines/")
)
flightsDF.createOrReplaceTempView("flights_view")

# display() is called exactly once: nesting it would hand the inner call's return
# value (not the DataFrame) to the outer call.
display(spark.sql("SELECT * FROM flights_view v1 UNION SELECT * FROM flights_view"))

## Lab 02: Skew

In [0]:
%scala
// Lab 02a: Setup for Skew Partitions:  Create Data objects
// Builds two in-memory Datasets (t1: 10,000 rows, t2: 100,000 rows) whose make/model
// values are deliberately skewed towards the first entry of makeModelSet.

import scala.util.Random
import scala.math.BigDecimal

// A make/model pair; both fields are used as the JOIN key in the later cells
case class MakeModel(make: String, model: String)

// Row type of the first data set
case class T1(registration: String, make: String, model: String, engine_size: BigDecimal)

// Row type of the second data set
case class T2(make: String, model: String, engine_size: BigDecimal, sale_price: Double)

    // Candidate make/model pairs; index 0 (FORD FIESTA) is the over-represented one
    val makeModelSet: Seq[MakeModel] = Seq(
      MakeModel("FORD", "FIESTA")
      , MakeModel("NISSAN", "QASHQAI")
      , MakeModel("HYUNDAI", "I20")
      , MakeModel("SUZUKI", "SWIFT")
      , MakeModel("MERCEDED_BENZ", "E CLASS")
      , MakeModel("VAUXHALL", "CORSA")
      , MakeModel("FIAT", "500")
      , MakeModel("SKODA", "OCTAVIA")
      , MakeModel("KIA", "RIO")
    )

    // Half of the time returns entry 0; otherwise picks uniformly among all entries
    // (which can again be entry 0). This is what creates the skew.
    def randomMakeModel(): MakeModel = {
      val makeModelIndex = if (Random.nextBoolean()) 0 else Random.nextInt(makeModelSet.size)
      makeModelSet(makeModelIndex)
    }

    // Random engine size between 1.0 and 1.8 (one decimal digit, 0 to 8)
    def randomEngineSize() = BigDecimal(s"1.${Random.nextInt(9)}")

    // Random registration: 7 alphanumeric characters
    def randomRegistration(): String = s"${Random.alphanumeric.take(7).mkString("")}"

    // Random price between 500 and 5499 (inclusive)
    def randomPrice() = 500 + Random.nextInt(5000)

    // One random T1 row
    def randomT1(): T1 = {
      val makeModel = randomMakeModel()
      T1(randomRegistration(), makeModel.make, makeModel.model, randomEngineSize())
    }

    // One random T2 row
    def randomT2(): T2 = {
      val makeModel = randomMakeModel()
      T2(makeModel.make, makeModel.model, randomEngineSize(), randomPrice())
    }

    // 10,000 random T1 rows as a Dataset
    val t1 = Seq.fill(10000)(randomT1()).toDS()

    // 100,000 random T2 rows as a Dataset
    val t2 = Seq.fill(100000)(randomT2()).toDS()



In [0]:
%scala
// Lab 02b: Setup for Skew Partitions:  Write both Datasets to parquet files
// mode("overwrite") replaces any output left over from a previous run

t1.write.format("parquet").mode("overwrite").save("dbfs:/FileStore/tables/t1")
t2.write.format("parquet").mode("overwrite").save("dbfs:/FileStore/tables/t2")

In [0]:
%scala
// Lab 02c: Load the Data written in Lab 02b, as DataFrames and as temp views

val t1DF = spark.read.parquet("dbfs:/FileStore/tables/t1")
val t2DF = spark.read.parquet("dbfs:/FileStore/tables/t2")

// Register the views from the DataFrames just loaded instead of reading the same files again
t1DF.createOrReplaceTempView("t1_view")
t2DF.createOrReplaceTempView("t2_view")

In [0]:
%sql
-- Lab 02d: Notice the Skew for 'FORD FIESTA'. It has roughly 10x the rows of any other make/model
SELECT make, model, count(*) AS cnt FROM t2_view GROUP BY make, model ORDER BY cnt DESC

In [0]:
%scala
// Lab 02e: View Spark UI to find Bottleneck.  It's a Skew Partition issue (2 minute query)

import org.apache.spark.sql.functions._

// Broadcast joins and AQE would otherwise hide the skew, so both are disabled
// before we JOIN on 'make' and 'model'.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)
spark.conf.set("spark.sql.adaptive.enabled", false)

// Skew eats up 2 Minutes in one of the Stages.  Ouch!!
// Join on make/model, keep pairs whose engine sizes differ by at most 0.1,
// then average the sale price per registration.
// NOTE(review): this cell calls .collect() before display(); Labs 02f/02h/02i
// display the DataFrame directly - confirm the difference is intended.
display(t1DF.join(t2DF, Seq("make", "model"))
.filter(abs(t2DF("engine_size") - t1DF("engine_size")) <= BigDecimal("0.1"))
  .groupBy("registration")
  .agg(avg("sale_price").as("average_price")).collect())

In [0]:
%scala
// Lab 02f: Using 'hint' to minimize Skew

// Ensure that AQE is disabled so that only the hint deals with the skew.
// With spark.sql.adaptive.enabled = false, AQE's skew-join handling
// (spark.sql.adaptive.skewJoin.enabled - note: 'skewJoin', not 'skewedJoin')
// is inactive as well, so no separate skew-join setting is needed here.
spark.conf.set("spark.sql.adaptive.enabled", false)

// See 'hint' at the end of the next line.
// The hinted value must match the stored data exactly: the generated rows use
// upper-case "FIESTA", so a lower-case "fiesta" would never match.
val t2DF = spark.read.parquet("dbfs:/FileStore/tables/t2").hint("skew", "model", "FIESTA")

// Same query as Lab 02e: join on make/model, keep pairs whose engine sizes differ
// by at most 0.1, then average the sale price per registration.
display(t1DF.join(t2DF, Seq("make", "model"))
.filter(abs(t2DF("engine_size") - t1DF("engine_size")) <= BigDecimal("0.1"))
  .groupBy("registration")
  .agg(avg("sale_price").as("average_price")))

In [0]:
%scala
// Lab 02g: Let AQE figure out the Skew problem and fix it automatically
// First configure the Settings

import org.apache.spark.sql.functions._

// Keep Broadcast join disabled (otherwise the skew would not show up),
// then enable AQE together with its skew-join optimization.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)
spark.conf.set("spark.sql.adaptive.enabled", true)
spark.conf.set("spark.sql.adaptive.skewJoin.enabled", true)

// Disable coalesce Partitions so Skew occurs
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", false)
spark.conf.set("spark.sql.shuffle.partitions", 200)

// A partition is considered skewed if its size is larger than this factor multiplied by
// the median partition size AND also larger than
// spark.sql.adaptive.skewJoin.skewedPartitionThresholdInBytes.
// (The key lives under 'spark.sql.adaptive.skewJoin.'; without that prefix the
// setting is silently ignored.)
spark.conf.set("spark.sql.adaptive.skewJoin.skewedPartitionFactor", 2)

// Advisory size of a shuffle partition during adaptive optimization.
// Was 1KB, then 124 (1.95)
spark.conf.set("spark.sql.adaptive.advisoryPartitionSizeInBytes", "1MB")

// A partition is considered skewed if its size in bytes is larger than this threshold AND
// larger than skewedPartitionFactor multiplied by the median partition size.
// Ideally this should be set larger than spark.sql.adaptive.advisoryPartitionSizeInBytes.
// Was 4KB then 512
spark.conf.set("spark.sql.adaptive.skewJoin.skewedPartitionThresholdInBytes","1MB")

In [0]:
%scala
// Lab 02h: Solution: Let AQE figure out the Skew problem and fix it automatically
// Keep tweaking the settings from Lab 02g to get better Performance here
// NOTE(review): t2DF was re-declared with a skew hint in Lab 02f, so this query
// uses the hinted DataFrame - confirm that is intended.

display(t1DF.join(t2DF, Seq("make", "model"))
.filter(abs(t2DF("engine_size") - t1DF("engine_size")) <= BigDecimal("0.1"))
  .groupBy("registration")
  .agg(avg("sale_price").as("average_price")))

In [0]:
%scala
// Lab 02i: Fix Skew by adding 'engine_size' to the JOIN key to get more evenly sized Partitions
//          This is similar to 'salting' the key
//
// Each t1 row is exploded into three rows (engine_size - 0.1, engine_size, engine_size + 0.1),
// so an equi-join on engine_size stands in for the "differ by at most 0.1" filter
// used in the earlier cells.
 display(t1DF.withColumn("engine_size", explode(array($"engine_size" - BigDecimal("0.1"), 
                                           $"engine_size", $"engine_size" + BigDecimal("0.1")))) 
  .join(t2DF, Seq("make", "model", "engine_size")) 
  .groupBy("registration")
  .agg(avg("sale_price").as("average_price")))

## Lab 03: Shuffle

In [0]:
# Lab 03a: Configure settings first
# Disable IO cache so as to minimize side effects
spark.conf.set("spark.databricks.io.cache.enabled", False)

# Disable Broadcast Hash Join (-1 turns automatic broadcasting off)
spark.sql("SET spark.sql.autoBroadcastJoinThreshold = -1")    

# Enable Bucketing
spark.sql("SET spark.sql.sources.bucketing.enabled=true") 

# Disable AQE
spark.conf.set("spark.sql.adaptive.enabled",False)

# Encourage SortMergeJoin
spark.conf.set("spark.sql.join.preferSortMergeJoin", True)

In [0]:
# Lab 03b: Read in the (small) flights CSV, letting Spark infer the column types,
# and expose it as the temp view 'fly_view'
fDF = (spark.read
  .option("header", True)
  .option("inferSchema", True)
  .csv("dbfs:/databricks-datasets/asa/small/small.csv"))

fDF.createOrReplaceTempView("fly_view")

In [0]:
%sql
-- Lab 03c: Create 1st Bucket Table, bucketed on the JOIN key (tailnum) into 42 buckets
DROP TABLE IF EXISTS fly_bucket;

CREATE TABLE fly_bucket(tailnum string, carrier string) 
USING CSV
CLUSTERED BY(tailnum) INTO 42 BUCKETS;

In [0]:
%sql
-- Lab 03d: Populate 1st Bucket Table from the 'fly_view' temp view
INSERT INTO fly_bucket (tailnum, carrier) SELECT TailNum, UniqueCarrier from fly_view

In [0]:
%sql
-- Lab 03e: Confirm the contents of the 1st Bucket Table
SELECT * FROM fly_bucket;

In [0]:
# Lab 03f: Load the plane data CSV and expose it as the 2nd temp view, 'plane_view'
pDF = (spark.read
  .option("header", True)
  .option("inferSchema", True)
  .csv("dbfs:/databricks-datasets/asa/planes/plane-data.csv"))

pDF.createOrReplaceTempView("plane_view")

In [0]:
%sql
-- Lab 03g: Create 2nd Bucket Table (so we can JOIN later on)
-- Same bucket column (tailnum) and bucket count (42) as fly_bucket
DROP TABLE IF EXISTS plane_bucket;

CREATE TABLE plane_bucket( tailnum string, manufacturer string) 
USING CSV
CLUSTERED BY(tailnum) INTO 42 BUCKETS;

In [0]:
%sql
-- Lab 03h: Populate 2nd Bucket Table from the 'plane_view' temp view
INSERT INTO plane_bucket SELECT tailnum, manufacturer FROM plane_view;

In [0]:
%sql
-- Lab 03i: Confirm the contents of the 2nd Bucket Table
SELECT * FROM plane_bucket;

In [0]:
%sql
-- Lab 03j: Without Buckets (joining the plain views) we have a Shuffle
SELECT f.tailnum, p.manufacturer FROM fly_view f JOIN plane_view p ON f.tailnum = p.tailnum

In [0]:
%sql
-- Lab 03k: With Buckets (both tables bucketed on tailnum), we don't have a Shuffle
SELECT f.tailnum, p.manufacturer FROM fly_bucket f JOIN plane_bucket p ON f.tailnum = p.tailnum

## Lab 04: Storage

### Data Skipping (SELECT)

In [0]:
# Lab 04a: Create 2 DataFrames over the same police data (one as CSV, one as Delta)
from pyspark.sql.types import StructType, StructField, StringType

# Every column is read as a nullable string; only the column names differ
policeSchema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        'IncidentNum', 'Category', 'Description', 'DayOfWeek', 'Date', 'Time',
        'PdDistrict', 'Resolution', 'Address', 'X', 'Y', 'Loc', 'PdId',
    )
])

CSVdf = spark.read.schema(policeSchema).csv("dbfs:/FileStore/tables/sfpd1/")
display(CSVdf)

# Write the CSV data out in Delta format, then read it back as a second DataFrame
(CSVdf.write
      .format("delta")
      .mode("overwrite")
      .save("/tmp/delta_cops"))
DeltaDF = spark.read.format("delta").load("/tmp/delta_cops")

In [0]:
# Lab 04b: Attempt Column Pruning on the CSV file format (1 of 2)
# Go to Spark UI, SQL tab. How many MB of data were read?
display(CSVdf.select("Category", "DayOfWeek"))

In [0]:
# Lab 04c: Column Pruning via the columnar Delta file format (2 of 2)
# Go to Spark UI, SQL tab. How many MB of data were read?
display(DeltaDF.select("Category", "DayOfWeek"))

### Data Skipping (WHERE)

In [0]:
# Lab 04d: If you wish to repeat the lab, run this first to avoid conflicts.
# Recursively removes the Delta directory written in Lab 04a.
dbutils.fs.rm("dbfs:/tmp/delta_cops",True)

In [0]:
# Lab 04e: How many files are in the source directory? (20 files)
display(dbutils.fs.ls("dbfs:/FileStore/tables/sfpd1/"))

In [0]:
# Lab 04f: Re-create the 2 DataFrames (one as CSV, one as Delta) after the cleanup in Lab 04d
from pyspark.sql.types import StructType, StructField, StringType

# Every column is read as a nullable string; only the column names differ
policeSchema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        'IncidentNum', 'Category', 'Description', 'DayOfWeek', 'Date', 'Time',
        'PdDistrict', 'Resolution', 'Address', 'X', 'Y', 'Loc', 'PdId',
    )
])

CSVdf = spark.read.schema(policeSchema).csv("dbfs:/FileStore/tables/sfpd1/")
display(CSVdf)

# Write the CSV data out in Delta format, then read it back as a second DataFrame
(CSVdf.write
      .format("delta")
      .mode("overwrite")
      .save("/tmp/delta_cops"))
DeltaDF = spark.read.format("delta").load("/tmp/delta_cops")

In [0]:
# Lab 04g: Attempt Data Skipping on the CSV file format (1 of 2)
# Go to Spark UI, SQL tab.  How many of the 20 files were read?
display(CSVdf.where("IncidentNum < 015046293"))

In [0]:
# Lab 04h: Note that Delta keeps metadata files under the table's /_delta_log directory.
#          It uses the statistics stored there to skip files that cannot match a query.
display(spark.read.json("dbfs:/tmp/delta_cops/_delta_log/00000000000000000000.json"))

In [0]:
# Lab 04i: Data Skipping on the Delta file format (2 of 2)
# Go to Spark UI, SQL tab.  How many of the 20 files were read?
display(DeltaDF.where("IncidentNum < 015046293"))

### Best File Size on Disk (via 'optimize')

In [0]:
# Lab 04j: Best File size on Disk (Total: 12GB in '/airlines' Directory)
#          Want 500MB files on Disk
# WARNING: This query takes 40 minutes. Do NOT Run

# df = (spark.read.csv("dbfs:/databricks-datasets/asa/airlines/")
#          .repartition(48)
#          .write.mode("overwrite")
#          .format("delta")
#          .save("/tmp/deltaPart/"))

In [0]:
# Lab 04k: Optimize: Read the 2008 flights CSV (header row present, column types inferred)
flights = (spark.read.format("csv") 
  .option("header", "true") 
  .option("inferSchema", "true") 
  .load("/databricks-datasets/asa/airlines/2008.csv"))

In [0]:
# Lab 04l: Optimize: Write the flights into Delta format (2 minutes)
# mode("overwrite") replaces any output from a previous run
(flights.write.format("delta")
              .mode("overwrite")
              .save("dbfs:/tmp/flights_delta/"))

In [0]:
# Lab 04m: Optimize: Here's the DELTA directory before OPTIMIZE (9 directories - Avg = 16MB)
# Use the same absolute 'dbfs:/' path the table was written to in Lab 04l,
# rather than a relative path.
display(dbutils.fs.ls("dbfs:/tmp/flights_delta"))

In [0]:
# Lab 04n: Optimize
# Before Compact: Find Top 10 cities with highest monthly flights on 1st day of the week

from pyspark.sql.functions import count

flights_delta = spark.read.format("delta").load("dbfs:/tmp/flights_delta/")

# Filter to DayOfWeek = 1, count flights per (Month, Origin), keep the 10 largest counts
display(
    flights_delta
    .filter("DayOfWeek = 1")
    .groupBy("Month", "Origin")
    .agg(count("*").alias("TotalFlights"))
    .orderBy("TotalFlights", ascending=False)
    .limit(10)
)

In [0]:
# Lab 04o: Optimize: Register the Delta directory written in Lab 04l as the table
# 'flights_delta', so the %sql cells below can refer to it by name
spark.sql("DROP TABLE IF EXISTS flights_delta")

spark.sql("""
CREATE TABLE flights_delta
USING DELTA 
LOCATION 'dbfs:/tmp/flights_delta'
""")

In [0]:
%sql
-- Lab 04p: Optimize: This will do a Compaction (bin-packing) only (2-1/2 min)
OPTIMIZE flights_delta

In [0]:
# Lab 04q: Optimize: Here's the DELTA directory after OPTIMIZE (10 Files - Got 1 giant File from Compaction)
# So I can now VACUUM to get rid of the old files I no longer need
# Use the same absolute 'dbfs:/' path the table was written to in Lab 04l,
# rather than a relative path.
display(dbutils.fs.ls("dbfs:/tmp/flights_delta"))

In [0]:
# Lab 04r: Optimize: Turn off the retention-duration safety check so that VACUUM
# may remove files less than a week old (needed for 'RETAIN 0 hours' in Lab 04s)
spark.sql("SET spark.databricks.delta.retentionDurationCheck.enabled=false")

In [0]:
%sql
-- Lab 04s: Optimize: Remove the older files and retain just the one File we created via OPTIMIZE
VACUUM flights_delta RETAIN 0 hours

In [0]:
# Lab 04t: Optimize: Here's the DELTA directory after VACUUM (10 Files down to 1 File)
# Use the same absolute 'dbfs:/' path the table was written to in Lab 04l,
# rather than a relative path.
display(dbutils.fs.ls("dbfs:/tmp/flights_delta"))

In [0]:
# Lab 04u: Optimize: After Compaction Optimization. Compare the clock time to Lab 04n
# (same query, run before compaction). Relies on 'flights_delta' and 'count' from Lab 04n.
display(flights_delta.filter("DayOfWeek = 1").groupBy("Month","Origin").agg(count("*").alias("TotalFlights")).orderBy("TotalFlights", ascending=False).limit(10))

###  'optimize' using Z-Order

In [0]:
%sql
-- Lab 04v: Now 'ZORDER'
OPTIMIZE flights_delta ZORDER BY (DayofWeek)

In [0]:
# Lab 04w: After ZORDER Optimization. Compare the clock time to Lab 04u
# (same query, run after compaction only). Relies on 'flights_delta' and 'count' from Lab 04n.
display(flights_delta.filter("DayOfWeek = 1").groupBy("Month","Origin").agg(count("*").alias("TotalFlights")).orderBy("TotalFlights", ascending=False).limit(10))

### Partitioned Tables and DataFrames

In [0]:
# Lab 04x: Read in a (non-partitioned) DataFrame and expose it as 'flight_view_nonPart'
flight_nonPart = (spark.read
  .option("header", True)
  .option("inferSchema", True)
  .csv("dbfs:/databricks-datasets/asa/small/small.csv"))

flight_nonPart.createOrReplaceTempView("flight_view_nonPart")

In [0]:
# Lab 04y: Create a Partitioned copy of the data: Delta format, partitioned by 'UniqueCarrier'
flight_nonPart.write.mode("overwrite").format("delta").partitionBy("UniqueCarrier").save("/tmp/flight_part/")

In [0]:
# Lab 04z: Confirm each 'UniqueCarrier' has its own directory on disk
display(dbutils.fs.ls("dbfs:/tmp/flight_part/"))

In [0]:
# Lab 04aa: Load the Partitioned DataFrame and expose it as the view 'flight_view_part'

flight_Part = (spark.read.format("delta").load("dbfs:/tmp/flight_part/"))
# The view must be built on the partitioned DataFrame loaded above; registering
# any other DataFrame here would make Lab 04cc query non-partitioned data.
flight_Part.createOrReplaceTempView("flight_view_part")

In [0]:
# Lab 04bb: Compare Clock times of Non-Part to Part (1 of 2)
# The non-partitioned view requires a full file scan
display(spark.sql("SELECT * FROM flight_view_nonPart WHERE UniqueCarrier = 'DL'"))

In [0]:
# Lab 04cc: Compare Clock times of Non-Part to Part (2 of 2)
# The filter is on the partition column, so only 1 partition should be scanned
display(spark.sql("SELECT * FROM flight_view_part WHERE UniqueCarrier = 'DL'"))

### Bloom Filters (WARNING: Long-running queries)

In [0]:
# Lab 04dd: Disable the IO cache so as to minimize side effects on the Bloom filter timings
spark.conf.set("spark.databricks.io.cache.enabled", False)

In [0]:
%sql
-- Lab 04ee: Enable the Bloom filter capability for data skipping
SET spark.databricks.io.skipping.bloomFilter.enabled = true;

In [0]:
%sql
-- Lab 04ff: Create the (empty) Delta test table at an explicit location
-- Columns are filled with hash values by Lab 04hh
CREATE OR REPLACE TABLE bloom_test (
  id   BIGINT NOT NULL,
  str1 STRING NOT NULL,
  sha  STRING NOT NULL,
  sha1 STRING NOT NULL,
  sha2_256 STRING NOT NULL,
  row_hash_too_big STRING NOT NULL,
  row_hash STRING NOT NULL
)
USING DELTA
LOCATION 'dbfs:/tmp/bloom_test'

In [0]:
%sql
-- Lab 04gg: Create the Bloom filter index on column 'sha' before adding Data
-- fpp = target false-positive probability, numItems = expected number of distinct items
CREATE BLOOMFILTER INDEX
ON TABLE bloom_test
FOR COLUMNS(sha OPTIONS (fpp=0.1, numItems=50000000))

In [0]:
%sql
-- Lab 04hh: Generate Data
-- Empties the table, then inserts rows derived from a generated id range.
-- Only 20000 of the generated rows are kept (see LIMIT at the end).
TRUNCATE TABLE bloom_test;

WITH sample (
  SELECT
    id,
    'windows.exe' as str1,
    monotonically_increasing_id() mono_id,
    hash(id) hash,
    sha (cast(id % 50000000 as string)) sha,
    sha1(cast(id % 50000000 as string)) sha1,
    sha2(cast(id as string), 256)    sha2_256
  from
    RANGE(0, 10000000, 1, 448)  -- start, end, step, numPartitions
)
INSERT INTO bloom_test 
SELECT id, 
  str1, 
  sha,
  sha1,
  sha2_256,
  -- Row hashes over all generated columns: one SHA-512, one SHA-256
  sha2(concat_ws('||',id, str1, mono_id, hash, sha, sha1, sha2_256),512) row_hash_too_big,
  sha2(concat_ws('||',id, str1, mono_id, hash, sha, sha1, sha2_256),256) row_hash
FROM sample
LIMIT 20000

In [0]:
%sql
-- Lab 04ii: If you add an Index when there is already Data in the table, you must
-- OPTIMIZE to build out the Bloom Filter for the existing files.
-- The default maxFileSize is 1073741824 (1 GB); it is raised to 1610612736 (1.5 GB) here.
SET spark.databricks.delta.optimize.maxFileSize = 1610612736;
OPTIMIZE bloom_test
ZORDER BY id

In [0]:
# Lab 04jj: List the table directory; look for the '_delta_index' directory
display(dbutils.fs.ls("dbfs:/tmp/bloom_test"))

In [0]:
# Lab 04kk: Examine the Bloom index files inside the '_delta_index' directory
display(dbutils.fs.ls("dbfs:/tmp/bloom_test/_delta_index/"))

In [0]:
%sql
-- Lab 04ll: Find some 'sha' values to use in the queries below
-- NOTE(review): Lab 04hh generates ids from RANGE(0, 10000000) and keeps only 20000 rows,
-- so ids 99999998 and 99999999 cannot be in the table - confirm the intended ids.
SELECT * FROM bloom_test WHERE id in ( 0, 1, 99999998, 99999999)

In [0]:
%sql
-- Lab 04mm: Query a Non-Bloom column (sha1 has no Bloom filter index)
-- Might be lucky and still get some Data Skipping since we are using Delta
SELECT sha1 FROM bloom_test WHERE sha1 = 'b6589fc6ab0dc82cf12099d1c2d40ab994e8410c'

In [0]:
%sql
-- Lab 04nn: Query a Bloom column ('sha' is the column indexed in Lab 04gg)
SELECT sha FROM bloom_test WHERE sha = '356a192b7913b04c54574d18c28d46e6395428ab'

In [0]:
%sql
-- Lab 04oo: Search the Bloom column for something that is not there
-- sha() produces hexadecimal digits only, so a value containing 'z' can never be
-- stored in the column; the count should be 0.
SELECT count(*) FROM bloom_test WHERE sha = '356a192b7913b04c54574d18c28d46e6395428zz'

## Lab 05: Serialization

In [0]:
%scala
// Lab 05a: Using lower level 'flatMap' which forces Serialization/Deserialization
// This cell only builds the sample DataFrame used by the following cells.
import org.apache.spark.sql._
import org.apache.spark.sql.types._
import spark.implicits._

// Three sample people: name, list of languages, state
val arrayStructureData = Seq(
    Row("James,Smith",List("Java","Scala","C++"),"CA"),
    Row("Michael,Rose,",List("Spark","Java","C++"),"NJ"),
    Row("Robert,Williams",List("CSharp","VB","R"),"NV")
)

// Schema matching the rows above; 'languages' is an array of strings
val arrayStructureSchema = StructType(Seq(
    StructField("name", StringType),
    StructField("languages", ArrayType(StringType)),
    StructField("state", StringType)
))

val df = spark.createDataFrame(
    spark.sparkContext.parallelize(arrayStructureData),
    arrayStructureSchema
)

In [0]:
%scala
// Lab 05b:  Here's the Data built in Lab 05a
df.show()

In [0]:
%scala
// Lab 05c: Using Lower level function flatMap().  Go to Spark UI > SQL tab and look for Deserialization
// For every row, emit one (name, language, state) tuple per entry of the 'languages' array (column 1)
val df2=df.flatMap(f=> f.getSeq[String](1).map((f.getString(0),_,f.getString(2))))
    .toDF("name","language","state")

df2.show(false)

In [0]:
%scala
// Lab 05d: Using the Higher-level function 'explode' (Java Optimized)
// Produces one row per entry of 'languages', like the flatMap in Lab 05c.
// Compare Clock times
import org.apache.spark.sql.functions.explode
df.select($"name", explode($"languages"), $"state").show()

# End of Module 10: Performance Tuning
# Ignore past here