In [0]:
%sh
# Lab 00a: Before we begin, here are the files that are counting against our QUOTA

# Disk usage of each top-level directory under /, human-readable, skipping /dbfs
du --human-readable --max-depth=1 --exclude='/dbfs' /

In [0]:
 %sh
 # Lab 00c: Report disk usage on DBFS and on the local filesystem
    # Close stdin and discard everything written to stderr for the rest of this script
    exec <&- 2> /dev/null
    echo "=Look for big s3 files:"
    # Apparent sizes under /dbfs, two levels deep, skipping /dbfs/mnt and /dbfs/databricks-*
    du --human-readable --max-depth=2 --apparent-size --exclude='/dbfs/mnt' \
        --exclude='/dbfs/databricks-*' /dbfs
    echo
    echo "=Look for big local files:"
    # Top-level directories under /, skipping /dbfs
    du --human-readable --max-depth=1 --exclude='/dbfs' /

In [0]:
%sh
# Lab 00d: Cleanup commands (destructive, so left commented out; uncomment to run)

# rm -rf /dbfs/FileStore/*.png
# rm -rf /dbfs/tmp/*

In [0]:
%sh
# Lab 00e: More cleanup commands (destructive, so left commented out; uncomment to run)

# rm -rf /dbfs/tmp/*
# rm -rf /dbfs/local_disk0/tmp/*

# Mod 07: Spark Architecture and User Interface

In [0]:
# Lab 00: First, disable side effects
# Turns off the Databricks IO cache and Adaptive Query Execution for this session
# (presumably so the Job/Stage/Partition counts in later Labs are repeatable — confirm with Instructor)
spark.conf.set("spark.databricks.io.cache.enabled", False)
spark.conf.set("spark.sql.adaptive.enabled", False)

## Lab 01: How Driver decides on Memory Partitions

In [0]:
# Lab 01a: Here's the Data.  192 compressed files roughly 24.6 MB in size.  Total size on Disk = 4734 MB
# Lists the contents of the wiki dataset directory as a table
display(dbutils.fs.ls("dbfs:/databricks-datasets/wiki/"))

In [0]:
# Lab 01b: How many cores in Cluster?
# defaultParallelism is read here as the core count — presumably equal on this Cluster; verify
sc.defaultParallelism

In [0]:
# Lab 01b: Max # of bytes to pack into a Partition (Default = 128 MB)
# Reads the value currently in effect for this session
spark.conf.get("spark.sql.files.maxPartitionBytes")

In [0]:
# Lab 01b: Overhead cost when packing files into a Partition (Default = 4 MB)
# Reads the value currently in effect for this session
spark.conf.get("spark.sql.files.openCostInBytes")

In [0]:
# Lab 01c: Read the wiki files with an explicit Schema, then report the Partition count
# Notice no Job spawn. That's because the Driver is solely responsible for building Partitions
from pyspark.sql.types import LongType, TimestampType, StringType, StructType, StructField

# (column name, Spark type) pairs in file order; every column is nullable
DDL_Schema = StructType([
  StructField(columnName, columnType, True)
  for columnName, columnType in [
    ("id", LongType()),
    ("location", StringType()),
    ("ts", TimestampType()),
    ("comment", StringType()),
    ("author", StringType()),
  ]
])

# Tab-separated input; the Schema is supplied up front instead of being inferred
wikiCSVDF = (
  spark.read
    .schema(DDL_Schema)
    .option("sep", "\t")
    .csv("dbfs:/databricks-datasets/wiki/")
)
wikiCSVDF.rdd.getNumPartitions()

In [0]:
# Lab 01d: repartition(n) redistributes the data into exactly n Partitions
# (whether 16 is more or fewer than before depends on the count reported in Lab 01c)
# This is used to get Partition size between 200MB - 1GB range
wikiDF16 = wikiCSVDF.repartition(16)
wikiDF16.rdd.getNumPartitions()

In [0]:
# Lab 01e: coalesce() decreases # of Partitions (here from the 16 of wikiDF16 down to 4)
# This is used to get Partition size between 200MB - 1GB range
wikiDF04 = wikiDF16.coalesce(4)
wikiDF04.rdd.getNumPartitions()

## Lab 02:  Spark UI walkthrough

In [0]:
# Lab 02a: Load the EMP and DEPT Parquet data, expose each as a TempView, then preview both
empDF = spark.read.format("parquet").load("dbfs:/FileStore/tables/emp_snappy.parquet/")
empDF.createOrReplaceTempView("emp_view")

deptDF = spark.read.format("parquet").load("dbfs:/FileStore/tables/dept_snappy.parquet/")
deptDF.createOrReplaceTempView("dept_view")

display(empDF)
display(deptDF)

In [0]:
%sql
-- Lab 02b: A Hint asks the Catalyst Optimizer to use a Sort Merge Join for this query
-- Follow along with Instructor to see what Spark UI has to say about this query
SELECT /*+ SHUFFLE_MERGE(dept_view) */ *
FROM emp_view
JOIN dept_view
  ON emp_view.dept = dept_view.dept

## Lab 03: 'inferSchema' spawns 2 Jobs

In [0]:
# Lab 03a: Note 'inferSchema' causes 2 Jobs to run (1 to read first row of file to see if Column names defined, and 1 to infer Data Types)
#          So best practice is to always hard-code Schema to avoid this overhead (as done in Lab 03b/03c below)
df = spark.read.csv(
    "dbfs:/databricks-datasets/asa/airlines/2007.csv",
    header=True,
    inferSchema=True,
)
df.createOrReplaceTempView("air_view")

In [0]:
# Lab 03b: Here's Schema we'll use in next Cell
# DDL-formatted string: comma-separated "name type" pairs for the 29 columns of the airlines CSV.
# NOTE: this rebinds DDL_Schema, which Lab 01c had bound to a StructType for the wiki data.
DDL_Schema = ("Year integer,Month integer,DayofMonth integer,DayOfWeek integer,DepTime string,CRSDepTime integer,ArrTime string,CRSArrTime integer,UniqueCarrier string,FlightNum integer,TailNum string,ActualElapsedTime string,CRSElapsedTime integer,AirTime string,ArrDelay string,DepDelay integer,Origin string,Dest string,Distance integer,TaxiIn integer,TaxiOut integer,Cancelled integer,CancellationCode string,Diverted integer,CarrierDelay string,WeatherDelay string,NASDelay string,SecurityDelay string,LateAircraftDelay string")

In [0]:
# Lab 03c: No Job(s) spawned here since Schema defined
# Same file as Lab 03a, but the Schema comes from the DDL string instead of being inferred
df = spark.read.csv(
    "dbfs:/databricks-datasets/asa/airlines/2007.csv",
    schema=DDL_Schema,
    header=True,
)
df.createOrReplaceTempView("flights_view")

## Lab 04: Narrow versus Wide Tasks

In [0]:
# Lab 04a: 'select' and 'filter' are Narrow Task since they stay in same Stage
# Keeps two columns of df and only the rows with Distance > 2000
display(df.select("UniqueCarrier", "Distance").filter("Distance > 2000"))

In [0]:
# Lab 04b: Is an 'orderBy' a Wide or Narrow Task?
# Same select/filter shape as Lab 04a (Distance > 4600), with a sort on Distance added
display(df.select("UniqueCarrier", "Distance").filter("Distance > 4600").orderBy("Distance"))

In [0]:
# Lab 04c: Is 'union' a Wide Task?
# Unions df with itself and shows the first 10 rows
display(df.union(df).limit(10))

## Lab 05: Wide Tasks spawn new Stage (along with mandatory Shuffle/Exchange)

In [0]:
# Lab 05a: Let's create 2 new objects so we can Join on them in below Cells
# (same Parquet paths and view names as Lab 02a, so this Lab can be run on its own)
empDF = spark.read.parquet("dbfs:/FileStore/tables/emp_snappy.parquet/")
deptDF = spark.read.parquet("dbfs:/FileStore/tables/dept_snappy.parquet/")

# Register both DataFrames so the %sql Cells below can query them by name
empDF.createOrReplaceTempView("emp_view")
deptDF.createOrReplaceTempView("dept_view")

display(empDF)
display(deptDF)

In [0]:
%sql
-- Lab 05b: A Hint asks the Catalyst Optimizer to use a Sort Merge Join for this query
SELECT /*+ SHUFFLE_MERGE(dept_view) */ *
FROM emp_view
JOIN dept_view
  ON emp_view.dept = dept_view.dept

In [0]:
%sql
-- Lab 05c: Aggregations are Wide Tasks too
-- Total salary per dept from emp_view
SELECT dept, sum(salary) FROM emp_view
GROUP BY dept;



In [0]:
%sql
-- Lab 05d: Distinct is Wide Task too
-- One row per distinct dept value in emp_view
SELECT distinct dept FROM emp_view;

## Lab 06: In Review
## Uncomment the statement in each Cell before running it

In [0]:
%sql
-- Lab 06a: Shuffle?
-- (statement is commented out; uncomment it to run)
-- SELECT * FROM emp_view INTERSECT SELECT * FROM emp_view

In [0]:
%sql
-- Lab 06b: Shuffle?
-- (statement is commented out; uncomment it to run — note it creates table emp_table)
-- CREATE TABLE emp_table AS SELECT * FROM emp_view

In [0]:
# Lab 06c: Shuffle?
# (statement is commented out; uncomment it to run — flyDF would hold the row count returned by count())
#flyDF = spark.read.format("csv").load("dbfs:/FileStore/tables/header_flights_abbr.csv", inferSchema="true", header="true").count()

In [0]:
%sql
-- Lab 06d: Shuffle?
-- (statement is commented out; uncomment it to run)
-- SELECT * FROM emp_view WHERE NOT dept IS NOT NULL OR emp > 1000 LIMIT 20

In [0]:
# Lab 06e: Shuffle?
# (statement is commented out; uncomment it to run — wikiDF04 comes from Lab 01e)
# display(wikiDF04.repartition(24))


# End of Mod 07 - Architecture - Spark UI
## Ignore past here

In [0]:
%scala
// Lab 03b: Notice how easy it is to read DAG (from top to bottom) with Scala
// (all code in this Cell is commented out; uncomment to run)
// Create EMP and DEPT rdd
// Since these RDDs are not dependent on each other, they may run in parallel
//val emp = sc.parallelize(Seq((1, "mark", 10), (2, "juli", 20), (3, "carol", 30), (4, "jarrod", 35), (5, "karen", 30)), 80)
//val dept = sc.parallelize(Seq(("hadoop", 10), ("spark", 20), ("hive", 30), ("sqoop", 40) ), 40)

// Establish 3rd field as Key/Join column for EMP rdd, 2nd column for DEPT rdd
//val emp1 = emp.keyBy(col => col._3)
//val dept1 = dept.keyBy(col => col._2)

// Inner Join
//val join1 = emp1.join(dept1)
//join1.toDebugString

In [0]:
%scala
// Lab 03c: Notice how easy it is to read DAG (from top to bottom) with Scala
// (all code in this Cell is commented out; uncomment to run)
// Create EMP and DEPT rdd
// Since these RDDs are not dependent on each other, they may run in parallel
//val emp = sc.parallelize(Seq((1, "mark", 10), (2, "juli", 20), (3, "carol", 30), (4, "jarrod", 35), (5, "karen", 30)), 80)
//val dept = sc.parallelize(Seq(("hadoop", 10), ("spark", 20), ("hive", 30), ("sqoop", 40) ), 40)

// Establish 3rd field as Key/Join column for EMP rdd, 2nd column for DEPT rdd
//val emp1 = emp.keyBy(col => col._3)
//val dept1 = dept.keyBy(col => col._2)

// Inner Join
//val join1 = emp1.join(dept1)
//join1.collect()

In [0]:
%py
# Lab 03d: Notice the difference in the DAG with Python
# (all code in this Cell is commented out; uncomment to run)
# Create EMP and DEPT rdd
# Since these RDDs are not dependent on each other, they may run in parallel
#emp = sc.parallelize([ (1,"mark",10), (2,"juli",20), (3,"matt",30), (4,"jay",35), (5,"sue",30) ])
#dept = sc.parallelize([ ("hadoop", 10), ("spark", 20), ("hive", 30), ("sqoop",40) ])

# Establish 3rd field as Key/Join column for EMP rdd, 2nd column for DEPT rdd
#emp1 = emp.keyBy(lambda j: j[2] )
#dept1 = dept.keyBy(lambda j: j[1] )

# Inner Join
#join1 = emp1.join(dept1)
#join1.collect()

### Lab 04: Storage tab

In [0]:
%py
# Lab 04a: Cache a DataFrame
# (all code in this Cell is commented out; uncomment to run)

#df1 = spark.read.format("json").load("dbfs:/FileStore/tables/names1.json")

#df1.cache()
#df1.show()

In [0]:
%sql
-- Lab 04b: Cache a Table
-- The statements below are disabled with SQL "--" comments ("#" is not a SQL comment
-- marker, so the Cell would fail to parse). Remove the leading "-- " to run them.

-- DROP TABLE IF EXISTS dept;

-- CREATE TABLE dept (dept_num INT, dept_name STRING, budget INT, mgr INT)
-- USING csv
-- OPTIONS (path "dbfs:/FileStore/tables/dept.csv");

-- CACHE TABLE dept;
-- SELECT * FROM dept;

In [0]:
%scala
// Lab 04c: Cache an RDD
// (all code in this Cell is commented out; uncomment to run)

//val emp = sc.parallelize(Seq((1, "mark", 10), (2, "juli", 20), (3, "carol", 30), (4, "jarrod", 35), (5, "karen", 30)), 80)
//val dept = sc.parallelize(Seq(("hadoop", 10), ("spark", 20), ("hive", 30), ("sqoop", 40) ), 40)

// Establish 3rd field as Key/Join column for EMP rdd, 2nd column for DEPT rdd
//val emp1 = emp.keyBy(col => col._3)
//val dept1 = dept.keyBy(col => col._2)

// Assign RDD name and then Cache
//emp1.setName("empRDD").cache()
//dept1.setName("deptRDD").cache()

// Inner Join
//val join1 = emp1.join(dept1)
//join1.collect()

In [0]:
%scala
// Lab 04d:  Run JOIN again.  Since it is now cached, can skip some Stages
// (commented out; needs join1 from the Lab 04c Cell above)
//join1.collect()

### Lab 05: Environment tab

In [0]:
%scala
// Lab 05:  Run JOIN again. 'View' for 'Environment' tab
// (commented out; needs join1 from the Lab 04c Cell)
//join1.collect()

### Lab 06: Executors (and Drivers) tab

In [0]:
%py
# Show the default parallelism (commented out; uncomment to run)
#sc.defaultParallelism

In [0]:
%scala
// Lab 06:  Run JOIN again. 'View' for 'Executor' tab
// (commented out; needs join1 from the Lab 04c Cell)
//join1.collect()

### Lab 07: SQL tab

In [0]:
%scala
// Lab 07a: View 'sql' tab. Then click 'Details' hotlink
// (all code in this Cell is commented out; uncomment to run)
//val df = Seq((1, "andy"), (2, "bob"), (2, "andy")).toDF("count", "name")


//df.count
//df.createOrReplaceTempView("temp_view1")

//spark.sql("select name,sum(count) from temp_view1 group by name").show()

In [0]:
%py
# Mod 07b: First load 2 Parquet files and JOIN the DataFrames
# (all code in this Cell is commented out; uncomment to run)

#empDF = spark.read.format("parquet").load("dbfs:/FileStore/tables/emp_snappy.parquet/")
#deptDF = spark.read.format("parquet").load("dbfs:/FileStore/tables/dept_snappy.parquet/")
#joinDF = empDF.join(deptDF, "dept")

In [0]:
%py
# Mod 07c: Run query and then view 'sql' tab.  Then click 'Details' hotlink
# (commented out; needs joinDF from the Mod 07b Cell above)

#display(joinDF)

### Lab 08: JDBC/ODBC Server tab

In [0]:
%scala
// Spark SQL can also act as a distributed query engine using its JDBC/ODBC or command-line interface.
// In this mode, end-users or applications can interact with Spark SQL directly to run SQL queries,
// without the need to write any code.
// For example, one could query a Hive table using the Spark engine.
// This functionality is not configured for Databricks Community

### Lab 09: Structured Streaming tab

In [0]:
%python

# Lab 09a: Similar to definition of staticInputDF above, just using `readStream` instead of `read`
# Since the sample data is just a static set of files, you can emulate a stream from them by reading one #file at a time, in the chronological order in which they were created

from pyspark.sql.types import *
from pyspark.sql.functions import *

inputPath = "/databricks-datasets/structured-streaming/events/"

jsonSchema = StructType([ StructField("time", TimestampType(), True), StructField("action", StringType(), True) ])

# Static DataFrame representing data in the JSON files
streamingInputDF = (
  spark
    .readStream
    .schema(jsonSchema)               # Set the schema of the JSON data
    .option("maxFilesPerTrigger", 1)  # Treat a sequence of files as a stream by picking one file at a time
    .json(inputPath))

In [0]:
%python

# Lab 09b: Now we can compute the number of "open" and "close" actions with one minute windows. 
#To do this, we will group by the action column and 1 minute #windows #over the time column. Same query as staticInputDF earlier

streamingCountsDF = (
  streamingInputDF
    .groupBy(
      streamingInputDF.action,
      window(streamingInputDF.time, "1 minute"))
    .count()
)

# Is this DF actually a streaming DF?
streamingCountsDF.isStreaming

In [0]:
%py 

# Lab 9c: Once started, click on 'Spark Jobs' and then click on 'View' hotlink

query = (
  streamingCountsDF
    .writeStream
    .format("memory")        # memory = store in-memory table (for testing only)
    .queryName("counts")     # counts = name of the in-memory table
    .outputMode("complete")  # complete = all the counts should be in the table
    .start())

In [0]:
%py
# Lab 09d: Stop Streaming
# Stops the streaming query started in the previous Cell

query.stop()

# End of Module 07: Architecture and Spark UI