# Mod 01: Spark Basics

## Lab 01:  Using 'dbutils' command to view Files

In [0]:
# Lab 01a: Before we begin, confirm that all lab files have been uploaded.
# The listing below should contain XX rows if everything loaded correctly.
# TODO: replace the 'XX' placeholder with the expected file count for this course.
display(dbutils.fs.ls("dbfs:/FileStore/tables/"))

In [0]:
# Lab 01b: List the entries under '/databricks-datasets/' and render the listing with 'display'
display(dbutils.fs.ls("/databricks-datasets/"))

In [0]:
# Lab 01c: To delete an individual file, uncomment and run the line below.
# A return value of 'True' means the command completed successfully
# (in other words, the file was deleted).
# Left commented out, presumably so that a "Run All" does not delete lab data.
# dbutils.fs.rm("dbfs:/FileStore/tables/flights_abbr_1.txt")

In [0]:
# Lab 01d: How to drop a folder and all its contents.
# Passing 'True' as the second argument requests a recursive delete.
# A return value of 'False' means the command failed (this typically occurs when the folder does not exist).
# NOTE(review): unlike the other cells, this path has no 'dbfs:/' prefix or leading '/' - confirm it resolves as intended.
# dbutils.fs.rm('FileStore/tables/', True)

In [0]:
# Lab 01e: View the beginning of a file.
# 'dbutils.fs.head' returns the leading content of the file as a plain string.
# 'display' is intended for DataFrames, plots and similar rich objects, so the
# text is shown with 'print', which keeps the file's line breaks intact.
print(dbutils.fs.head("dbfs:/FileStore/tables/LifeExp.csv"))

## Lab 02: Language Interpreters (via '%')
### The following cells demonstrate 5 different interpreters (Markdown, Python, Scala, SQL, SH)
### Note: if a cell specifies an interpreter, the '%' directive must be on the first line of the cell
### Here is an example of a **Markdown** cell (1 of 5)
### Double-click inside Cell to see the underlying code
### Click anywhere outside Cell to view as Markdown

In [0]:
%python
# Here's an example of a Python cell (2 of 5)

# Build an RDD from an in-memory Python list (the integers 1 through 15)
# using 'sc.parallelize', then collect every element back as the cell result.
data = list(range(1, 16))
baseRDD = sc.parallelize(data)
baseRDD.collect()

In [0]:
%scala
//  Here's an example of a Scala cell (3 of 5)
//  Read the text file into an RDD, then collect its contents back as the cell result.
val baseRDD = sc.textFile("/FileStore/tables/lincoln.txt")
baseRDD.collect()

In [0]:
%sql
-- Here's an example of a SQL cell (4 of 5)
-- Drop any existing 'diamonds' table, then define it over the sample CSV file
-- (the 'header' option marks the first row as column names).
DROP TABLE IF EXISTS diamonds;

CREATE TABLE diamonds
USING csv
OPTIONS (
  path "/databricks-datasets/Rdatasets/data-001/csv/ggplot2/diamonds.csv",
  header "true"
)

In [0]:
%sql
-- View the contents of the 'diamonds' table created in the previous cell
SELECT * FROM diamonds

In [0]:
%sh
# Here's the Shell interpreter (5 of 5)
# 'ls' lists the contents of the shell's current working directory
ls

## Lab 03: Spark SQL

In [0]:
%sql
-- Lab 03a: Drop the table if it already exists, then re-create it
-- on top of the uploaded CSV file (the first row supplies the column names)
DROP TABLE IF EXISTS flights_abbr;

CREATE TABLE flights_abbr
USING CSV
OPTIONS (
  path "/FileStore/tables/header_flights_abbr.csv",
  header "true"
)

In [0]:
%sql
-- View the contents of the 'flights_abbr' table created in Lab 03a

SELECT * FROM flights_abbr

In [0]:
%sql
-- Run an aggregate on the 'flights_abbr' table: the average of 'depdelay'
-- for each 'uniquecarrier', ordered from the largest average to the smallest
SELECT
  uniquecarrier,
  avg(depdelay) AS AVGdelay
FROM flights_abbr
GROUP BY uniquecarrier
ORDER BY AVGdelay DESC

## Lab 04: DataFrame API

In [0]:
# Load the flights CSV into a DataFrame, treating the first row as the header,
# then render the DataFrame as a table.
df = spark.read.option("header", True).csv("/FileStore/tables/header_flights_abbr.csv")
display(df)

## Lab 05: Resilient Distributed Datasets

In [0]:
%scala
// Lab 05b: Query: Do a WordCount using 3 separate statements with the following operations (flatMap, map, reduceByKey)

// Read the text file into an RDD
val rdd1 = sc.textFile("/FileStore/tables/mary.txt")
// flatMap: split every line on single spaces and flatten the pieces into one RDD of words
val rdd2 = rdd1.flatMap(line => line.split(" "))
// map: pair each word with an initial count of 1
val rdd3 = rdd2.map(word => (word, 1))
// reduceByKey: add up the counts of identical words
val rdd4 = rdd3.reduceByKey((x,y) => x+y)
// Collect the (word, count) pairs back as the cell result
rdd4.collect()

In [0]:
%scala
// Lab 05b Query: Pipeline the 3 operations (flatMap, map, reduceByKey) into a single statement

val rdd1 = sc.textFile("/FileStore/tables/mary.txt")
// Same word count as the previous cell, with the three transformations chained on one line
val rdd2 = rdd1.flatMap(line => line.split(" ")).map(word => (word, 1)).reduceByKey((x,y) => x+y)
rdd2.collect()

In [0]:
%python
# Lab 05c: "Distributed" refers to breaking a file into smaller partitions so they can be processed in parallel.
#          To see the number of partitions, use the 'getNumPartitions' method.  Note this method is only available on RDDs.
rdd1 = sc.textFile("/FileStore/tables/mary.txt")
rdd1.getNumPartitions()

In [0]:
%python
# Lab 05d:  To get the number of partitions for a DataFrame, use its 'rdd' attribute to obtain the
#           underlying RDD, then call 'getNumPartitions' on that RDD.
df = spark.read.csv("/FileStore/tables/header_flights_abbr.csv", header=True)
df.rdd.getNumPartitions()

# End of Module 01 - Basics