## Mod 02: Spark SQL (Read/Write DataFrames/Tables)

### Lab 01: What is a DataFrame? Object with a Schema

In [0]:
# Interactive exercise: in a scratch cell type `spark.read.` and press TAB after the
# last '.' to see which file types are supported. The bare `spark.read.` expression
# is not left in this cell because a trailing '.' is a Python syntax error when run.
# Runnable equivalent: list the public attributes of the DataFrameReader.
print([attr for attr in dir(spark.read) if not attr.startswith("_")])

In [0]:
# Lab 01a: Build a DataFrame from a CSV file with 'spark.read.load'.
# header=True takes the column names from the first row; inferSchema=True asks
# Spark to infer the column types.
df1 = spark.read.load(
    "dbfs:/FileStore/tables/LifeExp_headers.csv",
    format="csv",
    header=True,
    inferSchema=True,
)
display(df1)

In [0]:
# Lab 01b: Alternative code - the same CSV read written as a chain of reader calls
df1 = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .format("csv")
    .load("dbfs:/FileStore/tables/LifeExp_headers.csv")
)
display(df1)

In [0]:
# Lab 01c: With the header option disabled, columns get the implicit default names '_cX'
#          (X is an integer starting at 0 and incrementing by 1)
df2 = spark.read.load(
    "dbfs:/FileStore/tables/LifeExp.csv",
    format="csv",
    header="False",
    inferSchema="True",
)
display(df2)

### Lab 02: Create Dataframe from JSON file

In [0]:
# Lab 02a: Load a JSON file through 'spark.read', then display the resulting DataFrame

df1 = (
    spark.read
    .format("json")
    .option("inferSchema", True)
    .load("dbfs:/FileStore/tables/names1.json")
)
display(df1)

In [0]:
# Lab 02b: Print the column names and data types of df1 as a tree

df1.printSchema()

In [0]:
# Lab 02c: Query the DataFrame with 'select' and render each result with 'display'

# Two equivalent ways to project a single column: by name string or by attribute
for name_projection in (df1.select("name"), df1.select(df1.name)):
    display(name_projection)

# Project two columns using bracket-style column references
display(df1.select(df1["name"], df1["age"]))

# Indexing the DataFrame with a boolean column keeps only the rows where age < 50
under_fifty = df1[df1.age < 50]
display(under_fifty)

### Lab 03: Read Parquet files

In [0]:
# Lab 03: Load two Parquet datasets and display them.
# Parquet carries its schema (column names and data types) in the file metadata,
# so no header/inferSchema options are passed here.

empDF = (
    spark.read
    .format("parquet")
    .load("dbfs:/FileStore/tables/emp_snappy.parquet/")
)
deptDF = (
    spark.read
    .format("parquet")
    .load("dbfs:/FileStore/tables/dept_snappy.parquet/")
)

display(empDF)
display(deptDF)

### Lab 04a: spark.read.csv with StructType Library and format and schema arguments

In [0]:
## Lab 04a: Manually define the schema for a CSV file
## These types are needed to build the schema object
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType

## One StructField per CSV column: (name, data type, nullable)
lifeSchema = StructType(
    [
        StructField('Country', StringType(), True),
        StructField('LifeExp', FloatType(), True),
        StructField('Region', StringType(), True),
    ]
)

## Read the header-less CSV, applying the explicit schema instead of inferring one
df1 = spark.read.load(
    "dbfs:/FileStore/tables/LifeExp.csv",
    format="csv",
    schema=lifeSchema,
)

display(df1)
df1.printSchema()

### Lab 04b: Even easier than StructType: just write the schema as a DDL string, as below

In [0]:
# With Spark version 3 there is no need to build a 'StructType' by hand:
# the schema can be passed as a DDL-formatted string of "name type" pairs
DDLSchema = "Country string, LifeExp float, Region string"

df1 = (
    spark.read
    .format("csv")
    .load("dbfs:/FileStore/tables/LifeExp.csv", schema=DDLSchema)
)

display(df1)
df1.printSchema()

### Lab 05: Using PostgreSQL database

In [0]:
%scala
// JDBC driver, credentials and target table used to reach the PostgreSQL database
import org.apache.spark.sql.functions._

// Load the PostgreSQL JDBC driver class
Class.forName("org.postgresql.Driver")

// NOTE(review): the credentials below are hard-coded literals. Presumably this is a
// public read-only training account - TODO confirm. For any other database, read
// them from a secret store instead of the notebook source.
val connectionProperties = new java.util.Properties()
connectionProperties.setProperty("user", "readonly")
connectionProperties.setProperty("password", "readonly")

val jdbcUrl = "jdbc:postgresql://server1.databricks.training:5432/training"
val tableName = "training.people_1m"

In [0]:
%scala
// Read the PostgreSQL table over JDBC and keep only the rows above an id threshold
sc.setJobDescription("Step C: With Predicate")

// DataFrame backed by the JDBC table configured in the previous cell
val peopleTable = spark.read.jdbc(jdbcUrl, tableName, connectionProperties)
val df = peopleTable.filter($"id" > 343517)

display(df)

### Lab 06: Write DataFrame to Files

In [0]:
# Lab 06a: Re-read the employee Parquet data and write it out in Delta format.
# mode("append") adds to whatever is already stored at the target path.
empDF = spark.read.format("parquet").load("dbfs:/FileStore/tables/emp_snappy.parquet/")
(
    empDF.write
    .format("delta")
    .mode("append")
    .save("/tmp/delta/")
)

In [0]:
# View the files written by Lab 06a. Use the same absolute path that was passed to
# save() so the listing does not depend on how a relative path gets resolved.
display(dbutils.fs.ls("/tmp/delta/"))


In [0]:
# Lab 06b: Save as an SQL Table
# Can view table via the 'Data' menu
#
# The table is dropped first so this cell can be re-run without manual cleanup;
# without the drop, repeated runs keep appending to (or collide with) the
# existing 'emp_table'.
spark.sql("DROP TABLE IF EXISTS emp_table")

empDF.write.format("delta").mode("append").saveAsTable("emp_table")

### Lab 07: Write as JSON, ORC and Parquet

In [0]:
# Lab 07: Write the same three empDF columns in several file formats,
# replacing any previous output at each target directory
emp_subset = empDF.select("emp", "mgr", "dept")

for file_format, target_dir in (
    ("json", "tmp/JSON/"),
    ("orc", "tmp/orc/"),
    ("parquet", "tmp/parquet/"),
):
    emp_subset.write.format(file_format).mode("overwrite").save(target_dir)

In [0]:
# Which one has least disk space? Compare the file sizes in the three listings below.
# (The question is a comment: as bare text it is a Python syntax error.)
display(dbutils.fs.ls("tmp/JSON/"))
display(dbutils.fs.ls("tmp/orc/"))
display(dbutils.fs.ls("tmp/parquet/"))

### Lab 08: SHOW TABLES displays SQL Tables and TempViews

In [0]:
%sql
-- Lists the tables and views visible in the current database
-- (e.g. 'emp_table', if it was saved in Lab 06b).
-- 'isTemporary' = true marks any TempViews created via 'createOrReplaceTempView'
SHOW TABLES;

In [0]:
# Alternative code: any SQL statement can be run from Python by passing it to 'spark.sql',
# which returns the result as a DataFrame
tables_df = spark.sql("show tables")
display(tables_df)

### Lab 09: Create Table: USING and OPTIONS

In [0]:
%sql
-- Lab 09a: DROP, then CREATE TABLE
-- Because 'path' points at existing files, the table is populated as soon as it is created
DROP TABLE IF EXISTS mpg;

CREATE TABLE mpg
  USING csv
  OPTIONS (
    path "/databricks-datasets/Rdatasets/data-001/csv/ggplot2/mpg.csv",
    header "true"
  );

In [0]:
%sql
-- Lab 09b: Return every row and column of the 'mpg' table

SELECT *
FROM mpg;

### Lab 10: Follow along with Instructor as View Table details via 'Data' tab in UI

### Lab 11: Create Table without Header info by creating Column names/data types manually

In [0]:
%sql
-- Lab 11a: Create a table with explicitly declared column names and data types
--          (uses the 'LOCATION' keyword instead of the 'PATH' option)
DROP TABLE IF EXISTS dept;

CREATE TABLE dept (
  dept_num  INT,
  dept_name STRING,
  budget    INT,
  mgr       INT
)
USING csv
LOCATION "dbfs:/FileStore/tables/dept.csv"

In [0]:
%sql
-- Lab 11b: Return every row and column of the 'dept' table

SELECT *
FROM dept;

In [0]:
# Lab 11c: 'spark.table' returns the named table ('dept') as a DataFrame
df = spark.table("dept")

display(df)

### Lab 12: Create Table: PARTITIONED BY

In [0]:
# Uncomment this and run if wish to run next Cell again
# dbutils.fs.rm('/user/hive/warehouse/cust_part', True)

In [0]:
%sql
-- Lab 12a: Create a Delta table partitioned by 'state' and then 'city'
DROP TABLE IF EXISTS cust_part;

CREATE TABLE cust_part (
  id   INT,
  name STRING
)
USING DELTA
PARTITIONED BY (state STRING, city STRING);

In [0]:
%sql
-- Lab 12b: INSERT one row into each of three (state, city) partitions
INSERT INTO cust_part PARTITION (state = 'CA', city = 'Fremont')
  VALUES (100, 'Al');

INSERT INTO cust_part PARTITION (state = 'CA', city = 'San Jose')
  VALUES (200, 'Bo');

INSERT INTO cust_part PARTITION (state = 'AZ', city = 'Peoria')
  VALUES (300, 'Cy');

In [0]:
%sql
-- Lab 12c: Return every row of the partitioned table
SELECT *
FROM cust_part;

In [0]:
%sql
-- Lab 12d: Show the columns plus the extended table metadata for 'cust_part'
DESCRIBE EXTENDED cust_part;

In [0]:
%sql
-- Lab 12e: Confirm the query reads only one state partition (CA).
--          Look for 'PartitionFilters' in the plan output.

EXPLAIN FORMATTED
SELECT *
FROM cust_part
WHERE state = 'CA';

In [0]:
# Lab 12f: List the table's root directory and confirm there are 2 state
#          directories under dbfs:/user/hive/warehouse/cust_part

table_root_listing = dbutils.fs.ls("dbfs:/user/hive/warehouse/cust_part/")
display(table_root_listing)

In [0]:
# Lab 12g: List the state=CA partition directory and notice the 2 city partitions under it

ca_partition_listing = dbutils.fs.ls("dbfs:/user/hive/warehouse/cust_part/state=CA")
display(ca_partition_listing)

In [0]:
# Lab 12h: List the 'city=Fremont' partition directory to view the actual file(s)

fremont_listing = dbutils.fs.ls("dbfs:/user/hive/warehouse/cust_part/state=CA/city=Fremont/")
display(fremont_listing)

### Lab 13: CREATE BUCKET TABLE: Clustered By Sorted By INTO Num_Buckets

In [0]:
# Uncomment this and run if wish to run next Cell again
# dbutils.fs.rm('/user/hive/warehouse/bucket_table', True)

In [0]:
%sql
-- Lab 13a: Create a table with 25 buckets (files); each 'state' value is hashed to one of these buckets
--          Note it is possible to get skewed files this way

DROP TABLE IF EXISTS bucket_table;

CREATE TABLE bucket_table (
  state      STRING,
  population INT,
  yr         INT
)
USING CSV
COMMENT 'A bucketed sorted user table'
CLUSTERED BY (state)
SORTED BY (state)
INTO 25 BUCKETS;

In [0]:
%sql
-- Lab 13b: INSERT
-- Replaces the contents of 'bucket_table' (INSERT OVERWRITE) with the literal rows below.
-- Each row is positional: (state, population, yr), matching the table's column order.
-- There are 51 rows for yr 2017 followed by 51 rows for yr 2018.

INSERT OVERWRITE bucket_table VALUES
("Alabama",4875120,2017),
("Alaska",739786,2017),
("Arizona",7048876,2017),
("Arkansas",3002997,2017),
("California",39399349,2017),
("Colorado",5615902,2017),
("Connecticut",3573880,2017),
("Delaware",957078,2017),
("DistrictofColumbia",695691,2017),
("Florida",20976812,2017),
("Georgia",10413055,2017),
("Hawaii",1424203,2017),
("Idaho",1718904,2017),
("Illinois",12786196,2017),
("Indiana",6660082,2017),
("Iowa",3143637,2017),
("Kansas",2910689,2017),
("Kentucky",4453874,2017),
("Louisiana",4670818,2017),
("Maine",1335063,2017),
("Maryland",6024891,2017),
("Massachusetts",6863246,2017),
("Michigan",9976447,2017),
("Minnesota",5568155,2017),
("Mississippi",2989663,2017),
("Missouri",6108612,2017),
("Montana",1053090,2017),
("Nebraska",1917575,2017),
("Nevada",2972405,2017),
("New Hampshire",1349767,2017),
("New Jersey",8888543,2017),
("New Mexico",2093395,2017),
("New York",19590719,2017),
("North Carolina",10270800,2017),
("North Dakota",755176,2017),
("Ohio",11664129,2017),
("Oklahoma",3932640,2017),
("Oregon",4146592,2017),
("Pennsylvania",12790447,2017),
("Rhode Island",1056486,2017),
("South Carolina",5021219,2017),
("South Dakota",873286,2017),
("Tennessee",6708794,2017),
("Texas",28322717,2017),
("Utah",3103118,2017),
("Vermont",624525,2017),
("Virginia",8465207,2017),
("Washington",7425432,2017),
("West Virginia",1817048,2017),
("Wisconsin",5792051,2017),
("Wyoming",578934,2017),
("Alabama",4887871,2018),
("Alaska",737438,2018),
("Arizona",7171646,2018),
("Arkansas",3013825,2018),
("California",39557045,2018),
("Colorado",5695564,2018),
("Connecticut",3572665,2018),
("Delaware",967171,2018),
("DistrictofColumbia",702455,2018),
("Florida",21299325,2018),
("Georgia",10519475,2018),
("Hawaii",1420491,2018),
("Idaho",1754208,2018),
("Illinois",12741080,2018),
("Indiana",6691878,2018),
("Iowa",3156145,2018),
("Kansas",2911505,2018),
("Kentucky",4468402,2018),
("Louisiana",4659978,2018),
("Maine",1338404,2018),
("Maryland",6042718,2018),
("Massachusetts",6902149,2018),
("Michigan",9995915,2018),
("Minnesota",5611179,2018),
("Mississippi",2986530,2018),
("Missouri",6126452,2018),
("Montana",1062305,2018),
("Nebraska",1929268,2018),
("Nevada",3034392,2018),
("New Hampshire",1356458,2018),
("New Jersey",8908520,2018),
("New Mexico",2095428,2018),
("New York",19542209,2018),
("North Carolina",10383620,2018),
("North Dakota",760077,2018),
("Ohio",11689442,2018),
("Oklahoma",3943079,2018),
("Oregon",4190713,2018),
("Pennsylvania",12807060,2018),
("Rhode Island",1057315,2018),
("South Carolina",5084127,2018),
("South Dakota",882235,2018),
("Tennessee",6770010,2018),
("Texas",28701845,2018),
("Utah",3161105,2018),
("Vermont",626299,2018),
("Virginia",8517685,2018),
("Washington",7535591,2018),
("West Virginia",1805832,2018),
("Wisconsin",5813568,2018),
("Wyoming",577737,2018)

In [0]:
%sql
-- Lab 13c: Confirm the query reads only 1 bucket.
--          Look for the number of buckets selected in the plan output.

EXPLAIN FORMATTED
SELECT *
FROM bucket_table
WHERE state = 'Ohio';

In [0]:
%sql
-- Lab 13d: Display the CREATE TABLE statement (DDL) for 'bucket_table'
SHOW CREATE TABLE bucket_table

### Lab 14: CREATE TABLE: TBLPROPERTIES 
### Creates key-value pairs that can be viewed via 'SHOW TBLPROPERTIES' query

In [0]:
%sql
-- Lab 14a: Create a table that carries two custom key-value table properties

DROP TABLE IF EXISTS customer1;

CREATE TABLE customer1 (
  cust_code INT,
  name      VARCHAR(100),
  cust_addr STRING
)
TBLPROPERTIES (
  'created.by.user' = 'Mark',
  'created.date'    = '01-01-2021'
);

In [0]:
%sql
-- Lab 14b: The table properties set via TBLPROPERTIES show up in this command

SHOW TBLPROPERTIES customer1;

In [0]:
%sql
-- Lab 14c: The table properties show up in the extended description too
DESCRIBE EXTENDED customer1

### Lab 15: Creating TempViews via 'CreateOrReplaceTempView'

In [0]:
# Lab 15a: Register a DataFrame as a TempView so it can be queried by name from SQL

deptDF = (
    spark.read
    .format("parquet")
    .load("dbfs:/FileStore/tables/dept_snappy.parquet/")
)
deptDF.createOrReplaceTempView("dept_view")

In [0]:
%sql
-- Lab 15b: List tables. Note the result has both permanent and temporary objects

SHOW TABLES;

In [0]:
%sql
-- Lab 15c: A TempView is queried just like a table

SELECT *
FROM dept_view;

In [0]:
%sql
-- Lab 15d: Saving a TempView as a Perm Table
-- Drop first so the cell can be re-run: CREATE TABLE fails if the table already exists
DROP TABLE IF EXISTS perm_dept_table2;

CREATE TABLE perm_dept_table2 AS SELECT * FROM dept_view;

SHOW TABLES;

### Lab 15e: Delete Cluster, Add Cluster back and repeat Cmd 60.
### Confirm 'dept_view' is no longer there

### Lab 16: Global Temporary Views

In [0]:
# Uncomment below and run if wish to run next Cell again
# spark.catalog.dropGlobalTempView("people")

In [0]:
# Lab 16a:  Query a Global Temp View from 2 sessions: they both work
# createOrReplaceGlobalTempView (rather than createGlobalTempView) lets this cell be
# re-run without first dropping the view. To drop it explicitly, use:
# spark.catalog.dropGlobalTempView("people")

df = spark.read.json("dbfs:/FileStore/tables/names1.json")
df.createOrReplaceGlobalTempView("people")


# Global temporary view is tied to a system preserved database `global_temp`
spark.sql("SELECT * FROM global_temp.people").show()

# Global temporary view is also visible from another Session
spark.newSession().sql("SELECT * FROM global_temp.people").show()

In [0]:
# Lab 16b:  Query a Temp View from 2 sessions: the 2nd session fails on purpose because
#           a (non-global) Temp View is only visible in the session that created it.
#           The Global Temp View in Lab 16a worked across sessions.
# Imported here so the cell is self-contained; the expected failure is caught and
# printed so that "Run All" can continue past this demonstration.
from pyspark.sql.utils import AnalysisException

df1 = spark.read.json("dbfs:/FileStore/tables/names1.json")
df1.createOrReplaceTempView("people_temp_view")

# Query the TempView from the session that created it: works
spark.sql("SELECT * from people_temp_view").show()


# Temporary views ARE NOT cross-session. The query below fails intentionally.
try:
    spark.newSession().sql("SELECT * FROM people_temp_view").show()
except AnalysisException as err:
    print(f"Expected failure - temp view is not visible in a new session:\n{err}")

# End of Module 02 - SparkSQL (Read/Write DataFrames/Tables)
## Ignore past here