In [0]:
# Lab 00: If you get a QUOTA EXCEEDED error, run the command below to remove ALL Hive tables.
# If you need to remove the files, you can get rid of them by uncommenting the line below.
# dbutils.fs.rm("dbfs:/user/hive/warehouse/", True)

# Mod 03a: SparkSQL (Transformations 01)

## Columns and Expressions

In [0]:
# Load the employee Parquet data, preview it, then show its schema.
empDF = spark.read.parquet("dbfs:/FileStore/tables/emp_snappy.parquet/")
display(empDF)
empDF.schema

In [0]:
# Load the department Parquet data, preview it, then show its schema.
deptDF = spark.read.parquet("dbfs:/FileStore/tables/dept_snappy.parquet/")
display(deptDF)
deptDF.schema

### Using select, filter, withColumn (to add new Column) and 'col' for cast

In [0]:
# Import only the name this cell needs. A wildcard import of
# pyspark.sql.functions shadows Python builtins such as sum, min, max and round.
from pyspark.sql.functions import col

# 'col' builds a Column expression, which can be multiplied and then cast.
empDF2 = (empDF.select("emp", "mgr", "dept", "salary")
               .filter("dept > 100")
               .withColumn("NewSalary", (col("salary") * 1.10).cast("float"))
               .sort(col("mgr").desc()))

display(empDF2)

In [0]:
# Read a DataFrame that has complex data types, so nested columns can be
# plucked out in the cells that follow.
complexDF = spark.read.format("parquet").load("/FileStore/tables/emp1.parquet/")

# Register the view first (no output), then print the schema and preview the rows.
complexDF.createOrReplaceTempView("temp_complex")
complexDF.printSchema()
display(complexDF)

### Pluck out Nested Columns in 'address' Struct via 'col'

In [0]:
from pyspark.sql.functions import col

# Dot notation inside col() reaches into the 'address' struct; alias() gives
# each extracted field a flat column name.
locDF = complexDF.select(
    "name",
    col("address.city").alias("city"),
    col("address.state").alias("state"),
)

display(locDF)

### Using 'selectExpr' to alias a Column

In [0]:
# The third SQL expression yields a Boolean 'true' or 'false' per row,
# aliased to HR_depts.
display(
    empDF.selectExpr(
        "last_name",
        "dept",
        "dept in (401, 402) as HR_depts",
    )
)

### 'drop' removes Column from Output

In [0]:
# Show empDF without the three listed columns.
display(
    empDF.drop("hire", "birth", "salary")
)

### 'withColumnRenamed' to change a Column (Alternative to 'alias' and 'as')

In [0]:
from pyspark.sql.functions import col

# Select two columns, then rename each one in turn.
display(
    complexDF.select("name", "subordinates")
    .withColumnRenamed("name", "FullName")
    .withColumnRenamed("subordinates", "Associates")
)

### 'isNotNull' is a Column method (accessed here via 'col')

In [0]:
# Three sample rows; the None entries are the missing values filtered on in the next cell.
columns = ["name", "state", "gender"]
data = [
    ("James", None, "M"),
    ("Anna", "NY", "F"),
    ("Julia", None, None),
]

df = spark.createDataFrame(data, schema=columns)
display(df)

In [0]:
# Keep only the rows whose 'state' value is not NULL.
display(
    df.filter(col("state").isNotNull())
)

### Compound 'filter'

In [0]:
# Both conditions must hold; each comparison is parenthesised because '&'
# binds tighter than '>' and '==' in Python.
display(
    empDF.filter(
        (empDF["emp"] > 1010) & (empDF["dept"] == 501)
    )
)

### 'dropDuplicates' and 'distinct'

In [0]:
# Sample rows in which the "Juli" record appears twice.
columns = ["name", "state", "gender"]
data = [
    ("Mark", "OH", "M"),
    ("Juli", "NY", "F"),
    ("Juli", "NY", "F"),
]

df = spark.createDataFrame(data, schema=columns)
display(df)

In [0]:
# select("name") keeps only the 'name' column; distinct() then removes the duplicate rows
display(df.select("name").distinct())

In [0]:
# dropDuplicates(["name"]) de-duplicates on 'name' only and keeps all the other columns
display(df.dropDuplicates(["name"]))

### 'limit' and 'sort' and 'orderBy'

In [0]:
# Note: in the DataFrame API, sort() and orderBy() are aliases of each other.
# Both produce a total (global) ordering of the output, which generally
# requires shuffling data across the cluster; neither one collects the data
# into a single executor, and neither is more efficient than the other.

# To sort the records only within each partition (cheaper, but with no
# guarantee on the overall order of the output), use sortWithinPartitions()
# instead.

display(empDF.sort("emp").limit(10))

In [0]:
# Order by employee number, highest first.
display(
    empDF.orderBy("emp", ascending=False)
)

### Lab 01: Date and TimeStamps

In [0]:
# Read the web-event CSV, using its first line as the header and letting
# Spark infer the column types.
eventsDF = spark.read.csv(
    "dbfs:/FileStore/tables/webevent.csv",
    header=True,
    inferSchema=True,
)
display(eventsDF)

### 'select' with Column operators
#### cast, alias, between, substr, orderBy

In [0]:
# cast: change the column's data type
empDF.select(empDF["salary"].cast("string")).limit(2).show()

# alias: rename the column in the output
empDF.select(empDF["salary"].alias("wage")).limit(2).show()

# between: Boolean column showing whether salary lies in the range 50000..100000
empDF.select(empDF["salary"], empDF["salary"].between(50000, 100000)).limit(2).show()

# substr: take 4 characters starting at position 1
empDF.select(empDF["salary"].substr(1, 4)).limit(2).show()

# orderBy: highest salaries first, top three rows only
empDF.orderBy(empDF["salary"].desc()).limit(3).show()

### Statistics using 'describe' and 'count'

In [0]:
# Summary statistics for the columns of empDF, rendered as a table.
display(empDF.describe())

# Total number of rows; as the last expression it is shown as the cell's output.
empDF.count()

### 'na.fill' to replace NULL values. 'drop' to remove Columns in Output

In [0]:
# Load the flight sample data and print it as-is, NULL values included.
flyDF = spark.read.parquet("dbfs:/FileStore/tables/fly1.parquet")
flyDF.show()

# Drop the k10/v10 columns, then replace the remaining NULLs.
# na.fill() only touches columns whose type matches the fill value: a string
# value is ignored for numeric columns. Filling with 0 and then with "0"
# covers both numeric and string columns.
flyDF2 = flyDF.drop("k10", "v10").na.fill(0).na.fill("0")
flyDF2.show()

### 'groupBy', 'count' and 'orderBy'

In [0]:
# Count the rows per department for departments above 400, highest dept first.
display(
    empDF.select("last_name", "dept", "salary")
         .filter("dept > 400")
         .groupBy("dept")
         .count()
         .orderBy(col("dept").desc())
)

### 'groupBy' with 'agg'

In [0]:
# Per department: row count and maximum salary, given as a
# {column: aggregate-function} mapping.
display(
    empDF.select("dept", "salary")
         .groupBy("dept")
         .agg({"*": "count", "salary": "max"})
)

### 'join'

In [0]:
# Lab 12a: Join column = 'dept'
# Join employees to departments on the shared 'dept' column and show three rows.
display(empDF.join(deptDF, on="dept").limit(3))

# Same join, keeping only three columns of the result.
display(
    empDF.join(deptDF, on="dept")
         .select("last_name", "dept", "dept_name")
         .limit(4)
)

In [0]:
# Lab 12b: Change dept 100 to 999 for the next Lab (Left JOIN).
# Dept 999 then exists in empDF2, but not in deptDF.
# Note: this rebinds empDF2, which an earlier cell already defined.
from pyspark.sql.functions import when

empDF2 = empDF.withColumn(
    "dept",
    when(empDF.dept == 100, 999).otherwise(empDF.dept),
)
empDF3 = empDF2.distinct().orderBy("dept", ascending=False)

In [0]:
# Lab 12c: Left-outer Join, ordered by dept descending.
display(empDF3.join(deptDF, "dept", "left_outer").orderBy(["dept"], ascending=False))

# Lab 12d: Join on 2 columns.
# The condition references empDF3 (the DataFrame actually being joined) rather
# than its parent empDF2, so the left-hand columns match the join's left side.
# NOTE(review): assumes deptDF also has a 'mgr' column -- confirm against its schema.
display(empDF3.join(deptDF, (empDF3.dept == deptDF.dept) & (empDF3.mgr == deptDF.mgr)).limit(3))

### 'explain' on DataFrame

In [0]:
# Print the query plan for the join + select in "formatted" mode.
(
    empDF.join(deptDF, "dept")
         .select("last_name", "dept", "dept_name")
         .explain(mode="formatted")
)

### Spark SQL Tables/Views syntax

In [0]:
# Register both DataFrames as temporary views so the %sql cells below can query them.
deptDF.createOrReplaceTempView("dept_view")
empDF.createOrReplaceTempView("emp_view")

In [0]:
%sql

-- Show every row and column of the dept_view temporary view.
SELECT * FROM dept_view;

In [0]:
%sql
-- Demonstrates column aliasing (AS), range filtering (WHERE ... BETWEEN) and ORDER BY.

SELECT
    emp,
    last_name,
    salary AS wage
FROM emp_view
WHERE salary BETWEEN 50000 AND 100000
ORDER BY salary DESC;

In [0]:
%sql
-- Aggregations per department: row count and maximum salary.

SELECT
    dept,
    COUNT(*)    AS ct_dept,
    MAX(salary) AS max_sal
FROM emp_view
GROUP BY dept;

In [0]:
# Lab 14d: Notice all the NULL values in the Output

# Register the flight Parquet data as the 'fly_view' temporary view, then print its rows.
spark.read.format("parquet").load("/FileStore/tables/fly1.parquet").createOrReplaceTempView("fly_view")
spark.table("fly_view").show()

In [0]:
%sql
-- Notice the NULL values in the k6 column. The next cell converts these NULLs to 0.

SELECT
    k1,
    v1,
    k6
FROM fly_view;

In [0]:
%sql
-- COALESCE returns its first non-NULL argument, so a NULL k6 is shown as 0.

SELECT
    k1,
    v1,
    COALESCE(k6, CAST(0 AS INTEGER)) AS k6
FROM fly_view;

In [0]:
%sql
-- JOIN (inner): employees matched to their department on 'dept'.

SELECT *
FROM emp_view AS e
INNER JOIN dept_view AS d
    ON e.dept = d.dept;

In [0]:
%sql
-- EXPLAIN FORMATTED: show the query plan of the left outer join instead of running it.

EXPLAIN FORMATTED
SELECT *
FROM emp_view AS e
LEFT OUTER JOIN dept_view AS d
    ON e.dept = d.dept;

# End of Mod-03a-SparkSQL (Transformations 01)
## Continue on to Notebook Mod03b-SparkSQL (Transformations 02)