In [0]:
# Pre-flight check: list everything in the FileStore tables folder so that
# missing lab files are spotted before any later cell tries to read them.
uploaded_files = dbutils.fs.ls("dbfs:/FileStore/tables/")
display(uploaded_files)

# Mod 04 - Complex Data Types (Array, Map, Struct)

## Lab 01: Query an ARRAY Data type

In [0]:
# Lab 01a: Read the temperature Parquet data, inspect it, and register a TempView

df1 = spark.read.parquet("dbfs:/FileStore/tables/temp1.parquet/")

# Show the rows first, then print the schema
display(df1)
df1.printSchema()

# Register the DataFrame under a name the SQL cells in this lab can query
df1.createOrReplaceTempView("temp_temperature")

In [0]:
# Uncomment and run the line below if you wish to re-run the next cell
# dbutils.fs.rm('/user/hive/warehouse/temperature2', True)

In [0]:
# Lab 01b: Create a permanent table from the TempView

# Run the DDL in order: drop any existing table of that name, then
# CREATE TABLE ... AS SELECT to copy every row of the view into the new table.
for statement in (
    "DROP TABLE IF EXISTS temperature2",
    "CREATE TABLE temperature2 AS SELECT * FROM temp_temperature",
):
    spark.sql(statement)

# Query the new table back to confirm its contents
temperature2_rows = spark.sql("SELECT * FROM temperature2")
display(temperature2_rows)

In [0]:
%sql
-- Lab 01c: Query an ARRAY from the TempView
-- Pluck out the values at two index positions (1 and 2) of the 'mytemp' array.
-- Note: [] subscripts on a Spark SQL array are 0-based, so these are the
-- 2nd and 3rd elements of each array.

SELECT
  city,
  mytemp[1],
  mytemp[2]
FROM temp_temperature

In [0]:
# Lab 01d: Query an ARRAY from a DataFrame using 'array_contains'

# Import library
from pyspark.sql.functions import array_contains

# First convert the Hive table to a DataFrame
df = spark.table("temperature2")

# Find row(s) where the 'mytemp' ARRAY column contains the value 68;
# only the matching city names are shown
has_68 = array_contains(df.mytemp, 68.0)
display(df.select("city").where(has_68))

## Lab 02: Query an ARRAY from DataFrame using 'explode', 'explode_outer' and 'size'

In [0]:
%python
# Lab 02a: Create DataFrame and TempView first

df1 = spark.read.format("parquet").load("dbfs:/FileStore/tables/temp_array_null.parquet")
# df1 = spark.read.format("parquet").load("dbfs:/FileStore/tables/aa_temp_array_null/")
display(df1)
df1.createOrReplaceTempView("temp_array_null")

In [0]:
# Uncomment and run the line below if you wish to re-run the next cell
# dbutils.fs.rm('/user/hive/warehouse/perm_array_null2', True)

In [0]:
%sql
-- Lab 02b: Create a permanent Hive table from the TempView

-- Remove any existing table of the same name
DROP TABLE IF EXISTS perm_array_null2;

-- Copy every row of the view into the new table
CREATE TABLE perm_array_null2 AS
SELECT *
FROM temp_array_null;

-- Confirm the contents of the new table
SELECT *
FROM perm_array_null2;

In [0]:
%py
# Lab 02c: Query ARRAY data type using 'explode', 'explode_outer' and 'size'


from pyspark.sql.functions import explode, explode_outer, size

df = spark.table("perm_array_null2")
display(df)

# 'explode' will not display the NULL Array rows
display(df.select("city", explode("mytemp")))

# 'explode_outer' will display the NULL Array rows
display(df.select("city", explode_outer("mytemp")))

# 'size' will count values in Array (-1 = Null Array)
display(df.select("city", size("mytemp")))

## Lab 03: Reverse on an ARRAY

In [0]:
%py
# Lab 03a: Create TempView from DataFrame
 
df1 = spark.read.format("json").load("dbfs:/FileStore/tables/state_city.json/")
display(df1)
df1.printSchema()
df1.createOrReplaceTempView("temp_state_city")

In [0]:
%python
# Lab 03b: Reverse 'city' Array data type
from pyspark.sql.functions import reverse

display(df1.select(reverse("city")))

## Lab 04: Create ARRAY on-the-fly (Both Scala and Python)

In [0]:
%scala
// Lab 04a: Using Scala, create an ARRAY data type

import org.apache.spark.sql.functions._

// Two bands; each band's songs are packed into a single pipe-delimited string
val df = Seq(
  ("beatles", "help|penny lane"),
  ("supertramp", "breakfast in america")
).toDF("name", "songs")

// 'split' takes a regex pattern, so the pipe is escaped to match a literal '|'.
// The resulting array column replaces the original 'songs' string column.
val songList = split(col("songs"), "\\|")
val bandDF = df.withColumn("songs", songList)

// Show both rows without truncating the cell contents
bandDF.show(numRows = 2, truncate = false)

bandDF.printSchema()

In [0]:
# Lab 04b: Using Python, create an ARRAY data type

# Each row is a one-element tuple whose single value is a Python list;
# that list populates the column named 'data'
rows = [([1, 2, 2, 3],), ([4, 4, 5, 5],)]
df = spark.createDataFrame(rows, ["data"])
df.printSchema()

# Print up to 10 rows without truncating the cell contents
df.show(n=10, truncate=False)

## Lab 05: MAP Data types

In [0]:
# Lab 05a: Read the school Parquet data and register a TempView

df1 = spark.read.parquet("dbfs:/FileStore/tables/school1.parquet/")

# Inspect the rows and the schema
display(df1)
df1.printSchema()

# Register the DataFrame so the %sql cells in this lab can query it by name
df1.createOrReplaceTempView("temp_school")

In [0]:
%sql
-- Lab 05b: Pluck out the value stored under the 'Math' key of the 'grades' MAP

SELECT
  name,
  grades['Math']
FROM temp_school;

In [0]:
%sql
-- Lab 05c: 'explode' out every key/value pair in the 'grades' MAP column

SELECT
  name,
  explode(grades)
FROM temp_school;

In [0]:
# Lab 05d: map_keys

from pyspark.sql.functions import map_keys

# Read from the 'temp_school' TempView registered in Lab 05a instead of the
# notebook-global 'df1': other labs rebind 'df1' to different datasets, so
# going through the view keeps this cell correct when it is re-run out of order.
school_df = spark.table("temp_school")

# 'map_keys' returns an array holding the keys of the 'grades' MAP
display(school_df.select(map_keys("grades")))

In [0]:
# Lab 05e: map_values

from pyspark.sql.functions import map_values

# Read from the 'temp_school' TempView registered in Lab 05a instead of the
# notebook-global 'df1': other labs rebind 'df1' to different datasets, so
# going through the view keeps this cell correct when it is re-run out of order.
school_df = spark.table("temp_school")

# 'map_values' returns an array holding the values of the 'grades' MAP
display(school_df.select(map_values("grades")))

## Lab 06: Create Map DataFrame using 'create_map'

In [0]:
# Lab 06a: Create a MAP data type from a file
from pyspark.sql.functions import create_map

# Read the CSV with a header row, letting Spark infer the column types
df1 = (
    spark.read
    .options(inferSchema=True, header=True)
    .csv("dbfs:/FileStore/tables/names1.csv")
)

df1.printSchema()

df1.show(5, False)

# Pair up (k1 -> v1) and (k2 -> v2) into a single MAP column named 'kv'
kv_map = create_map("k1", "v1", "k2", "v2").alias("kv")
df2 = df1.select(kv_map)
df2.printSchema()

df2.show(5, False)

# Dot notation on the MAP column looks up the value stored under the 'name' key
df2.select("kv.name").show(5, False)

In [0]:
%sql
-- Lab 06b: First CREATE TABLE holding all of your key/value (KV) pairs
-- Must use the STRING data type since the KV pairs can vary

DROP TABLE IF EXISTS demo1;

-- Table backed directly by the CSV file at the given path
CREATE TABLE demo1 (
  k1 string,
  v1 string,
  k2 string,
  v2 string
)
USING CSV
OPTIONS (
  path "dbfs:/FileStore/tables/names.txt",
  header = False
);

SELECT * FROM demo1;

In [0]:
# Lab 06c: (Cont'd) Convert the table to a DataFrame, then convert the KV pairs to a MAP data type

# Imported in this cell as well so it does not depend on Lab 06a having been run first
from pyspark.sql.functions import create_map

# Read every row of 'demo1' and pair up (k1 -> v1) and (k2 -> v2) into a
# single MAP column named 'kv'
demoDF = spark.sql("SELECT * FROM demo1").select(
    create_map("k1", "v1", "k2", "v2").alias("kv")
)
demoDF.printSchema()

display(demoDF)

# Dot notation on the MAP column looks up the value stored under the 'name' key
display(demoDF.select("kv.name"))

## Lab 07: "STRUCTS" Data type

In [0]:
# Lab 07a: Query a STRUCT on a DataFrame

df1 = spark.read.format("parquet").load("dbfs:/FileStore/tables/auto1.parquet/")

# Register the TempView used by the SQL cell that follows
df1.createOrReplaceTempView("temp_auto")

display(df1)

# Dot notation reaches into the 'attrib' column to pull out individual fields
struct_fields = df1.select("name", "attrib.mpg", "attrib.trans")
display(struct_fields)

df1.printSchema()

In [0]:
%sql
-- Lab 07b: Query a 'STRUCT' on the TempView using dot notation
SELECT
  name,
  attrib.wt,
  attrib.trans
FROM temp_auto;

## Lab 08: Putting it All Together

In [0]:
%py
# Lab 08a:
# Fix:  Ensure students have Line 5 correctly codes

complex_df = spark.read.parquet("/FileStore/tables/emp1.parquet/")

complex_df.printSchema()
display(complex_df)
complex_df.createOrReplaceTempView("temp_complex")

In [0]:
%py
# Lab 08b

display(spark.sql("SELECT * FROM temp_complex"))

In [0]:
%py
# Lab 08c: From ARRAY, Pluck out Index 1 from the ARRAY 'subordinates' column

display(spark.sql("SELECT name, subordinates[1] FROM temp_complex"))

In [0]:
%sql

-- Lab 08d: From the MAP column 'deductions', pluck out the value stored under the 'Insurance' key
SELECT
  name,
  deductions.Insurance
FROM temp_complex;

In [0]:
%sql

-- Lab 08e: From the STRUCT column 'address', pluck out the 'city' and 'state' fields
SELECT
  name,
  address.city,
  address.state
FROM temp_complex;

In [0]:
%sql
-- Lab 08f: 'explode' on an ARRAY (one output row per element of 'subordinates')

SELECT
  explode(subordinates)
FROM temp_complex;

In [0]:
%sql
-- Lab 08g: 'explode' on a MAP (one output row per key/value pair of 'deductions')

SELECT
  explode(deductions)
FROM temp_complex;

# End of Module 04: Complex Data Types
### Ignore past here

In [0]:
%sql

--DROP TABLE IF EXISTS employees2;

-- Optional way of doing it, but no need for Schema to be defined since Parquet carries Schema automatically
--CREATE TABLE employees2 (
--      name         STRING, salary  FLOAT,
--      subordinates ARRAY<STRING>,
--      deductions   MAP<STRING, FLOAT>,
--      address      STRUCT<street:STRING,city:STRING,state:STRING, zip:INT>)
--USING PARQUET
--OPTIONS (path "/FileStore/tables/aa_emp_parquet2/")

In [0]:
%sql
--DESCRIBE employees2;

In [0]:
%sql
--SELECT * from employees2

In [0]:
%py 

# How to query in SQL from Python via 'spark.sql'

#fly1_df = spark.sql("SELECT * FROM flights_abbr")
#display(fly1_df)

In [0]:
%py

# 'fly2' folder created automatically
#fly1_df.write.parquet("/FileStore/tables/fly2/", mode = "overwrite")

In [0]:
%python
#fly2_df = spark.read.parquet("/FileStore/tables/fly2/")

In [0]:
%scala
// Note must have TSV file loaded first
//val rdd7 = sc.textFile("dbfs:/FileStore/tables/site02_201410.tsv")		

//Split by tab-delimited, then replace TSV with CSV, then Save to Directory
//val rdd8 = rdd7.map(x => x.split("\t"))
//val rdd9 = rdd8.map(x=>x.mkString(","))		
// rdd9.saveAsTextFile("dbfs:/FileStore/tables/fly3")
// Create DataFrame via Schema, then remove all NULLs

//import org.apache.spark.sql.DataFrame
//import org.apache.spark.sql.functions._
//import org.apache.spark.sql.types._
//import org.apache.spark.sql.types.{StructType, StructField, StringType}
//val schema1 = StructType(Array(StructField("k1", StringType, true),StructField("v1", StringType, true),StructField("k2", StringType, true),StructField("v2", //StringType, true),StructField("k3", StringType, true),StructField("v3", StringType, true),StructField("k4", StringType, true),StructField("v4", StringType, //true),StructField("k5", StringType, true),StructField("v5", StringType, true),StructField("k6", StringType, true),StructField("v6", StringType, //true),StructField("k7", StringType, true),StructField("v7", StringType, true),StructField("k8", StringType, true),StructField("v8", StringType, //true),StructField("k9", StringType, true),StructField("v9", StringType, true),StructField("k10", StringType, true),StructField("v10",StringType, true)))

//val df2 = spark.read.format("csv").option("header", "false").option("inferSchema", "false").schema(schema1).load("dbfs:/FileStore/tables/site02_201410.tsv")
//val df3 = df2.na.fill("0")

// Create Map Data Type by pointing to K-V pairs via 'map' function

In [0]:
%scala
//df3.show()

In [0]:
%sql
-- CREATE TABLE employees (
--      name         STRING, 
--      salary FLOAT,
--      subordinates ARRAY<STRING>,
--      deductions   MAP<STRING, STRING>,
--      address      STRUCT<street:STRING,city:STRING,state:STRING, zip:INT>)
--USING CSV
--LOCATION "dbfs:/shared_uploads/ottmk@ucmail.uc.edu/site02_201410.csv"

-- LOAD DATA local inpath '/opt/employees.txt' into table employees

-- DESCRIBE employees;

-- SELECT * from employees; 

In [0]:
# How to write a DataFrame and/or Table to a file

#complex_df = spark.read.parquet("/FileStore/tables/emp_snappy.parquet")

#complex_df.printSchema()
#complex_df.show(100, False)
#complex_df.createOrReplaceTempView("temp_complex")

In [0]:
# To save a DataFrame as a file, code this:
#complex_df = spark.read.parquet("/FileStore/tables/emp_snappy.parquet")
#complex_df.write.format("orc").save("/FileStore/tables/complex_dir/", mode = "overwrite")

# To save a TempView or Hive table as a file, code this.  Files are saved to: /user/hive/warehouse/<tablename>/
#spark.sql("DROP TABLE IF EXISTS complex_table2")
#spark.sql("CREATE TABLE complex_table2 AS SELECT * FROM temp_complex")

In [0]:
# Confirm files are written from DataFrame:
#display(dbutils.fs.ls("dbfs:/FileStore/tables/complex_dir/"))

In [0]:
# Confirm files are written to Hive table
#display(dbutils.fs.ls("dbfs:/user/hive/warehouse/complex_table2/"))