In [None]:
// Cube function: maps a Long n to n * n * n.
// Kept as a function value (not a method) so it can be handed to spark.udf.register.
val cubed: Long => Long = n => n * n * n

// Register UDF under the SQL name "cubed" so it can be called from Spark SQL
spark.udf.register("cubed", cubed)

// Create temporary view "udf_test" over spark.range(1, 9) (a single `id` column)
spark.range(1, 9).createOrReplaceTempView("udf_test")

In [None]:
// Query the cubed UDF: call it from SQL on every id in the udf_test view
spark.sql("SELECT id, cubed(id) AS id_cubed FROM udf_test").show()

In [None]:
// Create DataFrame with two rows, each holding one array of temperatures (t1, t2),
// in a single column named "celsius"
val t1 = Array(35, 36, 32, 30, 40, 42, 38)
val t2 = Array(31, 32, 34, 55, 56)
val tC = Seq(t1, t2).toDF("celsius")
// Register it as temp view "tC" so the SQL cells can query it
tC.createOrReplaceTempView("tC")

// Show the DataFrame
tC.show()

In [None]:
// transform()
// Calculate Fahrenheit from Celsius for an array of temperatures.
// The lambda is applied to each element of `celsius`; `div` is integral division,
// so each converted value is a whole number.
spark.sql("""
SELECT celsius, transform(celsius, t -> ((t * 9) div 5) + 32) as fahrenheit FROM tC
""").show()

In [None]:
// filter()
// Keep only the temperatures above 38C from each array of temperatures
spark.sql("""
SELECT celsius,
 filter(celsius, t -> t > 38) as high FROM tC
""").show()

In [None]:
// exists()
// Check whether each array of temperatures contains a reading of exactly 38C
spark.sql("""
SELECT celsius,
 exists(celsius, t -> t = 38) as threshold FROM tC
""").show()

In [None]:
// reduce()
// Sum each array of temperatures, then derive the average and convert it to Fahrenheit.
// The merge lambda receives the accumulator first and the array element second,
// so the parameters are named (acc, t) accordingly.
// `div` is integral division: the average is truncated to whole degrees Celsius
// before the conversion, so the result is a whole-number approximation.
spark.sql("""
SELECT celsius,
 reduce(celsius, 0, (acc, t) -> acc + t, acc -> (acc div size(celsius) * 9 div 5) + 32) as avgFahrenheit FROM tC
""").show()

In [None]:
import org.apache.spark.sql.functions._

// Set file paths (Databricks sample datasets): flight departure delays CSV
// and the airport codes text file
val delaysPath =
 "/databricks-datasets/learning-spark-v2/flights/departuredelays.csv"
val airportsPath =
 "/databricks-datasets/learning-spark-v2/flights/airport-codes-na.txt"

// Obtain airports data set: a tab-delimited file with a header row, read with
// schema inference enabled, then exposed to SQL as temp view "airports_na"
val airports = spark.read
  .options(Map(
    "header" -> "true",
    "inferschema" -> "true",
    "delimiter" -> "\t"
  ))
  .csv(airportsPath)
airports.createOrReplaceTempView("airports_na")

// Obtain departure delays data set.
// The CSV is read with a header row but without schema inference, so `delay` and
// `distance` are cast to INT explicitly. withColumn already names the result,
// so no alias is needed inside the cast expression.
val delays = spark.read
  .option("header", "true")
  .csv(delaysPath)
  .withColumn("delay", col("delay").cast("int"))
  .withColumn("distance", col("distance").cast("int"))
// Expose the data to SQL as temp view "departureDelays"
delays.createOrReplaceTempView("departureDelays")

// Create temporary small table: SEA -> SFO flights with a positive delay whose
// date starts with '01010', registered as temp view "foo"
val foo = delays.where("""origin == 'SEA' AND destination == 'SFO' AND
 date like '01010%' AND delay > 0""")
foo.createOrReplaceTempView("foo")

In [None]:
// Preview up to 10 rows of the airports_na view
spark.sql("SELECT * FROM airports_na LIMIT 10").show()

In [None]:
// Union two tables: append the rows of foo to delays (union keeps duplicate rows)
// and register the result as temp view "bar"
val bar = delays.union(foo)
bar.createOrReplaceTempView("bar")

// Show the rows of the union that match the same condition used to build foo
bar.where("""origin == 'SEA' AND destination == 'SFO'
AND date LIKE '01010%' AND delay > 0""").show()

In [None]:
// Join foo with the airports data on the origin airport's IATA code and show
// the airport's city/state next to each flight's date, delay, distance and destination.
// The alias is passed as a String: symbol literals ('air) are deprecated in Scala 2.13.
foo.join(
  airports.as("air"),
  $"air.IATA" === $"origin"
).select("City", "State", "date", "delay", "distance", "destination").show()

In [None]:
-- Rebuild the departureDelaysWindow table from scratch
DROP TABLE IF EXISTS departureDelaysWindow;

-- Total delay per (origin, destination) pair, restricted to the listed
-- origin and destination airports
CREATE TABLE departureDelaysWindow AS
SELECT origin, destination, SUM(delay) AS TotalDelays
 FROM departureDelays
WHERE origin IN ('SEA', 'SFO', 'JFK')
 AND destination IN ('SEA', 'SFO', 'JFK', 'DEN', 'ORD', 'LAX', 'ATL')
GROUP BY origin, destination;

-- Inspect the result
SELECT * FROM departureDelaysWindow

origin,destination,TotalDelays
JFK,ORD,5608
JFK,SFO,35619
JFK,DEN,4315
JFK,ATL,12141
JFK,SEA,7856
JFK,LAX,35755
SEA,LAX,9359
SFO,ORD,27412
SFO,DEN,18688
SFO,SEA,17080


In [None]:
// For each origin, keep the destinations whose TotalDelays fall within the top
// three ranks (dense_rank, so tied totals share a rank and ranks have no gaps)
spark.sql("""
SELECT origin, destination, TotalDelays, rank
  FROM (
  SELECT origin, destination, TotalDelays, dense_rank()
  OVER (PARTITION BY origin ORDER BY TotalDelays DESC) as rank
  FROM departureDelaysWindow) t
  WHERE rank <= 3
  """).show()

In [None]:
// Modifications: show the starting foo DataFrame before any column changes
foo.show()

In [None]:
import org.apache.spark.sql.functions.expr

// Adding a column: `status` is 'On-time' when delay <= 10, otherwise 'Delayed'
val foo2 = foo.withColumn(
  "status",
  expr("CASE WHEN delay <= 10 THEN 'On-time' ELSE 'Delayed' END")
)


In [None]:
// Show foo2 (foo plus the derived status column)
foo2.show()

In [None]:
// Dropping columns: foo3 is foo2 without the delay column
val foo3 = foo2.drop("delay")
foo3.show()

In [None]:
// Renaming columns: foo4 is foo3 with status renamed to flight_status
val foo4 = foo3.withColumnRenamed("status", "flight_status")
foo4.show()

In [None]:
-- Pivoting
-- Source rows for the pivot: destination, month (first two characters of `date`)
-- and delay for flights departing from SEA.
-- SUBSTRING positions are 1-based, so the month starts at position 1.
SELECT destination, CAST(SUBSTRING(date, 1, 2) AS int) AS month, delay
FROM departureDelays
WHERE origin = 'SEA'

destination,month,delay
ORD,1,92
JFK,1,-7
DFW,1,-5
MIA,1,-3
DFW,1,-3
DFW,1,1
ORD,1,-10
DFW,1,-6
DFW,1,-2
ORD,1,-3


In [None]:
-- Pivot the SEA departures so that each listed month becomes a pair of columns
-- (<alias>_AvgDelay, <alias>_MaxDelay); only months 1 (JAN) and 2 (FEB) are pivoted.
-- SUBSTRING positions are 1-based, so the month starts at position 1.
SELECT * FROM (
SELECT destination, CAST(SUBSTRING(date, 1, 2) AS int) AS month, delay
 FROM departureDelays WHERE origin = 'SEA'
)

PIVOT (
 CAST(AVG(delay) AS DECIMAL(4, 2)) AS AvgDelay, MAX(delay) AS MaxDelay
 FOR month IN (1 JAN, 2 FEB)
)

ORDER BY destination

destination,JAN_AvgDelay,JAN_MaxDelay,FEB_AvgDelay,FEB_MaxDelay
ABQ,19.86,316,11.42,69.0
ANC,4.44,149,7.9,141.0
ATL,11.98,397,7.73,145.0
AUS,3.48,50,-0.21,18.0
BOS,7.84,110,14.58,152.0
BUR,-2.03,56,-1.89,78.0
CLE,16.0,27,,
CLT,2.53,41,12.96,228.0
COS,5.32,82,12.18,203.0
CVG,-0.5,4,,


Pros y contras de utilizar UDFs

In [None]:
// Pros: Su flexibilidad de uso.

// Contras: No se persisten en el metastore subyacente y, por tanto, solo sirven para una sesión. Además, es más laborioso que utilizar las funciones ya definidas por Spark.