In [12]:
case class Bloggers(id:Long, first:String, last:String, url:String, published:String,
hits: Long, campaigns:Array[String])

defined class Bloggers


In [13]:
val bloggers = "../databricks-datasets/learning-spark-v2/blogs.json"
val bloggersDS = spark
 .read
 .format("json")
 .option("path", bloggers)
 .load()
 .as[Bloggers]

bloggers: String = ../databricks-datasets/learning-spark-v2/blogs.json
bloggersDS: org.apache.spark.sql.Dataset[Bloggers] = [Campaigns: array<string>, First: string ... 5 more fields]


In [15]:
bloggersDS.collect()

res1: Array[Bloggers] = Array(Bloggers(1,Jules,Damji,https://tinyurl.1,1/4/2016,4535,[Ljava.lang.String;@54f6955c), Bloggers(2,Brooke,Wenig,https://tinyurl.2,5/5/2018,8908,[Ljava.lang.String;@6b899e09), Bloggers(3,Denny,Lee,https://tinyurl.3,6/7/2019,7659,[Ljava.lang.String;@5d03a0ce), Bloggers(4,Tathagata,Das,https://tinyurl.4,5/12/2018,10568,[Ljava.lang.String;@224b6709), Bloggers(5,Matei,Zaharia,https://tinyurl.5,5/14/2014,40578,[Ljava.lang.String;@42775a69), Bloggers(6,Reynold,Xin,https://tinyurl.6,3/2/2015,25568,[Ljava.lang.String;@65a9edf8))


#### Working with Datasets

In [24]:
import scala.util.Random._
import spark.implicits._  

import scala.util.Random._
import spark.implicits._


In [25]:
// Our case class for the Dataset
case class Usage(uid:Int, uname:String, usage: Int)

defined class Usage


In [26]:
val r = new scala.util.Random(42)
// Create 1000 instances of scala Usage class
// This generates data on the fly
val data = for (i <- 0 to 1000)
 yield (Usage(i, "user-" + r.alphanumeric.take(5).mkString(""), r.nextInt(1000)))

r: scala.util.Random = scala.util.Random@49fb0c76
data: scala.collection.immutable.IndexedSeq[Usage] = Vector(Usage(0,user-Gpi2C,525), Usage(1,user-DgXDi,502), Usage(2,user-M66yO,170), Usage(3,user-xTOn6,913), Usage(4,user-3xGSz,246), Usage(5,user-2aWRN,727), Usage(6,user-EzZY1,65), Usage(7,user-ZlZMZ,935), Usage(8,user-VjxeG,756), Usage(9,user-iqf1P,3), Usage(10,user-91S1q,794), Usage(11,user-qHNj0,501), Usage(12,user-7hb94,460), Usage(13,user-bz0WF,142), Usage(14,user-71nwy,479), Usage(15,user-7GZz1,823), Usage(16,user-1CSk6,140), Usage(17,user-WPzlL,246), Usage(18,user-VaEit,451), Usage(19,user-PSaRq,679), Usage(20,user-0Kkzu,332), Usage(21,user-UN3MG,172), Usage(22,user-KwwER,442), Usage(23,user-ZnltJ,923), Usage(24,user-IRA17,741), Usage(25,user-yNHRT,299), Usage(26,user-CJY3C,996)...


In [27]:
// Create a Dataset of Usage typed data
val dsUsage = spark.createDataset(data)
dsUsage.show(10)

+---+----------+-----+
|uid|     uname|usage|
+---+----------+-----+
|  0|user-Gpi2C|  525|
|  1|user-DgXDi|  502|
|  2|user-M66yO|  170|
|  3|user-xTOn6|  913|
|  4|user-3xGSz|  246|
|  5|user-2aWRN|  727|
|  6|user-EzZY1|   65|
|  7|user-ZlZMZ|  935|
|  8|user-VjxeG|  756|
|  9|user-iqf1P|    3|
+---+----------+-----+
only showing top 10 rows



dsUsage: org.apache.spark.sql.Dataset[Usage] = [uid: int, uname: string ... 1 more field]


#### Higher-order functions and functional programming

In [44]:
import org.apache.spark.sql.functions._ 
dsUsage
 //.filter(d => d.usage > 900)
 .filter($"usage" > 900)
 .orderBy(desc("usage"))
 .show(5, false)

+---+----------+-----+
|uid|uname     |usage|
+---+----------+-----+
|605|user-NL6c4|999  |
|113|user-nnAXr|999  |
|634|user-L0wci|999  |
|561|user-5n2xY|999  |
|26 |user-CJY3C|996  |
+---+----------+-----+
only showing top 5 rows



import org.apache.spark.sql.functions._


In [67]:
def filterWithUsage(u: Usage) = u.usage > 900
dsUsage.filter(filterWithUsage(_)).orderBy(desc("usage")).show(5)

<console>: 2: error: not a legal formal parameter.

In [57]:
// Use an if-then-else lambda expression and compute a value
dsUsage.map(u => {if (u.usage > 750) u.usage * .15 else u.usage * .50 })
 .show(5, false)
// Define a function to compute the usage
def computeCostUsage(usage: Int): Double = {
 if (usage > 750) usage * 0.15 else usage * 0.50
}
// Use the function as an argument to map()
dsUsage.map(u => {computeCostUsage(u.usage)}).show(5, false)

org.apache.spark.SparkException:  Job aborted due to stage failure: Task 0 in stage 16.0 failed 1 times, most recent failure: Lost task 0.0 in stage 16.0 (TID 115) (192.168.0.11 executor driver): java.lang.ClassCastException: $iw cannot be cast to $iw

In [69]:
// Create a new case class with an additional field, cost
case class UsageCost(uid: Int, uname:String, usage: Int, cost: Double)
// Compute the usage cost with Usage as a parameter
// Return a new object, UsageCost


defined class UsageCost


In [70]:
def computeUserCostUsage(u: Usage): UsageCost = {
 val v = if (u.usage > 750) u.usage * 0.15 else u.usage * 0.50
 UsageCost(u.uid, u.uname, u.usage, v)
}
// Use map() on our original Dataset
dsUsage.map(u => {computeUserCostUsage(u)}).show(5)

org.apache.spark.SparkException:  Job aborted due to stage failure: Task 0 in stage 18.0 failed 1 times, most recent failure: Lost task 0.0 in stage 18.0 (TID 117) (192.168.0.11 executor driver): java.lang.ClassCastException: $iw cannot be cast to $iw

#### Conversión de DataFrames a Datasets 

In [71]:
val bloggers = "../databricks-datasets/learning-spark-v2/blogs.json"
val bloggersDS = spark
 .read
 .format("json")
 .option("path", bloggers)
 .load()
 .as[Bloggers]

bloggers: String = ../databricks-datasets/learning-spark-v2/blogs.json
bloggersDS: org.apache.spark.sql.Dataset[Bloggers] = [Campaigns: array<string>, First: string ... 5 more fields]


In [73]:
case class Person(id: Integer, firstName: String, middleName: String, lastName: String,
gender: String, birthDate: String, ssn: String, salary: String)

defined class Person


In [75]:
import java.util.Calendar
val earliestYear = Calendar.getInstance.get(Calendar.YEAR) - 40

val personDS
 // Everyone above 40: lambda-1
 .filter(x => x.birthDate.split("-")(0).toInt > earliestYear)
 // Everyone earning more than 80K
 .filter($"salary" > 80000)
// Last name starts with J: lambda-2
 .filter(x => x.lastName.startsWith("J"))
 // First name starts with D
 .filter($"firstName".startsWith("D"))
 .count()

<console>: 7: error: ')' expected but '=>' found.