In [1]:
import org.apache.spark.sql.SparkSession 

// Obtain the SparkSession used by the rest of the notebook. getOrCreate() hands back an
// already-running session when one exists (the kernel output below reports one as 'spark').
val sparkSession = SparkSession.builder.appName("Juptyer").getOrCreate()

Intitializing Scala interpreter ...

Spark Web UI available at http://d772c787a954:4041
SparkContext available as 'sc' (version = 3.5.1, master = local[*], app id = local-1734655333256)
SparkSession available as 'spark'


import org.apache.spark.sql.SparkSession
sparkSession: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@5f152f7f


## Events - DATASET

In [2]:
// If a column is nullable, wrap its value type in Option[] - this makes the assumption
// about the pipeline explicit in the type.
// Row type for events.csv; the fields mirror the columns printed by events.show(5).
case class Event (
    user_id: Option[Integer],    // nullable
    device_id: Option[Integer],  // nullable
    referrer: String,            // NOTE(review): not wrapped in Option, yet events.show(5) prints NULL referrers - confirm this is intended
    host: String,
    url: String,
    event_time: String
)

// Two hand-built sample events that are identical except for their referrer.
val dummyData = List("linkedin", "twitter").map { source =>
    Event(
        user_id = None,
        device_id = None,
        referrer = source,
        host = "eczachly.com",
        url = "/signup",
        event_time = "2023-01-01"
    )
}

defined class Event
dummyData: List[Event] = List(Event(None,None,linkedin,eczachly.com,/signup,2023-01-01), Event(None,None,twitter,eczachly.com,/signup,2023-01-01))


In [3]:
// Load events.csv (header row, inferred column types) and bind every row to the Event
// case class up front, so nullability assumptions are checked when rows are decoded.
val events = {
    val eventsCsv = sparkSession.read
        .option("header", "true")
        .option("inferSchema", "true")
        .csv("/home/iceberg/data/events.csv")
    eventsCsv.as[Event]
}

events: org.apache.spark.sql.Dataset[Event] = [user_id: int, device_id: int ... 4 more fields]


In [4]:
// describe() only builds a summary DataFrame (see the `summary` column in the result type);
// nothing is printed unless .show() is called on it.
events.describe()

res0: org.apache.spark.sql.DataFrame = [summary: string, user_id: string ... 4 more fields]


In [5]:
// Print the first 5 event rows as a quick sanity check of the parsed columns.
events.show(5)

+-----------+---------+--------+--------------------+---+--------------------+
|    user_id|device_id|referrer|                host|url|          event_time|
+-----------+---------+--------+--------------------+---+--------------------+
| 1037710827|532630305|    NULL| www.zachwilson.tech|  /|2021-03-08 17:27:...|
|  925588856|532630305|    NULL|    www.eczachly.com|  /|2021-05-10 11:26:...|
|-1180485268|532630305|    NULL|admin.zachwilson....|  /|2021-02-17 16:19:...|
|-1044833855|532630305|    NULL| www.zachwilson.tech|  /|2021-09-24 15:53:...|
|  747494706|532630305|    NULL| www.zachwilson.tech|  /|2021-09-26 16:03:...|
+-----------+---------+--------+--------------------+---+--------------------+
only showing top 5 rows



In [6]:
// Total number of event rows before any null filtering.
events.count()

res2: Long = 404814


In [7]:
// Typed (Dataset) filter: keep only events where both optional ids are present.
val filteredViaDataset = events.filter { e =>
    e.user_id.nonEmpty && e.device_id.nonEmpty
}

filteredViaDataset: org.apache.spark.sql.Dataset[Event] = [user_id: int, device_id: int ... 4 more fields]


In [8]:
// Number of rows left after the typed (Dataset) filter.
filteredViaDataset.count()

res3: Long = 404747


## Events - DataFrame

In [9]:
// Same null filter, expressed with Column predicates on the untyped DataFrame view.
val filteredViaDataFrame = {
    val hasUserId = $"user_id".isNotNull
    val hasDeviceId = $"device_id".isNotNull
    events.toDF().filter(hasUserId && hasDeviceId)
}

filteredViaDataFrame: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [user_id: int, device_id: int ... 4 more fields]


In [10]:
// Number of rows left after the DataFrame filter; should match the Dataset-filter count.
filteredViaDataFrame.count()

res4: Long = 404747


## Events - SQL

In [14]:
/*
Ran this code in another notebook to create the events table
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, col
spark = SparkSession.builder.appName("Jupyter").getOrCreate()
events = spark.read.option("header", "true").csv("/home/iceberg/data/events.csv").withColumn("event_date", expr("DATE_TRUNC('day', event_time)"))
events.write.mode("overwrite").saveAsTable("bootcamp.events_dump")
*/

In [15]:
// Same null filter once more, this time as a SQL query against the saved events_dump table.
val filteredViaSparkSql = {
    val nonNullIdsQuery = "SELECT * FROM demo.bootcamp.events_dump WHERE user_id IS NOT NULL AND device_id IS NOT NULL"
    sparkSession.sql(nonNullIdsQuery)
}

filteredViaSparkSql: org.apache.spark.sql.DataFrame = [user_id: string, device_id: string ... 5 more fields]


In [16]:
// Number of rows left after the SQL filter; should match the Dataset and DataFrame counts.
filteredViaSparkSql.count()

res7: Long = 404747


## Devices

In [51]:
import spark.implicits._

// Row type for devices.csv; the fields mirror the columns printed by devices.show(5).
// NOTE(review): device_id is a plain Integer (not Option), so a missing device_id in the
// CSV is not modeled here - confirm the file never contains one.
case class Device (
    device_id: Integer,
    browser_type: String,
    os_type: String,
    device_type: String
)

// Load devices.csv (header row, inferred column types) as a typed Dataset[Device].
val devices = {
    val devicesCsv = sparkSession.read
        .option("header", "true")
        .option("inferSchema", "true")
        .csv("/home/iceberg/data/devices.csv")
    devicesCsv.as[Device]
}

import spark.implicits._
defined class Device
devices: org.apache.spark.sql.Dataset[Device] = [device_id: int, browser_type: string ... 2 more fields]


In [52]:
// describe() only builds a summary DataFrame; call .show() on it to actually print the statistics.
devices.describe()

res33: org.apache.spark.sql.DataFrame = [summary: string, device_id: string ... 3 more fields]


In [53]:
// Print the first 5 device rows as a quick sanity check of the parsed columns.
devices.show(5)

+-----------+--------------------+-------+------------------+
|  device_id|        browser_type|os_type|       device_type|
+-----------+--------------------+-------+------------------+
|-2147042689|             Firefox| Ubuntu|             Other|
|-2146219609|            WhatsApp|  Other|            Spider|
|-2145574618|       Chrome Mobile|Android|Generic Smartphone|
|-2144707350|Chrome Mobile Web...|Android|  Samsung SM-G988B|
|-2143813999|Mobile Safari UI/...|    iOS|            iPhone|
+-----------+--------------------+-------+------------------+
only showing top 5 rows



In [54]:
// Count rows per device_id (presumably a check that device_id is unique before joining on it).
devices.groupBy("device_id").count().show(5)

+-----------+-----+
|  device_id|count|
+-----------+-----+
|-2001648078|    1|
|-1649034749|    1|
|-1529640407|    1|
|-1307533700|    1|
|-1288794983|    1|
+-----------+-----+
only showing top 5 rows



In [55]:
// Register both Datasets as temp views so they can be referenced by name from Spark SQL queries.
devices.createOrReplaceTempView("devices")
events.createOrReplaceTempView("events")

In [75]:
// Flattened result of joining an Event with its Device: the event's fields plus the
// device's browser/os/device type. The ids are plain Integers here (no Option), so rows
// must already have both ids present before they are converted to this type.
case class EventWithDeviceInfo (
    user_id: Integer,
    device_id: Integer,
    browser_type: String,
    os_type: String,
    device_type: String,
    referrer: String,
    host: String,
    url: String,
    event_time: String
)

/** Upper-cases `s`, passing `null` through unchanged.
  *
  * The null guard matters once this function is wrapped as a Spark UDF: a SQL NULL in a
  * string column reaches the function as a JVM `null`, and calling a method on it would
  * fail the whole job with a NullPointerException.
  *
  * @param s the string to convert; may be null
  * @return `s` in upper case, or null when `s` is null
  */
def toUpperCase(s: String): String =
    Option(s).map(_.toUpperCase()).orNull

// Typed join of the filtered events with devices, flattened into EventWithDeviceInfo rows
// with the browser type upper-cased.
//  - The join key is taken from filteredViaDataset (the Dataset actually being joined),
//    not from its parent `events`.
//  - user_id / device_id are unwrapped with a for-comprehension rather than Option.get, so
//    a row with a missing id is skipped instead of throwing NoSuchElementException.
//  - browser_type is upper-cased in the same pass, and a null browser_type stays null.
val combinedViaDatasets = filteredViaDataset
    .joinWith(devices, filteredViaDataset("device_id") === devices("device_id"), "inner")
    .flatMap { case (event: Event, device: Device) =>
        val combined = for {
            userId <- event.user_id
            deviceId <- event.device_id
        } yield EventWithDeviceInfo(
            user_id = userId,
            device_id = deviceId,
            browser_type = Option(device.browser_type).map(_.toUpperCase).orNull,
            os_type = device.os_type,
            device_type = device.device_type,
            referrer = event.referrer,
            host = event.host,
            url = event.url,
            event_time = event.event_time
        )
        // flatMap expects a collection: zero elements when an id was missing, one otherwise.
        combined.toList
    }

combinedViaDatasets.show(5)

+-----------+---------+------------+-------+-----------+--------+--------------------+---+--------------------+
|    user_id|device_id|browser_type|os_type|device_type|referrer|                host|url|          event_time|
+-----------+---------+------------+-------+-----------+--------+--------------------+---+--------------------+
| 1037710827|532630305|       OTHER|  Other|      Other|    NULL| www.zachwilson.tech|  /|2021-03-08 17:27:...|
|  925588856|532630305|       OTHER|  Other|      Other|    NULL|    www.eczachly.com|  /|2021-05-10 11:26:...|
|-1180485268|532630305|       OTHER|  Other|      Other|    NULL|admin.zachwilson....|  /|2021-02-17 16:19:...|
|-1044833855|532630305|       OTHER|  Other|      Other|    NULL| www.zachwilson.tech|  /|2021-09-24 15:53:...|
|  747494706|532630305|       OTHER|  Other|      Other|    NULL| www.zachwilson.tech|  /|2021-09-26 16:03:...|
+-----------+---------+------------+-------+-----------+--------+--------------------+---+--------------

defined class EventWithDeviceInfo
toUpperCase: (s: String)String
combinedViaDatasets: org.apache.spark.sql.Dataset[EventWithDeviceInfo] = [user_id: int, device_id: int ... 7 more fields]


In [76]:
// Wrap the plain Scala toUpperCase function as a Spark UDF so it can be applied to Column expressions.
val toUpperCaseUdf = udf(toUpperCase _)

// Untyped (DataFrame) version of the join. Columns are addressed by alias-qualified name,
// so the static typing (and editor assistance) of the Dataset version is given up.
val combinedViaDataFrames = {
    val eventRows = filteredViaDataFrame.as("e")
    val deviceRows = devices.as("d")
    // Column equality in DataFrame expressions is written with triple equals.
    val sameDevice = $"e.device_id" === $"d.device_id"
    val projection = Seq(
        col("e.user_id"),
        $"d.device_id",
        toUpperCaseUdf($"d.browser_type").as("browser_type"),
        $"d.os_type",
        $"d.device_type",
        $"e.referrer",
        $"e.host",
        $"e.url",
        $"e.event_time"
    )
    eventRows.join(deviceRows, sameDevice, "inner").select(projection: _*)
}

combinedViaDataFrames.show(5)

+-----------+---------+------------+-------+-----------+--------+--------------------+---+--------------------+
|    user_id|device_id|browser_type|os_type|device_type|referrer|                host|url|          event_time|
+-----------+---------+------------+-------+-----------+--------+--------------------+---+--------------------+
| 1037710827|532630305|       OTHER|  Other|      Other|    NULL| www.zachwilson.tech|  /|2021-03-08 17:27:...|
|  925588856|532630305|       OTHER|  Other|      Other|    NULL|    www.eczachly.com|  /|2021-05-10 11:26:...|
|-1180485268|532630305|       OTHER|  Other|      Other|    NULL|admin.zachwilson....|  /|2021-02-17 16:19:...|
|-1044833855|532630305|       OTHER|  Other|      Other|    NULL| www.zachwilson.tech|  /|2021-09-24 15:53:...|
|  747494706|532630305|       OTHER|  Other|      Other|    NULL| www.zachwilson.tech|  /|2021-09-26 16:03:...|
+-----------+---------+------------+-------+-----------+--------+--------------------+---+--------------

toUpperCaseUdf: org.apache.spark.sql.expressions.UserDefinedFunction = SparkUserDefinedFunction($Lambda$7025/0x0000000842146840@34176149,StringType,List(Some(class[value[0]: string])),Some(class[value[0]: string]),None,true,true)
combinedViaDataFrames: org.apache.spark.sql.DataFrame = [user_id: int, device_id: int ... 7 more fields]


In [25]:
//Creating temp views is a good strategy if you're leveraging SparkSQL
filteredViaSparkSql.createOrReplaceTempView("filtered_events")
// Plain triple-quoted string: there is nothing to interpolate, and the f-interpolator
// would treat any literal % added later (e.g. in a LIKE pattern) as a format specifier.
// filtered_events carries user_id/device_id as strings while devices.device_id is an int,
// so the join condition relies on Spark's implicit cast between the two.
val combinedViaSparkSQL = spark.sql("""
    SELECT
        fe.user_id,
        d.device_id,
        d.browser_type,
        d.os_type,
        d.device_type,
        fe.referrer,
        fe.host,
        fe.url,
        fe.event_time
    FROM filtered_events fe
    JOIN devices d ON fe.device_id = d.device_id
""")

combinedViaSparkSQL.show(5)

+-----------+---------+------------+-------+-----------+--------+--------------------+---+--------------------+
|    user_id|device_id|browser_type|os_type|device_type|referrer|                host|url|          event_time|
+-----------+---------+------------+-------+-----------+--------+--------------------+---+--------------------+
| 1037710827|532630305|       Other|  Other|      Other|    NULL| www.zachwilson.tech|  /|2021-03-08 17:27:...|
|  925588856|532630305|       Other|  Other|      Other|    NULL|    www.eczachly.com|  /|2021-05-10 11:26:...|
|-1180485268|532630305|       Other|  Other|      Other|    NULL|admin.zachwilson....|  /|2021-02-17 16:19:...|
|-1044833855|532630305|       Other|  Other|      Other|    NULL| www.zachwilson.tech|  /|2021-09-24 15:53:...|
|  747494706|532630305|       Other|  Other|      Other|    NULL| www.zachwilson.tech|  /|2021-09-26 16:03:...|
+-----------+---------+------------+-------+-----------+--------+--------------------+---+--------------

combinedViaSparkSQL: org.apache.spark.sql.DataFrame = [user_id: string, device_id: int ... 7 more fields]


---
# Original Code from Databricks [Does not work]

In [None]:
import org.apache.spark.sql.SparkSession 

// Session setup for the original (non-working) Databricks version of this walkthrough.
val sparkSession = SparkSession.builder.appName("Juptyer").getOrCreate()

//TODO Illustrate how this fails if you change from Option[String] to String for referrer
// Row type for events.csv in the original Databricks version. Here all three nullable
// columns (user_id, device_id, referrer) are wrapped in Option.
case class Event (
   //Option is a way to handle NULL more gracefully
    user_id: Option[Integer],
    device_id: Option[Integer],
    referrer: Option[String],
    host: String,
    url: String,
    event_time: String
)


// NOTE(review): this statement cannot compile as written:
//  - `dummyData` is assigned without `val`/`var` and is not declared earlier in this cell;
//  - user_id, device_id and referrer are Option[...] fields on Event, but bare Int / String
//    literals are passed (they would need to be wrapped, e.g. Some(...)).
dummyData = List(
        Event(user_id=1, device_id=2, referrer="linkedin", host="eczachly.com", url="/signup", event_time="2023-01-01"),
        Event(user_id=3, device_id=7, referrer="twitter", host="eczachly.com", url="/signup", event_time="2023-01-01")
    )

//TODO Illustrate how this fails if you change from Option[Long] to Long
// NOTE(review): the TODO above talks about Option[Long]/Long, but device_id is declared
// as a plain Integer here - the TODO and the declaration do not match.
case class Device (
    device_id: Integer,
    browser_type: String,
    os_type: String,
    device_type: String
)

// Flattened join result: the event's fields plus the matching device's attributes.
// Every field is non-optional, including referrer (String), while this cell's Event
// declares referrer as Option[String].
case class EventWithDeviceInfo (
   user_id: Integer,
    device_id: Integer,
    browser_type: String,
    os_type: String,
    device_type: String,
    referrer: String,
    host: String,
    url: String,
    event_time: String
)

// When should you use each type?
// Brings the encoders needed by .as[...] and the $"col" syntax into scope.
import sparkSession.implicits._

// Applying this case class before hand is very powerful, enforces Nullability/non-nullability at runtime!
// NOTE(review): `Dataset` is referenced by its simple name below, but this cell only imports
// SparkSession - presumably `import org.apache.spark.sql.Dataset` is also needed; confirm.
val events: Dataset[Event] = sparkSession.read.option("header", "true")
                        .option("inferSchema", "true")
                        .csv("/home/iceberg/data/events.csv")
                        .as[Event]

val devices: Dataset[Device] = sparkSession.read.option("header", "true")
                        .option("inferSchema", "true")
                        .csv("/home/iceberg/data/devices.csv")
                        .as[Device]

// Register both Datasets as temp views so the SQL variants below can reference them by name.
devices.createOrReplaceTempView("devices")
events.createOrReplaceTempView("events")

// For simple transformations, you can see that these approaches are very similar. Dataset is winning slightly because of the quality enforcement
// The same "both ids present" filter, written three ways: typed lambda, Column predicates, SQL.
val filteredViaDataset = events.filter(event => event.user_id.isDefined && event.device_id.isDefined)
val filteredViaDataFrame = events.toDF().where($"user_id".isNotNull && $"device_id".isNotNull)
val filteredViaSparkSql = sparkSession.sql("SELECT * FROM events WHERE user_id IS NOT NULL AND device_id IS NOT NULL")


// This will fail if user_id is None
// NOTE(review): problems visible in this block (kept as-is for reference):
//  - `referrer=event.referrer` passes an Option[String] (per this cell's Event) into the
//    String field of EventWithDeviceInfo;
//  - `.map( case ... )` uses parentheses around a `case` clause; a pattern-matching
//    function literal needs braces, i.e. `.map { case ... }`;
//  - `row.browser_type = ...` assigns to a case-class field, which is an immutable val
//    (use row.copy(browser_type = ...) instead);
//  - `return row` is used inside a lambda;
//  - `toUpperCase` is not defined anywhere in this cell.
val combinedViaDatasets = filteredViaDataset
    .joinWith(devices, events("device_id") === devices("device_id"), "inner")
    .map{ case (event: Event, device: Device) => EventWithDeviceInfo(
                  user_id=event.user_id.get,
                  device_id=device.device_id,
                  browser_type=device.browser_type,
                  os_type=device.os_type,
                  device_type=device.device_type,
                  referrer=event.referrer,
                  host=event.host,
                  url=event.url,
                  event_time=event.event_time
              ) }
    .map( case (row: EventWithDeviceInfo) => {
        row.browser_type = toUpperCase(row.browser_type)
        return row
    })




// DataFrames give up some of the intellisense because you no longer have static typing
// Untyped join: both sides are aliased ("e" / "d") and every output column is selected
// by its alias-qualified name. Unlike the Dataset variant above, browser_type is not
// upper-cased here.
val combinedViaDataFrames = filteredViaDataFrame.as("e")
            //Make sure to use triple equals when using data frames
            .join(devices.as("d"), $"e.device_id" === $"d.device_id", "inner")
            .select(
              $"e.user_id",
              $"d.device_id",
              $"d.browser_type",
              $"d.os_type",
              $"d.device_type",
              $"e.referrer",
              $"e.host",
              $"e.url",
              $"e.event_time"
            )

//Creating temp views is a good strategy if you're leveraging SparkSQL
filteredViaSparkSql.createOrReplaceTempView("filtered_events")
// NOTE(review): this calls `spark.sql`, while the rest of the cell uses `sparkSession`;
// `spark` is not defined in this cell and only resolves if the kernel provides it.
// The f-interpolator is not needed here since the query contains nothing to format.
val combinedViaSparkSQL = spark.sql(f"""
    SELECT 
        fe.user_id,
        d.device_id,
        d.browser_type,
        d.os_type,
        d.device_type,
        fe. referrer,
        fe.host,
        fe.url,
        fe.event_time
    FROM filtered_events fe 
    JOIN devices d ON fe.device_id = d.device_id
""")

// Fetch the first 5 joined rows to the driver (take returns them; it does not print a table like show).
combinedViaDatasets.take(5)    