In [29]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import types

In [3]:
# Build a Spark session on the local master ("local[*]") under the app name
# "test"; getOrCreate() hands back an already-running session if there is one.

spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("test")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/23 02:32:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Spark DataFrame

In [6]:
# Read the previously saved parquet files into a Spark DataFrame

df = spark.read.parquet("fhvhv/2021/01/")

                                                                                

In [9]:
# Print each column's name, type and nullability as a tree
df.printSchema()

root
 |-- hvfhs_license_num: string (nullable = true)
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- SR_Flag: string (nullable = true)



### Example of Transformation in Spark DataFrame

Transformations are lazy: they are not executed right away. Examples include `.select` and `.filter`. If you check the Spark UI, you will see that no jobs run for a transformation on a Spark DataFrame.

In [14]:
# Chaining transformations only describes the result; the cell output is the
# DataFrame's schema summary, not rows.
(
    df
    .select("pickup_datetime", "dropoff_datetime")
    .filter(df["hvfhs_license_num"] == "HV0003")
)

DataFrame[pickup_datetime: timestamp, dropoff_datetime: timestamp]

### Example of Action in Spark DataFrame

Actions are eager: they are executed immediately. Examples include `.show()` and `.head()`. If you check the Spark UI, you will see jobs run for an action on a Spark DataFrame.

In [17]:
# Same chain as the transformation example, finished with .show() so the
# rows are actually computed and printed.
(
    df
    .select("pickup_datetime", "dropoff_datetime")
    .filter(df["hvfhs_license_num"] == "HV0003")
    .show()
)

+-------------------+-------------------+
|    pickup_datetime|   dropoff_datetime|
+-------------------+-------------------+
|2021-01-05 22:14:07|2021-01-05 22:32:28|
|2021-01-02 17:59:55|2021-01-02 18:10:39|
|2021-01-02 23:57:54|2021-01-03 00:15:48|
|2021-01-06 15:53:13|2021-01-06 16:07:07|
|2021-01-07 07:35:24|2021-01-07 07:55:49|
|2021-01-07 08:45:12|2021-01-07 08:51:17|
|2021-01-02 15:44:26|2021-01-02 16:10:50|
|2021-01-04 16:50:28|2021-01-04 16:57:43|
|2021-01-03 10:30:34|2021-01-03 10:44:53|
|2021-01-03 22:05:20|2021-01-03 22:27:55|
|2021-01-04 08:01:02|2021-01-04 08:33:27|
|2021-01-02 13:01:10|2021-01-02 13:08:11|
|2021-01-06 17:12:27|2021-01-06 17:46:56|
|2021-01-04 09:05:18|2021-01-04 09:27:50|
|2021-01-06 16:46:47|2021-01-06 17:50:24|
|2021-01-06 08:03:47|2021-01-06 08:17:43|
|2021-01-04 06:45:42|2021-01-04 06:55:01|
|2021-01-03 13:20:41|2021-01-03 13:31:11|
|2021-01-03 17:30:33|2021-01-03 17:45:19|
|2021-01-06 20:55:57|2021-01-06 21:02:01|
+-------------------+-------------

### Spark Built-in Functions
Spark ships with built-in functions that can be used in transformations. Here we use the built-in function `to_date()` as a demonstration.

In [18]:
# import the functions
from pyspark.sql import functions as F

In [23]:
# Derive date-only columns from the pickup/dropoff timestamps and preview them
(
    df
    .withColumn("pickup_date", F.to_date(df["pickup_datetime"]))
    .withColumn("dropoff_date", F.to_date(df["dropoff_datetime"]))
    .select("pickup_date", "dropoff_date", "PULocationID", "DOLocationID")
    .show()
)

+-----------+------------+------------+------------+
|pickup_date|dropoff_date|PULocationID|DOLocationID|
+-----------+------------+------------+------------+
| 2021-01-03|  2021-01-03|         255|          34|
| 2021-01-05|  2021-01-05|         189|         107|
| 2021-01-02|  2021-01-02|          88|         137|
| 2021-01-02|  2021-01-03|         238|         224|
| 2021-01-06|  2021-01-06|         169|         208|
| 2021-01-07|  2021-01-07|          75|          88|
| 2021-01-07|  2021-01-07|         210|         210|
| 2021-01-02|  2021-01-02|         243|          69|
| 2021-01-04|  2021-01-04|         250|         213|
| 2021-01-03|  2021-01-03|          87|          79|
| 2021-01-03|  2021-01-03|          68|         181|
| 2021-01-04|  2021-01-04|          95|         236|
| 2021-01-02|  2021-01-02|         262|         236|
| 2021-01-04|  2021-01-04|         225|         233|
| 2021-01-06|  2021-01-06|         237|          83|
| 2021-01-05|  2021-01-05|         231|       

### User-Defined Function

In addition to the built-in functions, PySpark gives developers the flexibility to define their own functions for data transformations.


In [25]:
def business_logic(base_num):
    """Turn a base number string into a short, prefixed hexadecimal id.

    The first character of ``base_num`` is dropped and the remainder is
    parsed as a base-10 integer. The result is that integer rendered as
    lowercase hex (zero-padded to at least three digits) behind a one-letter
    prefix:

    * ``s`` when the number is divisible by 7,
    * ``a`` when it is divisible by 6 (and not by 7),
    * ``e`` otherwise.

    Args:
        base_num: A string such as ``"B02765"``, or ``None``.

    Returns:
        A string of the form ``"<prefix>/<hex>"`` (e.g. ``"s/acd"``), or
        ``None`` when ``base_num`` is ``None``.

    Raises:
        ValueError: If the text after the first character is not an integer.
    """
    # The source column is nullable, and a null reaches a Python UDF as None.
    # Slicing None would raise TypeError, so propagate the null instead.
    if base_num is None:
        return None

    num = int(base_num[1:])

    # Divisibility by 7 takes precedence over divisibility by 6.
    if num % 7 == 0:
        prefix = 's'
    elif num % 6 == 0:
        prefix = 'a'
    else:
        prefix = 'e'

    return f'{prefix}/{num:03x}'

In [30]:
# Register the plain Python function as a Spark UDF with a string return type

business_logic_udf = F.udf(
    business_logic,
    returnType=types.StringType(),
)

In [31]:
# Apply the UDF to build base_id next to the derived date columns, then preview

(
    df
    .withColumn("pickup_date", F.to_date(df["pickup_datetime"]))
    .withColumn("dropoff_date", F.to_date(df["dropoff_datetime"]))
    .withColumn("base_id", business_logic_udf(df["dispatching_base_num"]))
    .select("base_id", "pickup_date", "dropoff_date", "PULocationID", "DOLocationID")
    .show()
)

[Stage 5:>                                                          (0 + 1) / 1]

+-------+-----------+------------+------------+------------+
|base_id|pickup_date|dropoff_date|PULocationID|DOLocationID|
+-------+-----------+------------+------------+------------+
|  e/9ce| 2021-01-03|  2021-01-03|         255|          34|
|  e/b42| 2021-01-05|  2021-01-05|         189|         107|
|  e/b33| 2021-01-02|  2021-01-02|          88|         137|
|  e/b38| 2021-01-02|  2021-01-03|         238|         224|
|  e/b3b| 2021-01-06|  2021-01-06|         169|         208|
|  e/b33| 2021-01-07|  2021-01-07|          75|          88|
|  e/acc| 2021-01-07|  2021-01-07|         210|         210|
|  e/acc| 2021-01-02|  2021-01-02|         243|          69|
|  e/b35| 2021-01-04|  2021-01-04|         250|         213|
|  s/b3d| 2021-01-03|  2021-01-03|          87|          79|
|  e/a39| 2021-01-03|  2021-01-03|          68|         181|
|  s/acd| 2021-01-04|  2021-01-04|          95|         236|
|  s/b13| 2021-01-02|  2021-01-02|         262|         236|
|  e/9ce| 2021-01-04|  2

                                                                                