## Spark Session

In [1]:
from pyspark.sql import SparkSession

appName = "BDP DF Functions"

# Create Spark session
spark = SparkSession.builder \
    .appName(appName) \
    .getOrCreate()


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


# Spark Parquet Dataset

In [2]:
!hdfs dfs -mkdir -p /user/bigdatapedia/input/customer/parquet

In [3]:
!hdfs dfs -put /home/bigdatapedia/data/customer_parq.parquet /user/bigdatapedia/input/customer/parquet/

In [4]:
!hdfs dfs -ls -h /user/bigdatapedia/input/customer/parquet/

Found 1 items
-rw-r--r--   3 bigdatapedia supergroup    248.7 K 2025-03-23 03:49 /user/bigdatapedia/input/customer/parquet/customer_parq.parquet


In [5]:
df_cust = spark.read.parquet("/user/bigdatapedia/input/customer/parquet")

                                                                                

In [6]:
df_cust.show(5,0)

[Stage 1:>                                                          (0 + 1) / 1]

+-----------+--------------+--------------+--------------+-----------------+-----------------------+-------------+--------------+----------------+
|customer_id|customer_fname|customer_lname|customer_email|customer_password|customer_street        |customer_city|customer_state|customer_zipcode|
+-----------+--------------+--------------+--------------+-----------------+-----------------------+-------------+--------------+----------------+
|1          |Richard       |Hernandez     |XXXXXXXXX     |XXXXXXXXX        |6303 Heather Plaza     |Brownsville  |TX            |78521           |
|2          |Mary          |Barrett       |XXXXXXXXX     |XXXXXXXXX        |9526 Noble Embers Ridge|Littleton    |CO            |80126           |
|3          |Ann           |Smith         |XXXXXXXXX     |XXXXXXXXX        |3422 Blue Pioneer Bend |Caguas       |PR            |00725           |
|4          |Mary          |Jones         |XXXXXXXXX     |XXXXXXXXX        |8324 Little Common     |San Marcos   |CA  

                                                                                

### Spark BuiltIn Functions

#### String Functions

In [48]:
from pyspark.sql.functions import concat_ws, substring, regexp_replace, regexp_extract, instr, 

In [35]:
df_select = df_cust.select("customer_id", "customer_fname", "customer_lname", "customer_street", "customer_city", "customer_state")

In [13]:
df_concat = df_select.withColumn("FullName", concat_ws(" ", df_cust["customer_fname"], df_cust["customer_lname"]))

In [14]:
df_concat.show(5,0)

[Stage 3:>                                                          (0 + 1) / 1]

+-----------+--------------+--------------+-------------+--------------+-----------------+
|customer_id|customer_fname|customer_lname|customer_city|customer_state|FullName         |
+-----------+--------------+--------------+-------------+--------------+-----------------+
|1          |Richard       |Hernandez     |Brownsville  |TX            |Richard Hernandez|
|2          |Mary          |Barrett       |Littleton    |CO            |Mary Barrett     |
|3          |Ann           |Smith         |Caguas       |PR            |Ann Smith        |
|4          |Mary          |Jones         |San Marcos   |CA            |Mary Jones       |
|5          |Robert        |Hudson        |Caguas       |PR            |Robert Hudson    |
+-----------+--------------+--------------+-------------+--------------+-----------------+
only showing top 5 rows



                                                                                

In [40]:
df_select.select("*", substring("customer_street", 1, 4).alias("DoorNo")).show(10,0)

+-----------+--------------+--------------+---------------------------+-------------+--------------+------+
|customer_id|customer_fname|customer_lname|customer_street            |customer_city|customer_state|DoorNo|
+-----------+--------------+--------------+---------------------------+-------------+--------------+------+
|1          |Richard       |Hernandez     |6303 Heather Plaza         |Brownsville  |TX            |6303  |
|2          |Mary          |Barrett       |9526 Noble Embers Ridge    |Littleton    |CO            |9526  |
|3          |Ann           |Smith         |3422 Blue Pioneer Bend     |Caguas       |PR            |3422  |
|4          |Mary          |Jones         |8324 Little Common         |San Marcos   |CA            |8324  |
|5          |Robert        |Hudson        |10 Crystal River Mall      |Caguas       |PR            |10 C  |
|6          |Mary          |Smith         |3151 Sleepy Quail Promenade|Passaic      |NJ            |3151  |
|7          |Melissa       |

In [51]:
df_select.select("*", instr(substring("customer_street", 1, 4), " ").alias("DoorNo")).show(10,0)

+-----------+--------------+--------------+---------------------------+-------------+--------------+------+
|customer_id|customer_fname|customer_lname|customer_street            |customer_city|customer_state|DoorNo|
+-----------+--------------+--------------+---------------------------+-------------+--------------+------+
|1          |Richard       |Hernandez     |6303 Heather Plaza         |Brownsville  |TX            |0     |
|2          |Mary          |Barrett       |9526 Noble Embers Ridge    |Littleton    |CO            |0     |
|3          |Ann           |Smith         |3422 Blue Pioneer Bend     |Caguas       |PR            |0     |
|4          |Mary          |Jones         |8324 Little Common         |San Marcos   |CA            |0     |
|5          |Robert        |Hudson        |10 Crystal River Mall      |Caguas       |PR            |3     |
|6          |Mary          |Smith         |3151 Sleepy Quail Promenade|Passaic      |NJ            |0     |
|7          |Melissa       |

In [47]:
df_select.select("*", 
                 regexp_extract(substring("customer_street", 1, 4), "[0-9]+", 0).alias("DoorNo")
                ).show(10,0)

+-----------+--------------+--------------+---------------------------+-------------+--------------+------+
|customer_id|customer_fname|customer_lname|customer_street            |customer_city|customer_state|DoorNo|
+-----------+--------------+--------------+---------------------------+-------------+--------------+------+
|1          |Richard       |Hernandez     |6303 Heather Plaza         |Brownsville  |TX            |6303  |
|2          |Mary          |Barrett       |9526 Noble Embers Ridge    |Littleton    |CO            |9526  |
|3          |Ann           |Smith         |3422 Blue Pioneer Bend     |Caguas       |PR            |3422  |
|4          |Mary          |Jones         |8324 Little Common         |San Marcos   |CA            |8324  |
|5          |Robert        |Hudson        |10 Crystal River Mall      |Caguas       |PR            |10    |
|6          |Mary          |Smith         |3151 Sleepy Quail Promenade|Passaic      |NJ            |3151  |
|7          |Melissa       |

                                                                                

In [57]:
df_select.withColumn("customer_street", regexp_replace("customer_street", "[0-9]+", "xxxx")
                ).show(10,0)

+-----------+--------------+--------------+---------------------------+-------------+--------------+
|customer_id|customer_fname|customer_lname|customer_street            |customer_city|customer_state|
+-----------+--------------+--------------+---------------------------+-------------+--------------+
|1          |Richard       |Hernandez     |xxxx Heather Plaza         |Brownsville  |TX            |
|2          |Mary          |Barrett       |xxxx Noble Embers Ridge    |Littleton    |CO            |
|3          |Ann           |Smith         |xxxx Blue Pioneer Bend     |Caguas       |PR            |
|4          |Mary          |Jones         |xxxx Little Common         |San Marcos   |CA            |
|5          |Robert        |Hudson        |xxxx Crystal River Mall    |Caguas       |PR            |
|6          |Mary          |Smith         |xxxx Sleepy Quail Promenade|Passaic      |NJ            |
|7          |Melissa       |Wilcox        |xxxx High Concession       |Caguas       |PR    

#### Date Functions

In [20]:
from pyspark.sql.functions import current_date, current_timestamp, date_format, date_sub, month, year

In [18]:
df_curr_date = df_select.withColumn("Current_Date", current_date()).withColumn("Current_Time", current_timestamp())

In [19]:
df_curr_date.show(5,0)

+-----------+--------------+--------------+-------------+--------------+------------+-----------------------+
|customer_id|customer_fname|customer_lname|customer_city|customer_state|Current_Date|Current_Time           |
+-----------+--------------+--------------+-------------+--------------+------------+-----------------------+
|1          |Richard       |Hernandez     |Brownsville  |TX            |2025-03-23  |2025-03-23 03:56:47.325|
|2          |Mary          |Barrett       |Littleton    |CO            |2025-03-23  |2025-03-23 03:56:47.325|
|3          |Ann           |Smith         |Caguas       |PR            |2025-03-23  |2025-03-23 03:56:47.325|
|4          |Mary          |Jones         |San Marcos   |CA            |2025-03-23  |2025-03-23 03:56:47.325|
|5          |Robert        |Hudson        |Caguas       |PR            |2025-03-23  |2025-03-23 03:56:47.325|
+-----------+--------------+--------------+-------------+--------------+------------+-----------------------+
only showi

In [26]:
df_format = df_curr_date.select(date_format("Current_Time", "dd-MMM-yy"))

In [27]:
df_format.show(5,0)

+------------------------------------+
|date_format(Current_Time, dd-MMM-yy)|
+------------------------------------+
|23-Mar-25                           |
|23-Mar-25                           |
|23-Mar-25                           |
|23-Mar-25                           |
|23-Mar-25                           |
+------------------------------------+
only showing top 5 rows



In [28]:
df_minus_one = df_curr_date.withColumn("day_before", date_sub("Current_Date", 1))

In [29]:
df_minus_one.show(5,0)

+-----------+--------------+--------------+-------------+--------------+------------+-----------------------+----------+
|customer_id|customer_fname|customer_lname|customer_city|customer_state|Current_Date|Current_Time           |day_before|
+-----------+--------------+--------------+-------------+--------------+------------+-----------------------+----------+
|1          |Richard       |Hernandez     |Brownsville  |TX            |2025-03-23  |2025-03-23 04:01:44.837|2025-03-22|
|2          |Mary          |Barrett       |Littleton    |CO            |2025-03-23  |2025-03-23 04:01:44.837|2025-03-22|
|3          |Ann           |Smith         |Caguas       |PR            |2025-03-23  |2025-03-23 04:01:44.837|2025-03-22|
|4          |Mary          |Jones         |San Marcos   |CA            |2025-03-23  |2025-03-23 04:01:44.837|2025-03-22|
|5          |Robert        |Hudson        |Caguas       |PR            |2025-03-23  |2025-03-23 04:01:44.837|2025-03-22|
+-----------+--------------+----

In [31]:
df_minus_one_nf = df_curr_date.withColumn("day_before", df_curr_date["Current_Date"] - 1)

In [32]:
df_minus_one_nf.show(5,0)

+-----------+--------------+--------------+-------------+--------------+------------+-----------------------+----------+
|customer_id|customer_fname|customer_lname|customer_city|customer_state|Current_Date|Current_Time           |day_before|
+-----------+--------------+--------------+-------------+--------------+------------+-----------------------+----------+
|1          |Richard       |Hernandez     |Brownsville  |TX            |2025-03-23  |2025-03-23 04:04:47.838|2025-03-22|
|2          |Mary          |Barrett       |Littleton    |CO            |2025-03-23  |2025-03-23 04:04:47.838|2025-03-22|
|3          |Ann           |Smith         |Caguas       |PR            |2025-03-23  |2025-03-23 04:04:47.838|2025-03-22|
|4          |Mary          |Jones         |San Marcos   |CA            |2025-03-23  |2025-03-23 04:04:47.838|2025-03-22|
|5          |Robert        |Hudson        |Caguas       |PR            |2025-03-23  |2025-03-23 04:04:47.838|2025-03-22|
+-----------+--------------+----

#### Case When

In [58]:
df_select.show(5,0)

+-----------+--------------+--------------+-----------------------+-------------+--------------+
|customer_id|customer_fname|customer_lname|customer_street        |customer_city|customer_state|
+-----------+--------------+--------------+-----------------------+-------------+--------------+
|1          |Richard       |Hernandez     |6303 Heather Plaza     |Brownsville  |TX            |
|2          |Mary          |Barrett       |9526 Noble Embers Ridge|Littleton    |CO            |
|3          |Ann           |Smith         |3422 Blue Pioneer Bend |Caguas       |PR            |
|4          |Mary          |Jones         |8324 Little Common     |San Marcos   |CA            |
|5          |Robert        |Hudson        |10 Crystal River Mall  |Caguas       |PR            |
+-----------+--------------+--------------+-----------------------+-------------+--------------+
only showing top 5 rows



In [60]:
from pyspark.sql.functions import when

In [66]:
df_case = df_select.withColumn("is_it_tx", 
                               when(df_select["customer_state"]=="TX", "TX").
                               when(df_select["customer_state"]=="CO", "CO").otherwise("Non-CO/TX"))

In [67]:
df_case.show(5,0)

+-----------+--------------+--------------+-----------------------+-------------+--------------+---------+
|customer_id|customer_fname|customer_lname|customer_street        |customer_city|customer_state|is_it_tx |
+-----------+--------------+--------------+-----------------------+-------------+--------------+---------+
|1          |Richard       |Hernandez     |6303 Heather Plaza     |Brownsville  |TX            |TX       |
|2          |Mary          |Barrett       |9526 Noble Embers Ridge|Littleton    |CO            |CO       |
|3          |Ann           |Smith         |3422 Blue Pioneer Bend |Caguas       |PR            |Non-CO/TX|
|4          |Mary          |Jones         |8324 Little Common     |San Marcos   |CA            |Non-CO/TX|
|5          |Robert        |Hudson        |10 Crystal River Mall  |Caguas       |PR            |Non-CO/TX|
+-----------+--------------+--------------+-----------------------+-------------+--------------+---------+
only showing top 5 rows



#### Ranking

In [82]:
from pyspark.sql import Window
from pyspark.sql.functions import dense_rank, rank, row_number

In [70]:
df_regex = df_select.select("*", 
                 regexp_extract(substring("customer_street", 1, 4), "[0-9]+", 0).alias("DoorNo")
                )

df_regex.show(10,0)

+-----------+--------------+--------------+---------------------------+-------------+--------------+------+
|customer_id|customer_fname|customer_lname|customer_street            |customer_city|customer_state|DoorNo|
+-----------+--------------+--------------+---------------------------+-------------+--------------+------+
|1          |Richard       |Hernandez     |6303 Heather Plaza         |Brownsville  |TX            |6303  |
|2          |Mary          |Barrett       |9526 Noble Embers Ridge    |Littleton    |CO            |9526  |
|3          |Ann           |Smith         |3422 Blue Pioneer Bend     |Caguas       |PR            |3422  |
|4          |Mary          |Jones         |8324 Little Common         |San Marcos   |CA            |8324  |
|5          |Robert        |Hudson        |10 Crystal River Mall      |Caguas       |PR            |10    |
|6          |Mary          |Smith         |3151 Sleepy Quail Promenade|Passaic      |NJ            |3151  |
|7          |Melissa       |

In [76]:
windowing = Window.orderBy("DoorNo")

In [78]:
df_regex.withColumn("denseRank", dense_rank().over(windowing)).show(10, 0)

25/03/23 04:39:01 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
[Stage 32:>                                                         (0 + 1) / 1]

+-----------+--------------+--------------+------------------------+-------------+--------------+------+---------+
|customer_id|customer_fname|customer_lname|customer_street         |customer_city|customer_state|DoorNo|denseRank|
+-----------+--------------+--------------+------------------------+-------------+--------------+------+---------+
|2883       |Christine     |Smith         |1 Crystal Gate          |Sheboygan    |WI            |1     |1        |
|4647       |Mary          |Smith         |1 Crystal Gate          |Caguas       |PR            |1     |1        |
|7494       |Rebecca       |Moore         |1 Crystal Gate          |Caguas       |PR            |1     |1        |
|10276      |Mary          |Gutierrez     |1 Foggy Freeway         |Rego Park    |NY            |1     |1        |
|10296      |Mary          |Ramos         |1 Bright Manor          |Caguas       |PR            |1     |1        |
|5          |Robert        |Hudson        |10 Crystal River Mall   |Caguas      

                                                                                

In [80]:
df_regex.withColumn("Ranking", rank().over(windowing)).show(10, 0)

25/03/23 04:39:49 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
[Stage 34:>                                                         (0 + 1) / 1]

+-----------+--------------+--------------+------------------------+-------------+--------------+------+-------+
|customer_id|customer_fname|customer_lname|customer_street         |customer_city|customer_state|DoorNo|Ranking|
+-----------+--------------+--------------+------------------------+-------------+--------------+------+-------+
|2883       |Christine     |Smith         |1 Crystal Gate          |Sheboygan    |WI            |1     |1      |
|4647       |Mary          |Smith         |1 Crystal Gate          |Caguas       |PR            |1     |1      |
|7494       |Rebecca       |Moore         |1 Crystal Gate          |Caguas       |PR            |1     |1      |
|10276      |Mary          |Gutierrez     |1 Foggy Freeway         |Rego Park    |NY            |1     |1      |
|10296      |Mary          |Ramos         |1 Bright Manor          |Caguas       |PR            |1     |1      |
|5          |Robert        |Hudson        |10 Crystal River Mall   |Caguas       |PR            

                                                                                

In [81]:
df_regex.withColumn("Ranking", rank().over(Window.orderBy("DoorNo"))).show(10, 0)

25/03/23 04:41:01 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-----------+--------------+--------------+------------------------+-------------+--------------+------+-------+
|customer_id|customer_fname|customer_lname|customer_street         |customer_city|customer_state|DoorNo|Ranking|
+-----------+--------------+--------------+------------------------+-------------+--------------+------+-------+
|2883       |Christine     |Smith         |1 Crystal Gate          |Sheboygan    |WI            |1     |1      |
|4647       |Mary          |Smith         |1 Crystal Gate          |Caguas       |PR            |1     |1      |
|7494       |Rebecca       |Moore         |1 Crystal Gate          |Caguas       |PR            |1     |1      |
|10276      |Mary          |Gutierrez     |1 Foggy Freeway         |Rego Park    |NY            |1     |1      |
|10296      |Mary          |Ramos         |1 Bright Manor          |Caguas       |PR            |1     |1      |
|5          |Robert        |Hudson        |10 Crystal River Mall   |Caguas       |PR            

In [83]:
df_regex.withColumn("RowNumber", row_number().over(windowing)).show(10,0)

25/03/23 04:42:21 WARN window.WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-----------+--------------+--------------+------------------------+-------------+--------------+------+---------+
|customer_id|customer_fname|customer_lname|customer_street         |customer_city|customer_state|DoorNo|RowNumber|
+-----------+--------------+--------------+------------------------+-------------+--------------+------+---------+
|2883       |Christine     |Smith         |1 Crystal Gate          |Sheboygan    |WI            |1     |1        |
|4647       |Mary          |Smith         |1 Crystal Gate          |Caguas       |PR            |1     |2        |
|7494       |Rebecca       |Moore         |1 Crystal Gate          |Caguas       |PR            |1     |3        |
|10276      |Mary          |Gutierrez     |1 Foggy Freeway         |Rego Park    |NY            |1     |4        |
|10296      |Mary          |Ramos         |1 Bright Manor          |Caguas       |PR            |1     |5        |
|5          |Robert        |Hudson        |10 Crystal River Mall   |Caguas      

In [85]:
df_regex.withColumn("Ranking", rank().over(Window.partitionBy("customer_city").orderBy("DoorNo"))).show(15, 0)

                                                                                

+-----------+--------------+--------------+---------------------------+-------------+--------------+------+-------+
|customer_id|customer_fname|customer_lname|customer_street            |customer_city|customer_state|DoorNo|Ranking|
+-----------+--------------+--------------+---------------------------+-------------+--------------+------+-------+
|2544       |Mary          |Smith         |2905 Quiet River Trail     |Hanover      |PA            |2905  |1      |
|6108       |Emma          |Smith         |3722 Red Hickory Path      |Hanover      |PA            |3722  |2      |
|10693      |Andrea        |Sparks        |3993 Pleasant Beach        |Hanover      |PA            |3993  |3      |
|147        |Mary          |Smith         |443 Jagged Lane            |Hanover      |PA            |443   |4      |
|9008       |Ann           |Smith         |5795 Lazy View Concession  |Hanover      |PA            |5795  |5      |
|8612       |Emma          |Smith         |7041 Rustic Pathway        |H

#### UDF

In [86]:
from pyspark.sql.functions import udf

In [87]:
df_select.show(10,0)

+-----------+--------------+--------------+---------------------------+-------------+--------------+
|customer_id|customer_fname|customer_lname|customer_street            |customer_city|customer_state|
+-----------+--------------+--------------+---------------------------+-------------+--------------+
|1          |Richard       |Hernandez     |6303 Heather Plaza         |Brownsville  |TX            |
|2          |Mary          |Barrett       |9526 Noble Embers Ridge    |Littleton    |CO            |
|3          |Ann           |Smith         |3422 Blue Pioneer Bend     |Caguas       |PR            |
|4          |Mary          |Jones         |8324 Little Common         |San Marcos   |CA            |
|5          |Robert        |Hudson        |10 Crystal River Mall      |Caguas       |PR            |
|6          |Mary          |Smith         |3151 Sleepy Quail Promenade|Passaic      |NJ            |
|7          |Melissa       |Wilcox        |9453 High Concession       |Caguas       |PR    

In [88]:
def upperConvert(x):
    return x.upper()

In [89]:
upperConvert("hello")

'HELLO'

In [90]:
upper_UDF = udf(lambda y: upperConvert(y))

In [91]:
df_select.withColumn("Upper_Fname", upper_UDF("customer_fname")).show(10,0)

[Stage 48:>                                                         (0 + 1) / 1]

+-----------+--------------+--------------+---------------------------+-------------+--------------+-----------+
|customer_id|customer_fname|customer_lname|customer_street            |customer_city|customer_state|Upper_Fname|
+-----------+--------------+--------------+---------------------------+-------------+--------------+-----------+
|1          |Richard       |Hernandez     |6303 Heather Plaza         |Brownsville  |TX            |RICHARD    |
|2          |Mary          |Barrett       |9526 Noble Embers Ridge    |Littleton    |CO            |MARY       |
|3          |Ann           |Smith         |3422 Blue Pioneer Bend     |Caguas       |PR            |ANN        |
|4          |Mary          |Jones         |8324 Little Common         |San Marcos   |CA            |MARY       |
|5          |Robert        |Hudson        |10 Crystal River Mall      |Caguas       |PR            |ROBERT     |
|6          |Mary          |Smith         |3151 Sleepy Quail Promenade|Passaic      |NJ         

                                                                                