In [1]:
import findspark

In [2]:
# Point findspark at this machine's Spark installation (the SPARK_HOME path).
findspark.init(spark_home="/opt/manual/spark")

In [3]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import *

In [4]:
# Create (or reuse) the Spark session for this notebook, running locally
# with master "local[2]".
spark = (
    SparkSession.builder
    .appName("User Defined Functions")
    .master("local[2]")
    .getOrCreate()
)

2022-08-27 19:33:51,715 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [5]:
# Read the gzip-compressed hotel reviews CSV from the local filesystem,
# treating the first row as the header and letting Spark infer column types.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .option("compression", "gzip")
    .csv("file:///home/train/datasets/Hotel_Reviews.csv.gz")
)

                                                                                

In [6]:
# Derive df2 from df:
#   * Tags        -> split on "," and cast to an array of strings
#   * Review_Date -> parsed into a date using the "M/d/yyyy" pattern
df2 = (
    df
    .withColumn("Tags", F.split("Tags", ",").cast(ArrayType(StringType())))
    .withColumn("Review_Date", F.to_date("Review_Date", "M/d/yyyy"))
)

In [7]:
# Preview: take only two rows and convert them to a pandas DataFrame for display.
df2.limit(2).toPandas()

Unnamed: 0,Hotel_Address,Additional_Number_of_Scoring,Review_Date,Average_Score,Hotel_Name,Reviewer_Nationality,Negative_Review,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Positive_Review,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Tags,days_since_review,lat,lng
0,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-08-03,7.7,Hotel Arena,Russia,I am so angry that i made this post available...,397,1403,Only the park outside of the hotel was beauti...,11,7,2.9,"[[' Leisure trip ', ' Couple ', ' Duplex Dou...",0 days,52.3605759,4.9159683
1,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,2017-08-03,7.7,Hotel Arena,Ireland,No Negative,0,1403,No real complaints the hotel was great great ...,105,7,7.5,"[[' Leisure trip ', ' Couple ', ' Duplex Dou...",0 days,52.3605759,4.9159683


# Define Function

In [8]:
def upper_case(x):
    """Return ``x`` converted to upper case.

    Args:
        x: A string, or ``None``. When this function is wrapped as a Spark
            UDF, SQL NULL values arrive as ``None``.

    Returns:
        The upper-cased string, or ``None`` when ``x`` is ``None`` so that a
        null input yields a null output instead of raising
        ``AttributeError`` in the Python worker.
    """
    if x is None:
        return None
    return x.upper()

In [9]:
# Quick check of the plain Python function on a sample string.
upper_case("ali gel")

'ALI GEL'

# Register UDF

In [10]:
from pyspark.sql.types import StringType

In [11]:
# Wrap the Python function as a Spark UDF that returns a string column.
upper_case_udf = F.udf(f=upper_case, returnType=StringType())

In [12]:
# Apply the UDF to Reviewer_Nationality and show the original next to the
# upper-cased value for the first four rows.
(
    df
    .withColumn("Nationality_Upper", upper_case_udf("Reviewer_Nationality"))
    .select("Reviewer_Nationality", "Nationality_Upper")
    .show(4)
)

[Stage 3:>                                                          (0 + 1) / 1]

+--------------------+-----------------+
|Reviewer_Nationality|Nationality_Upper|
+--------------------+-----------------+
|             Russia |          RUSSIA |
|            Ireland |         IRELAND |
|          Australia |       AUSTRALIA |
|     United Kingdom |  UNITED KINGDOM |
+--------------------+-----------------+
only showing top 4 rows



Traceback (most recent call last):
  File "/opt/manual/spark/python/lib/pyspark.zip/pyspark/daemon.py", line 186, in manager
  File "/opt/manual/spark/python/lib/pyspark.zip/pyspark/daemon.py", line 74, in worker
  File "/opt/manual/spark/python/lib/pyspark.zip/pyspark/worker.py", line 643, in main
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
  File "/opt/manual/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 564, in read_int
    raise EOFError
EOFError
                                                                                

# UDF on multiple columns

In [13]:
def hotel_and_country(hotel, country):
    """Join a hotel name and a country into one ``"<hotel> - <country>"`` label.

    Both values are formatted as-is (no trimming), separated by `` - ``.
    """
    return f"{hotel} - {country}"

In [14]:
# Register the two-argument function under the name "hotel_and_country_udf"
# and keep the returned UDF handle for use in DataFrame expressions.
hotel_and_country_udf = spark.udf.register(
    name="hotel_and_country_udf",
    f=hotel_and_country,
    returnType=StringType(),
)

In [15]:
# Build the combined hotel/country label with the UDF and print it untruncated.
(
    df
    .select(
        hotel_and_country_udf("Hotel_Name", "Reviewer_Nationality")
        .alias("Hotel_Country")
    )
    .show(truncate=False)
)

+------------------------------+
|Hotel_Country                 |
+------------------------------+
|Hotel Arena -  Russia         |
|Hotel Arena -  Ireland        |
|Hotel Arena -  Australia      |
|Hotel Arena -  United Kingdom |
|Hotel Arena -  New Zealand    |
|Hotel Arena -  Poland         |
|Hotel Arena -  United Kingdom |
|Hotel Arena -  United Kingdom |
|Hotel Arena -  Belgium        |
|Hotel Arena -  Norway         |
|Hotel Arena -  United Kingdom |
|Hotel Arena -  France         |
|Hotel Arena -  United Kingdom |
|Hotel Arena -  Italy          |
|Hotel Arena -  Canada         |
|Hotel Arena -  Italy          |
|Hotel Arena -  United Kingdom |
|Hotel Arena -  Ireland        |
|Hotel Arena -  Netherlands    |
|Hotel Arena -  Australia      |
+------------------------------+
only showing top 20 rows



Traceback (most recent call last):
  File "/opt/manual/spark/python/lib/pyspark.zip/pyspark/daemon.py", line 186, in manager
  File "/opt/manual/spark/python/lib/pyspark.zip/pyspark/daemon.py", line 74, in worker
  File "/opt/manual/spark/python/lib/pyspark.zip/pyspark/worker.py", line 643, in main
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
  File "/opt/manual/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 564, in read_int
    raise EOFError
EOFError


In [16]:
# Replace the two source columns with a single combined hotel_country column.
df1 = (
    df
    .withColumn(
        "hotel_country",
        hotel_and_country_udf("Hotel_Name", "Reviewer_Nationality"),
    )
    .drop("Hotel_Name", "Reviewer_Nationality")
)

In [17]:
# Print df1's columns and their types to confirm the new hotel_country column.
df1.printSchema()

root
 |-- Hotel_Address: string (nullable = true)
 |-- Additional_Number_of_Scoring: integer (nullable = true)
 |-- Review_Date: string (nullable = true)
 |-- Average_Score: double (nullable = true)
 |-- Negative_Review: string (nullable = true)
 |-- Review_Total_Negative_Word_Counts: integer (nullable = true)
 |-- Total_Number_of_Reviews: integer (nullable = true)
 |-- Positive_Review: string (nullable = true)
 |-- Review_Total_Positive_Word_Counts: integer (nullable = true)
 |-- Total_Number_of_Reviews_Reviewer_Has_Given: integer (nullable = true)
 |-- Reviewer_Score: double (nullable = true)
 |-- Tags: string (nullable = true)
 |-- days_since_review: string (nullable = true)
 |-- lat: string (nullable = true)
 |-- lng: string (nullable = true)
 |-- hotel_country: string (nullable = true)



# Pandas UDFs

    One of the previous prevailing issues with using PySpark UDFs was that they had
    slower performance than Scala UDFs. This was because the PySpark UDFs required
    data movement between the JVM and Python, which was quite expensive. To resolve
    this problem, Pandas UDFs (also known as vectorized UDFs) were introduced as part
    of Apache Spark 2.3. A Pandas UDF uses Apache Arrow to transfer data and Pandas
    to work with the data. You define a Pandas UDF using the keyword pandas_udf as
    the decorator, or to wrap the function itself. Once the data is in Apache Arrow format,
    there is no longer the need to serialize/pickle the data as it is already in a format
    consumable by the Python process. Instead of operating on individual inputs row by
    row, you are operating on a Pandas Series or DataFrame (i.e., vectorized execution).

In [18]:
import pandas as pd

In [19]:
def upper_case(col: pd.Series) -> pd.Series:
    """Upper-case every string in a pandas Series in one vectorized call.

    This is the Series-to-Series function wrapped by the pandas UDF; it
    reuses the name of the earlier row-wise ``upper_case`` defined in this
    notebook and replaces it from this cell onward.

    Args:
        col: A Series of strings; missing values are allowed.

    Returns:
        A Series of the same length with each string upper-cased. Missing
        values stay missing instead of raising ``AttributeError``, which is
        what calling ``.upper()`` on each element would do.
    """
    # Use the vectorized string accessor instead of a Python lambda applied
    # element by element.
    return col.str.upper()

In [20]:
# Format
# F.pandas_udf(function, returnType)

In [23]:
# Wrap the Series-to-Series function as a pandas UDF returning strings.
upper_case_pdudf = F.pandas_udf(upper_case, StringType())

In [22]:
# pandas UDFs need pyarrow; if it is missing, install it once into the kernel's environment:
# %pip install pyarrow

Collecting pyarrow
  Downloading pyarrow-9.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (35.3 MB)
     |████████████████████████████████| 35.3 MB 460 kB/s             |███████████████████████████████▉| 35.1 MB 2.8 MB/s eta 0:00:01
Installing collected packages: pyarrow
Successfully installed pyarrow-9.0.0
You should consider upgrading via the '/home/train/venvspark/bin/python -m pip install --upgrade pip' command.[0m


In [24]:
# Show the hotel name beside its upper-cased version from the pandas UDF.
(
    df
    .select(
        "Hotel_Name",
        upper_case_pdudf("Hotel_Name").alias("Hotel_Name_Upper"),
    )
    .show(5)
)

[Stage 5:>                                                          (0 + 1) / 1]

+-----------+----------------+
| Hotel_Name|Hotel_Name_Upper|
+-----------+----------------+
|Hotel Arena|     HOTEL ARENA|
|Hotel Arena|     HOTEL ARENA|
|Hotel Arena|     HOTEL ARENA|
|Hotel Arena|     HOTEL ARENA|
|Hotel Arena|     HOTEL ARENA|
+-----------+----------------+
only showing top 5 rows



                                                                                

# Pandas UDF on multiple columns

In [25]:
def pd_hotel_and_country(hotel:pd.Series, country:pd.Series) -> pd.Series:
    """Concatenate two string Series element-wise as ``"<hotel> - <country>"``.

    Args:
        hotel: Series holding the hotel names.
        country: Series holding the countries.

    Returns:
        A Series in which each element is the hotel name, the separator
        `` - ``, and the country joined together.
    """
    hotel_with_separator = hotel.add(' - ')
    return hotel_with_separator.add(country)

In [26]:
# Wrap the two-column Series function as a pandas UDF returning strings.
pd_hotel_and_country_udf = F.pandas_udf(
    pd_hotel_and_country,
    returnType=StringType(),
)

In [27]:
# Apply the two-column pandas UDF and print the first four labels untruncated.
(
    df2
    .select(
        pd_hotel_and_country_udf("Hotel_Name", "Reviewer_Nationality")
        .alias("HC")
    )
    .show(n=4, truncate=False)
)

[Stage 6:>                                                          (0 + 1) / 1]

+------------------------------+
|HC                            |
+------------------------------+
|Hotel Arena -  Russia         |
|Hotel Arena -  Ireland        |
|Hotel Arena -  Australia      |
|Hotel Arena -  United Kingdom |
+------------------------------+
only showing top 4 rows



                                                                                

In [28]:
# Stop the Spark session at the end of the notebook.
spark.stop()