In [37]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import StructType, StructField, DoubleType, StringType, IntegerType, DateType, ArrayType, FloatType

In [38]:
# Build (or reuse) a Spark session running locally on two threads.
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("UDF")
    .getOrCreate()
)

In [39]:
# One-time dataset download (uncomment to run if the file is not present yet):
# ! wget -O /home/train/datasets/Hotel_Reviews.csv.gz \
# https://github.com/erkansirin78/datasets/raw/master/Hotel_Reviews.csv.gz

# Read Data

In [40]:
# Load the hotel reviews CSV with a header row and inferred column types,
# then normalise two columns:
#   * Review_Date is parsed with the "M/d/y" pattern into a date.
#   * Tags is split on commas into an array of strings.
df = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("sep", ",")
    .option("inferSchema", True)
    .load("file:///home/train/datasets/Hotel_Reviews.csv.gz")
    .withColumn("Review_Date", F.to_date("Review_Date", "M/d/y"))
    .withColumn(
        "Tags",
        F.split(F.col("Tags"), ",").cast(ArrayType(StringType())),
    )
)

                                                                                

In [41]:
# Turn on eager evaluation so a DataFrame left as the last expression of a
# cell is rendered as a table (as in the cells below) without calling .show().
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [42]:
# Preview the first two rows of the loaded reviews.
df.limit(2)

Hotel_Address,Additional_Number_of_Scoring,Review_Date,Average_Score,Hotel_Name,Reviewer_Nationality,Negative_Review,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Positive_Review,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Tags,days_since_review,lat,lng
s Gravesandestra...,194,2017-08-03,7.7,Hotel Arena,Russia,I am so angry th...,397,1403,Only the park ou...,11,7,2.9,[[' Leisure trip ...,0 days,52.3605759,4.9159683
s Gravesandestra...,194,2017-08-03,7.7,Hotel Arena,Ireland,No Negative,0,1403,No real complain...,105,7,7.5,[[' Leisure trip ...,0 days,52.3605759,4.9159683


In [43]:
# Confirm the loaded object is a Spark DataFrame.
# `df2` is only created further down the notebook, so inspecting it here
# fails on a fresh kernel; `df` is the frame that exists at this point.
type(df)

pyspark.sql.dataframe.DataFrame

In [44]:
def upper_case(x):
    """Return ``x`` converted to upper case.

    Parameters
    ----------
    x : str or None
        The text to convert. Spark hands null column values to a Python UDF
        as ``None``, so ``None`` is accepted and passed through.

    Returns
    -------
    str or None
        ``x.upper()``, or ``None`` when ``x`` is ``None``.
    """
    # Propagate nulls instead of raising AttributeError on None.upper().
    if x is None:
        return None
    return x.upper()

In [45]:
# Sanity-check the plain Python function before wrapping it as a UDF.
upper_case('Ali Gel')

'ALI GEL'

In [46]:
# Wrap the Python function as a Spark UDF that returns a string column.
upper_case_udf = F.udf(upper_case, StringType())

In [47]:
# Replace Hotel_Name with its upper-cased value via the Python UDF and preview
# three rows. (Demo only: the built-in F.upper does the same without a UDF.)
df.withColumn("Hotel_Name", upper_case_udf(F.col("Hotel_Name"))).limit(3)

Traceback (most recent call last):                                  (0 + 1) / 1]
  File "/opt/manual/spark/python/lib/pyspark.zip/pyspark/daemon.py", line 186, in manager
  File "/opt/manual/spark/python/lib/pyspark.zip/pyspark/daemon.py", line 74, in worker
  File "/opt/manual/spark/python/lib/pyspark.zip/pyspark/worker.py", line 643, in main
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
  File "/opt/manual/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 564, in read_int
    raise EOFError
EOFError
Traceback (most recent call last):                                              
  File "/opt/manual/spark/python/lib/pyspark.zip/pyspark/daemon.py", line 186, in manager
  File "/opt/manual/spark/python/lib/pyspark.zip/pyspark/daemon.py", line 74, in worker
  File "/opt/manual/spark/python/lib/pyspark.zip/pyspark/worker.py", line 643, in main
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
  File "/opt/manual/spark/python/lib/pyspark.zip/pyspark/serializers.p

Hotel_Address,Additional_Number_of_Scoring,Review_Date,Average_Score,Hotel_Name,Reviewer_Nationality,Negative_Review,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Positive_Review,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Tags,days_since_review,lat,lng
s Gravesandestra...,194,2017-08-03,7.7,HOTEL ARENA,Russia,I am so angry th...,397,1403,Only the park ou...,11,7,2.9,[[' Leisure trip ...,0 days,52.3605759,4.9159683
s Gravesandestra...,194,2017-08-03,7.7,HOTEL ARENA,Ireland,No Negative,0,1403,No real complain...,105,7,7.5,[[' Leisure trip ...,0 days,52.3605759,4.9159683
s Gravesandestra...,194,2017-07-31,7.7,HOTEL ARENA,Australia,Rooms are nice b...,42,1403,Location was goo...,21,9,7.1,[[' Leisure trip ...,3 days,52.3605759,4.9159683


In [48]:
def hotel_and_country(hotel, country):
    """Combine a hotel name and a country into ``"<hotel> - <country>"``.

    Parameters
    ----------
    hotel : str or None
        Hotel name, used as-is.
    country : str or None
        Country name; surrounding whitespace is stripped before formatting.

    Returns
    -------
    str or None
        The combined label, or ``None`` when either input is ``None``.
        Spark passes null column values as ``None``; returning ``None`` keeps
        the result null instead of crashing on ``None.strip()`` or embedding
        the literal text "None" in the output.
    """
    if hotel is None or country is None:
        return None
    return "{} - {}".format(hotel, country.strip())

In [49]:
# Wrap the two-argument Python function as a Spark UDF returning a string column.
hotel_and_country_udf = F.udf(hotel_and_country, StringType())

In [50]:
# Apply the two-column UDF and print the combined labels untruncated.
(
    df
    .select(
        hotel_and_country_udf("Hotel_Name", "Reviewer_Nationality").alias("Hotel_Country")
    )
    .show(truncate=False)
)

+----------------------------+
|Hotel_Country               |
+----------------------------+
|Hotel Arena - Russia        |
|Hotel Arena - Ireland       |
|Hotel Arena - Australia     |
|Hotel Arena - United Kingdom|
|Hotel Arena - New Zealand   |
|Hotel Arena - Poland        |
|Hotel Arena - United Kingdom|
|Hotel Arena - United Kingdom|
|Hotel Arena - Belgium       |
|Hotel Arena - Norway        |
|Hotel Arena - United Kingdom|
|Hotel Arena - France        |
|Hotel Arena - United Kingdom|
|Hotel Arena - Italy         |
|Hotel Arena - Canada        |
|Hotel Arena - Italy         |
|Hotel Arena - United Kingdom|
|Hotel Arena - Ireland       |
|Hotel Arena - Netherlands   |
|Hotel Arena - Australia     |
+----------------------------+
only showing top 20 rows



Traceback (most recent call last):
  File "/opt/manual/spark/python/lib/pyspark.zip/pyspark/daemon.py", line 186, in manager
  File "/opt/manual/spark/python/lib/pyspark.zip/pyspark/daemon.py", line 74, in worker
  File "/opt/manual/spark/python/lib/pyspark.zip/pyspark/worker.py", line 643, in main
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
  File "/opt/manual/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 564, in read_int
    raise EOFError
EOFError


In [51]:
import pandas as pd

In [52]:
def upper_case(col: pd.Series) -> pd.Series:
    """Upper-case every string in a pandas Series (vectorised variant).

    Note: this rebinds the notebook-level name ``upper_case`` that was used
    earlier for the scalar version. ``upper_case_udf`` was already built from
    the scalar function, so it is not affected by this redefinition.

    Parameters
    ----------
    col : pd.Series
        A Series of strings; missing values are allowed.

    Returns
    -------
    pd.Series
        A Series of the same length with each string upper-cased and missing
        values left missing.
    """
    # The vectorised string accessor avoids a per-element Python lambda and
    # skips nulls instead of raising AttributeError on them.
    return col.str.upper()

In [53]:
# Register the Series-to-Series function as a pandas UDF returning strings.
upper_case_pdudf = F.pandas_udf(upper_case, returnType=StringType())

In [54]:
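# Prerequisite: pandas UDFs need the pyarrow package in the kernel environment.
# If it is missing, install it once with: %pip install pyarrow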

In [55]:
# Show the original and pandas-UDF upper-cased hotel names side by side (5 rows).
df.select("Hotel_Name",upper_case_pdudf("Hotel_Name").alias("Hotel_Name_Upper")).show(5)

[Stage 13:>                                                         (0 + 1) / 1]

+-----------+----------------+
| Hotel_Name|Hotel_Name_Upper|
+-----------+----------------+
|Hotel Arena|     HOTEL ARENA|
|Hotel Arena|     HOTEL ARENA|
|Hotel Arena|     HOTEL ARENA|
|Hotel Arena|     HOTEL ARENA|
|Hotel Arena|     HOTEL ARENA|
+-----------+----------------+
only showing top 5 rows



                                                                                

In [56]:
# Rename the long negative-word-count column to TN_Counts; the result is bound
# to a new name, df2.
df2 = df.withColumnRenamed("Review_Total_Negative_Word_Counts","TN_Counts")

In [60]:
# Preview three rows to confirm the column now appears as TN_Counts.
df2.limit(3)

                                                                                

Hotel_Address,Additional_Number_of_Scoring,Review_Date,Average_Score,Hotel_Name,Reviewer_Nationality,Negative_Review,TN_Counts,Total_Number_of_Reviews,Positive_Review,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Tags,days_since_review,lat,lng
s Gravesandestra...,194,2017-08-03,7.7,Hotel Arena,Russia,I am so angry th...,397,1403,Only the park ou...,11,7,2.9,[[' Leisure trip ...,0 days,52.3605759,4.9159683
s Gravesandestra...,194,2017-08-03,7.7,Hotel Arena,Ireland,No Negative,0,1403,No real complain...,105,7,7.5,[[' Leisure trip ...,0 days,52.3605759,4.9159683
s Gravesandestra...,194,2017-07-31,7.7,Hotel Arena,Australia,Rooms are nice b...,42,1403,Location was goo...,21,9,7.1,[[' Leisure trip ...,3 days,52.3605759,4.9159683


In [58]:
# Drop the TN_Counts and Tags columns; the result is bound to df3.
df3 = df2.drop("TN_Counts", "Tags")

In [59]:
# Preview three rows to confirm both columns are gone.
df3.limit(3)

Hotel_Address,Additional_Number_of_Scoring,Review_Date,Average_Score,Hotel_Name,Reviewer_Nationality,Negative_Review,Total_Number_of_Reviews,Positive_Review,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,days_since_review,lat,lng
s Gravesandestra...,194,2017-08-03,7.7,Hotel Arena,Russia,I am so angry th...,1403,Only the park ou...,11,7,2.9,0 days,52.3605759,4.9159683
s Gravesandestra...,194,2017-08-03,7.7,Hotel Arena,Ireland,No Negative,1403,No real complain...,105,7,7.5,0 days,52.3605759,4.9159683
s Gravesandestra...,194,2017-07-31,7.7,Hotel Arena,Australia,Rooms are nice b...,1403,Location was goo...,21,9,7.1,3 days,52.3605759,4.9159683
