# PySpark UDF (User Defined Function) - 
        by Aishwarya Raut

# 1. Create PySpark UDF

## 1.1 Create a DF

In [1]:
from pyspark.sql import SparkSession
# Reuse the active SparkSession if there is one; otherwise start one named "SP".
spark = (
    SparkSession.builder
    .appName("SP")
    .getOrCreate()
)

In [5]:
# Sample rows: a sequence number (kept as a string) and a lower-case full name.
columns = ["seq_no", "name"]
data = [
    ("1", "kim namjoon"),
    ("2", "kim seokjin"),
    ("3", "min yoongi"),
]

# Only column names are supplied, so Spark infers each column's type from the values.
df = spark.createDataFrame(data, schema=columns)
df.printSchema()

root
 |-- seq_no: string (nullable = true)
 |-- name: string (nullable = true)



## 1.2 Create a Python Function


In [10]:
# Create a function that takes a string parameter
# and converts the first letter of every word to
# a capital letter.

def convertCase(str):
    """Capitalize the first letter of every space-separated word.

    Args:
        str: Input text, or None (for example a null cell when this function
            is run as a Spark UDF). The parameter name shadows the builtin
            but is kept so existing callers keep working.

    Returns:
        The text with the first character of each word upper-cased and the
        rest of each word unchanged, or None when the input is None.
    """
    if str is None:
        return None
    # Splitting and re-joining on a single space preserves the original
    # spacing and avoids leaving a stray trailing space on the result.
    return " ".join(word[:1].upper() + word[1:] for word in str.split(" "))


## 1.3 Convert a Python function to a PySpark UDF


In [18]:
from pyspark.sql.functions import col,udf 
from pyspark.sql.types import StringType

# Converting function to UDF
# Wrap convertCase so it can be applied to DataFrame columns; the UDF yields strings.
convertUDF = udf(
    lambda value: convertCase(value),
    returnType=StringType(),
)

In [19]:
# 2. Using UDF with DataFrame
## 2.1 Using UDF with PySpark DF select()

# Keep seq_no as-is, apply the UDF to the name column, and expose the result as "Name".
(
    df
    .select(col("seq_no"), convertUDF(col("name")).alias("Name"))
    .printSchema()
)

root
 |-- seq_no: string (nullable = true)
 |-- Name: string (nullable = true)



## 2.2 Using UDF with PySpark DataFrame withColumn()

In [20]:
def uppercase(str):
    """Return the text converted to upper case.

    Args:
        str: Input text, or None (for example a null cell when this function
            is run as a Spark UDF). The parameter name shadows the builtin
            but is kept so existing callers keep working.

    Returns:
        The upper-cased text, or None when the input is None.
    """
    # Pass None through instead of failing on None.upper().
    return None if str is None else str.upper()

In [21]:
# Wrap uppercase as a string-returning UDF.
uppercase_udf = udf(lambda x: uppercase(x), StringType())

# Add an upper-cased copy of the name column. The column is referenced with its
# exact schema spelling ("name") so the lookup does not depend on Spark's
# case-insensitive column resolution.
df.withColumn("new_name", uppercase_udf(col("name"))).printSchema()

root
 |-- seq_no: string (nullable = true)
 |-- name: string (nullable = true)
 |-- new_name: string (nullable = true)



## 2.3 Registering a PySpark UDF and using it in SQL

In [None]:
""" Using UDF on SQL """
# Register the plain Python function under a SQL-callable name, expose the
# DataFrame as a temporary view, then call the UDF from a SQL query.
spark.udf.register("convertUDF", convertCase, StringType())
df.createOrReplaceTempView("NAME_TABLE")
# The view's columns are seq_no and name; the result column is aliased as Name.
spark.sql("select seq_no, convertUDF(name) as Name from NAME_TABLE") \
     .show(truncate=False)

# 3. Creating a UDF using the @udf decorator (annotation)


In [None]:
@udf(returnType=StringType())
def upperCase(str):
    """Upper-case a string value; defined directly as a Spark UDF via the decorator.

    Args:
        str: Input text, or None for a null cell. The parameter name shadows
            the builtin but is kept unchanged.

    Returns:
        The upper-cased text, or None when the input is None.
    """
    # Pass None through instead of failing on None.upper().
    return None if str is None else str.upper()

# Add the upper-cased name as a new column and print the rows without truncation.
# The source column is referenced with its exact schema spelling ("name") so the
# lookup does not depend on Spark's case-insensitive column resolution.
df.withColumn("Cureated Name", upperCase(col("name"))) \
.show(truncate=False)