In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Obtain the Spark entry point for this notebook; getOrCreate() reuses an
# already-running session when there is one, otherwise it starts one named "UDF".
spark = (
    SparkSession.builder
    .appName("UDF")
    .getOrCreate()
)

In [3]:
# Sample rows as (user_id, score) tuples.
data = [("U1", 85), ("U2", 72), ("U3", 40)]

# Column names are supplied explicitly; column types are inferred from the tuples.
df = spark.createDataFrame(data, schema=["user_id", "score"])
df.show()

+-------+-----+
|user_id|score|
+-------+-----+
|     U1|   85|
|     U2|   72|
|     U3|   40|
+-------+-----+



## UDF Way

In [6]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def classify_score(score):
    """Map a numeric score to a performance label.

    Args:
        score: Numeric score, or None when the source value is null.

    Returns:
        "High" for a score of 80 or more, "Medium" for a score from 50 up to
        (but not including) 80, "Low" for anything smaller, and None when
        ``score`` is None.
    """
    # Spark passes SQL nulls to Python UDFs as None. Comparing None with an
    # int raises TypeError in Python 3 and would fail the whole job, so a
    # missing score is propagated as a missing label instead.
    if score is None:
        return None
    if score >= 80:
        return "High"
    if score >= 50:
        return "Medium"
    return "Low"

# Wrap the plain Python function as a Spark UDF whose result column is a string.
performance_udf = udf(classify_score, StringType())

# withColumn() returns a new DataFrame rather than changing df in place, and
# show() returns None, so `df = df.withColumn(...).show()` would not work:
# assign the result first, then display it.
df = df.withColumn("performance", performance_udf(col("score")))
df.show()

+-------+-----+-----------+
|user_id|score|performance|
+-------+-----+-----------+
|     U1|   85|       High|
|     U2|   72|     Medium|
|     U3|   40|        Low|
+-------+-----+-----------+



In [9]:
# Sort by label first, then by score from highest to lowest within a label.
# The label is a string, so it sorts alphabetically (High, Low, Medium),
# not by performance rank.
df.sort("performance", col("score").desc()).show()

+-------+-----+-----------+
|user_id|score|performance|
+-------+-----+-----------+
|     U1|   85|       High|
|     U3|   40|        Low|
|     U2|   72|     Medium|
+-------+-----+-----------+



## Native Way

In [5]:
from pyspark.sql.functions import when,col

# Same classification as the UDF, expressed with Spark's built-in conditional
# column expression. Branches are checked top to bottom and the first match wins.
df.withColumn(
    "performance",
    when(df["score"] >= 80, "High")
    .when(df["score"] >= 50, "Medium")
    .otherwise("Low"),
).show()

+-------+-----+-----------+
|user_id|score|performance|
+-------+-----+-----------+
|     U1|   85|       High|
|     U2|   72|     Medium|
|     U3|   40|        Low|
+-------+-----+-----------+



# Set Functions

In [10]:
# First input for the set operations: (user_id, course) rows.
data_a = [("U1", "Python"), ("U2", "Java"), ("U3", "Spark")]
df_a = spark.createDataFrame(data_a, schema=["user_id", "course"])

In [12]:
# Second input for the set operations; it shares the U2 and U3 rows with data_a.
data_b = [("U2", "Java"), ("U3", "Spark"), ("U4", "Python")]
df_b = spark.createDataFrame(data_b, schema=["user_id", "course"])


In [13]:
# union() appends df_b's rows to df_a's and keeps duplicates: the U2 and U3
# rows, present in both inputs, appear twice in the result.
(
    df_a
    .union(df_b)
    .show()
)

+-------+------+
|user_id|course|
+-------+------+
|     U1|Python|
|     U2|  Java|
|     U3| Spark|
|     U2|  Java|
|     U3| Spark|
|     U4|Python|
+-------+------+



In [14]:
# intersect() keeps only the rows found in both DataFrames (here U2 and U3).
# The displayed order need not follow the input order.
(
    df_a
    .intersect(df_b)
    .show()
)

+-------+------+
|user_id|course|
+-------+------+
|     U3| Spark|
|     U2|  Java|
+-------+------+

