## User Defined Functions

In [52]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

In [53]:
# Get the existing session or create one, labelled with this application name.
spark = (
    SparkSession.builder
    .appName("Python UDF Example")
    .getOrCreate()
)

In [54]:
# Bare expression as the cell's last statement so the notebook shows the session object.
spark

In [55]:
# Load world_happiness_data/2021.csv; the first row supplies the column names
# and Spark infers each column's type from the data.
df = spark.read.csv(
    "world_happiness_data/2021.csv",
    header=True,
    inferSchema=True,
)

In [56]:
# Print the column names, inferred types and nullability of the loaded DataFrame.
df.printSchema()

root
 |-- Country name: string (nullable = true)
 |-- Happiness Rank: integer (nullable = true)
 |-- Happiness score: double (nullable = true)
 |-- Upperwhisker: double (nullable = true)
 |-- Lowerwhisker: double (nullable = true)
 |-- Economy (GDP per Capita)\t: double (nullable = true)
 |-- Social support: double (nullable = true)
 |-- Healthy life expectancy: double (nullable = true)
 |-- Freedom to make life choices: double (nullable = true)
 |-- Generosity: double (nullable = true)
 |-- Perceptions of corruption: double (nullable = true)



In [57]:
def get_happiness_score(happiness_score):
    """Map a numeric happiness score to a descriptive label.

    Args:
        happiness_score: The score to classify, or None when it is missing.

    Returns:
        "Unknown" when the score is None or NaN; otherwise "Very Happy"
        (score >= 7), "Happy" (>= 5), "Neutral" (>= 3) or "Unhappy"
        (anything lower).
    """
    # NaN is the only value that is not equal to itself. Without this check a
    # NaN score fails every >= comparison below and would be labelled "Unhappy".
    if happiness_score is None or happiness_score != happiness_score:
        return "Unknown"
    if happiness_score >= 7:
        return "Very Happy"
    elif happiness_score >= 5:
        return "Happy"
    elif happiness_score >= 3:
        return "Neutral"
    else:
        return "Unhappy"

In [58]:
# Wrap the plain Python classifier as a Spark UDF that returns a string column.
get_happiness_score_udf = udf(
    f=get_happiness_score,
    returnType=StringType(),
)

In [59]:
# Register the UDF under a SQL-callable name; left as the cell's last
# expression so the notebook displays the returned function object.
spark.udf.register(
    name="get_happiness_score_udf",
    f=get_happiness_score_udf,
)

<pyspark.sql.udf.UserDefinedFunction at 0x1b0306d3750>

In [60]:
# SQL text that selects each country and its score from the temp view "df",
# plus a Happiness_Description column computed by get_happiness_score_udf.
# NOTE(review): the printed schema spells the column `Happiness score`, while
# this query uses `Happiness Score`; this presumably relies on case-insensitive
# column resolution — confirm spark.sql.caseSensitive is not enabled.
query = """
SELECT `Country name`, `Happiness Score`, get_happiness_score_udf(`Happiness Score`) AS Happiness_Description
FROM df"""

In [61]:
# Expose the DataFrame to Spark SQL under the view name "df".
# createOrReplaceTempView returns None, so it is called only for its side
# effect; assigning the result back would rebind `df` to None and make this
# cell fail when re-run.
df.createOrReplaceTempView("df")

In [62]:
# Run the SQL text held in `query` and keep the resulting DataFrame.
results_df = spark.sql(sqlQuery=query)