In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType, StringType

spark = SparkSession.builder \
    .appName("UDF Example") \
    .getOrCreate()

data = [(1, "Alice"), (2, "Bob"), (3, "Cathy"), (4, "David")]
df = spark.createDataFrame(data, ["id", "name"])

# Define the plain Python function that will back the UDF
def add_prefix(name):
    """Return ``name`` with the literal prefix ``"Name_"`` prepended.

    Args:
        name: The string to prefix, or ``None``.

    Returns:
        ``"Name_" + name``, or ``None`` when ``name`` is ``None``.
    """
    # Spark hands SQL NULL to a Python UDF as None; concatenating it with a
    # str would raise TypeError and fail the whole job, so propagate NULL.
    if name is None:
        return None
    return "Name_" + name

# Wrap the Python function as a Spark UDF whose return type is string
add_prefix_udf = udf(add_prefix, returnType=StringType())

# Add a "prefixed_name" column by applying the UDF to the "name" column
df_with_prefix = df.withColumn(
    "prefixed_name",
    add_prefix_udf(col("name")),
)

df_with_prefix.show()

spark.stop()

+---+-----+-------------+
| id| name|prefixed_name|
+---+-----+-------------+
|  1|Alice|   Name_Alice|
|  2|  Bob|     Name_Bob|
|  3|Cathy|   Name_Cathy|
|  4|David|   Name_David|
+---+-----+-------------+



In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType
spark = SparkSession.builder \
    .appName("Register UDF Example") \
    .getOrCreate()


data = [(1, "Alice"), (2, "Bob"), (3, "Cathy"), (4, "David")]
df = spark.createDataFrame(data, ["id", "name"])


def name_length(name):
    """Return the number of characters in ``name``.

    Args:
        name: The string to measure, or ``None``.

    Returns:
        ``len(name)``, or ``None`` when ``name`` is ``None``.
    """
    # Spark hands SQL NULL to a Python UDF as None; len(None) would raise
    # TypeError and fail the whole query, so propagate NULL instead.
    if name is None:
        return None
    return len(name)

# Register the function with spark.udf.register so SQL can call it by name
spark.udf.register("name_length_udf", name_length, returnType=IntegerType())

# Create a temporary view so the DataFrame can be queried with SQL
df.createOrReplaceTempView("people")

# Use the registered UDF inside a SQL query
result_df = spark.sql(
    "SELECT id, name, name_length_udf(name) as name_length FROM people"
)

result_df.show()

spark.stop()

+---+-----+-----------+
| id| name|name_length|
+---+-----+-----------+
|  1|Alice|          5|
|  2|  Bob|          3|
|  3|Cathy|          5|
|  4|David|          5|
+---+-----+-----------+

