In [1]:
import findspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

findspark.init()
findspark.find()

'/usr/local/spark'

In [2]:
spark = (
    SparkSession.builder.appName("udfApp")
    .master("local[4]")
    .config("spark.dynamicAllocation.enabled", "false")
    .config("spark.sql.adaptive.enabled", "false")
    .getOrCreate()
)

spark

In [3]:
sc = SparkSession.sparkContext
sc

<property at 0x7f555119ad90>

In [4]:
cabsDF = (
    spark.read.option("header", "true")
    .option("inferSchema", "true")
    .csv("/home/jovyan/spark-workspace/data/csvs/Cabs.csv")
)

cabsDF.show(truncate=False)

+---------+--------------------+------------------------+----------------+------+-------------------+-----------------+--------------------+-----------+-----------+---------------+--------------------------+--------------------------------------------+---------------+
|CabNumber|VehicleLicenseNumber|Name                    |LicenseType     |Active|PermitLicenseNumber|VehicleVinNumber |WheelchairAccessible|VehicleYear|VehicleType|TelephoneNumber|Website                   |Address                                     |LastDateUpdated|
+---------+--------------------+------------------------+----------------+------+-------------------+-----------------+--------------------+-----------+-----------+---------------+--------------------------+--------------------------------------------+---------------+
|T802127C |C19641              |ABCON INC.              |OWNER MUST DRIVE|YES   |NULL               |5TDBK3EH0DS268018|NULL                |2016       |NULL       |(718)438-1100  |NULL         

In [5]:
# Get the columns from 0 to 3 (Python indexing: exclusive of 3)
# selected_columns = cabsDF.columns[0:4]
# print("Selected Columns:", selected_columns)

# # Select these columns from the DataFrame
# sliced_df = cabsDF.select(*selected_columns)
# sliced_df.show()

In [6]:
# Create a temp view
cabsDF.createOrReplaceTempView("Cabs")

### Create a function to convert case:

In [23]:
def convertCase(str):
    if not str:  # Handle empty or None values
        return ""
    result = ""
    nameWordArray = str.split(",")
    for word in nameWordArray:
        word = word.strip()
        result += word[0:1].upper() + word[1:].lower() + ", "
    return result[:-2]  # Remove the trailing ", "

### Register function as a User Defined Function (UDF)

This registration option is for using UDF in Python/Scala

In [24]:
# convertCaseUdf = udf(lambda str: convertCase(str), StringType())

spark.udf.register("convertCaseUdf", convertCase, StringType())

<function __main__.convertCase(str)>

#### Use UDF in DataFrame code

In [25]:
from pyspark.sql.functions import col

# Apply the UDF
cabsDF.select("Name", convertCaseUdf(col("Name")).alias("Name_ConvertedCase")).show(
    truncate=False
)

+------------------------+------------------------+
|Name                    |Name_ConvertedCase      |
+------------------------+------------------------+
|ABCON INC.              |Abcon inc.              |
|ACCEPTABLE TAXI LLC     |Acceptable taxi llc     |
|ALLIS CAB CORP          |Allis cab corp          |
|BENE CAB CORP           |Bene cab corp           |
|BOULOS TAXI CORP.       |Boulos taxi corp.       |
|CACERES,JAIME,P         |Caceres, Jaime, P       |
|CALCIUM ONE SERVICE INC.|Calcium one service inc.|
|CHARLES,WILBERT         |Charles, Wilbert        |
|CHAWKI,MICHAEL          |Chawki, Michael         |
|CHRYSOVALANTOU CORP,    |Chrysovalantou corp,    |
|COFI BOAT CORP.         |Cofi boat corp.         |
|DEKEL TAXI CAB CORP     |Dekel taxi cab corp     |
|FLORIAN & ROBERT INC    |Florian & robert inc    |
|GART CAB CORP           |Gart cab corp           |
|GAUTHIER,JACQUES        |Gauthier, Jacques       |
|GEORGAKOPOULOS, GEORGIA |Georgakopoulos,  georgia|
|GUJAR CAB C

In [26]:
# Use the registered UDF in the SQL query
spark.sql(
    """
    SELECT 
        Name,
        convertCaseUdf(Name) AS Name_ConvertedCase
    FROM Cabs
"""
).show(truncate=False)

+------------------------+------------------------+
|Name                    |Name_ConvertedCase      |
+------------------------+------------------------+
|ABCON INC.              |Abcon inc.              |
|ACCEPTABLE TAXI LLC     |Acceptable taxi llc     |
|ALLIS CAB CORP          |Allis cab corp          |
|BENE CAB CORP           |Bene cab corp           |
|BOULOS TAXI CORP.       |Boulos taxi corp.       |
|CACERES,JAIME,P         |Caceres, Jaime, P       |
|CALCIUM ONE SERVICE INC.|Calcium one service inc.|
|CHARLES,WILBERT         |Charles, Wilbert        |
|CHAWKI,MICHAEL          |Chawki, Michael         |
|CHRYSOVALANTOU CORP,    |Chrysovalantou corp,    |
|COFI BOAT CORP.         |Cofi boat corp.         |
|DEKEL TAXI CAB CORP     |Dekel taxi cab corp     |
|FLORIAN & ROBERT INC    |Florian & robert inc    |
|GART CAB CORP           |Gart cab corp           |
|GAUTHIER,JACQUES        |Gauthier, Jacques       |
|GEORGAKOPOULOS, GEORGIA |Georgakopoulos, Georgia |
|GUJAR CAB C

In [27]:
# Example DataFrame
data = [("john,doe",), ("alice,bob",), (None,)]
columns = ["Name"]
data_DF = spark.createDataFrame(data, columns)


# Define the Python function
def convert_case(name: str) -> str:
    if not name:
        return ""
    result = ""
    reversed_word = name.split(",")
    for word in reversed_word:
        result += word[::-1] + ", "
    return result[:-2]


# Register UDF
convert_case_udf = udf(convert_case, StringType())

# Apply UDF to DataFrame
data_DF.select("Name", convert_case_udf(col("Name")).alias("Name_ConvertedCase")).show(
    truncate=False
)

+---------+------------------+
|Name     |Name_ConvertedCase|
+---------+------------------+
|john,doe |nhoj, eod         |
|alice,bob|ecila, bob        |
|NULL     |                  |
+---------+------------------+

