In [14]:
def split_full_name(df, full_name_column):
    """
    Derive ``title``, ``first_name`` and ``last_name`` columns from a full-name column.

    Each value is split on whitespace and parsed as follows:

    * leading words that match a known title (ignoring case and periods) are
      joined into ``title``;
    * the final word, together with any surname particles directly in front
      of it (e.g. "von der", "de la"), becomes ``last_name``;
    * the words in between become ``first_name``.

    Null or empty values produce three empty strings.

    Parameters
    ----------
    df : DataFrame
        Input DataFrame containing the full-name column.
    full_name_column : str
        Name of the column holding the full names.

    Returns
    -------
    DataFrame
        ``df`` with the three derived columns added. A helper column called
        ``name_struct`` is created during processing and dropped again.
    """
    # Academic / professional titles recognised at the start of a name.
    known_titles = (
        "Dr.",
        "Prof.",
        "Dipl.-Ing.",
        "Dipl.-Kfm.",
        "Dipl.",
        "Mag.",
        "Ing.",
        "B.Sc.",
        "M.Sc.",
        "Ph.D.",
        "Univ.-Prof.",
        "Priv.-Doz.",
        "Univ.-Doz.",
        "Dr.-Ing.",
    )
    # Titles are compared without periods and in lowercase, so "Dr", "dr."
    # and "DR." are all treated alike.
    title_keys = {entry.replace(".", "").lower() for entry in known_titles}

    # Surname particles (German and other European languages), one word
    # sequence per entry.
    surname_particles = (
        ("von", "der"),
        ("van", "der"),
        ("von", "dem"),
        ("van", "den"),
        ("de", "la"),
        ("de", "le"),
        ("de", "los"),
        ("de", "las"),
        ("von",),
        ("van",),
        ("zu",),
        ("zum",),
        ("zur",),
        ("vom",),
        ("de",),
        ("del",),
        ("da",),
        ("di",),
        ("der",),
        ("den",),
        ("du",),
        ("la",),
        ("le",),
    )
    # Lowercase every word and try the longest sequences first, so that
    # multi-word particles are tested before their single-word parts.
    particle_sequences = sorted(
        ([part.lower() for part in entry] for entry in surname_particles),
        key=len,
        reverse=True,
    )

    def split_name(full_name):
        """Return ``(title, first_name, last_name)`` for one full-name value."""
        if not full_name:
            return ("", "", "")
        tokens = full_name.strip().split()

        # Count how many leading tokens are titles.
        title_end = 0
        for token in tokens:
            if token.replace(".", "").lower() not in title_keys:
                break
            title_end += 1
        title = " ".join(tokens[:title_end])

        remainder = tokens[title_end:]
        if not remainder:
            return (title, "", "")

        # ``surname_start`` is the index in ``remainder`` where the last name
        # begins. It starts at the final token and moves left for every
        # particle sequence found immediately before the current surname.
        surname_start = len(remainder) - 1
        while surname_start > 0:
            for particle in particle_sequences:
                width = len(particle)
                if width > surname_start:
                    # Not enough tokens left of the surname for this particle.
                    continue
                window = remainder[surname_start - width : surname_start]
                if [piece.lower() for piece in window] == particle:
                    surname_start -= width
                    break
            else:
                # No particle precedes the surname; stop extending it.
                break

        first_name = " ".join(remainder[:surname_start])
        last_name = " ".join(remainder[surname_start:])
        return (title, first_name, last_name)

    # The UDF returns a struct with one nullable string field per name part.
    name_schema = StructType(
        [
            StructField(field_name, StringType(), True)
            for field_name in ("title", "first_name", "last_name")
        ]
    )
    split_name_udf = udf(split_name, name_schema)

    # Parse into a helper struct column, fan the fields out into their own
    # columns, then remove the helper.
    return (
        df.withColumn("name_struct", split_name_udf(col(full_name_column)))
        .withColumn("title", col("name_struct.title"))
        .withColumn("first_name", col("name_struct.first_name"))
        .withColumn("last_name", col("name_struct.last_name"))
        .drop("name_struct")
    )

In [15]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_extract, regexp_replace, trim, udf
from pyspark.sql.types import StringType, StructField, StructType
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Example names covering single titles, stacked titles, hyphenated titles
# and one- and two-word surname particles. Each name becomes a one-field row.
data = [
    (name,)
    for name in (
        "Dr. Max Mustermann",
        "Prof. Dr. Anna Müller",
        "Hans von Schmidt",
        "Dr.-Ing. Karl Zumbrink",
        "Maria de la Cruz",
        "Dipl.-Ing. Peter van den Berg",
        "Laura van der Vaart",
        "Franz von der Lippe",
        "Luisa del Toro",
        "Juan de los Santos",
        "Sophie de las Flores",
    )
]

# Obtain (or reuse) the Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("SplitFullName")
    .getOrCreate()
)

# Single-column DataFrame holding the raw full names.
df = spark.createDataFrame(data, schema=["full_name"])

# Split every full name into title / first name / last name.
df_split = split_full_name(df, full_name_column="full_name")

# Print only the derived columns, without truncating long values.
df_split.select(
    "title",
    "first_name",
    "last_name",
).show(truncate=False)

+----------+----------+-------------+
|title     |first_name|last_name    |
+----------+----------+-------------+
|Dr.       |Max       |Mustermann   |
|Prof. Dr. |Anna      |Müller       |
|          |Hans      |von Schmidt  |
|Dr.-Ing.  |Karl      |Zumbrink     |
|          |Maria     |de la Cruz   |
|Dipl.-Ing.|Peter     |van den Berg |
|          |Laura     |van der Vaart|
|          |Franz     |von der Lippe|
|          |Luisa     |del Toro     |
|          |Juan      |de los Santos|
|          |Sophie    |de las Flores|
+----------+----------+-------------+



In [16]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, lpad, rpad, when


def enforce_string_length(
    df: DataFrame, column_name: str, string_length: int = 20
) -> DataFrame:
    """
    Force every non-null value of a string column to an exact length.

    Values longer than ``string_length`` are cut down to their first
    ``string_length`` characters; shorter values are right-padded with
    spaces. Null values are left as null.

    Parameters
    ----------
    df : DataFrame
        The input PySpark DataFrame.
    column_name : str
        The name of the string column to be adjusted. The column is
        overwritten in place under the same name.
    string_length : int, optional
        The length to which the strings should be adjusted (default is 20).
        Must not be negative.

    Returns
    -------
    DataFrame
        A DataFrame with the specified column adjusted to the exact string length.

    Raises
    ------
    ValueError
        If ``string_length`` is negative.
    """
    # Fail fast on an impossible target width instead of handing it to Spark,
    # where it could not yield strings of the requested length.
    if string_length < 0:
        raise ValueError(
            f"string_length must be non-negative, got {string_length}"
        )

    source = col(column_name)
    # Truncate first, then pad on the right with spaces up to the target width.
    fixed_width = rpad(source.substr(1, string_length), string_length, " ")

    return df.withColumn(
        column_name,
        # Nulls are passed through explicitly so they are never padded.
        when(source.isNotNull(), fixed_width).otherwise(source),
    )


# Demo rows: one value shorter than the target width, one longer, one null.
data = [
    ("short",),
    ("this is a long string that will be trimmed",),
    (None,),
]
df = spark.createDataFrame(data, schema=["example_col"])

# Force every non-null value to exactly 15 characters and print the result.
df_transformed = enforce_string_length(
    df,
    column_name="example_col",
    string_length=15,
)
df_transformed.show()

+---------------+
|    example_col|
+---------------+
|short          |
|this is a long |
|           null|
+---------------+



24/10/20 00:55:51 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 569882 ms exceeds timeout 120000 ms
24/10/20 00:55:51 WARN SparkContext: Killing executors is not supported by current scheduler.
24/10/20 00:55:51 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:322)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:117)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$driverEndpoint(BlockManagerMasterEndpoint.scala:116)
	at org.apache.spark.storage.B