## handleInvalid 
### Options are:
- <b>'skip'</b> (filter out rows with invalid data),
- <b>'error'</b> (throw an error),
- <b>'keep'</b> (put invalid data in a special additional bucket, at index numLabels).

https://spark.apache.org/docs/latest/ml-features.html#stringindexer

In [1]:
import pyspark
from pyspark.sql import SparkSession
# getOrCreate() reuses an already-running session, so re-running this cell is safe.
spark = SparkSession.builder.getOrCreate()

In [2]:
# Small in-memory sample: both columns hold strings.
columns = ["Seqno", "Name"]
data = [
    ("1", "john jones"),
    ("2", "tracey smith"),
    ("3", "amy sanders"),
]

df = spark.createDataFrame(data=data, schema=columns)

df.show(truncate=False)

                                                                                

+-----+------------+
|Seqno|Name        |
+-----+------------+
|1    |john jones  |
|2    |tracey smith|
|3    |amy sanders |
+-----+------------+



In [3]:
def convertCase(str):
    """Upper-case the first character of every space-separated word.

    Args:
        str: Text to convert, or None. The name shadows the built-in
            ``str`` inside this function; it is kept unchanged so that
            existing keyword callers continue to work.

    Returns:
        None when the input is None. Otherwise, every piece obtained by
        splitting on a single space, with its first character upper-cased
        and one space appended, concatenated in order. The result
        therefore always ends with a trailing space.
    """
    # When used as a Spark UDF, a SQL NULL arrives here as None; calling
    # .split() on it would raise AttributeError and fail the job, so the
    # null is passed through instead.
    if str is None:
        return None
    # Slicing with [:1] (instead of indexing [0]) tolerates the empty pieces
    # produced by consecutive spaces. join() avoids repeated concatenation.
    return "".join(x[:1].upper() + x[1:] + " " for x in str.split(" "))

In [4]:
# Try the plain Python function on a literal before wrapping it as a UDF.
convertCase("ahmed mohamed hamed")

'Ahmed Mohamed Hamed '

In [5]:
from pyspark.sql.functions import udf,col
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, StringType

In [6]:
# Wrap the Python function as a column-level UDF that yields strings.
convertUDF = udf(convertCase, returnType=StringType())

In [7]:
# Apply the UDF to Name and keep the original column name via alias().
(
    df.select(
        col("Seqno"),
        convertUDF(col("Name")).alias("Name"),
    )
    .show(truncate=False)
)

                                                                                

+-----+-------------+
|Seqno|Name         |
+-----+-------------+
|1    |John Jones   |
|2    |Tracey Smith |
|3    |Amy Sanders  |
+-----+-------------+



In [8]:
""" Using UDF on SQL """
# Register the function under a SQL-callable name; the call returns the wrapped function.
spark.udf.register("convertUDFSQL", convertCase, returnType=StringType())

<function __main__.convertCase(str)>

In [10]:
# Make df queryable from spark.sql() under the view name NAME_TABLE.
df.createOrReplaceTempView("NAME_TABLE")

In [11]:
# Call the registered UDF by its SQL name against the temp view.
(
    spark.sql("select Seqno, convertUDFSQL(Name) as Name from NAME_TABLE")
    .show(truncate=False)
)

+-----+-------------+
|Seqno|Name         |
+-----+-------------+
|1    |John Jones   |
|2    |Tracey Smith |
|3    |Amy Sanders  |
+-----+-------------+



In [5]:
@udf(returnType=StringType())
def convertCaseAnnot(str):
    """Upper-case the first character of every space-separated word.

    Declared directly as a Spark UDF through the ``@udf`` decorator, with
    a string return type.

    Args:
        str: Text to convert, or None. The name shadows the built-in
            ``str`` inside this function; it is kept unchanged.

    Returns:
        None when the input is None. Otherwise, every piece obtained by
        splitting on a single space, with its first character upper-cased
        and one space appended, concatenated in order. The result
        therefore always ends with a trailing space.
    """
    # A SQL NULL arrives here as None; calling .split() on it would raise
    # AttributeError and fail the job, so the null is passed through.
    if str is None:
        return None
    # Slicing with [:1] (instead of indexing [0]) tolerates the empty pieces
    # produced by consecutive spaces. join() avoids repeated concatenation.
    return "".join(x[:1].upper() + x[1:] + " " for x in str.split(" "))

In [18]:
# Same projection as before, this time through the decorator-defined UDF.
(
    df.select(
        col("Seqno"),
        convertCaseAnnot(col("Name")).alias("Name"),
    )
    .show(truncate=False)
)

+-----+-------------+
|Seqno|Name         |
+-----+-------------+
|1    |John Jones   |
|2    |Tracey Smith |
|3    |Amy Sanders  |
+-----+-------------+

