In [1]:
from pyspark.sql import SparkSession

spark = SparkSession. \
    builder. \
    config('spark.ui.port', '0'). \
    config("spark.sql.warehouse.dir", f"/user/evivancovid/warehouse"). \
    enableHiveSupport(). \
    appName(f'evivancovid | Python - Data Processing - Overview'). \
    master('yarn'). \
    getOrCreate()

In [13]:
employees = [(1, "Scott", "Tiger", 1000.0, 
                      "united states", "+1 123 456 7890", "123 45 6789"
                     ),
                     (2, "Henry", "Ford", 1250.0, 
                      "India", "+91 234 567 8901", "456 78 9123"
                     ),
                     (3, "Nick", "Junior", 750.0, 
                      "united KINGDOM", "+44 111 111 1111", "222 33 4444"
                     ),
                     (4, "Bill", "Gomes", 1500.0, 
                      "AUSTRALIA", "+61 987 654 3210", "789 12 6118"
                     )
                ]

In [14]:
employeesDF = spark.createDataFrame(employees,
                                    schema = """employee_id INT,first_name STRING,
                                    last_name STRING, salary FLOAT, nationality STRING,
                                    phone_number STRING, ssn STRING""")

In [4]:
employeesDF.show()

+-----------+----------+---------+------+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+--------------+----------------+-----------+
|          1|     Scott|    Tiger|1000.0| united states| +1 123 456 7890|123 45 6789|
|          2|     Henry|     Ford|1250.0|         India|+91 234 567 8901|456 78 9123|
|          3|      Nick|   Junior| 750.0|united KINGDOM|+44 111 111 1111|222 33 4444|
|          4|      Bill|    Gomes|1500.0|     AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+--------------+----------------+-----------+



Extracting Substrings using substring() function

In [5]:
from pyspark.sql.functions import substring, lit, col

In [6]:
employeesDF.select("first_name", "phone_number", "ssn"). \
withColumn("phone_last_4", substring(col("phone_number"),-4,4)). \
withColumn("ssn_last_4", substring(col("ssn"),-4,4)). \
show()

+----------+----------------+-----------+------------+----------+
|first_name|    phone_number|        ssn|phone_last_4|ssn_last_4|
+----------+----------------+-----------+------------+----------+
|     Scott| +1 123 456 7890|123 45 6789|        7890|      6789|
|     Henry|+91 234 567 8901|456 78 9123|        8901|      9123|
|      Nick|+44 111 111 1111|222 33 4444|        1111|      4444|
|      Bill|+61 987 654 3210|789 12 6118|        3210|      6118|
+----------+----------------+-----------+------------+----------+



Extracting substrings using split() function

In [7]:
from pyspark.sql.functions import split, explode, lit

In [8]:
l = [('X', )]

In [9]:
df = spark.createDataFrame(l, "dummy STRING")

In [10]:
df.select(split(lit("Hello World, how are you"), " ")). \
    show(truncate=False)

+--------------------------------------+
|split(Hello World, how are you,  , -1)|
+--------------------------------------+
|[Hello, World,, how, are, you]        |
+--------------------------------------+



In [11]:
df.select(split(lit("Hello World, how are you"), " ")[2]). \
    show(truncate=False)

+-----------------------------------------+
|split(Hello World, how are you,  , -1)[2]|
+-----------------------------------------+
|how                                      |
+-----------------------------------------+



In [12]:
df.select(explode(split(lit("Hello World, how are you"), " ")).alias("word")). \
    show(truncate=False)

+------+
|word  |
+------+
|Hello |
|World,|
|how   |
|are   |
|you   |
+------+



In [18]:
employeesDF.select("first_name", "phone_number", "ssn"). \
withColumn("area_code", split(col("phone_number")," ")[0]). \
withColumn("phone_last_4", split(col("phone_number")," ")[3]). \
withColumn("ssn_las_4", split(col("ssn")," ")[2]). \
show()

+----------+----------------+-----------+---------+------------+---------+
|first_name|    phone_number|        ssn|area_code|phone_last_4|ssn_las_4|
+----------+----------------+-----------+---------+------------+---------+
|     Scott| +1 123 456 7890|123 45 6789|       +1|        7890|     6789|
|     Henry|+91 234 567 8901|456 78 9123|      +91|        8901|     9123|
|      Nick|+44 111 111 1111|222 33 4444|      +44|        1111|     4444|
|      Bill|+61 987 654 3210|789 12 6118|      +61|        3210|     6118|
+----------+----------------+-----------+---------+------------+---------+



In [1]:
help(substring)

NameError: name 'substring' is not defined