In [21]:
from pyspark.sql import SparkSession

# Build (or reuse, via getOrCreate) the SparkSession for this notebook.
# Plain string literals are used: none of these values interpolate anything,
# so an f-string prefix would be misleading.
spark = (
    SparkSession.builder
    # NOTE(review): port "0" presumably lets Spark pick any free UI port -- confirm.
    .config("spark.ui.port", "0")
    .config("spark.sql.warehouse.dir", "/user/evivancovid/warehouse")
    .enableHiveSupport()
    .appName("evivancovid | Python - Data Processing - Overview")
    .master("yarn")
    .getOrCreate()
)

In [2]:
# Sample employee records, one tuple per employee.
# Tuple layout: (employee_id, first_name, last_name, salary,
#                nationality, phone_number, ssn)
# Nationality values are stored with mixed casing.
employees = [
    (1, "Scott", "Tiger", 1000.0, "united states", "+1 123 456 7890", "123 45 6789"),
    (2, "Henry", "Ford", 1250.0, "India", "+91 234 567 8901", "456 78 9123"),
    (3, "Nick", "Junior", 750.0, "united KINGDOM", "+44 111 111 1111", "222 33 4444"),
    (4, "Bill", "Gomes", 1500.0, "AUSTRALIA", "+61 987 654 3210", "789 12 6118"),
]

In [15]:
# Turn the in-memory employee tuples into a Spark DataFrame, naming and typing
# each column through a DDL-style schema string.
employeesDF = spark.createDataFrame(
    employees,
    schema="""employee_id INT,first_name STRING,
                                    last_name STRING, salary FLOAT, nationality STRING,
                                    phone_number STRING, ssn STRING""",
)

In [16]:
# Print the DataFrame as a text table (row limit and truncation spelled out
# at their default values).
employeesDF.show(n=20, truncate=True)

+-----------+----------+---------+------+--------------+----------------+-----------+
|employee_id|first_name|last_name|salary|   nationality|    phone_number|        ssn|
+-----------+----------+---------+------+--------------+----------------+-----------+
|          1|     Scott|    Tiger|1000.0| united states| +1 123 456 7890|123 45 6789|
|          2|     Henry|     Ford|1250.0|         India|+91 234 567 8901|456 78 9123|
|          3|      Nick|   Junior| 750.0|united KINGDOM|+44 111 111 1111|222 33 4444|
|          4|      Bill|    Gomes|1500.0|     AUSTRALIA|+61 987 654 3210|789 12 6118|
+-----------+----------+---------+------+--------------+----------------+-----------+



In [17]:
from pyspark.sql.functions import concat

# Append a full_name column made by joining first_name and last_name directly
# (no separator between them), then print the result.
(
    employeesDF
    .withColumn("full_name", concat("first_name", "last_name"))
    .show()
)

+-----------+----------+---------+------+--------------+----------------+-----------+----------+
|employee_id|first_name|last_name|salary|   nationality|    phone_number|        ssn| full_name|
+-----------+----------+---------+------+--------------+----------------+-----------+----------+
|          1|     Scott|    Tiger|1000.0| united states| +1 123 456 7890|123 45 6789|ScottTiger|
|          2|     Henry|     Ford|1250.0|         India|+91 234 567 8901|456 78 9123| HenryFord|
|          3|      Nick|   Junior| 750.0|united KINGDOM|+44 111 111 1111|222 33 4444|NickJunior|
|          4|      Bill|    Gomes|1500.0|     AUSTRALIA|+61 987 654 3210|789 12 6118| BillGomes|
+-----------+----------+---------+------+--------------+----------------+-----------+----------+



In [18]:
from pyspark.sql.functions import concat, lit

# Append a full_name column as "<first_name>, <last_name>". The separator is
# wrapped in lit() so it is passed as a literal value rather than a column name.
(
    employeesDF
    .withColumn("full_name", concat("first_name", lit(", "), "last_name"))
    .show()
)

+-----------+----------+---------+------+--------------+----------------+-----------+------------+
|employee_id|first_name|last_name|salary|   nationality|    phone_number|        ssn|   full_name|
+-----------+----------+---------+------+--------------+----------------+-----------+------------+
|          1|     Scott|    Tiger|1000.0| united states| +1 123 456 7890|123 45 6789|Scott, Tiger|
|          2|     Henry|     Ford|1250.0|         India|+91 234 567 8901|456 78 9123| Henry, Ford|
|          3|      Nick|   Junior| 750.0|united KINGDOM|+44 111 111 1111|222 33 4444|Nick, Junior|
|          4|      Bill|    Gomes|1500.0|     AUSTRALIA|+61 987 654 3210|789 12 6118| Bill, Gomes|
+-----------+----------+---------+------+--------------+----------------+-----------+------------+



In [19]:
from pyspark.sql.functions import col, lower, upper, initcap, length
# Show nationality next to its lower-cased, upper-cased and initcap variants
# plus its length, projecting all derived columns in a single select.
(
    employeesDF
    .select(
        col("nationality"),
        lower(col("nationality")).alias("nationality_lower"),
        upper(col("nationality")).alias("nationality_upper"),
        initcap(col("nationality")).alias("nationality_initcap"),
        length(col("nationality")).alias("nationality_length"),
    )
    .show()
)

+--------------+-----------------+-----------------+-------------------+------------------+
|   nationality|nationality_lower|nationality_upper|nationality_initcap|nationality_length|
+--------------+-----------------+-----------------+-------------------+------------------+
| united states|    united states|    UNITED STATES|      United States|                13|
|         India|            india|            INDIA|              India|                 5|
|united KINGDOM|   united kingdom|   UNITED KINGDOM|     United Kingdom|                14|
|     AUSTRALIA|        australia|        AUSTRALIA|          Australia|                 9|
+--------------+-----------------+-----------------+-------------------+------------------+

