In [5]:
# Working with strings, dates, and NULL values in PySpark

In [1]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session for this notebook, running in local mode.
spark = SparkSession.builder.appName("Strings & Dates").master("local[*]").getOrCreate()

# Bare expression so the notebook renders the session summary.
spark

In [2]:
from pyspark.sql.types import StringType,StructField,StructType

# All seven employee columns are declared as nullable strings, in this order.
schema = StructType(
    [
        StructField(column_name, StringType(), True)
        for column_name in (
            "employee_id",
            "department_id",
            "name",
            "age",
            "gender",
            "hire_date",
            "salary",
        )
    ]
)

In [3]:
# Sample employee rows. Each tuple holds, in schema order:
# (employee_id, department_id, name, age, gender, hire_date, salary) -- all as strings.
data = [
    ("E001", "D001", "John Doe", "30", "Male", "2019-05-20", "200000"),
    ("E002", "D002", "Jane Smith", "25", "Female", "2020-07-15", "400000"),
    ("E003", "D001", "Alice Johnson", "35", "Female", "2018-11-03", "500000"),
    ("E004", "D003", "Bob Williams", "40", "Male", "2017-09-12", "4500000"),
]

In [4]:
# Build the employee DataFrame from the sample rows using the explicit schema.
emp = spark.createDataFrame(data=data, schema=schema)

In [5]:
# Preview the employee DataFrame.
emp.show()

+-----------+-------------+-------------+---+------+----------+-------+
|employee_id|department_id|         name|age|gender| hire_date| salary|
+-----------+-------------+-------------+---+------+----------+-------+
|       E001|         D001|     John Doe| 30|  Male|2019-05-20| 200000|
|       E002|         D002|   Jane Smith| 25|Female|2020-07-15| 400000|
|       E003|         D001|Alice Johnson| 35|Female|2018-11-03| 500000|
|       E004|         D003| Bob Williams| 40|  Male|2017-09-12|4500000|
+-----------+-------------+-------------+---+------+----------+-------+



In [6]:
# Derive a one-letter gender code: 'Male' -> 'M', 'Female' -> 'F', anything else -> NULL.
from pyspark.sql.functions import when, col, expr

gender_code = (
    when(col("gender") == 'Male', 'M')
    .when(col("gender") == 'Female', 'F')
    .otherwise(None)
)
emp_gender = emp.withColumn("gen", gender_code)
emp_gender.show()

+-----------+-------------+-------------+---+------+----------+-------+---+
|employee_id|department_id|         name|age|gender| hire_date| salary|gen|
+-----------+-------------+-------------+---+------+----------+-------+---+
|       E001|         D001|     John Doe| 30|  Male|2019-05-20| 200000|  M|
|       E002|         D002|   Jane Smith| 25|Female|2020-07-15| 400000|  F|
|       E003|         D001|Alice Johnson| 35|Female|2018-11-03| 500000|  F|
|       E004|         D003| Bob Williams| 40|  Male|2017-09-12|4500000|  M|
+-----------+-------------+-------------+---+------+----------+-------+---+



In [7]:
# Same gender mapping, written as a SQL CASE expression through expr().
# withColumn() already names the resulting column "gen", so the SQL text
# carries no "as gen" alias of its own.
emp_gender_1 = emp.withColumn(
    "gen",
    expr(
        "case when gender = 'Male' then 'M' "
        "when gender = 'Female' then 'F' "
        "else null end"
    ),
)
emp_gender_1.show()

+-----------+-------------+-------------+---+------+----------+-------+---+
|employee_id|department_id|         name|age|gender| hire_date| salary|gen|
+-----------+-------------+-------------+---+------+----------+-------+---+
|       E001|         D001|     John Doe| 30|  Male|2019-05-20| 200000|  M|
|       E002|         D002|   Jane Smith| 25|Female|2020-07-15| 400000|  F|
|       E003|         D001|Alice Johnson| 35|Female|2018-11-03| 500000|  F|
|       E004|         D003| Bob Williams| 40|  Male|2017-09-12|4500000|  M|
+-----------+-------------+-------------+---+------+----------+-------+---+



In [8]:
# Regex-based replacement: every "J" in name becomes "G" in a new column.
from pyspark.sql.functions import regexp_replace

name_with_g = regexp_replace(col("name"), "J", "G")
emp_name_replace = emp.withColumn("new_name", name_with_g)
emp_name_replace.show()

+-----------+-------------+-------------+---+------+----------+-------+-------------+
|employee_id|department_id|         name|age|gender| hire_date| salary|     new_name|
+-----------+-------------+-------------+---+------+----------+-------+-------------+
|       E001|         D001|     John Doe| 30|  Male|2019-05-20| 200000|     Gohn Doe|
|       E002|         D002|   Jane Smith| 25|Female|2020-07-15| 400000|   Gane Smith|
|       E003|         D001|Alice Johnson| 35|Female|2018-11-03| 500000|Alice Gohnson|
|       E004|         D003| Bob Williams| 40|  Male|2017-09-12|4500000| Bob Williams|
+-----------+-------------+-------------+---+------+----------+-------+-------------+



In [9]:
# Convert hire_date from a 'yyyy-MM-dd' string into a date column.
from pyspark.sql.functions import to_date

hire_date_parsed = to_date(col("hire_date"), 'yyyy-MM-dd')
emp_date = emp.withColumn("hire_date", hire_date_parsed)
emp_date.show()

+-----------+-------------+-------------+---+------+----------+-------+
|employee_id|department_id|         name|age|gender| hire_date| salary|
+-----------+-------------+-------------+---+------+----------+-------+
|       E001|         D001|     John Doe| 30|  Male|2019-05-20| 200000|
|       E002|         D002|   Jane Smith| 25|Female|2020-07-15| 400000|
|       E003|         D001|Alice Johnson| 35|Female|2018-11-03| 500000|
|       E004|         D003| Bob Williams| 40|  Male|2017-09-12|4500000|
+-----------+-------------+-------------+---+------+----------+-------+



In [10]:
# Print the schema to check the resulting type of hire_date.
emp_date.printSchema()

root
 |-- employee_id: string (nullable = true)
 |-- department_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- hire_date: date (nullable = true)
 |-- salary: string (nullable = true)



In [11]:
# Add the current date and the current timestamp as new columns.
# The columns are appended to emp_date (whose hire_date has already been
# parsed into a date) rather than to the raw `emp`, so rebinding emp_date
# does not revert hire_date back to a string.
# NOTE: extracting the year from hire_date is not done in this cell.
from pyspark.sql.functions import current_timestamp,current_date

emp_date = (
    emp_date
    .withColumn("date_now", current_date())
    .withColumn("timestamp_now", current_timestamp())
)
emp_date.show()

+-----------+-------------+-------------+---+------+----------+-------+----------+--------------------+
|employee_id|department_id|         name|age|gender| hire_date| salary|  date_now|       timestamp_now|
+-----------+-------------+-------------+---+------+----------+-------+----------+--------------------+
|       E001|         D001|     John Doe| 30|  Male|2019-05-20| 200000|2024-10-17|2024-10-17 09:07:...|
|       E002|         D002|   Jane Smith| 25|Female|2020-07-15| 400000|2024-10-17|2024-10-17 09:07:...|
|       E003|         D001|Alice Johnson| 35|Female|2018-11-03| 500000|2024-10-17|2024-10-17 09:07:...|
|       E004|         D003| Bob Williams| 40|  Male|2017-09-12|4500000|2024-10-17|2024-10-17 09:07:...|
+-----------+-------------+-------------+---+------+----------+-------+----------+--------------------+



In [12]:
# Render hire_date as a 'dd/MM/yyyy' string in a new date_string column.
# Note: this starts from `emp`, where hire_date is declared as a string column.
from pyspark.sql.functions import date_format

hire_date_text = date_format(col("hire_date"), "dd/MM/yyyy")
emp_new = emp.withColumn("date_string", hire_date_text)
emp_new.show()

+-----------+-------------+-------------+---+------+----------+-------+-----------+
|employee_id|department_id|         name|age|gender| hire_date| salary|date_string|
+-----------+-------------+-------------+---+------+----------+-------+-----------+
|       E001|         D001|     John Doe| 30|  Male|2019-05-20| 200000| 20/05/2019|
|       E002|         D002|   Jane Smith| 25|Female|2020-07-15| 400000| 15/07/2020|
|       E003|         D001|Alice Johnson| 35|Female|2018-11-03| 500000| 03/11/2018|
|       E004|         D003| Bob Williams| 40|  Male|2017-09-12|4500000| 12/09/2017|
+-----------+-------------+-------------+---+------+----------+-------+-----------+

