In [1]:
# Topics covered in this notebook:
# 1. Working with string data
#      case when, regexp_replace, etc.
# 2. Working with dates
#      to_date, current_date, current_timestamp
# 3. Working with null values
#      nvl, na.drop, na.fill

In [31]:
from pyspark.sql import SparkSession


# Build a local SparkSession (or reuse one that already exists in this kernel).
# "local[*]" runs Spark in-process using all available cores.
_session_builder = SparkSession.builder.appName("spark introduction").master("local[*]")
spark = _session_builder.getOrCreate()


In [60]:
# Employee schema as a Spark DDL string. hire_date is declared as a string
# here; a later cell parses it into a date with to_date.
emp_schema = """
emp_id int,
emp_name string,
gender string,
department string,
role string,
salary int,
hire_date string
"""

# One record per employee, in the same column order as emp_schema.
# Employee 107 has an empty-string gender, used later for the null-handling demo.
_emp_rows = (
    (101, "Alice", "Female", "HR", "Manager", 60000, "2018-03-15"),
    (102, "Bob", "Male", "IT", "Developer", 75000, "2019-07-22"),
    (103, "Charlie", "Male", "Finance", "Analyst", 65000, "2020-01-10"),
    (104, "Diana", "Female", "IT", "Tester", 55000, "2021-05-18"),
    (105, "Evan", "Male", "Sales", "Executive", 50000, "2022-02-01"),
    (106, "Fiona", "Female", "Marketing", "Coordinator", 48000, "2020-09-12"),
    (107, "George", "", "IT", "DevOps Engineer", 82000, "2017-11-30"),
    (108, "Hannah", "Female", "Finance", "Accountant", 62000, "2019-04-25"),
    (109, "Ian", "Male", "HR", "Recruiter", 52000, "2021-08-09"),
    (110, "Julia", "Female", "Sales", "Manager", 70000, "2016-06-17"),
)

# Keep the list-of-lists shape that the rest of the notebook consumes.
emp_data = [list(row) for row in _emp_rows]


In [63]:
# Turn the in-memory rows into a DataFrame typed by the DDL schema string.
emp = spark.createDataFrame(emp_data, emp_schema)

In [64]:
from pyspark.sql.functions import when,col,expr

# Map the spelled-out gender to a one-letter code. Any other value
# (including the empty string) falls through to null.
gender_code = (
    when(col("gender") == "Male", "M")
    .when(col("gender") == "Female", "F")
    .otherwise(None)
)
emp_gender_fixed = emp.withColumn("new_gender", gender_code)

In [65]:
# Preview the frame with the derived new_gender column (when/otherwise version).
emp_gender_fixed.show()

+------+--------+------+----------+---------------+------+----------+----------+
|emp_id|emp_name|gender|department|           role|salary| hire_date|new_gender|
+------+--------+------+----------+---------------+------+----------+----------+
|   101|   Alice|Female|        HR|        Manager| 60000|2018-03-15|         F|
|   102|     Bob|  Male|        IT|      Developer| 75000|2019-07-22|         M|
|   103| Charlie|  Male|   Finance|        Analyst| 65000|2020-01-10|         M|
|   104|   Diana|Female|        IT|         Tester| 55000|2021-05-18|         F|
|   105|    Evan|  Male|     Sales|      Executive| 50000|2022-02-01|         M|
|   106|   Fiona|Female| Marketing|    Coordinator| 48000|2020-09-12|         F|
|   107|  George|      |        IT|DevOps Engineer| 82000|2017-11-30|      null|
|   108|  Hannah|Female|   Finance|     Accountant| 62000|2019-04-25|         F|
|   109|     Ian|  Male|        HR|      Recruiter| 52000|2021-08-09|         M|
|   110|   Julia|Female|    

In [66]:
# Same gender mapping, expressed as a SQL CASE statement evaluated through expr().
gender_case_sql = "case when gender = 'Male' then 'M' when gender = 'Female' then 'F' else null end"
emp_fixed3 = emp.withColumn("new_gender", expr(gender_case_sql))

In [67]:
emp_fixed3.show()  # new_gender here is produced by the SQL CASE expression

+------+--------+------+----------+---------------+------+----------+----------+
|emp_id|emp_name|gender|department|           role|salary| hire_date|new_gender|
+------+--------+------+----------+---------------+------+----------+----------+
|   101|   Alice|Female|        HR|        Manager| 60000|2018-03-15|         F|
|   102|     Bob|  Male|        IT|      Developer| 75000|2019-07-22|         M|
|   103| Charlie|  Male|   Finance|        Analyst| 65000|2020-01-10|         M|
|   104|   Diana|Female|        IT|         Tester| 55000|2021-05-18|         F|
|   105|    Evan|  Male|     Sales|      Executive| 50000|2022-02-01|         M|
|   106|   Fiona|Female| Marketing|    Coordinator| 48000|2020-09-12|         F|
|   107|  George|      |        IT|DevOps Engineer| 82000|2017-11-30|      null|
|   108|  Hannah|Female|   Finance|     Accountant| 62000|2019-04-25|         F|
|   109|     Ian|  Male|        HR|      Recruiter| 52000|2021-08-09|         M|
|   110|   Julia|Female|    

In [68]:
# replace in strings

from pyspark.sql.functions import regexp_replace

# Replace every upper-case "H" in emp_name with "P"; the pattern is a regular
# expression and matching is case-sensitive (e.g. "Hannah" -> "Pannah").
name_with_swap = regexp_replace(col("emp_name"), "H", "P")
emp_name_fixed = emp_gender_fixed.withColumn("new_name", name_with_swap)


In [69]:
emp_name_fixed.show()  # see the PySpark functions documentation for more string helpers

+------+--------+------+----------+---------------+------+----------+----------+--------+
|emp_id|emp_name|gender|department|           role|salary| hire_date|new_gender|new_name|
+------+--------+------+----------+---------------+------+----------+----------+--------+
|   101|   Alice|Female|        HR|        Manager| 60000|2018-03-15|         F|   Alice|
|   102|     Bob|  Male|        IT|      Developer| 75000|2019-07-22|         M|     Bob|
|   103| Charlie|  Male|   Finance|        Analyst| 65000|2020-01-10|         M| Charlie|
|   104|   Diana|Female|        IT|         Tester| 55000|2021-05-18|         F|   Diana|
|   105|    Evan|  Male|     Sales|      Executive| 50000|2022-02-01|         M|    Evan|
|   106|   Fiona|Female| Marketing|    Coordinator| 48000|2020-09-12|         F|   Fiona|
|   107|  George|      |        IT|DevOps Engineer| 82000|2017-11-30|      null|  George|
|   108|  Hannah|Female|   Finance|     Accountant| 62000|2019-04-25|         F|  Pannah|
|   109|  

In [70]:
# convert date
from pyspark.sql.functions import to_date

# Parse the yyyy-MM-dd strings into dates, overwriting the hire_date column.
parsed_hire_date = to_date(col("hire_date"), "yyyy-MM-dd")
emp_date_fix = emp_name_fixed.withColumn("hire_date", parsed_hire_date)

In [71]:
# truncate=False prints full cell values instead of cutting long ones short.
emp_date_fix.show(truncate=False)

+------+--------+------+----------+---------------+------+----------+----------+--------+
|emp_id|emp_name|gender|department|role           |salary|hire_date |new_gender|new_name|
+------+--------+------+----------+---------------+------+----------+----------+--------+
|101   |Alice   |Female|HR        |Manager        |60000 |2018-03-15|F         |Alice   |
|102   |Bob     |Male  |IT        |Developer      |75000 |2019-07-22|M         |Bob     |
|103   |Charlie |Male  |Finance   |Analyst        |65000 |2020-01-10|M         |Charlie |
|104   |Diana   |Female|IT        |Tester         |55000 |2021-05-18|F         |Diana   |
|105   |Evan    |Male  |Sales     |Executive      |50000 |2022-02-01|M         |Evan    |
|106   |Fiona   |Female|Marketing |Coordinator    |48000 |2020-09-12|F         |Fiona   |
|107   |George  |      |IT        |DevOps Engineer|82000 |2017-11-30|null      |George  |
|108   |Hannah  |Female|Finance   |Accountant     |62000 |2019-04-25|F         |Pannah  |
|109   |Ia

In [46]:
emp_date_fix.printSchema()  # confirms hire_date was converted from string to date

root
 |-- emp_id: integer (nullable = true)
 |-- emp_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- department: string (nullable = true)
 |-- role: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- hire_date: date (nullable = true)
 |-- new_gender: string (nullable = true)
 |-- new_name: string (nullable = true)



In [72]:
from pyspark.sql.functions import current_date,current_timestamp



In [73]:
# Add today's date and the current timestamp to every row. The values are
# computed when an action runs, so each later show() reports a newer time.
dtnow = (
    emp_date_fix
    .withColumn("current_date", current_date())
    .withColumn("timestamp_now", current_timestamp())
)

In [74]:
# Preview the frame with the added current_date and timestamp_now columns.
dtnow.show()

+------+--------+------+----------+---------------+------+----------+----------+--------+------------+--------------------+
|emp_id|emp_name|gender|department|           role|salary| hire_date|new_gender|new_name|current_date|       timestamp_now|
+------+--------+------+----------+---------------+------+----------+----------+--------+------------+--------------------+
|   101|   Alice|Female|        HR|        Manager| 60000|2018-03-15|         F|   Alice|  2025-12-29|2025-12-29 19:26:...|
|   102|     Bob|  Male|        IT|      Developer| 75000|2019-07-22|         M|     Bob|  2025-12-29|2025-12-29 19:26:...|
|   103| Charlie|  Male|   Finance|        Analyst| 65000|2020-01-10|         M| Charlie|  2025-12-29|2025-12-29 19:26:...|
|   104|   Diana|Female|        IT|         Tester| 55000|2021-05-18|         F|   Diana|  2025-12-29|2025-12-29 19:26:...|
|   105|    Evan|  Male|     Sales|      Executive| 50000|2022-02-01|         M|    Evan|  2025-12-29|2025-12-29 19:26:...|
|   106|

In [75]:
# Drop only the rows whose derived gender code is null.
# A bare na.drop() removes a row when *any* column is null, so it would also
# discard employees with unrelated missing values; subset limits the check
# to the column this step is actually about.
emp_one = dtnow.na.drop(subset=["new_gender"])

In [76]:
emp_one.show()  # not a good option in production: the whole row is discarded

+------+--------+------+----------+-----------+------+----------+----------+--------+------------+--------------------+
|emp_id|emp_name|gender|department|       role|salary| hire_date|new_gender|new_name|current_date|       timestamp_now|
+------+--------+------+----------+-----------+------+----------+----------+--------+------------+--------------------+
|   101|   Alice|Female|        HR|    Manager| 60000|2018-03-15|         F|   Alice|  2025-12-29|2025-12-29 19:26:...|
|   102|     Bob|  Male|        IT|  Developer| 75000|2019-07-22|         M|     Bob|  2025-12-29|2025-12-29 19:26:...|
|   103| Charlie|  Male|   Finance|    Analyst| 65000|2020-01-10|         M| Charlie|  2025-12-29|2025-12-29 19:26:...|
|   104|   Diana|Female|        IT|     Tester| 55000|2021-05-18|         F|   Diana|  2025-12-29|2025-12-29 19:26:...|
|   105|    Evan|  Male|     Sales|  Executive| 50000|2022-02-01|         M|    Evan|  2025-12-29|2025-12-29 19:26:...|
|   106|   Fiona|Female| Marketing|Coord

In [55]:
# fix null values 
from pyspark.sql.functions import coalesce,lit


In [77]:
# Instead of dropping rows, fill missing gender codes with the placeholder "o".
# coalesce returns the first non-null argument, so existing codes are kept.
gender_or_placeholder = coalesce(col("new_gender"), lit("o"))
emp_null_df = dtnow.withColumn("new_gender", gender_or_placeholder)

In [78]:
# Preview the frame after null gender codes were replaced with the placeholder.
emp_null_df.show()

+------+--------+------+----------+---------------+------+----------+----------+--------+------------+--------------------+
|emp_id|emp_name|gender|department|           role|salary| hire_date|new_gender|new_name|current_date|       timestamp_now|
+------+--------+------+----------+---------------+------+----------+----------+--------+------------+--------------------+
|   101|   Alice|Female|        HR|        Manager| 60000|2018-03-15|         F|   Alice|  2025-12-29|2025-12-29 19:27:...|
|   102|     Bob|  Male|        IT|      Developer| 75000|2019-07-22|         M|     Bob|  2025-12-29|2025-12-29 19:27:...|
|   103| Charlie|  Male|   Finance|        Analyst| 65000|2020-01-10|         M| Charlie|  2025-12-29|2025-12-29 19:27:...|
|   104|   Diana|Female|        IT|         Tester| 55000|2021-05-18|         F|   Diana|  2025-12-29|2025-12-29 19:27:...|
|   105|    Evan|  Male|     Sales|      Executive| 50000|2022-02-01|         M|    Evan|  2025-12-29|2025-12-29 19:27:...|
|   106|

In [82]:
# Remove the original name/gender columns, then give the cleaned replacements
# their final names.
emp_final = (
    emp_null_df
    .drop("emp_name", "gender")
    .withColumnRenamed("new_name", "name")
    .withColumnRenamed("new_gender", "gender")
)

In [83]:
# Preview the final column set after the drop and renames.
emp_final.show()

+------+----------+---------------+------+----------+------+-------+------------+--------------------+
|emp_id|department|           role|salary| hire_date|gender|   name|current_date|       timestamp_now|
+------+----------+---------------+------+----------+------+-------+------------+--------------------+
|   101|        HR|        Manager| 60000|2018-03-15|     F|  Alice|  2025-12-29|2025-12-29 19:30:...|
|   102|        IT|      Developer| 75000|2019-07-22|     M|    Bob|  2025-12-29|2025-12-29 19:30:...|
|   103|   Finance|        Analyst| 65000|2020-01-10|     M|Charlie|  2025-12-29|2025-12-29 19:30:...|
|   104|        IT|         Tester| 55000|2021-05-18|     F|  Diana|  2025-12-29|2025-12-29 19:30:...|
|   105|     Sales|      Executive| 50000|2022-02-01|     M|   Evan|  2025-12-29|2025-12-29 19:30:...|
|   106| Marketing|    Coordinator| 48000|2020-09-12|     F|  Fiona|  2025-12-29|2025-12-29 19:30:...|
|   107|        IT|DevOps Engineer| 82000|2017-11-30|     o| George|  202

In [84]:
# Persist the cleaned frame as CSV. Spark writes a directory of part files at
# this path rather than a single file, and by default emits no header row.
# The default save mode raises if the path already exists, which makes this
# cell fail on a second run; "overwrite" keeps the notebook re-runnable.
emp_final.write.format("csv").mode("overwrite").save("data/output/4/emp.csv")

In [94]:
from pyspark.sql.functions import date_format

# Format the hire_date column as MM/dd/yyyy text in a new date_string column.
hire_date_text = date_format(col("hire_date"), "MM/dd/yyyy")
fixed = emp_final.withColumn("date_string", hire_date_text)

In [95]:
# Preview the frame with the formatted date_string column next to hire_date.
fixed.show()

+------+----------+---------------+------+----------+------+-------+------------+--------------------+-----------+
|emp_id|department|           role|salary| hire_date|gender|   name|current_date|       timestamp_now|date_string|
+------+----------+---------------+------+----------+------+-------+------------+--------------------+-----------+
|   101|        HR|        Manager| 60000|2018-03-15|     F|  Alice|  2025-12-29|2025-12-29 19:36:...| 03/15/2018|
|   102|        IT|      Developer| 75000|2019-07-22|     M|    Bob|  2025-12-29|2025-12-29 19:36:...| 07/22/2019|
|   103|   Finance|        Analyst| 65000|2020-01-10|     M|Charlie|  2025-12-29|2025-12-29 19:36:...| 01/10/2020|
|   104|        IT|         Tester| 55000|2021-05-18|     F|  Diana|  2025-12-29|2025-12-29 19:36:...| 05/18/2021|
|   105|     Sales|      Executive| 50000|2022-02-01|     M|   Evan|  2025-12-29|2025-12-29 19:36:...| 02/01/2022|
|   106| Marketing|    Coordinator| 48000|2020-09-12|     F|  Fiona|  2025-12-29