In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder\
.appName('Handle Date Type')\
.getOrCreate()

25/12/23 20:31:54 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [4]:
# Sending the Data to HDFS Via Notebook

# Create a CSV file
csv_data = """id,date_iso,date_dmy,date_mdy,timestamp
1,2023-01-15,15/01/2023,01/15/2023,2023-01-15 10:30:00
2,2023-05-20,20/05/2023,05/20/2023,2023-05-20 15:45:00
3,InvalidDate,31/02/2023,02/31/2023,InvalidTimestamp
4,,,, 
"""

# Save the CSV file
with open("dates_data.csv", "w") as f:
    f.write(csv_data)

In [3]:
!ls *dates_data*

dates_data.csv


In [5]:
!hadoop fs -put dates_data.csv /data/dates_data.csv

put: `/data/dates_data.csv': File exists


In [6]:
!hadoop fs -ls /data/

Found 3 items
-rw-r--r--   2 root hadoop       5488 2025-12-20 19:02 /data/customers_100.csv
-rw-r--r--   2 root hadoop        210 2025-12-22 13:34 /data/dates_data.csv
drwxr-xr-x   - root hadoop          0 2025-12-20 20:26 /data/write_output.csv


In [None]:
# from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# # StructType for the schema
# struct_schema = StructType([
#     StructField("id", IntegerType(), nullable=True),
#     StructField("date_iso", StringType(), nullable=True),
#     StructField("date_dmy", StringType(), nullable=True),
#     StructField("date_mdy", StringType(), nullable=True),
#     StructField("timestamp", StringType(), nullable=True)
# ])


In [7]:
# Define the schema
ddl_schema = """
    id INT,
    date_iso STRING,
    date_dmy STRING,
    date_mdy STRING,
    timestamp STRING
"""

# Read the CSV file into a DataFrame
df_file = spark.read.option("header", True).schema(ddl_schema).csv("/data/dates_data.csv")

# Show the DataFrame
df_file.show(truncate=False)


                                                                                

+---+-----------+----------+----------+-------------------+
|id |date_iso   |date_dmy  |date_mdy  |timestamp          |
+---+-----------+----------+----------+-------------------+
|1  |2023-01-15 |15/01/2023|01/15/2023|2023-01-15 10:30:00|
|2  |2023-05-20 |20/05/2023|05/20/2023|2023-05-20 15:45:00|
|3  |InvalidDate|31/02/2023|02/31/2023|InvalidTimestamp   |
|4  |NULL       |NULL      |NULL      |                   |
+---+-----------+----------+----------+-------------------+



In [8]:
# Sample data with multiple date formats
data = [
    (1, "2023-01-15", "15/01/2023", "01/15/2023", "2023-01-15 10:30:00"),
    (2, "2023-05-20", "20/05/2023", "05/20/2023", "2023-05-20 15:45:00"),
    (3, "InvalidDate", "31/02/2023", "02/31/2023", "InvalidTimestamp"),  # Invalid dates
    (4, None, None, None, None)  # Null values
]

# Define column names
columns = ["id", "date_iso", "date_dmy", "date_mdy", "timestamp"]


df = spark.createDataFrame(data, schema = columns)
df.show(truncate=False)

[Stage 2:>                                                          (0 + 1) / 1]

+---+-----------+----------+----------+-------------------+
|id |date_iso   |date_dmy  |date_mdy  |timestamp          |
+---+-----------+----------+----------+-------------------+
|1  |2023-01-15 |15/01/2023|01/15/2023|2023-01-15 10:30:00|
|2  |2023-05-20 |20/05/2023|05/20/2023|2023-05-20 15:45:00|
|3  |InvalidDate|31/02/2023|02/31/2023|InvalidTimestamp   |
|4  |NULL       |NULL      |NULL      |NULL               |
+---+-----------+----------+----------+-------------------+



                                                                                

In [9]:
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- date_iso: string (nullable = true)
 |-- date_dmy: string (nullable = true)
 |-- date_mdy: string (nullable = true)
 |-- timestamp: string (nullable = true)



In [12]:
from pyspark.sql.functions import to_date
df = df\
        .withColumn('parsed_date_iso',to_date(df.date_iso,'yyyy-MM-dd'))\
        .withColumn('parsed_date_dmy',to_date(df.date_dmy,'dd/MM/yyyy'))\
        .withColumn('parsed_date_mdy',to_date(df.date_mdy,'MM/dd/yyyy'))

In [13]:
df.show(truncate=False)

+---+-----------+----------+----------+-------------------+---------------+---------------+---------------+
|id |date_iso   |date_dmy  |date_mdy  |timestamp          |parsed_date_iso|parsed_date_dmy|parsed_date_mdy|
+---+-----------+----------+----------+-------------------+---------------+---------------+---------------+
|1  |2023-01-15 |15/01/2023|01/15/2023|2023-01-15 10:30:00|2023-01-15     |2023-01-15     |2023-01-15     |
|2  |2023-05-20 |20/05/2023|05/20/2023|2023-05-20 15:45:00|2023-05-20     |2023-05-20     |2023-05-20     |
|3  |InvalidDate|31/02/2023|02/31/2023|InvalidTimestamp   |NULL           |NULL           |NULL           |
|4  |NULL       |NULL      |NULL      |NULL               |NULL           |NULL           |NULL           |
+---+-----------+----------+----------+-------------------+---------------+---------------+---------------+



# TimeStamp

In [15]:
from pyspark.sql.functions import to_timestamp,year,month,dayofmonth,hour,minute

df = df.withColumn('parsed_timestamp',to_timestamp(df.timestamp))
df.show()
df.printSchema()

+---+-----------+----------+----------+-------------------+---------------+---------------+---------------+-------------------+
| id|   date_iso|  date_dmy|  date_mdy|          timestamp|parsed_date_iso|parsed_date_dmy|parsed_date_mdy|   parsed_timestamp|
+---+-----------+----------+----------+-------------------+---------------+---------------+---------------+-------------------+
|  1| 2023-01-15|15/01/2023|01/15/2023|2023-01-15 10:30:00|     2023-01-15|     2023-01-15|     2023-01-15|2023-01-15 10:30:00|
|  2| 2023-05-20|20/05/2023|05/20/2023|2023-05-20 15:45:00|     2023-05-20|     2023-05-20|     2023-05-20|2023-05-20 15:45:00|
|  3|InvalidDate|31/02/2023|02/31/2023|   InvalidTimestamp|           NULL|           NULL|           NULL|               NULL|
|  4|       NULL|      NULL|      NULL|               NULL|           NULL|           NULL|           NULL|               NULL|
+---+-----------+----------+----------+-------------------+---------------+---------------+-------------

In [17]:
df = df\
        .withColumn('year',year(df.parsed_timestamp))\
        .withColumn('month',month(df.parsed_timestamp))\
        .withColumn('day',dayofmonth(df.parsed_timestamp))\
        .withColumn('hour',hour(df.parsed_timestamp))\
        .withColumn('minute',minute(df.parsed_timestamp))
df.show(truncate=False)

+---+-----------+----------+----------+-------------------+---------------+---------------+---------------+-------------------+----+-----+----+----+------+
|id |date_iso   |date_dmy  |date_mdy  |timestamp          |parsed_date_iso|parsed_date_dmy|parsed_date_mdy|parsed_timestamp   |year|month|day |hour|minute|
+---+-----------+----------+----------+-------------------+---------------+---------------+---------------+-------------------+----+-----+----+----+------+
|1  |2023-01-15 |15/01/2023|01/15/2023|2023-01-15 10:30:00|2023-01-15     |2023-01-15     |2023-01-15     |2023-01-15 10:30:00|2023|1    |15  |10  |30    |
|2  |2023-05-20 |20/05/2023|05/20/2023|2023-05-20 15:45:00|2023-05-20     |2023-05-20     |2023-05-20     |2023-05-20 15:45:00|2023|5    |20  |15  |45    |
|3  |InvalidDate|31/02/2023|02/31/2023|InvalidTimestamp   |NULL           |NULL           |NULL           |NULL               |NULL|NULL |NULL|NULL|NULL  |
|4  |NULL       |NULL      |NULL      |NULL               |NULL 

In [19]:
from pyspark.sql.functions import datediff
df = df.withColumn('days_differ',datediff(df.parsed_date_mdy,df.parsed_date_iso))
df.select('parsed_date_mdy','parsed_date_iso','days_differ').show(truncate=False)

+---------------+---------------+-----------+
|parsed_date_mdy|parsed_date_iso|days_differ|
+---------------+---------------+-----------+
|2023-01-15     |2023-01-15     |0          |
|2023-05-20     |2023-05-20     |0          |
|NULL           |NULL           |NULL       |
|NULL           |NULL           |NULL       |
+---------------+---------------+-----------+



In [20]:
spark.stop()