In [1]:
from pyspark.sql import SparkSession
# Start (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("date_timestamp")
    .getOrCreate()
)

In [3]:
from pyspark.sql.types import StructType, StructField, StringType, DateType, TimestampType
from pyspark.sql.functions import to_date, to_timestamp

# All four columns start out as nullable strings; the date and timestamp
# columns are parsed into proper types in a later cell.
schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in ("name", "birthdate", "joindate", "last_login")
])

# Sample rows: (name, birthdate, joindate, last_login)
data = [
    ("Alice", "1985-05-15", "2010-06-20", "2024-08-12 14:30:00"),
    ("Bob", "1990-08-25", "2015-07-30", "2024-08-13 10:15:00"),
    ("Carol", "1978-12-01", "2005-03-18", "2024-08-10 18:45:00"),
]

# Build the all-string DataFrame from the rows and the schema above
df = spark.createDataFrame(data, schema=schema)

# Display the untruncated rows, then the column types
df.show(truncate=False)
df.printSchema()


+-----+----------+----------+-------------------+
|name |birthdate |joindate  |last_login         |
+-----+----------+----------+-------------------+
|Alice|1985-05-15|2010-06-20|2024-08-12 14:30:00|
|Bob  |1990-08-25|2015-07-30|2024-08-13 10:15:00|
|Carol|1978-12-01|2005-03-18|2024-08-10 18:45:00|
+-----+----------+----------+-------------------+

root
 |-- name: string (nullable = true)
 |-- birthdate: string (nullable = true)
 |-- joindate: string (nullable = true)
 |-- last_login: string (nullable = true)



In [4]:
# Parse the string columns into real date / timestamp types.
# Each pattern matches the layout of the sample strings created earlier.
df = (
    df
    .withColumn("birthdate", to_date("birthdate", "yyyy-MM-dd"))
    .withColumn("joindate", to_date("joindate", "yyyy-MM-dd"))
    .withColumn("last_login", to_timestamp("last_login", "yyyy-MM-dd HH:mm:ss"))
)

# The rows print the same as before; the schema now reports date/timestamp
df.show(truncate=False)
df.printSchema()

+-----+----------+----------+-------------------+
|name |birthdate |joindate  |last_login         |
+-----+----------+----------+-------------------+
|Alice|1985-05-15|2010-06-20|2024-08-12 14:30:00|
|Bob  |1990-08-25|2015-07-30|2024-08-13 10:15:00|
|Carol|1978-12-01|2005-03-18|2024-08-10 18:45:00|
+-----+----------+----------+-------------------+

root
 |-- name: string (nullable = true)
 |-- birthdate: date (nullable = true)
 |-- joindate: date (nullable = true)
 |-- last_login: timestamp (nullable = true)



In [6]:
from pyspark.sql.functions import current_date,current_timestamp

# Append today's date and the current timestamp as two extra columns.
# Both are evaluated whenever an action runs, so the displayed timestamp
# differs from one show() to the next (compare the outputs of later cells).
df = (
    df
    .withColumn("current_date", current_date())
    .withColumn("current_timestamp", current_timestamp())
)

# Display the untruncated rows, then the column types
df.show(truncate=False)
df.printSchema()

+-----+----------+----------+-------------------+------------+-----------------------+
|name |birthdate |joindate  |last_login         |current_date|current_timestamp      |
+-----+----------+----------+-------------------+------------+-----------------------+
|Alice|1985-05-15|2010-06-20|2024-08-12 14:30:00|2024-08-13  |2024-08-13 17:52:00.988|
|Bob  |1990-08-25|2015-07-30|2024-08-13 10:15:00|2024-08-13  |2024-08-13 17:52:00.988|
|Carol|1978-12-01|2005-03-18|2024-08-10 18:45:00|2024-08-13  |2024-08-13 17:52:00.988|
+-----+----------+----------+-------------------+------------+-----------------------+

root
 |-- name: string (nullable = true)
 |-- birthdate: date (nullable = true)
 |-- joindate: date (nullable = true)
 |-- last_login: timestamp (nullable = true)
 |-- current_date: date (nullable = false)
 |-- current_timestamp: timestamp (nullable = false)



In [12]:
from pyspark.sql.functions import date_add,date_sub,datediff,months_between,add_months,current_date,current_timestamp

# Snapshot of the DataFrame before the arithmetic columns are added
df.show(truncate=False)

# Date arithmetic: shift dates by days/months and measure elapsed time
df = (
    df
    # joindate moved 5 days forward / backward
    .withColumn("date_plus_5_days", date_add("joindate", 5))
    .withColumn("date_minus_5_days", date_sub("joindate", 5))
    # whole days elapsed between joindate and today
    .withColumn("days_since_joining", datediff(current_date(), "joindate"))
    # months elapsed between birthdate and today (fractional value)
    .withColumn("months_since_birth", months_between(current_date(), "birthdate"))
    # joindate moved 2 calendar months forward
    .withColumn("joining_plus_2_months", add_months("joindate", 2))
)

# Display the untruncated rows, then the column types
df.show(truncate=False)
df.printSchema()


+-----+----------+----------+-------------------+------------+-----------------------+
|name |birthdate |joindate  |last_login         |current_date|current_timestamp      |
+-----+----------+----------+-------------------+------------+-----------------------+
|Alice|1985-05-15|2010-06-20|2024-08-12 14:30:00|2024-08-13  |2024-08-13 17:55:58.594|
|Bob  |1990-08-25|2015-07-30|2024-08-13 10:15:00|2024-08-13  |2024-08-13 17:55:58.594|
|Carol|1978-12-01|2005-03-18|2024-08-10 18:45:00|2024-08-13  |2024-08-13 17:55:58.594|
+-----+----------+----------+-------------------+------------+-----------------------+

+-----+----------+----------+-------------------+------------+---------------------+----------------+-----------------+------------------+------------------+---------------------+
|name |birthdate |joindate  |last_login         |current_date|current_timestamp    |date_plus_5_days|date_minus_5_days|days_since_joining|months_since_birth|joining_plus_2_months|
+-----+----------+----------+-

In [14]:
from pyspark.sql.functions import unix_timestamp,from_unixtime

# Round-trip last_login: timestamp -> seconds since the Unix epoch,
# then epoch seconds -> text rendered with the given pattern.
df = (
    df
    .withColumn("unix_timestamp", unix_timestamp("last_login"))
    .withColumn(
        "converted_timestamp",
        from_unixtime("unix_timestamp", "yyyy-MM-dd HH:mm:ss"),
    )
)

# Display the untruncated rows, then the column types
df.show(truncate=False)
df.printSchema()


+-----+----------+----------+-------------------+------------+-----------------------+----------------+-----------------+------------------+------------------+---------------------+--------------+-------------------+
|name |birthdate |joindate  |last_login         |current_date|current_timestamp      |date_plus_5_days|date_minus_5_days|days_since_joining|months_since_birth|joining_plus_2_months|unix_timestamp|converted_timestamp|
+-----+----------+----------+-------------------+------------+-----------------------+----------------+-----------------+------------------+------------------+---------------------+--------------+-------------------+
|Alice|1985-05-15|2010-06-20|2024-08-12 14:30:00|2024-08-13  |2024-08-13 17:58:01.651|2010-06-25      |2010-06-15       |5168              |470.93548387      |2010-08-20           |1723453200    |2024-08-12 14:30:00|
|Bob  |1990-08-25|2015-07-30|2024-08-13 10:15:00|2024-08-13  |2024-08-13 17:58:01.651|2015-08-04      |2015-07-25       |3302       

In [15]:
from pyspark.sql.functions import *

# Pull individual calendar / clock fields out of the typed columns
df = (
    df
    # calendar parts of the birth date
    .withColumn("year_of_birth", year("birthdate"))
    .withColumn("month_of_birth", month("birthdate"))
    .withColumn("day_of_birth", dayofmonth("birthdate"))
    # weekday number of the last login (Spark numbers days 1 = Sunday ... 7 = Saturday)
    .withColumn("day_of_week_login", dayofweek("last_login"))
    # clock parts of the last login
    .withColumn("hour_of_login", hour("last_login"))
    .withColumn("minute_of_login", minute("last_login"))
    .withColumn("second_of_login", second("last_login"))
)

# Display the untruncated rows, then the column types
df.show(truncate=False)
df.printSchema()

+-----+----------+----------+-------------------+------------+-----------------------+----------------+-----------------+------------------+------------------+---------------------+--------------+-------------------+-------------+--------------+------------+-----------------+-------------+---------------+---------------+
|name |birthdate |joindate  |last_login         |current_date|current_timestamp      |date_plus_5_days|date_minus_5_days|days_since_joining|months_since_birth|joining_plus_2_months|unix_timestamp|converted_timestamp|year_of_birth|month_of_birth|day_of_birth|day_of_week_login|hour_of_login|minute_of_login|second_of_login|
+-----+----------+----------+-------------------+------------+-----------------------+----------------+-----------------+------------------+------------------+---------------------+--------------+-------------------+-------------+--------------+------------+-----------------+-------------+---------------+---------------+
|Alice|1985-05-15|2010-06-20|20

In [16]:
# Import the helpers this cell uses by name, so it does not rely on the
# wildcard `from pyspark.sql.functions import *` in an earlier cell.
from pyspark.sql.functions import date_format, date_trunc, next_day, trunc

# Formatting: render date / timestamp values as text in a custom layout
df = (
    df
    .withColumn("formatted_date_of_birth", date_format("birthdate", "dd-MM-yyyy"))
    .withColumn("formatted_login", date_format("last_login", "MM/dd/yyyy HH:mm:ss"))
)

# Truncating: snap values down to the start of a unit.
# Note the argument order differs: trunc(column, unit) vs date_trunc(unit, column).
df = (
    df
    .withColumn("truncated_to_month", trunc("joindate", "month"))
    .withColumn("truncated_to_hour", date_trunc("hour", "last_login"))
)

# Other date functions: first Monday strictly after the join date
df = df.withColumn("next_monday_after_joining", next_day("joindate", "Mon"))

# Display the untruncated rows, then the column types
df.show(truncate=False)
df.printSchema()


+-----+----------+----------+-------------------+------------+-----------------------+----------------+-----------------+------------------+------------------+---------------------+--------------+-------------------+-------------+--------------+------------+-----------------+-------------+---------------+---------------+-----------------------+-------------------+------------------+-------------------+-------------------------+
|name |birthdate |joindate  |last_login         |current_date|current_timestamp      |date_plus_5_days|date_minus_5_days|days_since_joining|months_since_birth|joining_plus_2_months|unix_timestamp|converted_timestamp|year_of_birth|month_of_birth|day_of_birth|day_of_week_login|hour_of_login|minute_of_login|second_of_login|formatted_date_of_birth|formatted_login    |truncated_to_month|truncated_to_hour  |next_monday_after_joining|
+-----+----------+----------+-------------------+------------+-----------------------+----------------+-----------------+---------------