In [1]:
# https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html

In [2]:
import findspark

In [3]:
# /opt/manual/spark: this is SPARK_HOME path
findspark.init("/opt/manual/spark")

In [4]:
from pyspark.sql import SparkSession, functions as F

In [5]:
spark = SparkSession.builder \
.appName("Datetime Ops") \
.master("local[2]") \
.getOrCreate()

2022-08-27 19:45:03,953 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [6]:
# Data source: https://data.sfgov.org/Public-Safety/Fire-Incidents/wr8u-xric/data

In [7]:
! wget -O ~/datasets/Fire_Incidents.csv.gz \
https://github.com/erkansirin78/datasets/raw/master/Fire_Incidents.csv.gz

--2022-08-27 19:45:11--  https://github.com/erkansirin78/datasets/raw/master/Fire_Incidents.csv.gz
Resolving github.com (github.com)... 140.82.121.3
Connecting to github.com (github.com)|140.82.121.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/erkansirin78/datasets/master/Fire_Incidents.csv.gz [following]
--2022-08-27 19:45:11--  https://raw.githubusercontent.com/erkansirin78/datasets/master/Fire_Incidents.csv.gz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 41002480 (39M) [application/octet-stream]
Saving to: ‘/home/train/datasets/Fire_Incidents.csv.gz’


2022-08-27 19:45:28 (2.78 MB/s) - ‘/home/train/datasets/Fire_Incidents.csv.gz’ saved [41002480/41002480]



In [8]:
! ls -l ~/datasets | grep Fire

-rw-rw-r--. 1 train train 41002480 Aug 27 19:45 Fire_Incidents.csv.gz


In [9]:
df = spark.read \
.format("csv") \
.option("header", True) \
.option("inferSchema", True) \
.option("compression","gzip") \
.load("file:///home/train/datasets/Fire_Incidents.csv.gz")

                                                                                

In [10]:
df.count()

                                                                                

533598

In [11]:
len(df.columns)

80

In [12]:
df.columns

['Incident Number',
 'Exposure Number',
 'ID',
 'Address',
 'Incident Date',
 'Call Number',
 'Alarm DtTm',
 'Arrival DtTm',
 'Close DtTm',
 'City',
 'ZIP Code',
 'Battalion',
 'Station Area',
 'Box',
 'Suppression Units',
 'Suppression Personnel',
 'EMS Units',
 'EMS Personnel',
 'Other Units',
 'Other Personnel',
 'First Unit On Scene',
 'Estimated Property Loss',
 'Estimated Contents Loss',
 'Fire Fatalities',
 'Fire Injuries',
 'Civilian Fatalities',
 'Civilian Injuries',
 'Number of Alarms',
 'Primary Situation',
 'Mutual Aid',
 'Action Taken Primary',
 'Action Taken Secondary',
 'Action Taken Other',
 'Detector Alerted Occupants',
 'Property Use',
 'Area of Fire Origin',
 'Ignition Cause',
 'Ignition Factor Primary',
 'Ignition Factor Secondary',
 'Heat Source',
 'Item First Ignited',
 'Human Factors Associated with Ignition',
 'Structure Type',
 'Structure Status',
 'Floor of Fire Origin',
 'Fire Spread',
 'No Flame Spead',
 'Number of floors with minimum damage',
 'Number of fl

In [13]:
ts_cols = ['Incident Date','Alarm DtTm']

In [14]:
df.select(ts_cols).show(n=4, truncate=False)

+-------------+----------------------+
|Incident Date|Alarm DtTm            |
+-------------+----------------------+
|06/05/2018   |06/05/2018 06:38:01 PM|
|08/29/2019   |08/29/2019 08:09:25 PM|
|06/14/2018   |06/14/2018 08:37:56 PM|
|12/30/2005   |12/30/2005 10:40:27 PM|
+-------------+----------------------+
only showing top 4 rows



In [15]:
# Create new dataframe which includes only date time columns

df2 = df.select(ts_cols)

In [16]:
df2.dtypes

[('Incident Date', 'string'), ('Alarm DtTm', 'string')]

<h1 style="color:blue;">From String to Date Time Conversion</h1>

# Date and Time Operations - to_timestamp

In [17]:
# When spark reads text or csv schema it accepts every column as string type.
# When we tell spark to inferSchema it takes sample and infers datatype.
# Most of time it infers as we expected but in case date and time things get complicated due to datetime formats
# If datetime format is out of standard spark considers it as string
# Therefore most of the strugle is just convert string into datetime types
# To convert properly you have to define existing format according to this:
# https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html
# Otherwise your conversion results in None or null


# In our practice dataset;
# ('Incident Date', 'string'),
# ('Alarm DtTm', 'string'),
# ('Arrival DtTm', 'string'),
# ('Close DtTm', 'string'),
# These columns ts but spark inferred them as string due out of standard ts format
# We have to handle manually and modify schema by casting/converting them into ts type

In [18]:
df2.limit(5).toPandas()

Unnamed: 0,Incident Date,Alarm DtTm
0,06/05/2018,06/05/2018 06:38:01 PM
1,08/29/2019,08/29/2019 08:09:25 PM
2,06/14/2018,06/14/2018 08:37:56 PM
3,12/30/2005,12/30/2005 10:40:27 PM
4,09/13/2018,09/13/2018 08:30:38 PM


## Wrong definition of format

In [20]:
# MM/dd/yyyy hh:mm:ss a

In [21]:
df3 = df2.withColumn("Alarm_DtTm_New", 
                     F.to_timestamp(F.col("Alarm DtTm"), 'MM/dd/yyyy HH:mm:ss a'))
df3.limit(2).toPandas()

Unnamed: 0,Incident Date,Alarm DtTm,Alarm_DtTm_New
0,06/05/2018,06/05/2018 06:38:01 PM,NaT
1,08/29/2019,08/29/2019 08:09:25 PM,NaT


In [22]:
# Time is AM-PM not 24 hour so the hour should be hh not HH
# As you see even slight mistake results in None

## Correct definition of format

In [23]:

df3 = df2.withColumn("Alarm_DtTm", 
                     F.to_timestamp(F.col("Alarm DtTm"), 'MM/dd/yyyy hh:mm:ss a'))

In [24]:
df3.limit(2).toPandas()

Unnamed: 0,Incident Date,Alarm DtTm,Alarm_DtTm
0,06/05/2018,06/05/2018 06:38:01 PM,2018-06-05 18:38:01
1,08/29/2019,08/29/2019 08:09:25 PM,2019-08-29 20:09:25


In [25]:
df3.printSchema()

root
 |-- Incident Date: string (nullable = true)
 |-- Alarm DtTm: string (nullable = true)
 |-- Alarm_DtTm: timestamp (nullable = true)



# Date and Time Operations - unix_timestamp and from_unixtime 

In [26]:
df3 = df2.withColumn("Alarm_DtTm", 
                     F.unix_timestamp(F.col("Alarm DtTm"), 'MM/dd/yyyy hh:mm:ss a'))
df3.limit(3).toPandas()

Unnamed: 0,Incident Date,Alarm DtTm,Alarm_DtTm
0,06/05/2018,06/05/2018 06:38:01 PM,1528213081
1,08/29/2019,08/29/2019 08:09:25 PM,1567098565
2,06/14/2018,06/14/2018 08:37:56 PM,1528997876


In [27]:
df3.printSchema()

root
 |-- Incident Date: string (nullable = true)
 |-- Alarm DtTm: string (nullable = true)
 |-- Alarm_DtTm: long (nullable = true)



In [28]:
df3 = df2.withColumn("Alarm_UnixTS", 
                     F.unix_timestamp(F.col("Alarm DtTm"), 'MM/dd/yyyy hh:mm:ss a')) \
.withColumn("Alarm_DtTm", F.to_timestamp(F.col("Alarm_UnixTS")))

df3.limit(3).toPandas()

Unnamed: 0,Incident Date,Alarm DtTm,Alarm_UnixTS,Alarm_DtTm
0,06/05/2018,06/05/2018 06:38:01 PM,1528213081,2018-06-05 18:38:01
1,08/29/2019,08/29/2019 08:09:25 PM,1567098565,2019-08-29 20:09:25
2,06/14/2018,06/14/2018 08:37:56 PM,1528997876,2018-06-14 20:37:56


In [29]:
df3.printSchema()

root
 |-- Incident Date: string (nullable = true)
 |-- Alarm DtTm: string (nullable = true)
 |-- Alarm_UnixTS: long (nullable = true)
 |-- Alarm_DtTm: timestamp (nullable = true)



# Date and Time Operations - to_date

In [30]:
df4 = df3.withColumn("Incident_Date", 
                     F.to_date(F.col("Incident Date"), 'MM/dd/yyyy'))
df4.limit(3).toPandas()

Unnamed: 0,Incident Date,Alarm DtTm,Alarm_UnixTS,Alarm_DtTm,Incident_Date
0,06/05/2018,06/05/2018 06:38:01 PM,1528213081,2018-06-05 18:38:01,2018-06-05
1,08/29/2019,08/29/2019 08:09:25 PM,1567098565,2019-08-29 20:09:25,2019-08-29
2,06/14/2018,06/14/2018 08:37:56 PM,1528997876,2018-06-14 20:37:56,2018-06-14


In [31]:
df4.printSchema()

root
 |-- Incident Date: string (nullable = true)
 |-- Alarm DtTm: string (nullable = true)
 |-- Alarm_UnixTS: long (nullable = true)
 |-- Alarm_DtTm: timestamp (nullable = true)
 |-- Incident_Date: date (nullable = true)



<h1 style="color:blue;">From Date Time to String  Conversion</h1>

## Convert date and timestamp columns to string in a desired format

In [32]:
df5 = df4.withColumn("Incident_Date_Str", 
                     F.date_format(F.col("Incident_Date"), 'MM--dd--yyyy'))


df5.limit(3).toPandas()

Unnamed: 0,Incident Date,Alarm DtTm,Alarm_UnixTS,Alarm_DtTm,Incident_Date,Incident_Date_Str
0,06/05/2018,06/05/2018 06:38:01 PM,1528213081,2018-06-05 18:38:01,2018-06-05,06--05--2018
1,08/29/2019,08/29/2019 08:09:25 PM,1567098565,2019-08-29 20:09:25,2019-08-29,08--29--2019
2,06/14/2018,06/14/2018 08:37:56 PM,1528997876,2018-06-14 20:37:56,2018-06-14,06--14--2018


In [33]:
df5 = df4.withColumn("Incident_Date_Str", 
                     F.date_format(F.col("Incident_Date"), 'yyyy:MM:dd'))
df5.limit(3).toPandas()

Unnamed: 0,Incident Date,Alarm DtTm,Alarm_UnixTS,Alarm_DtTm,Incident_Date,Incident_Date_Str
0,06/05/2018,06/05/2018 06:38:01 PM,1528213081,2018-06-05 18:38:01,2018-06-05,2018:06:05
1,08/29/2019,08/29/2019 08:09:25 PM,1567098565,2019-08-29 20:09:25,2019-08-29,2019:08:29
2,06/14/2018,06/14/2018 08:37:56 PM,1528997876,2018-06-14 20:37:56,2018-06-14,2018:06:14


In [34]:
df5 = df4.withColumn("Alarm_DtTm_Str", 
                     F.date_format(F.col("Alarm_DtTm"), 'yyyy-MM-dd hh:ss a'))
df5.limit(3).toPandas()

Unnamed: 0,Incident Date,Alarm DtTm,Alarm_UnixTS,Alarm_DtTm,Incident_Date,Alarm_DtTm_Str
0,06/05/2018,06/05/2018 06:38:01 PM,1528213081,2018-06-05 18:38:01,2018-06-05,2018-06-05 06:01 PM
1,08/29/2019,08/29/2019 08:09:25 PM,1567098565,2019-08-29 20:09:25,2019-08-29,2019-08-29 08:25 PM
2,06/14/2018,06/14/2018 08:37:56 PM,1528997876,2018-06-14 20:37:56,2018-06-14,2018-06-14 08:56 PM


In [35]:
df5 = df4.withColumn("Alarm_DtTm_Str", 
                     F.date_format(F.col("Alarm_DtTm"), 'yyyy MMMM dd E'))
df5.limit(3).toPandas()

Unnamed: 0,Incident Date,Alarm DtTm,Alarm_UnixTS,Alarm_DtTm,Incident_Date,Alarm_DtTm_Str
0,06/05/2018,06/05/2018 06:38:01 PM,1528213081,2018-06-05 18:38:01,2018-06-05,2018 June 05 Tue
1,08/29/2019,08/29/2019 08:09:25 PM,1567098565,2019-08-29 20:09:25,2019-08-29,2019 August 29 Thu
2,06/14/2018,06/14/2018 08:37:56 PM,1528997876,2018-06-14 20:37:56,2018-06-14,2018 June 14 Thu


In [36]:
df5 = df4.withColumn("Alarm_DtTm_Str", 
                     F.date_format(F.col("Alarm_DtTm"), 'yyyy MMMM dd VV'))
df5.limit(3).toPandas()

Unnamed: 0,Incident Date,Alarm DtTm,Alarm_UnixTS,Alarm_DtTm,Incident_Date,Alarm_DtTm_Str
0,06/05/2018,06/05/2018 06:38:01 PM,1528213081,2018-06-05 18:38:01,2018-06-05,2018 June 05 Europe/Istanbul
1,08/29/2019,08/29/2019 08:09:25 PM,1567098565,2019-08-29 20:09:25,2019-08-29,2019 August 29 Europe/Istanbul
2,06/14/2018,06/14/2018 08:37:56 PM,1528997876,2018-06-14 20:37:56,2018-06-14,2018 June 14 Europe/Istanbul


In [37]:
df5 = df4.withColumn("Alarm_DtTm_Str", 
                     F.date_format(F.col("Alarm_DtTm"), 'yyyy MMMM dd OOOO'))
df5.limit(3).toPandas()

Unnamed: 0,Incident Date,Alarm DtTm,Alarm_UnixTS,Alarm_DtTm,Incident_Date,Alarm_DtTm_Str
0,06/05/2018,06/05/2018 06:38:01 PM,1528213081,2018-06-05 18:38:01,2018-06-05,2018 June 05 GMT+03:00
1,08/29/2019,08/29/2019 08:09:25 PM,1567098565,2019-08-29 20:09:25,2019-08-29,2019 August 29 GMT+03:00
2,06/14/2018,06/14/2018 08:37:56 PM,1528997876,2018-06-14 20:37:56,2018-06-14,2018 June 14 GMT+03:00


In [38]:
spark.stop()