In [1]:
from pathlib import Path

import findspark

In [2]:
# Point findspark at the local Spark installation before pyspark is imported.
findspark.init(spark_home="/opt/manual/spark")

In [3]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import *

In [4]:
# Create (or reuse) a SparkSession named "Datetime Ops" with master "local[2]".
spark = (
    SparkSession.builder
    .appName("Datetime Ops")
    .master("local[2]")
    .getOrCreate()
)

In [5]:
! wget -P ~/datasets https://github.com/erkansirin78/datasets/raw/master/Fire_Incidents.csv.gz  

--2025-06-12 16:00:01--  https://github.com/erkansirin78/datasets/raw/master/Fire_Incidents.csv.gz
Resolving github.com (github.com)... 140.82.121.3
Connecting to github.com (github.com)|140.82.121.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/erkansirin78/datasets/master/Fire_Incidents.csv.gz [following]
--2025-06-12 16:00:01--  https://raw.githubusercontent.com/erkansirin78/datasets/master/Fire_Incidents.csv.gz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 41002480 (39M) [application/octet-stream]
Saving to: ‘/home/train/datasets/Fire_Incidents.csv.gz’


2025-06-12 16:00:18 (2.53 MB/s) - ‘/home/train/datasets/Fire_Incidents.csv.gz’ saved [41002480/41002480]



In [6]:
# Load the gzip-compressed CSV with a header row and let Spark infer column types.
# The path is built from the current user's home directory so it matches the
# ~/datasets download location instead of being tied to one user account.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .option("compression", "gzip")
    .csv(f"file://{Path.home() / 'datasets' / 'Fire_Incidents.csv.gz'}")
)

In [7]:
# Count the rows of the loaded DataFrame.
df.count()

533598

In [8]:
# Number of columns in the loaded DataFrame.
len(df.columns)

80

In [9]:
# Date/time columns (read as strings) that the rest of the notebook works with.
ts_cols = [
    "Incident Date",
    "Alarm DtTm",
    "Arrival DtTm",
    "Close DtTm",
]

In [10]:
# Preview the first four rows of the date/time columns (long values are truncated).
df.select(*ts_cols).show(n=4)

+-------------+--------------------+--------------------+--------------------+
|Incident Date|          Alarm DtTm|        Arrival DtTm|          Close DtTm|
+-------------+--------------------+--------------------+--------------------+
|   06/05/2018|06/05/2018 06:38:...|06/05/2018 06:41:...|06/05/2018 06:42:...|
|   08/29/2019|08/29/2019 08:09:...|08/29/2019 08:11:...|08/29/2019 08:12:...|
|   06/14/2018|06/14/2018 08:37:...|06/14/2018 08:40:...|06/14/2018 08:40:...|
|   12/30/2005|12/30/2005 10:40:...|12/30/2005 10:46:...|12/30/2005 11:37:...|
+-------------+--------------------+--------------------+--------------------+
only showing top 4 rows



In [11]:
# Inspect the inferred types of the date/time columns.
df.select(*ts_cols).printSchema()

root
 |-- Incident Date: string (nullable = true)
 |-- Alarm DtTm: string (nullable = true)
 |-- Arrival DtTm: string (nullable = true)
 |-- Close DtTm: string (nullable = true)



In [13]:
# Parse the 12-hour "Alarm DtTm" strings into a timestamp column and preview the result.
(
    df.select(ts_cols)
    .withColumn(
        "Alarm DtTm2",
        F.to_timestamp(F.col("Alarm DtTm"), "MM/dd/yyyy hh:mm:ss a"),
    )
    .show(n=4, truncate=False)
)

+-------------+----------------------+----------------------+----------------------+-------------------+
|Incident Date|Alarm DtTm            |Arrival DtTm          |Close DtTm            |Alarm DtTm2        |
+-------------+----------------------+----------------------+----------------------+-------------------+
|06/05/2018   |06/05/2018 06:38:01 PM|06/05/2018 06:41:59 PM|06/05/2018 06:42:12 PM|2018-06-05 18:38:01|
|08/29/2019   |08/29/2019 08:09:25 PM|08/29/2019 08:11:54 PM|08/29/2019 08:12:24 PM|2019-08-29 20:09:25|
|06/14/2018   |06/14/2018 08:37:56 PM|06/14/2018 08:40:37 PM|06/14/2018 08:40:52 PM|2018-06-14 20:37:56|
|12/30/2005   |12/30/2005 10:40:27 PM|12/30/2005 10:46:33 PM|12/30/2005 11:37:23 PM|2005-12-30 22:40:27|
+-------------+----------------------+----------------------+----------------------+-------------------+
only showing top 4 rows



In [14]:
# Keep the date/time columns plus "Alarm DtTm2", the alarm time parsed as a timestamp.
df2 = (
    df.select(ts_cols)
    .withColumn(
        "Alarm DtTm2",
        F.to_timestamp(F.col("Alarm DtTm"), "MM/dd/yyyy hh:mm:ss a"),
    )
)

In [15]:
# Confirm that the new "Alarm DtTm2" column has the timestamp type.
df2.printSchema()

root
 |-- Incident Date: string (nullable = true)
 |-- Alarm DtTm: string (nullable = true)
 |-- Arrival DtTm: string (nullable = true)
 |-- Close DtTm: string (nullable = true)
 |-- Alarm DtTm2: timestamp (nullable = true)



# Date/Time Operations-2: Unix Timestamp

In [17]:
# Convert the "Alarm DtTm" strings to Unix epoch seconds and preview the result.
(
    df.select(ts_cols)
    .withColumn(
        "Alarm DtTm_UT",
        F.unix_timestamp(F.col("Alarm DtTm"), "MM/dd/yyyy hh:mm:ss a"),
    )
    .show(n=4, truncate=False)
)

+-------------+----------------------+----------------------+----------------------+-------------+
|Incident Date|Alarm DtTm            |Arrival DtTm          |Close DtTm            |Alarm DtTm_UT|
+-------------+----------------------+----------------------+----------------------+-------------+
|06/05/2018   |06/05/2018 06:38:01 PM|06/05/2018 06:41:59 PM|06/05/2018 06:42:12 PM|1528213081   |
|08/29/2019   |08/29/2019 08:09:25 PM|08/29/2019 08:11:54 PM|08/29/2019 08:12:24 PM|1567098565   |
|06/14/2018   |06/14/2018 08:37:56 PM|06/14/2018 08:40:37 PM|06/14/2018 08:40:52 PM|1528997876   |
|12/30/2005   |12/30/2005 10:40:27 PM|12/30/2005 10:46:33 PM|12/30/2005 11:37:23 PM|1135975227   |
+-------------+----------------------+----------------------+----------------------+-------------+
only showing top 4 rows



In [19]:
# Keep the date/time columns plus "Alarm DtTm_UT", the alarm time as Unix epoch seconds.
df3 = (
    df.select(ts_cols)
    .withColumn(
        "Alarm DtTm_UT",
        F.unix_timestamp(F.col("Alarm DtTm"), "MM/dd/yyyy hh:mm:ss a"),
    )
)

In [21]:
# Turn the epoch-seconds column back into a timestamp and preview the round trip.
(
    df3
    .withColumn("From_DtTm_UT", F.to_timestamp(F.col("Alarm DtTm_UT")))
    .show(n=4, truncate=False)
)

+-------------+----------------------+----------------------+----------------------+-------------+-------------------+
|Incident Date|Alarm DtTm            |Arrival DtTm          |Close DtTm            |Alarm DtTm_UT|From_DtTm_UT       |
+-------------+----------------------+----------------------+----------------------+-------------+-------------------+
|06/05/2018   |06/05/2018 06:38:01 PM|06/05/2018 06:41:59 PM|06/05/2018 06:42:12 PM|1528213081   |2018-06-05 18:38:01|
|08/29/2019   |08/29/2019 08:09:25 PM|08/29/2019 08:11:54 PM|08/29/2019 08:12:24 PM|1567098565   |2019-08-29 20:09:25|
|06/14/2018   |06/14/2018 08:37:56 PM|06/14/2018 08:40:37 PM|06/14/2018 08:40:52 PM|1528997876   |2018-06-14 20:37:56|
|12/30/2005   |12/30/2005 10:40:27 PM|12/30/2005 10:46:33 PM|12/30/2005 11:37:23 PM|1135975227   |2005-12-30 22:40:27|
+-------------+----------------------+----------------------+----------------------+-------------+-------------------+
only showing top 4 rows



In [22]:
# Add "From_DtTm_UT": the epoch-seconds column converted back into a timestamp.
df4 = df3.withColumn(
    "From_DtTm_UT",
    F.to_timestamp(F.col("Alarm DtTm_UT")),
)

In [23]:
# Check the column types: epoch seconds as long, the converted column as timestamp.
df4.printSchema()

root
 |-- Incident Date: string (nullable = true)
 |-- Alarm DtTm: string (nullable = true)
 |-- Arrival DtTm: string (nullable = true)
 |-- Close DtTm: string (nullable = true)
 |-- Alarm DtTm_UT: long (nullable = true)
 |-- From_DtTm_UT: timestamp (nullable = true)



## From string to date and Date Format

In [25]:
# Parse the "Incident Date" strings (MM/dd/yyyy) into a date column and preview the result.
(
    df.select(ts_cols)
    .withColumn(
        "Incident_Date_Date_Format",
        F.to_date(F.col("Incident Date"), "MM/dd/yyyy"),
    )
    .show(n=4, truncate=False)
)

+-------------+----------------------+----------------------+----------------------+-------------------------+
|Incident Date|Alarm DtTm            |Arrival DtTm          |Close DtTm            |Incident_Date_Date_Format|
+-------------+----------------------+----------------------+----------------------+-------------------------+
|06/05/2018   |06/05/2018 06:38:01 PM|06/05/2018 06:41:59 PM|06/05/2018 06:42:12 PM|2018-06-05               |
|08/29/2019   |08/29/2019 08:09:25 PM|08/29/2019 08:11:54 PM|08/29/2019 08:12:24 PM|2019-08-29               |
|06/14/2018   |06/14/2018 08:37:56 PM|06/14/2018 08:40:37 PM|06/14/2018 08:40:52 PM|2018-06-14               |
|12/30/2005   |12/30/2005 10:40:27 PM|12/30/2005 10:46:33 PM|12/30/2005 11:37:23 PM|2005-12-30               |
+-------------+----------------------+----------------------+----------------------+-------------------------+
only showing top 4 rows



In [26]:
# Build the DataFrame first and display it afterwards: .show() only prints and
# returns None, so chaining it into the assignment would leave df5 set to None.
df5 = df.select(ts_cols).withColumn(
    "Incident_Date_Date_Format",
    F.to_date(F.col("Incident Date"), "MM/dd/yyyy"),
)
df5.show(n=4, truncate=False)

+-------------+----------------------+----------------------+----------------------+-------------------------+
|Incident Date|Alarm DtTm            |Arrival DtTm          |Close DtTm            |Incident_Date_Date_Format|
+-------------+----------------------+----------------------+----------------------+-------------------------+
|06/05/2018   |06/05/2018 06:38:01 PM|06/05/2018 06:41:59 PM|06/05/2018 06:42:12 PM|2018-06-05               |
|08/29/2019   |08/29/2019 08:09:25 PM|08/29/2019 08:11:54 PM|08/29/2019 08:12:24 PM|2019-08-29               |
|06/14/2018   |06/14/2018 08:37:56 PM|06/14/2018 08:40:37 PM|06/14/2018 08:40:52 PM|2018-06-14               |
|12/30/2005   |12/30/2005 10:40:27 PM|12/30/2005 10:46:33 PM|12/30/2005 11:37:23 PM|2005-12-30               |
+-------------+----------------------+----------------------+----------------------+-------------------------+
only showing top 4 rows



In [30]:
# Keep the date/time columns plus "Incident_Date_D", the incident date parsed as a date.
df5 = (
    df.select(ts_cols)
    .withColumn(
        "Incident_Date_D",
        F.to_date(F.col("Incident Date"), "MM/dd/yyyy"),
    )
)

In [31]:
# Confirm that "Incident_Date_D" has the date type.
df5.printSchema()

root
 |-- Incident Date: string (nullable = true)
 |-- Alarm DtTm: string (nullable = true)
 |-- Arrival DtTm: string (nullable = true)
 |-- Close DtTm: string (nullable = true)
 |-- Incident_Date_D: date (nullable = true)



In [32]:
# Render the date as a custom-formatted string using ":" as the date separator.
# The time part uses HH:mm (hours:minutes); the earlier "HH:ss" pattern skipped
# minutes and printed hours:seconds. The column is a plain date, so it shows 00:00.
(
    df5
    .withColumn(
        "Incident_Date_D",
        F.date_format(F.col("Incident_Date_D"), "yyyy:MM:dd HH:mm"),
    )
    .show(n=4, truncate=False)
)

+-------------+----------------------+----------------------+----------------------+----------------+
|Incident Date|Alarm DtTm            |Arrival DtTm          |Close DtTm            |Incident_Date_D |
+-------------+----------------------+----------------------+----------------------+----------------+
|06/05/2018   |06/05/2018 06:38:01 PM|06/05/2018 06:41:59 PM|06/05/2018 06:42:12 PM|2018:06:05 00:00|
|08/29/2019   |08/29/2019 08:09:25 PM|08/29/2019 08:11:54 PM|08/29/2019 08:12:24 PM|2019:08:29 00:00|
|06/14/2018   |06/14/2018 08:37:56 PM|06/14/2018 08:40:37 PM|06/14/2018 08:40:52 PM|2018:06:14 00:00|
|12/30/2005   |12/30/2005 10:40:27 PM|12/30/2005 10:46:33 PM|12/30/2005 11:37:23 PM|2005:12:30 00:00|
+-------------+----------------------+----------------------+----------------------+----------------+
only showing top 4 rows



In [33]:
# Render the date as a custom-formatted string using "-" as the date separator.
# The time part uses HH:mm (hours:minutes); the earlier "HH:ss" pattern skipped
# minutes and printed hours:seconds. The column is a plain date, so it shows 00:00.
(
    df5
    .withColumn(
        "Incident_Date_D",
        F.date_format(F.col("Incident_Date_D"), "yyyy-MM-dd HH:mm"),
    )
    .show(n=4, truncate=False)
)

+-------------+----------------------+----------------------+----------------------+----------------+
|Incident Date|Alarm DtTm            |Arrival DtTm          |Close DtTm            |Incident_Date_D |
+-------------+----------------------+----------------------+----------------------+----------------+
|06/05/2018   |06/05/2018 06:38:01 PM|06/05/2018 06:41:59 PM|06/05/2018 06:42:12 PM|2018-06-05 00:00|
|08/29/2019   |08/29/2019 08:09:25 PM|08/29/2019 08:11:54 PM|08/29/2019 08:12:24 PM|2019-08-29 00:00|
|06/14/2018   |06/14/2018 08:37:56 PM|06/14/2018 08:40:37 PM|06/14/2018 08:40:52 PM|2018-06-14 00:00|
|12/30/2005   |12/30/2005 10:40:27 PM|12/30/2005 10:46:33 PM|12/30/2005 11:37:23 PM|2005-12-30 00:00|
+-------------+----------------------+----------------------+----------------------+----------------+
only showing top 4 rows



## Get year from ts

In [34]:
# Rebuild df2 with typed columns: "Alarm DtTm" becomes a timestamp and
# "Incident Date" becomes a date (both replace the original string columns).
df2 = (
    df.select(ts_cols)
    .withColumn(
        "Alarm DtTm",
        F.to_timestamp(F.col("Alarm DtTm"), "MM/dd/yyyy hh:mm:ss a"),
    )
    .withColumn(
        "Incident Date",
        F.to_date(F.col("Incident Date"), "MM/dd/yyyy"),
    )
)

In [35]:
# Extract the year from the alarm timestamp.
df3 = df2.withColumn(
    "Alarm_Year",
    F.year(F.col("Alarm DtTm")),
)
df3.show(n=4, truncate=False)

+-------------+-------------------+----------------------+----------------------+----------+
|Incident Date|Alarm DtTm         |Arrival DtTm          |Close DtTm            |Alarm_Year|
+-------------+-------------------+----------------------+----------------------+----------+
|2018-06-05   |2018-06-05 18:38:01|06/05/2018 06:41:59 PM|06/05/2018 06:42:12 PM|2018      |
|2019-08-29   |2019-08-29 20:09:25|08/29/2019 08:11:54 PM|08/29/2019 08:12:24 PM|2019      |
|2018-06-14   |2018-06-14 20:37:56|06/14/2018 08:40:37 PM|06/14/2018 08:40:52 PM|2018      |
|2005-12-30   |2005-12-30 22:40:27|12/30/2005 10:46:33 PM|12/30/2005 11:37:23 PM|2005      |
+-------------+-------------------+----------------------+----------------------+----------+
only showing top 4 rows



## Get month from ts

In [36]:
# Extract the month number from the alarm timestamp.
df4 = df2.withColumn(
    "Alarm_Month",
    F.month(F.col("Alarm DtTm")),
)
df4.show(n=4, truncate=False)

+-------------+-------------------+----------------------+----------------------+-----------+
|Incident Date|Alarm DtTm         |Arrival DtTm          |Close DtTm            |Alarm_Month|
+-------------+-------------------+----------------------+----------------------+-----------+
|2018-06-05   |2018-06-05 18:38:01|06/05/2018 06:41:59 PM|06/05/2018 06:42:12 PM|6          |
|2019-08-29   |2019-08-29 20:09:25|08/29/2019 08:11:54 PM|08/29/2019 08:12:24 PM|8          |
|2018-06-14   |2018-06-14 20:37:56|06/14/2018 08:40:37 PM|06/14/2018 08:40:52 PM|6          |
|2005-12-30   |2005-12-30 22:40:27|12/30/2005 10:46:33 PM|12/30/2005 11:37:23 PM|12         |
+-------------+-------------------+----------------------+----------------------+-----------+
only showing top 4 rows



## Get day of month

In [37]:
# Extract the day of the month from the alarm timestamp.
df5 = df2.withColumn(
    "Alarm_Day",
    F.dayofmonth(F.col("Alarm DtTm")),
)
df5.show(n=4, truncate=False)

+-------------+-------------------+----------------------+----------------------+---------+
|Incident Date|Alarm DtTm         |Arrival DtTm          |Close DtTm            |Alarm_Day|
+-------------+-------------------+----------------------+----------------------+---------+
|2018-06-05   |2018-06-05 18:38:01|06/05/2018 06:41:59 PM|06/05/2018 06:42:12 PM|5        |
|2019-08-29   |2019-08-29 20:09:25|08/29/2019 08:11:54 PM|08/29/2019 08:12:24 PM|29       |
|2018-06-14   |2018-06-14 20:37:56|06/14/2018 08:40:37 PM|06/14/2018 08:40:52 PM|14       |
|2005-12-30   |2005-12-30 22:40:27|12/30/2005 10:46:33 PM|12/30/2005 11:37:23 PM|30       |
+-------------+-------------------+----------------------+----------------------+---------+
only showing top 4 rows



## Get day name

In [38]:
# Format the alarm timestamp with the 'E' pattern to get the abbreviated day name (e.g. Tue).
df5 = df2.withColumn(
    "Alarm_Day_Name",
    F.date_format(F.col("Alarm DtTm"), 'E'),
)
df5.show(n=4, truncate=False)

+-------------+-------------------+----------------------+----------------------+--------------+
|Incident Date|Alarm DtTm         |Arrival DtTm          |Close DtTm            |Alarm_Day_Name|
+-------------+-------------------+----------------------+----------------------+--------------+
|2018-06-05   |2018-06-05 18:38:01|06/05/2018 06:41:59 PM|06/05/2018 06:42:12 PM|Tue           |
|2019-08-29   |2019-08-29 20:09:25|08/29/2019 08:11:54 PM|08/29/2019 08:12:24 PM|Thu           |
|2018-06-14   |2018-06-14 20:37:56|06/14/2018 08:40:37 PM|06/14/2018 08:40:52 PM|Thu           |
|2005-12-30   |2005-12-30 22:40:27|12/30/2005 10:46:33 PM|12/30/2005 11:37:23 PM|Fri           |
+-------------+-------------------+----------------------+----------------------+--------------+
only showing top 4 rows



In [39]:
# Stop the SparkSession now that the notebook is finished.
spark.stop()