In [1]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import StructType, StructField, DoubleType, StringType, IntegerType, DateType, ArrayType, FloatType

In [2]:
spark = SparkSession.builder \
.master("local[2]") \
.appName("Datetime Ops") \
.getOrCreate()

2023-02-19 10:03:51,445 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
! wget -O ~/datasets/Fire_Incidents.csv.gz \
https://github.com/erkansirin78/datasets/raw/master/Fire_Incidents.csv.gz

--2023-02-19 10:04:07--  https://github.com/erkansirin78/datasets/raw/master/Fire_Incidents.csv.gz
Resolving github.com (github.com)... 140.82.121.3
Connecting to github.com (github.com)|140.82.121.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/erkansirin78/datasets/master/Fire_Incidents.csv.gz [following]
--2023-02-19 10:04:07--  https://raw.githubusercontent.com/erkansirin78/datasets/master/Fire_Incidents.csv.gz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 41002480 (39M) [application/octet-stream]
Saving to: ‘/home/train/datasets/Fire_Incidents.csv.gz’


2023-02-19 10:04:16 (6.02 MB/s) - ‘/home/train/datasets/Fire_Incidents.csv.gz’ saved [41002480/41002480]



# Read Data

In [4]:
df = spark.read \
.format("csv") \
.option("header", True) \
.option("sep", ",") \
.option("inferSchema", True) \
.load("file:///home/train/datasets/Fire_Incidents.csv.gz")

                                                                                

In [5]:
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [6]:
df.limit(2)

2023-02-19 10:05:22,808 WARN util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


Incident Number,Exposure Number,ID,Address,Incident Date,Call Number,Alarm DtTm,Arrival DtTm,Close DtTm,City,ZIP Code,Battalion,Station Area,Box,Suppression Units,Suppression Personnel,EMS Units,EMS Personnel,Other Units,Other Personnel,First Unit On Scene,Estimated Property Loss,Estimated Contents Loss,Fire Fatalities,Fire Injuries,Civilian Fatalities,Civilian Injuries,Number of Alarms,Primary Situation,Mutual Aid,Action Taken Primary,Action Taken Secondary,Action Taken Other,Detector Alerted Occupants,Property Use,Area of Fire Origin,Ignition Cause,Ignition Factor Primary,Ignition Factor Secondary,Heat Source,Item First Ignited,Human Factors Associated with Ignition,Structure Type,Structure Status,Floor of Fire Origin,Fire Spread,No Flame Spead,Number of floors with minimum damage,Number of floors with significant damage,Number of floors with heavy damage,Number of floors with extreme damage,Detectors Present,Detector Type,Detector Operation,Detector Effectiveness,Detector Failure Reason,Automatic Extinguishing System Present,Automatic Extinguishing Sytem Type,Automatic Extinguishing Sytem Perfomance,Automatic Extinguishing Sytem Failure Reason,Number of Sprinkler Heads Operating,Supervisor District,Analysis Neighborhood,point,Neighborhoods (old),Zip Codes,Fire Prevention Districts,Police Districts,Supervisor Districts,Civic Center Harm Reduction Project Boundary,2017 Fix It Zones,HSOC Zones,Central Market/Tenderloin Boundary,Central Market/Tenderloin Boundary Polygon - Updated,HSOC Zones as of 2018-06-05,Neighborhoods,SF Find Neighborhoods,Current Police Districts,Current Supervisor Districts,Analysis Neighborhoods
18066225,0,180662250,Gateview/bayside,06/05/2018,181563203,06/05/2018 06:38:...,06/05/2018 06:41:...,06/05/2018 06:42:...,,,B03,48,2931,2,9,0,0,0,0,,,,0,0,0,0,,"500 service call,...",n none,86 investigate,,,,"000 property use,...",,,,,,,,,,,,na,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
19102786,0,191027860,Bayside Drive,08/29/2019,192413977,08/29/2019 08:09:...,08/29/2019 08:11:...,08/29/2019 08:12:...,,,B03,48,2931,2,9,0,0,0,0,,,,0,0,0,0,,700 false alarm o...,n none,"00 action taken, ...",,,,nnn none,,,,,,,,,,,,na,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [7]:
ts_cols = ['Incident Date','Alarm DtTm']

In [8]:
df.select(ts_cols).show(n=4, truncate=False)

+-------------+----------------------+
|Incident Date|Alarm DtTm            |
+-------------+----------------------+
|06/05/2018   |06/05/2018 06:38:01 PM|
|08/29/2019   |08/29/2019 08:09:25 PM|
|06/14/2018   |06/14/2018 08:37:56 PM|
|12/30/2005   |12/30/2005 10:40:27 PM|
+-------------+----------------------+
only showing top 4 rows



In [9]:
df2 = df.select(ts_cols)

In [10]:
df2.printSchema()

root
 |-- Incident Date: string (nullable = true)
 |-- Alarm DtTm: string (nullable = true)



# From String to Date Time Conversion

In [31]:
Incident_Date_Format = "M/d/y"
Alarm_DtTm_Format = "M/d/y h:m:s a"

In [32]:
df3 = df2.withColumn("Incident Date", F.to_date(F.col("Incident Date"), Incident_Date_Format)) \
.withColumn("Alarm DtTm", F.to_timestamp(F.col("Alarm DtTm"), Alarm_DtTm_Format))

In [33]:
df3.show(n=14, truncate=False)

+-------------+-------------------+
|Incident Date|Alarm DtTm         |
+-------------+-------------------+
|2018-06-05   |2018-06-05 18:38:01|
|2019-08-29   |2019-08-29 20:09:25|
|2018-06-14   |2018-06-14 20:37:56|
|2005-12-30   |2005-12-30 22:40:27|
|2018-09-13   |2018-09-13 20:30:38|
|2018-12-02   |2018-12-02 10:52:18|
|2018-12-24   |2018-12-24 14:03:57|
|2019-02-12   |2019-02-12 17:56:22|
|2019-05-14   |2019-05-14 12:52:45|
|2006-01-10   |2006-01-10 19:46:06|
|2005-10-15   |2005-10-15 22:06:25|
|2017-12-12   |2017-12-12 08:14:47|
|2007-03-12   |2007-03-12 13:47:40|
|2005-12-04   |2005-12-04 19:23:44|
+-------------+-------------------+
only showing top 14 rows



In [27]:
df3.printSchema()

root
 |-- Incident Date: date (nullable = true)
 |-- Alarm DtTm: timestamp (nullable = true)



# Date and Time Operations - unix_timestamp and from_unixtime

In [35]:
df4 = df3.withColumn("Alarm DtTm_Linux", F.unix_timestamp(F.col("Alarm DtTm")))

In [36]:
df4.show(n=14, truncate=False)

+-------------+-------------------+----------------+
|Incident Date|Alarm DtTm         |Alarm DtTm_Linux|
+-------------+-------------------+----------------+
|2018-06-05   |2018-06-05 18:38:01|1528213081      |
|2019-08-29   |2019-08-29 20:09:25|1567098565      |
|2018-06-14   |2018-06-14 20:37:56|1528997876      |
|2005-12-30   |2005-12-30 22:40:27|1135975227      |
|2018-09-13   |2018-09-13 20:30:38|1536859838      |
|2018-12-02   |2018-12-02 10:52:18|1543737138      |
|2018-12-24   |2018-12-24 14:03:57|1545649437      |
|2019-02-12   |2019-02-12 17:56:22|1549983382      |
|2019-05-14   |2019-05-14 12:52:45|1557827565      |
|2006-01-10   |2006-01-10 19:46:06|1136915166      |
|2005-10-15   |2005-10-15 22:06:25|1129403185      |
|2017-12-12   |2017-12-12 08:14:47|1513055687      |
|2007-03-12   |2007-03-12 13:47:40|1173700060      |
|2005-12-04   |2005-12-04 19:23:44|1133717024      |
+-------------+-------------------+----------------+
only showing top 14 rows



In [39]:
df5 = df4.withColumn("From_Linux_DtTm", F.to_timestamp(F.col("Alarm DtTm_Linux")))

In [40]:
df5.show(n=14, truncate=False)

+-------------+-------------------+----------------+-------------------+
|Incident Date|Alarm DtTm         |Alarm DtTm_Linux|From_Linux_DtTm    |
+-------------+-------------------+----------------+-------------------+
|2018-06-05   |2018-06-05 18:38:01|1528213081      |2018-06-05 18:38:01|
|2019-08-29   |2019-08-29 20:09:25|1567098565      |2019-08-29 20:09:25|
|2018-06-14   |2018-06-14 20:37:56|1528997876      |2018-06-14 20:37:56|
|2005-12-30   |2005-12-30 22:40:27|1135975227      |2005-12-30 22:40:27|
|2018-09-13   |2018-09-13 20:30:38|1536859838      |2018-09-13 20:30:38|
|2018-12-02   |2018-12-02 10:52:18|1543737138      |2018-12-02 10:52:18|
|2018-12-24   |2018-12-24 14:03:57|1545649437      |2018-12-24 14:03:57|
|2019-02-12   |2019-02-12 17:56:22|1549983382      |2019-02-12 17:56:22|
|2019-05-14   |2019-05-14 12:52:45|1557827565      |2019-05-14 12:52:45|
|2006-01-10   |2006-01-10 19:46:06|1136915166      |2006-01-10 19:46:06|
|2005-10-15   |2005-10-15 22:06:25|1129403185      

# From Date Time to String Conversion

In [50]:
df6 = df5.withColumn("BI_Format_of_Alarm", F.date_format(F.col("Alarm DtTm"), "y--M--d E"))

In [51]:
df6.show(n=14, truncate=False)

+-------------+-------------------+----------------+-------------------+------------------+
|Incident Date|Alarm DtTm         |Alarm DtTm_Linux|From_Linux_DtTm    |BI_Format_of_Alarm|
+-------------+-------------------+----------------+-------------------+------------------+
|2018-06-05   |2018-06-05 18:38:01|1528213081      |2018-06-05 18:38:01|2018--6--5 Tue    |
|2019-08-29   |2019-08-29 20:09:25|1567098565      |2019-08-29 20:09:25|2019--8--29 Thu   |
|2018-06-14   |2018-06-14 20:37:56|1528997876      |2018-06-14 20:37:56|2018--6--14 Thu   |
|2005-12-30   |2005-12-30 22:40:27|1135975227      |2005-12-30 22:40:27|2005--12--30 Fri  |
|2018-09-13   |2018-09-13 20:30:38|1536859838      |2018-09-13 20:30:38|2018--9--13 Thu   |
|2018-12-02   |2018-12-02 10:52:18|1543737138      |2018-12-02 10:52:18|2018--12--2 Sun   |
|2018-12-24   |2018-12-24 14:03:57|1545649437      |2018-12-24 14:03:57|2018--12--24 Mon  |
|2019-02-12   |2019-02-12 17:56:22|1549983382      |2019-02-12 17:56:22|2019--2-

# Year

In [61]:
df7 = df6.withColumn("Incident_Year", F.weekofyear(F.to_timestamp(F.col("Alarm DtTm_Linux"))))

In [62]:
df7.show(n=14, truncate=False)

+-------------+-------------------+----------------+-------------------+------------------+-------------+
|Incident Date|Alarm DtTm         |Alarm DtTm_Linux|From_Linux_DtTm    |BI_Format_of_Alarm|Incident_Year|
+-------------+-------------------+----------------+-------------------+------------------+-------------+
|2018-06-05   |2018-06-05 18:38:01|1528213081      |2018-06-05 18:38:01|2018--6--5 Tue    |23           |
|2019-08-29   |2019-08-29 20:09:25|1567098565      |2019-08-29 20:09:25|2019--8--29 Thu   |35           |
|2018-06-14   |2018-06-14 20:37:56|1528997876      |2018-06-14 20:37:56|2018--6--14 Thu   |24           |
|2005-12-30   |2005-12-30 22:40:27|1135975227      |2005-12-30 22:40:27|2005--12--30 Fri  |52           |
|2018-09-13   |2018-09-13 20:30:38|1536859838      |2018-09-13 20:30:38|2018--9--13 Thu   |37           |
|2018-12-02   |2018-12-02 10:52:18|1543737138      |2018-12-02 10:52:18|2018--12--2 Sun   |48           |
|2018-12-24   |2018-12-24 14:03:57|1545649437 