In [1]:
from pyspark.sql.functions import *
from pyspark.sql import Window

In [4]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import StructType, StructField, DoubleType, StringType, IntegerType, DateType, ArrayType, FloatType

In [5]:
spark = SparkSession.builder \
.master("local[1]") \
.appName("OrderStatusCheck") \
.getOrCreate()

2023-02-28 23:26:35,489 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [25]:
# Read the input file
df = spark.read \
.format("csv") \
.option("header", True) \
.option("sep", ",") \
.option("inferSchema", True) \
.load("file:///home/train/dataops7/spark/hw4_03_order_status/orderStatusData.csv")

In [26]:
df.show()

+--------+-------------+-----------+-----------+---------+
|ORDER_ID|SUBSCRIBER_ID|STATUS_DATE|STATUS_TIME|   STATUS|
+--------+-------------+-----------+-----------+---------+
|  100159|       200427|   20230223|      83209| ASSIGNED|
|  100159|       200427|   20230223|      74232| RETURNED|
|  100159|       200427|   20230222|      95056|  CREATED|
|  100410|       200366|   20230223|      91017| ASSIGNED|
|  100410|       200366|   20230223|      30301| RETURNED|
|  100410|       200366|   20230222|      93638|  CREATED|
|  100497|       200024|   20230222|     105418|     POOL|
|  100497|       200024|   20230222|     105418|  CREATED|
|  100539|       200012|   20230222|     112855|COMPLETED|
|  100539|       200012|   20230222|      95408|  CREATED|
|  100575|       200573|   20230223|      85951| ASSIGNED|
|  100575|       200573|   20230223|      41932| RETURNED|
|  100575|       200573|   20230222|     105441|  CREATED|
|  100259|       200192|   20230223|      83115| ASSIGNE

In [27]:
# Convert the date and time columns to timestamps
df = df.withColumn("STATUS_DATETIME", to_timestamp(concat_ws(" ", df.STATUS_DATE, df.STATUS_TIME), "yyyyMMdd HHmmss"))

In [28]:
df.show()

+--------+-------------+-----------+-----------+---------+-------------------+
|ORDER_ID|SUBSCRIBER_ID|STATUS_DATE|STATUS_TIME|   STATUS|    STATUS_DATETIME|
+--------+-------------+-----------+-----------+---------+-------------------+
|  100159|       200427|   20230223|      83209| ASSIGNED|               null|
|  100159|       200427|   20230223|      74232| RETURNED|               null|
|  100159|       200427|   20230222|      95056|  CREATED|               null|
|  100410|       200366|   20230223|      91017| ASSIGNED|               null|
|  100410|       200366|   20230223|      30301| RETURNED|               null|
|  100410|       200366|   20230222|      93638|  CREATED|               null|
|  100497|       200024|   20230222|     105418|     POOL|2023-02-22 10:54:18|
|  100497|       200024|   20230222|     105418|  CREATED|2023-02-22 10:54:18|
|  100539|       200012|   20230222|     112855|COMPLETED|2023-02-22 11:28:55|
|  100539|       200012|   20230222|      95408|  CR

In [29]:
# Create a window partitioned by ORDER_ID and sorted by STATUS_DATETIME
window_spec = Window.partitionBy("ORDER_ID").orderBy("STATUS_DATETIME")

In [30]:
# Create a column for the previous status
df = df.withColumn("PREV_STATUS", lag("STATUS", 1).over(window_spec))

In [32]:
spark.sql("set spark.sql.legacy.timeParserPolicy=CORRECTED")

DataFrame[key: string, value: string]

In [33]:
df.show()

+--------+-------------+-----------+-----------+---------+-------------------+-----------+
|ORDER_ID|SUBSCRIBER_ID|STATUS_DATE|STATUS_TIME|   STATUS|    STATUS_DATETIME|PREV_STATUS|
+--------+-------------+-----------+-----------+---------+-------------------+-----------+
|  100170|       200497|   20230222|      94140|  CREATED|               null|       null|
|  100170|       200497|   20230222|     162938| RETURNED|2023-02-22 16:29:38|    CREATED|
|  100170|       200497|   20230222|     164206|COMPLETED|2023-02-22 16:42:06|   RETURNED|
|  100274|       200242|   20230222|      95309|  CREATED|               null|       null|
|  100274|       200242|   20230222|     181241| RETURNED|2023-02-22 18:12:41|    CREATED|
|  100274|       200242|   20230222|     181513|CANCELLED|2023-02-22 18:15:13|   RETURNED|
|  100446|       200239|   20230222|      95004|  CREATED|               null|       null|
|  100446|       200239|   20230222|     142807|COMPLETED|2023-02-22 14:28:07|    CREATED|

In [34]:
# Create columns for the start date and end date
df = df.withColumn("START_DATE", when((df.STATUS == "CREATED") | (df.STATUS == "POOL"), df.STATUS_DATETIME))
df = df.withColumn("END_DATE", when((df.STATUS == "COMPLETED") | (df.STATUS == "CANCELLED"), df.STATUS_DATETIME))

In [35]:
df.show()

+--------+-------------+-----------+-----------+---------+-------------------+-----------+-------------------+-------------------+
|ORDER_ID|SUBSCRIBER_ID|STATUS_DATE|STATUS_TIME|   STATUS|    STATUS_DATETIME|PREV_STATUS|         START_DATE|           END_DATE|
+--------+-------------+-----------+-----------+---------+-------------------+-----------+-------------------+-------------------+
|  100170|       200497|   20230222|      94140|  CREATED|               null|       null|               null|               null|
|  100170|       200497|   20230222|     162938| RETURNED|2023-02-22 16:29:38|    CREATED|               null|               null|
|  100170|       200497|   20230222|     164206|COMPLETED|2023-02-22 16:42:06|   RETURNED|               null|2023-02-22 16:42:06|
|  100274|       200242|   20230222|      95309|  CREATED|               null|       null|               null|               null|
|  100274|       200242|   20230222|     181241| RETURNED|2023-02-22 18:12:41|    C

In [36]:
# Create a column for the current status
df = df.withColumn("CURRENT_STATUS", when(df.END_DATE.isNotNull(), df.STATUS).otherwise(df.PREV_STATUS))

In [37]:
df.show()

+--------+-------------+-----------+-----------+---------+-------------------+-----------+-------------------+-------------------+--------------+
|ORDER_ID|SUBSCRIBER_ID|STATUS_DATE|STATUS_TIME|   STATUS|    STATUS_DATETIME|PREV_STATUS|         START_DATE|           END_DATE|CURRENT_STATUS|
+--------+-------------+-----------+-----------+---------+-------------------+-----------+-------------------+-------------------+--------------+
|  100170|       200497|   20230222|      94140|  CREATED|               null|       null|               null|               null|          null|
|  100170|       200497|   20230222|     162938| RETURNED|2023-02-22 16:29:38|    CREATED|               null|               null|       CREATED|
|  100170|       200497|   20230222|     164206|COMPLETED|2023-02-22 16:42:06|   RETURNED|               null|2023-02-22 16:42:06|     COMPLETED|
|  100274|       200242|   20230222|      95309|  CREATED|               null|       null|               null|              

In [38]:
# Create a column for the duration
df = df.withColumn("DURATION", when(df.END_DATE.isNotNull(), unix_timestamp(df.END_DATE) - unix_timestamp(df.START_DATE)).otherwise(0))

In [39]:
df.show()

+--------+-------------+-----------+-----------+---------+-------------------+-----------+-------------------+-------------------+--------------+--------+
|ORDER_ID|SUBSCRIBER_ID|STATUS_DATE|STATUS_TIME|   STATUS|    STATUS_DATETIME|PREV_STATUS|         START_DATE|           END_DATE|CURRENT_STATUS|DURATION|
+--------+-------------+-----------+-----------+---------+-------------------+-----------+-------------------+-------------------+--------------+--------+
|  100170|       200497|   20230222|      94140|  CREATED|               null|       null|               null|               null|          null|       0|
|  100170|       200497|   20230222|     162938| RETURNED|2023-02-22 16:29:38|    CREATED|               null|               null|       CREATED|       0|
|  100170|       200497|   20230222|     164206|COMPLETED|2023-02-22 16:42:06|   RETURNED|               null|2023-02-22 16:42:06|     COMPLETED|    null|
|  100274|       200242|   20230222|      95309|  CREATED|            

In [42]:
df.printSchema()

root
 |-- ORDER_ID: integer (nullable = true)
 |-- SUBSCRIBER_ID: integer (nullable = true)
 |-- STATUS_DATE: integer (nullable = true)
 |-- STATUS_TIME: integer (nullable = true)
 |-- STATUS: string (nullable = true)
 |-- STATUS_DATETIME: timestamp (nullable = true)
 |-- PREV_STATUS: string (nullable = true)
 |-- START_DATE: timestamp (nullable = true)
 |-- END_DATE: timestamp (nullable = true)
 |-- CURRENT_STATUS: string (nullable = true)
 |-- DURATION: long (nullable = true)



In [43]:
df2 = df.select("ORDER_ID", "SUBSCRIBER_ID", "STATUS", "START_DATE", "END_DATE", "DURATION")

In [44]:
df2.show()

+--------+-------------+---------+-------------------+-------------------+--------+
|ORDER_ID|SUBSCRIBER_ID|   STATUS|         START_DATE|           END_DATE|DURATION|
+--------+-------------+---------+-------------------+-------------------+--------+
|  100159|       200427| ASSIGNED|               null|               null|       0|
|  100159|       200427| RETURNED|               null|               null|       0|
|  100159|       200427|  CREATED|               null|               null|       0|
|  100410|       200366| ASSIGNED|               null|               null|       0|
|  100410|       200366| RETURNED|               null|               null|       0|
|  100410|       200366|  CREATED|               null|               null|       0|
|  100497|       200024|     POOL|2023-02-22 10:54:18|               null|       0|
|  100497|       200024|  CREATED|2023-02-22 10:54:18|               null|       0|
|  100539|       200012|COMPLETED|               null|2023-02-22 11:28:55|  

In [53]:
df2.show()

+--------+-------------+-------+-------------------+--------+--------+
|ORDER_ID|SUBSCRIBER_ID| STATUS|         START_DATE|END_DATE|DURATION|
+--------+-------------+-------+-------------------+--------+--------+
|  100497|       200024|   POOL|2023-02-22 10:54:18|    null|       0|
|  100497|       200024|CREATED|2023-02-22 10:54:18|    null|       0|
|  100575|       200573|CREATED|2023-02-22 10:54:41|    null|       0|
|  100230|       200049|CREATED|2023-02-22 10:57:08|    null|       0|
|  100511|       200495|   POOL|2023-02-22 10:55:14|    null|       0|
|  100511|       200495|CREATED|2023-02-22 10:55:14|    null|       0|
|  100399|       200373|CREATED|2023-02-21 16:38:00|    null|       0|
|  100188|       200156|CREATED|2023-02-21 16:38:07|    null|       0|
|  100483|       200338|CREATED|2023-02-22 10:53:57|    null|       0|
|  100288|       200330|CREATED|2023-02-21 16:12:05|    null|       0|
|  100165|       200031|CREATED|2023-02-22 10:57:40|    null|       0|
|  100

In [52]:
df2 = df2.filter(df2.START_DATE.isNotNull())

In [54]:
# Select the desired columns and write the output to a file
# output_df = df.select("ORDER_ID", "CURRENT_STATUS", "START_DATE", "END_DATE", "DURATION")
df2.write.mode("overwrite").option("header", "true").csv("file:///home/train/dataops7/spark/hw4_03_order_status/report.csv")