In [1]:
from pyspark.sql.functions import *
from pyspark.sql import Window

In [2]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import StructType, StructField, DoubleType, StringType, IntegerType, DateType, ArrayType, FloatType

In [3]:
spark = SparkSession.builder \
.master("local[1]") \
.appName("OrderStatusCheck") \
.getOrCreate()

2023-03-01 10:35:03,902 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [24]:
# Read the input file
df = spark.read \
.format("csv") \
.option("header", True) \
.option("sep", ",") \
.option("inferSchema", True) \
.load("file:///home/train/dataops7/spark/hw4_03_order_status/orderStatusData.csv")

In [25]:
df.show()

+--------+-------------+-----------+-----------+---------+
|ORDER_ID|SUBSCRIBER_ID|STATUS_DATE|STATUS_TIME|   STATUS|
+--------+-------------+-----------+-----------+---------+
|  100159|       200427|   20230223|      83209| ASSIGNED|
|  100159|       200427|   20230223|      74232| RETURNED|
|  100159|       200427|   20230222|      95056|  CREATED|
|  100410|       200366|   20230223|      91017| ASSIGNED|
|  100410|       200366|   20230223|      30301| RETURNED|
|  100410|       200366|   20230222|      93638|  CREATED|
|  100497|       200024|   20230222|     105418|     POOL|
|  100497|       200024|   20230222|     105418|  CREATED|
|  100539|       200012|   20230222|     112855|COMPLETED|
|  100539|       200012|   20230222|      95408|  CREATED|
|  100575|       200573|   20230223|      85951| ASSIGNED|
|  100575|       200573|   20230223|      41932| RETURNED|
|  100575|       200573|   20230222|     105441|  CREATED|
|  100259|       200192|   20230223|      83115| ASSIGNE

In [26]:
# Convert the date and time columns to timestamps
df = df.withColumn("STATUS_DATETIME", to_timestamp(concat_ws(" ", df.STATUS_DATE, df.STATUS_TIME), "yyyyMMdd Hmmss"))

In [27]:
df.show()

+--------+-------------+-----------+-----------+---------+-------------------+
|ORDER_ID|SUBSCRIBER_ID|STATUS_DATE|STATUS_TIME|   STATUS|    STATUS_DATETIME|
+--------+-------------+-----------+-----------+---------+-------------------+
|  100159|       200427|   20230223|      83209| ASSIGNED|2023-02-23 08:32:09|
|  100159|       200427|   20230223|      74232| RETURNED|2023-02-23 07:42:32|
|  100159|       200427|   20230222|      95056|  CREATED|2023-02-22 09:50:56|
|  100410|       200366|   20230223|      91017| ASSIGNED|2023-02-23 09:10:17|
|  100410|       200366|   20230223|      30301| RETURNED|2023-02-23 03:03:01|
|  100410|       200366|   20230222|      93638|  CREATED|2023-02-22 09:36:38|
|  100497|       200024|   20230222|     105418|     POOL|2023-02-22 10:54:18|
|  100497|       200024|   20230222|     105418|  CREATED|2023-02-22 10:54:18|
|  100539|       200012|   20230222|     112855|COMPLETED|2023-02-22 11:28:55|
|  100539|       200012|   20230222|      95408|  CR

In [28]:
# Create a window partitioned by ORDER_ID and sorted by STATUS_DATETIME
window_spec = Window.partitionBy("ORDER_ID").orderBy("STATUS_DATETIME")

In [29]:
window_spec.show()

AttributeError: 'WindowSpec' object has no attribute 'show'

In [30]:
# Create a column for the previous status
df = df.withColumn("PREV_STATUS", lag("STATUS", 1).over(window_spec))

In [31]:
spark.sql("set spark.sql.legacy.timeParserPolicy=CORRECTED")

DataFrame[key: string, value: string]

In [32]:
df.show()

+--------+-------------+-----------+-----------+---------+-------------------+-----------+
|ORDER_ID|SUBSCRIBER_ID|STATUS_DATE|STATUS_TIME|   STATUS|    STATUS_DATETIME|PREV_STATUS|
+--------+-------------+-----------+-----------+---------+-------------------+-----------+
|  100170|       200497|   20230222|      94140|  CREATED|2023-02-22 09:41:40|       null|
|  100170|       200497|   20230222|     162938| RETURNED|2023-02-22 16:29:38|    CREATED|
|  100170|       200497|   20230222|     164206|COMPLETED|2023-02-22 16:42:06|   RETURNED|
|  100274|       200242|   20230222|      95309|  CREATED|2023-02-22 09:53:09|       null|
|  100274|       200242|   20230222|     181241| RETURNED|2023-02-22 18:12:41|    CREATED|
|  100274|       200242|   20230222|     181513|CANCELLED|2023-02-22 18:15:13|   RETURNED|
|  100446|       200239|   20230222|      95004|  CREATED|2023-02-22 09:50:04|       null|
|  100446|       200239|   20230222|     142807|COMPLETED|2023-02-22 14:28:07|    CREATED|

In [33]:
# Create columns for the start date and end date
df = df.withColumn("START_DATE", when((df.STATUS == "CREATED") | (df.STATUS == "POOL"), df.STATUS_DATETIME))
df = df.withColumn("END_DATE", when((df.STATUS == "COMPLETED") | (df.STATUS == "CANCELLED"), df.STATUS_DATETIME))

In [34]:
df.show()

+--------+-------------+-----------+-----------+---------+-------------------+-----------+-------------------+-------------------+
|ORDER_ID|SUBSCRIBER_ID|STATUS_DATE|STATUS_TIME|   STATUS|    STATUS_DATETIME|PREV_STATUS|         START_DATE|           END_DATE|
+--------+-------------+-----------+-----------+---------+-------------------+-----------+-------------------+-------------------+
|  100170|       200497|   20230222|      94140|  CREATED|2023-02-22 09:41:40|       null|2023-02-22 09:41:40|               null|
|  100170|       200497|   20230222|     162938| RETURNED|2023-02-22 16:29:38|    CREATED|               null|               null|
|  100170|       200497|   20230222|     164206|COMPLETED|2023-02-22 16:42:06|   RETURNED|               null|2023-02-22 16:42:06|
|  100274|       200242|   20230222|      95309|  CREATED|2023-02-22 09:53:09|       null|2023-02-22 09:53:09|               null|
|  100274|       200242|   20230222|     181241| RETURNED|2023-02-22 18:12:41|    C

In [35]:
# Create a column for the current status
df = df.withColumn("CURRENT_STATUS", when(df.END_DATE.isNotNull(), df.STATUS).otherwise(df.PREV_STATUS))

In [51]:
df1 = df.filter(df.CURRENT_STATUS.isNull())

In [52]:
df1.show()

+--------+-------------+-----------+-----------+-------+-------------------+-----------+-------------------+--------+--------------+--------+
|ORDER_ID|SUBSCRIBER_ID|STATUS_DATE|STATUS_TIME| STATUS|    STATUS_DATETIME|PREV_STATUS|         START_DATE|END_DATE|CURRENT_STATUS|DURATION|
+--------+-------------+-----------+-----------+-------+-------------------+-----------+-------------------+--------+--------------+--------+
|  100170|       200497|   20230222|      94140|CREATED|2023-02-22 09:41:40|       null|2023-02-22 09:41:40|    null|          null|       0|
|  100274|       200242|   20230222|      95309|CREATED|2023-02-22 09:53:09|       null|2023-02-22 09:53:09|    null|          null|       0|
|  100446|       200239|   20230222|      95004|CREATED|2023-02-22 09:50:04|       null|2023-02-22 09:50:04|    null|          null|       0|
|  100068|       200431|   20230222|      93835|CREATED|2023-02-22 09:38:35|       null|2023-02-22 09:38:35|    null|          null|       0|
|  100

In [53]:
df1.count()

                                                                                

584

In [66]:
df111 = df1.select('ORDER_ID', 'START_DATE')

In [72]:
df111 = df111.withColumnRenamed('ORDER_ID', 'df111_orderid') 

In [74]:
df111 = df111.withColumnRenamed('START_DATE', 'df111_startdate') 

In [75]:
df111.show()

+-------------+-------------------+
|df111_orderid|    df111_startdate|
+-------------+-------------------+
|       100170|2023-02-22 09:41:40|
|       100274|2023-02-22 09:53:09|
|       100446|2023-02-22 09:50:04|
|       100068|2023-02-22 09:38:35|
|       100088|2023-02-22 09:54:57|
|       100220|2023-02-22 10:54:16|
|       100453|2023-02-22 09:53:12|
|       100570|2023-02-22 09:35:53|
|       100102|2023-02-22 09:36:52|
|       100110|2023-02-22 09:39:35|
|       100349|2023-02-21 16:38:34|
|       100417|2023-02-22 09:45:13|
|       100447|2023-02-22 09:49:32|
|       100003|2023-02-22 09:40:01|
|       100074|2023-02-22 10:55:11|
|       100206|2023-02-22 09:49:21|
|       100240|2023-02-22 09:39:23|
|       100279|2023-02-22 09:42:01|
|       100007|2023-02-22 10:54:48|
|       100042|2023-02-22 10:55:57|
+-------------+-------------------+
only showing top 20 rows



In [76]:
df11 = df.join(df111, df.ORDER_ID == df111.df111_orderid)

In [77]:
df11.printSchema()

root
 |-- ORDER_ID: integer (nullable = true)
 |-- SUBSCRIBER_ID: integer (nullable = true)
 |-- STATUS_DATE: integer (nullable = true)
 |-- STATUS_TIME: integer (nullable = true)
 |-- STATUS: string (nullable = true)
 |-- STATUS_DATETIME: timestamp (nullable = true)
 |-- PREV_STATUS: string (nullable = true)
 |-- START_DATE: timestamp (nullable = true)
 |-- END_DATE: timestamp (nullable = true)
 |-- CURRENT_STATUS: string (nullable = true)
 |-- DURATION: long (nullable = true)
 |-- df111_orderid: integer (nullable = true)
 |-- df111_startdate: timestamp (nullable = true)



In [82]:
# Create a column for the duration
df11 = df11.withColumn("DURATION", when(df11.END_DATE.isNotNull(), unix_timestamp(df11.END_DATE) - unix_timestamp(df11.df111_startdate)).otherwise(0))

In [85]:
df11.select("ORDER_ID", "SUBSCRIBER_ID", "STATUS", "df111_startdate", "END_DATE", "DURATION").show()



+--------+-------------+---------+-------------------+-------------------+--------+
|ORDER_ID|SUBSCRIBER_ID|   STATUS|    df111_startdate|           END_DATE|DURATION|
+--------+-------------+---------+-------------------+-------------------+--------+
|  100159|       200427| ASSIGNED|2023-02-22 09:50:56|               null|       0|
|  100159|       200427| RETURNED|2023-02-22 09:50:56|               null|       0|
|  100159|       200427|  CREATED|2023-02-22 09:50:56|               null|       0|
|  100410|       200366| ASSIGNED|2023-02-22 09:36:38|               null|       0|
|  100410|       200366| RETURNED|2023-02-22 09:36:38|               null|       0|
|  100410|       200366|  CREATED|2023-02-22 09:36:38|               null|       0|
|  100497|       200024|     POOL|2023-02-22 10:54:18|               null|       0|
|  100497|       200024|  CREATED|2023-02-22 10:54:18|               null|       0|
|  100539|       200012|COMPLETED|2023-02-22 09:54:08|2023-02-22 11:28:55|  

                                                                                

In [36]:
df.show()

+--------+-------------+-----------+-----------+---------+-------------------+-----------+-------------------+-------------------+--------------+
|ORDER_ID|SUBSCRIBER_ID|STATUS_DATE|STATUS_TIME|   STATUS|    STATUS_DATETIME|PREV_STATUS|         START_DATE|           END_DATE|CURRENT_STATUS|
+--------+-------------+-----------+-----------+---------+-------------------+-----------+-------------------+-------------------+--------------+
|  100170|       200497|   20230222|      94140|  CREATED|2023-02-22 09:41:40|       null|2023-02-22 09:41:40|               null|          null|
|  100170|       200497|   20230222|     162938| RETURNED|2023-02-22 16:29:38|    CREATED|               null|               null|       CREATED|
|  100170|       200497|   20230222|     164206|COMPLETED|2023-02-22 16:42:06|   RETURNED|               null|2023-02-22 16:42:06|     COMPLETED|
|  100274|       200242|   20230222|      95309|  CREATED|2023-02-22 09:53:09|       null|2023-02-22 09:53:09|              

In [37]:
# Create a column for the duration
df = df.withColumn("DURATION", when(df.END_DATE.isNotNull(), unix_timestamp(df.END_DATE) - unix_timestamp(df.START_DATE)).otherwise(0))

In [38]:
df.show()

+--------+-------------+-----------+-----------+---------+-------------------+-----------+-------------------+-------------------+--------------+--------+
|ORDER_ID|SUBSCRIBER_ID|STATUS_DATE|STATUS_TIME|   STATUS|    STATUS_DATETIME|PREV_STATUS|         START_DATE|           END_DATE|CURRENT_STATUS|DURATION|
+--------+-------------+-----------+-----------+---------+-------------------+-----------+-------------------+-------------------+--------------+--------+
|  100170|       200497|   20230222|      94140|  CREATED|2023-02-22 09:41:40|       null|2023-02-22 09:41:40|               null|          null|       0|
|  100170|       200497|   20230222|     162938| RETURNED|2023-02-22 16:29:38|    CREATED|               null|               null|       CREATED|       0|
|  100170|       200497|   20230222|     164206|COMPLETED|2023-02-22 16:42:06|   RETURNED|               null|2023-02-22 16:42:06|     COMPLETED|    null|
|  100274|       200242|   20230222|      95309|  CREATED|2023-02-22 0

In [18]:
df.printSchema()

root
 |-- ORDER_ID: integer (nullable = true)
 |-- SUBSCRIBER_ID: integer (nullable = true)
 |-- STATUS_DATE: integer (nullable = true)
 |-- STATUS_TIME: integer (nullable = true)
 |-- STATUS: string (nullable = true)
 |-- STATUS_DATETIME: timestamp (nullable = true)
 |-- PREV_STATUS: string (nullable = true)
 |-- START_DATE: timestamp (nullable = true)
 |-- END_DATE: timestamp (nullable = true)
 |-- CURRENT_STATUS: string (nullable = true)
 |-- DURATION: long (nullable = true)



In [19]:
df2 = df.select("ORDER_ID", "SUBSCRIBER_ID", "STATUS", "START_DATE", "END_DATE", "DURATION")

In [20]:
df2.show()

+--------+-------------+---------+-------------------+-------------------+--------+
|ORDER_ID|SUBSCRIBER_ID|   STATUS|         START_DATE|           END_DATE|DURATION|
+--------+-------------+---------+-------------------+-------------------+--------+
|  100159|       200427| ASSIGNED|               null|               null|       0|
|  100159|       200427| RETURNED|               null|               null|       0|
|  100159|       200427|  CREATED|               null|               null|       0|
|  100410|       200366| ASSIGNED|               null|               null|       0|
|  100410|       200366| RETURNED|               null|               null|       0|
|  100410|       200366|  CREATED|               null|               null|       0|
|  100497|       200024|     POOL|2023-02-22 10:54:18|               null|       0|
|  100497|       200024|  CREATED|2023-02-22 10:54:18|               null|       0|
|  100539|       200012|COMPLETED|               null|2023-02-22 11:28:55|  

In [21]:
df2.show()

+--------+-------------+---------+-------------------+-------------------+--------+
|ORDER_ID|SUBSCRIBER_ID|   STATUS|         START_DATE|           END_DATE|DURATION|
+--------+-------------+---------+-------------------+-------------------+--------+
|  100159|       200427| ASSIGNED|               null|               null|       0|
|  100159|       200427| RETURNED|               null|               null|       0|
|  100159|       200427|  CREATED|               null|               null|       0|
|  100410|       200366| ASSIGNED|               null|               null|       0|
|  100410|       200366| RETURNED|               null|               null|       0|
|  100410|       200366|  CREATED|               null|               null|       0|
|  100497|       200024|     POOL|2023-02-22 10:54:18|               null|       0|
|  100497|       200024|  CREATED|2023-02-22 10:54:18|               null|       0|
|  100539|       200012|COMPLETED|               null|2023-02-22 11:28:55|  

In [22]:
df2 = df2.filter(df2.START_DATE.isNotNull())

In [23]:
# Select the desired columns and write the output to a file
# output_df = df.select("ORDER_ID", "CURRENT_STATUS", "START_DATE", "END_DATE", "DURATION")
df2.write.mode("overwrite").option("header", "true").csv("file:///home/train/dataops7/spark/hw4_03_order_status/report.csv")