In [2]:
from pyspark.sql.functions import *
from pyspark.sql import Window

In [3]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import StructType, StructField, DoubleType, StringType, IntegerType, DateType, ArrayType, FloatType

In [4]:
spark = SparkSession.builder \
.master("local[2]") \
.appName("OrderStatusCheck") \
.getOrCreate()

2023-03-01 21:13:14,173 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [5]:
# Read the input file
df = spark.read \
.format("csv") \
.option("header", True) \
.option("sep", ",") \
.option("inferSchema", True) \
.load("file:///home/train/dataops7/spark/hw4_OrderStatus/OrderStatusData.csv")

In [6]:
df.show()

+--------+-------------+-----------+-----------+---------+
|ORDER_ID|SUBSCRIBER_ID|STATUS_DATE|STATUS_TIME|   STATUS|
+--------+-------------+-----------+-----------+---------+
|  100159|       200427|   20230223|      83209| ASSIGNED|
|  100159|       200427|   20230223|      74232| RETURNED|
|  100159|       200427|   20230222|      95056|  CREATED|
|  100410|       200366|   20230223|      91017| ASSIGNED|
|  100410|       200366|   20230223|      30301| RETURNED|
|  100410|       200366|   20230222|      93638|  CREATED|
|  100497|       200024|   20230222|     105418|     POOL|
|  100497|       200024|   20230222|     105418|  CREATED|
|  100539|       200012|   20230222|     112855|COMPLETED|
|  100539|       200012|   20230222|      95408|  CREATED|
|  100575|       200573|   20230223|      85951| ASSIGNED|
|  100575|       200573|   20230223|      41932| RETURNED|
|  100575|       200573|   20230222|     105441|  CREATED|
|  100259|       200192|   20230223|      83115| ASSIGNE

In [7]:
# Convert the date and time columns to timestamps
df = df.withColumn("STATUS_DATETIME", to_timestamp(concat_ws(" ", df.STATUS_DATE, df.STATUS_TIME), "yyyyMMdd Hmmss"))

In [8]:
df.show()

+--------+-------------+-----------+-----------+---------+-------------------+
|ORDER_ID|SUBSCRIBER_ID|STATUS_DATE|STATUS_TIME|   STATUS|    STATUS_DATETIME|
+--------+-------------+-----------+-----------+---------+-------------------+
|  100159|       200427|   20230223|      83209| ASSIGNED|2023-02-23 08:32:09|
|  100159|       200427|   20230223|      74232| RETURNED|2023-02-23 07:42:32|
|  100159|       200427|   20230222|      95056|  CREATED|2023-02-22 09:50:56|
|  100410|       200366|   20230223|      91017| ASSIGNED|2023-02-23 09:10:17|
|  100410|       200366|   20230223|      30301| RETURNED|2023-02-23 03:03:01|
|  100410|       200366|   20230222|      93638|  CREATED|2023-02-22 09:36:38|
|  100497|       200024|   20230222|     105418|     POOL|2023-02-22 10:54:18|
|  100497|       200024|   20230222|     105418|  CREATED|2023-02-22 10:54:18|
|  100539|       200012|   20230222|     112855|COMPLETED|2023-02-22 11:28:55|
|  100539|       200012|   20230222|      95408|  CR

In [9]:
# Create a window partitioned by ORDER_ID and sorted by STATUS_DATETIME
window_spec = Window.partitionBy("ORDER_ID").orderBy(F.col("STATUS_DATETIME").desc())

In [10]:
# Create a column for the previous status
df = df.withColumn("PREV_STATUS", lag("STATUS", 1).over(window_spec))

In [11]:
df = df.withColumn('LATEST_STATUS', F.first('STATUS').over(Window.partitionBy('ORDER_ID').orderBy(F.col("STATUS_DATETIME").desc())))

In [12]:
spark.sql("set spark.sql.legacy.timeParserPolicy=CORRECTED")

DataFrame[key: string, value: string]

In [13]:
df.show()

+--------+-------------+-----------+-----------+---------+-------------------+-----------+-------------+
|ORDER_ID|SUBSCRIBER_ID|STATUS_DATE|STATUS_TIME|   STATUS|    STATUS_DATETIME|PREV_STATUS|LATEST_STATUS|
+--------+-------------+-----------+-----------+---------+-------------------+-----------+-------------+
|  100170|       200497|   20230222|     164206|COMPLETED|2023-02-22 16:42:06|       null|    COMPLETED|
|  100170|       200497|   20230222|     162938| RETURNED|2023-02-22 16:29:38|  COMPLETED|    COMPLETED|
|  100170|       200497|   20230222|      94140|  CREATED|2023-02-22 09:41:40|   RETURNED|    COMPLETED|
|  100274|       200242|   20230222|     181513|CANCELLED|2023-02-22 18:15:13|       null|    CANCELLED|
|  100274|       200242|   20230222|     181241| RETURNED|2023-02-22 18:12:41|  CANCELLED|    CANCELLED|
|  100274|       200242|   20230222|      95309|  CREATED|2023-02-22 09:53:09|   RETURNED|    CANCELLED|
|  100446|       200239|   20230222|     142807|COMPLET

In [21]:
# Create columns for the start date and end date
df = df.withColumn("START_DATE", when((df.STATUS == "CREATED") | (df.STATUS == "POOL"), df.STATUS_DATETIME))
df = df.withColumn("END_DATE", when((df.STATUS == "COMPLETED") | (df.STATUS == "CANCELLED"), df.STATUS_DATETIME))

In [22]:
df.limit(5).toPandas()

Unnamed: 0,ORDER_ID,SUBSCRIBER_ID,STATUS_DATE,STATUS_TIME,STATUS,STATUS_DATETIME,PREV_STATUS,LATEST_STATUS,START_DATE,END_DATE
0,100170,200497,20230222,164206,COMPLETED,2023-02-22 16:42:06,,COMPLETED,NaT,2023-02-22 16:42:06
1,100170,200497,20230222,162938,RETURNED,2023-02-22 16:29:38,COMPLETED,COMPLETED,NaT,NaT
2,100170,200497,20230222,94140,CREATED,2023-02-22 09:41:40,RETURNED,COMPLETED,2023-02-22 09:41:40,NaT
3,100274,200242,20230222,181513,CANCELLED,2023-02-22 18:15:13,,CANCELLED,NaT,2023-02-22 18:15:13
4,100274,200242,20230222,181241,RETURNED,2023-02-22 18:12:41,CANCELLED,CANCELLED,NaT,NaT


In [36]:
# Create a column for the current status
#df = df.withColumn("CURRENT_STATUS", when(df.END_DATE.isNotNull(), df.STATUS).otherwise(df.PREV_STATUS))

In [21]:
#df.limit(9).toPandas()

                                                                                

Unnamed: 0,ORDER_ID,SUBSCRIBER_ID,STATUS_DATE,STATUS_TIME,STATUS,STATUS_DATETIME,PREV_STATUS,START_DATE,END_DATE,CURRENT_STATUS
0,100170,200497,20230222,94140,CREATED,2023-02-22 09:41:40,,2023-02-22 09:41:40,NaT,
1,100170,200497,20230222,162938,RETURNED,2023-02-22 16:29:38,CREATED,NaT,NaT,CREATED
2,100170,200497,20230222,164206,COMPLETED,2023-02-22 16:42:06,RETURNED,NaT,2023-02-22 16:42:06,COMPLETED
3,100274,200242,20230222,95309,CREATED,2023-02-22 09:53:09,,2023-02-22 09:53:09,NaT,
4,100274,200242,20230222,181241,RETURNED,2023-02-22 18:12:41,CREATED,NaT,NaT,CREATED
5,100274,200242,20230222,181513,CANCELLED,2023-02-22 18:15:13,RETURNED,NaT,2023-02-22 18:15:13,CANCELLED
6,100446,200239,20230222,95004,CREATED,2023-02-22 09:50:04,,2023-02-22 09:50:04,NaT,
7,100446,200239,20230222,142807,COMPLETED,2023-02-22 14:28:07,CREATED,NaT,2023-02-22 14:28:07,COMPLETED
8,100068,200431,20230222,93835,CREATED,2023-02-22 09:38:35,,2023-02-22 09:38:35,NaT,


In [29]:
df1 = df.select('ORDER_ID','SUBSCRIBER_ID', 'LATEST_STATUS', 'START_DATE', 'END_DATE') \
.groupBy('ORDER_ID','SUBSCRIBER_ID','LATEST_STATUS' ) \
.agg(F.min('START_DATE').alias('START_DATE'), F.min('END_DATE').alias('END_DATE'))

In [51]:
df2= df1.withColumn("DURATION", when(df1.END_DATE.isNotNull(), (unix_timestamp(df1.END_DATE) - unix_timestamp(df1.START_DATE))/3600).otherwise(0))

In [53]:
df2.orderBy("ORDER_ID").show()



+--------+-------------+-------------+-------------------+-------------------+------------------+
|ORDER_ID|SUBSCRIBER_ID|LATEST_STATUS|         START_DATE|           END_DATE|          DURATION|
+--------+-------------+-------------+-------------------+-------------------+------------------+
|  100001|       200574|     ASSIGNED|2023-02-22 09:53:06|               null|               0.0|
|  100002|       200121|         POOL|2023-02-22 09:37:12|               null|               0.0|
|  100003|       200432|     ASSIGNED|2023-02-22 09:40:01|               null|               0.0|
|  100004|       200234|    COMPLETED|2023-02-22 09:37:58|2023-02-22 10:44:26|1.1077777777777778|
|  100005|       200546|         POOL|2023-02-22 09:47:27|               null|               0.0|
|  100006|       200369|     ASSIGNED|2023-02-22 10:54:13|               null|               0.0|
|  100007|       200486|     ASSIGNED|2023-02-22 10:54:48|               null|               0.0|
|  100008|       200

                                                                                

In [54]:
df3 = df2.repartition(1)

In [55]:
# Select the desired columns and write the output to a file
# output_df = df.select("ORDER_ID", "CURRENT_STATUS", "START_DATE", "END_DATE", "DURATION")
df3.write.mode("overwrite").option("header", "true").csv("file:///home/train/dataops7/spark/hw4_OrderStatus/report")

                                                                                

In [None]:
spark.stop()