In [1]:
from pyspark.sql.functions import *
from pyspark.sql import Window

In [3]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import StructType, StructField, DoubleType, StringType, IntegerType, DateType, ArrayType, FloatType

In [4]:
# Build (or reuse) a Spark session that runs locally with two worker threads.
spark = (
    SparkSession.builder
    .appName("OrderStatusCheck")
    .master("local[2]")
    .getOrCreate()
)

2023-03-02 10:21:51,572 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2023-03-02 10:21:52,723 WARN util.Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [5]:
# Load the order-status CSV: the first row is the header, fields are
# comma-separated, and column types are inferred from the data.
df = spark.read.options(header=True, sep=",", inferSchema=True).csv(
    "file:///home/train/dataops7/spark/hw4_03_order_status/orderStatusData.csv"
)

In [6]:
# Preview the raw rows; n=20 and truncate=True are show()'s defaults, spelled out.
df.show(n=20, truncate=True)

+--------+-------------+-----------+-----------+---------+
|ORDER_ID|SUBSCRIBER_ID|STATUS_DATE|STATUS_TIME|   STATUS|
+--------+-------------+-----------+-----------+---------+
|  100159|       200427|   20230223|      83209| ASSIGNED|
|  100159|       200427|   20230223|      74232| RETURNED|
|  100159|       200427|   20230222|      95056|  CREATED|
|  100410|       200366|   20230223|      91017| ASSIGNED|
|  100410|       200366|   20230223|      30301| RETURNED|
|  100410|       200366|   20230222|      93638|  CREATED|
|  100497|       200024|   20230222|     105418|     POOL|
|  100497|       200024|   20230222|     105418|  CREATED|
|  100539|       200012|   20230222|     112855|COMPLETED|
|  100539|       200012|   20230222|      95408|  CREATED|
|  100575|       200573|   20230223|      85951| ASSIGNED|
|  100575|       200573|   20230223|      41932| RETURNED|
|  100575|       200573|   20230222|     105441|  CREATED|
|  100259|       200192|   20230223|      83115| ASSIGNE

In [7]:
# Combine STATUS_DATE (yyyyMMdd) and STATUS_TIME (HHmmss) into one timestamp column.
# STATUS_TIME is read as a number, so it carries no leading zeros (e.g. 83209 for
# 08:32:09). Left-pad it to six digits before parsing: without the padding, any
# time earlier than 01:00:00 (e.g. 3015 for 00:30:15) has too few digits for the
# pattern and cannot be parsed.
df = df.withColumn(
    "STATUS_DATETIME",
    F.to_timestamp(
        F.concat_ws(
            " ",
            F.col("STATUS_DATE").cast("string"),
            F.lpad(F.col("STATUS_TIME").cast("string"), 6, "0"),
        ),
        "yyyyMMdd HHmmss",
    ),
)

In [8]:
# Check the parsed STATUS_DATETIME column on the first 20 rows (show()'s defaults).
df.show(n=20, truncate=True)

+--------+-------------+-----------+-----------+---------+-------------------+
|ORDER_ID|SUBSCRIBER_ID|STATUS_DATE|STATUS_TIME|   STATUS|    STATUS_DATETIME|
+--------+-------------+-----------+-----------+---------+-------------------+
|  100159|       200427|   20230223|      83209| ASSIGNED|2023-02-23 08:32:09|
|  100159|       200427|   20230223|      74232| RETURNED|2023-02-23 07:42:32|
|  100159|       200427|   20230222|      95056|  CREATED|2023-02-22 09:50:56|
|  100410|       200366|   20230223|      91017| ASSIGNED|2023-02-23 09:10:17|
|  100410|       200366|   20230223|      30301| RETURNED|2023-02-23 03:03:01|
|  100410|       200366|   20230222|      93638|  CREATED|2023-02-22 09:36:38|
|  100497|       200024|   20230222|     105418|     POOL|2023-02-22 10:54:18|
|  100497|       200024|   20230222|     105418|  CREATED|2023-02-22 10:54:18|
|  100539|       200012|   20230222|     112855|COMPLETED|2023-02-22 11:28:55|
|  100539|       200012|   20230222|      95408|  CR

In [10]:
# Within each order, sort the status rows newest-first and copy the newest
# STATUS onto every row of that order.
# NOTE(review): rows of one order that share the same STATUS_DATETIME have no
# tie-breaker in this ordering, so which of them counts as "latest" is not
# pinned down -- confirm whether a secondary sort key is needed.
newest_first = Window.partitionBy("ORDER_ID").orderBy(F.desc("STATUS_DATETIME"))
df = df.withColumn("LATEST_STATUS", F.first("STATUS").over(newest_first))

In [11]:
# Inspect LATEST_STATUS next to the per-row STATUS (first 20 rows, show()'s defaults).
df.show(n=20, truncate=True)

+--------+-------------+-----------+-----------+---------+-------------------+-------------+
|ORDER_ID|SUBSCRIBER_ID|STATUS_DATE|STATUS_TIME|   STATUS|    STATUS_DATETIME|LATEST_STATUS|
+--------+-------------+-----------+-----------+---------+-------------------+-------------+
|  100170|       200497|   20230222|     164206|COMPLETED|2023-02-22 16:42:06|    COMPLETED|
|  100170|       200497|   20230222|     162938| RETURNED|2023-02-22 16:29:38|    COMPLETED|
|  100170|       200497|   20230222|      94140|  CREATED|2023-02-22 09:41:40|    COMPLETED|
|  100274|       200242|   20230222|     181513|CANCELLED|2023-02-22 18:15:13|    CANCELLED|
|  100274|       200242|   20230222|     181241| RETURNED|2023-02-22 18:12:41|    CANCELLED|
|  100274|       200242|   20230222|      95309|  CREATED|2023-02-22 09:53:09|    CANCELLED|
|  100446|       200239|   20230222|     142807|COMPLETED|2023-02-22 14:28:07|    COMPLETED|
|  100446|       200239|   20230222|      95004|  CREATED|2023-02-22 0

In [12]:
#spark.sql("set spark.sql.legacy.timeParserPolicy=CORRECTED")

DataFrame[key: string, value: string]

In [12]:
# Copy each row's timestamp into START_DATE when its status is CREATED or POOL,
# and into END_DATE when its status is COMPLETED or CANCELLED. A row whose
# status does not match is left null in that column.
df = (
    df
    .withColumn(
        "START_DATE",
        F.when(F.col("STATUS").isin("CREATED", "POOL"), F.col("STATUS_DATETIME")),
    )
    .withColumn(
        "END_DATE",
        F.when(F.col("STATUS").isin("COMPLETED", "CANCELLED"), F.col("STATUS_DATETIME")),
    )
)

In [13]:
# Bring only the first five rows to the driver as a pandas DataFrame so the
# notebook renders them as a table.
df.limit(5).toPandas()

Unnamed: 0,ORDER_ID,SUBSCRIBER_ID,STATUS_DATE,STATUS_TIME,STATUS,STATUS_DATETIME,LATEST_STATUS,START_DATE,END_DATE
0,100170,200497,20230222,164206,COMPLETED,2023-02-22 16:42:06,COMPLETED,NaT,2023-02-22 16:42:06
1,100170,200497,20230222,162938,RETURNED,2023-02-22 16:29:38,COMPLETED,NaT,NaT
2,100170,200497,20230222,94140,CREATED,2023-02-22 09:41:40,COMPLETED,2023-02-22 09:41:40,NaT
3,100274,200242,20230222,181513,CANCELLED,2023-02-22 18:15:13,CANCELLED,NaT,2023-02-22 18:15:13
4,100274,200242,20230222,181241,RETURNED,2023-02-22 18:12:41,CANCELLED,NaT,NaT


In [14]:
# Collapse the status rows to one row per (ORDER_ID, SUBSCRIBER_ID, LATEST_STATUS),
# keeping the earliest START_DATE and the earliest END_DATE of each group.
group_keys = ["ORDER_ID", "SUBSCRIBER_ID", "LATEST_STATUS"]
df1 = (
    df.select(*group_keys, "START_DATE", "END_DATE")
    .groupBy(*group_keys)
    .agg(
        F.min("START_DATE").alias("START_DATE"),
        F.min("END_DATE").alias("END_DATE"),
    )
)

In [15]:
# Preview the per-order summary (first 20 rows, show()'s defaults).
df1.show(n=20, truncate=True)

+--------+-------------+-------------+-------------------+-------------------+
|ORDER_ID|SUBSCRIBER_ID|LATEST_STATUS|         START_DATE|           END_DATE|
+--------+-------------+-------------+-------------------+-------------------+
|  100170|       200497|    COMPLETED|2023-02-22 09:41:40|2023-02-22 16:42:06|
|  100274|       200242|    CANCELLED|2023-02-22 09:53:09|2023-02-22 18:15:13|
|  100446|       200239|    COMPLETED|2023-02-22 09:50:04|2023-02-22 14:28:07|
|  100068|       200431|    COMPLETED|2023-02-22 09:38:35|2023-02-22 15:45:43|
|  100088|       200449|    COMPLETED|2023-02-22 09:54:57|2023-02-23 02:11:27|
|  100220|       200100|     ASSIGNED|2023-02-22 10:54:16|               null|
|  100453|       200419|    COMPLETED|2023-02-22 09:53:12|2023-02-22 18:20:56|
|  100570|       200080|    COMPLETED|2023-02-22 09:35:53|2023-02-22 11:04:21|
|  100102|       200041|         POOL|2023-02-22 09:36:52|               null|
|  100110|       200424|     ASSIGNED|2023-02-22 09:

In [16]:
# DURATION is the time from START_DATE to END_DATE in hours (seconds / 3600).
# Orders that have no END_DATE get 0.
elapsed_hours = (
    F.unix_timestamp(F.col("END_DATE")) - F.unix_timestamp(F.col("START_DATE"))
) / 3600
df2 = df1.withColumn(
    "DURATION",
    F.when(F.col("END_DATE").isNotNull(), elapsed_hours).otherwise(0),
)

In [17]:
# Show the first 20 summary rows in ascending ORDER_ID order.
df2.sort("ORDER_ID").show(n=20, truncate=True)



+--------+-------------+-------------+-------------------+-------------------+------------------+
|ORDER_ID|SUBSCRIBER_ID|LATEST_STATUS|         START_DATE|           END_DATE|          DURATION|
+--------+-------------+-------------+-------------------+-------------------+------------------+
|  100001|       200574|     ASSIGNED|2023-02-22 09:53:06|               null|               0.0|
|  100002|       200121|         POOL|2023-02-22 09:37:12|               null|               0.0|
|  100003|       200432|     ASSIGNED|2023-02-22 09:40:01|               null|               0.0|
|  100004|       200234|    COMPLETED|2023-02-22 09:37:58|2023-02-22 10:44:26|1.1077777777777778|
|  100005|       200546|         POOL|2023-02-22 09:47:27|               null|               0.0|
|  100006|       200369|     ASSIGNED|2023-02-22 10:54:13|               null|               0.0|
|  100007|       200486|     ASSIGNED|2023-02-22 10:54:48|               null|               0.0|
|  100008|       200

                                                                                

In [18]:
# Shuffle the summary into a single partition (presumably so the CSV export
# below produces one part file -- confirm that is the intent).
df3 = df2.repartition(numPartitions=1)

In [20]:
# Select the desired columns and write the output to a file
# output_df = df.select("ORDER_ID", "CURRENT_STATUS", "START_DATE", "END_DATE", "DURATION")
# Export the summary as CSV with a header row, replacing any previous output.
# NOTE(review): this write is the action that evaluates the lazy pipeline,
# including reading the source CSV; the recorded run failed here because that
# input file no longer existed on disk.
df3.write.csv(
    "file:///home/train/dataops7/spark/hw4_03_order_status/rep",
    mode="overwrite",
    header="true",
)

2023-03-02 10:29:03,749 ERROR executor.Executor: Exception in task 0.0 in stage 21.0 (TID 225)
java.io.FileNotFoundException: File file:/home/train/dataops7/spark/hw4_03_order_status/orderStatusData.csv does not exist
It is possible the underlying files have been updated. You can explicitly invalidate the cache in Spark by running 'REFRESH TABLE tableName' command in SQL or by recreating the Dataset/DataFrame involved.
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:124)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:169)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:93)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spar

Py4JJavaError: An error occurred while calling o138.csv.
: org.apache.spark.SparkException: Job aborted.
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:231)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:188)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:108)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:106)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.doExecute(commands.scala:131)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:180)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:218)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:215)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:176)
	at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:132)
	at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:131)
	at org.apache.spark.sql.DataFrameWriter.$anonfun$runCommand$1(DataFrameWriter.scala:989)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:772)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:989)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:438)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:415)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:293)
	at org.apache.spark.sql.DataFrameWriter.csv(DataFrameWriter.scala:979)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 21.0 failed 1 times, most recent failure: Lost task 0.0 in stage 21.0 (TID 225) (trainvm.vbo.local executor driver): java.io.FileNotFoundException: File file:/home/train/dataops7/spark/hw4_03_order_status/orderStatusData.csv does not exist
It is possible the underlying files have been updated. You can explicitly invalidate the cache in Spark by running 'REFRESH TABLE tableName' command in SQL or by recreating the Dataset/DataFrame involved.
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:124)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:169)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:93)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:755)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:132)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2253)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2202)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2201)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2201)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1078)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1078)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1078)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2440)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2382)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2371)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:868)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2202)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:200)
	... 33 more
Caused by: java.io.FileNotFoundException: File file:/home/train/dataops7/spark/hw4_03_order_status/orderStatusData.csv does not exist
It is possible the underlying files have been updated. You can explicitly invalidate the cache in Spark by running 'REFRESH TABLE tableName' command in SQL or by recreating the Dataset/DataFrame involved.
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:124)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:169)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:93)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:755)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:132)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	... 1 more


In [None]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()