[BUG] Transient Parquet Error #34817
Labels
Client
This issue points to a problem in the data-plane of the library.
Cosmos
needs-team-triage
This issue needs the team to triage.
Describe the bug
This bug is created as part of a CRI reported by a client who is getting intermittent exceptions when trying to read parquet. The error message indicates that CosmosItemsDataSource could not be instantiated.
Exception or Stack Trace
Traceback (most recent call last):
File "python_script_runner.py", line 210, in main
retry_call(execute_main_logic, fargs=[parameters, metric_client],
File "/home/trusted-service-user/cluster-env/env/lib/python3.8/site-packages/retry/api.py", line 101, in retry_call
return __retry_internal(partial(f, *args, **kwargs), exceptions, tries, delay, max_delay, backoff, jitter, logger)
File "/home/trusted-service-user/cluster-env/env/lib/python3.8/site-packages/retry/api.py", line 33, in __retry_internal
return f()
File "/mnt/var/hadoop/tmp/nm-local-dir/usercache/trusted-service-user/appcache/application_1683115666988_0055/container_1683115666988_0055_01_000001/azurespenddataprocessing/jobs/job_submitter.py", line 70, in execute_main_logic
submit_job(spark, parameters)
File "/mnt/var/hadoop/tmp/nm-local-dir/usercache/trusted-service-user/appcache/application_1683115666988_0055/container_1683115666988_0055_01_000001/azurespenddataprocessing/jobs/job_submitter.py", line 57, in submit_job
job.process()
File "/mnt/var/hadoop/tmp/nm-local-dir/usercache/trusted-service-user/appcache/application_1683115666988_0055/container_1683115666988_0055_01_000001/azurespenddataprocessing/jobs/base_job.py", line 111, in process
self.load_inputs()
File "/mnt/var/hadoop/tmp/nm-local-dir/usercache/trusted-service-user/appcache/application_1683115666988_0055/container_1683115666988_0055_01_000001/azurespenddataprocessing/jobs/base_job.py", line 186, in load_inputs
self.load_range_input(key, value_dict)
File "/mnt/var/hadoop/tmp/nm-local-dir/usercache/trusted-service-user/appcache/application_1683115666988_0055/container_1683115666988_0055_01_000001/azurespenddataprocessing/jobs/base_job.py", line 250, in load_range_input
dataframe = data_storage.create_data_frame(self.spark)
File "/mnt/var/hadoop/tmp/nm-local-dir/usercache/trusted-service-user/appcache/application_1683115666988_0055/container_1683115666988_0055_01_000001/azurespenddataprocessing/jobs/data_storage.py", line 132, in create_data_frame
return frame_reader.parquet(self.data_path())
File "/opt/spark/python/lib/pyspark.zip/pyspark/sql/readwriter.py", line 301, in parquet
return self._df(self._jreader.parquet(_to_seq(self._spark._sc, paths)))
File "/home/trusted-service-user/cluster-env/env/lib/python3.8/site-packages/py4j/java_gateway.py", line 1321, in call
return_value = get_return_value(
File "/opt/spark/python/lib/pyspark.zip/pyspark/sql/utils.py", line 111, in deco
return f(*a, **kw)
File "/home/trusted-service-user/cluster-env/env/lib/python3.8/site-packages/py4j/protocol.py", line 326, in get_return_value
raise Py4JJavaError(
py4j.protocol.Py4JJavaError: An error occurred while calling o3768.parquet.
An error occurred while calling o3768.parquet.
: java.util.ServiceConfigurationError: org.apache.spark.sql.sources.DataSourceRegister: Provider com.azure.cosmos.spark.CosmosItemsDataSource could not be instantiated
at java.util.ServiceLoader.fail(ServiceLoader.java:232)
at java.util.ServiceLoader.access$100(ServiceLoader.java:185)
at java.util.ServiceLoader$LazyIterator.nextService(ServiceLoader.java:384)
at java.util.ServiceLoader$LazyIterator.next(ServiceLoader.java:404)
at java.util.ServiceLoader$1.next(ServiceLoader.java:480)
at scala.collection.convert.Wrappers$JIteratorWrapper.next(Wrappers.scala:46)
at scala.collection.Iterator.foreach(Iterator.scala:943)
at scala.collection.Iterator.foreach$(Iterator.scala:943)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
at scala.collection.IterableLike.foreach(IterableLike.scala:74)
at scala.collection.IterableLike.foreach$(IterableLike.scala:73)
at scala.collection.AbstractIterable.foreach(Iterable.scala:56)
at scala.collection.TraversableLike.filterImpl(TraversableLike.scala:303)
at scala.collection.TraversableLike.filterImpl$(TraversableLike.scala:297)
at scala.collection.AbstractTraversable.filterImpl(Traversable.scala:108)
at scala.collection.TraversableLike.filter(TraversableLike.scala:395)
at scala.collection.TraversableLike.filter$(TraversableLike.scala:395)
at scala.collection.AbstractTraversable.filter(Traversable.scala:108)
at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:652)
at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:720)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:218)
at org.apache.spark.sql.DataFrameReader.parquet(DataFrameReader.scala:608)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
at java.lang.Thread.run(Thread.java:750)
Caused by: java.lang.NullPointerException
at com.azure.cosmos.spark.CosmosPredicates$.isOnSparkDriver(CosmosPredicates.scala:63)
at com.azure.cosmos.spark.CosmosPredicates$.assertOnSparkDriver(CosmosPredicates.scala:68)
at com.azure.cosmos.spark.CosmosItemsDataSource.(CosmosItemsDataSource.scala:26)
at sun.reflect.GeneratedConstructorAccessor79.newInstance(Unknown Source)
at sun.reflect.DelegatingConstructor
Additional context
From the call stack it appears that SparkEnv is null — i.e., the Spark runtime has not been fully initialized. We should probably change the CosmosItemsDataSource implementation to not assert on the Spark driver in the constructor, but instead in the individual methods. There is a race condition where DataSources get instantiated during Spark initialization.
Information Checklist
Kindly make sure that you have added all of the above information and checked off the required fields; otherwise we will treat the issue as an incomplete report.
The text was updated successfully, but these errors were encountered: