-
Notifications
You must be signed in to change notification settings - Fork 86
Closed
Labels
bugSomething isn't workingSomething isn't working
Description
Describe the bug
Running into an issue when trying to read a variable length newline separated ASCII file using Cobrix. Please see the Stacktrace below:
at org.apache.spark.sql.catalyst.encoders.ExpressionEncoder$Serializer.apply(ExpressionEncoder.scala:239)
at org.apache.spark.sql.catalyst.encoders.ExpressionEncoder$Serializer.apply(ExpressionEncoder.scala:210)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:459)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:459)
at scala.collection.Iterator$$anon$10.next(Iterator.scala:459)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.agg_doAggregateWithKeys_0$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:757)
at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:132)
at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
at org.apache.spark.scheduler.ShuffleMapTask.$anonfun$runTask$3(ShuffleMapTask.scala:81)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.scheduler.ShuffleMapTask.$anonfun$runTask$1(ShuffleMapTask.scala:81)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
at org.apache.spark.scheduler.Task.doRunTask(Task.scala:150)
at org.apache.spark.scheduler.Task.$anonfun$run$1(Task.scala:119)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.scheduler.Task.run(Task.scala:91)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$13(Executor.scala:813)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1643)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:816)
at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
at com.databricks.spark.util.ExecutorFrameProfiler$.record(ExecutorFrameProfiler.scala:110)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:672)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ArithmeticException: BigInteger would overflow supported range
at java.math.BigInteger.reportOverflow(BigInteger.java:1084)
at java.math.BigInteger.pow(BigInteger.java:2391)
at java.math.BigDecimal.bigTenToThe(BigDecimal.java:3574)
at java.math.BigDecimal.bigMultiplyPowerTen(BigDecimal.java:3707)
at java.math.BigDecimal.setScale(BigDecimal.java:2448)
at java.math.BigDecimal.setScale(BigDecimal.java:2515)
at scala.math.BigDecimal.setScale(BigDecimal.scala:646)
at org.apache.spark.sql.types.Decimal.set(Decimal.scala:147)
at org.apache.spark.sql.types.Decimal$.apply(Decimal.scala:559)
at org.apache.spark.sql.types.Decimal$.fromDecimal(Decimal.scala:582)
at org.apache.spark.sql.types.Decimal.fromDecimal(Decimal.scala)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection$NestedClass_6.StaticInvoke_1338$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection$NestedClass_6.createNamedStruct_36_4$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection$NestedClass_6.createNamedStruct_36$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection$NestedClass_7.createNamedStruct_52_2$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection$NestedClass_7.createNamedStruct_52$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection$NestedClass_7.createNamedStruct_53_5$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection$NestedClass_7.createNamedStruct_53$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection$NestedClass_8.writeFields_35_15$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection$NestedClass_8.writeFields_35$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source)
at org.apache.spark.sql.catalyst.encoders.ExpressionEncoder$Serializer.apply(ExpressionEncoder.scala:235)
... 30 more
To Reproduce
Steps to reproduce the behaviour OR commands run:
- Read a Variable length ASCII of a bigger size (like at least 100 GB)
- Use the following options to read
.option("encoding", "ascii")
.option("is_text", "true")
- See error
Expected behaviour
The file should parse correctly.
Screenshots
Please see the Stacktrace provided above.
Metadata
Metadata
Assignees
Labels
bugSomething isn't workingSomething isn't working