diff --git a/bundle-3.4/pom.xml b/bundle-3.4/pom.xml
new file mode 100644
index 00000000..1ea36ec9
--- /dev/null
+++ b/bundle-3.4/pom.xml
@@ -0,0 +1,1344 @@
+
+
+
+
+ 4.0.0
+ spark-3.4-spline-agent-bundle_2.12
+ jar
+
+
+ za.co.absa.spline.agent.spark
+ spline-spark-agent_2.12
+ ../pom.xml
+ 2.1.0-SNAPSHOT
+
+
+
+
+ za.co.absa.spline.agent.spark
+ agent-core_${scala.binary.version}
+ ${project.version}
+
+
+
+
+
+
+ javax.activation
+ activation
+ provided
+
+
+ io.airlift
+ aircompressor
+ provided
+
+
+ org.typelevel
+ algebra_2.12
+ provided
+
+
+ org.jetbrains
+ annotations
+ provided
+
+
+ org.antlr
+ antlr-runtime
+ provided
+
+
+ org.antlr
+ antlr4-runtime
+ provided
+
+
+ org.glassfish.hk2.external
+ aopalliance-repackaged
+ provided
+
+
+ dev.ludovic.netlib
+ arpack
+ provided
+
+
+ net.sourceforge.f2j
+ arpack_combined_all
+ provided
+
+
+ org.apache.arrow
+ arrow-format
+ provided
+
+
+ org.apache.arrow
+ arrow-memory-core
+ provided
+
+
+ org.apache.arrow
+ arrow-memory-netty
+ provided
+
+
+ org.apache.arrow
+ arrow-vector
+ provided
+
+
+ org.apache.yetus
+ audience-annotations
+ provided
+
+
+ org.apache.avro
+ avro
+ provided
+
+
+ org.apache.avro
+ avro-ipc
+ provided
+
+
+ org.apache.avro
+ avro-mapred
+ provided
+
+
+ dev.ludovic.netlib
+ blas
+ provided
+
+
+ com.jolbox
+ bonecp
+ provided
+
+
+ org.scalanlp
+ breeze-macros_2.12
+ provided
+
+
+ org.scalanlp
+ breeze_2.12
+ provided
+
+
+ org.typelevel
+ cats-kernel_2.12
+ provided
+
+
+ com.twitter
+ chill-java
+ provided
+
+
+ com.twitter
+ chill_2.12
+ provided
+
+
+ commons-cli
+ commons-cli
+ provided
+
+
+ commons-codec
+ commons-codec
+ provided
+
+
+ commons-collections
+ commons-collections
+ provided
+
+
+ org.apache.commons
+ commons-collections4
+ provided
+
+
+ org.codehaus.janino
+ commons-compiler
+ provided
+
+
+ org.apache.commons
+ commons-compress
+ provided
+
+
+ org.apache.commons
+ commons-crypto
+ provided
+
+
+ commons-dbcp
+ commons-dbcp
+ provided
+
+
+ commons-io
+ commons-io
+ provided
+
+
+ commons-lang
+ commons-lang
+ provided
+
+
+ org.apache.commons
+ commons-lang3
+ provided
+
+
+ commons-logging
+ commons-logging
+ provided
+
+
+ org.apache.commons
+ commons-math3
+ provided
+
+
+ commons-pool
+ commons-pool
+ provided
+
+
+ org.apache.commons
+ commons-text
+ provided
+
+
+ com.ning
+ compress-lzf
+ provided
+
+
+ org.apache.curator
+ curator-client
+ provided
+
+
+ org.apache.curator
+ curator-framework
+ provided
+
+
+ org.apache.curator
+ curator-recipes
+ provided
+
+
+ org.datanucleus
+ datanucleus-api-jdo
+ provided
+
+
+ org.datanucleus
+ datanucleus-core
+ provided
+
+
+ org.datanucleus
+ datanucleus-rdbms
+ provided
+
+
+ org.apache.derby
+ derby
+ provided
+
+
+ com.github.joshelser
+ dropwizard-metrics-hadoop-metrics2-reporter
+ provided
+
+
+ com.google.flatbuffers
+ flatbuffers-java
+ provided
+
+
+ com.google.code.gson
+ gson
+ provided
+
+
+ com.google.guava
+ guava
+ provided
+
+
+ org.apache.hadoop
+ hadoop-client-api
+ provided
+
+
+ org.apache.hadoop
+ hadoop-client-runtime
+ provided
+
+
+ org.apache.hadoop.thirdparty
+ hadoop-shaded-guava
+ provided
+
+
+ org.apache.hadoop
+ hadoop-yarn-server-web-proxy
+ provided
+
+
+ com.zaxxer
+ HikariCP
+ provided
+
+
+ org.apache.hive
+ hive-beeline
+ provided
+
+
+ org.apache.hive
+ hive-cli
+ provided
+
+
+ org.apache.hive
+ hive-common
+ provided
+
+
+ org.apache.hive
+ hive-exec
+ provided
+
+
+ org.apache.hive
+ hive-jdbc
+ provided
+
+
+ org.apache.hive
+ hive-llap-common
+ provided
+
+
+ org.apache.hive
+ hive-metastore
+ provided
+
+
+ org.apache.hive
+ hive-serde
+ provided
+
+
+ org.apache.hive
+ hive-service-rpc
+ provided
+
+
+ org.apache.hive
+ hive-shims
+ provided
+
+
+ org.apache.hive
+ hive-shims
+ provided
+
+
+ org.apache.hive.shims
+ hive-shims-common
+ provided
+
+
+ org.apache.hive.shims
+ hive-shims-scheduler
+ provided
+
+
+ org.apache.hive
+ hive-storage-api
+ provided
+
+
+ org.glassfish.hk2
+ hk2-api
+ provided
+
+
+ org.glassfish.hk2
+ hk2-locator
+ provided
+
+
+ org.glassfish.hk2
+ hk2-utils
+ provided
+
+
+ org.apache.httpcomponents
+ httpclient
+ provided
+
+
+ org.apache.httpcomponents
+ httpcore
+ provided
+
+
+ com.sun.istack
+ istack-commons-runtime
+ provided
+
+
+ org.apache.ivy
+ ivy
+ provided
+
+
+ com.fasterxml.jackson.core
+ jackson-annotations
+ provided
+
+
+ com.fasterxml.jackson.core
+ jackson-core
+ provided
+
+
+ org.codehaus.jackson
+ jackson-core-asl
+ provided
+
+
+ com.fasterxml.jackson.core
+ jackson-databind
+ provided
+
+
+ com.fasterxml.jackson.dataformat
+ jackson-dataformat-yaml
+ provided
+
+
+ com.fasterxml.jackson.datatype
+ jackson-datatype-jsr310
+ provided
+
+
+ org.codehaus.jackson
+ jackson-mapper-asl
+ provided
+
+
+ com.fasterxml.jackson.module
+ jackson-module-scala_2.12
+ provided
+
+
+ jakarta.annotation
+ jakarta.annotation-api
+ provided
+
+
+ org.glassfish.hk2.external
+ jakarta.inject
+ provided
+
+
+ jakarta.servlet
+ jakarta.servlet-api
+ provided
+
+
+ jakarta.validation
+ jakarta.validation-api
+ provided
+
+
+ jakarta.ws.rs
+ jakarta.ws.rs-api
+ provided
+
+
+ jakarta.xml.bind
+ jakarta.xml.bind-api
+ provided
+
+
+ org.codehaus.janino
+ janino
+ provided
+
+
+ org.javassist
+ javassist
+ provided
+
+
+ org.datanucleus
+ javax.jdo
+ provided
+
+
+ javolution
+ javolution
+ provided
+
+
+ org.glassfish.jaxb
+ jaxb-runtime
+ provided
+
+
+ org.slf4j
+ jcl-over-slf4j
+ provided
+
+
+ javax.jdo
+ jdo-api
+ provided
+
+
+ org.glassfish.jersey.core
+ jersey-client
+ provided
+
+
+ org.glassfish.jersey.core
+ jersey-common
+ provided
+
+
+ org.glassfish.jersey.containers
+ jersey-container-servlet
+ provided
+
+
+ org.glassfish.jersey.containers
+ jersey-container-servlet-core
+ provided
+
+
+ org.glassfish.jersey.inject
+ jersey-hk2
+ provided
+
+
+ org.glassfish.jersey.core
+ jersey-server
+ provided
+
+
+ pl.edu.icm
+ JLargeArrays
+ provided
+
+
+ jline
+ jline
+ provided
+
+
+ joda-time
+ joda-time
+ provided
+
+
+ org.jodd
+ jodd-core
+ provided
+
+
+ net.sf.jpam
+ jpam
+ provided
+
+
+ com.tdunning
+ json
+ provided
+
+
+ org.json4s
+ json4s-ast_2.12
+ provided
+
+
+ org.json4s
+ json4s-core_2.12
+ provided
+
+
+ org.json4s
+ json4s-jackson_2.12
+ provided
+
+
+ org.json4s
+ json4s-scalap_2.12
+ provided
+
+
+ com.google.code.findbugs
+ jsr305
+ provided
+
+
+ javax.transaction
+ jta
+ provided
+
+
+ com.github.wendykierp
+ JTransforms
+ provided
+
+
+ org.slf4j
+ jul-to-slf4j
+ provided
+
+
+ com.esotericsoftware
+ kryo-shaded
+ provided
+
+
+ io.fabric8
+ kubernetes-client
+ provided
+
+
+ io.fabric8
+ kubernetes-client-api
+ provided
+
+
+ io.fabric8
+ kubernetes-httpclient-okhttp
+ provided
+
+
+ io.fabric8
+ kubernetes-model-admissionregistration
+ provided
+
+
+ io.fabric8
+ kubernetes-model-apiextensions
+ provided
+
+
+ io.fabric8
+ kubernetes-model-apps
+ provided
+
+
+ io.fabric8
+ kubernetes-model-autoscaling
+ provided
+
+
+ io.fabric8
+ kubernetes-model-batch
+ provided
+
+
+ io.fabric8
+ kubernetes-model-certificates
+ provided
+
+
+ io.fabric8
+ kubernetes-model-common
+ provided
+
+
+ io.fabric8
+ kubernetes-model-coordination
+ provided
+
+
+ io.fabric8
+ kubernetes-model-core
+ provided
+
+
+ io.fabric8
+ kubernetes-model-discovery
+ provided
+
+
+ io.fabric8
+ kubernetes-model-events
+ provided
+
+
+ io.fabric8
+ kubernetes-model-extensions
+ provided
+
+
+ io.fabric8
+ kubernetes-model-flowcontrol
+ provided
+
+
+ io.fabric8
+ kubernetes-model-gatewayapi
+ provided
+
+
+ io.fabric8
+ kubernetes-model-metrics
+ provided
+
+
+ io.fabric8
+ kubernetes-model-networking
+ provided
+
+
+ io.fabric8
+ kubernetes-model-node
+ provided
+
+
+ io.fabric8
+ kubernetes-model-policy
+ provided
+
+
+ io.fabric8
+ kubernetes-model-rbac
+ provided
+
+
+ io.fabric8
+ kubernetes-model-scheduling
+ provided
+
+
+ io.fabric8
+ kubernetes-model-storageclass
+ provided
+
+
+ dev.ludovic.netlib
+ lapack
+ provided
+
+
+ org.fusesource.leveldbjni
+ leveldbjni-all
+ provided
+
+
+ org.apache.thrift
+ libfb303
+ provided
+
+
+ org.apache.thrift
+ libthrift
+ provided
+
+
+ log4j
+ log4j
+ provided
+
+
+ org.apache.logging.log4j
+ log4j-api
+ provided
+
+
+ org.apache.logging.log4j
+ log4j-core
+ provided
+
+
+ org.apache.logging.log4j
+ log4j-slf4j2-impl
+ provided
+
+
+ com.squareup.okhttp3
+ logging-interceptor
+ provided
+
+
+ org.lz4
+ lz4-java
+ provided
+
+
+ org.apache.mesos
+ mesos
+ provided
+
+
+ io.dropwizard.metrics
+ metrics-core
+ provided
+
+
+ io.dropwizard.metrics
+ metrics-graphite
+ provided
+
+
+ io.dropwizard.metrics
+ metrics-jmx
+ provided
+
+
+ io.dropwizard.metrics
+ metrics-json
+ provided
+
+
+ io.dropwizard.metrics
+ metrics-jvm
+ provided
+
+
+ com.esotericsoftware
+ minlog
+ provided
+
+
+ io.netty
+ netty-all
+ provided
+
+
+ io.netty
+ netty-buffer
+ provided
+
+
+ io.netty
+ netty-codec
+ provided
+
+
+ io.netty
+ netty-codec-http
+ provided
+
+
+ io.netty
+ netty-codec-http2
+ provided
+
+
+ io.netty
+ netty-codec-socks
+ provided
+
+
+ io.netty
+ netty-common
+ provided
+
+
+ io.netty
+ netty-handler
+ provided
+
+
+ io.netty
+ netty-handler-proxy
+ provided
+
+
+ io.netty
+ netty-resolver
+ provided
+
+
+ io.netty
+ netty-transport
+ provided
+
+
+ io.netty
+ netty-transport-classes-epoll
+ provided
+
+
+ io.netty
+ netty-transport-classes-kqueue
+ provided
+
+
+ io.netty
+ netty-transport-native-epoll
+ provided
+
+
+ io.netty
+ netty-transport-native-epoll
+ provided
+
+
+ io.netty
+ netty-transport-native-kqueue
+ provided
+
+
+ io.netty
+ netty-transport-native-kqueue
+ provided
+
+
+ io.netty
+ netty-transport-native-unix-common
+ provided
+
+
+ org.objenesis
+ objenesis
+ provided
+
+
+ com.squareup.okhttp3
+ okhttp
+ provided
+
+
+ com.squareup.okio
+ okio
+ provided
+
+
+ net.sf.opencsv
+ opencsv
+ provided
+
+
+ org.apache.orc
+ orc-core
+ provided
+
+
+ org.apache.orc
+ orc-mapreduce
+ provided
+
+
+ org.apache.orc
+ orc-shims
+ provided
+
+
+ oro
+ oro
+ provided
+
+
+ org.glassfish.hk2
+ osgi-resource-locator
+ provided
+
+
+ com.thoughtworks.paranamer
+ paranamer
+ provided
+
+
+ org.apache.parquet
+ parquet-column
+ provided
+
+
+ org.apache.parquet
+ parquet-common
+ provided
+
+
+ org.apache.parquet
+ parquet-encoding
+ provided
+
+
+ org.apache.parquet
+ parquet-format-structures
+ provided
+
+
+ org.apache.parquet
+ parquet-hadoop
+ provided
+
+
+ org.apache.parquet
+ parquet-jackson
+ provided
+
+
+ net.razorvine
+ pickle
+ provided
+
+
+ com.google.protobuf
+ protobuf-java
+ provided
+
+
+ net.sf.py4j
+ py4j
+ provided
+
+
+ org.roaringbitmap
+ RoaringBitmap
+ provided
+
+
+ org.rocksdb
+ rocksdbjni
+ provided
+
+
+ org.scala-lang.modules
+ scala-collection-compat_2.12
+ provided
+
+
+ org.scala-lang
+ scala-compiler
+ provided
+
+
+ org.scala-lang
+ scala-library
+ provided
+
+
+ org.scala-lang.modules
+ scala-parser-combinators_2.12
+ provided
+
+
+ org.scala-lang
+ scala-reflect
+ provided
+
+
+ org.scala-lang.modules
+ scala-xml_2.12
+ provided
+
+
+ org.roaringbitmap
+ shims
+ provided
+
+
+ org.slf4j
+ slf4j-api
+ provided
+
+
+ org.yaml
+ snakeyaml
+ provided
+
+
+ org.xerial.snappy
+ snappy-java
+ provided
+
+
+ org.apache.spark
+ spark-catalyst_2.12
+ provided
+
+
+ org.apache.spark
+ spark-core_2.12
+ provided
+
+
+ org.apache.spark
+ spark-graphx_2.12
+ provided
+
+
+ org.apache.spark
+ spark-hive-thriftserver_2.12
+ provided
+
+
+ org.apache.spark
+ spark-hive_2.12
+ provided
+
+
+ org.apache.spark
+ spark-kubernetes_2.12
+ provided
+
+
+ org.apache.spark
+ spark-kvstore_2.12
+ provided
+
+
+ org.apache.spark
+ spark-launcher_2.12
+ provided
+
+
+ org.apache.spark
+ spark-mesos_2.12
+ provided
+
+
+ org.apache.spark
+ spark-mllib-local_2.12
+ provided
+
+
+ org.apache.spark
+ spark-mllib_2.12
+ provided
+
+
+ org.apache.spark
+ spark-network-common_2.12
+ provided
+
+
+ org.apache.spark
+ spark-network-shuffle_2.12
+ provided
+
+
+ org.apache.spark
+ spark-repl_2.12
+ provided
+
+
+ org.apache.spark
+ spark-sketch_2.12
+ provided
+
+
+ org.apache.spark
+ spark-sql_2.12
+ provided
+
+
+ org.apache.spark
+ spark-streaming_2.12
+ provided
+
+
+ org.apache.spark
+ spark-tags_2.12
+ provided
+
+
+ org.apache.spark
+ spark-unsafe_2.12
+ provided
+
+
+ org.apache.spark
+ spark-yarn_2.12
+ provided
+
+
+ org.typelevel
+ spire-macros_2.12
+ provided
+
+
+ org.typelevel
+ spire-platform_2.12
+ provided
+
+
+ org.typelevel
+ spire-util_2.12
+ provided
+
+
+ org.typelevel
+ spire_2.12
+ provided
+
+
+ org.antlr
+ ST4
+ provided
+
+
+ stax
+ stax-api
+ provided
+
+
+ com.clearspring.analytics
+ stream
+ provided
+
+
+ net.sf.supercsv
+ super-csv
+ provided
+
+
+ org.threeten
+ threeten-extra
+ provided
+
+
+ com.google.crypto.tink
+ tink
+ provided
+
+
+ javax.transaction
+ transaction-api
+ provided
+
+
+ com.univocity
+ univocity-parsers
+ provided
+
+
+ org.apache.xbean
+ xbean-asm9-shaded
+ provided
+
+
+ org.tukaani
+ xz
+ provided
+
+
+ io.fabric8
+ zjsonpatch
+ provided
+
+
+ org.apache.zookeeper
+ zookeeper
+ provided
+
+
+ org.apache.zookeeper
+ zookeeper-jute
+ provided
+
+
+ com.github.luben
+ zstd-jni
+ provided
+
+
+
+
+
+
+
+ org.spurint.maven.plugins
+ mima-maven-plugin
+
+
+ check-abi
+ none
+
+
+
+
+ com.github.cerveada
+ scalatest-maven-plugin
+
+ true
+
+
+
+ org.apache.maven.plugins
+ maven-jar-plugin
+
+
+ empty-javadoc-jar
+ package
+
+ jar
+
+
+ javadoc
+ ${basedir}/javadoc
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-source-plugin
+
+
+ empty-sources-jar
+
+ jar-no-fork
+
+
+ true
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-shade-plugin
+
+
+ package
+
+ shade
+
+
+
+
+
+
+
+
diff --git a/core/src/main/scala/za/co/absa/spline/harvester/plugin/embedded/RDDPlugin.scala b/core/src/main/scala/za/co/absa/spline/harvester/plugin/embedded/RDDPlugin.scala
index e6c876fb..dddfbd0b 100644
--- a/core/src/main/scala/za/co/absa/spline/harvester/plugin/embedded/RDDPlugin.scala
+++ b/core/src/main/scala/za/co/absa/spline/harvester/plugin/embedded/RDDPlugin.scala
@@ -19,7 +19,7 @@ package za.co.absa.spline.harvester.plugin.embedded
import org.apache.spark.Partition
import org.apache.spark.rdd.{HadoopRDD, RDD}
import org.apache.spark.sql.SparkSession
-import org.apache.spark.sql.execution.datasources.FileScanRDD
+import org.apache.spark.sql.execution.datasources.{FileScanRDD, PartitionedFile}
import za.co.absa.spline.commons.reflect.ReflectionUtils
import za.co.absa.spline.harvester.builder._
import za.co.absa.spline.harvester.plugin.Plugin.{Precedence, ReadNodeInfo}
@@ -39,7 +39,8 @@ class RDDPlugin(
override def rddReadNodeProcessor: PartialFunction[RDD[_], ReadNodeInfo] = {
case fsr: FileScanRDD =>
- val uris = fsr.filePartitions.flatMap(_.files.map(_.filePath))
+ val files = fsr.filePartitions.flatMap(_.files)
+ val uris = files.map(extractPath(_))
ReadNodeInfo(SourceIdentifier(None, uris: _*), Map.empty)
case hr: HadoopRDD[_, _] =>
val partitions = ReflectionUtils.extractValue[Array[Partition]](hr, "partitions_")
@@ -47,6 +48,13 @@ class RDDPlugin(
ReadNodeInfo(SourceIdentifier(None, uris: _*), Map.empty)
}
+ private def extractPath(file: PartitionedFile): String = {
+ val path = ReflectionUtils.extractValue[AnyRef](file, "filePath")
+ // For Spark 3.3 and lower, filePath is a String;
+ // for Spark 3.4+ it is an org.apache.spark.paths.SparkPath.
+ path.toString
+ }
+
private def hadoopPartitionToUriString(hadoopPartition: Partition): String = {
val inputSplit = ReflectionUtils.extractValue[AnyRef](hadoopPartition, "inputSplit")
val fileSplitT = ReflectionUtils.extractValue[AnyRef](inputSplit, "t")
@@ -56,5 +64,4 @@ class RDDPlugin(
uri.toString
}
-
}
diff --git a/integration-tests/pom.xml b/integration-tests/pom.xml
index 3dcaadfc..8739193f 100644
--- a/integration-tests/pom.xml
+++ b/integration-tests/pom.xml
@@ -175,7 +175,7 @@
org.elasticsearch
elasticsearch-spark-${elasticsearch.spark.sufix}_${scala.binary.version}
- 8.2.2
+ 8.9.1
test
@@ -267,6 +267,21 @@
+
+ spark-3.4
+
+ 16.0.1
+ 30
+
+
+
+ org.apache.iceberg
+ iceberg-spark-runtime-3.4_${scala.binary.version}
+ 1.3.1
+ test
+
+
+
diff --git a/integration-tests/src/test/scala/za/co/absa/spline/BasicIntegrationTests.scala b/integration-tests/src/test/scala/za/co/absa/spline/BasicIntegrationTests.scala
index 27004e9f..f623e40a 100644
--- a/integration-tests/src/test/scala/za/co/absa/spline/BasicIntegrationTests.scala
+++ b/integration-tests/src/test/scala/za/co/absa/spline/BasicIntegrationTests.scala
@@ -16,6 +16,7 @@
package za.co.absa.spline
+import org.apache.spark.SPARK_VERSION
import org.apache.spark.internal.Logging
import org.apache.spark.sql.Row
import org.apache.spark.sql.SaveMode._
@@ -24,9 +25,13 @@ import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import org.scalatest.flatspec.AsyncFlatSpec
import org.scalatest.matchers.should.Matchers
import za.co.absa.spline.commons.io.TempDirectory
+import za.co.absa.spline.commons.version.Version.VersionStringInterpolator
+import za.co.absa.spline.producer.model.{ExecutionEvent, ExecutionPlan}
import za.co.absa.spline.test.fixture.SparkFixture
import za.co.absa.spline.test.fixture.spline.SplineFixture
+import scala.concurrent.Future
+
class BasicIntegrationTests extends AsyncFlatSpec
with Matchers
with SparkFixture
@@ -117,15 +122,41 @@ class BasicIntegrationTests extends AsyncFlatSpec
.write.mode(Append).saveAsTable(tableName)
}
- (plan2, _) <- captor.lineageOf {
+ // Spark 3.4+ creates 2 commands for both writes here, so we need to ignore one.
+ // We only want the one that comes from CreateDataSourceTableAsSelectCommand;
+ // the one we ignore here is an extra InsertIntoHadoopFsRelationCommand.
+ // They can arrive out of order, so we have to work out which one is which.
+ (plan2, _) <- if (ver"$SPARK_VERSION" >= ver"3.4.0") {
+ captor.lineageOf {
+ Thread.sleep(5000)
+ }
+ } else Future[(ExecutionPlan, Seq[ExecutionEvent])](null, null)
+
+ (plan3, _) <- captor.lineageOf {
spark
.read.table(tableName)
.write.mode(Overwrite).saveAsTable("somewhere")
}
+
+ (plan4, _) <- if (ver"$SPARK_VERSION" >= ver"3.4.0") {
+ captor.lineageOf {
+ Thread.sleep(5000)
+ }
+ } else Future[(ExecutionPlan, Seq[ExecutionEvent])](null, null)
} yield {
println("yield")
- val writeUri = plan1.operations.write.outputSource
- val readUri = plan2.operations.reads.head.inputSources.head
+
+ val writePlan = Seq(plan1, plan2)
+ .filter(null.!=)
+ .find(_.operations.write.name == "CreateDataSourceTableAsSelectCommand")
+ .get
+ val readPlan = Seq(plan3, plan4)
+ .filter(null.!=)
+ .find(_.operations.write.name == "CreateDataSourceTableAsSelectCommand")
+ .get
+
+ val writeUri = writePlan.operations.write.outputSource
+ val readUri = readPlan.operations.reads.head.inputSources.head
writeUri shouldEqual readUri
}
diff --git a/integration-tests/src/test/scala/za/co/absa/spline/DeltaSpec.scala b/integration-tests/src/test/scala/za/co/absa/spline/DeltaSpec.scala
index 4e1c2b72..d8780c03 100644
--- a/integration-tests/src/test/scala/za/co/absa/spline/DeltaSpec.scala
+++ b/integration-tests/src/test/scala/za/co/absa/spline/DeltaSpec.scala
@@ -37,7 +37,7 @@ class DeltaSpec extends AsyncFlatSpec
private val deltaPath = TempDirectory(prefix = "delta", pathOnly = true).deleteOnExit().toURI.toString
it should "support Delta Lake as a source" taggedAs
- ignoreIf(ver"$SPARK_VERSION" < ver"2.4.2") in
+ ignoreIf(ver"$SPARK_VERSION" < ver"2.4.2" || ver"$SPARK_VERSION" >= ver"3.4.0") in
withNewSparkSession { implicit spark =>
withLineageTracking { captor =>
val testData: DataFrame = {
@@ -79,7 +79,7 @@ class DeltaSpec extends AsyncFlatSpec
}
it should "support insert into existing Delta Lake table" taggedAs
- ignoreIf(ver"$SPARK_VERSION" < ver"2.4.2") in
+ ignoreIf(ver"$SPARK_VERSION" < ver"2.4.2" || ver"$SPARK_VERSION" >= ver"3.4.0") in
withNewSparkSession { implicit spark =>
withLineageTracking { lineageCaptor =>
val testData: DataFrame = {
diff --git a/integration-tests/src/test/scala/za/co/absa/spline/KafkaSinkSpec.scala b/integration-tests/src/test/scala/za/co/absa/spline/KafkaSinkSpec.scala
index a8e8eccc..0da51990 100644
--- a/integration-tests/src/test/scala/za/co/absa/spline/KafkaSinkSpec.scala
+++ b/integration-tests/src/test/scala/za/co/absa/spline/KafkaSinkSpec.scala
@@ -48,8 +48,9 @@ class KafkaSinkSpec
EmbeddedKafka.stop()
}
- it should "support Kafka as a write source" in {
+ it should "support Kafka as a write source while reading from multiple Kafka read sources" in {
val topicName = "bananas"
+ val otherTopicName = "anotherTopic"
withNewSparkSession { implicit spark =>
withLineageTracking { captor =>
@@ -74,6 +75,16 @@ class KafkaSinkSpec
.option("topic", topicName)
.save())
+ // Write to another topic to seed lineage for the downstream multi-topic read
+ (_, _) <- captor.lineageOf(
+ testData
+ .selectExpr("CAST (name AS STRING) AS value")
+ .write
+ .format("kafka")
+ .option("kafka.bootstrap.servers", kafkaUrl)
+ .option("topic", otherTopicName)
+ .save())
+
(plan2, _) <- captor.lineageOf(
reader
.option("subscribe", s"$topicName,anotherTopic")
@@ -98,6 +109,7 @@ class KafkaSinkSpec
plan2.operations.reads.head.extra("sourceType") shouldBe Some("kafka")
plan2.operations.reads.head.inputSources should contain(s"kafka:$topicName")
+ plan2.operations.reads.head.inputSources should contain(s"kafka:$otherTopicName")
plan2.operations.reads.head.params should contain key "subscribe"
plan3.operations.reads.head.extra("sourceType") shouldBe Some("kafka")
diff --git a/integration-tests/src/test/scala/za/co/absa/spline/harvester/LineageHarvesterSpec.scala b/integration-tests/src/test/scala/za/co/absa/spline/harvester/LineageHarvesterSpec.scala
index f88a7bca..c63c23b2 100644
--- a/integration-tests/src/test/scala/za/co/absa/spline/harvester/LineageHarvesterSpec.scala
+++ b/integration-tests/src/test/scala/za/co/absa/spline/harvester/LineageHarvesterSpec.scala
@@ -36,6 +36,7 @@ import za.co.absa.spline.test.fixture.spline.SplineFixture
import za.co.absa.spline.test.fixture.{SparkDatabaseFixture, SparkFixture}
import java.util.UUID
+import scala.concurrent.Future
import scala.language.reflectiveCalls
import scala.util.Try
@@ -376,11 +377,25 @@ class LineageHarvesterSpec extends AsyncFlatSpec
val df = spark.createDataset(Seq(TestRow(1, 2.3, "text")))
for {
- (plan, _) <- captor.lineageOf {
+ (plan1, _) <- captor.lineageOf {
df.createOrReplaceTempView("tempView")
spark.sql("CREATE TABLE users_sales AS SELECT i, d, s FROM tempView ")
}
+ // Spark 3.4+ creates 2 commands for this CTAS here, so we need to ignore one.
+ // We only want the one that comes from CreateHiveTableAsSelectCommand;
+ // the one we ignore here is an extra InsertIntoHiveTableCommand.
+ // They can arrive out of order, so we have to work out which one is which.
+ (plan2, _) <- if (ver"$SPARK_VERSION" >= ver"3.4.0") {
+ captor.lineageOf {
+ Thread.sleep(5000)
+ }
+ } else Future[(ExecutionPlan, Seq[ExecutionEvent])](null, null)
} yield {
+ val plan = Seq(plan1, plan2)
+ .filter(null.!=)
+ .find(_.operations.write.name == "CreateHiveTableAsSelectCommand")
+ .get
+
val writeOperation = plan.operations.write
writeOperation.id shouldEqual "op-0"
writeOperation.append shouldEqual false
@@ -500,7 +515,7 @@ class LineageHarvesterSpec extends AsyncFlatSpec
plan should not be null
event.durationNs should be(empty)
event.error should not be empty
- event.error.get.toString should include(s"path ${tmpLocal.toURI.toString.stripSuffix("/")} already exists")
+ event.error.get.toString.toLowerCase should include(s"path ${tmpLocal.toURI.toString.stripSuffix("/")} already exists")
}
}
}
diff --git a/pom.xml b/pom.xml
index 7d94a00b..897ffc97 100644
--- a/pom.xml
+++ b/pom.xml
@@ -91,6 +91,7 @@
3.1.3
3.2.3
3.3.1
+ 3.4.1
@@ -100,6 +101,8 @@
1.0.0
2.0.0
2.1.0
+ 2.4.0
+
${cassandra-connector-24.version}
@@ -108,6 +111,7 @@
3.1.0
3.2.0
3.3.0
+ 3.4.1
0.13.7
@@ -210,6 +214,7 @@
com.github.cerveada
scalatest-maven-plugin
+ I
${project.build.directory}/surefire-reports
.
WDF TestSuite.txt
@@ -815,6 +820,29 @@
+
+ spark-3.4
+
+ ${spark-34.version}
+ ${delta-24.version}
+ ${spark-34.version}_0.19.0
+ ${cassandra-connector-34.version}
+
+
+
+
+ org.apache.spark
+ spark-parent_${scala.binary.version}
+ ${spark.version}
+ pom
+ import
+
+
+
+
+
+
+