diff --git a/README.md b/README.md
index 9564c1af4..584993dfe 100644
--- a/README.md
+++ b/README.md
@@ -2569,6 +2569,34 @@ pramen {
}
```
+### (experimental) DynamoDB database
+Here is how you can use a DynamoDB database for storing bookkeeping information:
+
+```hocon
+pramen {
+ bookkeeping.enabled = "true"
+
+ bookkeeping.dynamodb {
+ region = "af-south-1"
+ table.prefix = "pramen_uat"
+ }
+}
+```
+
+DynamoDB tables are automatically created with default options if they don't exist. Use the prefix to create multiple
+Pramen bookkeeping environments per AWS account.
+
+Note that a Pramen project that uses DynamoDB for bookkeeping needs to add the DynamoDB client library as a dependency if it is not provided
+by the Spark cluster (e.g. EMR).
+
+```xml
+<dependency>
+    <groupId>software.amazon.awssdk</groupId>
+    <artifactId>dynamodb</artifactId>
+    <version>${aws.sdk.version}</version>
+</dependency>
+```
+
### Hadoop (CSV+JSON)
This is less recommended way, and is quite slow. But the advantage is that you don't need a database.
diff --git a/pramen/api/src/main/scala/za/co/absa/pramen/api/MetadataManager.scala b/pramen/api/src/main/scala/za/co/absa/pramen/api/MetadataManager.scala
index 0bd91ed93..333a8c782 100644
--- a/pramen/api/src/main/scala/za/co/absa/pramen/api/MetadataManager.scala
+++ b/pramen/api/src/main/scala/za/co/absa/pramen/api/MetadataManager.scala
@@ -18,7 +18,7 @@ package za.co.absa.pramen.api
import java.time.LocalDate
-trait MetadataManager {
+trait MetadataManager extends AutoCloseable {
/**
* Get metadata value for a given table, date and key.
*
diff --git a/pramen/api/src/main/scala/za/co/absa/pramen/api/lock/TokenLockFactory.scala b/pramen/api/src/main/scala/za/co/absa/pramen/api/lock/TokenLockFactory.scala
index d0aaeaaf0..11ac6343c 100644
--- a/pramen/api/src/main/scala/za/co/absa/pramen/api/lock/TokenLockFactory.scala
+++ b/pramen/api/src/main/scala/za/co/absa/pramen/api/lock/TokenLockFactory.scala
@@ -41,6 +41,8 @@
*/
package za.co.absa.pramen.api.lock
-trait TokenLockFactory {
+trait TokenLockFactory extends AutoCloseable {
def getLock(token: String): TokenLock
+
+ override def close(): Unit = {}
}
diff --git a/pramen/core/pom.xml b/pramen/core/pom.xml
index fb500d652..54011be0e 100644
--- a/pramen/core/pom.xml
+++ b/pramen/core/pom.xml
@@ -144,6 +144,13 @@
channel_scala_${scala.compat.version}
+
+        <dependency>
+            <groupId>software.amazon.awssdk</groupId>
+            <artifactId>dynamodb</artifactId>
+            <scope>provided</scope>
+        </dependency>
+
org.mockito
diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/app/AppContext.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/app/AppContext.scala
index 3fca0e018..47454157b 100644
--- a/pramen/core/src/main/scala/za/co/absa/pramen/core/app/AppContext.scala
+++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/app/AppContext.scala
@@ -21,7 +21,7 @@ import za.co.absa.pramen.core.bookkeeper.Bookkeeper
import za.co.absa.pramen.core.journal.Journal
import za.co.absa.pramen.core.metastore.Metastore
-trait AppContext {
+trait AppContext extends AutoCloseable {
val appConfig: AppConfig
def bookkeeper: Bookkeeper
@@ -31,6 +31,4 @@ trait AppContext {
def journal: Journal
def metastore: Metastore
-
- def close(): Unit
}
diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/app/config/BookkeeperConfig.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/app/config/BookkeeperConfig.scala
index cf0f71080..e5434316a 100644
--- a/pramen/core/src/main/scala/za/co/absa/pramen/core/app/config/BookkeeperConfig.scala
+++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/app/config/BookkeeperConfig.scala
@@ -30,7 +30,10 @@ case class BookkeeperConfig(
bookkeepingJdbcConfig: Option[JdbcConfig],
deltaDatabase: Option[String],
deltaTablePrefix: Option[String],
- temporaryDirectory: Option[String]
+ temporaryDirectory: Option[String],
+ dynamoDbRegion: Option[String],
+ dynamoDbTableArn: Option[String],
+ dynamoDbTablePrefix: Option[String]
)
object BookkeeperConfig {
@@ -44,6 +47,9 @@ object BookkeeperConfig {
val BOOKKEEPING_DB_NAME = "pramen.bookkeeping.mongodb.database"
val BOOKKEEPING_DELTA_DB_NAME = "pramen.bookkeeping.delta.database"
val BOOKKEEPING_DELTA_TABLE_PREFIX = "pramen.bookkeeping.delta.table.prefix"
+ val BOOKKEEPING_DYNAMODB_REGION = "pramen.bookkeeping.dynamodb.region"
+ val BOOKKEEPING_DYNAMODB_TABLE_ARN = "pramen.bookkeeping.dynamodb.table.arn"
+ val BOOKKEEPING_DYNAMODB_TABLE_PREFIX = "pramen.bookkeeping.dynamodb.table.prefix"
val BOOKKEEPING_TEMPORARY_DIRECTORY_KEY = "pramen.temporary.directory"
def fromConfig(conf: Config, allowLocalBookkepingStorage: Boolean = false): BookkeeperConfig = {
@@ -56,6 +62,9 @@ object BookkeeperConfig {
val temporaryDirectory = ConfigUtils.getOptionString(conf, BOOKKEEPING_TEMPORARY_DIRECTORY_KEY)
val deltaDatabase = ConfigUtils.getOptionString(conf, BOOKKEEPING_DELTA_DB_NAME)
val deltaTablePrefix = ConfigUtils.getOptionString(conf, BOOKKEEPING_DELTA_TABLE_PREFIX)
+ val dynamoDbRegion = ConfigUtils.getOptionString(conf, BOOKKEEPING_DYNAMODB_REGION)
+ val dynamoDbTableArn = ConfigUtils.getOptionString(conf, BOOKKEEPING_DYNAMODB_TABLE_ARN)
+ val dynamoDbTablePrefix = ConfigUtils.getOptionString(conf, BOOKKEEPING_DYNAMODB_TABLE_PREFIX)
if (bookkeepingEnabled && bookkeepingJdbcConfig.isEmpty && bookkeepingHadoopFormat == HadoopFormat.Delta) {
if (bookkeepingLocation.isEmpty && deltaTablePrefix.isEmpty) {
@@ -63,7 +72,7 @@ object BookkeeperConfig {
s"Preferably $BOOKKEEPING_DELTA_DB_NAME should be defined as well for managed Delta Lake tables.")
}
} else {
- if (bookkeepingEnabled && bookkeepingConnectionString.isEmpty && bookkeepingLocation.isEmpty && bookkeepingJdbcConfig.isEmpty) {
+ if (bookkeepingEnabled && bookkeepingConnectionString.isEmpty && bookkeepingLocation.isEmpty && bookkeepingJdbcConfig.isEmpty && dynamoDbRegion.isEmpty) {
if (allowLocalBookkepingStorage) {
log.warn("Bookkeeping configuration is missing. Using the default SQLite database 'pramen.sqlite'")
return BookkeeperConfig(
@@ -78,10 +87,13 @@ object BookkeeperConfig {
)),
None,
None,
- temporaryDirectory
+ temporaryDirectory,
+ None,
+ None,
+ None
)
} else {
- throw new RuntimeException(s"One of the following should be defined: $BOOKKEEPING_PARENT.jdbc.url, $BOOKKEEPING_CONNECTION_STRING or $BOOKKEEPING_LOCATION" +
+ throw new RuntimeException(s"One of the following should be defined: $BOOKKEEPING_PARENT.jdbc.url, $BOOKKEEPING_CONNECTION_STRING, $BOOKKEEPING_DYNAMODB_REGION, or $BOOKKEEPING_LOCATION" +
s" when bookkeeping is enabled. You can disable bookkeeping by setting $BOOKKEEPING_ENABLED = false.")
}
}
@@ -89,6 +101,10 @@ object BookkeeperConfig {
if (bookkeepingConnectionString.isDefined && bookkeepingDbName.isEmpty) {
throw new RuntimeException(s"Database name is not defined. Please, define $BOOKKEEPING_DB_NAME.")
}
+
+ if (dynamoDbRegion.isDefined && dynamoDbTablePrefix.isEmpty) {
+ log.warn(s"DynamoDB table prefix is not defined. Using default prefix 'pramen'. You can define it with $BOOKKEEPING_DYNAMODB_TABLE_PREFIX.")
+ }
}
BookkeeperConfig(
@@ -100,7 +116,10 @@ object BookkeeperConfig {
bookkeepingJdbcConfig,
deltaDatabase,
deltaTablePrefix,
- temporaryDirectory
+ temporaryDirectory,
+ dynamoDbRegion,
+ dynamoDbTableArn,
+ dynamoDbTablePrefix
)
}
}
\ No newline at end of file
diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/Bookkeeper.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/Bookkeeper.scala
index 62eb8d886..8f0602754 100644
--- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/Bookkeeper.scala
+++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/Bookkeeper.scala
@@ -26,7 +26,7 @@ import za.co.absa.pramen.core.app.config.{BookkeeperConfig, HadoopFormat, Runtim
import za.co.absa.pramen.core.bookkeeper.model.DataAvailability
import za.co.absa.pramen.core.journal._
import za.co.absa.pramen.core.lock._
-import za.co.absa.pramen.core.metadata.{MetadataManagerJdbc, MetadataManagerNull}
+import za.co.absa.pramen.core.metadata.{MetadataManagerDynamoDb, MetadataManagerJdbc, MetadataManagerNull}
import za.co.absa.pramen.core.model.DataChunk
import za.co.absa.pramen.core.mongo.MongoDbConnection
import za.co.absa.pramen.core.rdb.PramenDb
@@ -90,6 +90,7 @@ object Bookkeeper {
}
val hasBookkeepingJdbc = bookkeepingConfig.bookkeepingJdbcConfig.exists(_.primaryUrl.isDefined)
+ val hasBookkeepingDynamoDb = bookkeepingConfig.dynamoDbRegion.isDefined
val dbOpt = if (hasBookkeepingJdbc) {
val jdbcConfig = bookkeepingConfig.bookkeepingJdbcConfig.get
@@ -101,6 +102,14 @@ object Bookkeeper {
if (hasBookkeepingJdbc) {
log.info(s"Using RDB for lock management.")
new TokenLockFactoryJdbc(dbOpt.get.slickDb, dbOpt.get.slickProfile)
+ } else if (hasBookkeepingDynamoDb) {
+ val tablePrefix = bookkeepingConfig.dynamoDbTablePrefix.getOrElse(BookkeeperDynamoDb.DEFAULT_TABLE_PREFIX)
+ log.info(s"Using DynamoDB for lock management in region '${bookkeepingConfig.dynamoDbRegion.get}' with table prefix '$tablePrefix'")
+ TokenLockFactoryDynamoDb.builder
+ .withRegion(bookkeepingConfig.dynamoDbRegion.get)
+ .withTablePrefix(tablePrefix)
+ .withTableArn(bookkeepingConfig.dynamoDbTableArn)
+ .build()
} else {
mongoDbConnection match {
case Some(connection) =>
@@ -129,6 +138,15 @@ object Bookkeeper {
new BookkeeperNull()
} else if (hasBookkeepingJdbc) {
BookkeeperJdbc.fromPramenDb(dbOpt.get, batchId)
+ } else if (hasBookkeepingDynamoDb) {
+ val tablePrefix = bookkeepingConfig.dynamoDbTablePrefix.getOrElse(BookkeeperDynamoDb.DEFAULT_TABLE_PREFIX)
+ log.info(s"Using DynamoDB for bookkeeping in region '${bookkeepingConfig.dynamoDbRegion.get}' with table prefix '$tablePrefix'")
+ BookkeeperDynamoDb.builder
+ .withRegion(bookkeepingConfig.dynamoDbRegion.get)
+ .withBatchId(batchId)
+ .withTablePrefix(tablePrefix)
+ .withTableArn(bookkeepingConfig.dynamoDbTableArn)
+ .build()
} else {
mongoDbConnection match {
case Some(connection) =>
@@ -161,6 +179,14 @@ object Bookkeeper {
} else if (hasBookkeepingJdbc) {
log.info(s"Using RDB to keep journal of executed jobs.")
new JournalJdbc(dbOpt.get.slickDb, dbOpt.get.slickProfile)
+ } else if (hasBookkeepingDynamoDb) {
+ val tablePrefix = bookkeepingConfig.dynamoDbTablePrefix.getOrElse(JournalDynamoDB.DEFAULT_TABLE_PREFIX)
+ log.info(s"Using DynamoDB for journal in region '${bookkeepingConfig.dynamoDbRegion.get}' with table prefix '$tablePrefix'")
+ JournalDynamoDB.builder
+ .withRegion(bookkeepingConfig.dynamoDbRegion.get)
+ .withTablePrefix(tablePrefix)
+ .withTableArn(bookkeepingConfig.dynamoDbTableArn)
+ .build()
} else {
mongoDbConnection match {
case Some(connection) =>
@@ -194,6 +220,14 @@ object Bookkeeper {
} else if (hasBookkeepingJdbc) {
log.info(s"Using RDB to keep custom metadata.")
new MetadataManagerJdbc(dbOpt.get.slickDb, dbOpt.get.slickProfile)
+ } else if (hasBookkeepingDynamoDb) {
+ val tablePrefix = bookkeepingConfig.dynamoDbTablePrefix.getOrElse(MetadataManagerDynamoDb.DEFAULT_TABLE_PREFIX)
+ log.info(s"Using DynamoDB for metadata in region '${bookkeepingConfig.dynamoDbRegion.get}' with table prefix '$tablePrefix'")
+ MetadataManagerDynamoDb.builder
+ .withRegion(bookkeepingConfig.dynamoDbRegion.get)
+ .withTablePrefix(tablePrefix)
+ .withTableArn(bookkeepingConfig.dynamoDbTableArn)
+ .build()
} else {
log.info(s"The custom metadata management is not supported.")
new MetadataManagerNull(isPersistenceEnabled = true)
@@ -203,6 +237,9 @@ object Bookkeeper {
override def close(): Unit = {
mongoDbConnection.foreach(_.close())
dbOpt.foreach(_.close())
+ tokenFactory.close()
+ journal.close()
+ metadataManager.close()
}
}
diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala
new file mode 100644
index 000000000..734a9ff23
--- /dev/null
+++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala
@@ -0,0 +1,1014 @@
+/*
+ * Copyright 2022 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.pramen.core.bookkeeper
+
+import org.apache.spark.sql.types.StructType
+import org.slf4j.LoggerFactory
+import software.amazon.awssdk.auth.credentials.AwsCredentialsProvider
+import software.amazon.awssdk.regions.Region
+import software.amazon.awssdk.services.dynamodb.DynamoDbClient
+import software.amazon.awssdk.services.dynamodb.model._
+import za.co.absa.pramen.core.bookkeeper.model.DataAvailability
+import za.co.absa.pramen.core.model.{DataChunk, TableSchema}
+import za.co.absa.pramen.core.utils.{AlgorithmUtils, TimeUtils}
+
+import java.net.URI
+import java.time.LocalDate
+import scala.collection.JavaConverters._
+import scala.util.control.NonFatal
+
+/**
+ * DynamoDB-based implementation of the Bookkeeper.
+ *
+ * Table schema for bookkeeping:
+ * - Partition key: tableName (String)
+ * - Sort key: infoDateSortKey (String in "yyyy-MM-dd#jobFinishedMillis" format)
+ * The composite sort key allows multiple entries for the same table and date.
+ *
+ * Table schema for schemas:
+ * - Partition key: tableName (String)
+ * - Sort key: infoDate (String in yyyy-MM-dd format)
+ *
+ * @param dynamoDbClient The DynamoDB client to use for operations
+ * @param batchId The batch ID for this execution
+ * @param tableArn Optional ARN prefix for DynamoDB tables (e.g., "arn:aws:dynamodb:region:account-id:table/")
+ * @param tablePrefix Prefix for table names to allow multiple bookkeeping sets in the same account (default: "pramen")
+ */
+class BookkeeperDynamoDb private (
+ dynamoDbClient: DynamoDbClient,
+ batchId: Long,
+ tableArn: Option[String] = None,
+ tablePrefix: String = BookkeeperDynamoDb.DEFAULT_TABLE_PREFIX
+) extends BookkeeperBase(isBookkeepingEnabled = true, batchId) {
+
+ import BookkeeperDynamoDb._
+
+ private val log = LoggerFactory.getLogger(this.getClass)
+ private val queryWarningTimeoutMs = 10000L
+
+ // Construct table names with prefix
+ private val bookkeepingTableBaseName = s"${tablePrefix}_$DEFAULT_BOOKKEEPING_TABLE"
+ private val schemaTableBaseName = s"${tablePrefix}_$DEFAULT_SCHEMA_TABLE"
+
+ // Full table names/ARNs
+ private val bookkeepingTableName = getFullTableName(tableArn, bookkeepingTableBaseName)
+ private val schemaTableName = getFullTableName(tableArn, schemaTableBaseName)
+
+ // Offset management
+ private val offsetManagement = new OffsetManagerCached(
+ new OffsetManagerDynamoDb(dynamoDbClient, batchId, tableArn, tablePrefix, closesClient = false)
+ )
+
+ private val offsetManagementDynamoDB = new OffsetManagerDynamoDb(dynamoDbClient, batchId, tableArn, tablePrefix, closesClient = false)
+
+ // Initialize tables on construction
+ init()
+
+ override val bookkeepingEnabled: Boolean = true
+
+ /**
+ * Initializes the DynamoDB tables for bookkeeping and schemas.
+ * Checks if tables exist and creates them if they don't.
+ */
+ def init(): Unit = {
+ try {
+ log.info(s"Initializing DynamoDB bookkeeper with tables: bookkeeping='$bookkeepingTableName', schemas='$schemaTableName'")
+
+ // Initialize bookkeeping table
+ if (!tableExists(bookkeepingTableName)) {
+ log.info(s"Creating DynamoDB bookkeeping table: $bookkeepingTableName")
+ createBookkeepingTable(bookkeepingTableName)
+ log.info(s"Successfully created bookkeeping table: $bookkeepingTableName")
+ } else {
+ log.info(s"DynamoDB bookkeeping table already exists: $bookkeepingTableName")
+ }
+
+ // Initialize schema table
+ if (!tableExists(schemaTableName)) {
+ log.info(s"Creating DynamoDB schema table: $schemaTableName")
+ createSchemaTable(schemaTableName)
+ log.info(s"Successfully created schema table: $schemaTableName")
+ } else {
+ log.info(s"DynamoDB schema table already exists: $schemaTableName")
+ }
+
+ log.info(s"DynamoDB bookkeeper initialization complete")
+ } catch {
+ case NonFatal(ex) =>
+ log.error("Error initializing DynamoDB bookkeeper tables", ex)
+ throw new RuntimeException("Failed to initialize DynamoDB bookkeeper", ex)
+ }
+ }
+
+ /**
+ * Checks if a DynamoDB table exists.
+ *
+ * @param tableName The name of the table to check
+ * @return true if the table exists, false otherwise
+ */
+ private def tableExists(tableName: String): Boolean = {
+ try {
+ val describeRequest = DescribeTableRequest.builder()
+ .tableName(tableName)
+ .build()
+
+ dynamoDbClient.describeTable(describeRequest)
+ true
+ } catch {
+ case _: ResourceNotFoundException => false
+ case NonFatal(ex) =>
+ log.warn(s"Error checking if table exists: $tableName", ex)
+ throw ex
+ }
+ }
+
+ /**
+ * Creates the bookkeeping table with the appropriate schema.
+ * Uses a composite sort key (infoDate#jobFinished) to allow multiple entries per table and date.
+ *
+ * @param tableName The name of the table to create
+ */
+ private def createBookkeepingTable(tableName: String): Unit = {
+ val createTableRequest = CreateTableRequest.builder()
+ .tableName(tableName)
+ .keySchema(
+ KeySchemaElement.builder()
+ .attributeName(ATTR_TABLE_NAME)
+ .keyType(KeyType.HASH)
+ .build(),
+ KeySchemaElement.builder()
+ .attributeName(ATTR_INFO_DATE_SORT_KEY)
+ .keyType(KeyType.RANGE)
+ .build()
+ )
+ .attributeDefinitions(
+ AttributeDefinition.builder()
+ .attributeName(ATTR_TABLE_NAME)
+ .attributeType(ScalarAttributeType.S)
+ .build(),
+ AttributeDefinition.builder()
+ .attributeName(ATTR_INFO_DATE_SORT_KEY)
+ .attributeType(ScalarAttributeType.S)
+ .build()
+ )
+ .billingMode(BillingMode.PAY_PER_REQUEST) // On-demand billing
+ .build()
+
+ dynamoDbClient.createTable(createTableRequest)
+
+ // Wait for table to become active
+ waitForTableActive(tableName, dynamoDbClient)
+ }
+
+ /**
+ * Creates the schema table with the appropriate schema.
+ *
+ * @param tableName The name of the table to create
+ */
+ private def createSchemaTable(tableName: String): Unit = {
+ val createTableRequest = CreateTableRequest.builder()
+ .tableName(tableName)
+ .keySchema(
+ KeySchemaElement.builder()
+ .attributeName(ATTR_TABLE_NAME)
+ .keyType(KeyType.HASH)
+ .build(),
+ KeySchemaElement.builder()
+ .attributeName(ATTR_INFO_DATE)
+ .keyType(KeyType.RANGE)
+ .build()
+ )
+ .attributeDefinitions(
+ AttributeDefinition.builder()
+ .attributeName(ATTR_TABLE_NAME)
+ .attributeType(ScalarAttributeType.S)
+ .build(),
+ AttributeDefinition.builder()
+ .attributeName(ATTR_INFO_DATE)
+ .attributeType(ScalarAttributeType.S)
+ .build()
+ )
+ .billingMode(BillingMode.PAY_PER_REQUEST) // On-demand billing
+ .build()
+
+ dynamoDbClient.createTable(createTableRequest)
+
+ // Wait for table to become active
+ waitForTableActive(tableName, dynamoDbClient)
+ }
+
+ override def getLatestProcessedDateFromStorage(table: String, until: Option[LocalDate]): Option[LocalDate] = {
+ try {
+ val queryBuilder = QueryRequest.builder()
+ .tableName(bookkeepingTableName)
+ .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName")
+ .expressionAttributeValues(Map(
+ ":tableName" -> AttributeValue.builder().s(table).build()
+ ).asJava)
+ .scanIndexForward(false) // descending order
+
+ val query = until match {
+ case Some(endDate) =>
+ val endDateStr = getDateStr(endDate)
+ // Query using prefix on the sort key since we need items with infoDate <= endDate
+ queryBuilder
+ .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName AND $ATTR_INFO_DATE_SORT_KEY <= :endDateMax")
+ .expressionAttributeValues(Map(
+ ":tableName" -> AttributeValue.builder().s(table).build(),
+ // Use max possible value after date to get all entries for that date and before
+ ":endDateMax" -> AttributeValue.builder().s(s"${endDateStr}#~").build()
+ ).asJava)
+ case None =>
+ queryBuilder
+ }
+
+ val response = dynamoDbClient.query(query.build())
+ val items = response.items().asScala
+
+ if (items.isEmpty) {
+ None
+ } else {
+ // Find the maximum infoDateEnd
+ val latestDate = items
+ .map(item => LocalDate.parse(item.get(ATTR_INFO_DATE_END).s()))
+ .maxBy(_.toEpochDay)
+ Some(latestDate)
+ }
+ } catch {
+ case NonFatal(ex) =>
+ log.error(s"Error querying latest processed date for table '$table'", ex)
+ throw ex
+ }
+ }
+
+ override def getLatestDataChunkFromStorage(table: String, infoDate: LocalDate): Option[DataChunk] = {
+ try {
+ val dateStr = getDateStr(infoDate)
+
+ val queryRequest = QueryRequest.builder()
+ .tableName(bookkeepingTableName)
+ .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName AND begins_with($ATTR_INFO_DATE_SORT_KEY, :infoDatePrefix)")
+ .expressionAttributeValues(Map(
+ ":tableName" -> AttributeValue.builder().s(table).build(),
+ ":infoDatePrefix" -> AttributeValue.builder().s(s"$dateStr#").build()
+ ).asJava)
+ .scanIndexForward(false) // descending order by sort key (latest jobFinished first)
+ .build()
+
+ val response = dynamoDbClient.query(queryRequest)
+ val items = response.items().asScala
+
+ if (items.isEmpty) {
+ None
+ } else {
+ // Take the first item (already sorted in descending order)
+ items
+ .map(itemToDataChunk)
+ .headOption
+ }
+ } catch {
+ case NonFatal(ex) =>
+ log.error(s"Error getting latest data chunk for table '$table' at $infoDate", ex)
+ throw ex
+ }
+ }
+
+ override def getDataChunksFromStorage(table: String, infoDate: LocalDate, batchIdFilter: Option[Long]): Seq[DataChunk] = {
+ try {
+ val dateStr = getDateStr(infoDate)
+
+ val queryBuilder = QueryRequest.builder()
+ .tableName(bookkeepingTableName)
+ .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName AND begins_with($ATTR_INFO_DATE_SORT_KEY, :infoDatePrefix)")
+ .expressionAttributeValues(Map(
+ ":tableName" -> AttributeValue.builder().s(table).build(),
+ ":infoDatePrefix" -> AttributeValue.builder().s(s"$dateStr#").build()
+ ).asJava)
+
+ val query = batchIdFilter match {
+ case Some(bId) =>
+ queryBuilder
+ .filterExpression(s"$ATTR_BATCH_ID = :batchId")
+ .expressionAttributeValues(Map(
+ ":tableName" -> AttributeValue.builder().s(table).build(),
+ ":infoDatePrefix" -> AttributeValue.builder().s(s"$dateStr#").build(),
+ ":batchId" -> AttributeValue.builder().n(bId.toString).build()
+ ).asJava)
+ case None =>
+ queryBuilder
+ }
+
+ val response = dynamoDbClient.query(query.build())
+ val chunks = response.items().asScala
+ .map(itemToDataChunk)
+ .sortBy(_.jobFinished)
+ .toSeq
+
+ log.debug(s"For $table ($infoDate) : ${chunks.mkString("[ ", ", ", " ]")}")
+ chunks
+ } catch {
+ case NonFatal(ex) =>
+ log.error(s"Error getting data chunks for table '$table' at $infoDate", ex)
+ throw ex
+ }
+ }
+
+ override def getDataChunksCountFromStorage(table: String, dateBeginOpt: Option[LocalDate], dateEndOpt: Option[LocalDate]): Long = {
+ try {
+ var count = 0L
+ var lastEvaluatedKey: java.util.Map[String, AttributeValue] = null
+
+ do {
+ val queryBuilder = buildQueryForDateRange(table, dateBeginOpt, dateEndOpt)
+ .select(Select.COUNT)
+
+ if (lastEvaluatedKey != null) {
+ queryBuilder.exclusiveStartKey(lastEvaluatedKey)
+ }
+
+ val response = dynamoDbClient.query(queryBuilder.build())
+ count += response.count()
+ lastEvaluatedKey = response.lastEvaluatedKey()
+ } while (lastEvaluatedKey != null && !lastEvaluatedKey.isEmpty)
+
+ count
+ } catch {
+ case NonFatal(ex) =>
+ log.error(s"Error counting data chunks for table '$table'", ex)
+ throw ex
+ }
+ }
+
+ override def getDataAvailabilityFromStorage(table: String, dateBegin: LocalDate, dateEnd: LocalDate): Seq[DataAvailability] = {
+ try {
+ val allChunks = getAllChunksInDateRange(table, dateBegin, dateEnd)
+
+ // Group by infoDate and aggregate
+ val grouped = allChunks.groupBy(_.infoDate)
+ val availability = grouped.map { case (dateStr, chunks) =>
+ val date = LocalDate.parse(dateStr)
+ val totalRecords = chunks.map(_.outputRecordCount).sum
+ DataAvailability(date, chunks.length, totalRecords)
+ }.toSeq.sortBy(_.infoDate.toEpochDay)
+
+ availability
+ } catch {
+ case NonFatal(ex) =>
+ log.error(s"Error getting data availability for table '$table'", ex)
+ throw ex
+ }
+ }
+
+ override def saveRecordCountToStorage(
+ table: String,
+ infoDate: LocalDate,
+ inputRecordCount: Long,
+ outputRecordCount: Long,
+ recordsAppended: Option[Long],
+ jobStarted: Long,
+ jobFinished: Long
+ ): Unit = {
+ try {
+ val dateStr = getDateStr(infoDate)
+ val sortKey = buildSortKey(dateStr, jobFinished)
+
+ val item = dataChunkToItem(
+ DataChunk(table, dateStr, dateStr, dateStr, inputRecordCount, outputRecordCount, jobStarted, jobFinished, Some(batchId), recordsAppended),
+ sortKey
+ )
+
+ val putRequest = PutItemRequest.builder()
+ .tableName(bookkeepingTableName)
+ .item(item)
+ .build()
+
+ dynamoDbClient.putItem(putRequest)
+ log.debug(s"Saved bookkeeping record for table '$table', infoDate='$dateStr', sortKey='$sortKey', batchId=$batchId")
+ } catch {
+ case NonFatal(ex) =>
+ log.error(s"Error saving record count for table '$table' at $infoDate", ex)
+ throw ex
+ }
+ }
+
+ override def deleteNonCurrentBatchRecords(table: String, infoDate: LocalDate): Unit = {
+ try {
+ val dateStr = getDateStr(infoDate)
+
+ AlgorithmUtils.runActionWithElapsedTimeEvent(queryWarningTimeoutMs) {
+ // Query all items for this table and date
+ val queryRequest = QueryRequest.builder()
+ .tableName(bookkeepingTableName)
+ .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName AND begins_with($ATTR_INFO_DATE_SORT_KEY, :infoDatePrefix)")
+ .expressionAttributeValues(Map(
+ ":tableName" -> AttributeValue.builder().s(table).build(),
+ ":infoDatePrefix" -> AttributeValue.builder().s(s"$dateStr#").build()
+ ).asJava)
+ .build()
+
+ val response = dynamoDbClient.query(queryRequest)
+ val items = response.items().asScala
+
+ // Filter and delete items with different batchId
+ items.foreach { item =>
+ val itemBatchId = Option(item.get(ATTR_BATCH_ID)).flatMap(av =>
+ if (av.n() != null) Some(av.n().toLong) else None
+ )
+
+ if (itemBatchId.exists(_ != batchId)) {
+ val sortKey = item.get(ATTR_INFO_DATE_SORT_KEY).s()
+ val deleteRequest = DeleteItemRequest.builder()
+ .tableName(bookkeepingTableName)
+ .key(Map(
+ ATTR_TABLE_NAME -> AttributeValue.builder().s(table).build(),
+ ATTR_INFO_DATE_SORT_KEY -> AttributeValue.builder().s(sortKey).build()
+ ).asJava)
+ .conditionExpression(s"$ATTR_JOB_FINISHED = :jobFinished")
+ .expressionAttributeValues(Map(
+ ":jobFinished" -> item.get(ATTR_JOB_FINISHED)
+ ).asJava)
+ .build()
+
+ try {
+ dynamoDbClient.deleteItem(deleteRequest)
+ } catch {
+ case _: ConditionalCheckFailedException =>
+ // Item was already modified or deleted, ignore
+ log.info(s"Could not delete item for table '$table', sortKey '$sortKey' - already modified")
+ }
+ }
+ }
+ } { actualTimeMs =>
+ val elapsedTime = TimeUtils.prettyPrintElapsedTimeShort(actualTimeMs)
+ log.warn(s"DynamoDB query took too long ($elapsedTime) while deleting from $bookkeepingTableName, tableName='$table', infoDate='$infoDate', batchId!=$batchId")
+ }
+ } catch {
+ case NonFatal(ex) =>
+ log.error(s"Error deleting non-current batch records for table '$table' at $infoDate", ex)
+ throw ex
+ }
+ }
+
+ override def deleteTable(tableName: String): Seq[String] = {
+ try {
+ val results = scala.collection.mutable.ListBuffer[String]()
+
+ // Delete from bookkeeping table
+ val bookkeepingCount = deleteTableFromBookkeeping(tableName)
+ results += s"Deleted $bookkeepingCount bookkeeping records for table '$tableName'"
+
+ // Delete from schema table
+ val schemaCount = deleteTableFromSchemas(tableName)
+ results += s"Deleted $schemaCount schema records for table '$tableName'"
+
+ // Delete offsets
+ val offsetResults = offsetManagementDynamoDB.deleteAllOffsets(tableName, dynamoDbClient)
+ results += s"Deleted $offsetResults offset records for table '$tableName'"
+
+ log.info(s"Successfully deleted all records for table '$tableName'")
+ results.toSeq
+ } catch {
+ case NonFatal(ex) =>
+ log.error(s"Error deleting table '$tableName'", ex)
+ throw ex
+ }
+ }
+
+ /**
+ * Deletes all bookkeeping records for the specified table.
+ *
+ * @param tableName The name of the table to delete
+ * @return The number of records deleted
+ */
+ private def deleteTableFromBookkeeping(tableName: String): Int = {
+ var deletedCount = 0
+ var lastEvaluatedKey: java.util.Map[String, AttributeValue] = null
+
+ do {
+ val queryBuilder = QueryRequest.builder()
+ .tableName(bookkeepingTableName)
+ .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName")
+ .expressionAttributeValues(Map(
+ ":tableName" -> AttributeValue.builder().s(tableName).build()
+ ).asJava)
+
+ if (lastEvaluatedKey != null) {
+ queryBuilder.exclusiveStartKey(lastEvaluatedKey)
+ }
+
+ val response = dynamoDbClient.query(queryBuilder.build())
+ val items = response.items().asScala
+
+ // Delete each item
+ items.foreach { item =>
+ val sortKey = item.get(ATTR_INFO_DATE_SORT_KEY).s()
+ val deleteRequest = DeleteItemRequest.builder()
+ .tableName(bookkeepingTableName)
+ .key(Map(
+ ATTR_TABLE_NAME -> AttributeValue.builder().s(tableName).build(),
+ ATTR_INFO_DATE_SORT_KEY -> AttributeValue.builder().s(sortKey).build()
+ ).asJava)
+ .build()
+
+ try {
+ dynamoDbClient.deleteItem(deleteRequest)
+ deletedCount += 1
+ } catch {
+ case NonFatal(ex) =>
+ log.warn(s"Failed to delete bookkeeping item for table '$tableName', sortKey '$sortKey'", ex)
+ }
+ }
+
+ lastEvaluatedKey = response.lastEvaluatedKey()
+ } while (lastEvaluatedKey != null && !lastEvaluatedKey.isEmpty)
+
+ deletedCount
+ }
+
+ /**
+ * Deletes all schema records for the specified table.
+ *
+ * @param tableName The name of the table to delete
+ * @return The number of records deleted
+ */
+ private def deleteTableFromSchemas(tableName: String): Int = {
+ var deletedCount = 0
+ var lastEvaluatedKey: java.util.Map[String, AttributeValue] = null
+
+ do {
+ val queryBuilder = QueryRequest.builder()
+ .tableName(schemaTableName)
+ .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName")
+ .expressionAttributeValues(Map(
+ ":tableName" -> AttributeValue.builder().s(tableName).build()
+ ).asJava)
+
+ if (lastEvaluatedKey != null) {
+ queryBuilder.exclusiveStartKey(lastEvaluatedKey)
+ }
+
+ val response = dynamoDbClient.query(queryBuilder.build())
+ val items = response.items().asScala
+
+ // Delete each item
+ items.foreach { item =>
+ val infoDate = item.get(ATTR_INFO_DATE).s()
+ val deleteRequest = DeleteItemRequest.builder()
+ .tableName(schemaTableName)
+ .key(Map(
+ ATTR_TABLE_NAME -> AttributeValue.builder().s(tableName).build(),
+ ATTR_INFO_DATE -> AttributeValue.builder().s(infoDate).build()
+ ).asJava)
+ .build()
+
+ try {
+ dynamoDbClient.deleteItem(deleteRequest)
+ deletedCount += 1
+ } catch {
+ case NonFatal(ex) =>
+ log.warn(s"Failed to delete schema item for table '$tableName', infoDate '$infoDate'", ex)
+ }
+ }
+
+ lastEvaluatedKey = response.lastEvaluatedKey()
+ } while (lastEvaluatedKey != null && !lastEvaluatedKey.isEmpty)
+
+ deletedCount
+ }
+
+  /**
+    * Returns the latest saved schema for a table with an info date at or before `until`.
+    *
+    * Only the newest matching record is fetched by querying the sort key in
+    * descending order and limiting the result to a single item.
+    */
+  override def getLatestSchema(tableName: String, until: LocalDate): Option[(StructType, LocalDate)] = {
+    try {
+      val request = QueryRequest.builder()
+        .tableName(schemaTableName)
+        .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName AND $ATTR_INFO_DATE <= :untilDate")
+        .expressionAttributeValues(Map(
+          ":tableName" -> AttributeValue.builder().s(tableName).build(),
+          ":untilDate" -> AttributeValue.builder().s(until.toString).build()
+        ).asJava)
+        .scanIndexForward(false) // newest infoDate first
+        .limit(1)
+        .build()
+
+      val newestItem = dynamoDbClient.query(request).items().asScala.headOption
+
+      newestItem.flatMap { record =>
+        TableSchema.toSchemaAndDate(
+          TableSchema(
+            tableName = record.get(ATTR_TABLE_NAME).s(),
+            infoDate = record.get(ATTR_INFO_DATE).s(),
+            schemaJson = record.get(ATTR_SCHEMA_JSON).s()
+          )
+        )
+      }
+    } catch {
+      case NonFatal(ex) =>
+        log.error(s"Error getting latest schema for table '$tableName' until $until", ex)
+        throw ex
+    }
+  }
+
+  /**
+    * Persists the schema of a table for a given information date.
+    * The put is an upsert keyed by (tableName, infoDate).
+    */
+  private[pramen] override def saveSchema(tableName: String, infoDate: LocalDate, schema: StructType): Unit = {
+    try {
+      dynamoDbClient.putItem(
+        PutItemRequest.builder()
+          .tableName(schemaTableName)
+          .item(Map(
+            ATTR_TABLE_NAME -> AttributeValue.builder().s(tableName).build(),
+            ATTR_INFO_DATE -> AttributeValue.builder().s(infoDate.toString).build(),
+            ATTR_SCHEMA_JSON -> AttributeValue.builder().s(schema.json).build()
+          ).asJava)
+          .build()
+      )
+      log.debug(s"Saved schema for table '$tableName', infoDate='$infoDate'")
+    } catch {
+      case NonFatal(ex) =>
+        log.error(s"Error saving schema for table '$tableName' at $infoDate", ex)
+        throw ex
+    }
+  }
+
+  /** Returns the offset manager backed by the same DynamoDB client as this bookkeeper. */
+  private[pramen] override def getOffsetManager: OffsetManager = offsetManagement
+
+  /**
+    * Releases the underlying DynamoDB client.
+    *
+    * offsetManagement wraps an OffsetManagerDynamoDb that shares this client,
+    * so it does not need to be closed separately.
+    */
+  override def close(): Unit = {
+    try dynamoDbClient.close()
+    catch {
+      case NonFatal(ex) => log.warn("Error closing DynamoDB client", ex)
+    }
+  }
+
+  /** Converts a raw DynamoDB item of the bookkeeping table into a DataChunk. */
+  private def itemToDataChunk(item: java.util.Map[String, AttributeValue]): DataChunk = {
+    // Optional numeric attributes may be missing entirely or present without a numeric value.
+    def optionalLong(attrName: String): Option[Long] =
+      Option(item.get(attrName)).flatMap(av => Option(av.n()).map(_.toLong))
+
+    DataChunk(
+      tableName = item.get(ATTR_TABLE_NAME).s(),
+      infoDate = item.get(ATTR_INFO_DATE).s(),
+      infoDateBegin = item.get(ATTR_INFO_DATE_BEGIN).s(),
+      infoDateEnd = item.get(ATTR_INFO_DATE_END).s(),
+      inputRecordCount = item.get(ATTR_INPUT_RECORD_COUNT).n().toLong,
+      outputRecordCount = item.get(ATTR_OUTPUT_RECORD_COUNT).n().toLong,
+      jobStarted = item.get(ATTR_JOB_STARTED).n().toLong,
+      jobFinished = item.get(ATTR_JOB_FINISHED).n().toLong,
+      batchId = optionalLong(ATTR_BATCH_ID),
+      appendedRecordCount = optionalLong(ATTR_APPENDED_RECORD_COUNT)
+    )
+  }
+
+  /**
+    * Converts a DataChunk into a DynamoDB item for the bookkeeping table.
+    * Optional fields are only included when they are defined.
+    */
+  private def dataChunkToItem(chunk: DataChunk, sortKey: String): java.util.Map[String, AttributeValue] = {
+    val requiredAttrs = Map(
+      ATTR_TABLE_NAME -> AttributeValue.builder().s(chunk.tableName).build(),
+      ATTR_INFO_DATE_SORT_KEY -> AttributeValue.builder().s(sortKey).build(),
+      ATTR_INFO_DATE -> AttributeValue.builder().s(chunk.infoDate).build(),
+      ATTR_INFO_DATE_BEGIN -> AttributeValue.builder().s(chunk.infoDateBegin).build(),
+      ATTR_INFO_DATE_END -> AttributeValue.builder().s(chunk.infoDateEnd).build(),
+      ATTR_INPUT_RECORD_COUNT -> AttributeValue.builder().n(chunk.inputRecordCount.toString).build(),
+      ATTR_OUTPUT_RECORD_COUNT -> AttributeValue.builder().n(chunk.outputRecordCount.toString).build(),
+      ATTR_JOB_STARTED -> AttributeValue.builder().n(chunk.jobStarted.toString).build(),
+      ATTR_JOB_FINISHED -> AttributeValue.builder().n(chunk.jobFinished.toString).build()
+    )
+
+    val optionalAttrs =
+      chunk.batchId.map(id => ATTR_BATCH_ID -> AttributeValue.builder().n(id.toString).build()).toMap ++
+        chunk.appendedRecordCount.map(cnt => ATTR_APPENDED_RECORD_COUNT -> AttributeValue.builder().n(cnt.toString).build()).toMap
+
+    (requiredAttrs ++ optionalAttrs).asJava
+  }
+
+  /**
+    * Builds a query for the bookkeeping table restricted by an optional info date range.
+    *
+    * Sort keys have the form "infoDate#jobFinished". A "#" suffix starts the range for
+    * a date and "#~" ends it ('~' is greater than any digit in ASCII, so it bounds
+    * every "infoDate#<millis>" key for that date).
+    */
+  private def buildQueryForDateRange(
+    table: String,
+    dateBeginOpt: Option[LocalDate],
+    dateEndOpt: Option[LocalDate]
+  ): QueryRequest.Builder = {
+    val tableCondition = s"$ATTR_TABLE_NAME = :tableName"
+    val tableAttr: Map[String, AttributeValue] = Map(":tableName" -> AttributeValue.builder().s(table).build())
+
+    def boundAttr(name: String, value: String): (String, AttributeValue) =
+      name -> AttributeValue.builder().s(value).build()
+
+    val (keyExpression, attrValues) = (dateBeginOpt, dateEndOpt) match {
+      case (Some(begin), Some(end)) =>
+        (s"$tableCondition AND $ATTR_INFO_DATE_SORT_KEY BETWEEN :beginDate AND :endDateMax",
+          tableAttr + boundAttr(":beginDate", s"${getDateStr(begin)}#") + boundAttr(":endDateMax", s"${getDateStr(end)}#~"))
+      case (Some(begin), None) =>
+        (s"$tableCondition AND $ATTR_INFO_DATE_SORT_KEY >= :beginDate",
+          tableAttr + boundAttr(":beginDate", s"${getDateStr(begin)}#"))
+      case (None, Some(end)) =>
+        (s"$tableCondition AND $ATTR_INFO_DATE_SORT_KEY <= :endDateMax",
+          tableAttr + boundAttr(":endDateMax", s"${getDateStr(end)}#~"))
+      case (None, None) =>
+        (tableCondition, tableAttr)
+    }
+
+    QueryRequest.builder()
+      .tableName(bookkeepingTableName)
+      .keyConditionExpression(keyExpression)
+      .expressionAttributeValues(attrValues.asJava)
+  }
+
+  /**
+    * Reads all data chunks for a table within the given info date range,
+    * following DynamoDB pagination until the last page is consumed.
+    */
+  private def getAllChunksInDateRange(table: String, dateBegin: LocalDate, dateEnd: LocalDate): Seq[DataChunk] = {
+    val collected = scala.collection.mutable.ArrayBuffer.empty[DataChunk]
+    var pageKey: java.util.Map[String, AttributeValue] = null
+
+    do {
+      val builder = buildQueryForDateRange(table, Some(dateBegin), Some(dateEnd))
+      if (pageKey != null) {
+        builder.exclusiveStartKey(pageKey)
+      }
+
+      val page = dynamoDbClient.query(builder.build())
+      collected ++= page.items().asScala.map(itemToDataChunk)
+      pageKey = page.lastEvaluatedKey()
+    } while (pageKey != null && !pageKey.isEmpty)
+
+    collected.toSeq
+  }
+
+  /**
+   * Builds the composite sort key used by the bookkeeping table.
+   *
+   * @param infoDate    The information date as a string
+   * @param jobFinished The job finished timestamp in milliseconds
+   * @return Sort key in the form "infoDate#jobFinished"
+   */
+  private def buildSortKey(infoDate: String, jobFinished: Long): String = {
+    infoDate + "#" + jobFinished.toString
+  }
+
+  /**
+   * Extracts the infoDate portion from a composite sort key ("infoDate#jobFinished").
+   * A key without a '#' separator is returned unchanged.
+   *
+   * @param sortKey Composite sort key
+   * @return The infoDate portion of the key
+   */
+  private def extractInfoDate(sortKey: String): String = {
+    sortKey.split("#") match {
+      case Array(infoDate, _*) => infoDate
+      case _                   => sortKey
+    }
+  }
+}
+
+object BookkeeperDynamoDb {
+  val DEFAULT_BOOKKEEPING_TABLE = "bookkeeping"
+  val DEFAULT_SCHEMA_TABLE = "schemas"
+  val DEFAULT_TABLE_PREFIX = "pramen"
+
+  // Attribute names for bookkeeping table
+  val ATTR_TABLE_NAME = "tableName"
+  val ATTR_INFO_DATE = "infoDate"
+  val ATTR_INFO_DATE_SORT_KEY = "infoDateSortKey" // Composite: "infoDate#jobFinished"
+  val ATTR_INFO_DATE_BEGIN = "infoDateBegin"
+  val ATTR_INFO_DATE_END = "infoDateEnd"
+  val ATTR_INPUT_RECORD_COUNT = "inputRecordCount"
+  val ATTR_OUTPUT_RECORD_COUNT = "outputRecordCount"
+  val ATTR_JOB_STARTED = "jobStarted"
+  val ATTR_JOB_FINISHED = "jobFinished"
+  val ATTR_BATCH_ID = "batchId"
+  val ATTR_APPENDED_RECORD_COUNT = "appendedRecordCount"
+
+  // Attribute names for schema table
+  val ATTR_SCHEMA_JSON = "schemaJson"
+
+  val MODEL_VERSION = 1
+
+  private val log = LoggerFactory.getLogger(this.getClass)
+
+  /**
+   * Builder for creating BookkeeperDynamoDb instances.
+   * Provides a fluent API for configuring DynamoDB bookkeeper.
+   *
+   * Example:
+   * {{{
+   * val bookkeeper = BookkeeperDynamoDb.builder
+   *   .withRegion("us-east-1")
+   *   .withBatchId(System.currentTimeMillis())
+   *   .withTablePrefix("my_app")
+   *   .build()
+   * }}}
+   */
+  class BookkeeperDynamoDbBuilder {
+    private var region: Option[String] = None
+    private var batchId: Option[Long] = None
+    private var tableArn: Option[String] = None
+    private var tablePrefix: String = DEFAULT_TABLE_PREFIX
+    private var credentialsProvider: Option[AwsCredentialsProvider] = None
+    private var endpoint: Option[String] = None
+
+    /**
+     * Sets the AWS region for the DynamoDB client.
+     *
+     * @param region AWS region (e.g., "us-east-1", "eu-west-1")
+     * @return this builder
+     */
+    def withRegion(region: String): BookkeeperDynamoDbBuilder = {
+      this.region = Some(region)
+      this
+    }
+
+    /**
+     * Sets the batch ID for this bookkeeper instance.
+     *
+     * @param batchId Batch ID (typically timestamp in milliseconds)
+     * @return this builder
+     */
+    def withBatchId(batchId: Long): BookkeeperDynamoDbBuilder = {
+      this.batchId = Some(batchId)
+      this
+    }
+
+    /**
+     * Sets the table ARN prefix for cross-account or cross-region access.
+     *
+     * @param arn ARN prefix (e.g., "arn:aws:dynamodb:us-east-1:123456789012:table/")
+     * @return this builder
+     */
+    def withTableArn(arn: String): BookkeeperDynamoDbBuilder = {
+      this.tableArn = Some(arn)
+      this
+    }
+
+    /**
+     * Sets the table ARN prefix for cross-account or cross-region access
+     * (Option variant; None leaves the ARN unset).
+     *
+     * @param arnOpt Optional ARN prefix (e.g., "arn:aws:dynamodb:us-east-1:123456789012:table/")
+     * @return this builder
+     */
+    def withTableArn(arnOpt: Option[String]): BookkeeperDynamoDbBuilder = {
+      this.tableArn = arnOpt
+      this
+    }
+
+    /**
+     * Sets the table name prefix to allow multiple bookkeeping sets in the same account.
+     *
+     * @param prefix Table name prefix (default: "pramen")
+     * @return this builder
+     */
+    def withTablePrefix(prefix: String): BookkeeperDynamoDbBuilder = {
+      this.tablePrefix = prefix
+      this
+    }
+
+    /**
+     * Sets custom AWS credentials provider.
+     *
+     * @param provider AWS credentials provider
+     * @return this builder
+     */
+    def withCredentialsProvider(provider: AwsCredentialsProvider): BookkeeperDynamoDbBuilder = {
+      this.credentialsProvider = Some(provider)
+      this
+    }
+
+    /**
+     * Sets a custom DynamoDB endpoint (useful for testing with LocalStack or DynamoDB Local).
+     *
+     * @param endpoint Endpoint URI (e.g., "http://localhost:8000")
+     * @return this builder
+     */
+    def withEndpoint(endpoint: String): BookkeeperDynamoDbBuilder = {
+      this.endpoint = Some(endpoint)
+      this
+    }
+
+    /**
+     * Builds the BookkeeperDynamoDb instance.
+     *
+     * @return Configured BookkeeperDynamoDb instance
+     * @throws IllegalArgumentException if required parameters (batchId, region) are missing
+     */
+    def build(): BookkeeperDynamoDb = {
+      val actualBatchId = batchId.getOrElse(throw new IllegalArgumentException("BatchId is not supplied when building the instance of BookkeeperDynamoDb"))
+
+      // The builder always constructs its own DynamoDB client, so the region is mandatory.
+      // (The previous message mentioned a 'dynamoDbClient' option that this builder does not have.)
+      val actualRegion = region.getOrElse(throw new IllegalArgumentException("Region is not supplied when building the instance of BookkeeperDynamoDb"))
+
+      val clientBuilder = DynamoDbClient.builder()
+        .region(Region.of(actualRegion))
+
+      credentialsProvider.foreach(clientBuilder.credentialsProvider)
+
+      endpoint.foreach { ep =>
+        clientBuilder.endpointOverride(URI.create(ep))
+      }
+
+      val client = clientBuilder.build()
+
+      try {
+        new BookkeeperDynamoDb(
+          dynamoDbClient = client,
+          batchId = actualBatchId,
+          tableArn = tableArn,
+          tablePrefix = tablePrefix
+        )
+      } catch {
+        case NonFatal(ex) =>
+          // Avoid leaking the client if the bookkeeper constructor fails (e.g. during table creation).
+          client.close()
+          throw ex
+      }
+    }
+  }
+
+  def builder: BookkeeperDynamoDbBuilder = new BookkeeperDynamoDbBuilder
+
+  /**
+   * Constructs the full table name using ARN prefix and table name.
+   * If tableArn is provided, uses it as a prefix, otherwise returns just the table name.
+   *
+   * @param tableArn Optional ARN prefix for the table
+   * @param tableName The table name
+   * @return Full table name or ARN
+   */
+  def getFullTableName(tableArn: Option[String], tableName: String): String = {
+    tableArn match {
+      case Some(arn) if arn.nonEmpty =>
+        // If ARN ends with table/, append the table name, otherwise append /table/tableName
+        if (arn.endsWith("table/")) {
+          s"$arn$tableName"
+        } else if (arn.endsWith("/")) {
+          s"${arn}table/$tableName"
+        } else {
+          s"$arn/table/$tableName"
+        }
+      case _ => tableName
+    }
+  }
+
+  /**
+   * Waits for a table to become active after creation.
+   *
+   * @param tableName      The name of the table to wait for
+   * @param dynamoDbClient The DynamoDB client used to poll the table status
+   * @param maxWaitSeconds Maximum time to wait in seconds (default: 60)
+   * @throws RuntimeException if the table does not become ACTIVE within the allotted time
+   */
+  def waitForTableActive(tableName: String, dynamoDbClient: DynamoDbClient, maxWaitSeconds: Int = 60): Unit = {
+    val startTime = System.currentTimeMillis()
+    val maxWaitMs = maxWaitSeconds * 1000L
+
+    var tableActive = false
+    while (!tableActive && (System.currentTimeMillis() - startTime) < maxWaitMs) {
+      try {
+        val describeRequest = DescribeTableRequest.builder()
+          .tableName(tableName)
+          .build()
+
+        val response = dynamoDbClient.describeTable(describeRequest)
+        val status = response.table().tableStatus()
+
+        if (status == TableStatus.ACTIVE) {
+          tableActive = true
+          log.info(s"Table $tableName is now ACTIVE")
+        } else {
+          log.info(s"Table $tableName status: $status, waiting...")
+          Thread.sleep(2000) // Wait 2 seconds before checking again
+        }
+      } catch {
+        case NonFatal(ex) =>
+          // describeTable can fail transiently right after table creation; keep polling.
+          log.warn(s"Error checking table status for $tableName", ex)
+          Thread.sleep(2000)
+      }
+    }
+
+    if (!tableActive) {
+      throw new RuntimeException(s"Table $tableName did not become active within $maxWaitSeconds seconds")
+    }
+  }
+}
diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManager.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManager.scala
index 684825429..39addcd27 100644
--- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManager.scala
+++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManager.scala
@@ -33,7 +33,7 @@ import java.time.LocalDate
* The startWriteOffsets() together with commitOffsets() and rollbackOffsets() provide mechanisms to ensure consistency
* with data.
*/
-trait OffsetManager {
+trait OffsetManager extends AutoCloseable {
/**
* Returns offsets for an information date.
*
diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerCached.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerCached.scala
index 8da69ce30..52f411ab0 100644
--- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerCached.scala
+++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerCached.scala
@@ -25,21 +25,21 @@ import java.time.LocalDate
import scala.collection.mutable
/**
- * The offset manager decorator handles caching or repeated queries.
+ * The offset manager decorator handles caching of repeated queries.
*/
class OffsetManagerCached(offsetManager: OffsetManager) extends OffsetManager {
private val log = LoggerFactory.getLogger(this.getClass)
private val aggregatedOffsetsCache = new mutable.HashMap[(String, Option[LocalDate]), Option[DataOffsetAggregated]]
- def getOffsets(table: String, infoDate: LocalDate): Array[DataOffset] = {
+ override def getOffsets(table: String, infoDate: LocalDate): Array[DataOffset] = {
offsetManager.getOffsets(table, infoDate)
}
- def getUncommittedOffsets(table: String, onlyForInfoDate: Option[LocalDate]): Array[UncommittedOffset] = {
+ override def getUncommittedOffsets(table: String, onlyForInfoDate: Option[LocalDate]): Array[UncommittedOffset] = {
offsetManager.getUncommittedOffsets(table, onlyForInfoDate)
}
- def getMaxInfoDateAndOffset(table: String, onlyForInfoDate: Option[LocalDate]): Option[DataOffsetAggregated] = synchronized {
+ override def getMaxInfoDateAndOffset(table: String, onlyForInfoDate: Option[LocalDate]): Option[DataOffsetAggregated] = synchronized {
val tbl = onlyForInfoDate match {
case Some(date) => s"'$table' for '$date'"
case None => s"'$table'"
@@ -57,11 +57,11 @@ class OffsetManagerCached(offsetManager: OffsetManager) extends OffsetManager {
}
}
- def startWriteOffsets(table: String, infoDate: LocalDate, offsetType: OffsetType): DataOffsetRequest = {
+ override def startWriteOffsets(table: String, infoDate: LocalDate, offsetType: OffsetType): DataOffsetRequest = {
offsetManager.startWriteOffsets(table, infoDate, offsetType)
}
- def commitOffsets(request: DataOffsetRequest, minOffset: OffsetValue, maxOffset: OffsetValue): Unit = {
+ override def commitOffsets(request: DataOffsetRequest, minOffset: OffsetValue, maxOffset: OffsetValue): Unit = {
offsetManager.commitOffsets(request, minOffset, maxOffset)
this.synchronized {
@@ -69,7 +69,7 @@ class OffsetManagerCached(offsetManager: OffsetManager) extends OffsetManager {
}
}
- def commitRerun(request: DataOffsetRequest, minOffset: OffsetValue, maxOffset: OffsetValue): Unit = {
+ override def commitRerun(request: DataOffsetRequest, minOffset: OffsetValue, maxOffset: OffsetValue): Unit = {
this.synchronized {
aggregatedOffsetsCache --= aggregatedOffsetsCache.keys.filter(_._1 == request.tableName)
}
@@ -77,7 +77,7 @@ class OffsetManagerCached(offsetManager: OffsetManager) extends OffsetManager {
offsetManager.commitRerun(request, minOffset, maxOffset)
}
- def postCommittedRecords(commitRequests: Seq[OffsetCommitRequest]): Unit = {
+ override def postCommittedRecords(commitRequests: Seq[OffsetCommitRequest]): Unit = {
offsetManager.postCommittedRecords(commitRequests)
val updatedTables = commitRequests.map(_.table).toSet
@@ -86,10 +86,14 @@ class OffsetManagerCached(offsetManager: OffsetManager) extends OffsetManager {
}
}
- def rollbackOffsets(request: DataOffsetRequest): Unit = {
+ override def rollbackOffsets(request: DataOffsetRequest): Unit = {
offsetManager.rollbackOffsets(request)
}
+ override def close(): Unit = {
+ offsetManager.close()
+ }
+
private def renderAggregatedOptionalOffset(offsetsOpt: Option[DataOffsetAggregated]): String = {
offsetsOpt match {
case Some(offsets) =>
diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerDynamoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerDynamoDb.scala
new file mode 100644
index 000000000..0c5f872d8
--- /dev/null
+++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerDynamoDb.scala
@@ -0,0 +1,642 @@
+/*
+ * Copyright 2022 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.pramen.core.bookkeeper
+
+import org.slf4j.LoggerFactory
+import software.amazon.awssdk.auth.credentials.AwsCredentialsProvider
+import software.amazon.awssdk.regions.Region
+import software.amazon.awssdk.services.dynamodb.DynamoDbClient
+import software.amazon.awssdk.services.dynamodb.model._
+import za.co.absa.pramen.api.offset.DataOffset.UncommittedOffset
+import za.co.absa.pramen.api.offset.{DataOffset, OffsetType, OffsetValue}
+import za.co.absa.pramen.core.bookkeeper.BookkeeperDynamoDb.waitForTableActive
+import za.co.absa.pramen.core.bookkeeper.model._
+
+import java.net.URI
+import java.time.{Instant, LocalDate}
+import scala.collection.JavaConverters._
+import scala.util.control.NonFatal
+
+/**
+ * DynamoDB-based offset manager for tracking incremental ingestion offsets.
+ *
+ * Table schema for offsets:
+ * - Partition key: pramenTableName (String)
+ * - Sort key: compositeKey (String) - format: "infoDate#createdAtMilli" for efficient querying
+ *
+ * The composite sort key allows:
+ * 1. Efficient queries for all offsets of a table+infoDate combination
+ * 2. Time-ordered offset records
+ * 3. Support for aggregation queries (fetch all offsets for a table+date)
+ *
+ * @param dynamoDbClient The DynamoDB client to use
+ * @param batchId The batch ID for this execution
+ * @param tableArn Optional ARN prefix for the offset table
+ * @param tablePrefix Prefix for the offset table name (default: "pramen")
+ */
+class OffsetManagerDynamoDb(
+ dynamoDbClient: DynamoDbClient,
+ batchId: Long,
+ tableArn: Option[String] = None,
+ tablePrefix: String = OffsetManagerDynamoDb.DEFAULT_TABLE_PREFIX,
+ closesClient: Boolean = true
+) extends OffsetManager {
+
+ import OffsetManagerDynamoDb._
+
+ private val log = LoggerFactory.getLogger(this.getClass)
+
+ private val offsetTableBaseName = s"${tablePrefix}_${DEFAULT_OFFSET_TABLE}"
+ private val offsetTableFullName = BookkeeperDynamoDb.getFullTableName(tableArn, offsetTableBaseName)
+
+ // Initialize table on creation
+ createOffsetTableIfNotExists()
+
+  /**
+    * Returns all offset records for a table and information date as DataOffsets.
+    */
+  override def getOffsets(table: String, infoDate: LocalDate): Array[DataOffset] = {
+    // Mapping an empty array yields an empty array, so the previous explicit
+    // `return Array.empty` special case was redundant ( `return` is also
+    // unidiomatic in Scala).
+    getOffsetRecords(table, infoDate).map(OffsetRecordConverter.toDataOffset)
+  }
+
+  /**
+    * Returns offsets that were started but never committed, either for a single
+    * information date or for the whole table.
+    */
+  override def getUncommittedOffsets(table: String, onlyForInfoDate: Option[LocalDate]): Array[UncommittedOffset] = {
+    try {
+      onlyForInfoDate match {
+        case Some(infoDate) =>
+          // A single table + info date combination: reuse the record reader and
+          // keep only records without a commit timestamp.
+          getOffsetRecords(table, infoDate)
+            .filter(_.committedAtMilli.isEmpty)
+            .map(rec => OffsetRecordConverter.toDataOffset(rec).asInstanceOf[UncommittedOffset])
+
+        case None =>
+          // All info dates: query the whole partition with a server-side filter
+          // for records lacking the committed-at attribute, following pagination.
+          val collected = scala.collection.mutable.ArrayBuffer.empty[java.util.Map[String, AttributeValue]]
+          var pageKey: java.util.Map[String, AttributeValue] = null
+
+          do {
+            val builder = QueryRequest.builder()
+              .tableName(offsetTableFullName)
+              .keyConditionExpression(s"$ATTR_PRAMEN_TABLE_NAME = :table_name")
+              .filterExpression(s"attribute_not_exists($ATTR_COMMITTED_AT)")
+              .expressionAttributeValues(Map(
+                ":table_name" -> AttributeValue.builder().s(table).build()
+              ).asJava)
+
+            if (pageKey != null) {
+              builder.exclusiveStartKey(pageKey)
+            }
+
+            val page = dynamoDbClient.query(builder.build())
+            collected ++= page.items().asScala
+            pageKey = page.lastEvaluatedKey()
+          } while (pageKey != null && !pageKey.isEmpty)
+
+          collected
+            .map(itemToOffsetRecord)
+            .map(rec => OffsetRecordConverter.toDataOffset(rec).asInstanceOf[UncommittedOffset])
+            .toArray
+      }
+    } catch {
+      case NonFatal(ex) =>
+        throw new RuntimeException(s"Unable to read uncommitted offsets from the offset table '$offsetTableFullName'.", ex)
+    }
+  }
+
+  /**
+    * Returns aggregated min/max offsets for the given info date, or for the
+    * latest info date on record when none is specified.
+    */
+  override def getMaxInfoDateAndOffset(table: String, onlyForInfoDate: Option[LocalDate]): Option[DataOffsetAggregated] = {
+    val infoDateToUse = onlyForInfoDate.orElse(getMaximumInfoDate(table))
+
+    try {
+      infoDateToUse.flatMap(date => getMinMaxOffsets(table, date))
+    } catch {
+      case NonFatal(ex) => throw new RuntimeException(s"Unable to read from the offset table '$offsetTableFullName'.", ex)
+    }
+  }
+
+  /**
+    * Registers the beginning of an offset-tracked write. The record is created
+    * with empty min/max offsets, which are filled in by commitOffsets().
+    */
+  override def startWriteOffsets(table: String, infoDate: LocalDate, offsetType: OffsetType): DataOffsetRequest = {
+    val createdAt = Instant.now()
+    val createdAtMilli = createdAt.toEpochMilli
+
+    // The sort key encodes both the info date and the creation time.
+    val newItem = Map(
+      ATTR_PRAMEN_TABLE_NAME -> AttributeValue.builder().s(table).build(),
+      ATTR_COMPOSITE_KEY -> AttributeValue.builder().s(s"$infoDate#$createdAtMilli").build(),
+      ATTR_INFO_DATE -> AttributeValue.builder().s(infoDate.toString).build(),
+      ATTR_DATA_TYPE -> AttributeValue.builder().s(offsetType.dataTypeString).build(),
+      ATTR_MIN_OFFSET -> AttributeValue.builder().s("").build(),
+      ATTR_MAX_OFFSET -> AttributeValue.builder().s("").build(),
+      ATTR_BATCH_ID -> AttributeValue.builder().n(batchId.toString).build(),
+      ATTR_CREATED_AT -> AttributeValue.builder().n(createdAtMilli.toString).build()
+    )
+
+    try {
+      dynamoDbClient.putItem(
+        PutItemRequest.builder()
+          .tableName(offsetTableFullName)
+          .item(newItem.asJava)
+          .build()
+      )
+
+      DataOffsetRequest(table, infoDate, batchId, createdAt)
+    } catch {
+      case NonFatal(ex) =>
+        throw new RuntimeException(s"Unable to write to the offset table '$offsetTableFullName'.", ex)
+    }
+  }
+
+  /**
+    * Marks an offset record as committed by filling in the min/max offset range
+    * and the commit timestamp in a single update.
+    */
+  override def commitOffsets(request: DataOffsetRequest, minOffset: OffsetValue, maxOffset: OffsetValue): Unit = {
+    val committedAtMilli = Instant.now().toEpochMilli
+    val sortKey = s"${request.infoDate}#${request.createdAt.toEpochMilli}"
+
+    try {
+      dynamoDbClient.updateItem(
+        UpdateItemRequest.builder()
+          .tableName(offsetTableFullName)
+          .key(Map(
+            ATTR_PRAMEN_TABLE_NAME -> AttributeValue.builder().s(request.tableName).build(),
+            ATTR_COMPOSITE_KEY -> AttributeValue.builder().s(sortKey).build()
+          ).asJava)
+          .updateExpression(s"SET ${ATTR_MIN_OFFSET} = :min_offset, ${ATTR_MAX_OFFSET} = :max_offset, ${ATTR_COMMITTED_AT} = :committed_at")
+          .expressionAttributeValues(Map(
+            ":min_offset" -> AttributeValue.builder().s(minOffset.valueString).build(),
+            ":max_offset" -> AttributeValue.builder().s(maxOffset.valueString).build(),
+            ":committed_at" -> AttributeValue.builder().n(committedAtMilli.toString).build()
+          ).asJava)
+          .build()
+      )
+    } catch {
+      case NonFatal(ex) =>
+        throw new RuntimeException(s"Unable to commit offsets to the offset table '$offsetTableFullName'.", ex)
+    }
+  }
+
+  /**
+   * Commits a rerun: the offsets of the current request replace all previously
+   * recorded offsets for the same table and information date.
+   *
+   * NOTE(review): the update and the subsequent deletes are separate DynamoDB calls,
+   * so a failure in between can leave stale offset records behind — confirm this is
+   * acceptable for rerun semantics.
+   *
+   * @throws IllegalArgumentException if minOffset is greater than maxOffset
+   */
+  override def commitRerun(request: DataOffsetRequest, minOffset: OffsetValue, maxOffset: OffsetValue): Unit = {
+    if (minOffset.compareTo(maxOffset) > 0) {
+      throw new IllegalArgumentException(s"minOffset is greater than maxOffset: ${minOffset.valueString} > ${maxOffset.valueString}")
+    }
+
+    val committedAt = Instant.now().toEpochMilli
+    val compositeKey = s"${request.infoDate.toString}#${request.createdAt.toEpochMilli}"
+
+    try {
+      // First, update the current offset record with the final min/max range and
+      // the commit timestamp.
+      val updateRequest = UpdateItemRequest.builder()
+        .tableName(offsetTableFullName)
+        .key(Map(
+          ATTR_PRAMEN_TABLE_NAME -> AttributeValue.builder().s(request.tableName).build(),
+          ATTR_COMPOSITE_KEY -> AttributeValue.builder().s(compositeKey).build()
+        ).asJava)
+        .updateExpression(s"SET ${ATTR_MIN_OFFSET} = :min_offset, ${ATTR_MAX_OFFSET} = :max_offset, ${ATTR_COMMITTED_AT} = :committed_at")
+        .expressionAttributeValues(Map(
+          ":min_offset" -> AttributeValue.builder().s(minOffset.valueString).build(),
+          ":max_offset" -> AttributeValue.builder().s(maxOffset.valueString).build(),
+          ":committed_at" -> AttributeValue.builder().n(committedAt.toString).build()
+        ).asJava)
+        .build()
+
+      dynamoDbClient.updateItem(updateRequest)
+
+      // Then, delete all other offsets for this table and info date, identified
+      // by a createdAt timestamp different from the current request's.
+      val allOffsets = getOffsetRecords(request.tableName, request.infoDate)
+      allOffsets
+        .filter(r => r.createdAtMilli != request.createdAt.toEpochMilli)
+        .foreach { record =>
+          val deleteCompositeKey = s"${record.infoDate}#${record.createdAtMilli}"
+          val deleteRequest = DeleteItemRequest.builder()
+            .tableName(offsetTableFullName)
+            .key(Map(
+              ATTR_PRAMEN_TABLE_NAME -> AttributeValue.builder().s(request.tableName).build(),
+              ATTR_COMPOSITE_KEY -> AttributeValue.builder().s(deleteCompositeKey).build()
+            ).asJava)
+            .build()
+
+          dynamoDbClient.deleteItem(deleteRequest)
+        }
+    } catch {
+      case NonFatal(ex) =>
+        throw new RuntimeException(s"Unable to commit rerun to the offset table '$offsetTableFullName'.", ex)
+    }
+  }
+
+  /**
+   * Writes a batch of already-committed offset records and removes the records
+   * they supersede.
+   *
+   * All new records share a single commit timestamp, which is then used to tell
+   * the fresh records apart from the old ones during cleanup.
+   *
+   * NOTE(review): the puts and deletes are separate, non-transactional DynamoDB
+   * calls — a failure part-way through can leave a mixture of old and new records.
+   */
+  override def postCommittedRecords(commitRequests: Seq[OffsetCommitRequest]): Unit = {
+    val committedAt = Instant.now()
+    val committedAtMilli = committedAt.toEpochMilli
+
+    try {
+      // Insert all new committed records
+      commitRequests.foreach { req =>
+        val compositeKey = s"${req.infoDate.toString}#${req.createdAt.toEpochMilli}"
+
+        val putRequest = PutItemRequest.builder()
+          .tableName(offsetTableFullName)
+          .item(Map(
+            ATTR_PRAMEN_TABLE_NAME -> AttributeValue.builder().s(req.table).build(),
+            ATTR_COMPOSITE_KEY -> AttributeValue.builder().s(compositeKey).build(),
+            ATTR_INFO_DATE -> AttributeValue.builder().s(req.infoDate.toString).build(),
+            ATTR_DATA_TYPE -> AttributeValue.builder().s(req.minOffset.dataType.dataTypeString).build(),
+            ATTR_MIN_OFFSET -> AttributeValue.builder().s(req.minOffset.valueString).build(),
+            ATTR_MAX_OFFSET -> AttributeValue.builder().s(req.maxOffset.valueString).build(),
+            ATTR_BATCH_ID -> AttributeValue.builder().n(batchId.toString).build(),
+            ATTR_CREATED_AT -> AttributeValue.builder().n(req.createdAt.toEpochMilli.toString).build(),
+            ATTR_COMMITTED_AT -> AttributeValue.builder().n(committedAtMilli.toString).build()
+          ).asJava)
+          .build()
+
+        dynamoDbClient.putItem(putRequest)
+      }
+
+      // Delete old offsets for each (table, infoDate) pair. Only records committed
+      // at a different time are removed; uncommitted records are left untouched.
+      commitRequests.map(r => (r.table, r.infoDate))
+        .distinct
+        .foreach { case (table, infoDate) =>
+          val allOffsets = getOffsetRecords(table, infoDate)
+          allOffsets
+            .filter(_.committedAtMilli.exists(_ != committedAtMilli))
+            .foreach { record =>
+              val deleteCompositeKey = s"${record.infoDate}#${record.createdAtMilli}"
+              val deleteRequest = DeleteItemRequest.builder()
+                .tableName(offsetTableFullName)
+                .key(Map(
+                  ATTR_PRAMEN_TABLE_NAME -> AttributeValue.builder().s(table).build(),
+                  ATTR_COMPOSITE_KEY -> AttributeValue.builder().s(deleteCompositeKey).build()
+                ).asJava)
+                .build()
+
+              dynamoDbClient.deleteItem(deleteRequest)
+            }
+        }
+    } catch {
+      case NonFatal(ex) =>
+        throw new RuntimeException(s"Unable to post committed records to the offset table '$offsetTableFullName'.", ex)
+    }
+  }
+
+  /**
+    * Rolls back an in-progress offset write by removing its record.
+    */
+  override def rollbackOffsets(request: DataOffsetRequest): Unit = {
+    val sortKey = s"${request.infoDate}#${request.createdAt.toEpochMilli}"
+
+    try {
+      dynamoDbClient.deleteItem(
+        DeleteItemRequest.builder()
+          .tableName(offsetTableFullName)
+          .key(Map(
+            ATTR_PRAMEN_TABLE_NAME -> AttributeValue.builder().s(request.tableName).build(),
+            ATTR_COMPOSITE_KEY -> AttributeValue.builder().s(sortKey).build()
+          ).asJava)
+          .build()
+      )
+    } catch {
+      case NonFatal(ex) =>
+        throw new RuntimeException(s"Unable to rollback offsets in the offset table '$offsetTableFullName'.", ex)
+    }
+  }
+
+  /**
+   * Reads every offset record for the given table and information date,
+   * following DynamoDB pagination until the last page is consumed.
+   */
+  private[core] def getOffsetRecords(table: String, infoDate: LocalDate): Array[OffsetRecord] = {
+    // Sort keys are "infoDate#createdAtMilli", so a begins_with on "infoDate#"
+    // selects all records for the date.
+    val datePrefix = s"$infoDate#"
+    val collected = scala.collection.mutable.ArrayBuffer.empty[java.util.Map[String, AttributeValue]]
+    var pageKey: java.util.Map[String, AttributeValue] = null
+
+    try {
+      do {
+        val builder = QueryRequest.builder()
+          .tableName(offsetTableFullName)
+          .keyConditionExpression(s"$ATTR_PRAMEN_TABLE_NAME = :table_name AND begins_with($ATTR_COMPOSITE_KEY, :prefix)")
+          .expressionAttributeValues(Map(
+            ":table_name" -> AttributeValue.builder().s(table).build(),
+            ":prefix" -> AttributeValue.builder().s(datePrefix).build()
+          ).asJava)
+
+        if (pageKey != null) {
+          builder.exclusiveStartKey(pageKey)
+        }
+
+        val page = dynamoDbClient.query(builder.build())
+        collected ++= page.items().asScala
+        pageKey = page.lastEvaluatedKey()
+      } while (pageKey != null && !pageKey.isEmpty)
+
+      collected.map(itemToOffsetRecord).toArray
+    } catch {
+      case NonFatal(ex) =>
+        throw new RuntimeException(s"Unable to read offset records from the offset table '$offsetTableFullName'.", ex)
+    }
+  }
+
+  /**
+   * Gets the maximum information date recorded for a table, if any.
+   *
+   * Only the info date attribute is projected since that is all that is needed.
+   * NOTE(review): this reads every record in the partition; a descending query
+   * with limit(1) on the composite sort key might avoid the full read — verify
+   * the lexicographic ordering of the key before optimizing.
+   */
+  private[core] def getMaximumInfoDate(table: String): Option[LocalDate] = {
+    try {
+      val dates = scala.collection.mutable.ArrayBuffer.empty[LocalDate]
+      var pageKey: java.util.Map[String, AttributeValue] = null
+
+      do {
+        val builder = QueryRequest.builder()
+          .tableName(offsetTableFullName)
+          .keyConditionExpression(s"${ATTR_PRAMEN_TABLE_NAME} = :table_name")
+          .expressionAttributeValues(Map(
+            ":table_name" -> AttributeValue.builder().s(table).build()
+          ).asJava)
+          .projectionExpression(ATTR_INFO_DATE)
+
+        if (pageKey != null) {
+          builder.exclusiveStartKey(pageKey)
+        }
+
+        val page = dynamoDbClient.query(builder.build())
+        dates ++= page.items().asScala.map(item => LocalDate.parse(item.get(ATTR_INFO_DATE).s()))
+        pageKey = page.lastEvaluatedKey()
+      } while (pageKey != null && !pageKey.isEmpty)
+
+      if (dates.isEmpty) None else Some(dates.maxBy(_.toEpochDay))
+    } catch {
+      case NonFatal(ex) =>
+        throw new RuntimeException(s"Unable to read maximum info date from the offset table '$offsetTableFullName'.", ex)
+    }
+  }
+
+ /**
+ * Gets min/max offsets for a table and info date, with all offset records for that day.
+ */
+ private[core] def getMinMaxOffsets(table: String, infoDate: LocalDate): Option[DataOffsetAggregated] = {
+ val offsets = getOffsetRecords(table, infoDate).filter(_.committedAtMilli.nonEmpty)
+
+ if (offsets.isEmpty) {
+ return None
+ }
+
+ validateOffsets(table, infoDate, offsets)
+
+ val (minOffset, maxOffset) = getMinMaxOffsets(offsets)
+
+ Some(DataOffsetAggregated(table, infoDate, minOffset, maxOffset, offsets.map(OffsetRecordConverter.toDataOffset)))
+ }
+
+ /**
+ * Gets min/max offsets from an array of offset records.
+ */
+ private[core] def getMinMaxOffsets(offsets: Array[OffsetRecord]): (OffsetValue, OffsetValue) = {
+ val offsetDataType = offsets.head.dataType
+ val minOffset = offsets.flatMap(or => OffsetValue.fromString(offsetDataType, or.minOffset)).min
+ val maxOffset = offsets.flatMap(or => OffsetValue.fromString(offsetDataType, or.maxOffset)).max
+
+ (minOffset, maxOffset)
+ }
+
+ /**
+ * Validates offsets for inconsistencies (e.g., inconsistent offset value types).
+ */
+ private[core] def validateOffsets(table: String, infoDate: LocalDate, offsets: Array[OffsetRecord]): Unit = {
+ val inconsistentOffsets = offsets.groupBy(_.dataType).keys.toArray.sorted
+ if (inconsistentOffsets.length > 1) {
+ throw new RuntimeException(s"Inconsistent offset value types found for $table at $infoDate: ${inconsistentOffsets.mkString(", ")}")
+ }
+ }
+
+ /**
+ * Converts a DynamoDB item to an OffsetRecord.
+ */
+ private def itemToOffsetRecord(item: java.util.Map[String, AttributeValue]): OffsetRecord = {
+ val pramenTableName = item.get(ATTR_PRAMEN_TABLE_NAME).s()
+ val infoDate = item.get(ATTR_INFO_DATE).s()
+ val dataType = item.get(ATTR_DATA_TYPE).s()
+ val minOffset = item.get(ATTR_MIN_OFFSET).s()
+ val maxOffset = item.get(ATTR_MAX_OFFSET).s()
+ val batchId = item.get(ATTR_BATCH_ID).n().toLong
+ val createdAtMilli = item.get(ATTR_CREATED_AT).n().toLong
+ val committedAtMilli = Option(item.get(ATTR_COMMITTED_AT)).map(_.n().toLong)
+
+ OffsetRecord(pramenTableName, infoDate, dataType, minOffset, maxOffset, batchId, createdAtMilli, committedAtMilli)
+ }
+
+ /**
+ * Creates the offset table if it doesn't exist.
+ */
+ private def createOffsetTableIfNotExists(): Unit = {
+ try {
+ val describeRequest = DescribeTableRequest.builder()
+ .tableName(offsetTableFullName)
+ .build()
+
+ dynamoDbClient.describeTable(describeRequest)
+ log.info(s"Offset table '$offsetTableFullName' already exists")
+ } catch {
+ case _: ResourceNotFoundException =>
+ log.info(s"Creating offset table '$offsetTableFullName'")
+ createOffsetTable()
+ case NonFatal(ex) =>
+ log.error(s"Error checking if offset table exists", ex)
+ throw ex
+ }
+ }
+
+ /**
+ * Creates the offset table in DynamoDB.
+ */
+ private def createOffsetTable(): Unit = {
+ val createRequest = CreateTableRequest.builder()
+ .tableName(offsetTableFullName)
+ .attributeDefinitions(
+ AttributeDefinition.builder()
+ .attributeName(ATTR_PRAMEN_TABLE_NAME)
+ .attributeType(ScalarAttributeType.S)
+ .build(),
+ AttributeDefinition.builder()
+ .attributeName(ATTR_COMPOSITE_KEY)
+ .attributeType(ScalarAttributeType.S)
+ .build()
+ )
+ .keySchema(
+ KeySchemaElement.builder()
+ .attributeName(ATTR_PRAMEN_TABLE_NAME)
+ .keyType(KeyType.HASH)
+ .build(),
+ KeySchemaElement.builder()
+ .attributeName(ATTR_COMPOSITE_KEY)
+ .keyType(KeyType.RANGE)
+ .build()
+ )
+ .billingMode(BillingMode.PAY_PER_REQUEST)
+ .build()
+
+ dynamoDbClient.createTable(createRequest)
+ waitForTableActive(offsetTableFullName, dynamoDbClient)
+ log.info(s"Offset table '$offsetTableFullName' created successfully")
+ }
+
+ /**
+ * Closes the DynamoDB client.
+ */
+ override def close(): Unit = {
+ try {
+ if (closesClient) {
+ dynamoDbClient.close()
+ }
+ } catch {
+ case NonFatal(ex) =>
+ log.warn("Error closing DynamoDB client", ex)
+ }
+ }
+
+  /** Deletes all offsets for a given table. NOTE(review): the 'dynamoDbClient' parameter shadows the instance field of the same name — confirm passing a separate client here is intentional. */
+ private[core] def deleteAllOffsets(tableName: String, dynamoDbClient: DynamoDbClient): Int = {
+ val log = LoggerFactory.getLogger(this.getClass)
+ try {
+ var allItems = Seq.empty[java.util.Map[String, AttributeValue]]
+ var lastEvaluatedKey: java.util.Map[String, AttributeValue] = null
+
+ // Query all offsets for the table with pagination
+ do {
+ val queryRequestBuilder = QueryRequest.builder()
+ .tableName(offsetTableFullName)
+ .keyConditionExpression(s"$ATTR_PRAMEN_TABLE_NAME = :table_name")
+ .expressionAttributeValues(Map(
+ ":table_name" -> AttributeValue.builder().s(tableName).build()
+ ).asJava)
+
+ if (lastEvaluatedKey != null) {
+ queryRequestBuilder.exclusiveStartKey(lastEvaluatedKey)
+ }
+
+ val result = dynamoDbClient.query(queryRequestBuilder.build())
+ allItems = allItems ++ result.items().asScala
+ lastEvaluatedKey = result.lastEvaluatedKey()
+ } while (lastEvaluatedKey != null && !lastEvaluatedKey.isEmpty)
+
+ // Delete each item
+ allItems.foreach { item =>
+ val deleteRequest = DeleteItemRequest.builder()
+ .tableName(offsetTableFullName)
+ .key(Map(
+ ATTR_PRAMEN_TABLE_NAME -> item.get(ATTR_PRAMEN_TABLE_NAME),
+ ATTR_COMPOSITE_KEY -> item.get(ATTR_COMPOSITE_KEY)
+ ).asJava)
+ .build()
+
+ dynamoDbClient.deleteItem(deleteRequest)
+ }
+
+ val deletedCount = allItems.size
+ log.info(s"Deleted $deletedCount offset records for table '$tableName'")
+ deletedCount
+ } catch {
+ case NonFatal(ex) =>
+ log.error(s"Error deleting offsets for table '$tableName' from '$offsetTableFullName'", ex)
+ throw new RuntimeException(s"Unable to delete offsets for table '$tableName' from '$offsetTableFullName'", ex)
+ }
+ }
+}
+
+object OffsetManagerDynamoDb {
+ val DEFAULT_OFFSET_TABLE = "offsets"
+ val DEFAULT_TABLE_PREFIX = "pramen"
+
+ // Attribute names for offset table
+ val ATTR_PRAMEN_TABLE_NAME = "pramenTableName"
+ val ATTR_COMPOSITE_KEY = "compositeKey" // Format: "infoDate#createdAtMilli"
+ val ATTR_INFO_DATE = "infoDate"
+ val ATTR_DATA_TYPE = "dataType"
+ val ATTR_MIN_OFFSET = "minOffset"
+ val ATTR_MAX_OFFSET = "maxOffset"
+ val ATTR_BATCH_ID = "batchId"
+ val ATTR_CREATED_AT = "createdAt"
+ val ATTR_COMMITTED_AT = "committedAt"
+
+ /**
+ * Builder for creating OffsetManagerDynamoDb instances.
+ */
+ class OffsetManagerDynamoDbBuilder {
+ private var region: Option[String] = None
+ private var tableArn: Option[String] = None
+ private var tablePrefix: String = DEFAULT_TABLE_PREFIX
+ private var credentialsProvider: Option[AwsCredentialsProvider] = None
+ private var endpoint: Option[String] = None
+ private var batchId: Long = System.currentTimeMillis()
+
+ def withRegion(region: String): OffsetManagerDynamoDbBuilder = {
+ this.region = Some(region)
+ this
+ }
+
+ def withTableArn(arn: String): OffsetManagerDynamoDbBuilder = {
+ this.tableArn = Some(arn)
+ this
+ }
+
+ def withTablePrefix(prefix: String): OffsetManagerDynamoDbBuilder = {
+ this.tablePrefix = prefix
+ this
+ }
+
+ def withCredentialsProvider(provider: AwsCredentialsProvider): OffsetManagerDynamoDbBuilder = {
+ this.credentialsProvider = Some(provider)
+ this
+ }
+
+ def withEndpoint(endpoint: String): OffsetManagerDynamoDbBuilder = {
+ this.endpoint = Some(endpoint)
+ this
+ }
+
+ def withBatchId(batchId: Long): OffsetManagerDynamoDbBuilder = {
+ this.batchId = batchId
+ this
+ }
+
+ def build(): OffsetManagerDynamoDb = {
+ if (region.isEmpty) {
+ throw new IllegalArgumentException("Region must be provided")
+ }
+
+ val clientBuilder = DynamoDbClient.builder()
+ .region(Region.of(region.get))
+
+ credentialsProvider.foreach(clientBuilder.credentialsProvider)
+ endpoint.foreach { ep =>
+ clientBuilder.endpointOverride(URI.create(ep))
+ }
+
+ val client = clientBuilder.build()
+
+ try {
+ new OffsetManagerDynamoDb(
+ dynamoDbClient = client,
+ batchId = batchId,
+ tableArn = tableArn,
+ tablePrefix = tablePrefix,
+ closesClient = true
+ )
+ } catch {
+ case NonFatal(ex) =>
+ client.close()
+ throw ex
+ }
+ }
+ }
+
+ def builder: OffsetManagerDynamoDbBuilder = new OffsetManagerDynamoDbBuilder
+
+}
diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerJdbc.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerJdbc.scala
index 368ab026f..b1e164860 100644
--- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerJdbc.scala
+++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerJdbc.scala
@@ -150,6 +150,9 @@ class OffsetManagerJdbc(db: Database, slickProfile: JdbcProfile, offsetTable: Of
).execute()
}
+  /** This class does not own the database connection. It is the responsibility of the DB connection owner to close it. */
+ override def close(): Unit = {}
+
private[core] def getMaximumInfoDate(table: String): Option[LocalDate] = {
val query = offsetTable.records
.filter(r => r.pramenTableName === table)
diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/Journal.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/Journal.scala
index 0a03c215d..b7d70ff81 100644
--- a/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/Journal.scala
+++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/Journal.scala
@@ -23,10 +23,11 @@ import java.time.Instant
/**
* A journal is responsible of keeping track of all completed tasks.
*/
-trait Journal {
+trait Journal extends AutoCloseable {
def addEntry(entry: TaskCompleted): Unit
def getEntries(from: Instant, to: Instant): Seq[TaskCompleted]
+ override def close(): Unit = {}
}
diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/JournalDynamoDB.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/JournalDynamoDB.scala
new file mode 100644
index 000000000..a9453d3ed
--- /dev/null
+++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/JournalDynamoDB.scala
@@ -0,0 +1,349 @@
+/*
+ * Copyright 2022 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.pramen.core.journal
+
+import org.slf4j.LoggerFactory
+import software.amazon.awssdk.auth.credentials.AwsCredentialsProvider
+import software.amazon.awssdk.regions.Region
+import software.amazon.awssdk.services.dynamodb.DynamoDbClient
+import software.amazon.awssdk.services.dynamodb.model._
+import za.co.absa.pramen.core.app.config.InfoDateConfig
+import za.co.absa.pramen.core.bookkeeper.BookkeeperDynamoDb
+import za.co.absa.pramen.core.bookkeeper.BookkeeperDynamoDb.waitForTableActive
+import za.co.absa.pramen.core.journal.model.TaskCompleted
+
+import java.net.URI
+import java.time.{Instant, LocalDate}
+import scala.collection.JavaConverters._
+import scala.util.control.NonFatal
+
+/**
+ * DynamoDB-based journal for tracking completed tasks.
+ *
+ * This journal stores task completion records in a DynamoDB table with automatic table creation.
+ *
+ * @param dynamoDbClient The DynamoDB client to use
+ * @param tableArn Optional ARN prefix for the journal table
+ * @param tablePrefix Prefix for the journal table name (default: "pramen")
+ */
+class JournalDynamoDB private (
+ dynamoDbClient: DynamoDbClient,
+ tableArn: Option[String] = None,
+ tablePrefix: String = JournalDynamoDB.DEFAULT_TABLE_PREFIX
+) extends Journal {
+
+ private val log = LoggerFactory.getLogger(this.getClass)
+ private val dateFormatter = InfoDateConfig.defaultDateFormatter
+
+ private val journalTableBaseName = s"${tablePrefix}_${JournalDynamoDB.DEFAULT_JOURNAL_TABLE}"
+ private val journalTableFullName = BookkeeperDynamoDb.getFullTableName(tableArn, journalTableBaseName)
+
+ // Initialize table on creation
+ createJournalTableIfNotExists()
+
+ /**
+ * Add a task completion entry to the journal.
+ * Failure reason is truncated to 4KB to fit DynamoDB item size limits.
+ */
+ override def addEntry(entry: TaskCompleted): Unit = {
+ val periodBegin = entry.periodBegin.format(dateFormatter)
+ val periodEnd = entry.periodEnd.format(dateFormatter)
+ val infoDate = entry.informationDate.format(dateFormatter)
+
+ // Truncate failure reason to 4KB maximum
+ val truncatedFailureReason = entry.failureReason.map { reason =>
+ if (reason.length > JournalDynamoDB.MAX_FAILURE_REASON_LENGTH) {
+ val truncated = reason.substring(0, JournalDynamoDB.MAX_FAILURE_REASON_LENGTH - 20)
+ truncated + "\n[... truncated ...]"
+ } else {
+ reason
+ }
+ }
+
+ val itemBuilder = Map.newBuilder[String, AttributeValue]
+
+ // Primary key: composite of jobName and finishedAt (for sorting by time)
+ itemBuilder += (JournalDynamoDB.ATTR_JOB_NAME -> AttributeValue.builder().s(entry.jobName).build())
+ itemBuilder += (JournalDynamoDB.ATTR_FINISHED_AT -> AttributeValue.builder().n(entry.finishedAt.toString).build())
+
+ // Attributes
+ itemBuilder += (JournalDynamoDB.ATTR_TABLE_NAME -> AttributeValue.builder().s(entry.tableName).build())
+ itemBuilder += (JournalDynamoDB.ATTR_PERIOD_BEGIN -> AttributeValue.builder().s(periodBegin).build())
+ itemBuilder += (JournalDynamoDB.ATTR_PERIOD_END -> AttributeValue.builder().s(periodEnd).build())
+ itemBuilder += (JournalDynamoDB.ATTR_INFO_DATE -> AttributeValue.builder().s(infoDate).build())
+ itemBuilder += (JournalDynamoDB.ATTR_INPUT_RECORD_COUNT -> AttributeValue.builder().n(entry.inputRecordCount.getOrElse(-1L).toString).build())
+ itemBuilder += (JournalDynamoDB.ATTR_INPUT_RECORD_COUNT_OLD -> AttributeValue.builder().n(entry.inputRecordCountOld.getOrElse(-1L).toString).build())
+
+ entry.outputRecordCount.foreach(v => itemBuilder += (JournalDynamoDB.ATTR_OUTPUT_RECORD_COUNT -> AttributeValue.builder().n(v.toString).build()))
+ entry.outputRecordCountOld.foreach(v => itemBuilder += (JournalDynamoDB.ATTR_OUTPUT_RECORD_COUNT_OLD -> AttributeValue.builder().n(v.toString).build()))
+ entry.appendedRecordCount.foreach(v => itemBuilder += (JournalDynamoDB.ATTR_APPENDED_RECORD_COUNT -> AttributeValue.builder().n(v.toString).build()))
+ entry.outputSize.foreach(v => itemBuilder += (JournalDynamoDB.ATTR_OUTPUT_SIZE -> AttributeValue.builder().n(v.toString).build()))
+
+ itemBuilder += (JournalDynamoDB.ATTR_STARTED_AT -> AttributeValue.builder().n(entry.startedAt.toString).build())
+ itemBuilder += (JournalDynamoDB.ATTR_STATUS -> AttributeValue.builder().s(entry.status).build())
+
+ truncatedFailureReason.foreach(v => itemBuilder += (JournalDynamoDB.ATTR_FAILURE_REASON -> AttributeValue.builder().s(v).build()))
+ entry.sparkApplicationId.foreach(v => itemBuilder += (JournalDynamoDB.ATTR_SPARK_APP_ID -> AttributeValue.builder().s(v).build()))
+ entry.pipelineId.foreach(v => itemBuilder += (JournalDynamoDB.ATTR_PIPELINE_ID -> AttributeValue.builder().s(v).build()))
+ entry.pipelineName.foreach(v => itemBuilder += (JournalDynamoDB.ATTR_PIPELINE_NAME -> AttributeValue.builder().s(v).build()))
+ entry.environmentName.foreach(v => itemBuilder += (JournalDynamoDB.ATTR_ENVIRONMENT_NAME -> AttributeValue.builder().s(v).build()))
+ entry.tenant.foreach(v => itemBuilder += (JournalDynamoDB.ATTR_TENANT -> AttributeValue.builder().s(v).build()))
+ entry.country.foreach(v => itemBuilder += (JournalDynamoDB.ATTR_COUNTRY -> AttributeValue.builder().s(v).build()))
+
+ itemBuilder += (JournalDynamoDB.ATTR_BATCH_ID -> AttributeValue.builder().n(entry.batchId.toString).build())
+
+ try {
+ val putRequest = PutItemRequest.builder()
+ .tableName(journalTableFullName)
+ .item(itemBuilder.result().asJava)
+ .build()
+
+ dynamoDbClient.putItem(putRequest)
+ } catch {
+ case NonFatal(ex) =>
+ log.error(s"Unable to write to the journal table '$journalTableFullName'.", ex)
+ }
+ }
+
+ /**
+   * Get journal entries within a time range. NOTE(review): uses a single unpaginated Scan — items beyond DynamoDB's 1 MB page limit are silently dropped; confirm LastEvaluatedKey pagination is added like the query loops elsewhere.
+ */
+ override def getEntries(from: Instant, to: Instant): Seq[TaskCompleted] = {
+ val fromSec = from.getEpochSecond
+ val toSec = to.getEpochSecond
+
+ try {
+ val scanRequest = ScanRequest.builder()
+ .tableName(journalTableFullName)
+ .filterExpression(s"${JournalDynamoDB.ATTR_FINISHED_AT} >= :from_time AND ${JournalDynamoDB.ATTR_FINISHED_AT} <= :to_time")
+ .expressionAttributeValues(Map(
+ ":from_time" -> AttributeValue.builder().n(fromSec.toString).build(),
+ ":to_time" -> AttributeValue.builder().n(toSec.toString).build()
+ ).asJava)
+ .build()
+
+ val result = dynamoDbClient.scan(scanRequest)
+
+ result.items().asScala.map { item =>
+ val getS = (attr: String) => Option(item.get(attr)).map(_.s())
+ val getN = (attr: String) => Option(item.get(attr)).map(_.n().toLong)
+
+ val inputRecordCount = getN(JournalDynamoDB.ATTR_INPUT_RECORD_COUNT).flatMap(v => if (v < 0) None else Some(v))
+ val inputRecordCountOld = getN(JournalDynamoDB.ATTR_INPUT_RECORD_COUNT_OLD).flatMap(v => if (v < 0) None else Some(v))
+
+ TaskCompleted(
+ jobName = item.get(JournalDynamoDB.ATTR_JOB_NAME).s(),
+ tableName = item.get(JournalDynamoDB.ATTR_TABLE_NAME).s(),
+ periodBegin = LocalDate.parse(item.get(JournalDynamoDB.ATTR_PERIOD_BEGIN).s(), dateFormatter),
+ periodEnd = LocalDate.parse(item.get(JournalDynamoDB.ATTR_PERIOD_END).s(), dateFormatter),
+ informationDate = LocalDate.parse(item.get(JournalDynamoDB.ATTR_INFO_DATE).s(), dateFormatter),
+ inputRecordCount = inputRecordCount,
+ inputRecordCountOld = inputRecordCountOld,
+ outputRecordCount = getN(JournalDynamoDB.ATTR_OUTPUT_RECORD_COUNT),
+ outputRecordCountOld = getN(JournalDynamoDB.ATTR_OUTPUT_RECORD_COUNT_OLD),
+ appendedRecordCount = getN(JournalDynamoDB.ATTR_APPENDED_RECORD_COUNT),
+ outputSize = getN(JournalDynamoDB.ATTR_OUTPUT_SIZE),
+ startedAt = item.get(JournalDynamoDB.ATTR_STARTED_AT).n().toLong,
+ finishedAt = item.get(JournalDynamoDB.ATTR_FINISHED_AT).n().toLong,
+ status = item.get(JournalDynamoDB.ATTR_STATUS).s(),
+ failureReason = getS(JournalDynamoDB.ATTR_FAILURE_REASON),
+ sparkApplicationId = getS(JournalDynamoDB.ATTR_SPARK_APP_ID),
+ pipelineId = getS(JournalDynamoDB.ATTR_PIPELINE_ID),
+ pipelineName = getS(JournalDynamoDB.ATTR_PIPELINE_NAME),
+ environmentName = getS(JournalDynamoDB.ATTR_ENVIRONMENT_NAME),
+ tenant = getS(JournalDynamoDB.ATTR_TENANT),
+ country = getS(JournalDynamoDB.ATTR_COUNTRY),
+ batchId = getN(JournalDynamoDB.ATTR_BATCH_ID).getOrElse(0L)
+ )
+ }.toSeq
+ } catch {
+ case NonFatal(ex) =>
+ log.error(s"Unable to read from the journal table '$journalTableFullName'.", ex)
+ Seq.empty
+ }
+ }
+
+ /**
+ * Creates the journal table if it doesn't exist.
+ */
+ private def createJournalTableIfNotExists(): Unit = {
+ try {
+ val describeRequest = DescribeTableRequest.builder()
+ .tableName(journalTableFullName)
+ .build()
+
+ dynamoDbClient.describeTable(describeRequest)
+ log.info(s"Journal table '$journalTableFullName' already exists")
+ } catch {
+ case _: ResourceNotFoundException =>
+ log.info(s"Creating journal table '$journalTableFullName'")
+ createJournalTable()
+ case NonFatal(ex) =>
+ log.error(s"Error checking if journal table exists", ex)
+ throw ex
+ }
+ }
+
+ /**
+ * Creates the journal table in DynamoDB.
+ */
+ private def createJournalTable(): Unit = {
+ val createRequest = CreateTableRequest.builder()
+ .tableName(journalTableFullName)
+ .attributeDefinitions(
+ AttributeDefinition.builder()
+ .attributeName(JournalDynamoDB.ATTR_JOB_NAME)
+ .attributeType(ScalarAttributeType.S)
+ .build(),
+ AttributeDefinition.builder()
+ .attributeName(JournalDynamoDB.ATTR_FINISHED_AT)
+ .attributeType(ScalarAttributeType.N)
+ .build()
+ )
+ .keySchema(
+ KeySchemaElement.builder()
+ .attributeName(JournalDynamoDB.ATTR_JOB_NAME)
+ .keyType(KeyType.HASH)
+ .build(),
+ KeySchemaElement.builder()
+ .attributeName(JournalDynamoDB.ATTR_FINISHED_AT)
+ .keyType(KeyType.RANGE)
+ .build()
+ )
+ .billingMode(BillingMode.PAY_PER_REQUEST)
+ .build()
+
+ dynamoDbClient.createTable(createRequest)
+ waitForTableActive(journalTableFullName, dynamoDbClient)
+ log.info(s"Journal table '$journalTableFullName' created successfully")
+ }
+
+ /**
+   * Closes the DynamoDB client. NOTE(review): closes unconditionally, unlike OffsetManagerDynamoDb which honors a 'closesClient' ownership flag — confirm this journal always owns its client.
+ */
+ override def close(): Unit = {
+ try {
+ dynamoDbClient.close()
+ } catch {
+ case NonFatal(ex) =>
+ log.warn("Error closing DynamoDB client", ex)
+ }
+ }
+}
+
+object JournalDynamoDB {
+ val DEFAULT_JOURNAL_TABLE = "journal"
+ val DEFAULT_TABLE_PREFIX = "pramen"
+
+ // Maximum length for failure reason (4KB minus some overhead)
+ val MAX_FAILURE_REASON_LENGTH = 4000
+
+ // Attribute names for journal table
+ val ATTR_JOB_NAME = "jobName"
+ val ATTR_TABLE_NAME = "tableName"
+ val ATTR_PERIOD_BEGIN = "periodBegin"
+ val ATTR_PERIOD_END = "periodEnd"
+ val ATTR_INFO_DATE = "infoDate"
+ val ATTR_INPUT_RECORD_COUNT = "inputRecordCount"
+ val ATTR_INPUT_RECORD_COUNT_OLD = "inputRecordCountOld"
+ val ATTR_OUTPUT_RECORD_COUNT = "outputRecordCount"
+ val ATTR_OUTPUT_RECORD_COUNT_OLD = "outputRecordCountOld"
+ val ATTR_APPENDED_RECORD_COUNT = "appendedRecordCount"
+ val ATTR_OUTPUT_SIZE = "outputSize"
+ val ATTR_STARTED_AT = "startedAt"
+ val ATTR_FINISHED_AT = "finishedAt"
+ val ATTR_STATUS = "status"
+ val ATTR_FAILURE_REASON = "failureReason"
+ val ATTR_SPARK_APP_ID = "sparkApplicationId"
+ val ATTR_PIPELINE_ID = "pipelineId"
+ val ATTR_PIPELINE_NAME = "pipelineName"
+ val ATTR_ENVIRONMENT_NAME = "environmentName"
+ val ATTR_TENANT = "tenant"
+ val ATTR_COUNTRY = "country"
+ val ATTR_BATCH_ID = "batchId"
+
+ /**
+ * Builder for creating JournalDynamoDB instances.
+ */
+ class JournalDynamoDBBuilder {
+ private var region: Option[String] = None
+ private var tableArn: Option[String] = None
+ private var tablePrefix: String = DEFAULT_TABLE_PREFIX
+ private var credentialsProvider: Option[AwsCredentialsProvider] = None
+ private var endpoint: Option[String] = None
+
+ def withRegion(region: String): JournalDynamoDBBuilder = {
+ this.region = Some(region)
+ this
+ }
+
+ def withTableArn(arn: String): JournalDynamoDBBuilder = {
+ this.tableArn = Some(arn)
+ this
+ }
+
+ def withTableArn(arnOpt: Option[String]): JournalDynamoDBBuilder = {
+ this.tableArn = arnOpt
+ this
+ }
+
+ def withTablePrefix(prefix: String): JournalDynamoDBBuilder = {
+ this.tablePrefix = prefix
+ this
+ }
+
+ def withCredentialsProvider(provider: AwsCredentialsProvider): JournalDynamoDBBuilder = {
+ this.credentialsProvider = Some(provider)
+ this
+ }
+
+ def withEndpoint(endpoint: String): JournalDynamoDBBuilder = {
+ this.endpoint = Some(endpoint)
+ this
+ }
+
+ def build(): JournalDynamoDB = {
+ if (region.isEmpty) {
+ throw new IllegalArgumentException("Region must be provided")
+ }
+
+ val clientBuilder = DynamoDbClient.builder()
+ .region(Region.of(region.get))
+
+ credentialsProvider.foreach(clientBuilder.credentialsProvider)
+ endpoint.foreach { ep =>
+ clientBuilder.endpointOverride(URI.create(ep))
+ }
+
+ val client = clientBuilder.build()
+
+ try {
+ new JournalDynamoDB(
+ dynamoDbClient = client,
+ tableArn = tableArn,
+ tablePrefix = tablePrefix
+ )
+ } catch {
+ case NonFatal(ex) =>
+ client.close()
+ throw ex
+ }
+ }
+ }
+
+ def builder: JournalDynamoDBBuilder = new JournalDynamoDBBuilder
+}
diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/lock/TokenLockDynamoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/lock/TokenLockDynamoDb.scala
new file mode 100644
index 000000000..454e90900
--- /dev/null
+++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/lock/TokenLockDynamoDb.scala
@@ -0,0 +1,214 @@
+/*
+ * Copyright 2022 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.pramen.core.lock
+
+import org.slf4j.LoggerFactory
+import software.amazon.awssdk.services.dynamodb.DynamoDbClient
+import software.amazon.awssdk.services.dynamodb.model._
+import za.co.absa.pramen.core.lock.model.LockTicket
+
+import java.time.Instant
+import java.time.temporal.ChronoUnit
+import scala.collection.JavaConverters._
+import scala.util.control.NonFatal
+import scala.util.{Failure, Success, Try}
+
+object TokenLockDynamoDb {
+ val DEFAULT_TABLE_NAME = "pramen_locks"
+
+ // Attribute names
+ val ATTR_TOKEN = "job_token" // 'token' is a reserved word in DynamoDb and can't be used as an attribute
+ val ATTR_OWNER = "job_owner" // 'owner' is a reserved word in DynamoDb and can't be used as an attribute
+ val ATTR_EXPIRES = "expiresAt"
+ val ATTR_CREATED_AT = "createdAt"
+
+ val TICKETS_HARD_EXPIRE_DAYS = 1
+}
+
+/**
+ * DynamoDB-based distributed lock implementation.
+ *
+ * This lock uses DynamoDB's conditional writes to implement distributed locking.
+ * The lock is maintained by periodic updates to the expiration time.
+ *
+ * Table schema:
+ * - Partition key: job_token (String)
+ * - Attributes: job_owner (String), expiresAt (Number), createdAt (Number)
+ *
+ * @param token The unique identifier for the lock
+ * @param dynamoDbClient The DynamoDB client to use
+ * @param tableName The name of the locks table
+ */
+class TokenLockDynamoDb(
+ token: String,
+ dynamoDbClient: DynamoDbClient,
+ tableName: String = TokenLockDynamoDb.DEFAULT_TABLE_NAME
+) extends TokenLockBase(token) {
+
+ import TokenLockDynamoDb._
+
+ private val log = LoggerFactory.getLogger(this.getClass)
+
+ /** Invoked from a synchronized block. */
+ override def tryAcquireGuardLock(retries: Int = 3, thisTry: Int = 0): Boolean = {
+ def tryAcquireExistingTicket(): Boolean = {
+ val ticketOpt = getTicket
+
+ if (ticketOpt.isEmpty) {
+ log.warn(s"No ticket for $escapedToken")
+ tryAcquireGuardLock(retries - 1, thisTry + 1)
+ } else {
+ val ticket = ticketOpt.get
+ val expires = ticket.expires
+ val now = Instant.now().getEpochSecond
+
+ if (expires < now) {
+ log.warn(s"Taking over expired ticket $escapedToken ($expires < $now)")
+ releaseGuardLock()
+ tryAcquireGuardLock(retries - 1, thisTry + 1)
+ } else {
+ false
+ }
+ }
+ }
+
+ if (retries <= 0) {
+ log.error(s"Cannot try acquire a lock after $thisTry retries.")
+ false
+ } else {
+ val ok = Try(acquireGuardLock())
+
+ ok match {
+ case Success(_) =>
+ true
+ case Failure(_: ConditionalCheckFailedException) =>
+ // Lock already exists
+ tryAcquireExistingTicket()
+ case Failure(ex) =>
+ throw new IllegalStateException(s"Unable to acquire a lock by querying DynamoDB", ex)
+ }
+ }
+ }
+
+ /** Invoked from a synchronized block. */
+ override def releaseGuardLock(): Unit = {
+ try {
+ val now = Instant.now()
+ val nowEpoch = now.getEpochSecond
+ val hardExpireTickets = now.minus(TICKETS_HARD_EXPIRE_DAYS, ChronoUnit.DAYS).getEpochSecond
+
+ // Delete this ticket or any expired tickets
+ val deleteRequest = DeleteItemRequest.builder()
+ .tableName(tableName)
+ .key(Map(
+ ATTR_TOKEN -> AttributeValue.builder().s(escapedToken).build()
+ ).asJava)
+ .conditionExpression(s"$ATTR_OWNER = :jobOwner OR ($ATTR_EXPIRES < :now AND $ATTR_CREATED_AT < :hardExpire)")
+ .expressionAttributeValues(Map(
+ ":jobOwner" -> AttributeValue.builder().s(owner).build(),
+ ":now" -> AttributeValue.builder().n(nowEpoch.toString).build(),
+ ":hardExpire" -> AttributeValue.builder().n(hardExpireTickets.toString).build()
+ ).asJava)
+ .build()
+
+ try {
+ dynamoDbClient.deleteItem(deleteRequest)
+ } catch {
+ case _: ConditionalCheckFailedException =>
+ // Item doesn't match condition, ignore
+ log.debug(s"Could not delete ticket $escapedToken - condition not met")
+ }
+ } catch {
+ case NonFatal(ex) =>
+ log.error(s"An error occurred when trying to release the lock: $escapedToken.", ex)
+ }
+ }
+
+ /** Invoked from a synchronized block. */
+ override def updateTicket(): Unit = {
+ val newTicket = getNewTicket
+
+ try {
+ log.debug(s"Update $escapedToken to $newTicket")
+
+ val updateRequest = UpdateItemRequest.builder()
+ .tableName(tableName)
+ .key(Map(
+ ATTR_TOKEN -> AttributeValue.builder().s(escapedToken).build()
+ ).asJava)
+ .updateExpression(s"SET $ATTR_EXPIRES = :expires")
+ .expressionAttributeValues(Map(
+ ":expires" -> AttributeValue.builder().n(newTicket.toString).build()
+ ).asJava)
+ .build()
+
+ dynamoDbClient.updateItem(updateRequest)
+ } catch {
+ case NonFatal(ex) =>
+ log.error(s"An error occurred when trying to update the lock: $escapedToken.", ex)
+ }
+ }
+
+ /** Invoked from a synchronized block. */
+ private def getTicket: Option[LockTicket] = {
+ try {
+ val getRequest = GetItemRequest.builder()
+ .tableName(tableName)
+ .key(Map(
+ ATTR_TOKEN -> AttributeValue.builder().s(escapedToken).build()
+ ).asJava)
+ .build()
+
+ val response = dynamoDbClient.getItem(getRequest)
+
+ if (response.hasItem && !response.item().isEmpty) {
+ val item = response.item()
+ Some(LockTicket(
+ token = item.get(ATTR_TOKEN).s(),
+ owner = item.get(ATTR_OWNER).s(),
+ expires = item.get(ATTR_EXPIRES).n().toLong,
+ createdAt = Option(item.get(ATTR_CREATED_AT)).map(_.n().toLong)
+ ))
+ } else {
+ None
+ }
+ } catch {
+ case NonFatal(ex) =>
+ log.error(s"Error getting ticket for $escapedToken", ex)
+ None
+ }
+ }
+
+ /** Invoked from a synchronized block. */
+ private def acquireGuardLock(): Unit = {
+ val now = Instant.now().getEpochSecond
+ val item = Map(
+ ATTR_TOKEN -> AttributeValue.builder().s(escapedToken).build(),
+ ATTR_OWNER -> AttributeValue.builder().s(owner).build(),
+ ATTR_EXPIRES -> AttributeValue.builder().n(getNewTicket.toString).build(),
+ ATTR_CREATED_AT -> AttributeValue.builder().n(now.toString).build()
+ ).asJava
+
+ val putRequest = PutItemRequest.builder()
+ .tableName(tableName)
+ .item(item)
+ .conditionExpression(s"attribute_not_exists($ATTR_TOKEN)")
+ .build()
+
+ dynamoDbClient.putItem(putRequest)
+ }
+}
diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/lock/TokenLockFactoryDynamoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/lock/TokenLockFactoryDynamoDb.scala
new file mode 100644
index 000000000..75a669c65
--- /dev/null
+++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/lock/TokenLockFactoryDynamoDb.scala
@@ -0,0 +1,280 @@
+/*
+ * Copyright 2022 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.pramen.core.lock
+
+import org.slf4j.LoggerFactory
+import software.amazon.awssdk.auth.credentials.AwsCredentialsProvider
+import software.amazon.awssdk.regions.Region
+import software.amazon.awssdk.services.dynamodb.DynamoDbClient
+import software.amazon.awssdk.services.dynamodb.model._
+import za.co.absa.pramen.api.lock.{TokenLock, TokenLockFactory}
+import za.co.absa.pramen.core.bookkeeper.BookkeeperDynamoDb
+import za.co.absa.pramen.core.bookkeeper.BookkeeperDynamoDb.waitForTableActive
+
+import java.net.URI
+import scala.util.control.NonFatal
+
+/**
+ * Factory for creating DynamoDB-based distributed locks.
+ *
+ * This factory creates and manages a DynamoDB table for storing lock tickets.
+ * The table is created automatically if it doesn't exist.
+ *
+ * @param dynamoDbClient The DynamoDB client to use
+ * @param tableArn Optional ARN prefix for the locks table
+ * @param tablePrefix Prefix for the locks table name (default: "pramen")
+ */
+class TokenLockFactoryDynamoDb private(
+  dynamoDbClient: DynamoDbClient,
+  tableArn: Option[String] = None,
+  tablePrefix: String = BookkeeperDynamoDb.DEFAULT_TABLE_PREFIX
+) extends TokenLockFactory {
+  import TokenLockFactoryDynamoDb._
+
+  import TokenLockDynamoDb._
+
+  private val log = LoggerFactory.getLogger(this.getClass)
+
+  // Construct table name with prefix
+  private val locksTableBaseName = s"${tablePrefix}_locks"
+  private val locksTableName = BookkeeperDynamoDb.getFullTableName(tableArn, locksTableBaseName)
+
+  // Initialize table on construction
+  init()
+
+  override def getLock(token: String): TokenLock = {
+    new TokenLockDynamoDb(token, dynamoDbClient, locksTableName)
+  }
+
+  /**
+    * Closes the DynamoDB client.
+    * Should be called when the lock factory is no longer needed.
+    */
+  override def close(): Unit = {
+    try {
+      dynamoDbClient.close()
+    } catch {
+      case NonFatal(ex) =>
+        log.warn("Error closing DynamoDB client", ex)
+    }
+  }
+
+  /**
+    * Initializes the DynamoDB locks table.
+    * Checks if the table exists and creates it if it doesn't.
+    */
+  private def init(): Unit = {
+    try {
+      log.info(s"Initializing DynamoDB lock factory with table: '$locksTableName'")
+
+      if (!tableExists(locksTableName)) {
+        log.info(s"Creating DynamoDB locks table: $locksTableName")
+        createLocksTable(locksTableName)
+        log.info(s"Successfully created locks table: $locksTableName")
+      } else {
+        log.info(s"DynamoDB locks table already exists: $locksTableName")
+      }
+
+      log.info("DynamoDB lock factory initialization complete")
+    } catch {
+      case NonFatal(ex) =>
+        log.error("Error initializing DynamoDB lock factory", ex)
+        throw new RuntimeException("Failed to initialize DynamoDB lock factory", ex)
+    }
+  }
+
+  /**
+    * Checks if a DynamoDB table exists.
+    *
+    * @param tableName The name of the table to check
+    * @return true if the table exists, false otherwise
+    */
+  private def tableExists(tableName: String): Boolean = {
+    try {
+      val describeRequest = DescribeTableRequest.builder()
+        .tableName(tableName)
+        .build()
+
+      dynamoDbClient.describeTable(describeRequest)
+      true
+    } catch {
+      case _: ResourceNotFoundException => false
+      case NonFatal(ex) =>
+        log.warn(s"Error checking if table exists: $tableName", ex)
+        throw ex
+    }
+  }
+
+  /**
+    * Creates the locks table with the appropriate schema.
+    *
+    * @param tableName The name of the table to create
+    */
+  private def createLocksTable(tableName: String): Unit = {
+    val createTableRequest = CreateTableRequest.builder()
+      .tableName(tableName)
+      .keySchema(
+        KeySchemaElement.builder()
+          .attributeName(ATTR_TOKEN)
+          .keyType(KeyType.HASH)
+          .build()
+      )
+      .attributeDefinitions(
+        AttributeDefinition.builder()
+          .attributeName(ATTR_TOKEN)
+          .attributeType(ScalarAttributeType.S)
+          .build()
+      )
+      .billingMode(BillingMode.PAY_PER_REQUEST) // On-demand billing
+      .build()
+
+    dynamoDbClient.createTable(createTableRequest)
+
+    // Wait for table to become active
+    waitForTableActive(tableName, dynamoDbClient)
+  }
+}
+
+object TokenLockFactoryDynamoDb {
+ /**
+ * Builder for creating TokenLockFactoryDynamoDb instances.
+ * Provides a fluent API for configuring DynamoDB lock factory.
+ *
+ * Example:
+ * {{{
+ * val lockFactory = TokenLockFactoryDynamoDb.builder
+ * .withRegion("us-east-1")
+ * .withTablePrefix("my_app")
+ * .build()
+ * }}}
+ */
+ class TokenLockFactoryDynamoDbBuilder {
+ private var region: Option[String] = None
+ private var tableArn: Option[String] = None
+ private var tablePrefix: String = BookkeeperDynamoDb.DEFAULT_TABLE_PREFIX
+ private var credentialsProvider: Option[AwsCredentialsProvider] = None
+ private var endpoint: Option[String] = None
+
+ /**
+ * Sets the AWS region for the DynamoDB client.
+ *
+ * @param region AWS region (e.g., "us-east-1", "eu-west-1")
+ * @return this builder
+ */
+ def withRegion(region: String): TokenLockFactoryDynamoDbBuilder = {
+ this.region = Some(region)
+ this
+ }
+
+ /**
+ * Sets the table ARN prefix for cross-account or cross-region access.
+ *
+ * @param arn ARN prefix (e.g., "arn:aws:dynamodb:us-east-1:123456789012:table/")
+ * @return this builder
+ */
+ def withTableArn(arn: String): TokenLockFactoryDynamoDbBuilder = {
+ this.tableArn = Some(arn)
+ this
+ }
+
+  /**
+    * Optionally sets the table ARN prefix for cross-account or cross-region access.
+    *
+    * @param arnOpt Optional ARN prefix; None leaves the ARN unset (plain table names are used)
+    * @return this builder
+    */
+ def withTableArn(arnOpt: Option[String]): TokenLockFactoryDynamoDbBuilder = {
+ this.tableArn = arnOpt
+ this
+ }
+
+ /**
+ * Sets the table name prefix to allow multiple lock tables in the same account.
+ *
+ * @param prefix Table name prefix (default: "pramen")
+ * @return this builder
+ */
+ def withTablePrefix(prefix: String): TokenLockFactoryDynamoDbBuilder = {
+ this.tablePrefix = prefix
+ this
+ }
+
+ /**
+ * Sets custom AWS credentials provider.
+ *
+ * @param provider AWS credentials provider
+ * @return this builder
+ */
+ def withCredentialsProvider(provider: AwsCredentialsProvider): TokenLockFactoryDynamoDbBuilder = {
+ this.credentialsProvider = Some(provider)
+ this
+ }
+
+ /**
+ * Sets a custom DynamoDB endpoint (useful for testing with LocalStack or DynamoDB Local).
+ *
+ * @param endpoint Endpoint URI (e.g., "http://localhost:8000")
+ * @return this builder
+ */
+ def withEndpoint(endpoint: String): TokenLockFactoryDynamoDbBuilder = {
+ this.endpoint = Some(endpoint)
+ this
+ }
+
+ /**
+ * Builds the TokenLockFactoryDynamoDb instance.
+ *
+ * @return Configured TokenLockFactoryDynamoDb instance
+ * @throws IllegalArgumentException if required parameters are missing
+ */
+ def build(): TokenLockFactoryDynamoDb = {
+ if (region.isEmpty) {
+ throw new IllegalArgumentException("Region must be provided")
+ }
+
+ val clientBuilder = DynamoDbClient.builder()
+ .region(Region.of(region.get))
+
+ credentialsProvider.foreach(clientBuilder.credentialsProvider)
+
+ endpoint.foreach { ep =>
+ clientBuilder.endpointOverride(URI.create(ep))
+ }
+
+ val client = clientBuilder.build()
+
+ try {
+ new TokenLockFactoryDynamoDb(
+ dynamoDbClient = client,
+ tableArn = tableArn,
+ tablePrefix = tablePrefix
+ )
+ } catch {
+ case NonFatal(ex) =>
+ client.close()
+ throw ex
+ }
+ }
+ }
+
+ /**
+ * Creates a new builder for TokenLockFactoryDynamoDb.
+ *
+ * @return A new builder instance
+ */
+ def builder: TokenLockFactoryDynamoDbBuilder = new TokenLockFactoryDynamoDbBuilder
+}
diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/metadata/MetadataManagerDynamoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/metadata/MetadataManagerDynamoDb.scala
new file mode 100644
index 000000000..31b8a8932
--- /dev/null
+++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/metadata/MetadataManagerDynamoDb.scala
@@ -0,0 +1,328 @@
+/*
+ * Copyright 2022 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.pramen.core.metadata
+
+import org.slf4j.LoggerFactory
+import software.amazon.awssdk.auth.credentials.AwsCredentialsProvider
+import software.amazon.awssdk.regions.Region
+import software.amazon.awssdk.services.dynamodb.DynamoDbClient
+import software.amazon.awssdk.services.dynamodb.model._
+import za.co.absa.pramen.api.MetadataValue
+import za.co.absa.pramen.core.bookkeeper.BookkeeperDynamoDb
+import za.co.absa.pramen.core.bookkeeper.BookkeeperDynamoDb.waitForTableActive
+
+import java.net.URI
+import java.time.{Instant, LocalDate}
+import scala.collection.JavaConverters._
+import scala.util.control.NonFatal
+
+/**
+ * DynamoDB-based metadata manager for storing custom metadata.
+ *
+ * This manager stores metadata key-value pairs in a DynamoDB table with automatic table creation.
+ *
+ * @param dynamoDbClient The DynamoDB client to use
+ * @param tableArn Optional ARN prefix for the metadata table
+ * @param tablePrefix Prefix for the metadata table name (default: "pramen")
+ */
+class MetadataManagerDynamoDb private (
+  dynamoDbClient: DynamoDbClient,
+  tableArn: Option[String] = None,
+  tablePrefix: String = MetadataManagerDynamoDb.DEFAULT_TABLE_PREFIX
+) extends MetadataManagerBase(true) {
+
+  private val log = LoggerFactory.getLogger(this.getClass)
+
+  private val metadataTableBaseName = s"${tablePrefix}_${MetadataManagerDynamoDb.DEFAULT_METADATA_TABLE}"
+  private val metadataTableFullName = BookkeeperDynamoDb.getFullTableName(tableArn, metadataTableBaseName)
+
+  // Initialize table on creation
+  createMetadataTableIfNotExists()
+
+  override def getMetadataFromStorage(tableName: String, infoDate: LocalDate, key: String): Option[MetadataValue] = {
+    try {
+      val compositeKey = s"$tableName#$infoDate"
+
+      val getRequest = GetItemRequest.builder()
+        .tableName(metadataTableFullName)
+        .key(Map(
+          MetadataManagerDynamoDb.ATTR_COMPOSITE_KEY -> AttributeValue.builder().s(compositeKey).build(),
+          MetadataManagerDynamoDb.ATTR_METADATA_KEY -> AttributeValue.builder().s(key).build()
+        ).asJava)
+        .build()
+
+      val result = dynamoDbClient.getItem(getRequest)
+
+      if (result.hasItem) {
+        val item = result.item()
+        val value = item.get(MetadataManagerDynamoDb.ATTR_METADATA_VALUE).s()
+        val lastUpdated = Instant.ofEpochSecond(item.get(MetadataManagerDynamoDb.ATTR_LAST_UPDATED).n().toLong)
+        Some(MetadataValue(value, lastUpdated))
+      } else {
+        None
+      }
+    } catch {
+      case NonFatal(ex) =>
+        throw new RuntimeException(s"Unable to read from the metadata table '$metadataTableFullName'.", ex)
+    }
+  }
+
+  override def getMetadataFromStorage(tableName: String, infoDate: LocalDate): Map[String, MetadataValue] = {
+    try {
+      val compositeKey = s"$tableName#$infoDate"
+
+      val queryRequest = QueryRequest.builder()
+        .tableName(metadataTableFullName)
+        .keyConditionExpression(s"${MetadataManagerDynamoDb.ATTR_COMPOSITE_KEY} = :composite_key")
+        .expressionAttributeValues(Map(
+          ":composite_key" -> AttributeValue.builder().s(compositeKey).build()
+        ).asJava)
+        .build()
+
+      // Use the paginator so results beyond the 1 MB query page limit are not silently dropped
+      val result = dynamoDbClient.queryPaginator(queryRequest)
+
+      result.items().asScala.map { item =>
+        val key = item.get(MetadataManagerDynamoDb.ATTR_METADATA_KEY).s()
+        val value = item.get(MetadataManagerDynamoDb.ATTR_METADATA_VALUE).s()
+        val lastUpdated = Instant.ofEpochSecond(item.get(MetadataManagerDynamoDb.ATTR_LAST_UPDATED).n().toLong)
+        key -> MetadataValue(value, lastUpdated)
+      }.toMap
+    } catch {
+      case NonFatal(ex) =>
+        throw new RuntimeException(s"Unable to read from the metadata table '$metadataTableFullName'.", ex)
+    }
+  }
+
+  override def setMetadataToStorage(tableName: String, infoDate: LocalDate, key: String, metadata: MetadataValue): Unit = {
+    try {
+      val compositeKey = s"$tableName#$infoDate"
+
+      val putRequest = PutItemRequest.builder()
+        .tableName(metadataTableFullName)
+        .item(Map(
+          MetadataManagerDynamoDb.ATTR_COMPOSITE_KEY -> AttributeValue.builder().s(compositeKey).build(),
+          MetadataManagerDynamoDb.ATTR_METADATA_KEY -> AttributeValue.builder().s(key).build(),
+          MetadataManagerDynamoDb.ATTR_METADATA_VALUE -> AttributeValue.builder().s(metadata.value).build(),
+          MetadataManagerDynamoDb.ATTR_LAST_UPDATED -> AttributeValue.builder().n(metadata.lastUpdated.getEpochSecond.toString).build(),
+          MetadataManagerDynamoDb.ATTR_TABLE_NAME -> AttributeValue.builder().s(tableName).build(),
+          MetadataManagerDynamoDb.ATTR_INFO_DATE -> AttributeValue.builder().s(infoDate.toString).build()
+        ).asJava)
+        .build()
+
+      dynamoDbClient.putItem(putRequest)
+    } catch {
+      case NonFatal(ex) =>
+        throw new RuntimeException(s"Unable to write to the metadata table '$metadataTableFullName'.", ex)
+    }
+  }
+
+  override def deleteMetadataFromStorage(tableName: String, infoDate: LocalDate, key: String): Unit = {
+    try {
+      val compositeKey = s"$tableName#$infoDate"
+
+      val deleteRequest = DeleteItemRequest.builder()
+        .tableName(metadataTableFullName)
+        .key(Map(
+          MetadataManagerDynamoDb.ATTR_COMPOSITE_KEY -> AttributeValue.builder().s(compositeKey).build(),
+          MetadataManagerDynamoDb.ATTR_METADATA_KEY -> AttributeValue.builder().s(key).build()
+        ).asJava)
+        .build()
+
+      dynamoDbClient.deleteItem(deleteRequest)
+    } catch {
+      case NonFatal(ex) =>
+        throw new RuntimeException(s"Unable to delete from the metadata table '$metadataTableFullName'.", ex)
+    }
+  }
+
+  override def deleteMetadataFromStorage(tableName: String, infoDate: LocalDate): Unit = {
+    try {
+      val compositeKey = s"$tableName#$infoDate"
+
+      // First, query all items with this composite key (paginated past the 1 MB page limit)
+      val queryRequest = QueryRequest.builder()
+        .tableName(metadataTableFullName)
+        .keyConditionExpression(s"${MetadataManagerDynamoDb.ATTR_COMPOSITE_KEY} = :composite_key")
+        .expressionAttributeValues(Map(
+          ":composite_key" -> AttributeValue.builder().s(compositeKey).build()
+        ).asJava)
+        .build()
+
+      val result = dynamoDbClient.queryPaginator(queryRequest)
+
+      // Delete each item individually (one DeleteItem request per item)
+      result.items().asScala.foreach { item =>
+        val key = item.get(MetadataManagerDynamoDb.ATTR_METADATA_KEY).s()
+        deleteMetadataFromStorage(tableName, infoDate, key)
+      }
+    } catch {
+      case NonFatal(ex) =>
+        throw new RuntimeException(s"Unable to delete from the metadata table '$metadataTableFullName'.", ex)
+    }
+  }
+
+  /**
+    * Creates the metadata table if it doesn't exist.
+    */
+  private def createMetadataTableIfNotExists(): Unit = {
+    try {
+      val describeRequest = DescribeTableRequest.builder()
+        .tableName(metadataTableFullName)
+        .build()
+
+      dynamoDbClient.describeTable(describeRequest)
+      log.info(s"Metadata table '$metadataTableFullName' already exists")
+    } catch {
+      case _: ResourceNotFoundException =>
+        log.info(s"Creating metadata table '$metadataTableFullName'")
+        createMetadataTable()
+      case NonFatal(ex) =>
+        log.error(s"Error checking if metadata table exists", ex)
+        throw ex
+    }
+  }
+
+  /**
+    * Creates the metadata table in DynamoDB.
+    */
+  private def createMetadataTable(): Unit = {
+    val createRequest = CreateTableRequest.builder()
+      .tableName(metadataTableFullName)
+      .attributeDefinitions(
+        AttributeDefinition.builder()
+          .attributeName(MetadataManagerDynamoDb.ATTR_COMPOSITE_KEY)
+          .attributeType(ScalarAttributeType.S)
+          .build(),
+        AttributeDefinition.builder()
+          .attributeName(MetadataManagerDynamoDb.ATTR_METADATA_KEY)
+          .attributeType(ScalarAttributeType.S)
+          .build()
+      )
+      .keySchema(
+        KeySchemaElement.builder()
+          .attributeName(MetadataManagerDynamoDb.ATTR_COMPOSITE_KEY)
+          .keyType(KeyType.HASH)
+          .build(),
+        KeySchemaElement.builder()
+          .attributeName(MetadataManagerDynamoDb.ATTR_METADATA_KEY)
+          .keyType(KeyType.RANGE)
+          .build()
+      )
+      .billingMode(BillingMode.PAY_PER_REQUEST)
+      .build()
+
+    dynamoDbClient.createTable(createRequest)
+    waitForTableActive(metadataTableFullName, dynamoDbClient)
+    log.info(s"Metadata table '$metadataTableFullName' created successfully")
+  }
+
+  /**
+    * Closes the DynamoDB client.
+    */
+  override def close(): Unit = {
+    try {
+      dynamoDbClient.close()
+    } catch {
+      case NonFatal(ex) =>
+        log.warn("Error closing DynamoDB client", ex)
+    }
+  }
+}
+
+object MetadataManagerDynamoDb {
+ val DEFAULT_METADATA_TABLE = "metadata"
+ val DEFAULT_TABLE_PREFIX = "pramen"
+
+ // Attribute names for metadata table
+ val ATTR_COMPOSITE_KEY = "compositeKey" // tableName#infoDate
+ val ATTR_METADATA_KEY = "metadataKey"
+ val ATTR_METADATA_VALUE = "metadataValue"
+ val ATTR_LAST_UPDATED = "lastUpdated"
+ val ATTR_TABLE_NAME = "tableName" // For filtering/queries
+ val ATTR_INFO_DATE = "infoDate" // For filtering/queries
+
+ /**
+ * Builder for creating MetadataManagerDynamoDb instances.
+ */
+ class MetadataManagerDynamoDbBuilder {
+ private var region: Option[String] = None
+ private var tableArn: Option[String] = None
+ private var tablePrefix: String = DEFAULT_TABLE_PREFIX
+ private var credentialsProvider: Option[AwsCredentialsProvider] = None
+ private var endpoint: Option[String] = None
+
+ def withRegion(region: String): MetadataManagerDynamoDbBuilder = {
+ this.region = Some(region)
+ this
+ }
+
+ def withTableArn(arn: String): MetadataManagerDynamoDbBuilder = {
+ this.tableArn = Some(arn)
+ this
+ }
+
+ def withTableArn(arnOpt: Option[String]): MetadataManagerDynamoDbBuilder = {
+ this.tableArn = arnOpt
+ this
+ }
+
+ def withTablePrefix(prefix: String): MetadataManagerDynamoDbBuilder = {
+ this.tablePrefix = prefix
+ this
+ }
+
+ def withCredentialsProvider(provider: AwsCredentialsProvider): MetadataManagerDynamoDbBuilder = {
+ this.credentialsProvider = Some(provider)
+ this
+ }
+
+ def withEndpoint(endpoint: String): MetadataManagerDynamoDbBuilder = {
+ this.endpoint = Some(endpoint)
+ this
+ }
+
+ def build(): MetadataManagerDynamoDb = {
+ if (region.isEmpty) {
+ throw new IllegalArgumentException("Region must be provided")
+ }
+
+ val clientBuilder = DynamoDbClient.builder()
+ .region(Region.of(region.get))
+
+ credentialsProvider.foreach(clientBuilder.credentialsProvider)
+ endpoint.foreach { ep =>
+ clientBuilder.endpointOverride(URI.create(ep))
+ }
+
+ val client = clientBuilder.build()
+
+ try {
+ new MetadataManagerDynamoDb(
+ dynamoDbClient = client,
+ tableArn = tableArn,
+ tablePrefix = tablePrefix
+ )
+ } catch {
+ case NonFatal(ex) =>
+ client.close()
+ throw ex
+ }
+ }
+ }
+
+ def builder: MetadataManagerDynamoDbBuilder = new MetadataManagerDynamoDbBuilder
+}
diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/metadata/MetadataManagerJdbc.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/metadata/MetadataManagerJdbc.scala
index a630784ac..64c6263f6 100644
--- a/pramen/core/src/main/scala/za/co/absa/pramen/core/metadata/MetadataManagerJdbc.scala
+++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/metadata/MetadataManagerJdbc.scala
@@ -102,4 +102,7 @@ class MetadataManagerJdbc(db: Database, slickProfile: JdbcProfile) extends Metad
case NonFatal(ex) => throw new RuntimeException(s"Unable to delete from the metadata table.", ex)
}
}
+
+ /** The implementation does not own DB connections, so it is not responsible for closing them. */
+ override def close(): Unit = {}
}
diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/metadata/MetadataManagerNull.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/metadata/MetadataManagerNull.scala
index 3b91bd9e0..dda48941f 100644
--- a/pramen/core/src/main/scala/za/co/absa/pramen/core/metadata/MetadataManagerNull.scala
+++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/metadata/MetadataManagerNull.scala
@@ -42,4 +42,6 @@ class MetadataManagerNull(isPersistenceEnabled: Boolean) extends MetadataManager
def deleteMetadataFromStorage(tableName: String, infoDate: LocalDate): Unit = {
throw new UnsupportedOperationException(errorMessage)
}
+
+ override def close(): Unit = {}
}
diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/BookkeepingConfigFactory.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/BookkeepingConfigFactory.scala
index 6fc1f2623..17ca5bc9f 100644
--- a/pramen/core/src/test/scala/za/co/absa/pramen/core/BookkeepingConfigFactory.scala
+++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/BookkeepingConfigFactory.scala
@@ -28,7 +28,10 @@ object BookkeepingConfigFactory {
bookkeepingJdbcConfig: Option[JdbcConfig] = None,
deltaDatabase: Option[String] = None,
deltaTablePrefix: Option[String] = None,
- temporaryDirectory: Option[String] = None): BookkeeperConfig = {
+ temporaryDirectory: Option[String] = None,
+ dynamoDbRegion: Option[String] = None,
+ dynamoDbTableArn: Option[String] = None,
+ dynamoDbTablePrefix: Option[String] = None): BookkeeperConfig = {
BookkeeperConfig(
bookkeepingEnabled,
bookkeepingLocation,
@@ -38,7 +41,10 @@ object BookkeepingConfigFactory {
bookkeepingJdbcConfig,
deltaDatabase,
deltaTablePrefix,
- temporaryDirectory
+ temporaryDirectory,
+ dynamoDbRegion,
+ dynamoDbTableArn,
+ dynamoDbTablePrefix
)
}
diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/metadata/MetadataManagerSpy.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/metadata/MetadataManagerSpy.scala
index 997db6c6b..812ba06ca 100644
--- a/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/metadata/MetadataManagerSpy.scala
+++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/metadata/MetadataManagerSpy.scala
@@ -65,4 +65,6 @@ class MetadataManagerSpy(isPersistent: Boolean) extends MetadataManagerBase(isPe
metadataLocalStore.remove(MetadataTableKey(tableName, infoDate))
}
+
+ override def close(): Unit = {}
}
diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/model/QueryBuilderSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/model/QueryBuilderSuite.scala
index c5f4fa71c..fc8de3c0b 100644
--- a/pramen/core/src/test/scala/za/co/absa/pramen/core/model/QueryBuilderSuite.scala
+++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/model/QueryBuilderSuite.scala
@@ -51,31 +51,31 @@ class QueryBuilderSuite extends AnyWordSpec {
"throw an exception when no query configuration is specified" in {
val conf = ConfigFactory.parseString("")
- val exception = intercept[IllegalArgumentException] {
+ val ex = intercept[IllegalArgumentException] {
QueryBuilder.fromConfig(conf, "", "")
}
- assert(exception.getMessage == "No options are specified for the query. Usually, it is one of: 'sql', 'path', 'table', 'db.table', 'topic'.")
+ assert(ex.getMessage == "No options are specified for the query. Usually, it is one of: 'sql', 'path', 'table', 'db.table', 'topic'.")
}
"throw an exception when the prefix is empty" in {
val conf = ConfigFactory.parseString("data = /tmp")
- val exception = intercept[IllegalArgumentException] {
+ val ex = intercept[IllegalArgumentException] {
QueryBuilder.fromConfig(conf, "input", "")
}
- assert(exception.getMessage == "No options are specified for the 'input' query. Usually, it is one of: 'input.sql', 'input.path', 'input.table', 'input.db.table', 'input.topic'.")
+ assert(ex.getMessage == "No options are specified for the 'input' query. Usually, it is one of: 'input.sql', 'input.path', 'input.table', 'input.db.table', 'input.topic'.")
}
"throw an exception when the prefix is empty and parent is specified" in {
val conf = ConfigFactory.parseString("data = /tmp")
- val exception = intercept[IllegalArgumentException] {
+ val ex = intercept[IllegalArgumentException] {
QueryBuilder.fromConfig(conf, "input", "my.parent")
}
- assert(exception.getMessage == "No options are specified for the 'input' query. Usually, it is one of: 'input.sql', 'input.path', 'input.table', 'input.db.table', 'input.topic' at my.parent.")
+ assert(ex.getMessage == "No options are specified for the 'input' query. Usually, it is one of: 'input.sql', 'input.path', 'input.table', 'input.db.table', 'input.topic' at my.parent.")
}
}
}
diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperDynamoDbBuilderSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperDynamoDbBuilderSuite.scala
new file mode 100644
index 000000000..d0dbdf0e8
--- /dev/null
+++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperDynamoDbBuilderSuite.scala
@@ -0,0 +1,154 @@
+/*
+ * Copyright 2022 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.pramen.core.tests.bookkeeper
+
+import org.scalatest.wordspec.AnyWordSpec
+import software.amazon.awssdk.auth.credentials.{AwsBasicCredentials, StaticCredentialsProvider}
+import za.co.absa.pramen.core.bookkeeper.BookkeeperDynamoDb
+
+class BookkeeperDynamoDbBuilderSuite extends AnyWordSpec {
+
+  "BookkeeperDynamoDbBuilder" should {
+    "use default table prefix" in {
+      val builder = BookkeeperDynamoDb.builder
+        .withRegion("us-east-1")
+        .withBatchId(123456789L)
+
+      // We can't build without a reachable DynamoDB endpoint,
+      // but we can verify the fluent API keeps returning the builder type
+      assert(builder.isInstanceOf[BookkeeperDynamoDb.BookkeeperDynamoDbBuilder])
+    }
+
+    "allow setting region" in {
+      val builder = BookkeeperDynamoDb.builder
+        .withRegion("eu-west-1")
+        .withBatchId(123456789L)
+
+      assert(builder.isInstanceOf[BookkeeperDynamoDb.BookkeeperDynamoDbBuilder])
+    }
+
+    "allow setting table ARN" in {
+      val builder = BookkeeperDynamoDb.builder
+        .withRegion("us-east-1")
+        .withTableArn("arn:aws:dynamodb:us-east-1:123456789012:table/")
+        .withBatchId(123456789L)
+
+      assert(builder.isInstanceOf[BookkeeperDynamoDb.BookkeeperDynamoDbBuilder])
+    }
+
+    "allow setting table prefix" in {
+      val builder = BookkeeperDynamoDb.builder
+        .withRegion("us-east-1")
+        .withTablePrefix("test_pramen")
+        .withBatchId(123456789L)
+
+      assert(builder.isInstanceOf[BookkeeperDynamoDb.BookkeeperDynamoDbBuilder])
+    }
+
+    "allow setting credentials provider" in {
+      val credentials = AwsBasicCredentials.create("accessKey", "secretKey")
+      val credentialsProvider = StaticCredentialsProvider.create(credentials)
+
+      val builder = BookkeeperDynamoDb.builder
+        .withRegion("us-east-1")
+        .withCredentialsProvider(credentialsProvider)
+        .withBatchId(123456789L)
+
+      assert(builder.isInstanceOf[BookkeeperDynamoDb.BookkeeperDynamoDbBuilder])
+    }
+
+    "allow setting endpoint" in {
+      val builder = BookkeeperDynamoDb.builder
+        .withRegion("us-east-1")
+        .withEndpoint("http://localhost:8000")
+        .withBatchId(123456789L)
+
+      assert(builder.isInstanceOf[BookkeeperDynamoDb.BookkeeperDynamoDbBuilder])
+    }
+
+    "allow setting batch ID" in {
+      val batchId = 987654321L
+      val builder = BookkeeperDynamoDb.builder
+        .withRegion("us-east-1")
+        .withBatchId(batchId)
+
+      assert(builder.isInstanceOf[BookkeeperDynamoDb.BookkeeperDynamoDbBuilder])
+    }
+
+    "support fluent API chaining" in {
+      val credentials = AwsBasicCredentials.create("accessKey", "secretKey")
+      val credentialsProvider = StaticCredentialsProvider.create(credentials)
+
+      val builder = BookkeeperDynamoDb.builder
+        .withRegion("ap-southeast-2")
+        .withTableArn("arn:aws:dynamodb:ap-southeast-2:123456789012:table/")
+        .withTablePrefix("prod_pramen")
+        .withCredentialsProvider(credentialsProvider)
+        .withEndpoint("http://localhost:8000")
+        .withBatchId(111222333L)
+
+      assert(builder.isInstanceOf[BookkeeperDynamoDb.BookkeeperDynamoDbBuilder])
+    }
+
+    "throw IllegalArgumentException when region is not set" in {
+      val builder = BookkeeperDynamoDb.builder
+        .withBatchId(123456789L)
+
+      val ex = intercept[IllegalArgumentException] {
+        builder.build()
+      }
+
+      assert(ex.getMessage.contains("region"))
+    }
+
+    "throw IllegalArgumentException when batch ID is not set" in {
+      val builder = BookkeeperDynamoDb.builder
+        .withRegion("us-east-1")
+
+      val ex = intercept[IllegalArgumentException] {
+        builder.build()
+      }
+
+      assert(ex.getMessage.contains("BatchId is not supplied"))
+    }
+  }
+
+  "BookkeeperDynamoDb.getFullTableName" should {
+    "return table name when no ARN is provided" in {
+      val result = BookkeeperDynamoDb.getFullTableName(None, "test_table")
+      assert(result == "test_table")
+    }
+
+    "append 'table/' when the ARN ends with a slash other than 'table/'" in {
+      val arn = "arn:aws:dynamodb:us-east-1:123456789012:path/"
+      val result = BookkeeperDynamoDb.getFullTableName(Some(arn), "test_table")
+      assert(result == s"${arn}table/test_table")
+    }
+
+    "not append 'table/' when the ARN already ends with 'table/'" in {
+      val arn = "arn:aws:dynamodb:us-east-1:123456789012:table/"
+      val result = BookkeeperDynamoDb.getFullTableName(Some(arn), "test_table")
+      assert(result == s"${arn}test_table")
+    }
+
+    "handle ARN without trailing slash by adding /table/" in {
+      val arn = "arn:aws:dynamodb:eu-west-1:987654321098"
+      val result = BookkeeperDynamoDb.getFullTableName(Some(arn), "my_table")
+      assert(result == s"$arn/table/my_table")
+    }
+  }
+}
diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/OffsetManagerDynamoDbBuilderSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/OffsetManagerDynamoDbBuilderSuite.scala
new file mode 100644
index 000000000..b8714d858
--- /dev/null
+++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/OffsetManagerDynamoDbBuilderSuite.scala
@@ -0,0 +1,117 @@
+/*
+ * Copyright 2022 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.pramen.core.tests.bookkeeper
+
+import org.scalatest.wordspec.AnyWordSpec
+import software.amazon.awssdk.auth.credentials.{AwsBasicCredentials, StaticCredentialsProvider}
+import za.co.absa.pramen.core.bookkeeper.OffsetManagerDynamoDb
+
+class OffsetManagerDynamoDbBuilderSuite extends AnyWordSpec {
+
+ "OffsetManagerDynamoDbBuilder" should {
+ "use default table prefix when not specified" in {
+ val builder = OffsetManagerDynamoDb.builder
+ .withRegion("us-east-1")
+ .withBatchId(123456789L)
+
+ assert(builder.isInstanceOf[OffsetManagerDynamoDb.OffsetManagerDynamoDbBuilder])
+ }
+
+ "allow setting region" in {
+ val builder = OffsetManagerDynamoDb.builder
+ .withRegion("eu-central-1")
+ .withBatchId(123456789L)
+
+ assert(builder.isInstanceOf[OffsetManagerDynamoDb.OffsetManagerDynamoDbBuilder])
+ }
+
+ "allow setting table ARN" in {
+ val builder = OffsetManagerDynamoDb.builder
+ .withRegion("us-west-2")
+ .withTableArn("arn:aws:dynamodb:us-west-2:123456789012:table/")
+ .withBatchId(123456789L)
+
+ assert(builder.isInstanceOf[OffsetManagerDynamoDb.OffsetManagerDynamoDbBuilder])
+ }
+
+ "allow setting table prefix" in {
+ val builder = OffsetManagerDynamoDb.builder
+ .withRegion("ap-northeast-1")
+ .withTablePrefix("staging_pramen")
+ .withBatchId(123456789L)
+
+ assert(builder.isInstanceOf[OffsetManagerDynamoDb.OffsetManagerDynamoDbBuilder])
+ }
+
+ "allow setting credentials provider" in {
+ val credentials = AwsBasicCredentials.create("testAccessKey", "testSecretKey")
+ val credentialsProvider = StaticCredentialsProvider.create(credentials)
+
+ val builder = OffsetManagerDynamoDb.builder
+ .withRegion("us-east-1")
+ .withCredentialsProvider(credentialsProvider)
+ .withBatchId(123456789L)
+
+ assert(builder.isInstanceOf[OffsetManagerDynamoDb.OffsetManagerDynamoDbBuilder])
+ }
+
+ "allow setting endpoint for local testing" in {
+ val builder = OffsetManagerDynamoDb.builder
+ .withRegion("us-east-1")
+ .withEndpoint("http://localhost:4566")
+ .withBatchId(123456789L)
+
+ assert(builder.isInstanceOf[OffsetManagerDynamoDb.OffsetManagerDynamoDbBuilder])
+ }
+
+ "allow setting batch ID" in {
+ val batchId = 1234567890123L
+ val builder = OffsetManagerDynamoDb.builder
+ .withRegion("us-east-1")
+ .withBatchId(batchId)
+
+ assert(builder.isInstanceOf[OffsetManagerDynamoDb.OffsetManagerDynamoDbBuilder])
+ }
+
+ "support fluent API with all parameters" in {
+ val credentials = AwsBasicCredentials.create("key", "secret")
+ val credentialsProvider = StaticCredentialsProvider.create(credentials)
+ val batchId = System.currentTimeMillis()
+
+ val builder = OffsetManagerDynamoDb.builder
+ .withRegion("sa-east-1")
+ .withTableArn("arn:aws:dynamodb:sa-east-1:999888777666:table/")
+ .withTablePrefix("dev_pramen")
+ .withCredentialsProvider(credentialsProvider)
+ .withEndpoint("http://dynamodb.local:8000")
+ .withBatchId(batchId)
+
+ assert(builder.isInstanceOf[OffsetManagerDynamoDb.OffsetManagerDynamoDbBuilder])
+ }
+
+ "throw IllegalArgumentException when region is missing" in {
+ val builder = OffsetManagerDynamoDb.builder
+ .withBatchId(123456789L)
+
+ val ex = intercept[IllegalArgumentException] {
+ builder.build()
+ }
+
+ assert(ex.getMessage.contains("Region"))
+ }
+ }
+}
diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/journal/JournalDynamoDBBuilderSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/journal/JournalDynamoDBBuilderSuite.scala
new file mode 100644
index 000000000..2574096bc
--- /dev/null
+++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/journal/JournalDynamoDBBuilderSuite.scala
@@ -0,0 +1,99 @@
+/*
+ * Copyright 2022 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.pramen.core.tests.journal
+
+import org.scalatest.wordspec.AnyWordSpec
+import software.amazon.awssdk.auth.credentials.{AwsBasicCredentials, StaticCredentialsProvider}
+import za.co.absa.pramen.core.journal.JournalDynamoDB
+
+class JournalDynamoDBBuilderSuite extends AnyWordSpec {
+
+ "JournalDynamoDBBuilder" should {
+ "use default table prefix when not specified" in {
+ val builder = JournalDynamoDB.builder
+ .withRegion("us-east-1")
+
+ assert(builder.isInstanceOf[JournalDynamoDB.JournalDynamoDBBuilder])
+ }
+
+ "allow setting region" in {
+ val builder = JournalDynamoDB.builder
+ .withRegion("eu-west-2")
+
+ assert(builder.isInstanceOf[JournalDynamoDB.JournalDynamoDBBuilder])
+ }
+
+ "allow setting table ARN" in {
+ val builder = JournalDynamoDB.builder
+ .withRegion("us-west-1")
+ .withTableArn("arn:aws:dynamodb:us-west-1:111222333444:table/")
+
+ assert(builder.isInstanceOf[JournalDynamoDB.JournalDynamoDBBuilder])
+ }
+
+ "allow setting table prefix" in {
+ val builder = JournalDynamoDB.builder
+ .withRegion("ap-south-1")
+ .withTablePrefix("qa_pramen")
+
+ assert(builder.isInstanceOf[JournalDynamoDB.JournalDynamoDBBuilder])
+ }
+
+ "allow setting credentials provider" in {
+ val credentials = AwsBasicCredentials.create("myAccessKey", "mySecretKey")
+ val credentialsProvider = StaticCredentialsProvider.create(credentials)
+
+ val builder = JournalDynamoDB.builder
+ .withRegion("us-east-2")
+ .withCredentialsProvider(credentialsProvider)
+
+ assert(builder.isInstanceOf[JournalDynamoDB.JournalDynamoDBBuilder])
+ }
+
+ "allow setting endpoint for local development" in {
+ val builder = JournalDynamoDB.builder
+ .withRegion("local")
+ .withEndpoint("http://localhost:8000")
+
+ assert(builder.isInstanceOf[JournalDynamoDB.JournalDynamoDBBuilder])
+ }
+
+ "support fluent API chaining" in {
+ val credentials = AwsBasicCredentials.create("testKey", "testSecret")
+ val credentialsProvider = StaticCredentialsProvider.create(credentials)
+
+ val builder = JournalDynamoDB.builder
+ .withRegion("ca-central-1")
+ .withTableArn("arn:aws:dynamodb:ca-central-1:555666777888:table/")
+ .withTablePrefix("prod_journal")
+ .withCredentialsProvider(credentialsProvider)
+ .withEndpoint("http://dynamodb-local:8000")
+
+ assert(builder.isInstanceOf[JournalDynamoDB.JournalDynamoDBBuilder])
+ }
+
+ "throw IllegalArgumentException when region is not set" in {
+ val builder = JournalDynamoDB.builder
+
+ val ex = intercept[IllegalArgumentException] {
+ builder.build()
+ }
+
+ assert(ex.getMessage.contains("Region"))
+ }
+ }
+}
diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/lock/TokenLockFactoryDynamoDbBuilderSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/lock/TokenLockFactoryDynamoDbBuilderSuite.scala
new file mode 100644
index 000000000..a653303b0
--- /dev/null
+++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/lock/TokenLockFactoryDynamoDbBuilderSuite.scala
@@ -0,0 +1,99 @@
+/*
+ * Copyright 2022 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.pramen.core.tests.lock
+
+import org.scalatest.wordspec.AnyWordSpec
+import software.amazon.awssdk.auth.credentials.{AwsBasicCredentials, StaticCredentialsProvider}
+import za.co.absa.pramen.core.lock.TokenLockFactoryDynamoDb
+
+class TokenLockFactoryDynamoDbBuilderSuite extends AnyWordSpec {
+
+ "TokenLockFactoryDynamoDbBuilder" should {
+ "use default table prefix when not set" in {
+ val builder = TokenLockFactoryDynamoDb.builder
+ .withRegion("us-east-1")
+
+ assert(builder.isInstanceOf[TokenLockFactoryDynamoDb.TokenLockFactoryDynamoDbBuilder])
+ }
+
+ "allow setting region" in {
+ val builder = TokenLockFactoryDynamoDb.builder
+ .withRegion("eu-west-3")
+
+ assert(builder.isInstanceOf[TokenLockFactoryDynamoDb.TokenLockFactoryDynamoDbBuilder])
+ }
+
+ "allow setting table ARN" in {
+ val builder = TokenLockFactoryDynamoDb.builder
+ .withRegion("ap-east-1")
+ .withTableArn("arn:aws:dynamodb:ap-east-1:999888777666:table/")
+
+ assert(builder.isInstanceOf[TokenLockFactoryDynamoDb.TokenLockFactoryDynamoDbBuilder])
+ }
+
+ "allow setting table prefix" in {
+ val builder = TokenLockFactoryDynamoDb.builder
+ .withRegion("sa-east-1")
+ .withTablePrefix("lock_pramen")
+
+ assert(builder.isInstanceOf[TokenLockFactoryDynamoDb.TokenLockFactoryDynamoDbBuilder])
+ }
+
+ "allow setting credentials provider" in {
+ val credentials = AwsBasicCredentials.create("lockKey", "lockSecret")
+ val credentialsProvider = StaticCredentialsProvider.create(credentials)
+
+ val builder = TokenLockFactoryDynamoDb.builder
+ .withRegion("us-west-1")
+ .withCredentialsProvider(credentialsProvider)
+
+ assert(builder.isInstanceOf[TokenLockFactoryDynamoDb.TokenLockFactoryDynamoDbBuilder])
+ }
+
+ "allow setting endpoint for testing" in {
+ val builder = TokenLockFactoryDynamoDb.builder
+ .withRegion("local")
+ .withEndpoint("http://dynamodb.local:8888")
+
+ assert(builder.isInstanceOf[TokenLockFactoryDynamoDb.TokenLockFactoryDynamoDbBuilder])
+ }
+
+ "support full fluent API" in {
+ val credentials = AwsBasicCredentials.create("fluentKey", "fluentSecret")
+ val credentialsProvider = StaticCredentialsProvider.create(credentials)
+
+ val builder = TokenLockFactoryDynamoDb.builder
+ .withRegion("cn-north-1")
+ .withTableArn("arn:aws-cn:dynamodb:cn-north-1:123456789012:table/")
+ .withTablePrefix("distributed_locks")
+ .withCredentialsProvider(credentialsProvider)
+ .withEndpoint("http://private-dynamodb:8000")
+
+ assert(builder.isInstanceOf[TokenLockFactoryDynamoDb.TokenLockFactoryDynamoDbBuilder])
+ }
+
+ "throw IllegalArgumentException when region is not provided" in {
+ val builder = TokenLockFactoryDynamoDb.builder
+
+ val ex = intercept[IllegalArgumentException] {
+ builder.build()
+ }
+
+ assert(ex.getMessage.contains("Region"))
+ }
+ }
+}
diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/metadata/MetadataManagerDynamoDbBuilderSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/metadata/MetadataManagerDynamoDbBuilderSuite.scala
new file mode 100644
index 000000000..84e294732
--- /dev/null
+++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/metadata/MetadataManagerDynamoDbBuilderSuite.scala
@@ -0,0 +1,99 @@
+/*
+ * Copyright 2022 ABSA Group Limited
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package za.co.absa.pramen.core.tests.metadata
+
+import org.scalatest.wordspec.AnyWordSpec
+import software.amazon.awssdk.auth.credentials.{AwsBasicCredentials, StaticCredentialsProvider}
+import za.co.absa.pramen.core.metadata.MetadataManagerDynamoDb
+
+class MetadataManagerDynamoDbBuilderSuite extends AnyWordSpec {
+
+ "MetadataManagerDynamoDbBuilder" should {
+ "use default table prefix" in {
+ val builder = MetadataManagerDynamoDb.builder
+ .withRegion("us-east-1")
+
+ assert(builder.isInstanceOf[MetadataManagerDynamoDb.MetadataManagerDynamoDbBuilder])
+ }
+
+ "allow setting region" in {
+ val builder = MetadataManagerDynamoDb.builder
+ .withRegion("eu-north-1")
+
+ assert(builder.isInstanceOf[MetadataManagerDynamoDb.MetadataManagerDynamoDbBuilder])
+ }
+
+ "allow setting table ARN" in {
+ val builder = MetadataManagerDynamoDb.builder
+ .withRegion("ap-southeast-1")
+ .withTableArn("arn:aws:dynamodb:ap-southeast-1:123123123123:table/")
+
+ assert(builder.isInstanceOf[MetadataManagerDynamoDb.MetadataManagerDynamoDbBuilder])
+ }
+
+ "allow setting table prefix" in {
+ val builder = MetadataManagerDynamoDb.builder
+ .withRegion("me-south-1")
+ .withTablePrefix("test_metadata")
+
+ assert(builder.isInstanceOf[MetadataManagerDynamoDb.MetadataManagerDynamoDbBuilder])
+ }
+
+ "allow setting credentials provider" in {
+ val credentials = AwsBasicCredentials.create("access", "secret")
+ val credentialsProvider = StaticCredentialsProvider.create(credentials)
+
+ val builder = MetadataManagerDynamoDb.builder
+ .withRegion("af-south-1")
+ .withCredentialsProvider(credentialsProvider)
+
+ assert(builder.isInstanceOf[MetadataManagerDynamoDb.MetadataManagerDynamoDbBuilder])
+ }
+
+ "allow setting endpoint" in {
+ val builder = MetadataManagerDynamoDb.builder
+ .withRegion("us-west-2")
+ .withEndpoint("http://localstack:4566")
+
+ assert(builder.isInstanceOf[MetadataManagerDynamoDb.MetadataManagerDynamoDbBuilder])
+ }
+
+ "support complete fluent API chain" in {
+ val credentials = AwsBasicCredentials.create("myKey", "mySecret")
+ val credentialsProvider = StaticCredentialsProvider.create(credentials)
+
+ val builder = MetadataManagerDynamoDb.builder
+ .withRegion("ap-northeast-2")
+ .withTableArn("arn:aws:dynamodb:ap-northeast-2:444333222111:table/")
+ .withTablePrefix("metadata_manager")
+ .withCredentialsProvider(credentialsProvider)
+ .withEndpoint("http://custom-endpoint:9000")
+
+ assert(builder.isInstanceOf[MetadataManagerDynamoDb.MetadataManagerDynamoDbBuilder])
+ }
+
+ "throw IllegalArgumentException when region is missing" in {
+ val builder = MetadataManagerDynamoDb.builder
+
+ val ex = intercept[IllegalArgumentException] {
+ builder.build()
+ }
+
+ assert(ex.getMessage.contains("Region"))
+ }
+ }
+}
diff --git a/pramen/examples/dynamodb_bookkeeping/README.md b/pramen/examples/dynamodb_bookkeeping/README.md
new file mode 100644
index 000000000..4e0f567c6
--- /dev/null
+++ b/pramen/examples/dynamodb_bookkeeping/README.md
@@ -0,0 +1,531 @@
+# DynamoDB Bookkeeping Example
+
+This example demonstrates how to configure Pramen to use AWS DynamoDB for bookkeeping instead of MongoDB, JDBC databases, or Hadoop-based storage.
+
+## Overview
+
+DynamoDB bookkeeping provides a serverless, fully managed solution for tracking pipeline state, record counts, and data availability in Pramen pipelines.
+
+### Benefits
+
+- **Serverless**: No database servers to manage or maintain
+- **Auto-scaling**: Automatically scales to handle workload
+- **Pay-per-request**: No fixed costs, pay only for what you use
+- **High Availability**: Built-in replication across AWS availability zones
+- **Multi-environment**: Easy separation via table prefixes
+- **Automatic Table Creation**: Tables are created automatically on first run
+
+## Configuration
+
+### Minimal Configuration
+
+```hocon
+pramen.bookkeeping {
+ enabled = true
+ dynamodb.region = "us-east-1"
+}
+```
+
+This creates tables:
+- `pramen_bookkeeping` - Data availability and record counts
+- `pramen_schemas` - Table schema evolution
+- `pramen_locks` - Distributed locking (if locks enabled)
+- `pramen_journal` - Task completion history
+- `pramen_metadata` - Custom metadata key-value pairs
+- `pramen_offsets` - Incremental ingestion offset tracking
+
+### Production Configuration
+
+```hocon
+pramen.bookkeeping {
+ enabled = true
+ dynamodb.region = "us-east-1"
+ dynamodb.table.prefix = "pramen_production"
+}
+```
+
+This creates tables:
+- `pramen_production_bookkeeping` - Data availability and record counts
+- `pramen_production_schemas` - Table schema evolution
+- `pramen_production_locks` - Distributed locking (if locks enabled)
+- `pramen_production_journal` - Task completion history
+- `pramen_production_metadata` - Custom metadata key-value pairs
+- `pramen_production_offsets` - Incremental ingestion offset tracking
+
+### Multi-Environment Configuration
+
+**Development:**
+```hocon
+pramen.bookkeeping {
+ enabled = true
+ dynamodb.region = "us-east-1"
+ dynamodb.table.prefix = "pramen_dev"
+}
+```
+
+**Staging:**
+```hocon
+pramen.bookkeeping {
+ enabled = true
+ dynamodb.region = "us-east-1"
+ dynamodb.table.prefix = "pramen_staging"
+}
+```
+
+**Production:**
+```hocon
+pramen.bookkeeping {
+ enabled = true
+ dynamodb.region = "us-east-1"
+ dynamodb.table.prefix = "pramen_production"
+}
+```
+
+### Cross-Account Configuration
+
+If DynamoDB tables are in a different AWS account:
+
+```hocon
+pramen.bookkeeping {
+ enabled = true
+ dynamodb.region = "us-west-2"
+ dynamodb.table.arn = "arn:aws:dynamodb:us-west-2:987654321098:table/"
+ dynamodb.table.prefix = "shared_pramen"
+}
+```
+
+## AWS Setup
+
+### 1. AWS Credentials
+
+Pramen uses the AWS SDK's `DefaultCredentialsProvider`, which loads credentials from:
+
+1. **Environment Variables**:
+ ```bash
+ export AWS_ACCESS_KEY_ID=your_access_key
+ export AWS_SECRET_ACCESS_KEY=your_secret_key
+ export AWS_REGION=us-east-1
+ ```
+
+2. **AWS Credentials File** (`~/.aws/credentials`):
+ ```ini
+ [default]
+ aws_access_key_id = your_access_key
+ aws_secret_access_key = your_secret_key
+ region = us-east-1
+ ```
+
+3. **IAM Role** (recommended for EC2, ECS, EMR, etc.):
+ - No credentials needed in configuration
+ - Automatically uses the instance/task role
+
+### 2. Required IAM Permissions
+
+Create an IAM policy with these permissions:
+
+```json
+{
+ "Version": "2012-10-17",
+ "Statement": [
+ {
+ "Effect": "Allow",
+ "Action": [
+ "dynamodb:CreateTable",
+ "dynamodb:DescribeTable",
+ "dynamodb:Query",
+ "dynamodb:PutItem",
+ "dynamodb:DeleteItem",
+ "dynamodb:GetItem",
+ "dynamodb:Scan"
+ ],
+ "Resource": [
+ "arn:aws:dynamodb:us-east-1:*:table/pramen_*"
+ ]
+ }
+ ]
+}
+```
+
+**Note**: Adjust the region and table name pattern based on your configuration.
+
+### 3. Table Structure
+
+Tables are automatically created with the following schema:
+
+#### Bookkeeping Table (`{prefix}_bookkeeping`)
+- **Partition Key**: `tableName` (String)
+- **Sort Key**: `infoDate` (String, format: yyyy-MM-dd)
+- **Billing Mode**: PAY_PER_REQUEST (on-demand)
+- **Attributes**:
+ - `tableName`: Name of the metastore table
+ - `infoDate`: Information date
+ - `infoDateBegin`: Start of date range
+ - `infoDateEnd`: End of date range
+ - `inputRecordCount`: Number of input records
+ - `outputRecordCount`: Number of output records
+ - `jobStarted`: Job start timestamp (milliseconds)
+ - `jobFinished`: Job finish timestamp (milliseconds)
+ - `batchId`: Batch execution ID
+ - `appendedRecordCount`: Records appended (optional)
+
+#### Schema Table (`{prefix}_schemas`)
+- **Partition Key**: `tableName` (String)
+- **Sort Key**: `infoDate` (String)
+- **Billing Mode**: PAY_PER_REQUEST (on-demand)
+- **Attributes**:
+ - `tableName`: Name of the metastore table
+ - `infoDate`: Date when schema was recorded
+ - `schemaJson`: Spark schema in JSON format
+
+#### Offset Table (`{prefix}_offsets`)
+- **Partition Key**: `pramenTableName` (String)
+- **Sort Key**: `compositeKey` (String, format: "infoDate#createdAtMilli")
+- **Billing Mode**: PAY_PER_REQUEST (on-demand)
+- **Attributes**:
+ - `pramenTableName`: Name of the metastore table
+ - `compositeKey`: Composite key for efficient querying (infoDate#createdAtMilli)
+ - `infoDate`: Information date
+ - `dataType`: Offset data type (e.g., "IntegralType", "StringType")
+ - `minOffset`: Minimum offset value for this batch
+ - `maxOffset`: Maximum offset value for this batch
+ - `batchId`: Batch execution ID
+ - `createdAt`: Timestamp when offset was created (milliseconds)
+ - `committedAt`: Timestamp when offset was committed (milliseconds, optional)
+
+## Running the Example
+
+1. **Configure AWS credentials** (see above)
+
+2. **Update the configuration file**:
+ ```bash
+ vi examples/dynamodb_bookkeeping/dynamodb_bookkeeping.conf
+ ```
+
+3. **Run Pramen**:
+ ```bash
+ spark-submit \
+ --class za.co.absa.pramen.runner.PipelineRunner \
+ --master local[*] \
+ pramen-runner_2.12-1.13.10.jar \
+ --config examples/dynamodb_bookkeeping/dynamodb_bookkeeping.conf \
+ --date 2024-01-15
+ ```
+
+4. **Verify tables were created**:
+ ```bash
+ aws dynamodb list-tables --region us-east-1
+ ```
+
+ You should see:
+ - `pramen_production_bookkeeping`
+ - `pramen_production_schemas`
+ - `pramen_production_offsets` (if using incremental ingestion)
+ - `pramen_production_locks` (if locks enabled)
+ - `pramen_production_journal` (if journal enabled)
+ - `pramen_production_metadata` (if metadata enabled)
+
+5. **Query bookkeeping data**:
+ ```bash
+ aws dynamodb query \
+ --table-name pramen_production_bookkeeping \
+ --key-condition-expression "tableName = :table" \
+ --expression-attribute-values '{":table":{"S":"example_table"}}' \
+ --region us-east-1
+ ```
+
+## Cost Considerations
+
+DynamoDB uses pay-per-request billing:
+
+- **On-demand mode** (default):
+ - Write: $1.25 per million write requests
+ - Read: $0.25 per million read requests
+ - Storage: $0.25 per GB-month
+
+- **Typical Pramen workload**:
+ - Small pipelines: < $1/month
+ - Medium pipelines: $5-20/month
+ - Large pipelines: $50-100/month
+
+**Cost optimization tips**:
+1. Use table prefixes to separate environments (avoid duplicating production data)
+2. Archive old bookkeeping data periodically
+3. Monitor usage via AWS Cost Explorer
+
+## Troubleshooting
+
+### Issue: "Access Denied" error
+
+**Cause**: Missing IAM permissions
+
+**Solution**: Verify IAM policy includes all required DynamoDB permissions
+
+### Issue: "Region not found" error
+
+**Cause**: Invalid AWS region specified
+
+**Solution**: Check region name in configuration matches AWS region codes
+(e.g., `us-east-1`, `eu-west-1`, `ap-southeast-1`)
+
+### Issue: Tables not created automatically
+
+**Cause**: Missing `dynamodb:CreateTable` permission
+
+**Solution**: Add CreateTable permission to IAM policy, or manually create tables:
+
+```bash
+# Create bookkeeping table
+aws dynamodb create-table \
+ --table-name pramen_production_bookkeeping \
+ --attribute-definitions \
+ AttributeName=tableName,AttributeType=S \
+ AttributeName=infoDate,AttributeType=S \
+ --key-schema \
+ AttributeName=tableName,KeyType=HASH \
+ AttributeName=infoDate,KeyType=RANGE \
+ --billing-mode PAY_PER_REQUEST \
+ --region us-east-1
+
+# Create schema table
+aws dynamodb create-table \
+ --table-name pramen_production_schemas \
+ --attribute-definitions \
+ AttributeName=tableName,AttributeType=S \
+ AttributeName=infoDate,AttributeType=S \
+ --key-schema \
+ AttributeName=tableName,KeyType=HASH \
+ AttributeName=infoDate,KeyType=RANGE \
+ --billing-mode PAY_PER_REQUEST \
+ --region us-east-1
+```
+
+### Issue: Slow queries
+
+**Cause**: Large number of bookkeeping records
+
+**Solution**:
+1. Use date range filters in queries
+2. Consider implementing table retention policy
+3. Archive old data to S3
+
+## Comparison with Other Bookkeeping Options
+
+| Feature | DynamoDB | JDBC | MongoDB | Hadoop/Delta |
+|---------|----------|------|---------|--------------|
+| Setup Complexity | Low | Medium | Medium | Low |
+| Maintenance | None | High | Medium | Low |
+| Cost (small) | Very Low | Medium | Medium | Very Low |
+| Cost (large) | Medium | High | Medium | Low |
+| Scaling | Automatic | Manual | Manual | Automatic |
+| Multi-region | Yes | No | Yes | Yes |
+| Query Performance | Fast | Fast | Fast | Slower |
+| Incremental Support | Yes | Yes | No | No |
+
+## Distributed Locking with DynamoDB
+
+When DynamoDB is configured for bookkeeping, Pramen automatically uses it for distributed locking to prevent concurrent pipeline runs. This ensures data consistency in multi-instance deployments.
+
+### How It Works
+
+1. **Automatic Lock Table Creation**: A locks table is created automatically using a builder pattern:
+ - Table name: `{prefix}_locks` (e.g., `pramen_production_locks`)
+ - Schema: `token` (partition key), `owner`, `expires`, `createdAt`
+ - Created via `TokenLockFactoryDynamoDb.builder`
+
+2. **Lock Acquisition**: Uses DynamoDB conditional writes (`attribute_not_exists`) for atomic lock operations
+
+3. **Lock Renewal**: Active pipelines automatically renew their locks every 2 minutes
+
+4. **Lock Expiration**: Locks expire after 10 minutes of inactivity and can be taken over
+
+5. **Hard Expiration**: Stale locks are cleaned up after 1 day
+
+6. **Builder Pattern**: Lock factory is created using a fluent builder API for flexible configuration
+
+### Configuration
+
+Enable locking along with DynamoDB bookkeeping:
+
+```hocon
+pramen {
+ # Enable distributed locking
+ runtime.use.locks = true
+
+ bookkeeping {
+ enabled = true
+ dynamodb.region = "us-east-1"
+ dynamodb.table.prefix = "pramen_production"
+ }
+}
+```
+
+This creates six tables:
+- `pramen_production_bookkeeping` - Bookkeeping data
+- `pramen_production_schemas` - Table schemas
+- `pramen_production_locks` - Distributed locks
+- `pramen_production_journal` - Task completion history
+- `pramen_production_metadata` - Custom metadata
+- `pramen_production_offsets` - Incremental ingestion offsets
+
+See `dynamodb_with_locks.conf` for a complete example.
+
+### IAM Permissions for Locks
+
+Add the locks table to your IAM policy:
+
+```json
+{
+ "Version": "2012-10-17",
+ "Statement": [
+ {
+ "Effect": "Allow",
+ "Action": [
+ "dynamodb:CreateTable",
+ "dynamodb:DescribeTable",
+ "dynamodb:PutItem",
+ "dynamodb:GetItem",
+ "dynamodb:DeleteItem",
+ "dynamodb:UpdateItem"
+ ],
+ "Resource": [
+ "arn:aws:dynamodb:*:*:table/pramen_production_bookkeeping",
+ "arn:aws:dynamodb:*:*:table/pramen_production_schemas",
+ "arn:aws:dynamodb:*:*:table/pramen_production_locks",
+ "arn:aws:dynamodb:*:*:table/pramen_production_journal",
+ "arn:aws:dynamodb:*:*:table/pramen_production_metadata",
+ "arn:aws:dynamodb:*:*:table/pramen_production_offsets"
+ ]
+ }
+ ]
+}
+```
+
+### Programmatic Usage
+
+You can also create lock factories programmatically using the builder pattern:
+
+```scala
+import za.co.absa.pramen.core.lock.TokenLockFactoryDynamoDb
+
+// Basic usage
+val lockFactory = TokenLockFactoryDynamoDb.builder
+ .withRegion("us-east-1")
+ .withTablePrefix("my_app")
+ .build()
+
+try {
+ val lock = lockFactory.getLock("my_pipeline")
+
+ if (lock.tryAcquire()) {
+ try {
+ // Run your pipeline
+ } finally {
+ lock.release()
+ }
+ }
+} finally {
+ lockFactory.close()
+}
+
+// Testing with DynamoDB Local
+val testFactory = TokenLockFactoryDynamoDb.builder
+ .withRegion("us-east-1")
+ .withEndpoint("http://localhost:8000")
+ .build()
+```
+
+
+### Lock Behavior
+
+**Scenario 1: Single Pipeline Run**
+- Pipeline acquires lock → processes data → releases lock
+
+**Scenario 2: Concurrent Pipeline Runs**
+- Instance A acquires lock → starts processing
+- Instance B tries to acquire same lock → blocked (lock already held)
+- Instance A completes → releases lock
+- Instance B can now acquire lock (if still attempting)
+
+**Scenario 3: Pipeline Crash**
+- Pipeline acquires lock → crashes
+- Lock expires after 10 minutes (no renewal)
+- New pipeline run can take over expired lock
+
+### Monitoring Locks
+
+Query active locks:
+
+```bash
+aws dynamodb scan \
+ --table-name pramen_production_locks \
+ --region us-east-1
+```
+
+Check specific lock:
+
+```bash
+aws dynamodb get-item \
+ --table-name pramen_production_locks \
+ --key '{"token":{"S":"my_pipeline_lock"}}' \
+ --region us-east-1
+```
+
+Manually release stuck lock (use with caution):
+
+```bash
+aws dynamodb delete-item \
+ --table-name pramen_production_locks \
+ --key '{"token":{"S":"my_pipeline_lock"}}' \
+ --region us-east-1
+```
+
+### Lock Cost
+
+Lock operations add minimal cost:
+- Lock acquisition: 1 write request (~$0.00000125)
+- Lock renewal (every 2 min): 1 write request per renewal
+- Lock release: 1 delete request (~$0.00000125)
+- Total per pipeline run: ~$0.00001 (for 10-minute pipeline)
+
+## Advanced Topics
+
+### Using DynamoDB Local for Development
+
+For local development/testing, use DynamoDB Local:
+
+1. **Start DynamoDB Local**:
+ ```bash
+ docker run -p 8000:8000 amazon/dynamodb-local
+ ```
+
+2. **Configure the endpoint** (via the builders' `withEndpoint` method shown above, or directly on the client):
+ ```scala
+ val client = DynamoDbClient.builder()
+ .endpointOverride(new URI("http://localhost:8000"))
+ .region(Region.US_EAST_1)
+ .build()
+ ```
+
+### Table Backup and Restore
+
+Use AWS Backup or DynamoDB point-in-time recovery:
+
+```bash
+# Enable point-in-time recovery
+aws dynamodb update-continuous-backups \
+ --table-name pramen_production_bookkeeping \
+ --point-in-time-recovery-specification PointInTimeRecoveryEnabled=true
+```
+
+### Monitoring
+
+Monitor DynamoDB metrics in CloudWatch:
+- `UserErrors` - Check for configuration issues
+- `ConsumedReadCapacityUnits` / `ConsumedWriteCapacityUnits` - Monitor costs
+- `SystemErrors` - Check for service issues
+
+## References
+
+- [AWS DynamoDB Documentation](https://docs.aws.amazon.com/dynamodb/)
+- [AWS SDK for Java Documentation](https://docs.aws.amazon.com/sdk-for-java/)
+- [DynamoDB Best Practices](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/best-practices.html)
diff --git a/pramen/examples/dynamodb_bookkeeping/dynamodb_bookkeeping.conf b/pramen/examples/dynamodb_bookkeeping/dynamodb_bookkeeping.conf
new file mode 100644
index 000000000..bd36630b4
--- /dev/null
+++ b/pramen/examples/dynamodb_bookkeeping/dynamodb_bookkeeping.conf
@@ -0,0 +1,123 @@
+# Copyright 2022 ABSA Group Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# =============================================================================
+# Example Configuration: DynamoDB Bookkeeping
+# =============================================================================
+#
+# This example shows how to configure Pramen to use AWS DynamoDB for
+# bookkeeping instead of MongoDB, JDBC, or Hadoop-based storage.
+#
+# DynamoDB bookkeeping provides:
+# - Serverless, fully managed storage
+# - Pay-per-request billing (no fixed costs)
+# - Automatic scaling
+# - High availability across AWS regions
+# - Multi-environment support via table prefixes
+#
+
+# General options
+pramen {
+ environment.name = "Production"
+ pipeline.name = "DynamoDB Bookkeeping Example"
+
+ # Enable bookkeeping with DynamoDB
+ bookkeeping {
+ enabled = true
+
+ # =======================================================================
+ # DynamoDB Configuration
+ # =======================================================================
+
+ # AWS Region where DynamoDB tables will be created/accessed
+ # REQUIRED when using DynamoDB bookkeeping
+ dynamodb.region = "af-south-1"
+
+ # Table prefix for multi-environment/multi-tenant deployments
+ # OPTIONAL - defaults to "pramen" if not specified
+ # Creates tables: {prefix}_bookkeeping and {prefix}_schemas
+ dynamodb.table.prefix = "pramen_production"
+
+ # Table ARN prefix for cross-account or resource-based policy access
+ # OPTIONAL - only needed for advanced scenarios
+ # Format: arn:aws:dynamodb:region:account-id:table/
+ # dynamodb.table.arn = "arn:aws:dynamodb:us-east-1:123456789012:table/"
+
+ # =======================================================================
+ # Notes on DynamoDB Configuration
+ # =======================================================================
+ #
+ # 1. AWS Credentials:
+ # - Pramen uses the AWS SDK's DefaultCredentialsProvider
+ # - Credentials are loaded from:
+ # a) Environment variables (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
+ # b) AWS credentials file (~/.aws/credentials)
+ # c) IAM role (when running on EC2, ECS, Lambda, etc.)
+ #
+ # 2. Required IAM Permissions:
+ # Your AWS credentials/role must have permissions for:
+ # - dynamodb:CreateTable (for automatic table creation)
+ # - dynamodb:DescribeTable
+ # - dynamodb:Query
+ # - dynamodb:PutItem
+ # - dynamodb:DeleteItem
+ #
+ # 3. Table Structure:
+ # Tables are automatically created with:
+ # - Bookkeeping: Partition key=tableName, Sort key=infoDate
+ # - Schemas: Partition key=tableName, Sort key=infoDate
+ # - Billing mode: PAY_PER_REQUEST (on-demand)
+ #
+ # 4. Multi-Environment Setup:
+ # Use different table prefixes for different environments:
+ # - Dev: dynamodb.table.prefix = "pramen_dev"
+ # - Staging: dynamodb.table.prefix = "pramen_staging"
+ # - Production: dynamodb.table.prefix = "pramen_production"
+ #
+ # 5. Cross-Account Access:
+ # If tables are in a different AWS account, use the table ARN:
+ # dynamodb.table.arn = "arn:aws:dynamodb:us-west-2:987654321098:table/"
+ #
+
+ # Hadoop format is required even when using DynamoDB
+ # (legacy requirement, set to any valid value)
+ hadoop.format = "delta"
+ }
+
+ # Temporary directory (optional)
+ temporary.directory = "/tmp"
+}
+
+# Metastore configuration
+pramen.metastore {
+ tables = [
+ {
+ name = "example_table"
+ format = "delta"
+ path = "/data/lake/example_table"
+ }
+ ]
+}
+
+# Operations
+pramen.operations = [
+ {
+ name = "Example Operation"
+ type = "transformation"
+ schedule.type = "daily"
+
+ transformer.class = "za.co.absa.pramen.core.transformers.IdentityTransformer"
+ input.table = "example_table"
+ output.table = "example_table"
+ }
+]
diff --git a/pramen/examples/dynamodb_bookkeeping/dynamodb_with_locks.conf b/pramen/examples/dynamodb_bookkeeping/dynamodb_with_locks.conf
new file mode 100644
index 000000000..0ec56e079
--- /dev/null
+++ b/pramen/examples/dynamodb_bookkeeping/dynamodb_with_locks.conf
@@ -0,0 +1,153 @@
+# DynamoDB Bookkeeping with Distributed Locking Example
+#
+# This configuration demonstrates how to use DynamoDB for both bookkeeping AND distributed locking.
+# When DynamoDB is configured for bookkeeping, Pramen automatically uses it for locks as well.
+
+pramen {
+ # Enable locking to prevent concurrent pipeline runs
+ runtime.use.locks = true
+
+ # Bookkeeping configuration
+ bookkeeping {
+ enabled = true
+
+ # DynamoDB Configuration
+ # =====================
+
+ # AWS region where DynamoDB tables will be created (REQUIRED)
+ dynamodb.region = "us-east-1"
+
+ # Table prefix for all Pramen tables (OPTIONAL, default: "pramen")
+ # This creates: {prefix}_bookkeeping, {prefix}_schemas, {prefix}_locks
+ dynamodb.table.prefix = "pramen_production"
+
+ # Table ARN prefix for cross-account or cross-region access (OPTIONAL)
+ # dynamodb.table.arn = "arn:aws:dynamodb:us-east-1:123456789012:table/"
+
+ # Legacy required field (not used for DynamoDB but must be set)
+ hadoop.format = "delta"
+ }
+
+ # Pipeline configuration
+ environment.name = "production"
+ pipeline.name = "example_pipeline"
+
+ # Metastore configuration (example)
+ metastore {
+ tables = [
+ {
+ name = "customer_data"
+ format = "parquet"
+ path = "/data/customers"
+ }
+ ]
+ }
+
+ # Example operations
+ operations = [
+ {
+ name = "ingest_customers"
+ type = "ingestion"
+ schedule.type = "daily"
+
+ source {
+ factory.class = "za.co.absa.pramen.core.source.JdbcSource"
+ jdbc.url = "jdbc:postgresql://localhost:5432/source_db"
+ jdbc.user = "reader"
+ jdbc.password = "secret"
+ query = "SELECT * FROM customers WHERE date = :infoDate"
+ }
+
+ metastore.table = "customer_data"
+ }
+ ]
+}
+
+# ============================================================================
+# How This Configuration Works
+# ============================================================================
+#
+# 1. BOOKKEEPING TABLES:
+# - pramen_production_bookkeeping: Stores data chunk metadata
+# - pramen_production_schemas: Stores table schemas
+#
+# 2. LOCK TABLE:
+# - pramen_production_locks: Stores distributed locks
+#
+# 3. JOURNAL TABLE:
+# - pramen_production_journal: Stores task completion records
+#
+# 4. METADATA TABLE:
+# - pramen_production_metadata: Stores custom metadata key-value pairs
+#
+# 5. AUTOMATIC TABLE CREATION:
+# All tables are created automatically on first run with proper schema:
+# - Partition keys and sort keys configured
+# - PAY_PER_REQUEST billing mode (on-demand)
+#      - Six tables total: bookkeeping, schemas, locks, journal, metadata, offsets
+#
+# 6. LOCK BEHAVIOR:
+# When a pipeline runs:
+# - Acquires a lock by writing to pramen_production_locks table
+# - Uses DynamoDB conditional writes (attribute_not_exists) for atomicity
+# - Lock ticket expires after 10 minutes (automatically renewed)
+# - If another instance tries to run, it will be blocked
+# - Lock is released when pipeline completes or fails
+#
+# 7. MULTI-ENVIRONMENT SETUP:
+# Use different table prefixes for different environments:
+# - Dev: pramen_dev_*
+# - Staging: pramen_staging_*
+# - Production: pramen_production_*
+#
+# 8. AWS CREDENTIALS:
+# Pramen uses AWS DefaultCredentialsProvider which loads from:
+# - Environment variables (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
+# - AWS credentials file (~/.aws/credentials)
+# - IAM role (EC2, ECS, EMR, Lambda, Glue)
+#
+# 9. IAM PERMISSIONS REQUIRED:
+# {
+# "Version": "2012-10-17",
+# "Statement": [
+# {
+# "Effect": "Allow",
+# "Action": [
+# "dynamodb:CreateTable",
+# "dynamodb:DescribeTable",
+# "dynamodb:PutItem",
+# "dynamodb:GetItem",
+# "dynamodb:DeleteItem",
+# "dynamodb:UpdateItem",
+# "dynamodb:Query",
+# "dynamodb:Scan"
+# ],
+# "Resource": [
+# "arn:aws:dynamodb:*:*:table/pramen_production_bookkeeping",
+# "arn:aws:dynamodb:*:*:table/pramen_production_schemas",
+# "arn:aws:dynamodb:*:*:table/pramen_production_locks",
+# "arn:aws:dynamodb:*:*:table/pramen_production_journal",
+# "arn:aws:dynamodb:*:*:table/pramen_production_metadata",
+# "arn:aws:dynamodb:*:*:table/pramen_production_offsets"
+# ]
+# }
+# ]
+# }
+#
+# 10. TESTING:
+# spark-submit --class za.co.absa.pramen.runner.PipelineRunner \
+# pramen-runner_2.12-1.13.10.jar \
+# --config dynamodb_with_locks.conf \
+# --date 2024-01-15
+#
+# 11. COST OPTIMIZATION:
+# - PAY_PER_REQUEST billing: $1.25 per million writes, $0.25 per million reads
+# - Typical pipeline: ~10-20 requests per run
+# - Cost per run: < $0.001
+# - Monthly cost for 100 daily runs: ~$3
+#
+# 12. LOCK EXPIRATION:
+# - Lock tickets expire after 10 minutes of inactivity
+# - Active pipelines renew their lock every 2 minutes
+# - Expired locks can be taken over by new pipeline runs
+# - Hard expiration after 1 day (cleanup of stale locks)
diff --git a/pramen/extras/src/main/scala/za/co/absa/pramen/extras/notification/mq/SingleMessageProducer.scala b/pramen/extras/src/main/scala/za/co/absa/pramen/extras/notification/mq/SingleMessageProducer.scala
index 95c92d0af..3c3c4d458 100644
--- a/pramen/extras/src/main/scala/za/co/absa/pramen/extras/notification/mq/SingleMessageProducer.scala
+++ b/pramen/extras/src/main/scala/za/co/absa/pramen/extras/notification/mq/SingleMessageProducer.scala
@@ -16,10 +16,8 @@
package za.co.absa.pramen.extras.notification.mq
-trait SingleMessageProducer {
+trait SingleMessageProducer extends AutoCloseable {
def send(topic: String, message: String, numberOrRetries: Int = 3): Unit
def connect(): Unit
-
- def close(): Unit
}
diff --git a/pramen/pom.xml b/pramen/pom.xml
index 3bdeff7e5..f3c429dbf 100644
--- a/pramen/pom.xml
+++ b/pramen/pom.xml
@@ -143,6 +143,7 @@
1.1.4
1.10.3
0-10
+ 2.42.23
true
@@ -372,6 +373,13 @@
0.8.0
+
+
+ software.amazon.awssdk
+ dynamodb
+ ${aws.sdk.version}
+
+
org.scalatest
diff --git a/pramen/project/Dependencies.scala b/pramen/project/Dependencies.scala
index b382b0da7..572b2da9a 100644
--- a/pramen/project/Dependencies.scala
+++ b/pramen/project/Dependencies.scala
@@ -29,6 +29,7 @@ object Dependencies {
def CoreDependencies(scalaVersion: String, isDeltaCompile: Boolean): Seq[ModuleID] = Seq(
"org.apache.spark" %% "spark-sql" % sparkVersion(scalaVersion) % Provided,
+ "software.amazon.awssdk" % "dynamodb" % awsSdkVersion % Provided,
"org.mongodb.scala" %% "mongo-scala-driver" % mongoDbScalaDriverVersion,
"com.typesafe.slick" %% "slick" % slickVersion,
"com.typesafe.slick" %% "slick-hikaricp" % slickVersion,
diff --git a/pramen/project/Versions.scala b/pramen/project/Versions.scala
index 86725e468..64e7c57c1 100644
--- a/pramen/project/Versions.scala
+++ b/pramen/project/Versions.scala
@@ -37,6 +37,7 @@ object Versions {
val scalatestVersion = "3.2.14"
val mockitoVersion = "2.28.2"
val httpClientVersion = "4.5.14"
+ val awsSdkVersion = "2.42.23"
def sparkFallbackVersion(scalaVersion: String): String = {
if (scalaVersion.startsWith("2.11.")) {