From 1c00d8714e38e36b63b841b7bcb40673defa595c Mon Sep 17 00:00:00 2001 From: Ruslan Iushchenko Date: Mon, 30 Mar 2026 17:44:25 +0200 Subject: [PATCH 01/13] #181 Add AWS SDK for DynamoDB as a dependency --- pramen/core/pom.xml | 6 ++++++ pramen/pom.xml | 8 ++++++++ pramen/project/Dependencies.scala | 1 + pramen/project/Versions.scala | 1 + 4 files changed, 16 insertions(+) diff --git a/pramen/core/pom.xml b/pramen/core/pom.xml index fb500d65..2db0d900 100644 --- a/pramen/core/pom.xml +++ b/pramen/core/pom.xml @@ -144,6 +144,12 @@ channel_scala_${scala.compat.version} + + + software.amazon.awssdk + dynamodb + + org.mockito diff --git a/pramen/pom.xml b/pramen/pom.xml index 3bdeff7e..f3c429db 100644 --- a/pramen/pom.xml +++ b/pramen/pom.xml @@ -143,6 +143,7 @@ 1.1.4 1.10.3 0-10 + 2.42.23 true @@ -372,6 +373,13 @@ 0.8.0 + + + software.amazon.awssdk + dynamodb + ${aws.sdk.version} + + org.scalatest diff --git a/pramen/project/Dependencies.scala b/pramen/project/Dependencies.scala index b382b0da..192ccad1 100644 --- a/pramen/project/Dependencies.scala +++ b/pramen/project/Dependencies.scala @@ -38,6 +38,7 @@ object Dependencies { "com.github.yruslan" %% "channel_scala" % channelVersion, "com.sun.mail" % "javax.mail" % javaXMailVersion, "com.lihaoyi" %% "requests" % requestsVersion, + "software.amazon.awssdk" % "dynamodb" % awsSdkVersion, "org.scala-lang.modules" %% "scala-collection-compat" % scalaCompatColsVersion % Test, "org.scalatest" %% "scalatest" % scalatestVersion % Test, "org.mockito" % "mockito-core" % mockitoVersion % Test, diff --git a/pramen/project/Versions.scala b/pramen/project/Versions.scala index 86725e46..64e7c57c 100644 --- a/pramen/project/Versions.scala +++ b/pramen/project/Versions.scala @@ -37,6 +37,7 @@ object Versions { val scalatestVersion = "3.2.14" val mockitoVersion = "2.28.2" val httpClientVersion = "4.5.14" + val awsSdkVersion = "2.42.23" def sparkFallbackVersion(scalaVersion: String): String = { if (scalaVersion.startsWith("2.11.")) { From f10c822dd3c2e203fabf38f2cb2b9145f7b262dd Mon Sep 17 00:00:00 2001 From: Ruslan Iushchenko Date: Tue, 31 Mar 2026 09:21:23 +0200 Subject: [PATCH 02/13] #181 Add implementation of the bookeeper using DynamoDB --- .../core/app/config/BookkeeperConfig.scala | 29 +- .../pramen/core/bookkeeper/Bookkeeper.scala | 13 + .../core/bookkeeper/BookkeeperDynamoDb.scala | 830 ++++++++++++++++++ .../core/BookkeepingConfigFactory.scala | 10 +- .../examples/dynamodb_bookkeeping/README.md | 340 +++++++ .../dynamodb_bookkeeping.conf | 123 +++ 6 files changed, 1338 insertions(+), 7 deletions(-) create mode 100644 pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala create mode 100644 pramen/examples/dynamodb_bookkeeping/README.md create mode 100644 pramen/examples/dynamodb_bookkeeping/dynamodb_bookkeeping.conf diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/app/config/BookkeeperConfig.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/app/config/BookkeeperConfig.scala index cf0f7108..e5434316 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/app/config/BookkeeperConfig.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/app/config/BookkeeperConfig.scala @@ -30,7 +30,10 @@ case class BookkeeperConfig( bookkeepingJdbcConfig: Option[JdbcConfig], deltaDatabase: Option[String], deltaTablePrefix: Option[String], - temporaryDirectory: Option[String] + temporaryDirectory: Option[String], + dynamoDbRegion: Option[String], + dynamoDbTableArn: Option[String], + dynamoDbTablePrefix: Option[String] ) object BookkeeperConfig { @@ -44,6 +47,9 @@ object BookkeeperConfig { val BOOKKEEPING_DB_NAME = "pramen.bookkeeping.mongodb.database" val BOOKKEEPING_DELTA_DB_NAME = "pramen.bookkeeping.delta.database" val BOOKKEEPING_DELTA_TABLE_PREFIX = "pramen.bookkeeping.delta.table.prefix" + val BOOKKEEPING_DYNAMODB_REGION = "pramen.bookkeeping.dynamodb.region" + val BOOKKEEPING_DYNAMODB_TABLE_ARN = "pramen.bookkeeping.dynamodb.table.arn" + val BOOKKEEPING_DYNAMODB_TABLE_PREFIX = "pramen.bookkeeping.dynamodb.table.prefix" val BOOKKEEPING_TEMPORARY_DIRECTORY_KEY = "pramen.temporary.directory" def fromConfig(conf: Config, allowLocalBookkepingStorage: Boolean = false): BookkeeperConfig = { @@ -56,6 +62,9 @@ object BookkeeperConfig { val temporaryDirectory = ConfigUtils.getOptionString(conf, BOOKKEEPING_TEMPORARY_DIRECTORY_KEY) val deltaDatabase = ConfigUtils.getOptionString(conf, BOOKKEEPING_DELTA_DB_NAME) val deltaTablePrefix = ConfigUtils.getOptionString(conf, BOOKKEEPING_DELTA_TABLE_PREFIX) + val dynamoDbRegion = ConfigUtils.getOptionString(conf, BOOKKEEPING_DYNAMODB_REGION) + val dynamoDbTableArn = ConfigUtils.getOptionString(conf, BOOKKEEPING_DYNAMODB_TABLE_ARN) + val dynamoDbTablePrefix = ConfigUtils.getOptionString(conf, BOOKKEEPING_DYNAMODB_TABLE_PREFIX) if (bookkeepingEnabled && bookkeepingJdbcConfig.isEmpty && bookkeepingHadoopFormat == HadoopFormat.Delta) { if (bookkeepingLocation.isEmpty && deltaTablePrefix.isEmpty) { @@ -63,7 +72,7 @@ object BookkeeperConfig { s"Preferably $BOOKKEEPING_DELTA_DB_NAME should be defined as well for managed Delta Lake tables.") } } else { - if (bookkeepingEnabled && bookkeepingConnectionString.isEmpty && bookkeepingLocation.isEmpty && bookkeepingJdbcConfig.isEmpty) { + if (bookkeepingEnabled && bookkeepingConnectionString.isEmpty && bookkeepingLocation.isEmpty && bookkeepingJdbcConfig.isEmpty && dynamoDbRegion.isEmpty) { if (allowLocalBookkepingStorage) { log.warn("Bookkeeping configuration is missing. Using the default SQLite database 'pramen.sqlite'") return BookkeeperConfig( @@ -78,10 +87,13 @@ object BookkeeperConfig { )), None, None, - temporaryDirectory + temporaryDirectory, + None, + None, + None ) } else { - throw new RuntimeException(s"One of the following should be defined: $BOOKKEEPING_PARENT.jdbc.url, $BOOKKEEPING_CONNECTION_STRING or $BOOKKEEPING_LOCATION" + + throw new RuntimeException(s"One of the following should be defined: $BOOKKEEPING_PARENT.jdbc.url, $BOOKKEEPING_CONNECTION_STRING, $BOOKKEEPING_DYNAMODB_REGION, or $BOOKKEEPING_LOCATION" + s" when bookkeeping is enabled. You can disable bookkeeping by setting $BOOKKEEPING_ENABLED = false.") } } @@ -89,6 +101,10 @@ object BookkeeperConfig { if (bookkeepingConnectionString.isDefined && bookkeepingDbName.isEmpty) { throw new RuntimeException(s"Database name is not defined. Please, define $BOOKKEEPING_DB_NAME.") } + + if (dynamoDbRegion.isDefined && dynamoDbTablePrefix.isEmpty) { + log.warn(s"DynamoDB table prefix is not defined. Using default prefix 'pramen'. You can define it with $BOOKKEEPING_DYNAMODB_TABLE_PREFIX.") + } } BookkeeperConfig( @@ -100,7 +116,10 @@ object BookkeeperConfig { bookkeepingJdbcConfig, deltaDatabase, deltaTablePrefix, - temporaryDirectory + temporaryDirectory, + dynamoDbRegion, + dynamoDbTableArn, + dynamoDbTablePrefix ) } } \ No newline at end of file diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/Bookkeeper.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/Bookkeeper.scala index 62eb8d88..04e5c233 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/Bookkeeper.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/Bookkeeper.scala @@ -90,6 +90,7 @@ object Bookkeeper { } val hasBookkeepingJdbc = bookkeepingConfig.bookkeepingJdbcConfig.exists(_.primaryUrl.isDefined) + val hasBookkeepingDynamoDb = bookkeepingConfig.dynamoDbRegion.isDefined val dbOpt = if (hasBookkeepingJdbc) { val jdbcConfig = bookkeepingConfig.bookkeepingJdbcConfig.get @@ -129,6 +130,18 @@ object Bookkeeper { new BookkeeperNull() } else if (hasBookkeepingJdbc) { BookkeeperJdbc.fromPramenDb(dbOpt.get, batchId) + } else if (hasBookkeepingDynamoDb) { + val tablePrefix = bookkeepingConfig.dynamoDbTablePrefix.getOrElse(BookkeeperDynamoDb.DEFAULT_TABLE_PREFIX) + log.info(s"Using DynamoDB for bookkeeping in region '${bookkeepingConfig.dynamoDbRegion.get}' with table prefix '$tablePrefix'") + val builder = BookkeeperDynamoDb.builder + .withRegion(bookkeepingConfig.dynamoDbRegion.get) + .withBatchId(batchId) + .withTablePrefix(tablePrefix) + val builder2 = bookkeepingConfig.dynamoDbTableArn match { + case Some(arn) => builder.withTableArn(arn) + case None => builder + } + builder2.build() } else { mongoDbConnection match { case Some(connection) => diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala new file mode 100644 index 00000000..97f1ae54 --- /dev/null +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala @@ -0,0 +1,830 @@ +/* + * Copyright 2022 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.pramen.core.bookkeeper + +import org.apache.spark.sql.types.StructType +import org.slf4j.LoggerFactory +import software.amazon.awssdk.auth.credentials.AwsCredentialsProvider +import software.amazon.awssdk.regions.Region +import software.amazon.awssdk.services.dynamodb.DynamoDbClient +import software.amazon.awssdk.services.dynamodb.model._ +import za.co.absa.pramen.core.bookkeeper.model.DataAvailability +import za.co.absa.pramen.core.model.{DataChunk, TableSchema} +import za.co.absa.pramen.core.utils.{AlgorithmUtils, TimeUtils} + +import java.net.URI +import java.time.LocalDate +import scala.collection.JavaConverters._ +import scala.util.control.NonFatal + +/** + * DynamoDB-based implementation of the Bookkeeper. + * + * Table schema for bookkeeping: + * - Partition key: tableName (String) + * - Sort key: infoDate (String in yyyy-MM-dd format) + * + * Table schema for schemas: + * - Partition key: tableName (String) + * - Sort key: infoDate (String in yyyy-MM-dd format) + * + * @param dynamoDbClient The DynamoDB client to use for operations + * @param batchId The batch ID for this execution + * @param tableArn Optional ARN prefix for DynamoDB tables (e.g., "arn:aws:dynamodb:region:account-id:table/") + * @param tablePrefix Prefix for table names to allow multiple bookkeeping sets in the same account (default: "pramen") + */ +class BookkeeperDynamoDb( + dynamoDbClient: DynamoDbClient, + batchId: Long, + tableArn: Option[String] = None, + tablePrefix: String = BookkeeperDynamoDb.DEFAULT_TABLE_PREFIX +) extends BookkeeperBase(isBookkeepingEnabled = true, batchId) { + + import BookkeeperDynamoDb._ + + private val log = LoggerFactory.getLogger(this.getClass) + private val queryWarningTimeoutMs = 10000L + + // Construct table names with prefix + private val bookkeepingTableBaseName = s"${tablePrefix}_${DEFAULT_BOOKKEEPING_TABLE}" + private val schemaTableBaseName = s"${tablePrefix}_${DEFAULT_SCHEMA_TABLE}" + + // Full table names/ARNs + private val bookkeepingTableName = getFullTableName(tableArn, bookkeepingTableBaseName) + private val schemaTableName = getFullTableName(tableArn, schemaTableBaseName) + + // Initialize tables on construction + init() + + override val bookkeepingEnabled: Boolean = true + + /** + * Initializes the DynamoDB tables for bookkeeping and schemas. + * Checks if tables exist and creates them if they don't. + */ + def init(): Unit = { + try { + log.info(s"Initializing DynamoDB bookkeeper with tables: bookkeeping='$bookkeepingTableName', schemas='$schemaTableName'") + + // Initialize bookkeeping table + if (!tableExists(bookkeepingTableBaseName)) { + log.info(s"Creating DynamoDB bookkeeping table: $bookkeepingTableBaseName") + createBookkeepingTable(bookkeepingTableBaseName) + log.info(s"Successfully created bookkeeping table: $bookkeepingTableBaseName") + } else { + log.info(s"DynamoDB bookkeeping table already exists: $bookkeepingTableBaseName") + } + + // Initialize schema table + if (!tableExists(schemaTableBaseName)) { + log.info(s"Creating DynamoDB schema table: $schemaTableBaseName") + createSchemaTable(schemaTableBaseName) + log.info(s"Successfully created schema table: $schemaTableBaseName") + } else { + log.info(s"DynamoDB schema table already exists: $schemaTableBaseName") + } + + log.info(s"DynamoDB bookkeeper initialization complete") + } catch { + case NonFatal(ex) => + log.error("Error initializing DynamoDB bookkeeper tables", ex) + throw new RuntimeException("Failed to initialize DynamoDB bookkeeper", ex) + } + } + + /** + * Checks if a DynamoDB table exists. + * + * @param tableName The name of the table to check + * @return true if the table exists, false otherwise + */ + private def tableExists(tableName: String): Boolean = { + try { + val describeRequest = DescribeTableRequest.builder() + .tableName(tableName) + .build() + + dynamoDbClient.describeTable(describeRequest) + true + } catch { + case _: ResourceNotFoundException => false + case NonFatal(ex) => + log.warn(s"Error checking if table exists: $tableName", ex) + throw ex + } + } + + /** + * Creates the bookkeeping table with the appropriate schema. + * + * @param tableName The name of the table to create + */ + private def createBookkeepingTable(tableName: String): Unit = { + val createTableRequest = CreateTableRequest.builder() + .tableName(tableName) + .keySchema( + KeySchemaElement.builder() + .attributeName(ATTR_TABLE_NAME) + .keyType(KeyType.HASH) + .build(), + KeySchemaElement.builder() + .attributeName(ATTR_INFO_DATE) + .keyType(KeyType.RANGE) + .build() + ) + .attributeDefinitions( + AttributeDefinition.builder() + .attributeName(ATTR_TABLE_NAME) + .attributeType(ScalarAttributeType.S) + .build(), + AttributeDefinition.builder() + .attributeName(ATTR_INFO_DATE) + .attributeType(ScalarAttributeType.S) + .build() + ) + .billingMode(BillingMode.PAY_PER_REQUEST) // On-demand billing + .build() + + dynamoDbClient.createTable(createTableRequest) + + // Wait for table to become active + waitForTableActive(tableName) + } + + /** + * Creates the schema table with the appropriate schema. + * + * @param tableName The name of the table to create + */ + private def createSchemaTable(tableName: String): Unit = { + val createTableRequest = CreateTableRequest.builder() + .tableName(tableName) + .keySchema( + KeySchemaElement.builder() + .attributeName(ATTR_TABLE_NAME) + .keyType(KeyType.HASH) + .build(), + KeySchemaElement.builder() + .attributeName(ATTR_INFO_DATE) + .keyType(KeyType.RANGE) + .build() + ) + .attributeDefinitions( + AttributeDefinition.builder() + .attributeName(ATTR_TABLE_NAME) + .attributeType(ScalarAttributeType.S) + .build(), + AttributeDefinition.builder() + .attributeName(ATTR_INFO_DATE) + .attributeType(ScalarAttributeType.S) + .build() + ) + .billingMode(BillingMode.PAY_PER_REQUEST) // On-demand billing + .build() + + dynamoDbClient.createTable(createTableRequest) + + // Wait for table to become active + waitForTableActive(tableName) + } + + /** + * Waits for a table to become active after creation. + * + * @param tableName The name of the table to wait for + * @param maxWaitSeconds Maximum time to wait in seconds (default: 60) + */ + private def waitForTableActive(tableName: String, maxWaitSeconds: Int = 60): Unit = { + val startTime = System.currentTimeMillis() + val maxWaitMs = maxWaitSeconds * 1000L + + var tableActive = false + while (!tableActive && (System.currentTimeMillis() - startTime) < maxWaitMs) { + try { + val describeRequest = DescribeTableRequest.builder() + .tableName(tableName) + .build() + + val response = dynamoDbClient.describeTable(describeRequest) + val status = response.table().tableStatus() + + if (status == TableStatus.ACTIVE) { + tableActive = true + log.debug(s"Table $tableName is now ACTIVE") + } else { + log.debug(s"Table $tableName status: $status, waiting...") + Thread.sleep(2000) // Wait 2 seconds before checking again + } + } catch { + case NonFatal(ex) => + log.warn(s"Error checking table status for $tableName", ex) + Thread.sleep(2000) + } + } + + if (!tableActive) { + throw new RuntimeException(s"Table $tableName did not become active within $maxWaitSeconds seconds") + } + } + + override def getLatestProcessedDateFromStorage(table: String, until: Option[LocalDate]): Option[LocalDate] = { + try { + val queryBuilder = QueryRequest.builder() + .tableName(bookkeepingTableName) + .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName") + .expressionAttributeValues(Map( + ":tableName" -> AttributeValue.builder().s(table).build() + ).asJava) + .scanIndexForward(false) // descending order + + val query = until match { + case Some(endDate) => + val endDateStr = getDateStr(endDate) + queryBuilder + .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName AND $ATTR_INFO_DATE <= :endDate") + .expressionAttributeValues(Map( + ":tableName" -> AttributeValue.builder().s(table).build(), + ":endDate" -> AttributeValue.builder().s(endDateStr).build() + ).asJava) + case None => + queryBuilder + } + + val response = dynamoDbClient.query(query.build()) + val items = response.items().asScala + + if (items.isEmpty) { + None + } else { + // Find the maximum infoDateEnd + val latestDate = items + .map(item => LocalDate.parse(item.get(ATTR_INFO_DATE_END).s())) + .maxBy(_.toEpochDay) + Some(latestDate) + } + } catch { + case NonFatal(ex) => + log.error(s"Error querying latest processed date for table '$table'", ex) + throw ex + } + } + + override def getLatestDataChunkFromStorage(table: String, infoDate: LocalDate): Option[DataChunk] = { + try { + val dateStr = getDateStr(infoDate) + + val queryRequest = QueryRequest.builder() + .tableName(bookkeepingTableName) + .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName AND $ATTR_INFO_DATE = :infoDate") + .expressionAttributeValues(Map( + ":tableName" -> AttributeValue.builder().s(table).build(), + ":infoDate" -> AttributeValue.builder().s(dateStr).build() + ).asJava) + .build() + + val response = dynamoDbClient.query(queryRequest) + val items = response.items().asScala + + if (items.isEmpty) { + None + } else { + // Sort by jobFinished descending and take the first + items + .map(itemToDataChunk) + .sortBy(-_.jobFinished) + .headOption + } + } catch { + case NonFatal(ex) => + log.error(s"Error getting latest data chunk for table '$table' at $infoDate", ex) + throw ex + } + } + + override def getDataChunksFromStorage(table: String, infoDate: LocalDate, batchIdFilter: Option[Long]): Seq[DataChunk] = { + try { + val dateStr = getDateStr(infoDate) + + val queryBuilder = QueryRequest.builder() + .tableName(bookkeepingTableName) + .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName AND $ATTR_INFO_DATE = :infoDate") + .expressionAttributeValues(Map( + ":tableName" -> AttributeValue.builder().s(table).build(), + ":infoDate" -> AttributeValue.builder().s(dateStr).build() + ).asJava) + + val query = batchIdFilter match { + case Some(bId) => + queryBuilder + .filterExpression(s"$ATTR_BATCH_ID = :batchId") + .expressionAttributeValues(Map( + ":tableName" -> AttributeValue.builder().s(table).build(), + ":infoDate" -> AttributeValue.builder().s(dateStr).build(), + ":batchId" -> AttributeValue.builder().n(bId.toString).build() + ).asJava) + case None => + queryBuilder + } + + val response = dynamoDbClient.query(query.build()) + val chunks = response.items().asScala + .map(itemToDataChunk) + .sortBy(_.jobFinished) + .toSeq + + log.debug(s"For $table ($infoDate) : ${chunks.mkString("[ ", ", ", " ]")}") + chunks + } catch { + case NonFatal(ex) => + log.error(s"Error getting data chunks for table '$table' at $infoDate", ex) + throw ex + } + } + + override def getDataChunksCountFromStorage(table: String, dateBeginOpt: Option[LocalDate], dateEndOpt: Option[LocalDate]): Long = { + try { + var count = 0L + var lastEvaluatedKey: java.util.Map[String, AttributeValue] = null + + do { + val queryBuilder = buildQueryForDateRange(table, dateBeginOpt, dateEndOpt) + .select(Select.COUNT) + + if (lastEvaluatedKey != null) { + queryBuilder.exclusiveStartKey(lastEvaluatedKey) + } + + val response = dynamoDbClient.query(queryBuilder.build()) + count += response.count() + lastEvaluatedKey = response.lastEvaluatedKey() + } while (lastEvaluatedKey != null && !lastEvaluatedKey.isEmpty) + + count + } catch { + case NonFatal(ex) => + log.error(s"Error counting data chunks for table '$table'", ex) + throw ex + } + } + + override def getDataAvailabilityFromStorage(table: String, dateBegin: LocalDate, dateEnd: LocalDate): Seq[DataAvailability] = { + try { + val allChunks = getAllChunksInDateRange(table, dateBegin, dateEnd) + + // Group by infoDate and aggregate + val grouped = allChunks.groupBy(_.infoDate) + val availability = grouped.map { case (dateStr, chunks) => + val date = LocalDate.parse(dateStr) + val totalRecords = chunks.map(_.outputRecordCount).sum + DataAvailability(date, chunks.length, totalRecords) + }.toSeq.sortBy(_.infoDate.toEpochDay) + + availability + } catch { + case NonFatal(ex) => + log.error(s"Error getting data availability for table '$table'", ex) + throw ex + } + } + + override def saveRecordCountToStorage( + table: String, + infoDate: LocalDate, + inputRecordCount: Long, + outputRecordCount: Long, + recordsAppended: Option[Long], + jobStarted: Long, + jobFinished: Long + ): Unit = { + try { + val dateStr = getDateStr(infoDate) + val item = dataChunkToItem( + DataChunk(table, dateStr, dateStr, dateStr, inputRecordCount, outputRecordCount, jobStarted, jobFinished, Some(batchId), recordsAppended) + ) + + val putRequest = PutItemRequest.builder() + .tableName(bookkeepingTableName) + .item(item) + .build() + + dynamoDbClient.putItem(putRequest) + log.debug(s"Saved bookkeeping record for table '$table', infoDate='$dateStr', batchId=$batchId") + } catch { + case NonFatal(ex) => + log.error(s"Error saving record count for table '$table' at $infoDate", ex) + throw ex + } + } + + override def deleteNonCurrentBatchRecords(table: String, infoDate: LocalDate): Unit = { + try { + val dateStr = getDateStr(infoDate) + + AlgorithmUtils.runActionWithElapsedTimeEvent(queryWarningTimeoutMs) { + // Query all items for this table and date + val queryRequest = QueryRequest.builder() + .tableName(bookkeepingTableName) + .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName AND $ATTR_INFO_DATE = :infoDate") + .expressionAttributeValues(Map( + ":tableName" -> AttributeValue.builder().s(table).build(), + ":infoDate" -> AttributeValue.builder().s(dateStr).build() + ).asJava) + .build() + + val response = dynamoDbClient.query(queryRequest) + val items = response.items().asScala + + // Filter and delete items with different batchId + items.foreach { item => + val itemBatchId = Option(item.get(ATTR_BATCH_ID)).flatMap(av => + if (av.n() != null) Some(av.n().toLong) else None + ) + + if (itemBatchId.exists(_ != batchId)) { + val deleteRequest = DeleteItemRequest.builder() + .tableName(bookkeepingTableName) + .key(Map( + ATTR_TABLE_NAME -> AttributeValue.builder().s(table).build(), + ATTR_INFO_DATE -> AttributeValue.builder().s(dateStr).build() + ).asJava) + .conditionExpression(s"$ATTR_JOB_FINISHED = :jobFinished") + .expressionAttributeValues(Map( + ":jobFinished" -> item.get(ATTR_JOB_FINISHED) + ).asJava) + .build() + + try { + dynamoDbClient.deleteItem(deleteRequest) + } catch { + case _: ConditionalCheckFailedException => + // Item was already modified or deleted, ignore + log.debug(s"Could not delete item for table '$table', date '$dateStr' - already modified") + } + } + } + } { actualTimeMs => + val elapsedTime = TimeUtils.prettyPrintElapsedTimeShort(actualTimeMs) + log.warn(s"DynamoDB query took too long ($elapsedTime) while deleting from $bookkeepingTableName, tableName='$table', infoDate='$infoDate', batchId!=$batchId") + } + } catch { + case NonFatal(ex) => + log.error(s"Error deleting non-current batch records for table '$table' at $infoDate", ex) + throw ex + } + } + + override def deleteTable(tableWithWildcard: String): Seq[String] = { + // DynamoDB implementation for wildcard deletion + // This would require scanning and deleting matching items + throw new UnsupportedOperationException("deleteTable with wildcards is not yet implemented for DynamoDB bookkeeper") + } + + override def getLatestSchema(tableName: String, until: LocalDate): Option[(StructType, LocalDate)] = { + try { + val untilDateStr = until.toString + + val queryRequest = QueryRequest.builder() + .tableName(schemaTableName) + .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName AND $ATTR_INFO_DATE <= :untilDate") + .expressionAttributeValues(Map( + ":tableName" -> AttributeValue.builder().s(tableName).build(), + ":untilDate" -> AttributeValue.builder().s(untilDateStr).build() + ).asJava) + .scanIndexForward(false) // descending order + .limit(1) + .build() + + val response = dynamoDbClient.query(queryRequest) + val items = response.items().asScala + + items.headOption.flatMap { item => + val tableSchema = TableSchema( + tableName = item.get(ATTR_TABLE_NAME).s(), + infoDate = item.get(ATTR_INFO_DATE).s(), + schemaJson = item.get(ATTR_SCHEMA_JSON).s() + ) + TableSchema.toSchemaAndDate(tableSchema) + } + } catch { + case NonFatal(ex) => + log.error(s"Error getting latest schema for table '$tableName' until $until", ex) + throw ex + } + } + + private[pramen] override def saveSchema(tableName: String, infoDate: LocalDate, schema: StructType): Unit = { + try { + val item = Map( + ATTR_TABLE_NAME -> AttributeValue.builder().s(tableName).build(), + ATTR_INFO_DATE -> AttributeValue.builder().s(infoDate.toString).build(), + ATTR_SCHEMA_JSON -> AttributeValue.builder().s(schema.json).build() + ).asJava + + val putRequest = PutItemRequest.builder() + .tableName(schemaTableName) + .item(item) + .build() + + dynamoDbClient.putItem(putRequest) + log.debug(s"Saved schema for table '$tableName', infoDate='$infoDate'") + } catch { + case NonFatal(ex) => + log.error(s"Error saving schema for table '$tableName' at $infoDate", ex) + throw ex + } + } + + override def close(): Unit = { + try { + dynamoDbClient.close() + } catch { + case NonFatal(ex) => + log.warn("Error closing DynamoDB client", ex) + } + } + + private def itemToDataChunk(item: java.util.Map[String, AttributeValue]): DataChunk = { + DataChunk( + tableName = item.get(ATTR_TABLE_NAME).s(), + infoDate = item.get(ATTR_INFO_DATE).s(), + infoDateBegin = item.get(ATTR_INFO_DATE_BEGIN).s(), + infoDateEnd = item.get(ATTR_INFO_DATE_END).s(), + inputRecordCount = item.get(ATTR_INPUT_RECORD_COUNT).n().toLong, + outputRecordCount = item.get(ATTR_OUTPUT_RECORD_COUNT).n().toLong, + jobStarted = item.get(ATTR_JOB_STARTED).n().toLong, + jobFinished = item.get(ATTR_JOB_FINISHED).n().toLong, + batchId = Option(item.get(ATTR_BATCH_ID)).flatMap(av => if (av.n() != null) Some(av.n().toLong) else None), + appendedRecordCount = Option(item.get(ATTR_APPENDED_RECORD_COUNT)).flatMap(av => if (av.n() != null) Some(av.n().toLong) else None) + ) + } + + private def dataChunkToItem(chunk: DataChunk): java.util.Map[String, AttributeValue] = { + val baseMap = Map( + ATTR_TABLE_NAME -> AttributeValue.builder().s(chunk.tableName).build(), + ATTR_INFO_DATE -> AttributeValue.builder().s(chunk.infoDate).build(), + ATTR_INFO_DATE_BEGIN -> AttributeValue.builder().s(chunk.infoDateBegin).build(), + ATTR_INFO_DATE_END -> AttributeValue.builder().s(chunk.infoDateEnd).build(), + ATTR_INPUT_RECORD_COUNT -> AttributeValue.builder().n(chunk.inputRecordCount.toString).build(), + ATTR_OUTPUT_RECORD_COUNT -> AttributeValue.builder().n(chunk.outputRecordCount.toString).build(), + ATTR_JOB_STARTED -> AttributeValue.builder().n(chunk.jobStarted.toString).build(), + ATTR_JOB_FINISHED -> AttributeValue.builder().n(chunk.jobFinished.toString).build() + ) + + val withBatchId = chunk.batchId match { + case Some(bid) => baseMap + (ATTR_BATCH_ID -> AttributeValue.builder().n(bid.toString).build()) + case None => baseMap + } + + val withAppendedCount = chunk.appendedRecordCount match { + case Some(count) => withBatchId + (ATTR_APPENDED_RECORD_COUNT -> AttributeValue.builder().n(count.toString).build()) + case None => withBatchId + } + + withAppendedCount.asJava + } + + private def buildQueryForDateRange( + table: String, + dateBeginOpt: Option[LocalDate], + dateEndOpt: Option[LocalDate] + ): QueryRequest.Builder = { + val builder = QueryRequest.builder() + .tableName(bookkeepingTableName) + + (dateBeginOpt, dateEndOpt) match { + case (Some(begin), Some(end)) => + val beginStr = getDateStr(begin) + val endStr = getDateStr(end) + builder + .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName AND $ATTR_INFO_DATE BETWEEN :beginDate AND :endDate") + .expressionAttributeValues(Map( + ":tableName" -> AttributeValue.builder().s(table).build(), + ":beginDate" -> AttributeValue.builder().s(beginStr).build(), + ":endDate" -> AttributeValue.builder().s(endStr).build() + ).asJava) + case (Some(begin), None) => + val beginStr = getDateStr(begin) + builder + .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName AND $ATTR_INFO_DATE >= :beginDate") + .expressionAttributeValues(Map( + ":tableName" -> AttributeValue.builder().s(table).build(), + ":beginDate" -> AttributeValue.builder().s(beginStr).build() + ).asJava) + case (None, Some(end)) => + val endStr = getDateStr(end) + builder + .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName AND $ATTR_INFO_DATE <= :endDate") + .expressionAttributeValues(Map( + ":tableName" -> AttributeValue.builder().s(table).build(), + ":endDate" -> AttributeValue.builder().s(endStr).build() + ).asJava) + case (None, None) => + builder + .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName") + .expressionAttributeValues(Map( + ":tableName" -> AttributeValue.builder().s(table).build() + ).asJava) + } + } + + private def getAllChunksInDateRange(table: String, dateBegin: LocalDate, dateEnd: LocalDate): Seq[DataChunk] = { + val chunks = scala.collection.mutable.ListBuffer[DataChunk]() + var lastEvaluatedKey: java.util.Map[String, AttributeValue] = null + + do { + val queryBuilder = buildQueryForDateRange(table, Some(dateBegin), Some(dateEnd)) + + if (lastEvaluatedKey != null) { + queryBuilder.exclusiveStartKey(lastEvaluatedKey) + } + + val response = dynamoDbClient.query(queryBuilder.build()) + chunks ++= response.items().asScala.map(itemToDataChunk) + lastEvaluatedKey = response.lastEvaluatedKey() + } while (lastEvaluatedKey != null && !lastEvaluatedKey.isEmpty) + + chunks.toSeq + } +} + +object BookkeeperDynamoDb { + val DEFAULT_BOOKKEEPING_TABLE = "bookkeeping" + val DEFAULT_SCHEMA_TABLE = "schemas" + val DEFAULT_TABLE_PREFIX = "pramen" + + // Attribute names for bookkeeping table + val ATTR_TABLE_NAME = "tableName" + val ATTR_INFO_DATE = "infoDate" + val ATTR_INFO_DATE_BEGIN = "infoDateBegin" + val ATTR_INFO_DATE_END = "infoDateEnd" + val ATTR_INPUT_RECORD_COUNT = "inputRecordCount" + val ATTR_OUTPUT_RECORD_COUNT = "outputRecordCount" + val ATTR_JOB_STARTED = "jobStarted" + val ATTR_JOB_FINISHED = "jobFinished" + val ATTR_BATCH_ID = "batchId" + val ATTR_APPENDED_RECORD_COUNT = "appendedRecordCount" + + // Attribute names for schema table + val ATTR_SCHEMA_JSON = "schemaJson" + + val MODEL_VERSION = 1 + + /** + * Builder for creating BookkeeperDynamoDb instances. + * Provides a fluent API for configuring DynamoDB bookkeeper. + * + * Example: + * {{{ + * val bookkeeper = BookkeeperDynamoDb.builder + * .withRegion("us-east-1") + * .withBatchId(System.currentTimeMillis()) + * .withTablePrefix("my_app") + * .build() + * }}} + */ + class BookkeeperDynamoDbBuilder { + private var region: Option[String] = None + private var batchId: Option[Long] = None + private var tableArn: Option[String] = None + private var tablePrefix: String = DEFAULT_TABLE_PREFIX + private var credentialsProvider: Option[AwsCredentialsProvider] = None + private var endpoint: Option[String] = None + + /** + * Sets the AWS region for the DynamoDB client. + * + * @param region AWS region (e.g., "us-east-1", "eu-west-1") + * @return this builder + */ + def withRegion(region: String): BookkeeperDynamoDbBuilder = { + this.region = Some(region) + this + } + + /** + * Sets the batch ID for this bookkeeper instance. + * + * @param batchId Batch ID (typically timestamp in milliseconds) + * @return this builder + */ + def withBatchId(batchId: Long): BookkeeperDynamoDbBuilder = { + this.batchId = Some(batchId) + this + } + + /** + * Sets the table ARN prefix for cross-account or cross-region access. + * + * @param arn ARN prefix (e.g., "arn:aws:dynamodb:us-east-1:123456789012:table/") + * @return this builder + */ + def withTableArn(arn: String): BookkeeperDynamoDbBuilder = { + this.tableArn = Some(arn) + this + } + + /** + * Sets the table name prefix to allow multiple bookkeeping sets in the same account. + * + * @param prefix Table name prefix (default: "pramen") + * @return this builder + */ + def withTablePrefix(prefix: String): BookkeeperDynamoDbBuilder = { + this.tablePrefix = prefix + this + } + + /** + * Sets custom AWS credentials provider. + * + * @param provider AWS credentials provider + * @return this builder + */ + def withCredentialsProvider(provider: AwsCredentialsProvider): BookkeeperDynamoDbBuilder = { + this.credentialsProvider = Some(provider) + this + } + + /** + * Sets a custom DynamoDB endpoint (useful for testing with LocalStack or DynamoDB Local). + * + * @param endpoint Endpoint URI (e.g., "http://localhost:8000") + * @return this builder + */ + def withEndpoint(endpoint: String): BookkeeperDynamoDbBuilder = { + this.endpoint = Some(endpoint) + this + } + + /** + * Builds the BookkeeperDynamoDb instance. + * + * @return Configured BookkeeperDynamoDb instance + * @throws IllegalArgumentException if required parameters are missing + */ + def build(): BookkeeperDynamoDb = { + val actualBatchId = batchId.getOrElse(System.currentTimeMillis()) + + if (region.isEmpty) { + throw new IllegalArgumentException("Either region or dynamoDbClient must be provided") + } + + val clientBuilder = DynamoDbClient.builder() + .region(Region.of(region.get)) + + credentialsProvider.foreach(clientBuilder.credentialsProvider) + + endpoint.foreach { ep => + clientBuilder.endpointOverride(URI.create(ep)) + } + + val client = clientBuilder.build() + + new BookkeeperDynamoDb( + dynamoDbClient = client, + batchId = actualBatchId, + tableArn = tableArn, + tablePrefix = tablePrefix + ) + } + } + + def builder: BookkeeperDynamoDbBuilder = new BookkeeperDynamoDbBuilder + + /** + * Constructs the full table name using ARN prefix and table name. + * If tableArn is provided, uses it as a prefix, otherwise returns just the table name. + * + * @param tableArn Optional ARN prefix for the table + * @param tableName The table name + * @return Full table name or ARN + */ + def getFullTableName(tableArn: Option[String], tableName: String): String = { + tableArn match { + case Some(arn) if arn.nonEmpty => + // If ARN ends with table/, append the table name, otherwise append /table/tableName + if (arn.endsWith("/")) { + s"${arn}table/$tableName" + } else if (arn.contains("/table/")) { + arn // ARN already includes table path + } else { + s"$arn/table/$tableName" + } + case _ => tableName + } + } +} diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/BookkeepingConfigFactory.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/BookkeepingConfigFactory.scala index 6fc1f262..17ca5bc9 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/BookkeepingConfigFactory.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/BookkeepingConfigFactory.scala @@ -28,7 +28,10 @@ object BookkeepingConfigFactory { bookkeepingJdbcConfig: Option[JdbcConfig] = None, deltaDatabase: Option[String] = None, deltaTablePrefix: Option[String] = None, - temporaryDirectory: Option[String] = None): BookkeeperConfig = { + temporaryDirectory: Option[String] = None, + dynamoDbRegion: Option[String] = None, + dynamoDbTableArn: Option[String] = None, + dynamoDbTablePrefix: Option[String] = None): BookkeeperConfig = { BookkeeperConfig( bookkeepingEnabled, bookkeepingLocation, @@ -38,7 +41,10 @@ object BookkeepingConfigFactory { bookkeepingJdbcConfig, deltaDatabase, deltaTablePrefix, - temporaryDirectory + temporaryDirectory, + dynamoDbRegion, + dynamoDbTableArn, + dynamoDbTablePrefix ) } diff --git a/pramen/examples/dynamodb_bookkeeping/README.md b/pramen/examples/dynamodb_bookkeeping/README.md new file mode 100644 index 00000000..2d587323 --- /dev/null +++ b/pramen/examples/dynamodb_bookkeeping/README.md @@ -0,0 +1,340 @@ +# DynamoDB Bookkeeping Example + +This example demonstrates how to configure Pramen to use AWS DynamoDB for bookkeeping instead of MongoDB, JDBC databases, or Hadoop-based storage. + +## Overview + +DynamoDB bookkeeping provides a serverless, fully managed solution for tracking pipeline state, record counts, and data availability in Pramen pipelines. + +### Benefits + +- **Serverless**: No database servers to manage or maintain +- **Auto-scaling**: Automatically scales to handle workload +- **Pay-per-request**: No fixed costs, pay only for what you use +- **High Availability**: Built-in replication across AWS availability zones +- **Multi-environment**: Easy separation via table prefixes +- **Automatic Table Creation**: Tables are created automatically on first run + +## Configuration + +### Minimal Configuration + +```hocon +pramen.bookkeeping { + enabled = true + dynamodb.region = "us-east-1" +} +``` + +This creates tables: +- `pramen_bookkeeping` +- `pramen_schemas` + +### Production Configuration + +```hocon +pramen.bookkeeping { + enabled = true + dynamodb.region = "us-east-1" + dynamodb.table.prefix = "pramen_production" +} +``` + +This creates tables: +- `pramen_production_bookkeeping` +- `pramen_production_schemas` + +### Multi-Environment Configuration + +**Development:** +```hocon +pramen.bookkeeping { + enabled = true + dynamodb.region = "us-east-1" + dynamodb.table.prefix = "pramen_dev" +} +``` + +**Staging:** +```hocon +pramen.bookkeeping { + enabled = true + dynamodb.region = "us-east-1" + dynamodb.table.prefix = "pramen_staging" +} +``` + +**Production:** +```hocon +pramen.bookkeeping { + enabled = true + dynamodb.region = "us-east-1" + dynamodb.table.prefix = "pramen_production" +} +``` + +### Cross-Account Configuration + +If DynamoDB tables are in a different AWS account: + +```hocon +pramen.bookkeeping { + enabled = true + dynamodb.region = "us-west-2" + dynamodb.table.arn = "arn:aws:dynamodb:us-west-2:987654321098:table/" + dynamodb.table.prefix = "shared_pramen" +} +``` + +## AWS Setup + +### 1. AWS Credentials + +Pramen uses the AWS SDK's `DefaultCredentialsProvider`, which loads credentials from: + +1. **Environment Variables**: + ```bash + export AWS_ACCESS_KEY_ID=your_access_key + export AWS_SECRET_ACCESS_KEY=your_secret_key + export AWS_REGION=us-east-1 + ``` + +2. **AWS Credentials File** (`~/.aws/credentials`): + ```ini + [default] + aws_access_key_id = your_access_key + aws_secret_access_key = your_secret_key + region = us-east-1 + ``` + +3. **IAM Role** (recommended for EC2, ECS, EMR, etc.): + - No credentials needed in configuration + - Automatically uses the instance/task role + +### 2. Required IAM Permissions + +Create an IAM policy with these permissions: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "dynamodb:CreateTable", + "dynamodb:DescribeTable", + "dynamodb:Query", + "dynamodb:PutItem", + "dynamodb:DeleteItem", + "dynamodb:GetItem", + "dynamodb:Scan" + ], + "Resource": [ + "arn:aws:dynamodb:us-east-1:*:table/pramen_*" + ] + } + ] +} +``` + +**Note**: Adjust the region and table name pattern based on your configuration. + +### 3. Table Structure + +Tables are automatically created with the following schema: + +#### Bookkeeping Table (`{prefix}_bookkeeping`) +- **Partition Key**: `tableName` (String) +- **Sort Key**: `infoDate` (String, format: yyyy-MM-dd) +- **Billing Mode**: PAY_PER_REQUEST (on-demand) +- **Attributes**: + - `tableName`: Name of the metastore table + - `infoDate`: Information date + - `infoDateBegin`: Start of date range + - `infoDateEnd`: End of date range + - `inputRecordCount`: Number of input records + - `outputRecordCount`: Number of output records + - `jobStarted`: Job start timestamp (milliseconds) + - `jobFinished`: Job finish timestamp (milliseconds) + - `batchId`: Batch execution ID + - `appendedRecordCount`: Records appended (optional) + +#### Schema Table (`{prefix}_schemas`) +- **Partition Key**: `tableName` (String) +- **Sort Key**: `infoDate` (String) +- **Billing Mode**: PAY_PER_REQUEST (on-demand) +- **Attributes**: + - `tableName`: Name of the metastore table + - `infoDate`: Date when schema was recorded + - `schemaJson`: Spark schema in JSON format + +## Running the Example + +1. **Configure AWS credentials** (see above) + +2. **Update the configuration file**: + ```bash + vi examples/dynamodb_bookkeeping/dynamodb_bookkeeping.conf + ``` + +3. **Run Pramen**: + ```bash + spark-submit \ + --class za.co.absa.pramen.runner.PipelineRunner \ + --master local[*] \ + pramen-runner_2.12-1.13.10.jar \ + --config examples/dynamodb_bookkeeping/dynamodb_bookkeeping.conf \ + --date 2024-01-15 + ``` + +4. **Verify tables were created**: + ```bash + aws dynamodb list-tables --region us-east-1 + ``` + + You should see: + - `pramen_production_bookkeeping` + - `pramen_production_schemas` + +5. **Query bookkeeping data**: + ```bash + aws dynamodb query \ + --table-name pramen_production_bookkeeping \ + --key-condition-expression "tableName = :table" \ + --expression-attribute-values '{":table":{"S":"example_table"}}' \ + --region us-east-1 + ``` + +## Cost Considerations + +DynamoDB uses pay-per-request billing: + +- **On-demand mode** (default): + - Write: $1.25 per million write requests + - Read: $0.25 per million read requests + - Storage: $0.25 per GB-month + +- **Typical Pramen workload**: + - Small pipelines: < $1/month + - Medium pipelines: $5-20/month + - Large pipelines: $50-100/month + +**Cost optimization tips**: +1. Use table prefixes to separate environments (avoid duplicating production data) +2. Archive old bookkeeping data periodically +3. Monitor usage via AWS Cost Explorer + +## Troubleshooting + +### Issue: "Access Denied" error + +**Cause**: Missing IAM permissions + +**Solution**: Verify IAM policy includes all required DynamoDB permissions + +### Issue: "Region not found" error + +**Cause**: Invalid AWS region specified + +**Solution**: Check region name in configuration matches AWS region codes +(e.g., `us-east-1`, `eu-west-1`, `ap-southeast-1`) + +### Issue: Tables not created automatically + +**Cause**: Missing `dynamodb:CreateTable` permission + +**Solution**: Add CreateTable permission to IAM policy, or manually create tables: + +```bash +# Create bookkeeping table +aws dynamodb create-table \ + --table-name pramen_production_bookkeeping \ + --attribute-definitions \ + AttributeName=tableName,AttributeType=S \ + AttributeName=infoDate,AttributeType=S \ + --key-schema \ + AttributeName=tableName,KeyType=HASH \ + AttributeName=infoDate,KeyType=RANGE \ + --billing-mode PAY_PER_REQUEST \ + --region us-east-1 + +# Create schema table +aws dynamodb create-table \ + --table-name pramen_production_schemas \ + --attribute-definitions \ + AttributeName=tableName,AttributeType=S \ + AttributeName=infoDate,AttributeType=S \ + --key-schema \ + AttributeName=tableName,KeyType=HASH \ + AttributeName=infoDate,KeyType=RANGE \ + --billing-mode PAY_PER_REQUEST \ + --region us-east-1 +``` + +### Issue: Slow queries + +**Cause**: Large number of bookkeeping records + +**Solution**: +1. Use date range filters in queries +2. Consider implementing table retention policy +3. Archive old data to S3 + +## Comparison with Other Bookkeeping Options + +| Feature | DynamoDB | JDBC | MongoDB | Hadoop/Delta | +|---------|----------|------|---------|--------------| +| Setup Complexity | Low | Medium | Medium | Low | +| Maintenance | None | High | Medium | Low | +| Cost (small) | Very Low | Medium | Medium | Very Low | +| Cost (large) | Medium | High | Medium | Low | +| Scaling | Automatic | Manual | Manual | Automatic | +| Multi-region | Yes | No | Yes | Yes | +| Query Performance | Fast | Fast | Fast | Slower | +| Incremental Support | No* | Yes | No | No | + +*Note: Offset management for incremental pipelines is not yet implemented for DynamoDB bookkeeper. + +## Advanced Topics + +### Using DynamoDB Local for Development + +For local development/testing, use DynamoDB Local: + +1. **Start DynamoDB Local**: + ```bash + docker run -p 8000:8000 amazon/dynamodb-local + ``` + +2. **Configure endpoint** (requires code modification): + ```scala + val client = DynamoDbClient.builder() + .endpointOverride(new URI("http://localhost:8000")) + .region(Region.US_EAST_1) + .build() + ``` + +### Table Backup and Restore + +Use AWS Backup or DynamoDB point-in-time recovery: + +```bash +# Enable point-in-time recovery +aws dynamodb update-continuous-backups \ + --table-name pramen_production_bookkeeping \ + --point-in-time-recovery-specification PointInTimeRecoveryEnabled=true +``` + +### Monitoring + +Monitor DynamoDB metrics in CloudWatch: +- `UserErrors` - Check for configuration issues +- `ConsumedReadCapacityUnits` / `ConsumedWriteCapacityUnits` - Monitor costs +- `SystemErrors` - Check for service issues + +## References + +- [AWS DynamoDB Documentation](https://docs.aws.amazon.com/dynamodb/) +- [AWS SDK for Java Documentation](https://docs.aws.amazon.com/sdk-for-java/) +- [DynamoDB Best Practices](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/best-practices.html) diff --git a/pramen/examples/dynamodb_bookkeeping/dynamodb_bookkeeping.conf b/pramen/examples/dynamodb_bookkeeping/dynamodb_bookkeeping.conf new file mode 100644 index 00000000..248f5e48 --- /dev/null +++ b/pramen/examples/dynamodb_bookkeeping/dynamodb_bookkeeping.conf @@ -0,0 +1,123 @@ +# Copyright 2022 ABSA Group Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# ============================================================================= +# Example Configuration: DynamoDB Bookkeeping +# ============================================================================= +# +# This example shows how to configure Pramen to use AWS DynamoDB for +# bookkeeping instead of MongoDB, JDBC, or Hadoop-based storage. +# +# DynamoDB bookkeeping provides: +# - Serverless, fully managed storage +# - Pay-per-request billing (no fixed costs) +# - Automatic scaling +# - High availability across AWS regions +# - Multi-environment support via table prefixes +# + +# General options +pramen { + environment.name = "Production" + pipeline.name = "DynamoDB Bookkeeping Example" + + # Enable bookkeeping with DynamoDB + bookkeeping { + enabled = true + + # ======================================================================= + # DynamoDB Configuration + # ======================================================================= + + # AWS Region where DynamoDB tables will be created/accessed + # REQUIRED when using DynamoDB bookkeeping + dynamodb.region = "us-east-1" + + # Table prefix for multi-environment/multi-tenant deployments + # OPTIONAL - defaults to "pramen" if not specified + # Creates tables: {prefix}_bookkeeping and {prefix}_schemas + dynamodb.table.prefix = "pramen_production" + + # Table ARN prefix for cross-account or resource-based policy access + # OPTIONAL - only needed for advanced scenarios + # Format: arn:aws:dynamodb:region:account-id:table/ + # dynamodb.table.arn = "arn:aws:dynamodb:us-east-1:123456789012:table/" + + # ======================================================================= + # Notes on DynamoDB Configuration + # ======================================================================= + # + # 1. AWS Credentials: + # - Pramen uses the AWS SDK's DefaultCredentialsProvider + # - Credentials are loaded from: + # a) Environment variables (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY) + # b) AWS credentials file (~/.aws/credentials) + # c) IAM role (when running on EC2, ECS, Lambda, etc.) + # + # 2. Required IAM Permissions: + # Your AWS credentials/role must have permissions for: + # - dynamodb:CreateTable (for automatic table creation) + # - dynamodb:DescribeTable + # - dynamodb:Query + # - dynamodb:PutItem + # - dynamodb:DeleteItem + # + # 3. Table Structure: + # Tables are automatically created with: + # - Bookkeeping: Partition key=tableName, Sort key=infoDate + # - Schemas: Partition key=tableName, Sort key=infoDate + # - Billing mode: PAY_PER_REQUEST (on-demand) + # + # 4. Multi-Environment Setup: + # Use different table prefixes for different environments: + # - Dev: dynamodb.table.prefix = "pramen_dev" + # - Staging: dynamodb.table.prefix = "pramen_staging" + # - Production: dynamodb.table.prefix = "pramen_production" + # + # 5. Cross-Account Access: + # If tables are in a different AWS account, use the table ARN: + # dynamodb.table.arn = "arn:aws:dynamodb:us-west-2:987654321098:table/" + # + + # Hadoop format is required even when using DynamoDB + # (legacy requirement, set to any valid value) + hadoop.format = "delta" + } + + # Temporary directory (optional) + temporary.directory = "/tmp" +} + +# Metastore configuration +pramen.metastore { + tables = [ + { + name = "example_table" + format = "delta" + path = "/data/lake/example_table" + } + ] +} + +# Operations +pramen.operations = [ + { + name = "Example Operation" + type = "transformation" + schedule.type = "daily" + + transformer.class = "za.co.absa.pramen.core.transformers.IdentityTransformer" + input.table = "example_table" + output.table = "example_table" + } +] From 4174cadf79b75d14ab92aa20cebba8ed38619d87 Mon Sep 17 00:00:00 2001 From: Ruslan Iushchenko Date: Tue, 31 Mar 2026 15:08:17 +0200 Subject: [PATCH 03/13] #181 Add DynamoDB-based distributed locking implementation --- .../pramen/core/bookkeeper/Bookkeeper.scala | 24 +- .../core/bookkeeper/BookkeeperDynamoDb.scala | 15 +- .../pramen/core/lock/TokenLockDynamoDb.scala | 214 +++++++++++ .../core/lock/TokenLockFactoryDynamoDb.scala | 357 ++++++++++++++++++ .../examples/dynamodb_bookkeeping/README.md | 161 ++++++++ .../dynamodb_bookkeeping.conf | 2 +- pramen/project/Dependencies.scala | 2 +- 7 files changed, 765 insertions(+), 10 deletions(-) create mode 100644 pramen/core/src/main/scala/za/co/absa/pramen/core/lock/TokenLockDynamoDb.scala create mode 100644 pramen/core/src/main/scala/za/co/absa/pramen/core/lock/TokenLockFactoryDynamoDb.scala diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/Bookkeeper.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/Bookkeeper.scala index 04e5c233..1d45285c 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/Bookkeeper.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/Bookkeeper.scala @@ -102,6 +102,14 @@ object Bookkeeper { if (hasBookkeepingJdbc) { log.info(s"Using RDB for lock management.") new TokenLockFactoryJdbc(dbOpt.get.slickDb, dbOpt.get.slickProfile) + } else if (hasBookkeepingDynamoDb) { + val tablePrefix = bookkeepingConfig.dynamoDbTablePrefix.getOrElse(BookkeeperDynamoDb.DEFAULT_TABLE_PREFIX) + log.info(s"Using DynamoDB for lock management in region '${bookkeepingConfig.dynamoDbRegion.get}' with table prefix '$tablePrefix'") + TokenLockFactoryDynamoDb.builder + .withRegion(bookkeepingConfig.dynamoDbRegion.get) + .withTablePrefix(tablePrefix) + .withTableArn(bookkeepingConfig.dynamoDbTableArn) + .build() } else { mongoDbConnection match { case Some(connection) => @@ -133,15 +141,12 @@ object Bookkeeper { } else if (hasBookkeepingDynamoDb) { val tablePrefix = bookkeepingConfig.dynamoDbTablePrefix.getOrElse(BookkeeperDynamoDb.DEFAULT_TABLE_PREFIX) log.info(s"Using DynamoDB for bookkeeping in region '${bookkeepingConfig.dynamoDbRegion.get}' with table prefix '$tablePrefix'") - val builder = BookkeeperDynamoDb.builder + BookkeeperDynamoDb.builder .withRegion(bookkeepingConfig.dynamoDbRegion.get) .withBatchId(batchId) .withTablePrefix(tablePrefix) - val builder2 = bookkeepingConfig.dynamoDbTableArn match { - case Some(arn) => builder.withTableArn(arn) - case None => builder - } - builder2.build() + .withTableArn(bookkeepingConfig.dynamoDbTableArn) + .build() } else { mongoDbConnection match { case Some(connection) => @@ -174,6 +179,9 @@ object Bookkeeper { } else if (hasBookkeepingJdbc) { log.info(s"Using RDB to keep journal of executed jobs.") new JournalJdbc(dbOpt.get.slickDb, dbOpt.get.slickProfile) + } else if (hasBookkeepingDynamoDb) { + log.info(s"The journal is DISABLED.") + new JournalNull() } else { mongoDbConnection match { case Some(connection) => @@ -216,6 +224,10 @@ object Bookkeeper { override def close(): Unit = { mongoDbConnection.foreach(_.close()) dbOpt.foreach(_.close()) + tokenFactory match { + case closeable: AutoCloseable => closeable.close() + case _ => // Not closeable + } } } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala index 97f1ae54..c8651549 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala @@ -60,8 +60,8 @@ class BookkeeperDynamoDb( private val queryWarningTimeoutMs = 10000L // Construct table names with prefix - private val bookkeepingTableBaseName = s"${tablePrefix}_${DEFAULT_BOOKKEEPING_TABLE}" - private val schemaTableBaseName = s"${tablePrefix}_${DEFAULT_SCHEMA_TABLE}" + private val bookkeepingTableBaseName = s"${tablePrefix}_$DEFAULT_BOOKKEEPING_TABLE" + private val schemaTableBaseName = s"${tablePrefix}_$DEFAULT_SCHEMA_TABLE" // Full table names/ARNs private val bookkeepingTableName = getFullTableName(tableArn, bookkeepingTableBaseName) @@ -737,6 +737,17 @@ object BookkeeperDynamoDb { this } + /** + * Sets the table ARN prefix for cross-account or cross-region access. + * + * @param arnOpt ARN prefix (e.g., "arn:aws:dynamodb:us-east-1:123456789012:table/") + * @return this builder + */ + def withTableArn(arnOpt: Option[String]): BookkeeperDynamoDbBuilder = { + this.tableArn = arnOpt + this + } + /** * Sets the table name prefix to allow multiple bookkeeping sets in the same account. * diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/lock/TokenLockDynamoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/lock/TokenLockDynamoDb.scala new file mode 100644 index 00000000..454e9090 --- /dev/null +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/lock/TokenLockDynamoDb.scala @@ -0,0 +1,214 @@ +/* + * Copyright 2022 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.pramen.core.lock + +import org.slf4j.LoggerFactory +import software.amazon.awssdk.services.dynamodb.DynamoDbClient +import software.amazon.awssdk.services.dynamodb.model._ +import za.co.absa.pramen.core.lock.model.LockTicket + +import java.time.Instant +import java.time.temporal.ChronoUnit +import scala.collection.JavaConverters._ +import scala.util.control.NonFatal +import scala.util.{Failure, Success, Try} + +object TokenLockDynamoDb { + val DEFAULT_TABLE_NAME = "pramen_locks" + + // Attribute names + val ATTR_TOKEN = "job_token" // 'token' is a reserved word in DynamoDb and can't be used as an attribute + val ATTR_OWNER = "job_owner" // 'owner' is a reserved word in DynamoDb and can't be used as an attribute + val ATTR_EXPIRES = "expiresAt" + val ATTR_CREATED_AT = "createdAt" + + val TICKETS_HARD_EXPIRE_DAYS = 1 +} + +/** + * DynamoDB-based distributed lock implementation. + * + * This lock uses DynamoDB's conditional writes to implement distributed locking. + * The lock is maintained by periodic updates to the expiration time. + * + * Table schema: + * - Partition key: token (String) + * - Attributes: owner (String), expires (Number), createdAt (Number) + * + * @param token The unique identifier for the lock + * @param dynamoDbClient The DynamoDB client to use + * @param tableName The name of the locks table + */ +class TokenLockDynamoDb( + token: String, + dynamoDbClient: DynamoDbClient, + tableName: String = TokenLockDynamoDb.DEFAULT_TABLE_NAME +) extends TokenLockBase(token) { + + import TokenLockDynamoDb._ + + private val log = LoggerFactory.getLogger(this.getClass) + + /** Invoked from a synchronized block. */ + override def tryAcquireGuardLock(retries: Int = 3, thisTry: Int = 0): Boolean = { + def tryAcquireExistingTicket(): Boolean = { + val ticketOpt = getTicket + + if (ticketOpt.isEmpty) { + log.warn(s"No ticket for $escapedToken") + tryAcquireGuardLock(retries - 1, thisTry + 1) + } else { + val ticket = ticketOpt.get + val expires = ticket.expires + val now = Instant.now().getEpochSecond + + if (expires < now) { + log.warn(s"Taking over expired ticket $escapedToken ($expires < $now)") + releaseGuardLock() + tryAcquireGuardLock(retries - 1, thisTry + 1) + } else { + false + } + } + } + + if (retries <= 0) { + log.error(s"Cannot try acquire a lock after $thisTry retries.") + false + } else { + val ok = Try(acquireGuardLock()) + + ok match { + case Success(_) => + true + case Failure(_: ConditionalCheckFailedException) => + // Lock already exists + tryAcquireExistingTicket() + case Failure(ex) => + throw new IllegalStateException(s"Unable to acquire a lock by querying DynamoDB", ex) + } + } + } + + /** Invoked from a synchronized block. */ + override def releaseGuardLock(): Unit = { + try { + val now = Instant.now() + val nowEpoch = now.getEpochSecond + val hardExpireTickets = now.minus(TICKETS_HARD_EXPIRE_DAYS, ChronoUnit.DAYS).getEpochSecond + + // Delete this ticket or any expired tickets + val deleteRequest = DeleteItemRequest.builder() + .tableName(tableName) + .key(Map( + ATTR_TOKEN -> AttributeValue.builder().s(escapedToken).build() + ).asJava) + .conditionExpression(s"$ATTR_OWNER = :jobOwner OR ($ATTR_EXPIRES < :now AND $ATTR_CREATED_AT < :hardExpire)") + .expressionAttributeValues(Map( + ":jobOwner" -> AttributeValue.builder().s(owner).build(), + ":now" -> AttributeValue.builder().n(nowEpoch.toString).build(), + ":hardExpire" -> AttributeValue.builder().n(hardExpireTickets.toString).build() + ).asJava) + .build() + + try { + dynamoDbClient.deleteItem(deleteRequest) + } catch { + case _: ConditionalCheckFailedException => + // Item doesn't match condition, ignore + log.debug(s"Could not delete ticket $escapedToken - condition not met") + } + } catch { + case NonFatal(ex) => + log.error(s"An error occurred when trying to release the lock: $escapedToken.", ex) + } + } + + /** Invoked from a synchronized block. */ + override def updateTicket(): Unit = { + val newTicket = getNewTicket + + try { + log.debug(s"Update $escapedToken to $newTicket") + + val updateRequest = UpdateItemRequest.builder() + .tableName(tableName) + .key(Map( + ATTR_TOKEN -> AttributeValue.builder().s(escapedToken).build() + ).asJava) + .updateExpression(s"SET $ATTR_EXPIRES = :expires") + .expressionAttributeValues(Map( + ":expires" -> AttributeValue.builder().n(newTicket.toString).build() + ).asJava) + .build() + + dynamoDbClient.updateItem(updateRequest) + } catch { + case NonFatal(ex) => + log.error(s"An error occurred when trying to update the lock: $escapedToken.", ex) + } + } + + /** Invoked from a synchronized block. */ + private def getTicket: Option[LockTicket] = { + try { + val getRequest = GetItemRequest.builder() + .tableName(tableName) + .key(Map( + ATTR_TOKEN -> AttributeValue.builder().s(escapedToken).build() + ).asJava) + .build() + + val response = dynamoDbClient.getItem(getRequest) + + if (response.hasItem && !response.item().isEmpty) { + val item = response.item() + Some(LockTicket( + token = item.get(ATTR_TOKEN).s(), + owner = item.get(ATTR_OWNER).s(), + expires = item.get(ATTR_EXPIRES).n().toLong, + createdAt = Option(item.get(ATTR_CREATED_AT)).map(_.n().toLong) + )) + } else { + None + } + } catch { + case NonFatal(ex) => + log.error(s"Error getting ticket for $escapedToken", ex) + None + } + } + + /** Invoked from a synchronized block. */ + private def acquireGuardLock(): Unit = { + val now = Instant.now().getEpochSecond + val item = Map( + ATTR_TOKEN -> AttributeValue.builder().s(escapedToken).build(), + ATTR_OWNER -> AttributeValue.builder().s(owner).build(), + ATTR_EXPIRES -> AttributeValue.builder().n(getNewTicket.toString).build(), + ATTR_CREATED_AT -> AttributeValue.builder().n(now.toString).build() + ).asJava + + val putRequest = PutItemRequest.builder() + .tableName(tableName) + .item(item) + .conditionExpression(s"attribute_not_exists($ATTR_TOKEN)") + .build() + + dynamoDbClient.putItem(putRequest) + } +} diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/lock/TokenLockFactoryDynamoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/lock/TokenLockFactoryDynamoDb.scala new file mode 100644 index 00000000..c8e8a150 --- /dev/null +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/lock/TokenLockFactoryDynamoDb.scala @@ -0,0 +1,357 @@ +/* + * Copyright 2022 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.pramen.core.lock + +import org.slf4j.LoggerFactory +import software.amazon.awssdk.auth.credentials.AwsCredentialsProvider +import software.amazon.awssdk.regions.Region +import software.amazon.awssdk.services.dynamodb.DynamoDbClient +import software.amazon.awssdk.services.dynamodb.model._ +import za.co.absa.pramen.api.lock.{TokenLock, TokenLockFactory} +import za.co.absa.pramen.core.bookkeeper.BookkeeperDynamoDb.DEFAULT_TABLE_PREFIX + +import java.net.URI +import scala.util.control.NonFatal + +/** + * Factory for creating DynamoDB-based distributed locks. + * + * This factory creates and manages a DynamoDB table for storing lock tickets. + * The table is created automatically if it doesn't exist. + * + * @param dynamoDbClient The DynamoDB client to use + * @param tableArn Optional ARN prefix for the locks table + * @param tablePrefix Prefix for the locks table name (default: "pramen") + */ +class TokenLockFactoryDynamoDb( + dynamoDbClient: DynamoDbClient, + tableArn: Option[String] = None, + tablePrefix: String = "pramen" +) extends TokenLockFactory with AutoCloseable { + + import TokenLockDynamoDb._ + + private val log = LoggerFactory.getLogger(this.getClass) + + // Construct table name with prefix + private val locksTableBaseName = s"${tablePrefix}_locks" + private val locksTableName = getFullTableName(tableArn, locksTableBaseName) + + // Initialize table on construction + init() + + override def getLock(token: String): TokenLock = { + new TokenLockDynamoDb(token, dynamoDbClient, locksTableBaseName) + } + + /** + * Closes the DynamoDB client. + * Should be called when the lock factory is no longer needed. + */ + override def close(): Unit = { + try { + dynamoDbClient.close() + } catch { + case NonFatal(ex) => + log.warn("Error closing DynamoDB client", ex) + } + } + + /** + * Initializes the DynamoDB locks table. + * Checks if the table exists and creates it if it doesn't. + */ + private def init(): Unit = { + try { + log.info(s"Initializing DynamoDB lock factory with table: '$locksTableName'") + + if (!tableExists(locksTableBaseName)) { + log.info(s"Creating DynamoDB locks table: $locksTableBaseName") + createLocksTable(locksTableBaseName) + log.info(s"Successfully created locks table: $locksTableBaseName") + } else { + log.info(s"DynamoDB locks table already exists: $locksTableBaseName") + } + + log.info(s"DynamoDB lock factory initialization complete") + } catch { + case NonFatal(ex) => + log.error("Error initializing DynamoDB lock factory", ex) + throw new RuntimeException("Failed to initialize DynamoDB lock factory", ex) + } + } + + /** + * Checks if a DynamoDB table exists. + * + * @param tableName The name of the table to check + * @return true if the table exists, false otherwise + */ + private def tableExists(tableName: String): Boolean = { + try { + val describeRequest = DescribeTableRequest.builder() + .tableName(tableName) + .build() + + dynamoDbClient.describeTable(describeRequest) + true + } catch { + case _: ResourceNotFoundException => false + case NonFatal(ex) => + log.warn(s"Error checking if table exists: $tableName", ex) + throw ex + } + } + + /** + * Creates the locks table with the appropriate schema. + * + * @param tableName The name of the table to create + */ + private def createLocksTable(tableName: String): Unit = { + val createTableRequest = CreateTableRequest.builder() + .tableName(tableName) + .keySchema( + KeySchemaElement.builder() + .attributeName(ATTR_TOKEN) + .keyType(KeyType.HASH) + .build() + ) + .attributeDefinitions( + AttributeDefinition.builder() + .attributeName(ATTR_TOKEN) + .attributeType(ScalarAttributeType.S) + .build() + ) + .billingMode(BillingMode.PAY_PER_REQUEST) // On-demand billing + .build() + + dynamoDbClient.createTable(createTableRequest) + + // Wait for table to become active + waitForTableActive(tableName) + } + + /** + * Waits for a table to become active after creation. + * + * @param tableName The name of the table to wait for + * @param maxWaitSeconds Maximum time to wait in seconds (default: 60) + */ + private def waitForTableActive(tableName: String, maxWaitSeconds: Int = 60): Unit = { + val startTime = System.currentTimeMillis() + val maxWaitMs = maxWaitSeconds * 1000L + + var tableActive = false + while (!tableActive && (System.currentTimeMillis() - startTime) < maxWaitMs) { + try { + val describeRequest = DescribeTableRequest.builder() + .tableName(tableName) + .build() + + val response = dynamoDbClient.describeTable(describeRequest) + val status = response.table().tableStatus() + + if (status == TableStatus.ACTIVE) { + tableActive = true + log.debug(s"Table $tableName is now ACTIVE") + } else { + log.debug(s"Table $tableName status: $status, waiting...") + Thread.sleep(2000) // Wait 2 seconds before checking again + } + } catch { + case NonFatal(ex) => + log.warn(s"Error checking table status for $tableName", ex) + Thread.sleep(2000) + } + } + + if (!tableActive) { + throw new RuntimeException(s"Table $tableName did not become active within $maxWaitSeconds seconds") + } + } + + /** + * Constructs the full table name using ARN prefix and table name. + * If tableArn is provided, uses it as a prefix, otherwise returns just the table name. + * + * @param tableArn Optional ARN prefix for the table + * @param tableName The table name + * @return Full table name or ARN + */ + private def getFullTableName(tableArn: Option[String], tableName: String): String = { + tableArn match { + case Some(arn) if arn.nonEmpty => + // If ARN ends with table/, append the table name, otherwise append /table/tableName + if (arn.endsWith("/")) { + s"${arn}table/$tableName" + } else if (arn.contains("/table/")) { + arn // ARN already includes table path + } else { + s"$arn/table/$tableName" + } + case _ => tableName + } + } +} + +object TokenLockFactoryDynamoDb { + /** + * Builder for creating TokenLockFactoryDynamoDb instances. + * Provides a fluent API for configuring DynamoDB lock factory. + * + * Example: + * {{{ + * val lockFactory = TokenLockFactoryDynamoDb.builder + * .withRegion("us-east-1") + * .withTablePrefix("my_app") + * .build() + * }}} + */ + class TokenLockFactoryDynamoDbBuilder { + private var region: Option[String] = None + private var tableArn: Option[String] = None + private var tablePrefix: String = DEFAULT_TABLE_PREFIX + private var credentialsProvider: Option[AwsCredentialsProvider] = None + private var endpoint: Option[String] = None + + /** + * Sets the AWS region for the DynamoDB client. + * + * @param region AWS region (e.g., "us-east-1", "eu-west-1") + * @return this builder + */ + def withRegion(region: String): TokenLockFactoryDynamoDbBuilder = { + this.region = Some(region) + this + } + + /** + * Sets the table ARN prefix for cross-account or cross-region access. + * + * @param arn ARN prefix (e.g., "arn:aws:dynamodb:us-east-1:123456789012:table/") + * @return this builder + */ + def withTableArn(arn: String): TokenLockFactoryDynamoDbBuilder = { + this.tableArn = Some(arn) + this + } + + /** + * Sets the table ARN prefix for cross-account or cross-region access. + * + * @param arnOpt ARN prefix (e.g., "arn:aws:dynamodb:us-east-1:123456789012:table/") + * @return this builder + */ + def withTableArn(arnOpt: Option[String]): TokenLockFactoryDynamoDbBuilder = { + this.tableArn = arnOpt + this + } + + /** + * Sets the table name prefix to allow multiple lock tables in the same account. + * + * @param prefix Table name prefix (default: "pramen") + * @return this builder + */ + def withTablePrefix(prefix: String): TokenLockFactoryDynamoDbBuilder = { + this.tablePrefix = prefix + this + } + + /** + * Sets custom AWS credentials provider. + * + * @param provider AWS credentials provider + * @return this builder + */ + def withCredentialsProvider(provider: AwsCredentialsProvider): TokenLockFactoryDynamoDbBuilder = { + this.credentialsProvider = Some(provider) + this + } + + /** + * Sets a custom DynamoDB endpoint (useful for testing with LocalStack or DynamoDB Local). + * + * @param endpoint Endpoint URI (e.g., "http://localhost:8000") + * @return this builder + */ + def withEndpoint(endpoint: String): TokenLockFactoryDynamoDbBuilder = { + this.endpoint = Some(endpoint) + this + } + + /** + * Builds the TokenLockFactoryDynamoDb instance. + * + * @return Configured TokenLockFactoryDynamoDb instance + * @throws IllegalArgumentException if required parameters are missing + */ + def build(): TokenLockFactoryDynamoDb = { + if (region.isEmpty) { + throw new IllegalArgumentException("Region must be provided") + } + + val clientBuilder = DynamoDbClient.builder() + .region(Region.of(region.get)) + + credentialsProvider.foreach(clientBuilder.credentialsProvider) + + endpoint.foreach { ep => + clientBuilder.endpointOverride(URI.create(ep)) + } + + val client = clientBuilder.build() + + new TokenLockFactoryDynamoDb( + dynamoDbClient = client, + tableArn = tableArn, + tablePrefix = tablePrefix + ) + } + } + + /** + * Creates a new builder for TokenLockFactoryDynamoDb. + * + * @return A new builder instance + */ + def builder: TokenLockFactoryDynamoDbBuilder = new TokenLockFactoryDynamoDbBuilder + + /** + * Constructs the full table name using ARN prefix and table name. + * If tableArn is provided, uses it as a prefix, otherwise returns just the table name. + * + * @param tableArn Optional ARN prefix for the table + * @param tableName The table name + * @return Full table name or ARN + */ + def getFullTableName(tableArn: Option[String], tableName: String): String = { + tableArn match { + case Some(arn) if arn.nonEmpty => + // If ARN ends with table/, append the table name, otherwise append /table/tableName + if (arn.endsWith("/")) { + s"${arn}table/$tableName" + } else if (arn.contains("/table/")) { + arn // ARN already includes table path + } else { + s"$arn/table/$tableName" + } + case _ => tableName + } + } +} diff --git a/pramen/examples/dynamodb_bookkeeping/README.md b/pramen/examples/dynamodb_bookkeeping/README.md index 2d587323..5cc76f85 100644 --- a/pramen/examples/dynamodb_bookkeeping/README.md +++ b/pramen/examples/dynamodb_bookkeeping/README.md @@ -296,6 +296,167 @@ aws dynamodb create-table \ *Note: Offset management for incremental pipelines is not yet implemented for DynamoDB bookkeeper. +## Distributed Locking with DynamoDB + +When DynamoDB is configured for bookkeeping, Pramen automatically uses it for distributed locking to prevent concurrent pipeline runs. This ensures data consistency in multi-instance deployments. + +### How It Works + +1. **Automatic Lock Table Creation**: A locks table is created automatically using a builder pattern: + - Table name: `{prefix}_locks` (e.g., `pramen_production_locks`) + - Schema: `token` (partition key), `owner`, `expires`, `createdAt` + - Created via `TokenLockFactoryDynamoDb.builder` + +2. **Lock Acquisition**: Uses DynamoDB conditional writes (`attribute_not_exists`) for atomic lock operations + +3. **Lock Renewal**: Active pipelines automatically renew their locks every 2 minutes + +4. **Lock Expiration**: Locks expire after 10 minutes of inactivity and can be taken over + +5. **Hard Expiration**: Stale locks are cleaned up after 1 day + +6. **Builder Pattern**: Lock factory is created using a fluent builder API for flexible configuration + +### Configuration + +Enable locking along with DynamoDB bookkeeping: + +```hocon +pramen { + # Enable distributed locking + runtime.use.locks = true + + bookkeeping { + enabled = true + dynamodb.region = "us-east-1" + dynamodb.table.prefix = "pramen_production" + } +} +``` + +This creates three tables: +- `pramen_production_bookkeeping` - Bookkeeping data +- `pramen_production_schemas` - Table schemas +- `pramen_production_locks` - Distributed locks + +See `dynamodb_with_locks.conf` for a complete example. + +### IAM Permissions for Locks + +Add the locks table to your IAM policy: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "dynamodb:CreateTable", + "dynamodb:DescribeTable", + "dynamodb:PutItem", + "dynamodb:GetItem", + "dynamodb:DeleteItem", + "dynamodb:UpdateItem" + ], + "Resource": [ + "arn:aws:dynamodb:*:*:table/pramen_production_bookkeeping", + "arn:aws:dynamodb:*:*:table/pramen_production_schemas", + "arn:aws:dynamodb:*:*:table/pramen_production_locks" + ] + } + ] +} +``` + +### Programmatic Usage + +You can also create lock factories programmatically using the builder pattern: + +```scala +import za.co.absa.pramen.core.lock.TokenLockFactoryDynamoDb + +// Basic usage +val lockFactory = TokenLockFactoryDynamoDb.builder + .withRegion("us-east-1") + .withTablePrefix("my_app") + .build() + +try { + val lock = lockFactory.getLock("my_pipeline") + + if (lock.tryAcquire()) { + try { + // Run your pipeline + } finally { + lock.release() + } + } +} finally { + lockFactory.close() +} + +// Testing with DynamoDB Local +val testFactory = TokenLockFactoryDynamoDb.builder + .withRegion("us-east-1") + .withEndpoint("http://localhost:8000") + .build() +``` + +See `core/src/main/scala/za/co/absa/pramen/core/lock/TokenLockFactoryDynamoDbExample.scala` for more examples. + +### Lock Behavior + +**Scenario 1: Single Pipeline Run** +- Pipeline acquires lock → processes data → releases lock + +**Scenario 2: Concurrent Pipeline Runs** +- Instance A acquires lock → starts processing +- Instance B tries to acquire same lock → blocked (lock already held) +- Instance A completes → releases lock +- Instance B can now acquire lock (if still attempting) + +**Scenario 3: Pipeline Crash** +- Pipeline acquires lock → crashes +- Lock expires after 10 minutes (no renewal) +- New pipeline run can take over expired lock + +### Monitoring Locks + +Query active locks: + +```bash +aws dynamodb scan \ + --table-name pramen_production_locks \ + --region us-east-1 +``` + +Check specific lock: + +```bash +aws dynamodb get-item \ + --table-name pramen_production_locks \ + --key '{"token":{"S":"my_pipeline_lock"}}' \ + --region us-east-1 +``` + +Manually release stuck lock (use with caution): + +```bash +aws dynamodb delete-item \ + --table-name pramen_production_locks \ + --key '{"token":{"S":"my_pipeline_lock"}}' \ + --region us-east-1 +``` + +### Lock Cost + +Lock operations add minimal cost: +- Lock acquisition: 1 write request (~$0.00000125) +- Lock renewal (every 2 min): 1 write request per renewal +- Lock release: 1 delete request (~$0.00000125) +- Total per pipeline run: ~$0.00001 (for 10-minute pipeline) + ## Advanced Topics ### Using DynamoDB Local for Development diff --git a/pramen/examples/dynamodb_bookkeeping/dynamodb_bookkeeping.conf b/pramen/examples/dynamodb_bookkeeping/dynamodb_bookkeeping.conf index 248f5e48..bd36630b 100644 --- a/pramen/examples/dynamodb_bookkeeping/dynamodb_bookkeeping.conf +++ b/pramen/examples/dynamodb_bookkeeping/dynamodb_bookkeeping.conf @@ -41,7 +41,7 @@ pramen { # AWS Region where DynamoDB tables will be created/accessed # REQUIRED when using DynamoDB bookkeeping - dynamodb.region = "us-east-1" + dynamodb.region = "af-south-1" # Table prefix for multi-environment/multi-tenant deployments # OPTIONAL - defaults to "pramen" if not specified diff --git a/pramen/project/Dependencies.scala b/pramen/project/Dependencies.scala index 192ccad1..572b2da9 100644 --- a/pramen/project/Dependencies.scala +++ b/pramen/project/Dependencies.scala @@ -29,6 +29,7 @@ object Dependencies { def CoreDependencies(scalaVersion: String, isDeltaCompile: Boolean): Seq[ModuleID] = Seq( "org.apache.spark" %% "spark-sql" % sparkVersion(scalaVersion) % Provided, + "software.amazon.awssdk" % "dynamodb" % awsSdkVersion % Provided, "org.mongodb.scala" %% "mongo-scala-driver" % mongoDbScalaDriverVersion, "com.typesafe.slick" %% "slick" % slickVersion, "com.typesafe.slick" %% "slick-hikaricp" % slickVersion, @@ -38,7 +39,6 @@ object Dependencies { "com.github.yruslan" %% "channel_scala" % channelVersion, "com.sun.mail" % "javax.mail" % javaXMailVersion, "com.lihaoyi" %% "requests" % requestsVersion, - "software.amazon.awssdk" % "dynamodb" % awsSdkVersion, "org.scala-lang.modules" %% "scala-collection-compat" % scalaCompatColsVersion % Test, "org.scalatest" %% "scalatest" % scalatestVersion % Test, "org.mockito" % "mockito-core" % mockitoVersion % Test, From 7e309387d384e85c9c975db5b9e56c5c6945081c Mon Sep 17 00:00:00 2001 From: Ruslan Iushchenko Date: Wed, 1 Apr 2026 09:18:51 +0200 Subject: [PATCH 04/13] #181 Add DynamoDB-based journal, metadata manager, and the offset manager. --- .../pramen/core/bookkeeper/Bookkeeper.scala | 33 +- .../core/bookkeeper/BookkeeperDynamoDb.scala | 11 + .../core/bookkeeper/OffsetManagerCached.scala | 2 +- .../bookkeeper/OffsetManagerDynamoDb.scala | 593 ++++++++++++++++++ .../pramen/core/journal/JournalDynamoDB.scala | 372 +++++++++++ .../core/lock/TokenLockFactoryDynamoDb.scala | 53 +- .../metadata/MetadataManagerDynamoDb.scala | 351 +++++++++++ .../examples/dynamodb_bookkeeping/README.md | 49 +- .../dynamodb_with_locks.conf | 150 +++++ 9 files changed, 1552 insertions(+), 62 deletions(-) create mode 100644 pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerDynamoDb.scala create mode 100644 pramen/core/src/main/scala/za/co/absa/pramen/core/journal/JournalDynamoDB.scala create mode 100644 pramen/core/src/main/scala/za/co/absa/pramen/core/metadata/MetadataManagerDynamoDb.scala create mode 100644 pramen/examples/dynamodb_bookkeeping/dynamodb_with_locks.conf diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/Bookkeeper.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/Bookkeeper.scala index 1d45285c..dadce5c4 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/Bookkeeper.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/Bookkeeper.scala @@ -26,7 +26,7 @@ import za.co.absa.pramen.core.app.config.{BookkeeperConfig, HadoopFormat, Runtim import za.co.absa.pramen.core.bookkeeper.model.DataAvailability import za.co.absa.pramen.core.journal._ import za.co.absa.pramen.core.lock._ -import za.co.absa.pramen.core.metadata.{MetadataManagerJdbc, MetadataManagerNull} +import za.co.absa.pramen.core.metadata.{MetadataManagerDynamoDb, MetadataManagerJdbc, MetadataManagerNull} import za.co.absa.pramen.core.model.DataChunk import za.co.absa.pramen.core.mongo.MongoDbConnection import za.co.absa.pramen.core.rdb.PramenDb @@ -180,8 +180,16 @@ object Bookkeeper { log.info(s"Using RDB to keep journal of executed jobs.") new JournalJdbc(dbOpt.get.slickDb, dbOpt.get.slickProfile) } else if (hasBookkeepingDynamoDb) { - log.info(s"The journal is DISABLED.") - new JournalNull() + val tablePrefix = bookkeepingConfig.dynamoDbTablePrefix.getOrElse(JournalDynamoDB.DEFAULT_TABLE_PREFIX) + log.info(s"Using DynamoDB for journal in region '${bookkeepingConfig.dynamoDbRegion.get}' with table prefix '$tablePrefix'") + val builder = JournalDynamoDB.builder + .withRegion(bookkeepingConfig.dynamoDbRegion.get) + .withTablePrefix(tablePrefix) + val builder2 = bookkeepingConfig.dynamoDbTableArn match { + case Some(arn) => builder.withTableArn(arn) + case None => builder + } + builder2.build() } else { mongoDbConnection match { case Some(connection) => @@ -215,6 +223,17 @@ object Bookkeeper { } else if (hasBookkeepingJdbc) { log.info(s"Using RDB to keep custom metadata.") new MetadataManagerJdbc(dbOpt.get.slickDb, dbOpt.get.slickProfile) + } else if (hasBookkeepingDynamoDb) { + val tablePrefix = bookkeepingConfig.dynamoDbTablePrefix.getOrElse(MetadataManagerDynamoDb.DEFAULT_TABLE_PREFIX) + log.info(s"Using DynamoDB for metadata in region '${bookkeepingConfig.dynamoDbRegion.get}' with table prefix '$tablePrefix'") + val builder = MetadataManagerDynamoDb.builder + .withRegion(bookkeepingConfig.dynamoDbRegion.get) + .withTablePrefix(tablePrefix) + val builder2 = bookkeepingConfig.dynamoDbTableArn match { + case Some(arn) => builder.withTableArn(arn) + case None => builder + } + builder2.build() } else { log.info(s"The custom metadata management is not supported.") new MetadataManagerNull(isPersistenceEnabled = true) @@ -228,6 +247,14 @@ object Bookkeeper { case closeable: AutoCloseable => closeable.close() case _ => // Not closeable } + journal match { + case closeable: AutoCloseable => closeable.close() + case _ => // Not closeable + } + metadataManager match { + case closeable: AutoCloseable => closeable.close() + case _ => // Not closeable + } } } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala index c8651549..832a88ab 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala @@ -67,6 +67,11 @@ class BookkeeperDynamoDb( private val bookkeepingTableName = getFullTableName(tableArn, bookkeepingTableBaseName) private val schemaTableName = getFullTableName(tableArn, schemaTableBaseName) + // Offset management + private val offsetManagement = new OffsetManagerCached( + new OffsetManagerDynamoDb(dynamoDbClient, batchId, tableArn, tablePrefix) + ) + // Initialize tables on construction init() @@ -548,8 +553,14 @@ class BookkeeperDynamoDb( } } + private[pramen] override def getOffsetManager: OffsetManager = { + offsetManagement + } + override def close(): Unit = { try { + // Note: offsetManagement wraps OffsetManagerDynamoDb which shares the same dynamoDbClient, + // so we don't need to close it separately dynamoDbClient.close() } catch { case NonFatal(ex) => diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerCached.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerCached.scala index 8da69ce3..bf85c594 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerCached.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerCached.scala @@ -25,7 +25,7 @@ import java.time.LocalDate import scala.collection.mutable /** - * The offset manager decorator handles caching or repeated queries. + * The offset manager decorator handles caching of repeated queries. */ class OffsetManagerCached(offsetManager: OffsetManager) extends OffsetManager { private val log = LoggerFactory.getLogger(this.getClass) diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerDynamoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerDynamoDb.scala new file mode 100644 index 00000000..861950fd --- /dev/null +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerDynamoDb.scala @@ -0,0 +1,593 @@ +/* + * Copyright 2022 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.pramen.core.bookkeeper + +import org.slf4j.LoggerFactory +import software.amazon.awssdk.auth.credentials.AwsCredentialsProvider +import software.amazon.awssdk.regions.Region +import software.amazon.awssdk.services.dynamodb.DynamoDbClient +import software.amazon.awssdk.services.dynamodb.model._ +import za.co.absa.pramen.api.offset.DataOffset.UncommittedOffset +import za.co.absa.pramen.api.offset.{DataOffset, OffsetType, OffsetValue} +import za.co.absa.pramen.core.bookkeeper.model._ + +import java.net.URI +import java.time.{Instant, LocalDate} +import scala.collection.JavaConverters._ +import scala.util.control.NonFatal + +/** + * DynamoDB-based offset manager for tracking incremental ingestion offsets. + * + * Table schema for offsets: + * - Partition key: pramenTableName (String) + * - Sort key: compositeKey (String) - format: "infoDate#createdAtMilli" for efficient querying + * + * The composite sort key allows: + * 1. Efficient queries for all offsets of a table+infoDate combination + * 2. Time-ordered offset records + * 3. Support for aggregation queries (fetch all offsets for a table+date) + * + * @param dynamoDbClient The DynamoDB client to use + * @param batchId The batch ID for this execution + * @param tableArn Optional ARN prefix for the offset table + * @param tablePrefix Prefix for the offset table name (default: "pramen") + */ +class OffsetManagerDynamoDb( + dynamoDbClient: DynamoDbClient, + batchId: Long, + tableArn: Option[String] = None, + tablePrefix: String = OffsetManagerDynamoDb.DEFAULT_TABLE_PREFIX +) extends OffsetManager with AutoCloseable { + + import OffsetManagerDynamoDb._ + + private val log = LoggerFactory.getLogger(this.getClass) + + private val offsetTableBaseName = s"${tablePrefix}_${DEFAULT_OFFSET_TABLE}" + private val offsetTableFullName = BookkeeperDynamoDb.getFullTableName(tableArn, offsetTableBaseName) + + // Initialize table on creation + createOffsetTableIfNotExists() + + override def getOffsets(table: String, infoDate: LocalDate): Array[DataOffset] = { + val offsets = getOffsetRecords(table, infoDate) + + if (offsets.isEmpty) { + return Array.empty + } + + offsets.map(OffsetRecordConverter.toDataOffset) + } + + override def getUncommittedOffsets(table: String, onlyForInfoDate: Option[LocalDate]): Array[UncommittedOffset] = { + try { + onlyForInfoDate match { + case Some(infoDate) => + // Query for specific table and info date + val offsets = getOffsetRecords(table, infoDate) + offsets + .filter(_.committedAtMilli.isEmpty) + .map(record => OffsetRecordConverter.toDataOffset(record).asInstanceOf[UncommittedOffset]) + + case None => + // Scan all offsets for this table (no info date filter) + val scanRequest = ScanRequest.builder() + .tableName(offsetTableFullName) + .filterExpression(s"${ATTR_PRAMEN_TABLE_NAME} = :table_name AND attribute_not_exists(${ATTR_COMMITTED_AT})") + .expressionAttributeValues(Map( + ":table_name" -> AttributeValue.builder().s(table).build() + ).asJava) + .build() + + val result = dynamoDbClient.scan(scanRequest) + + result.items().asScala + .map(itemToOffsetRecord) + .map(record => OffsetRecordConverter.toDataOffset(record).asInstanceOf[UncommittedOffset]) + .toArray + } + } catch { + case NonFatal(ex) => + throw new RuntimeException(s"Unable to read uncommitted offsets from the offset table '$offsetTableFullName'.", ex) + } + } + + override def getMaxInfoDateAndOffset(table: String, onlyForInfoDate: Option[LocalDate]): Option[DataOffsetAggregated] = { + val maxInfoDateOpt = onlyForInfoDate.orElse(getMaximumInfoDate(table)) + + try { + maxInfoDateOpt.flatMap { infoDate => + getMinMaxOffsets(table, infoDate) + } + } catch { + case NonFatal(ex) => throw new RuntimeException(s"Unable to read from the offset table '$offsetTableFullName'.", ex) + } + } + + override def startWriteOffsets(table: String, infoDate: LocalDate, offsetType: OffsetType): DataOffsetRequest = { + val createdAt = Instant.now() + val createdAtMilli = createdAt.toEpochMilli + val compositeKey = s"${infoDate.toString}#${createdAtMilli}" + + try { + val putRequest = PutItemRequest.builder() + .tableName(offsetTableFullName) + .item(Map( + ATTR_PRAMEN_TABLE_NAME -> AttributeValue.builder().s(table).build(), + ATTR_COMPOSITE_KEY -> AttributeValue.builder().s(compositeKey).build(), + ATTR_INFO_DATE -> AttributeValue.builder().s(infoDate.toString).build(), + ATTR_DATA_TYPE -> AttributeValue.builder().s(offsetType.dataTypeString).build(), + ATTR_MIN_OFFSET -> AttributeValue.builder().s("").build(), + ATTR_MAX_OFFSET -> AttributeValue.builder().s("").build(), + ATTR_BATCH_ID -> AttributeValue.builder().n(batchId.toString).build(), + ATTR_CREATED_AT -> AttributeValue.builder().n(createdAtMilli.toString).build() + ).asJava) + .build() + + dynamoDbClient.putItem(putRequest) + + DataOffsetRequest(table, infoDate, batchId, createdAt) + } catch { + case NonFatal(ex) => + throw new RuntimeException(s"Unable to write to the offset table '$offsetTableFullName'.", ex) + } + } + + override def commitOffsets(request: DataOffsetRequest, minOffset: OffsetValue, maxOffset: OffsetValue): Unit = { + val committedAt = Instant.now().toEpochMilli + val compositeKey = s"${request.infoDate.toString}#${request.createdAt.toEpochMilli}" + + try { + val updateRequest = UpdateItemRequest.builder() + .tableName(offsetTableFullName) + .key(Map( + ATTR_PRAMEN_TABLE_NAME -> AttributeValue.builder().s(request.tableName).build(), + ATTR_COMPOSITE_KEY -> AttributeValue.builder().s(compositeKey).build() + ).asJava) + .updateExpression(s"SET ${ATTR_MIN_OFFSET} = :min_offset, ${ATTR_MAX_OFFSET} = :max_offset, ${ATTR_COMMITTED_AT} = :committed_at") + .expressionAttributeValues(Map( + ":min_offset" -> AttributeValue.builder().s(minOffset.valueString).build(), + ":max_offset" -> AttributeValue.builder().s(maxOffset.valueString).build(), + ":committed_at" -> AttributeValue.builder().n(committedAt.toString).build() + ).asJava) + .build() + + dynamoDbClient.updateItem(updateRequest) + } catch { + case NonFatal(ex) => + throw new RuntimeException(s"Unable to commit offsets to the offset table '$offsetTableFullName'.", ex) + } + } + + override def commitRerun(request: DataOffsetRequest, minOffset: OffsetValue, maxOffset: OffsetValue): Unit = { + if (minOffset.compareTo(maxOffset) > 0) { + throw new IllegalArgumentException(s"minOffset is greater than maxOffset: ${minOffset.valueString} > ${maxOffset.valueString}") + } + + val committedAt = Instant.now().toEpochMilli + val compositeKey = s"${request.infoDate.toString}#${request.createdAt.toEpochMilli}" + + try { + // First, update the current offset + val updateRequest = UpdateItemRequest.builder() + .tableName(offsetTableFullName) + .key(Map( + ATTR_PRAMEN_TABLE_NAME -> AttributeValue.builder().s(request.tableName).build(), + ATTR_COMPOSITE_KEY -> AttributeValue.builder().s(compositeKey).build() + ).asJava) + .updateExpression(s"SET ${ATTR_MIN_OFFSET} = :min_offset, ${ATTR_MAX_OFFSET} = :max_offset, ${ATTR_COMMITTED_AT} = :committed_at") + .expressionAttributeValues(Map( + ":min_offset" -> AttributeValue.builder().s(minOffset.valueString).build(), + ":max_offset" -> AttributeValue.builder().s(maxOffset.valueString).build(), + ":committed_at" -> AttributeValue.builder().n(committedAt.toString).build() + ).asJava) + .build() + + dynamoDbClient.updateItem(updateRequest) + + // Then, delete all other offsets for this table and info date + val allOffsets = getOffsetRecords(request.tableName, request.infoDate) + allOffsets + .filter(r => r.createdAtMilli != request.createdAt.toEpochMilli) + .foreach { record => + val deleteCompositeKey = s"${record.infoDate}#${record.createdAtMilli}" + val deleteRequest = DeleteItemRequest.builder() + .tableName(offsetTableFullName) + .key(Map( + ATTR_PRAMEN_TABLE_NAME -> AttributeValue.builder().s(request.tableName).build(), + ATTR_COMPOSITE_KEY -> AttributeValue.builder().s(deleteCompositeKey).build() + ).asJava) + .build() + + dynamoDbClient.deleteItem(deleteRequest) + } + } catch { + case NonFatal(ex) => + throw new RuntimeException(s"Unable to commit rerun to the offset table '$offsetTableFullName'.", ex) + } + } + + override def postCommittedRecords(commitRequests: Seq[OffsetCommitRequest]): Unit = { + val committedAt = Instant.now() + val committedAtMilli = committedAt.toEpochMilli + + try { + // Insert all new committed records + commitRequests.foreach { req => + val compositeKey = s"${req.infoDate.toString}#${req.createdAt.toEpochMilli}" + + val putRequest = PutItemRequest.builder() + .tableName(offsetTableFullName) + .item(Map( + ATTR_PRAMEN_TABLE_NAME -> AttributeValue.builder().s(req.table).build(), + ATTR_COMPOSITE_KEY -> AttributeValue.builder().s(compositeKey).build(), + ATTR_INFO_DATE -> AttributeValue.builder().s(req.infoDate.toString).build(), + ATTR_DATA_TYPE -> AttributeValue.builder().s(req.minOffset.dataType.dataTypeString).build(), + ATTR_MIN_OFFSET -> AttributeValue.builder().s(req.minOffset.valueString).build(), + ATTR_MAX_OFFSET -> AttributeValue.builder().s(req.maxOffset.valueString).build(), + ATTR_BATCH_ID -> AttributeValue.builder().n(batchId.toString).build(), + ATTR_CREATED_AT -> AttributeValue.builder().n(req.createdAt.toEpochMilli.toString).build(), + ATTR_COMMITTED_AT -> AttributeValue.builder().n(committedAtMilli.toString).build() + ).asJava) + .build() + + dynamoDbClient.putItem(putRequest) + } + + // Delete old offsets for each (table, infoDate) pair + commitRequests.map(r => (r.table, r.infoDate)) + .distinct + .foreach { case (table, infoDate) => + val allOffsets = getOffsetRecords(table, infoDate) + allOffsets + .filter(_.committedAtMilli.exists(_ != committedAtMilli)) + .foreach { record => + val deleteCompositeKey = s"${record.infoDate}#${record.createdAtMilli}" + val deleteRequest = DeleteItemRequest.builder() + .tableName(offsetTableFullName) + .key(Map( + ATTR_PRAMEN_TABLE_NAME -> AttributeValue.builder().s(table).build(), + ATTR_COMPOSITE_KEY -> AttributeValue.builder().s(deleteCompositeKey).build() + ).asJava) + .build() + + dynamoDbClient.deleteItem(deleteRequest) + } + } + } catch { + case NonFatal(ex) => + throw new RuntimeException(s"Unable to post committed records to the offset table '$offsetTableFullName'.", ex) + } + } + + override def rollbackOffsets(request: DataOffsetRequest): Unit = { + val compositeKey = s"${request.infoDate.toString}#${request.createdAt.toEpochMilli}" + + try { + val deleteRequest = DeleteItemRequest.builder() + .tableName(offsetTableFullName) + .key(Map( + ATTR_PRAMEN_TABLE_NAME -> AttributeValue.builder().s(request.tableName).build(), + ATTR_COMPOSITE_KEY -> AttributeValue.builder().s(compositeKey).build() + ).asJava) + .build() + + dynamoDbClient.deleteItem(deleteRequest) + } catch { + case NonFatal(ex) => + throw new RuntimeException(s"Unable to rollback offsets in the offset table '$offsetTableFullName'.", ex) + } + } + + /** + * Gets all offset records for a table and info date. + */ + private[core] def getOffsetRecords(table: String, infoDate: LocalDate): Array[OffsetRecord] = { + try { + val queryRequest = QueryRequest.builder() + .tableName(offsetTableFullName) + .keyConditionExpression(s"${ATTR_PRAMEN_TABLE_NAME} = :table_name") + .filterExpression(s"${ATTR_INFO_DATE} = :info_date") + .expressionAttributeValues(Map( + ":table_name" -> AttributeValue.builder().s(table).build(), + ":info_date" -> AttributeValue.builder().s(infoDate.toString).build() + ).asJava) + .build() + + val result = dynamoDbClient.query(queryRequest) + + result.items().asScala + .map(itemToOffsetRecord) + .toArray + } catch { + case NonFatal(ex) => + throw new RuntimeException(s"Unable to read offset records from the offset table '$offsetTableFullName'.", ex) + } + } + + /** + * Gets the maximum information date for a table. + */ + private[core] def getMaximumInfoDate(table: String): Option[LocalDate] = { + try { + val queryRequest = QueryRequest.builder() + .tableName(offsetTableFullName) + .keyConditionExpression(s"${ATTR_PRAMEN_TABLE_NAME} = :table_name") + .expressionAttributeValues(Map( + ":table_name" -> AttributeValue.builder().s(table).build() + ).asJava) + .projectionExpression(ATTR_INFO_DATE) + .build() + + val result = dynamoDbClient.query(queryRequest) + + if (result.items().isEmpty) { + None + } else { + // Use maxBy with compareTo to avoid needing implicit Ordering + val maxInfoDate = result.items().asScala + .map(item => LocalDate.parse(item.get(ATTR_INFO_DATE).s())) + .maxBy(_.toEpochDay) + + Some(maxInfoDate) + } + } catch { + case NonFatal(ex) => + throw new RuntimeException(s"Unable to read maximum info date from the offset table '$offsetTableFullName'.", ex) + } + } + + /** + * Gets min/max offsets for a table and info date, with all offset records for that day. + */ + private[core] def getMinMaxOffsets(table: String, infoDate: LocalDate): Option[DataOffsetAggregated] = { + val offsets = getOffsetRecords(table, infoDate).filter(_.committedAtMilli.nonEmpty) + + if (offsets.isEmpty) { + return None + } + + validateOffsets(table, infoDate, offsets) + + val (minOffset, maxOffset) = getMinMaxOffsets(offsets) + + Some(DataOffsetAggregated(table, infoDate, minOffset, maxOffset, offsets.map(OffsetRecordConverter.toDataOffset))) + } + + /** + * Gets min/max offsets from an array of offset records. + */ + private[core] def getMinMaxOffsets(offsets: Array[OffsetRecord]): (OffsetValue, OffsetValue) = { + val offsetDataType = offsets.head.dataType + val minOffset = offsets.flatMap(or => OffsetValue.fromString(offsetDataType, or.minOffset)).min + val maxOffset = offsets.flatMap(or => OffsetValue.fromString(offsetDataType, or.maxOffset)).max + + (minOffset, maxOffset) + } + + /** + * Validates offsets for inconsistencies (e.g., inconsistent offset value types). + */ + private[core] def validateOffsets(table: String, infoDate: LocalDate, offsets: Array[OffsetRecord]): Unit = { + val inconsistentOffsets = offsets.groupBy(_.dataType).keys.toArray.sorted + if (inconsistentOffsets.length > 1) { + throw new RuntimeException(s"Inconsistent offset value types found for $table at $infoDate: ${inconsistentOffsets.mkString(", ")}") + } + } + + /** + * Converts a DynamoDB item to an OffsetRecord. + */ + private def itemToOffsetRecord(item: java.util.Map[String, AttributeValue]): OffsetRecord = { + val pramenTableName = item.get(ATTR_PRAMEN_TABLE_NAME).s() + val infoDate = item.get(ATTR_INFO_DATE).s() + val dataType = item.get(ATTR_DATA_TYPE).s() + val minOffset = item.get(ATTR_MIN_OFFSET).s() + val maxOffset = item.get(ATTR_MAX_OFFSET).s() + val batchId = item.get(ATTR_BATCH_ID).n().toLong + val createdAtMilli = item.get(ATTR_CREATED_AT).n().toLong + val committedAtMilli = Option(item.get(ATTR_COMMITTED_AT)).map(_.n().toLong) + + OffsetRecord(pramenTableName, infoDate, dataType, minOffset, maxOffset, batchId, createdAtMilli, committedAtMilli) + } + + /** + * Creates the offset table if it doesn't exist. + */ + private def createOffsetTableIfNotExists(): Unit = { + try { + val describeRequest = DescribeTableRequest.builder() + .tableName(offsetTableFullName) + .build() + + dynamoDbClient.describeTable(describeRequest) + log.info(s"Offset table '$offsetTableFullName' already exists") + } catch { + case _: ResourceNotFoundException => + log.info(s"Creating offset table '$offsetTableFullName'") + createOffsetTable() + case NonFatal(ex) => + log.error(s"Error checking if offset table exists", ex) + throw ex + } + } + + /** + * Creates the offset table in DynamoDB. + */ + private def createOffsetTable(): Unit = { + val createRequest = CreateTableRequest.builder() + .tableName(offsetTableFullName) + .attributeDefinitions( + AttributeDefinition.builder() + .attributeName(ATTR_PRAMEN_TABLE_NAME) + .attributeType(ScalarAttributeType.S) + .build(), + AttributeDefinition.builder() + .attributeName(ATTR_COMPOSITE_KEY) + .attributeType(ScalarAttributeType.S) + .build() + ) + .keySchema( + KeySchemaElement.builder() + .attributeName(ATTR_PRAMEN_TABLE_NAME) + .keyType(KeyType.HASH) + .build(), + KeySchemaElement.builder() + .attributeName(ATTR_COMPOSITE_KEY) + .keyType(KeyType.RANGE) + .build() + ) + .billingMode(BillingMode.PAY_PER_REQUEST) + .build() + + dynamoDbClient.createTable(createRequest) + waitForTableActive(offsetTableFullName) + log.info(s"Offset table '$offsetTableFullName' created successfully") + } + + /** + * Waits for a table to become ACTIVE. + */ + private def waitForTableActive(tableName: String, maxAttempts: Int = 30): Unit = { + var attempts = 0 + var isActive = false + + while (attempts < maxAttempts && !isActive) { + try { + val describeRequest = DescribeTableRequest.builder() + .tableName(tableName) + .build() + + val response = dynamoDbClient.describeTable(describeRequest) + val status = response.table().tableStatus() + + if (status == TableStatus.ACTIVE) { + isActive = true + } else { + Thread.sleep(1000) + attempts += 1 + } + } catch { + case NonFatal(ex) => + log.warn(s"Error waiting for table '$tableName' to become active", ex) + Thread.sleep(1000) + attempts += 1 + } + } + + if (!isActive) { + throw new RuntimeException(s"Table '$tableName' did not become ACTIVE after $maxAttempts attempts") + } + } + + /** + * Closes the DynamoDB client. + */ + override def close(): Unit = { + try { + dynamoDbClient.close() + } catch { + case NonFatal(ex) => + log.warn("Error closing DynamoDB client", ex) + } + } +} + +object OffsetManagerDynamoDb { + val DEFAULT_OFFSET_TABLE = "offsets" + val DEFAULT_TABLE_PREFIX = "pramen" + + // Attribute names for offset table + val ATTR_PRAMEN_TABLE_NAME = "pramenTableName" + val ATTR_COMPOSITE_KEY = "compositeKey" // Format: "infoDate#createdAtMilli" + val ATTR_INFO_DATE = "infoDate" + val ATTR_DATA_TYPE = "dataType" + val ATTR_MIN_OFFSET = "minOffset" + val ATTR_MAX_OFFSET = "maxOffset" + val ATTR_BATCH_ID = "batchId" + val ATTR_CREATED_AT = "createdAt" + val ATTR_COMMITTED_AT = "committedAt" + + /** + * Builder for creating OffsetManagerDynamoDb instances. + */ + class OffsetManagerDynamoDbBuilder { + private var region: Option[String] = None + private var tableArn: Option[String] = None + private var tablePrefix: String = DEFAULT_TABLE_PREFIX + private var credentialsProvider: Option[AwsCredentialsProvider] = None + private var endpoint: Option[String] = None + private var batchId: Long = System.currentTimeMillis() + + def withRegion(region: String): OffsetManagerDynamoDbBuilder = { + this.region = Some(region) + this + } + + def withTableArn(arn: String): OffsetManagerDynamoDbBuilder = { + this.tableArn = Some(arn) + this + } + + def withTablePrefix(prefix: String): OffsetManagerDynamoDbBuilder = { + this.tablePrefix = prefix + this + } + + def withCredentialsProvider(provider: AwsCredentialsProvider): OffsetManagerDynamoDbBuilder = { + this.credentialsProvider = Some(provider) + this + } + + def withEndpoint(endpoint: String): OffsetManagerDynamoDbBuilder = { + this.endpoint = Some(endpoint) + this + } + + def withBatchId(batchId: Long): OffsetManagerDynamoDbBuilder = { + this.batchId = batchId + this + } + + def build(): OffsetManagerDynamoDb = { + if (region.isEmpty) { + throw new IllegalArgumentException("Region must be provided") + } + + val clientBuilder = DynamoDbClient.builder() + .region(Region.of(region.get)) + + credentialsProvider.foreach(clientBuilder.credentialsProvider) + endpoint.foreach { ep => + clientBuilder.endpointOverride(URI.create(ep)) + } + + val client = clientBuilder.build() + + new OffsetManagerDynamoDb( + dynamoDbClient = client, + batchId = batchId, + tableArn = tableArn, + tablePrefix = tablePrefix + ) + } + } + + def builder: OffsetManagerDynamoDbBuilder = new OffsetManagerDynamoDbBuilder +} diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/JournalDynamoDB.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/JournalDynamoDB.scala new file mode 100644 index 00000000..15fe0702 --- /dev/null +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/JournalDynamoDB.scala @@ -0,0 +1,372 @@ +/* + * Copyright 2022 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.pramen.core.journal + +import org.slf4j.LoggerFactory +import software.amazon.awssdk.auth.credentials.AwsCredentialsProvider +import software.amazon.awssdk.regions.Region +import software.amazon.awssdk.services.dynamodb.DynamoDbClient +import software.amazon.awssdk.services.dynamodb.model._ +import za.co.absa.pramen.core.app.config.InfoDateConfig +import za.co.absa.pramen.core.bookkeeper.BookkeeperDynamoDb +import za.co.absa.pramen.core.journal.model.TaskCompleted + +import java.net.URI +import java.time.{Instant, LocalDate} +import scala.collection.JavaConverters._ +import scala.util.control.NonFatal + +/** + * DynamoDB-based journal for tracking completed tasks. + * + * This journal stores task completion records in a DynamoDB table with automatic table creation. + * + * @param dynamoDbClient The DynamoDB client to use + * @param tableArn Optional ARN prefix for the journal table + * @param tablePrefix Prefix for the journal table name (default: "pramen") + */ +class JournalDynamoDB( + dynamoDbClient: DynamoDbClient, + tableArn: Option[String] = None, + tablePrefix: String = JournalDynamoDB.DEFAULT_TABLE_PREFIX +) extends Journal with AutoCloseable { + + private val log = LoggerFactory.getLogger(this.getClass) + private val dateFormatter = InfoDateConfig.defaultDateFormatter + + private val journalTableBaseName = s"${tablePrefix}_${JournalDynamoDB.DEFAULT_JOURNAL_TABLE}" + private val journalTableFullName = BookkeeperDynamoDb.getFullTableName(tableArn, journalTableBaseName) + + // Initialize table on creation + createJournalTableIfNotExists() + + /** + * Add a task completion entry to the journal. + * Failure reason is truncated to 4KB to fit DynamoDB item size limits. + */ + override def addEntry(entry: TaskCompleted): Unit = { + val periodBegin = entry.periodBegin.format(dateFormatter) + val periodEnd = entry.periodEnd.format(dateFormatter) + val infoDate = entry.informationDate.format(dateFormatter) + + // Truncate failure reason to 4KB maximum + val truncatedFailureReason = entry.failureReason.map { reason => + if (reason.length > JournalDynamoDB.MAX_FAILURE_REASON_LENGTH) { + val truncated = reason.substring(0, JournalDynamoDB.MAX_FAILURE_REASON_LENGTH - 20) + truncated + "\n[... truncated ...]" + } else { + reason + } + } + + val itemBuilder = Map.newBuilder[String, AttributeValue] + + // Primary key: composite of jobName and finishedAt (for sorting by time) + itemBuilder += (JournalDynamoDB.ATTR_JOB_NAME -> AttributeValue.builder().s(entry.jobName).build()) + itemBuilder += (JournalDynamoDB.ATTR_FINISHED_AT -> AttributeValue.builder().n(entry.finishedAt.toString).build()) + + // Attributes + itemBuilder += (JournalDynamoDB.ATTR_TABLE_NAME -> AttributeValue.builder().s(entry.tableName).build()) + itemBuilder += (JournalDynamoDB.ATTR_PERIOD_BEGIN -> AttributeValue.builder().s(periodBegin).build()) + itemBuilder += (JournalDynamoDB.ATTR_PERIOD_END -> AttributeValue.builder().s(periodEnd).build()) + itemBuilder += (JournalDynamoDB.ATTR_INFO_DATE -> AttributeValue.builder().s(infoDate).build()) + itemBuilder += (JournalDynamoDB.ATTR_INPUT_RECORD_COUNT -> AttributeValue.builder().n(entry.inputRecordCount.getOrElse(-1L).toString).build()) + itemBuilder += (JournalDynamoDB.ATTR_INPUT_RECORD_COUNT_OLD -> AttributeValue.builder().n(entry.inputRecordCountOld.getOrElse(-1L).toString).build()) + + entry.outputRecordCount.foreach(v => itemBuilder += (JournalDynamoDB.ATTR_OUTPUT_RECORD_COUNT -> AttributeValue.builder().n(v.toString).build())) + entry.outputRecordCountOld.foreach(v => itemBuilder += (JournalDynamoDB.ATTR_OUTPUT_RECORD_COUNT_OLD -> AttributeValue.builder().n(v.toString).build())) + entry.appendedRecordCount.foreach(v => itemBuilder += (JournalDynamoDB.ATTR_APPENDED_RECORD_COUNT -> AttributeValue.builder().n(v.toString).build())) + entry.outputSize.foreach(v => itemBuilder += (JournalDynamoDB.ATTR_OUTPUT_SIZE -> AttributeValue.builder().n(v.toString).build())) + + itemBuilder += (JournalDynamoDB.ATTR_STARTED_AT -> AttributeValue.builder().n(entry.startedAt.toString).build()) + itemBuilder += (JournalDynamoDB.ATTR_STATUS -> AttributeValue.builder().s(entry.status).build()) + + truncatedFailureReason.foreach(v => itemBuilder += (JournalDynamoDB.ATTR_FAILURE_REASON -> AttributeValue.builder().s(v).build())) + entry.sparkApplicationId.foreach(v => itemBuilder += (JournalDynamoDB.ATTR_SPARK_APP_ID -> AttributeValue.builder().s(v).build())) + entry.pipelineId.foreach(v => itemBuilder += (JournalDynamoDB.ATTR_PIPELINE_ID -> AttributeValue.builder().s(v).build())) + entry.pipelineName.foreach(v => itemBuilder += (JournalDynamoDB.ATTR_PIPELINE_NAME -> AttributeValue.builder().s(v).build())) + entry.environmentName.foreach(v => itemBuilder += (JournalDynamoDB.ATTR_ENVIRONMENT_NAME -> AttributeValue.builder().s(v).build())) + entry.tenant.foreach(v => itemBuilder += (JournalDynamoDB.ATTR_TENANT -> AttributeValue.builder().s(v).build())) + entry.country.foreach(v => itemBuilder += (JournalDynamoDB.ATTR_COUNTRY -> AttributeValue.builder().s(v).build())) + + itemBuilder += (JournalDynamoDB.ATTR_BATCH_ID -> AttributeValue.builder().n(entry.batchId.toString).build()) + + try { + val putRequest = PutItemRequest.builder() + .tableName(journalTableFullName) + .item(itemBuilder.result().asJava) + .build() + + dynamoDbClient.putItem(putRequest) + } catch { + case NonFatal(ex) => + log.error(s"Unable to write to the journal table '$journalTableFullName'.", ex) + } + } + + /** + * Get journal entries within a time range. + */ + override def getEntries(from: Instant, to: Instant): Seq[TaskCompleted] = { + val fromSec = from.getEpochSecond + val toSec = to.getEpochSecond + + try { + val scanRequest = ScanRequest.builder() + .tableName(journalTableFullName) + .filterExpression(s"${JournalDynamoDB.ATTR_FINISHED_AT} >= :from_time AND ${JournalDynamoDB.ATTR_FINISHED_AT} <= :to_time") + .expressionAttributeValues(Map( + ":from_time" -> AttributeValue.builder().n(fromSec.toString).build(), + ":to_time" -> AttributeValue.builder().n(toSec.toString).build() + ).asJava) + .build() + + val result = dynamoDbClient.scan(scanRequest) + + result.items().asScala.map { item => + val getS = (attr: String) => Option(item.get(attr)).map(_.s()) + val getN = (attr: String) => Option(item.get(attr)).map(_.n().toLong) + + val inputRecordCount = getN(JournalDynamoDB.ATTR_INPUT_RECORD_COUNT).flatMap(v => if (v < 0) None else Some(v)) + val inputRecordCountOld = getN(JournalDynamoDB.ATTR_INPUT_RECORD_COUNT_OLD).flatMap(v => if (v < 0) None else Some(v)) + + TaskCompleted( + jobName = item.get(JournalDynamoDB.ATTR_JOB_NAME).s(), + tableName = item.get(JournalDynamoDB.ATTR_TABLE_NAME).s(), + periodBegin = LocalDate.parse(item.get(JournalDynamoDB.ATTR_PERIOD_BEGIN).s(), dateFormatter), + periodEnd = LocalDate.parse(item.get(JournalDynamoDB.ATTR_PERIOD_END).s(), dateFormatter), + informationDate = LocalDate.parse(item.get(JournalDynamoDB.ATTR_INFO_DATE).s(), dateFormatter), + inputRecordCount = inputRecordCount, + inputRecordCountOld = inputRecordCountOld, + outputRecordCount = getN(JournalDynamoDB.ATTR_OUTPUT_RECORD_COUNT), + outputRecordCountOld = getN(JournalDynamoDB.ATTR_OUTPUT_RECORD_COUNT_OLD), + appendedRecordCount = getN(JournalDynamoDB.ATTR_APPENDED_RECORD_COUNT), + outputSize = getN(JournalDynamoDB.ATTR_OUTPUT_SIZE), + startedAt = item.get(JournalDynamoDB.ATTR_STARTED_AT).n().toLong, + finishedAt = item.get(JournalDynamoDB.ATTR_FINISHED_AT).n().toLong, + status = item.get(JournalDynamoDB.ATTR_STATUS).s(), + failureReason = getS(JournalDynamoDB.ATTR_FAILURE_REASON), + sparkApplicationId = getS(JournalDynamoDB.ATTR_SPARK_APP_ID), + pipelineId = getS(JournalDynamoDB.ATTR_PIPELINE_ID), + pipelineName = getS(JournalDynamoDB.ATTR_PIPELINE_NAME), + environmentName = getS(JournalDynamoDB.ATTR_ENVIRONMENT_NAME), + tenant = getS(JournalDynamoDB.ATTR_TENANT), + country = getS(JournalDynamoDB.ATTR_COUNTRY), + batchId = getN(JournalDynamoDB.ATTR_BATCH_ID).getOrElse(0L) + ) + }.toSeq + } catch { + case NonFatal(ex) => + log.error(s"Unable to read from the journal table '$journalTableFullName'.", ex) + Seq.empty + } + } + + /** + * Creates the journal table if it doesn't exist. + */ + private def createJournalTableIfNotExists(): Unit = { + try { + val describeRequest = DescribeTableRequest.builder() + .tableName(journalTableFullName) + .build() + + dynamoDbClient.describeTable(describeRequest) + log.info(s"Journal table '$journalTableFullName' already exists") + } catch { + case _: ResourceNotFoundException => + log.info(s"Creating journal table '$journalTableFullName'") + createJournalTable() + case NonFatal(ex) => + log.error(s"Error checking if journal table exists", ex) + throw ex + } + } + + /** + * Creates the journal table in DynamoDB. + */ + private def createJournalTable(): Unit = { + val createRequest = CreateTableRequest.builder() + .tableName(journalTableFullName) + .attributeDefinitions( + AttributeDefinition.builder() + .attributeName(JournalDynamoDB.ATTR_JOB_NAME) + .attributeType(ScalarAttributeType.S) + .build(), + AttributeDefinition.builder() + .attributeName(JournalDynamoDB.ATTR_FINISHED_AT) + .attributeType(ScalarAttributeType.N) + .build() + ) + .keySchema( + KeySchemaElement.builder() + .attributeName(JournalDynamoDB.ATTR_JOB_NAME) + .keyType(KeyType.HASH) + .build(), + KeySchemaElement.builder() + .attributeName(JournalDynamoDB.ATTR_FINISHED_AT) + .keyType(KeyType.RANGE) + .build() + ) + .billingMode(BillingMode.PAY_PER_REQUEST) + .build() + + dynamoDbClient.createTable(createRequest) + waitForTableActive(journalTableFullName) + log.info(s"Journal table '$journalTableFullName' created successfully") + } + + /** + * Waits for a table to become ACTIVE. + */ + private def waitForTableActive(tableName: String, maxAttempts: Int = 30): Unit = { + var attempts = 0 + var isActive = false + + while (attempts < maxAttempts && !isActive) { + try { + val describeRequest = DescribeTableRequest.builder() + .tableName(tableName) + .build() + + val response = dynamoDbClient.describeTable(describeRequest) + val status = response.table().tableStatus() + + if (status == TableStatus.ACTIVE) { + isActive = true + } else { + Thread.sleep(1000) + attempts += 1 + } + } catch { + case NonFatal(ex) => + log.warn(s"Error waiting for table '$tableName' to become active", ex) + Thread.sleep(1000) + attempts += 1 + } + } + + if (!isActive) { + throw new RuntimeException(s"Table '$tableName' did not become ACTIVE after $maxAttempts attempts") + } + } + + /** + * Closes the DynamoDB client. + */ + override def close(): Unit = { + try { + dynamoDbClient.close() + } catch { + case NonFatal(ex) => + log.warn("Error closing DynamoDB client", ex) + } + } +} + +object JournalDynamoDB { + val DEFAULT_JOURNAL_TABLE = "journal" + val DEFAULT_TABLE_PREFIX = "pramen" + + // Maximum length for failure reason (4KB minus some overhead) + val MAX_FAILURE_REASON_LENGTH = 4000 + + // Attribute names for journal table + val ATTR_JOB_NAME = "jobName" + val ATTR_TABLE_NAME = "tableName" + val ATTR_PERIOD_BEGIN = "periodBegin" + val ATTR_PERIOD_END = "periodEnd" + val ATTR_INFO_DATE = "infoDate" + val ATTR_INPUT_RECORD_COUNT = "inputRecordCount" + val ATTR_INPUT_RECORD_COUNT_OLD = "inputRecordCountOld" + val ATTR_OUTPUT_RECORD_COUNT = "outputRecordCount" + val ATTR_OUTPUT_RECORD_COUNT_OLD = "outputRecordCountOld" + val ATTR_APPENDED_RECORD_COUNT = "appendedRecordCount" + val ATTR_OUTPUT_SIZE = "outputSize" + val ATTR_STARTED_AT = "startedAt" + val ATTR_FINISHED_AT = "finishedAt" + val ATTR_STATUS = "status" + val ATTR_FAILURE_REASON = "failureReason" + val ATTR_SPARK_APP_ID = "sparkApplicationId" + val ATTR_PIPELINE_ID = "pipelineId" + val ATTR_PIPELINE_NAME = "pipelineName" + val ATTR_ENVIRONMENT_NAME = "environmentName" + val ATTR_TENANT = "tenant" + val ATTR_COUNTRY = "country" + val ATTR_BATCH_ID = "batchId" + + /** + * Builder for creating JournalDynamoDB instances. + */ + class JournalDynamoDBBuilder { + private var region: Option[String] = None + private var tableArn: Option[String] = None + private var tablePrefix: String = DEFAULT_TABLE_PREFIX + private var credentialsProvider: Option[AwsCredentialsProvider] = None + private var endpoint: Option[String] = None + + def withRegion(region: String): JournalDynamoDBBuilder = { + this.region = Some(region) + this + } + + def withTableArn(arn: String): JournalDynamoDBBuilder = { + this.tableArn = Some(arn) + this + } + + def withTablePrefix(prefix: String): JournalDynamoDBBuilder = { + this.tablePrefix = prefix + this + } + + def withCredentialsProvider(provider: AwsCredentialsProvider): JournalDynamoDBBuilder = { + this.credentialsProvider = Some(provider) + this + } + + def withEndpoint(endpoint: String): JournalDynamoDBBuilder = { + this.endpoint = Some(endpoint) + this + } + + def build(): JournalDynamoDB = { + if (region.isEmpty) { + throw new IllegalArgumentException("Region must be provided") + } + + val clientBuilder = DynamoDbClient.builder() + .region(Region.of(region.get)) + + credentialsProvider.foreach(clientBuilder.credentialsProvider) + endpoint.foreach { ep => + clientBuilder.endpointOverride(URI.create(ep)) + } + + val client = clientBuilder.build() + + new JournalDynamoDB( + dynamoDbClient = client, + tableArn = tableArn, + tablePrefix = tablePrefix + ) + } + } + + def builder: JournalDynamoDBBuilder = new JournalDynamoDBBuilder +} diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/lock/TokenLockFactoryDynamoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/lock/TokenLockFactoryDynamoDb.scala index c8e8a150..8162413e 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/lock/TokenLockFactoryDynamoDb.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/lock/TokenLockFactoryDynamoDb.scala @@ -22,7 +22,7 @@ import software.amazon.awssdk.regions.Region import software.amazon.awssdk.services.dynamodb.DynamoDbClient import software.amazon.awssdk.services.dynamodb.model._ import za.co.absa.pramen.api.lock.{TokenLock, TokenLockFactory} -import za.co.absa.pramen.core.bookkeeper.BookkeeperDynamoDb.DEFAULT_TABLE_PREFIX +import za.co.absa.pramen.core.bookkeeper.BookkeeperDynamoDb import java.net.URI import scala.util.control.NonFatal @@ -42,6 +42,7 @@ class TokenLockFactoryDynamoDb( tableArn: Option[String] = None, tablePrefix: String = "pramen" ) extends TokenLockFactory with AutoCloseable { + import TokenLockFactoryDynamoDb._ import TokenLockDynamoDb._ @@ -49,7 +50,7 @@ class TokenLockFactoryDynamoDb( // Construct table name with prefix private val locksTableBaseName = s"${tablePrefix}_locks" - private val locksTableName = getFullTableName(tableArn, locksTableBaseName) + private val locksTableName = BookkeeperDynamoDb.getFullTableName(tableArn, locksTableBaseName) // Initialize table on construction init() @@ -184,29 +185,6 @@ class TokenLockFactoryDynamoDb( throw new RuntimeException(s"Table $tableName did not become active within $maxWaitSeconds seconds") } } - - /** - * Constructs the full table name using ARN prefix and table name. - * If tableArn is provided, uses it as a prefix, otherwise returns just the table name. - * - * @param tableArn Optional ARN prefix for the table - * @param tableName The table name - * @return Full table name or ARN - */ - private def getFullTableName(tableArn: Option[String], tableName: String): String = { - tableArn match { - case Some(arn) if arn.nonEmpty => - // If ARN ends with table/, append the table name, otherwise append /table/tableName - if (arn.endsWith("/")) { - s"${arn}table/$tableName" - } else if (arn.contains("/table/")) { - arn // ARN already includes table path - } else { - s"$arn/table/$tableName" - } - case _ => tableName - } - } } object TokenLockFactoryDynamoDb { @@ -225,7 +203,7 @@ object TokenLockFactoryDynamoDb { class TokenLockFactoryDynamoDbBuilder { private var region: Option[String] = None private var tableArn: Option[String] = None - private var tablePrefix: String = DEFAULT_TABLE_PREFIX + private var tablePrefix: String = BookkeeperDynamoDb.DEFAULT_TABLE_PREFIX private var credentialsProvider: Option[AwsCredentialsProvider] = None private var endpoint: Option[String] = None @@ -331,27 +309,4 @@ object TokenLockFactoryDynamoDb { * @return A new builder instance */ def builder: TokenLockFactoryDynamoDbBuilder = new TokenLockFactoryDynamoDbBuilder - - /** - * Constructs the full table name using ARN prefix and table name. - * If tableArn is provided, uses it as a prefix, otherwise returns just the table name. - * - * @param tableArn Optional ARN prefix for the table - * @param tableName The table name - * @return Full table name or ARN - */ - def getFullTableName(tableArn: Option[String], tableName: String): String = { - tableArn match { - case Some(arn) if arn.nonEmpty => - // If ARN ends with table/, append the table name, otherwise append /table/tableName - if (arn.endsWith("/")) { - s"${arn}table/$tableName" - } else if (arn.contains("/table/")) { - arn // ARN already includes table path - } else { - s"$arn/table/$tableName" - } - case _ => tableName - } - } } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/metadata/MetadataManagerDynamoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/metadata/MetadataManagerDynamoDb.scala new file mode 100644 index 00000000..af35adc2 --- /dev/null +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/metadata/MetadataManagerDynamoDb.scala @@ -0,0 +1,351 @@ +/* + * Copyright 2022 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.pramen.core.metadata + +import org.slf4j.LoggerFactory +import software.amazon.awssdk.auth.credentials.AwsCredentialsProvider +import software.amazon.awssdk.regions.Region +import software.amazon.awssdk.services.dynamodb.DynamoDbClient +import software.amazon.awssdk.services.dynamodb.model._ +import za.co.absa.pramen.api.MetadataValue +import za.co.absa.pramen.core.bookkeeper.BookkeeperDynamoDb + +import java.net.URI +import java.time.{Instant, LocalDate} +import scala.collection.JavaConverters._ +import scala.util.control.NonFatal + +/** + * DynamoDB-based metadata manager for storing custom metadata. + * + * This manager stores metadata key-value pairs in a DynamoDB table with automatic table creation. + * + * @param dynamoDbClient The DynamoDB client to use + * @param tableArn Optional ARN prefix for the metadata table + * @param tablePrefix Prefix for the metadata table name (default: "pramen") + */ +class MetadataManagerDynamoDb( + dynamoDbClient: DynamoDbClient, + tableArn: Option[String] = None, + tablePrefix: String = MetadataManagerDynamoDb.DEFAULT_TABLE_PREFIX +) extends MetadataManagerBase(true) with AutoCloseable { + + private val log = LoggerFactory.getLogger(this.getClass) + + private val metadataTableBaseName = s"${tablePrefix}_${MetadataManagerDynamoDb.DEFAULT_METADATA_TABLE}" + private val metadataTableFullName = BookkeeperDynamoDb.getFullTableName(tableArn, metadataTableBaseName) + + // Initialize table on creation + createMetadataTableIfNotExists() + + override def getMetadataFromStorage(tableName: String, infoDate: LocalDate, key: String): Option[MetadataValue] = { + try { + val compositeKey = s"$tableName#$infoDate" + + val getRequest = GetItemRequest.builder() + .tableName(metadataTableFullName) + .key(Map( + MetadataManagerDynamoDb.ATTR_COMPOSITE_KEY -> AttributeValue.builder().s(compositeKey).build(), + MetadataManagerDynamoDb.ATTR_METADATA_KEY -> AttributeValue.builder().s(key).build() + ).asJava) + .build() + + val result = dynamoDbClient.getItem(getRequest) + + if (result.hasItem) { + val item = result.item() + val value = item.get(MetadataManagerDynamoDb.ATTR_METADATA_VALUE).s() + val lastUpdated = Instant.ofEpochSecond(item.get(MetadataManagerDynamoDb.ATTR_LAST_UPDATED).n().toLong) + Some(MetadataValue(value, lastUpdated)) + } else { + None + } + } catch { + case NonFatal(ex) => + throw new RuntimeException(s"Unable to read from the metadata table '$metadataTableFullName'.", ex) + } + } + + override def getMetadataFromStorage(tableName: String, infoDate: LocalDate): Map[String, MetadataValue] = { + try { + val compositeKey = s"$tableName#$infoDate" + + val queryRequest = QueryRequest.builder() + .tableName(metadataTableFullName) + .keyConditionExpression(s"${MetadataManagerDynamoDb.ATTR_COMPOSITE_KEY} = :composite_key") + .expressionAttributeValues(Map( + ":composite_key" -> AttributeValue.builder().s(compositeKey).build() + ).asJava) + .build() + + val result = dynamoDbClient.query(queryRequest) + + result.items().asScala.map { item => + val key = item.get(MetadataManagerDynamoDb.ATTR_METADATA_KEY).s() + val value = item.get(MetadataManagerDynamoDb.ATTR_METADATA_VALUE).s() + val lastUpdated = Instant.ofEpochSecond(item.get(MetadataManagerDynamoDb.ATTR_LAST_UPDATED).n().toLong) + key -> MetadataValue(value, lastUpdated) + }.toMap + } catch { + case NonFatal(ex) => + throw new RuntimeException(s"Unable to read from the metadata table '$metadataTableFullName'.", ex) + } + } + + override def setMetadataToStorage(tableName: String, infoDate: LocalDate, key: String, metadata: MetadataValue): Unit = { + try { + val compositeKey = s"$tableName#$infoDate" + + val putRequest = PutItemRequest.builder() + .tableName(metadataTableFullName) + .item(Map( + MetadataManagerDynamoDb.ATTR_COMPOSITE_KEY -> AttributeValue.builder().s(compositeKey).build(), + MetadataManagerDynamoDb.ATTR_METADATA_KEY -> AttributeValue.builder().s(key).build(), + MetadataManagerDynamoDb.ATTR_METADATA_VALUE -> AttributeValue.builder().s(metadata.value).build(), + MetadataManagerDynamoDb.ATTR_LAST_UPDATED -> AttributeValue.builder().n(metadata.lastUpdated.getEpochSecond.toString).build(), + MetadataManagerDynamoDb.ATTR_TABLE_NAME -> AttributeValue.builder().s(tableName).build(), + MetadataManagerDynamoDb.ATTR_INFO_DATE -> AttributeValue.builder().s(infoDate.toString).build() + ).asJava) + .build() + + dynamoDbClient.putItem(putRequest) + } catch { + case NonFatal(ex) => + throw new RuntimeException(s"Unable to write to the metadata table '$metadataTableFullName'.", ex) + } + } + + override def deleteMetadataFromStorage(tableName: String, infoDate: LocalDate, key: String): Unit = { + try { + val compositeKey = s"$tableName#$infoDate" + + val deleteRequest = DeleteItemRequest.builder() + .tableName(metadataTableFullName) + .key(Map( + MetadataManagerDynamoDb.ATTR_COMPOSITE_KEY -> AttributeValue.builder().s(compositeKey).build(), + MetadataManagerDynamoDb.ATTR_METADATA_KEY -> AttributeValue.builder().s(key).build() + ).asJava) + .build() + + dynamoDbClient.deleteItem(deleteRequest) + } catch { + case NonFatal(ex) => + throw new RuntimeException(s"Unable to delete from the metadata table '$metadataTableFullName'.", ex) + } + } + + override def deleteMetadataFromStorage(tableName: String, infoDate: LocalDate): Unit = { + try { + val compositeKey = s"$tableName#$infoDate" + + // First, query all items with this composite key + val queryRequest = QueryRequest.builder() + .tableName(metadataTableFullName) + .keyConditionExpression(s"${MetadataManagerDynamoDb.ATTR_COMPOSITE_KEY} = :composite_key") + .expressionAttributeValues(Map( + ":composite_key" -> AttributeValue.builder().s(compositeKey).build() + ).asJava) + .build() + + val result = dynamoDbClient.query(queryRequest) + + // Delete each item + result.items().asScala.foreach { item => + val key = item.get(MetadataManagerDynamoDb.ATTR_METADATA_KEY).s() + deleteMetadataFromStorage(tableName, infoDate, key) + } + } catch { + case NonFatal(ex) => + throw new RuntimeException(s"Unable to delete from the metadata table '$metadataTableFullName'.", ex) + } + } + + /** + * Creates the metadata table if it doesn't exist. + */ + private def createMetadataTableIfNotExists(): Unit = { + try { + val describeRequest = DescribeTableRequest.builder() + .tableName(metadataTableFullName) + .build() + + dynamoDbClient.describeTable(describeRequest) + log.info(s"Metadata table '$metadataTableFullName' already exists") + } catch { + case _: ResourceNotFoundException => + log.info(s"Creating metadata table '$metadataTableFullName'") + createMetadataTable() + case NonFatal(ex) => + log.error(s"Error checking if metadata table exists", ex) + throw ex + } + } + + /** + * Creates the metadata table in DynamoDB. + */ + private def createMetadataTable(): Unit = { + val createRequest = CreateTableRequest.builder() + .tableName(metadataTableFullName) + .attributeDefinitions( + AttributeDefinition.builder() + .attributeName(MetadataManagerDynamoDb.ATTR_COMPOSITE_KEY) + .attributeType(ScalarAttributeType.S) + .build(), + AttributeDefinition.builder() + .attributeName(MetadataManagerDynamoDb.ATTR_METADATA_KEY) + .attributeType(ScalarAttributeType.S) + .build() + ) + .keySchema( + KeySchemaElement.builder() + .attributeName(MetadataManagerDynamoDb.ATTR_COMPOSITE_KEY) + .keyType(KeyType.HASH) + .build(), + KeySchemaElement.builder() + .attributeName(MetadataManagerDynamoDb.ATTR_METADATA_KEY) + .keyType(KeyType.RANGE) + .build() + ) + .billingMode(BillingMode.PAY_PER_REQUEST) + .build() + + dynamoDbClient.createTable(createRequest) + waitForTableActive(metadataTableFullName) + log.info(s"Metadata table '$metadataTableFullName' created successfully") + } + + /** + * Waits for a table to become ACTIVE. + */ + private def waitForTableActive(tableName: String, maxAttempts: Int = 30): Unit = { + var attempts = 0 + var isActive = false + + while (attempts < maxAttempts && !isActive) { + try { + val describeRequest = DescribeTableRequest.builder() + .tableName(tableName) + .build() + + val response = dynamoDbClient.describeTable(describeRequest) + val status = response.table().tableStatus() + + if (status == TableStatus.ACTIVE) { + isActive = true + } else { + Thread.sleep(1000) + attempts += 1 + } + } catch { + case NonFatal(ex) => + log.warn(s"Error waiting for table '$tableName' to become active", ex) + Thread.sleep(1000) + attempts += 1 + } + } + + if (!isActive) { + throw new RuntimeException(s"Table '$tableName' did not become ACTIVE after $maxAttempts attempts") + } + } + + /** + * Closes the DynamoDB client. + */ + override def close(): Unit = { + try { + dynamoDbClient.close() + } catch { + case NonFatal(ex) => + log.warn("Error closing DynamoDB client", ex) + } + } +} + +object MetadataManagerDynamoDb { + val DEFAULT_METADATA_TABLE = "metadata" + val DEFAULT_TABLE_PREFIX = "pramen" + + // Attribute names for metadata table + val ATTR_COMPOSITE_KEY = "compositeKey" // tableName#infoDate + val ATTR_METADATA_KEY = "metadataKey" + val ATTR_METADATA_VALUE = "metadataValue" + val ATTR_LAST_UPDATED = "lastUpdated" + val ATTR_TABLE_NAME = "tableName" // For filtering/queries + val ATTR_INFO_DATE = "infoDate" // For filtering/queries + + /** + * Builder for creating MetadataManagerDynamoDb instances. + */ + class MetadataManagerDynamoDbBuilder { + private var region: Option[String] = None + private var tableArn: Option[String] = None + private var tablePrefix: String = DEFAULT_TABLE_PREFIX + private var credentialsProvider: Option[AwsCredentialsProvider] = None + private var endpoint: Option[String] = None + + def withRegion(region: String): MetadataManagerDynamoDbBuilder = { + this.region = Some(region) + this + } + + def withTableArn(arn: String): MetadataManagerDynamoDbBuilder = { + this.tableArn = Some(arn) + this + } + + def withTablePrefix(prefix: String): MetadataManagerDynamoDbBuilder = { + this.tablePrefix = prefix + this + } + + def withCredentialsProvider(provider: AwsCredentialsProvider): MetadataManagerDynamoDbBuilder = { + this.credentialsProvider = Some(provider) + this + } + + def withEndpoint(endpoint: String): MetadataManagerDynamoDbBuilder = { + this.endpoint = Some(endpoint) + this + } + + def build(): MetadataManagerDynamoDb = { + if (region.isEmpty) { + throw new IllegalArgumentException("Region must be provided") + } + + val clientBuilder = DynamoDbClient.builder() + .region(Region.of(region.get)) + + credentialsProvider.foreach(clientBuilder.credentialsProvider) + endpoint.foreach { ep => + clientBuilder.endpointOverride(URI.create(ep)) + } + + val client = clientBuilder.build() + + new MetadataManagerDynamoDb( + dynamoDbClient = client, + tableArn = tableArn, + tablePrefix = tablePrefix + ) + } + } + + def builder: MetadataManagerDynamoDbBuilder = new MetadataManagerDynamoDbBuilder +} diff --git a/pramen/examples/dynamodb_bookkeeping/README.md b/pramen/examples/dynamodb_bookkeeping/README.md index 5cc76f85..c49195a6 100644 --- a/pramen/examples/dynamodb_bookkeeping/README.md +++ b/pramen/examples/dynamodb_bookkeeping/README.md @@ -27,8 +27,12 @@ pramen.bookkeeping { ``` This creates tables: -- `pramen_bookkeeping` -- `pramen_schemas` +- `pramen_bookkeeping` - Data availability and record counts +- `pramen_schemas` - Table schema evolution +- `pramen_locks` - Distributed locking (if locks enabled) +- `pramen_journal` - Task completion history +- `pramen_metadata` - Custom metadata key-value pairs +- `pramen_offsets` - Incremental ingestion offset tracking ### Production Configuration @@ -41,8 +45,12 @@ pramen.bookkeeping { ``` This creates tables: -- `pramen_production_bookkeeping` -- `pramen_production_schemas` +- `pramen_production_bookkeeping` - Data availability and record counts +- `pramen_production_schemas` - Table schema evolution +- `pramen_production_locks` - Distributed locking (if locks enabled) +- `pramen_production_journal` - Task completion history +- `pramen_production_metadata` - Custom metadata key-value pairs +- `pramen_production_offsets` - Incremental ingestion offset tracking ### Multi-Environment Configuration @@ -169,6 +177,21 @@ Tables are automatically created with the following schema: - `infoDate`: Date when schema was recorded - `schemaJson`: Spark schema in JSON format +#### Offset Table (`{prefix}_offsets`) +- **Partition Key**: `pramenTableName` (String) +- **Sort Key**: `compositeKey` (String, format: "infoDate#createdAtMilli") +- **Billing Mode**: PAY_PER_REQUEST (on-demand) +- **Attributes**: + - `pramenTableName`: Name of the metastore table + - `compositeKey`: Composite key for efficient querying (infoDate#createdAtMilli) + - `infoDate`: Information date + - `dataType`: Offset data type (e.g., "IntegralType", "StringType") + - `minOffset`: Minimum offset value for this batch + - `maxOffset`: Maximum offset value for this batch + - `batchId`: Batch execution ID + - `createdAt`: Timestamp when offset was created (milliseconds) + - `committedAt`: Timestamp when offset was committed (milliseconds, optional) + ## Running the Example 1. **Configure AWS credentials** (see above) @@ -196,6 +219,10 @@ Tables are automatically created with the following schema: You should see: - `pramen_production_bookkeeping` - `pramen_production_schemas` + - `pramen_production_offsets` (if using incremental ingestion) + - `pramen_production_locks` (if locks enabled) + - `pramen_production_journal` (if journal enabled) + - `pramen_production_metadata` (if metadata enabled) 5. **Query bookkeeping data**: ```bash @@ -292,9 +319,7 @@ aws dynamodb create-table \ | Scaling | Automatic | Manual | Manual | Automatic | | Multi-region | Yes | No | Yes | Yes | | Query Performance | Fast | Fast | Fast | Slower | -| Incremental Support | No* | Yes | No | No | - -*Note: Offset management for incremental pipelines is not yet implemented for DynamoDB bookkeeper. +| Incremental Support | Yes | Yes | No | No | ## Distributed Locking with DynamoDB @@ -334,10 +359,13 @@ pramen { } ``` -This creates three tables: +This creates six tables: - `pramen_production_bookkeeping` - Bookkeeping data - `pramen_production_schemas` - Table schemas - `pramen_production_locks` - Distributed locks +- `pramen_production_journal` - Task completion history +- `pramen_production_metadata` - Custom metadata +- `pramen_production_offsets` - Incremental ingestion offsets See `dynamodb_with_locks.conf` for a complete example. @@ -362,7 +390,10 @@ Add the locks table to your IAM policy: "Resource": [ "arn:aws:dynamodb:*:*:table/pramen_production_bookkeeping", "arn:aws:dynamodb:*:*:table/pramen_production_schemas", - "arn:aws:dynamodb:*:*:table/pramen_production_locks" + "arn:aws:dynamodb:*:*:table/pramen_production_locks", + "arn:aws:dynamodb:*:*:table/pramen_production_journal", + "arn:aws:dynamodb:*:*:table/pramen_production_metadata", + "arn:aws:dynamodb:*:*:table/pramen_production_offsets" ] } ] diff --git a/pramen/examples/dynamodb_bookkeeping/dynamodb_with_locks.conf b/pramen/examples/dynamodb_bookkeeping/dynamodb_with_locks.conf new file mode 100644 index 00000000..4f5140b1 --- /dev/null +++ b/pramen/examples/dynamodb_bookkeeping/dynamodb_with_locks.conf @@ -0,0 +1,150 @@ +# DynamoDB Bookkeeping with Distributed Locking Example +# +# This configuration demonstrates how to use DynamoDB for both bookkeeping AND distributed locking. +# When DynamoDB is configured for bookkeeping, Pramen automatically uses it for locks as well. + +pramen { + # Enable locking to prevent concurrent pipeline runs + runtime.use.locks = true + + # Bookkeeping configuration + bookkeeping { + enabled = true + + # DynamoDB Configuration + # ===================== + + # AWS region where DynamoDB tables will be created (REQUIRED) + dynamodb.region = "us-east-1" + + # Table prefix for all Pramen tables (OPTIONAL, default: "pramen") + # This creates: {prefix}_bookkeeping, {prefix}_schemas, {prefix}_locks + dynamodb.table.prefix = "pramen_production" + + # Table ARN prefix for cross-account or cross-region access (OPTIONAL) + # dynamodb.table.arn = "arn:aws:dynamodb:us-east-1:123456789012:table/" + + # Legacy required field (not used for DynamoDB but must be set) + hadoop.format = "delta" + } + + # Pipeline configuration + environment.name = "production" + pipeline.name = "example_pipeline" + + # Metastore configuration (example) + metastore { + tables = [ + { + name = "customer_data" + format = "parquet" + path = "/data/customers" + } + ] + } + + # Example operations + operations = [ + { + name = "ingest_customers" + type = "ingestion" + schedule.type = "daily" + + source { + factory.class = "za.co.absa.pramen.core.source.JdbcSource" + jdbc.url = "jdbc:postgresql://localhost:5432/source_db" + jdbc.user = "reader" + jdbc.password = "secret" + query = "SELECT * FROM customers WHERE date = :infoDate" + } + + metastore.table = "customer_data" + } + ] +} + +# ============================================================================ +# How This Configuration Works +# ============================================================================ +# +# 1. BOOKKEEPING TABLES: +# - pramen_production_bookkeeping: Stores data chunk metadata +# - pramen_production_schemas: Stores table schemas +# +# 2. LOCK TABLE: +# - pramen_production_locks: Stores distributed locks +# +# 3. JOURNAL TABLE: +# - pramen_production_journal: Stores task completion records +# +# 4. METADATA TABLE: +# - pramen_production_metadata: Stores custom metadata key-value pairs +# +# 5. AUTOMATIC TABLE CREATION: +# All tables are created automatically on first run with proper schema: +# - Partition keys and sort keys configured +# - PAY_PER_REQUEST billing mode (on-demand) +# - Five tables total: bookkeeping, schemas, locks, journal, metadata +# +# 6. LOCK BEHAVIOR: +# When a pipeline runs: +# - Acquires a lock by writing to pramen_production_locks table +# - Uses DynamoDB conditional writes (attribute_not_exists) for atomicity +# - Lock ticket expires after 10 minutes (automatically renewed) +# - If another instance tries to run, it will be blocked +# - Lock is released when pipeline completes or fails +# +# 7. MULTI-ENVIRONMENT SETUP: +# Use different table prefixes for different environments: +# - Dev: pramen_dev_* +# - Staging: pramen_staging_* +# - Production: pramen_production_* +# +# 8. AWS CREDENTIALS: +# Pramen uses AWS DefaultCredentialsProvider which loads from: +# - Environment variables (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY) +# - AWS credentials file (~/.aws/credentials) +# - IAM role (EC2, ECS, EMR, Lambda, Glue) +# +# 9. IAM PERMISSIONS REQUIRED: +# { +# "Version": "2012-10-17", +# "Statement": [ +# { +# "Effect": "Allow", +# "Action": [ +# "dynamodb:CreateTable", +# "dynamodb:DescribeTable", +# "dynamodb:PutItem", +# "dynamodb:GetItem", +# "dynamodb:DeleteItem", +# "dynamodb:UpdateItem", +# "dynamodb:Query", +# "dynamodb:Scan" +# ], +# "Resource": [ +# "arn:aws:dynamodb:*:*:table/pramen_production_bookkeeping", +# "arn:aws:dynamodb:*:*:table/pramen_production_schemas", +# "arn:aws:dynamodb:*:*:table/pramen_production_locks" +# ] +# } +# ] +# } +# +# 10. TESTING: +# spark-submit --class za.co.absa.pramen.runner.PipelineRunner \ +# pramen-runner_2.12-1.13.10.jar \ +# --config dynamodb_with_locks.conf \ +# --date 2024-01-15 +# +# 11. COST OPTIMIZATION: +# - PAY_PER_REQUEST billing: $1.25 per million writes, $0.25 per million reads +# - Typical pipeline: ~10-20 requests per run +# - Cost per run: < $0.001 +# - Monthly cost for 100 daily runs: ~$3 +# +# 12. LOCK EXPIRATION: +# - Lock tickets expire after 10 minutes of inactivity +# - Active pipelines renew their lock every 2 minutes +# - Expired locks can be taken over by new pipeline runs +# - Hard expiration after 1 day (cleanup of stale locks) From 231636724f2be82910bda2102a2a9860bcd93ed0 Mon Sep 17 00:00:00 2001 From: Ruslan Iushchenko Date: Wed, 1 Apr 2026 11:53:15 +0200 Subject: [PATCH 05/13] #181 Implement AutoCloseable interface for various traits and update close methods --- .../co/absa/pramen/api/MetadataManager.scala | 4 ++- .../pramen/api/lock/TokenLockFactory.scala | 4 ++- .../co/absa/pramen/core/app/AppContext.scala | 4 +-- .../pramen/core/bookkeeper/Bookkeeper.scala | 33 +++++-------------- .../core/bookkeeper/BookkeeperDynamoDb.scala | 2 +- .../core/bookkeeper/OffsetManager.scala | 4 ++- .../bookkeeper/OffsetManagerDynamoDb.scala | 12 ++++--- .../co/absa/pramen/core/journal/Journal.scala | 3 +- .../pramen/core/journal/JournalDynamoDB.scala | 7 +++- .../metadata/MetadataManagerDynamoDb.scala | 5 +++ .../mq/SingleMessageProducer.scala | 4 +-- 11 files changed, 42 insertions(+), 40 deletions(-) diff --git a/pramen/api/src/main/scala/za/co/absa/pramen/api/MetadataManager.scala b/pramen/api/src/main/scala/za/co/absa/pramen/api/MetadataManager.scala index 0bd91ed9..279f7969 100644 --- a/pramen/api/src/main/scala/za/co/absa/pramen/api/MetadataManager.scala +++ b/pramen/api/src/main/scala/za/co/absa/pramen/api/MetadataManager.scala @@ -18,7 +18,7 @@ package za.co.absa.pramen.api import java.time.LocalDate -trait MetadataManager { +trait MetadataManager extends AutoCloseable { /** * Get metadata value for a given table, date and key. * @@ -70,4 +70,6 @@ trait MetadataManager { * Returns false if metadata is available only for the duration of the session. */ def isPersistent: Boolean + + override def close(): Unit = {} } diff --git a/pramen/api/src/main/scala/za/co/absa/pramen/api/lock/TokenLockFactory.scala b/pramen/api/src/main/scala/za/co/absa/pramen/api/lock/TokenLockFactory.scala index d0aaeaaf..11ac6343 100644 --- a/pramen/api/src/main/scala/za/co/absa/pramen/api/lock/TokenLockFactory.scala +++ b/pramen/api/src/main/scala/za/co/absa/pramen/api/lock/TokenLockFactory.scala @@ -41,6 +41,8 @@ */ package za.co.absa.pramen.api.lock -trait TokenLockFactory { +trait TokenLockFactory extends AutoCloseable { def getLock(token: String): TokenLock + + override def close(): Unit = {} } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/app/AppContext.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/app/AppContext.scala index 3fca0e01..47454157 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/app/AppContext.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/app/AppContext.scala @@ -21,7 +21,7 @@ import za.co.absa.pramen.core.bookkeeper.Bookkeeper import za.co.absa.pramen.core.journal.Journal import za.co.absa.pramen.core.metastore.Metastore -trait AppContext { +trait AppContext extends AutoCloseable { val appConfig: AppConfig def bookkeeper: Bookkeeper @@ -31,6 +31,4 @@ trait AppContext { def journal: Journal def metastore: Metastore - - def close(): Unit } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/Bookkeeper.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/Bookkeeper.scala index dadce5c4..8f060275 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/Bookkeeper.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/Bookkeeper.scala @@ -182,14 +182,11 @@ object Bookkeeper { } else if (hasBookkeepingDynamoDb) { val tablePrefix = bookkeepingConfig.dynamoDbTablePrefix.getOrElse(JournalDynamoDB.DEFAULT_TABLE_PREFIX) log.info(s"Using DynamoDB for journal in region '${bookkeepingConfig.dynamoDbRegion.get}' with table prefix '$tablePrefix'") - val builder = JournalDynamoDB.builder + JournalDynamoDB.builder .withRegion(bookkeepingConfig.dynamoDbRegion.get) .withTablePrefix(tablePrefix) - val builder2 = bookkeepingConfig.dynamoDbTableArn match { - case Some(arn) => builder.withTableArn(arn) - case None => builder - } - builder2.build() + .withTableArn(bookkeepingConfig.dynamoDbTableArn) + .build() } else { mongoDbConnection match { case Some(connection) => @@ -226,14 +223,11 @@ object Bookkeeper { } else if (hasBookkeepingDynamoDb) { val tablePrefix = bookkeepingConfig.dynamoDbTablePrefix.getOrElse(MetadataManagerDynamoDb.DEFAULT_TABLE_PREFIX) log.info(s"Using DynamoDB for metadata in region '${bookkeepingConfig.dynamoDbRegion.get}' with table prefix '$tablePrefix'") - val builder = MetadataManagerDynamoDb.builder + MetadataManagerDynamoDb.builder .withRegion(bookkeepingConfig.dynamoDbRegion.get) .withTablePrefix(tablePrefix) - val builder2 = bookkeepingConfig.dynamoDbTableArn match { - case Some(arn) => builder.withTableArn(arn) - case None => builder - } - builder2.build() + .withTableArn(bookkeepingConfig.dynamoDbTableArn) + .build() } else { log.info(s"The custom metadata management is not supported.") new MetadataManagerNull(isPersistenceEnabled = true) @@ -243,18 +237,9 @@ object Bookkeeper { override def close(): Unit = { mongoDbConnection.foreach(_.close()) dbOpt.foreach(_.close()) - tokenFactory match { - case closeable: AutoCloseable => closeable.close() - case _ => // Not closeable - } - journal match { - case closeable: AutoCloseable => closeable.close() - case _ => // Not closeable - } - metadataManager match { - case closeable: AutoCloseable => closeable.close() - case _ => // Not closeable - } + tokenFactory.close() + journal.close() + metadataManager.close() } } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala index 832a88ab..5f282795 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala @@ -69,7 +69,7 @@ class BookkeeperDynamoDb( // Offset management private val offsetManagement = new OffsetManagerCached( - new OffsetManagerDynamoDb(dynamoDbClient, batchId, tableArn, tablePrefix) + new OffsetManagerDynamoDb(dynamoDbClient, batchId, tableArn, tablePrefix, closesClient = false) ) // Initialize tables on construction diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManager.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManager.scala index 68482542..7ce0255d 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManager.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManager.scala @@ -33,7 +33,7 @@ import java.time.LocalDate * The startWriteOffsets() together with commitOffsets() and rollbackOffsets() provide mechanisms to ensure consistency * with data. */ -trait OffsetManager { +trait OffsetManager extends AutoCloseable { /** * Returns offsets for an information date. * @@ -88,4 +88,6 @@ trait OffsetManager { * Rolls back an offset request */ def rollbackOffsets(request: DataOffsetRequest): Unit + + override def close(): Unit = {} } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerDynamoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerDynamoDb.scala index 861950fd..4d6f6a08 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerDynamoDb.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerDynamoDb.scala @@ -51,8 +51,9 @@ class OffsetManagerDynamoDb( dynamoDbClient: DynamoDbClient, batchId: Long, tableArn: Option[String] = None, - tablePrefix: String = OffsetManagerDynamoDb.DEFAULT_TABLE_PREFIX -) extends OffsetManager with AutoCloseable { + tablePrefix: String = OffsetManagerDynamoDb.DEFAULT_TABLE_PREFIX, + closesClient: Boolean = true +) extends OffsetManager { import OffsetManagerDynamoDb._ @@ -501,7 +502,9 @@ class OffsetManagerDynamoDb( */ override def close(): Unit = { try { - dynamoDbClient.close() + if (closesClient) { + dynamoDbClient.close() + } } catch { case NonFatal(ex) => log.warn("Error closing DynamoDB client", ex) @@ -584,7 +587,8 @@ object OffsetManagerDynamoDb { dynamoDbClient = client, batchId = batchId, tableArn = tableArn, - tablePrefix = tablePrefix + tablePrefix = tablePrefix, + closesClient = true ) } } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/Journal.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/Journal.scala index 0a03c215..b7d70ff8 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/Journal.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/Journal.scala @@ -23,10 +23,11 @@ import java.time.Instant /** * A journal is responsible of keeping track of all completed tasks. */ -trait Journal { +trait Journal extends AutoCloseable { def addEntry(entry: TaskCompleted): Unit def getEntries(from: Instant, to: Instant): Seq[TaskCompleted] + override def close(): Unit = {} } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/JournalDynamoDB.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/JournalDynamoDB.scala index 15fe0702..bb688562 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/JournalDynamoDB.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/JournalDynamoDB.scala @@ -43,7 +43,7 @@ class JournalDynamoDB( dynamoDbClient: DynamoDbClient, tableArn: Option[String] = None, tablePrefix: String = JournalDynamoDB.DEFAULT_TABLE_PREFIX -) extends Journal with AutoCloseable { +) extends Journal { private val log = LoggerFactory.getLogger(this.getClass) private val dateFormatter = InfoDateConfig.defaultDateFormatter @@ -330,6 +330,11 @@ object JournalDynamoDB { this } + def withTableArn(arnOpt: Option[String]): JournalDynamoDBBuilder = { + this.tableArn = arnOpt + this + } + def withTablePrefix(prefix: String): JournalDynamoDBBuilder = { this.tablePrefix = prefix this diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/metadata/MetadataManagerDynamoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/metadata/MetadataManagerDynamoDb.scala index af35adc2..c85153c6 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/metadata/MetadataManagerDynamoDb.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/metadata/MetadataManagerDynamoDb.scala @@ -309,6 +309,11 @@ object MetadataManagerDynamoDb { this } + def withTableArn(arnOpt: Option[String]): MetadataManagerDynamoDbBuilder = { + this.tableArn = arnOpt + this + } + def withTablePrefix(prefix: String): MetadataManagerDynamoDbBuilder = { this.tablePrefix = prefix this diff --git a/pramen/extras/src/main/scala/za/co/absa/pramen/extras/notification/mq/SingleMessageProducer.scala b/pramen/extras/src/main/scala/za/co/absa/pramen/extras/notification/mq/SingleMessageProducer.scala index 95c92d0a..3c3c4d45 100644 --- a/pramen/extras/src/main/scala/za/co/absa/pramen/extras/notification/mq/SingleMessageProducer.scala +++ b/pramen/extras/src/main/scala/za/co/absa/pramen/extras/notification/mq/SingleMessageProducer.scala @@ -16,10 +16,8 @@ package za.co.absa.pramen.extras.notification.mq -trait SingleMessageProducer { +trait SingleMessageProducer extends AutoCloseable { def send(topic: String, message: String, numberOrRetries: Int = 3): Unit def connect(): Unit - - def close(): Unit } From 143c6bc38c22bc9bd18b662e3cfa90de3e47786e Mon Sep 17 00:00:00 2001 From: Ruslan Iushchenko Date: Wed, 1 Apr 2026 16:49:01 +0200 Subject: [PATCH 06/13] #181 Refactor DynamoDB classes to enforce private constructors and improve error handling in builders --- README.md | 28 ++++ .../core/bookkeeper/BookkeeperDynamoDb.scala | 4 +- .../pramen/core/journal/JournalDynamoDB.scala | 2 +- .../metadata/MetadataManagerDynamoDb.scala | 2 +- .../pramen/core/model/QueryBuilderSuite.scala | 12 +- .../BookkeeperDynamoDbBuilderSuite.scala | 148 ++++++++++++++++++ .../OffsetManagerDynamoDbBuilderSuite.scala | 117 ++++++++++++++ .../journal/JournalDynamoDBBuilderSuite.scala | 99 ++++++++++++ ...TokenLockFactoryDynamoDbBuilderSuite.scala | 99 ++++++++++++ .../MetadataManagerDynamoDbBuilderSuite.scala | 99 ++++++++++++ 10 files changed, 600 insertions(+), 10 deletions(-) create mode 100644 pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperDynamoDbBuilderSuite.scala create mode 100644 pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/OffsetManagerDynamoDbBuilderSuite.scala create mode 100644 pramen/core/src/test/scala/za/co/absa/pramen/core/tests/journal/JournalDynamoDBBuilderSuite.scala create mode 100644 pramen/core/src/test/scala/za/co/absa/pramen/core/tests/lock/TokenLockFactoryDynamoDbBuilderSuite.scala create mode 100644 pramen/core/src/test/scala/za/co/absa/pramen/core/tests/metadata/MetadataManagerDynamoDbBuilderSuite.scala diff --git a/README.md b/README.md index 9564c1af..584993df 100644 --- a/README.md +++ b/README.md @@ -2569,6 +2569,34 @@ pramen { } ``` +### (experimental) DynamoDB database +Here is how you can use a DynamoDB database for storing bookkeeping information: + +```hocon +pramen { + bookkeeping.enabled = "true" + + bookkeeping.dynamodb { + region = "af-south-1" + table.prefix = "pramen_uat" + } +} +``` + +DynamoDB tables are automatically created if they don't exist with default options. Use the prefix to create multiple +Pramen bookeeping environments per AWS account. + +Note that the Pramen project that uses DynamoDB for bookeeping needs to add DynamoDB as a dependency if it is not provided +by the Spark cluster (e.g. EMR). + +```xml + + software.amazon.awssdk + dynamodb + ${aws.sdk.version} + +``` + ### Hadoop (CSV+JSON) This is less recommended way, and is quite slow. But the advantage is that you don't need a database. diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala index 5f282795..17fb6834 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala @@ -47,7 +47,7 @@ import scala.util.control.NonFatal * @param tableArn Optional ARN prefix for DynamoDB tables (e.g., "arn:aws:dynamodb:region:account-id:table/") * @param tablePrefix Prefix for table names to allow multiple bookkeeping sets in the same account (default: "pramen") */ -class BookkeeperDynamoDb( +class BookkeeperDynamoDb private ( dynamoDbClient: DynamoDbClient, batchId: Long, tableArn: Option[String] = None, @@ -799,7 +799,7 @@ object BookkeeperDynamoDb { * @throws IllegalArgumentException if required parameters are missing */ def build(): BookkeeperDynamoDb = { - val actualBatchId = batchId.getOrElse(System.currentTimeMillis()) + val actualBatchId = batchId.getOrElse(throw new IllegalArgumentException("BatchId is not supplied when building the instance of BookkeeperDynamoDb")) if (region.isEmpty) { throw new IllegalArgumentException("Either region or dynamoDbClient must be provided") diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/JournalDynamoDB.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/JournalDynamoDB.scala index bb688562..b271ab81 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/JournalDynamoDB.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/JournalDynamoDB.scala @@ -39,7 +39,7 @@ import scala.util.control.NonFatal * @param tableArn Optional ARN prefix for the journal table * @param tablePrefix Prefix for the journal table name (default: "pramen") */ -class JournalDynamoDB( +class JournalDynamoDB private ( dynamoDbClient: DynamoDbClient, tableArn: Option[String] = None, tablePrefix: String = JournalDynamoDB.DEFAULT_TABLE_PREFIX diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/metadata/MetadataManagerDynamoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/metadata/MetadataManagerDynamoDb.scala index c85153c6..16d8143d 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/metadata/MetadataManagerDynamoDb.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/metadata/MetadataManagerDynamoDb.scala @@ -38,7 +38,7 @@ import scala.util.control.NonFatal * @param tableArn Optional ARN prefix for the metadata table * @param tablePrefix Prefix for the metadata table name (default: "pramen") */ -class MetadataManagerDynamoDb( +class MetadataManagerDynamoDb private ( dynamoDbClient: DynamoDbClient, tableArn: Option[String] = None, tablePrefix: String = MetadataManagerDynamoDb.DEFAULT_TABLE_PREFIX diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/model/QueryBuilderSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/model/QueryBuilderSuite.scala index c5f4fa71..fc8de3c0 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/model/QueryBuilderSuite.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/model/QueryBuilderSuite.scala @@ -51,31 +51,31 @@ class QueryBuilderSuite extends AnyWordSpec { "throw an exception when no query configuration is specified" in { val conf = ConfigFactory.parseString("") - val exception = intercept[IllegalArgumentException] { + val ex = intercept[IllegalArgumentException] { QueryBuilder.fromConfig(conf, "", "") } - assert(exception.getMessage == "No options are specified for the query. Usually, it is one of: 'sql', 'path', 'table', 'db.table', 'topic'.") + assert(ex.getMessage == "No options are specified for the query. Usually, it is one of: 'sql', 'path', 'table', 'db.table', 'topic'.") } "throw an exception when the prefix is empty" in { val conf = ConfigFactory.parseString("data = /tmp") - val exception = intercept[IllegalArgumentException] { + val ex = intercept[IllegalArgumentException] { QueryBuilder.fromConfig(conf, "input", "") } - assert(exception.getMessage == "No options are specified for the 'input' query. Usually, it is one of: 'input.sql', 'input.path', 'input.table', 'input.db.table', 'input.topic'.") + assert(ex.getMessage == "No options are specified for the 'input' query. Usually, it is one of: 'input.sql', 'input.path', 'input.table', 'input.db.table', 'input.topic'.") } "throw an exception when the prefix is empty and parent is specified" in { val conf = ConfigFactory.parseString("data = /tmp") - val exception = intercept[IllegalArgumentException] { + val ex = intercept[IllegalArgumentException] { QueryBuilder.fromConfig(conf, "input", "my.parent") } - assert(exception.getMessage == "No options are specified for the 'input' query. Usually, it is one of: 'input.sql', 'input.path', 'input.table', 'input.db.table', 'input.topic' at my.parent.") + assert(ex.getMessage == "No options are specified for the 'input' query. Usually, it is one of: 'input.sql', 'input.path', 'input.table', 'input.db.table', 'input.topic' at my.parent.") } } } diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperDynamoDbBuilderSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperDynamoDbBuilderSuite.scala new file mode 100644 index 00000000..612592e7 --- /dev/null +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperDynamoDbBuilderSuite.scala @@ -0,0 +1,148 @@ +/* + * Copyright 2022 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.pramen.core.tests.bookkeeper + +import org.scalatest.wordspec.AnyWordSpec +import software.amazon.awssdk.auth.credentials.{AwsBasicCredentials, StaticCredentialsProvider} +import za.co.absa.pramen.core.bookkeeper.BookkeeperDynamoDb + +class BookkeeperDynamoDbBuilderSuite extends AnyWordSpec { + + "BookkeeperDynamoDbBuilder" should { + "use default table prefix" in { + val builder = BookkeeperDynamoDb.builder + .withRegion("us-east-1") + .withBatchId(123456789L) + + // We can't instantiate without valid DynamoDB connection, + // but we can verify the builder returns itself (fluent API) + assert(builder.isInstanceOf[BookkeeperDynamoDb.BookkeeperDynamoDbBuilder]) + } + + "allow setting region" in { + val builder = BookkeeperDynamoDb.builder + .withRegion("eu-west-1") + .withBatchId(123456789L) + + assert(builder.isInstanceOf[BookkeeperDynamoDb.BookkeeperDynamoDbBuilder]) + } + + "allow setting table ARN" in { + val builder = BookkeeperDynamoDb.builder + .withRegion("us-east-1") + .withTableArn("arn:aws:dynamodb:us-east-1:123456789012:table/") + .withBatchId(123456789L) + + assert(builder.isInstanceOf[BookkeeperDynamoDb.BookkeeperDynamoDbBuilder]) + } + + "allow setting table prefix" in { + val builder = BookkeeperDynamoDb.builder + .withRegion("us-east-1") + .withTablePrefix("test_pramen") + .withBatchId(123456789L) + + assert(builder.isInstanceOf[BookkeeperDynamoDb.BookkeeperDynamoDbBuilder]) + } + + "allow setting credentials provider" in { + val credentials = AwsBasicCredentials.create("accessKey", "secretKey") + val credentialsProvider = StaticCredentialsProvider.create(credentials) + + val builder = BookkeeperDynamoDb.builder + .withRegion("us-east-1") + .withCredentialsProvider(credentialsProvider) + .withBatchId(123456789L) + + assert(builder.isInstanceOf[BookkeeperDynamoDb.BookkeeperDynamoDbBuilder]) + } + + "allow setting endpoint" in { + val builder = BookkeeperDynamoDb.builder + .withRegion("us-east-1") + .withEndpoint("http://localhost:8000") + .withBatchId(123456789L) + + assert(builder.isInstanceOf[BookkeeperDynamoDb.BookkeeperDynamoDbBuilder]) + } + + "allow setting batch ID" in { + val batchId = 987654321L + val builder = BookkeeperDynamoDb.builder + .withRegion("us-east-1") + .withBatchId(batchId) + + assert(builder.isInstanceOf[BookkeeperDynamoDb.BookkeeperDynamoDbBuilder]) + } + + "support fluent API chaining" in { + val credentials = AwsBasicCredentials.create("accessKey", "secretKey") + val credentialsProvider = StaticCredentialsProvider.create(credentials) + + val builder = BookkeeperDynamoDb.builder + .withRegion("ap-southeast-2") + .withTableArn("arn:aws:dynamodb:ap-southeast-2:123456789012:table/") + .withTablePrefix("prod_pramen") + .withCredentialsProvider(credentialsProvider) + .withEndpoint("http://localhost:8000") + .withBatchId(111222333L) + + assert(builder.isInstanceOf[BookkeeperDynamoDb.BookkeeperDynamoDbBuilder]) + } + + "throw IllegalArgumentException when region is not set" in { + val builder = BookkeeperDynamoDb.builder + .withBatchId(123456789L) + + val ex = intercept[IllegalArgumentException] { + builder.build() + } + + assert(ex.getMessage.contains("region")) + } + + "throw IllegalArgumentException when batch ID is not set" in { + val builder = BookkeeperDynamoDb.builder + .withRegion("us-east-1") + + val ex = intercept[IllegalArgumentException] { + builder.build() + } + + assert(ex.getMessage.contains("BatchId is not supplied")) + } + } + + "BookkeeperDynamoDb.getFullTableName" should { + "return table name when no ARN is provided" in { + val result = BookkeeperDynamoDb.getFullTableName(None, "test_table") + assert(result == "test_table") + } + + "return ARN with /table/ prefix when ARN ends with slash" in { + val arn = "arn:aws:dynamodb:us-east-1:123456789012:table/" + val result = BookkeeperDynamoDb.getFullTableName(Some(arn), "test_table") + assert(result == s"${arn}table/test_table") + } + + "handle ARN without trailing slash by adding /table/" in { + val arn = "arn:aws:dynamodb:eu-west-1:987654321098" + val result = BookkeeperDynamoDb.getFullTableName(Some(arn), "my_table") + assert(result == s"$arn/table/my_table") + } + } +} diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/OffsetManagerDynamoDbBuilderSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/OffsetManagerDynamoDbBuilderSuite.scala new file mode 100644 index 00000000..b8714d85 --- /dev/null +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/OffsetManagerDynamoDbBuilderSuite.scala @@ -0,0 +1,117 @@ +/* + * Copyright 2022 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.pramen.core.tests.bookkeeper + +import org.scalatest.wordspec.AnyWordSpec +import software.amazon.awssdk.auth.credentials.{AwsBasicCredentials, StaticCredentialsProvider} +import za.co.absa.pramen.core.bookkeeper.OffsetManagerDynamoDb + +class OffsetManagerDynamoDbBuilderSuite extends AnyWordSpec { + + "OffsetManagerDynamoDbBuilder" should { + "use default table prefix when not specified" in { + val builder = OffsetManagerDynamoDb.builder + .withRegion("us-east-1") + .withBatchId(123456789L) + + assert(builder.isInstanceOf[OffsetManagerDynamoDb.OffsetManagerDynamoDbBuilder]) + } + + "allow setting region" in { + val builder = OffsetManagerDynamoDb.builder + .withRegion("eu-central-1") + .withBatchId(123456789L) + + assert(builder.isInstanceOf[OffsetManagerDynamoDb.OffsetManagerDynamoDbBuilder]) + } + + "allow setting table ARN" in { + val builder = OffsetManagerDynamoDb.builder + .withRegion("us-west-2") + .withTableArn("arn:aws:dynamodb:us-west-2:123456789012:table/") + .withBatchId(123456789L) + + assert(builder.isInstanceOf[OffsetManagerDynamoDb.OffsetManagerDynamoDbBuilder]) + } + + "allow setting table prefix" in { + val builder = OffsetManagerDynamoDb.builder + .withRegion("ap-northeast-1") + .withTablePrefix("staging_pramen") + .withBatchId(123456789L) + + assert(builder.isInstanceOf[OffsetManagerDynamoDb.OffsetManagerDynamoDbBuilder]) + } + + "allow setting credentials provider" in { + val credentials = AwsBasicCredentials.create("testAccessKey", "testSecretKey") + val credentialsProvider = StaticCredentialsProvider.create(credentials) + + val builder = OffsetManagerDynamoDb.builder + .withRegion("us-east-1") + .withCredentialsProvider(credentialsProvider) + .withBatchId(123456789L) + + assert(builder.isInstanceOf[OffsetManagerDynamoDb.OffsetManagerDynamoDbBuilder]) + } + + "allow setting endpoint for local testing" in { + val builder = OffsetManagerDynamoDb.builder + .withRegion("us-east-1") + .withEndpoint("http://localhost:4566") + .withBatchId(123456789L) + + assert(builder.isInstanceOf[OffsetManagerDynamoDb.OffsetManagerDynamoDbBuilder]) + } + + "allow setting batch ID" in { + val batchId = 1234567890123L + val builder = OffsetManagerDynamoDb.builder + .withRegion("us-east-1") + .withBatchId(batchId) + + assert(builder.isInstanceOf[OffsetManagerDynamoDb.OffsetManagerDynamoDbBuilder]) + } + + "support fluent API with all parameters" in { + val credentials = AwsBasicCredentials.create("key", "secret") + val credentialsProvider = StaticCredentialsProvider.create(credentials) + val batchId = System.currentTimeMillis() + + val builder = OffsetManagerDynamoDb.builder + .withRegion("sa-east-1") + .withTableArn("arn:aws:dynamodb:sa-east-1:999888777666:table/") + .withTablePrefix("dev_pramen") + .withCredentialsProvider(credentialsProvider) + .withEndpoint("http://dynamodb.local:8000") + .withBatchId(batchId) + + assert(builder.isInstanceOf[OffsetManagerDynamoDb.OffsetManagerDynamoDbBuilder]) + } + + "throw IllegalArgumentException when region is missing" in { + val builder = OffsetManagerDynamoDb.builder + .withBatchId(123456789L) + + val ex = intercept[IllegalArgumentException] { + builder.build() + } + + assert(ex.getMessage.contains("Region")) + } + } +} diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/journal/JournalDynamoDBBuilderSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/journal/JournalDynamoDBBuilderSuite.scala new file mode 100644 index 00000000..2574096b --- /dev/null +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/journal/JournalDynamoDBBuilderSuite.scala @@ -0,0 +1,99 @@ +/* + * Copyright 2022 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.pramen.core.tests.journal + +import org.scalatest.wordspec.AnyWordSpec +import software.amazon.awssdk.auth.credentials.{AwsBasicCredentials, StaticCredentialsProvider} +import za.co.absa.pramen.core.journal.JournalDynamoDB + +class JournalDynamoDBBuilderSuite extends AnyWordSpec { + + "JournalDynamoDBBuilder" should { + "use default table prefix when not specified" in { + val builder = JournalDynamoDB.builder + .withRegion("us-east-1") + + assert(builder.isInstanceOf[JournalDynamoDB.JournalDynamoDBBuilder]) + } + + "allow setting region" in { + val builder = JournalDynamoDB.builder + .withRegion("eu-west-2") + + assert(builder.isInstanceOf[JournalDynamoDB.JournalDynamoDBBuilder]) + } + + "allow setting table ARN" in { + val builder = JournalDynamoDB.builder + .withRegion("us-west-1") + .withTableArn("arn:aws:dynamodb:us-west-1:111222333444:table/") + + assert(builder.isInstanceOf[JournalDynamoDB.JournalDynamoDBBuilder]) + } + + "allow setting table prefix" in { + val builder = JournalDynamoDB.builder + .withRegion("ap-south-1") + .withTablePrefix("qa_pramen") + + assert(builder.isInstanceOf[JournalDynamoDB.JournalDynamoDBBuilder]) + } + + "allow setting credentials provider" in { + val credentials = AwsBasicCredentials.create("myAccessKey", "mySecretKey") + val credentialsProvider = StaticCredentialsProvider.create(credentials) + + val builder = JournalDynamoDB.builder + .withRegion("us-east-2") + .withCredentialsProvider(credentialsProvider) + + assert(builder.isInstanceOf[JournalDynamoDB.JournalDynamoDBBuilder]) + } + + "allow setting endpoint for local development" in { + val builder = JournalDynamoDB.builder + .withRegion("local") + .withEndpoint("http://localhost:8000") + + assert(builder.isInstanceOf[JournalDynamoDB.JournalDynamoDBBuilder]) + } + + "support fluent API chaining" in { + val credentials = AwsBasicCredentials.create("testKey", "testSecret") + val credentialsProvider = StaticCredentialsProvider.create(credentials) + + val builder = JournalDynamoDB.builder + .withRegion("ca-central-1") + .withTableArn("arn:aws:dynamodb:ca-central-1:555666777888:table/") + .withTablePrefix("prod_journal") + .withCredentialsProvider(credentialsProvider) + .withEndpoint("http://dynamodb-local:8000") + + assert(builder.isInstanceOf[JournalDynamoDB.JournalDynamoDBBuilder]) + } + + "throw IllegalArgumentException when region is not set" in { + val builder = JournalDynamoDB.builder + + val ex = intercept[IllegalArgumentException] { + builder.build() + } + + assert(ex.getMessage.contains("Region")) + } + } +} diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/lock/TokenLockFactoryDynamoDbBuilderSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/lock/TokenLockFactoryDynamoDbBuilderSuite.scala new file mode 100644 index 00000000..a653303b --- /dev/null +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/lock/TokenLockFactoryDynamoDbBuilderSuite.scala @@ -0,0 +1,99 @@ +/* + * Copyright 2022 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.pramen.core.tests.lock + +import org.scalatest.wordspec.AnyWordSpec +import software.amazon.awssdk.auth.credentials.{AwsBasicCredentials, StaticCredentialsProvider} +import za.co.absa.pramen.core.lock.TokenLockFactoryDynamoDb + +class TokenLockFactoryDynamoDbBuilderSuite extends AnyWordSpec { + + "TokenLockFactoryDynamoDbBuilder" should { + "use default table prefix when not set" in { + val builder = TokenLockFactoryDynamoDb.builder + .withRegion("us-east-1") + + assert(builder.isInstanceOf[TokenLockFactoryDynamoDb.TokenLockFactoryDynamoDbBuilder]) + } + + "allow setting region" in { + val builder = TokenLockFactoryDynamoDb.builder + .withRegion("eu-west-3") + + assert(builder.isInstanceOf[TokenLockFactoryDynamoDb.TokenLockFactoryDynamoDbBuilder]) + } + + "allow setting table ARN" in { + val builder = TokenLockFactoryDynamoDb.builder + .withRegion("ap-east-1") + .withTableArn("arn:aws:dynamodb:ap-east-1:999888777666:table/") + + assert(builder.isInstanceOf[TokenLockFactoryDynamoDb.TokenLockFactoryDynamoDbBuilder]) + } + + "allow setting table prefix" in { + val builder = TokenLockFactoryDynamoDb.builder + .withRegion("sa-east-1") + .withTablePrefix("lock_pramen") + + assert(builder.isInstanceOf[TokenLockFactoryDynamoDb.TokenLockFactoryDynamoDbBuilder]) + } + + "allow setting credentials provider" in { + val credentials = AwsBasicCredentials.create("lockKey", "lockSecret") + val credentialsProvider = StaticCredentialsProvider.create(credentials) + + val builder = TokenLockFactoryDynamoDb.builder + .withRegion("us-west-1") + .withCredentialsProvider(credentialsProvider) + + assert(builder.isInstanceOf[TokenLockFactoryDynamoDb.TokenLockFactoryDynamoDbBuilder]) + } + + "allow setting endpoint for testing" in { + val builder = TokenLockFactoryDynamoDb.builder + .withRegion("local") + .withEndpoint("http://dynamodb.local:8888") + + assert(builder.isInstanceOf[TokenLockFactoryDynamoDb.TokenLockFactoryDynamoDbBuilder]) + } + + "support full fluent API" in { + val credentials = AwsBasicCredentials.create("fluentKey", "fluentSecret") + val credentialsProvider = StaticCredentialsProvider.create(credentials) + + val builder = TokenLockFactoryDynamoDb.builder + .withRegion("cn-north-1") + .withTableArn("arn:aws-cn:dynamodb:cn-north-1:123456789012:table/") + .withTablePrefix("distributed_locks") + .withCredentialsProvider(credentialsProvider) + .withEndpoint("http://private-dynamodb:8000") + + assert(builder.isInstanceOf[TokenLockFactoryDynamoDb.TokenLockFactoryDynamoDbBuilder]) + } + + "throw IllegalArgumentException when region is not provided" in { + val builder = TokenLockFactoryDynamoDb.builder + + val ex = intercept[IllegalArgumentException] { + builder.build() + } + + assert(ex.getMessage.contains("Region")) + } + } +} diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/metadata/MetadataManagerDynamoDbBuilderSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/metadata/MetadataManagerDynamoDbBuilderSuite.scala new file mode 100644 index 00000000..84e29473 --- /dev/null +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/metadata/MetadataManagerDynamoDbBuilderSuite.scala @@ -0,0 +1,99 @@ +/* + * Copyright 2022 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.pramen.core.tests.metadata + +import org.scalatest.wordspec.AnyWordSpec +import software.amazon.awssdk.auth.credentials.{AwsBasicCredentials, StaticCredentialsProvider} +import za.co.absa.pramen.core.metadata.MetadataManagerDynamoDb + +class MetadataManagerDynamoDbBuilderSuite extends AnyWordSpec { + + "MetadataManagerDynamoDbBuilder" should { + "use default table prefix" in { + val builder = MetadataManagerDynamoDb.builder + .withRegion("us-east-1") + + assert(builder.isInstanceOf[MetadataManagerDynamoDb.MetadataManagerDynamoDbBuilder]) + } + + "allow setting region" in { + val builder = MetadataManagerDynamoDb.builder + .withRegion("eu-north-1") + + assert(builder.isInstanceOf[MetadataManagerDynamoDb.MetadataManagerDynamoDbBuilder]) + } + + "allow setting table ARN" in { + val builder = MetadataManagerDynamoDb.builder + .withRegion("ap-southeast-1") + .withTableArn("arn:aws:dynamodb:ap-southeast-1:123123123123:table/") + + assert(builder.isInstanceOf[MetadataManagerDynamoDb.MetadataManagerDynamoDbBuilder]) + } + + "allow setting table prefix" in { + val builder = MetadataManagerDynamoDb.builder + .withRegion("me-south-1") + .withTablePrefix("test_metadata") + + assert(builder.isInstanceOf[MetadataManagerDynamoDb.MetadataManagerDynamoDbBuilder]) + } + + "allow setting credentials provider" in { + val credentials = AwsBasicCredentials.create("access", "secret") + val credentialsProvider = StaticCredentialsProvider.create(credentials) + + val builder = MetadataManagerDynamoDb.builder + .withRegion("af-south-1") + .withCredentialsProvider(credentialsProvider) + + assert(builder.isInstanceOf[MetadataManagerDynamoDb.MetadataManagerDynamoDbBuilder]) + } + + "allow setting endpoint" in { + val builder = MetadataManagerDynamoDb.builder + .withRegion("us-west-2") + .withEndpoint("http://localstack:4566") + + assert(builder.isInstanceOf[MetadataManagerDynamoDb.MetadataManagerDynamoDbBuilder]) + } + + "support complete fluent API chain" in { + val credentials = AwsBasicCredentials.create("myKey", "mySecret") + val credentialsProvider = StaticCredentialsProvider.create(credentials) + + val builder = MetadataManagerDynamoDb.builder + .withRegion("ap-northeast-2") + .withTableArn("arn:aws:dynamodb:ap-northeast-2:444333222111:table/") + .withTablePrefix("metadata_manager") + .withCredentialsProvider(credentialsProvider) + .withEndpoint("http://custom-endpoint:9000") + + assert(builder.isInstanceOf[MetadataManagerDynamoDb.MetadataManagerDynamoDbBuilder]) + } + + "throw IllegalArgumentException when region is missing" in { + val builder = MetadataManagerDynamoDb.builder + + val ex = intercept[IllegalArgumentException] { + builder.build() + } + + assert(ex.getMessage.contains("Region")) + } + } +} From d8797d951435b81ae5b5d8efba9268e5de8b5684 Mon Sep 17 00:00:00 2001 From: Ruslan Iushchenko Date: Fri, 3 Apr 2026 12:16:00 +0200 Subject: [PATCH 07/13] #181 Change DynamoDB bookkeeping table schema to use composite sort key for multiple entries per table and date --- pramen/core/pom.xml | 1 + .../core/bookkeeper/BookkeeperDynamoDb.scala | 109 +++++++++++------- .../core/bookkeeper/OffsetManager.scala | 2 +- .../core/bookkeeper/OffsetManagerCached.scala | 20 ++-- .../bookkeeper/OffsetManagerDynamoDb.scala | 31 +++-- .../core/bookkeeper/OffsetManagerJdbc.scala | 3 + .../BookkeeperDynamoDbBuilderSuite.scala | 8 +- 7 files changed, 115 insertions(+), 59 deletions(-) diff --git a/pramen/core/pom.xml b/pramen/core/pom.xml index 2db0d900..54011be0 100644 --- a/pramen/core/pom.xml +++ b/pramen/core/pom.xml @@ -148,6 +148,7 @@ software.amazon.awssdk dynamodb + provided diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala index 17fb6834..23a6c6b3 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala @@ -36,7 +36,8 @@ import scala.util.control.NonFatal * * Table schema for bookkeeping: * - Partition key: tableName (String) - * - Sort key: infoDate (String in yyyy-MM-dd format) + * - Sort key: infoDateSortKey (String in "yyyy-MM-dd#jobFinishedMillis" format) + * The composite sort key allows multiple entries for the same table and date. * * Table schema for schemas: * - Partition key: tableName (String) @@ -86,21 +87,21 @@ class BookkeeperDynamoDb private ( log.info(s"Initializing DynamoDB bookkeeper with tables: bookkeeping='$bookkeepingTableName', schemas='$schemaTableName'") // Initialize bookkeeping table - if (!tableExists(bookkeepingTableBaseName)) { - log.info(s"Creating DynamoDB bookkeeping table: $bookkeepingTableBaseName") - createBookkeepingTable(bookkeepingTableBaseName) - log.info(s"Successfully created bookkeeping table: $bookkeepingTableBaseName") + if (!tableExists(bookkeepingTableName)) { + log.info(s"Creating DynamoDB bookkeeping table: $bookkeepingTableName") + createBookkeepingTable(bookkeepingTableName) + log.info(s"Successfully created bookkeeping table: $bookkeepingTableName") } else { - log.info(s"DynamoDB bookkeeping table already exists: $bookkeepingTableBaseName") + log.info(s"DynamoDB bookkeeping table already exists: $bookkeepingTableName") } // Initialize schema table - if (!tableExists(schemaTableBaseName)) { - log.info(s"Creating DynamoDB schema table: $schemaTableBaseName") - createSchemaTable(schemaTableBaseName) - log.info(s"Successfully created schema table: $schemaTableBaseName") + if (!tableExists(schemaTableName)) { + log.info(s"Creating DynamoDB schema table: $schemaTableName") + createSchemaTable(schemaTableName) + log.info(s"Successfully created schema table: $schemaTableName") } else { - log.info(s"DynamoDB schema table already exists: $schemaTableBaseName") + log.info(s"DynamoDB schema table already exists: $schemaTableName") } log.info(s"DynamoDB bookkeeper initialization complete") @@ -135,6 +136,7 @@ class BookkeeperDynamoDb private ( /** * Creates the bookkeeping table with the appropriate schema. + * Uses a composite sort key (infoDate#jobFinished) to allow multiple entries per table and date. * * @param tableName The name of the table to create */ @@ -147,7 +149,7 @@ class BookkeeperDynamoDb private ( .keyType(KeyType.HASH) .build(), KeySchemaElement.builder() - .attributeName(ATTR_INFO_DATE) + .attributeName(ATTR_INFO_DATE_SORT_KEY) .keyType(KeyType.RANGE) .build() ) @@ -157,7 +159,7 @@ class BookkeeperDynamoDb private ( .attributeType(ScalarAttributeType.S) .build(), AttributeDefinition.builder() - .attributeName(ATTR_INFO_DATE) + .attributeName(ATTR_INFO_DATE_SORT_KEY) .attributeType(ScalarAttributeType.S) .build() ) @@ -259,11 +261,13 @@ class BookkeeperDynamoDb private ( val query = until match { case Some(endDate) => val endDateStr = getDateStr(endDate) + // Query using prefix on the sort key since we need items with infoDate <= endDate queryBuilder - .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName AND $ATTR_INFO_DATE <= :endDate") + .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName AND $ATTR_INFO_DATE_SORT_KEY <= :endDateMax") .expressionAttributeValues(Map( ":tableName" -> AttributeValue.builder().s(table).build(), - ":endDate" -> AttributeValue.builder().s(endDateStr).build() + // Use max possible value after date to get all entries for that date and before + ":endDateMax" -> AttributeValue.builder().s(s"${endDateStr}#~").build() ).asJava) case None => queryBuilder @@ -294,11 +298,12 @@ class BookkeeperDynamoDb private ( val queryRequest = QueryRequest.builder() .tableName(bookkeepingTableName) - .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName AND $ATTR_INFO_DATE = :infoDate") + .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName AND begins_with($ATTR_INFO_DATE_SORT_KEY, :infoDatePrefix)") .expressionAttributeValues(Map( ":tableName" -> AttributeValue.builder().s(table).build(), - ":infoDate" -> AttributeValue.builder().s(dateStr).build() + ":infoDatePrefix" -> AttributeValue.builder().s(s"$dateStr#").build() ).asJava) + .scanIndexForward(false) // descending order by sort key (latest jobFinished first) .build() val response = dynamoDbClient.query(queryRequest) @@ -307,10 +312,9 @@ class BookkeeperDynamoDb private ( if (items.isEmpty) { None } else { - // Sort by jobFinished descending and take the first + // Take the first item (already sorted in descending order) items .map(itemToDataChunk) - .sortBy(-_.jobFinished) .headOption } } catch { @@ -326,10 +330,10 @@ class BookkeeperDynamoDb private ( val queryBuilder = QueryRequest.builder() .tableName(bookkeepingTableName) - .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName AND $ATTR_INFO_DATE = :infoDate") + .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName AND begins_with($ATTR_INFO_DATE_SORT_KEY, :infoDatePrefix)") .expressionAttributeValues(Map( ":tableName" -> AttributeValue.builder().s(table).build(), - ":infoDate" -> AttributeValue.builder().s(dateStr).build() + ":infoDatePrefix" -> AttributeValue.builder().s(s"$dateStr#").build() ).asJava) val query = batchIdFilter match { @@ -338,7 +342,7 @@ class BookkeeperDynamoDb private ( .filterExpression(s"$ATTR_BATCH_ID = :batchId") .expressionAttributeValues(Map( ":tableName" -> AttributeValue.builder().s(table).build(), - ":infoDate" -> AttributeValue.builder().s(dateStr).build(), + ":infoDatePrefix" -> AttributeValue.builder().s(s"$dateStr#").build(), ":batchId" -> AttributeValue.builder().n(bId.toString).build() ).asJava) case None => @@ -417,8 +421,11 @@ class BookkeeperDynamoDb private ( ): Unit = { try { val dateStr = getDateStr(infoDate) + val sortKey = buildSortKey(dateStr, jobFinished) + val item = dataChunkToItem( - DataChunk(table, dateStr, dateStr, dateStr, inputRecordCount, outputRecordCount, jobStarted, jobFinished, Some(batchId), recordsAppended) + DataChunk(table, dateStr, dateStr, dateStr, inputRecordCount, outputRecordCount, jobStarted, jobFinished, Some(batchId), recordsAppended), + sortKey ) val putRequest = PutItemRequest.builder() @@ -427,7 +434,7 @@ class BookkeeperDynamoDb private ( .build() dynamoDbClient.putItem(putRequest) - log.debug(s"Saved bookkeeping record for table '$table', infoDate='$dateStr', batchId=$batchId") + log.debug(s"Saved bookkeeping record for table '$table', infoDate='$dateStr', sortKey='$sortKey', batchId=$batchId") } catch { case NonFatal(ex) => log.error(s"Error saving record count for table '$table' at $infoDate", ex) @@ -443,10 +450,10 @@ class BookkeeperDynamoDb private ( // Query all items for this table and date val queryRequest = QueryRequest.builder() .tableName(bookkeepingTableName) - .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName AND $ATTR_INFO_DATE = :infoDate") + .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName AND begins_with($ATTR_INFO_DATE_SORT_KEY, :infoDatePrefix)") .expressionAttributeValues(Map( ":tableName" -> AttributeValue.builder().s(table).build(), - ":infoDate" -> AttributeValue.builder().s(dateStr).build() + ":infoDatePrefix" -> AttributeValue.builder().s(s"$dateStr#").build() ).asJava) .build() @@ -460,11 +467,12 @@ class BookkeeperDynamoDb private ( ) if (itemBatchId.exists(_ != batchId)) { + val sortKey = item.get(ATTR_INFO_DATE_SORT_KEY).s() val deleteRequest = DeleteItemRequest.builder() .tableName(bookkeepingTableName) .key(Map( ATTR_TABLE_NAME -> AttributeValue.builder().s(table).build(), - ATTR_INFO_DATE -> AttributeValue.builder().s(dateStr).build() + ATTR_INFO_DATE_SORT_KEY -> AttributeValue.builder().s(sortKey).build() ).asJava) .conditionExpression(s"$ATTR_JOB_FINISHED = :jobFinished") .expressionAttributeValues(Map( @@ -477,7 +485,7 @@ class BookkeeperDynamoDb private ( } catch { case _: ConditionalCheckFailedException => // Item was already modified or deleted, ignore - log.debug(s"Could not delete item for table '$table', date '$dateStr' - already modified") + log.info(s"Could not delete item for table '$table', sortKey '$sortKey' - already modified") } } } @@ -583,9 +591,10 @@ class BookkeeperDynamoDb private ( ) } - private def dataChunkToItem(chunk: DataChunk): java.util.Map[String, AttributeValue] = { + private def dataChunkToItem(chunk: DataChunk, sortKey: String): java.util.Map[String, AttributeValue] = { val baseMap = Map( ATTR_TABLE_NAME -> AttributeValue.builder().s(chunk.tableName).build(), + ATTR_INFO_DATE_SORT_KEY -> AttributeValue.builder().s(sortKey).build(), ATTR_INFO_DATE -> AttributeValue.builder().s(chunk.infoDate).build(), ATTR_INFO_DATE_BEGIN -> AttributeValue.builder().s(chunk.infoDateBegin).build(), ATTR_INFO_DATE_END -> AttributeValue.builder().s(chunk.infoDateEnd).build(), @@ -621,27 +630,27 @@ class BookkeeperDynamoDb private ( val beginStr = getDateStr(begin) val endStr = getDateStr(end) builder - .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName AND $ATTR_INFO_DATE BETWEEN :beginDate AND :endDate") + .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName AND $ATTR_INFO_DATE_SORT_KEY BETWEEN :beginDate AND :endDateMax") .expressionAttributeValues(Map( ":tableName" -> AttributeValue.builder().s(table).build(), - ":beginDate" -> AttributeValue.builder().s(beginStr).build(), - ":endDate" -> AttributeValue.builder().s(endStr).build() + ":beginDate" -> AttributeValue.builder().s(s"$beginStr#").build(), + ":endDateMax" -> AttributeValue.builder().s(s"$endStr#~").build() ).asJava) case (Some(begin), None) => val beginStr = getDateStr(begin) builder - .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName AND $ATTR_INFO_DATE >= :beginDate") + .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName AND $ATTR_INFO_DATE_SORT_KEY >= :beginDate") .expressionAttributeValues(Map( ":tableName" -> AttributeValue.builder().s(table).build(), - ":beginDate" -> AttributeValue.builder().s(beginStr).build() + ":beginDate" -> AttributeValue.builder().s(s"$beginStr#").build() ).asJava) case (None, Some(end)) => val endStr = getDateStr(end) builder - .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName AND $ATTR_INFO_DATE <= :endDate") + .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName AND $ATTR_INFO_DATE_SORT_KEY <= :endDateMax") .expressionAttributeValues(Map( ":tableName" -> AttributeValue.builder().s(table).build(), - ":endDate" -> AttributeValue.builder().s(endStr).build() + ":endDateMax" -> AttributeValue.builder().s(s"$endStr#~").build() ).asJava) case (None, None) => builder @@ -670,6 +679,27 @@ class BookkeeperDynamoDb private ( chunks.toSeq } + + /** + * Builds the composite sort key for bookkeeping table: "infoDate#jobFinished" + * + * @param infoDate The information date + * @param jobFinished The job finished timestamp in milliseconds + * @return Composite sort key string + */ + private def buildSortKey(infoDate: String, jobFinished: Long): String = { + s"$infoDate#$jobFinished" + } + + /** + * Extracts the infoDate from a composite sort key. + * + * @param sortKey Composite sort key in format "infoDate#jobFinished" + * @return The infoDate portion + */ + private def extractInfoDate(sortKey: String): String = { + sortKey.split("#").headOption.getOrElse(sortKey) + } } object BookkeeperDynamoDb { @@ -680,6 +710,7 @@ object BookkeeperDynamoDb { // Attribute names for bookkeeping table val ATTR_TABLE_NAME = "tableName" val ATTR_INFO_DATE = "infoDate" + val ATTR_INFO_DATE_SORT_KEY = "infoDateSortKey" // Composite: "infoDate#jobFinished" val ATTR_INFO_DATE_BEGIN = "infoDateBegin" val ATTR_INFO_DATE_END = "infoDateEnd" val ATTR_INPUT_RECORD_COUNT = "inputRecordCount" @@ -839,10 +870,10 @@ object BookkeeperDynamoDb { tableArn match { case Some(arn) if arn.nonEmpty => // If ARN ends with table/, append the table name, otherwise append /table/tableName - if (arn.endsWith("/")) { + if (arn.endsWith("table/")) { + s"$arn$tableName" + } else if (arn.endsWith("/")) { s"${arn}table/$tableName" - } else if (arn.contains("/table/")) { - arn // ARN already includes table path } else { s"$arn/table/$tableName" } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManager.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManager.scala index 7ce0255d..fe20eb4d 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManager.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManager.scala @@ -89,5 +89,5 @@ trait OffsetManager extends AutoCloseable { */ def rollbackOffsets(request: DataOffsetRequest): Unit - override def close(): Unit = {} + override def close(): Unit } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerCached.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerCached.scala index bf85c594..52f411ab 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerCached.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerCached.scala @@ -31,15 +31,15 @@ class OffsetManagerCached(offsetManager: OffsetManager) extends OffsetManager { private val log = LoggerFactory.getLogger(this.getClass) private val aggregatedOffsetsCache = new mutable.HashMap[(String, Option[LocalDate]), Option[DataOffsetAggregated]] - def getOffsets(table: String, infoDate: LocalDate): Array[DataOffset] = { + override def getOffsets(table: String, infoDate: LocalDate): Array[DataOffset] = { offsetManager.getOffsets(table, infoDate) } - def getUncommittedOffsets(table: String, onlyForInfoDate: Option[LocalDate]): Array[UncommittedOffset] = { + override def getUncommittedOffsets(table: String, onlyForInfoDate: Option[LocalDate]): Array[UncommittedOffset] = { offsetManager.getUncommittedOffsets(table, onlyForInfoDate) } - def getMaxInfoDateAndOffset(table: String, onlyForInfoDate: Option[LocalDate]): Option[DataOffsetAggregated] = synchronized { + override def getMaxInfoDateAndOffset(table: String, onlyForInfoDate: Option[LocalDate]): Option[DataOffsetAggregated] = synchronized { val tbl = onlyForInfoDate match { case Some(date) => s"'$table' for '$date'" case None => s"'$table'" @@ -57,11 +57,11 @@ class OffsetManagerCached(offsetManager: OffsetManager) extends OffsetManager { } } - def startWriteOffsets(table: String, infoDate: LocalDate, offsetType: OffsetType): DataOffsetRequest = { + override def startWriteOffsets(table: String, infoDate: LocalDate, offsetType: OffsetType): DataOffsetRequest = { offsetManager.startWriteOffsets(table, infoDate, offsetType) } - def commitOffsets(request: DataOffsetRequest, minOffset: OffsetValue, maxOffset: OffsetValue): Unit = { + override def commitOffsets(request: DataOffsetRequest, minOffset: OffsetValue, maxOffset: OffsetValue): Unit = { offsetManager.commitOffsets(request, minOffset, maxOffset) this.synchronized { @@ -69,7 +69,7 @@ class OffsetManagerCached(offsetManager: OffsetManager) extends OffsetManager { } } - def commitRerun(request: DataOffsetRequest, minOffset: OffsetValue, maxOffset: OffsetValue): Unit = { + override def commitRerun(request: DataOffsetRequest, minOffset: OffsetValue, maxOffset: OffsetValue): Unit = { this.synchronized { aggregatedOffsetsCache --= aggregatedOffsetsCache.keys.filter(_._1 == request.tableName) } @@ -77,7 +77,7 @@ class OffsetManagerCached(offsetManager: OffsetManager) extends OffsetManager { offsetManager.commitRerun(request, minOffset, maxOffset) } - def postCommittedRecords(commitRequests: Seq[OffsetCommitRequest]): Unit = { + override def postCommittedRecords(commitRequests: Seq[OffsetCommitRequest]): Unit = { offsetManager.postCommittedRecords(commitRequests) val updatedTables = commitRequests.map(_.table).toSet @@ -86,10 +86,14 @@ class OffsetManagerCached(offsetManager: OffsetManager) extends OffsetManager { } } - def rollbackOffsets(request: DataOffsetRequest): Unit = { + override def rollbackOffsets(request: DataOffsetRequest): Unit = { offsetManager.rollbackOffsets(request) } + override def close(): Unit = { + offsetManager.close() + } + private def renderAggregatedOptionalOffset(offsetsOpt: Option[DataOffsetAggregated]): String = { offsetsOpt match { case Some(offsets) => diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerDynamoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerDynamoDb.scala index 4d6f6a08..e633ee23 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerDynamoDb.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerDynamoDb.scala @@ -86,18 +86,29 @@ class OffsetManagerDynamoDb( .map(record => OffsetRecordConverter.toDataOffset(record).asInstanceOf[UncommittedOffset]) case None => - // Scan all offsets for this table (no info date filter) - val scanRequest = ScanRequest.builder() - .tableName(offsetTableFullName) - .filterExpression(s"${ATTR_PRAMEN_TABLE_NAME} = :table_name AND attribute_not_exists(${ATTR_COMMITTED_AT})") - .expressionAttributeValues(Map( - ":table_name" -> AttributeValue.builder().s(table).build() - ).asJava) - .build() + // Query all offsets for this table with pagination + var allItems = Seq.empty[java.util.Map[String, AttributeValue]] + var lastEvaluatedKey: java.util.Map[String, AttributeValue] = null + + do { + val queryRequestBuilder = QueryRequest.builder() + .tableName(offsetTableFullName) + .keyConditionExpression(s"$ATTR_PRAMEN_TABLE_NAME = :table_name") + .filterExpression(s"attribute_not_exists($ATTR_COMMITTED_AT)") + .expressionAttributeValues(Map( + ":table_name" -> AttributeValue.builder().s(table).build() + ).asJava) + + if (lastEvaluatedKey != null) { + queryRequestBuilder.exclusiveStartKey(lastEvaluatedKey) + } - val result = dynamoDbClient.scan(scanRequest) + val result = dynamoDbClient.query(queryRequestBuilder.build()) + allItems = allItems ++ result.items().asScala + lastEvaluatedKey = result.lastEvaluatedKey() + } while (lastEvaluatedKey != null && !lastEvaluatedKey.isEmpty) - result.items().asScala + allItems .map(itemToOffsetRecord) .map(record => OffsetRecordConverter.toDataOffset(record).asInstanceOf[UncommittedOffset]) .toArray diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerJdbc.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerJdbc.scala index 368ab026..b1e16486 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerJdbc.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerJdbc.scala @@ -150,6 +150,9 @@ class OffsetManagerJdbc(db: Database, slickProfile: JdbcProfile, offsetTable: Of ).execute() } + /** This class does not own the database connection. It is responsibility of the DB connection owner to close it. */ + override def close(): Unit = {} + private[core] def getMaximumInfoDate(table: String): Option[LocalDate] = { val query = offsetTable.records .filter(r => r.pramenTableName === table) diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperDynamoDbBuilderSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperDynamoDbBuilderSuite.scala index 612592e7..d0dbdf0e 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperDynamoDbBuilderSuite.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperDynamoDbBuilderSuite.scala @@ -133,10 +133,16 @@ class BookkeeperDynamoDbBuilderSuite extends AnyWordSpec { assert(result == "test_table") } + "return ARN with some prefix ends with slash" in { + val arn = "arn:aws:dynamodb:us-east-1:123456789012:path/" + val result = BookkeeperDynamoDb.getFullTableName(Some(arn), "test_table") + assert(result == s"${arn}table/test_table") + } + "return ARN with /table/ prefix when ARN ends with slash" in { val arn = "arn:aws:dynamodb:us-east-1:123456789012:table/" val result = BookkeeperDynamoDb.getFullTableName(Some(arn), "test_table") - assert(result == s"${arn}table/test_table") + assert(result == s"${arn}test_table") } "handle ARN without trailing slash by adding /table/" in { From d46493478e1a95fdb8187dc11961ab8b64038d33 Mon Sep 17 00:00:00 2001 From: Ruslan Iushchenko Date: Tue, 7 Apr 2026 11:26:06 +0200 Subject: [PATCH 08/13] #181 Fix PR suggestions and add support for bookkeeping table-based record deletion from DynamoDB (Thanks @coderabbitai). --- .../core/bookkeeper/BookkeeperDynamoDb.scala | 129 +++++++++++++++++- .../bookkeeper/OffsetManagerDynamoDb.scala | 51 +++++++ .../core/lock/TokenLockFactoryDynamoDb.scala | 12 +- .../examples/dynamodb_bookkeeping/README.md | 1 - .../dynamodb_with_locks.conf | 5 +- 5 files changed, 186 insertions(+), 12 deletions(-) diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala index 23a6c6b3..f26e7d40 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala @@ -500,10 +500,131 @@ class BookkeeperDynamoDb private ( } } - override def deleteTable(tableWithWildcard: String): Seq[String] = { - // DynamoDB implementation for wildcard deletion - // This would require scanning and deleting matching items - throw new UnsupportedOperationException("deleteTable with wildcards is not yet implemented for DynamoDB bookkeeper") + override def deleteTable(tableName: String): Seq[String] = { + try { + val results = scala.collection.mutable.ListBuffer[String]() + + // Delete from bookkeeping table + val bookkeepingCount = deleteTableFromBookkeeping(tableName) + results += s"Deleted $bookkeepingCount bookkeeping records for table '$tableName'" + + // Delete from schema table + val schemaCount = deleteTableFromSchemas(tableName) + results += s"Deleted $schemaCount schema records for table '$tableName'" + + // Delete offsets + val offsetResults = OffsetManagerDynamoDb.deleteAllOffsets(tableName, dynamoDbClient) + results += s"Deleted $offsetResults offset records for table '$tableName'" + + log.info(s"Successfully deleted all records for table '$tableName'") + results.toSeq + } catch { + case NonFatal(ex) => + log.error(s"Error deleting table '$tableName'", ex) + throw ex + } + } + + /** + * Deletes all bookkeeping records for the specified table. + * + * @param tableName The name of the table to delete + * @return The number of records deleted + */ + private def deleteTableFromBookkeeping(tableName: String): Int = { + var deletedCount = 0 + var lastEvaluatedKey: java.util.Map[String, AttributeValue] = null + + do { + val queryBuilder = QueryRequest.builder() + .tableName(bookkeepingTableName) + .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName") + .expressionAttributeValues(Map( + ":tableName" -> AttributeValue.builder().s(tableName).build() + ).asJava) + + if (lastEvaluatedKey != null) { + queryBuilder.exclusiveStartKey(lastEvaluatedKey) + } + + val response = dynamoDbClient.query(queryBuilder.build()) + val items = response.items().asScala + + // Delete each item + items.foreach { item => + val sortKey = item.get(ATTR_INFO_DATE_SORT_KEY).s() + val deleteRequest = DeleteItemRequest.builder() + .tableName(bookkeepingTableName) + .key(Map( + ATTR_TABLE_NAME -> AttributeValue.builder().s(tableName).build(), + ATTR_INFO_DATE_SORT_KEY -> AttributeValue.builder().s(sortKey).build() + ).asJava) + .build() + + try { + dynamoDbClient.deleteItem(deleteRequest) + deletedCount += 1 + } catch { + case NonFatal(ex) => + log.warn(s"Failed to delete bookkeeping item for table '$tableName', sortKey '$sortKey'", ex) + } + } + + lastEvaluatedKey = response.lastEvaluatedKey() + } while (lastEvaluatedKey != null && !lastEvaluatedKey.isEmpty) + + deletedCount + } + + /** + * Deletes all schema records for the specified table. + * + * @param tableName The name of the table to delete + * @return The number of records deleted + */ + private def deleteTableFromSchemas(tableName: String): Int = { + var deletedCount = 0 + var lastEvaluatedKey: java.util.Map[String, AttributeValue] = null + + do { + val queryBuilder = QueryRequest.builder() + .tableName(schemaTableName) + .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName") + .expressionAttributeValues(Map( + ":tableName" -> AttributeValue.builder().s(tableName).build() + ).asJava) + + if (lastEvaluatedKey != null) { + queryBuilder.exclusiveStartKey(lastEvaluatedKey) + } + + val response = dynamoDbClient.query(queryBuilder.build()) + val items = response.items().asScala + + // Delete each item + items.foreach { item => + val infoDate = item.get(ATTR_INFO_DATE).s() + val deleteRequest = DeleteItemRequest.builder() + .tableName(schemaTableName) + .key(Map( + ATTR_TABLE_NAME -> AttributeValue.builder().s(tableName).build(), + ATTR_INFO_DATE -> AttributeValue.builder().s(infoDate).build() + ).asJava) + .build() + + try { + dynamoDbClient.deleteItem(deleteRequest) + deletedCount += 1 + } catch { + case NonFatal(ex) => + log.warn(s"Failed to delete schema item for table '$tableName', infoDate '$infoDate'", ex) + } + } + + lastEvaluatedKey = response.lastEvaluatedKey() + } while (lastEvaluatedKey != null && !lastEvaluatedKey.isEmpty) + + deletedCount } override def getLatestSchema(tableName: String, until: LocalDate): Option[(StructType, LocalDate)] = { diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerDynamoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerDynamoDb.scala index e633ee23..7ab2b2dd 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerDynamoDb.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerDynamoDb.scala @@ -605,4 +605,55 @@ object OffsetManagerDynamoDb { } def builder: OffsetManagerDynamoDbBuilder = new OffsetManagerDynamoDbBuilder + + /** Deletes all offsets for a given table. */ + def deleteAllOffsets(tableName: String, dynamoDbClient: DynamoDbClient): Int = { + val log = LoggerFactory.getLogger(this.getClass) + val offsetTableBaseName = s"${DEFAULT_TABLE_PREFIX}_${DEFAULT_OFFSET_TABLE}" + val offsetTableFullName = BookkeeperDynamoDb.getFullTableName(None, offsetTableBaseName) + + try { + var allItems = Seq.empty[java.util.Map[String, AttributeValue]] + var lastEvaluatedKey: java.util.Map[String, AttributeValue] = null + + // Query all offsets for the table with pagination + do { + val queryRequestBuilder = QueryRequest.builder() + .tableName(offsetTableFullName) + .keyConditionExpression(s"$ATTR_PRAMEN_TABLE_NAME = :table_name") + .expressionAttributeValues(Map( + ":table_name" -> AttributeValue.builder().s(tableName).build() + ).asJava) + + if (lastEvaluatedKey != null) { + queryRequestBuilder.exclusiveStartKey(lastEvaluatedKey) + } + + val result = dynamoDbClient.query(queryRequestBuilder.build()) + allItems = allItems ++ result.items().asScala + lastEvaluatedKey = result.lastEvaluatedKey() + } while (lastEvaluatedKey != null && !lastEvaluatedKey.isEmpty) + + // Delete each item + allItems.foreach { item => + val deleteRequest = DeleteItemRequest.builder() + .tableName(offsetTableFullName) + .key(Map( + ATTR_PRAMEN_TABLE_NAME -> item.get(ATTR_PRAMEN_TABLE_NAME), + ATTR_COMPOSITE_KEY -> item.get(ATTR_COMPOSITE_KEY) + ).asJava) + .build() + + dynamoDbClient.deleteItem(deleteRequest) + } + + val deletedCount = allItems.size + log.info(s"Deleted $deletedCount offset records for table '$tableName'") + deletedCount + } catch { + case NonFatal(ex) => + log.error(s"Error deleting offsets for table '$tableName' from '$offsetTableFullName'", ex) + throw new RuntimeException(s"Unable to delete offsets for table '$tableName' from '$offsetTableFullName'", ex) + } + } } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/lock/TokenLockFactoryDynamoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/lock/TokenLockFactoryDynamoDb.scala index 8162413e..1ca2564f 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/lock/TokenLockFactoryDynamoDb.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/lock/TokenLockFactoryDynamoDb.scala @@ -56,7 +56,7 @@ class TokenLockFactoryDynamoDb( init() override def getLock(token: String): TokenLock = { - new TokenLockDynamoDb(token, dynamoDbClient, locksTableBaseName) + new TokenLockDynamoDb(token, dynamoDbClient, locksTableName) } /** @@ -80,12 +80,12 @@ class TokenLockFactoryDynamoDb( try { log.info(s"Initializing DynamoDB lock factory with table: '$locksTableName'") - if (!tableExists(locksTableBaseName)) { - log.info(s"Creating DynamoDB locks table: $locksTableBaseName") - createLocksTable(locksTableBaseName) - log.info(s"Successfully created locks table: $locksTableBaseName") + if (!tableExists(locksTableName)) { + log.info(s"Creating DynamoDB locks table: $locksTableName") + createLocksTable(locksTableName) + log.info(s"Successfully created locks table: $locksTableName") } else { - log.info(s"DynamoDB locks table already exists: $locksTableBaseName") + log.info(s"DynamoDB locks table already exists: $locksTableName") } log.info(s"DynamoDB lock factory initialization complete") diff --git a/pramen/examples/dynamodb_bookkeeping/README.md b/pramen/examples/dynamodb_bookkeeping/README.md index c49195a6..4e0f567c 100644 --- a/pramen/examples/dynamodb_bookkeeping/README.md +++ b/pramen/examples/dynamodb_bookkeeping/README.md @@ -434,7 +434,6 @@ val testFactory = TokenLockFactoryDynamoDb.builder .build() ``` -See `core/src/main/scala/za/co/absa/pramen/core/lock/TokenLockFactoryDynamoDbExample.scala` for more examples. ### Lock Behavior diff --git a/pramen/examples/dynamodb_bookkeeping/dynamodb_with_locks.conf b/pramen/examples/dynamodb_bookkeeping/dynamodb_with_locks.conf index 4f5140b1..0ec56e07 100644 --- a/pramen/examples/dynamodb_bookkeeping/dynamodb_with_locks.conf +++ b/pramen/examples/dynamodb_bookkeeping/dynamodb_with_locks.conf @@ -125,7 +125,10 @@ pramen { # "Resource": [ # "arn:aws:dynamodb:*:*:table/pramen_production_bookkeeping", # "arn:aws:dynamodb:*:*:table/pramen_production_schemas", -# "arn:aws:dynamodb:*:*:table/pramen_production_locks" +# "arn:aws:dynamodb:*:*:table/pramen_production_locks", +# "arn:aws:dynamodb:*:*:table/pramen_production_journal", +# "arn:aws:dynamodb:*:*:table/pramen_production_metadata", +# "arn:aws:dynamodb:*:*:table/pramen_production_offsets" # ] # } # ] From c06a4420cb4a38e0153c49be774d11f3542b6611 Mon Sep 17 00:00:00 2001 From: Ruslan Iushchenko Date: Tue, 7 Apr 2026 12:57:35 +0200 Subject: [PATCH 09/13] #181 Remove default implementation of AutoCloseable from MetadataManager and OffsetManager traits to ensure the logic is defined by the implementation. --- .../src/main/scala/za/co/absa/pramen/api/MetadataManager.scala | 2 -- .../za/co/absa/pramen/core/bookkeeper/OffsetManager.scala | 2 -- .../za/co/absa/pramen/core/metadata/MetadataManagerJdbc.scala | 3 +++ .../za/co/absa/pramen/core/metadata/MetadataManagerNull.scala | 2 ++ .../absa/pramen/core/mocks/metadata/MetadataManagerSpy.scala | 2 ++ 5 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pramen/api/src/main/scala/za/co/absa/pramen/api/MetadataManager.scala b/pramen/api/src/main/scala/za/co/absa/pramen/api/MetadataManager.scala index 279f7969..333a8c78 100644 --- a/pramen/api/src/main/scala/za/co/absa/pramen/api/MetadataManager.scala +++ b/pramen/api/src/main/scala/za/co/absa/pramen/api/MetadataManager.scala @@ -70,6 +70,4 @@ trait MetadataManager extends AutoCloseable { * Returns false if metadata is available only for the duration of the session. */ def isPersistent: Boolean - - override def close(): Unit = {} } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManager.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManager.scala index fe20eb4d..39addcd2 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManager.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManager.scala @@ -88,6 +88,4 @@ trait OffsetManager extends AutoCloseable { * Rolls back an offset request */ def rollbackOffsets(request: DataOffsetRequest): Unit - - override def close(): Unit } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/metadata/MetadataManagerJdbc.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/metadata/MetadataManagerJdbc.scala index a630784a..64c6263f 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/metadata/MetadataManagerJdbc.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/metadata/MetadataManagerJdbc.scala @@ -102,4 +102,7 @@ class MetadataManagerJdbc(db: Database, slickProfile: JdbcProfile) extends Metad case NonFatal(ex) => throw new RuntimeException(s"Unable to delete from the metadata table.", ex) } } + + /** The implementation does not own DB connections, so it is not responsible for closing them. */ + override def close(): Unit = {} } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/metadata/MetadataManagerNull.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/metadata/MetadataManagerNull.scala index 3b91bd9e..dda48941 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/metadata/MetadataManagerNull.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/metadata/MetadataManagerNull.scala @@ -42,4 +42,6 @@ class MetadataManagerNull(isPersistenceEnabled: Boolean) extends MetadataManager def deleteMetadataFromStorage(tableName: String, infoDate: LocalDate): Unit = { throw new UnsupportedOperationException(errorMessage) } + + override def close(): Unit = {} } diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/metadata/MetadataManagerSpy.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/metadata/MetadataManagerSpy.scala index 997db6c6..812ba06c 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/metadata/MetadataManagerSpy.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/metadata/MetadataManagerSpy.scala @@ -65,4 +65,6 @@ class MetadataManagerSpy(isPersistent: Boolean) extends MetadataManagerBase(isPe metadataLocalStore.remove(MetadataTableKey(tableName, infoDate)) } + + override def close(): Unit = {} } From e21c22ed8ec1a88c64f3a0bc66793c674d110165 Mon Sep 17 00:00:00 2001 From: Ruslan Iushchenko Date: Wed, 8 Apr 2026 14:17:59 +0200 Subject: [PATCH 10/13] #181 Remove code duplication when waiting for DynamoDB tables to be active. --- .../core/bookkeeper/BookkeeperDynamoDb.scala | 84 ++++++++++--------- .../bookkeeper/OffsetManagerDynamoDb.scala | 38 +-------- .../pramen/core/journal/JournalDynamoDB.scala | 38 +-------- .../core/lock/TokenLockFactoryDynamoDb.scala | 42 +--------- .../metadata/MetadataManagerDynamoDb.scala | 38 +-------- 5 files changed, 51 insertions(+), 189 deletions(-) diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala index f26e7d40..f1bf3c9b 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala @@ -169,7 +169,7 @@ class BookkeeperDynamoDb private ( dynamoDbClient.createTable(createTableRequest) // Wait for table to become active - waitForTableActive(tableName) + waitForTableActive(tableName, dynamoDbClient) } /** @@ -206,46 +206,7 @@ class BookkeeperDynamoDb private ( dynamoDbClient.createTable(createTableRequest) // Wait for table to become active - waitForTableActive(tableName) - } - - /** - * Waits for a table to become active after creation. - * - * @param tableName The name of the table to wait for - * @param maxWaitSeconds Maximum time to wait in seconds (default: 60) - */ - private def waitForTableActive(tableName: String, maxWaitSeconds: Int = 60): Unit = { - val startTime = System.currentTimeMillis() - val maxWaitMs = maxWaitSeconds * 1000L - - var tableActive = false - while (!tableActive && (System.currentTimeMillis() - startTime) < maxWaitMs) { - try { - val describeRequest = DescribeTableRequest.builder() - .tableName(tableName) - .build() - - val response = dynamoDbClient.describeTable(describeRequest) - val status = response.table().tableStatus() - - if (status == TableStatus.ACTIVE) { - tableActive = true - log.debug(s"Table $tableName is now ACTIVE") - } else { - log.debug(s"Table $tableName status: $status, waiting...") - Thread.sleep(2000) // Wait 2 seconds before checking again - } - } catch { - case NonFatal(ex) => - log.warn(s"Error checking table status for $tableName", ex) - Thread.sleep(2000) - } - } - - if (!tableActive) { - throw new RuntimeException(s"Table $tableName did not become active within $maxWaitSeconds seconds") - } + waitForTableActive(tableName, dynamoDbClient) } override def getLatestProcessedDateFromStorage(table: String, until: Option[LocalDate]): Option[LocalDate] = { @@ -846,6 +807,8 @@ object BookkeeperDynamoDb { val MODEL_VERSION = 1 + private val log = LoggerFactory.getLogger(this.getClass) + /** * Builder for creating BookkeeperDynamoDb instances. * Provides a fluent API for configuring DynamoDB bookkeeper. @@ -1001,4 +964,43 @@ object BookkeeperDynamoDb { case _ => tableName } } + + /** + * Waits for a table to become active after creation. + * + * @param tableName The name of the table to wait for + * @param maxWaitSeconds Maximum time to wait in seconds (default: 60) + */ + def waitForTableActive(tableName: String, dynamoDbClient: DynamoDbClient, maxWaitSeconds: Int = 60): Unit = { + val startTime = System.currentTimeMillis() + val maxWaitMs = maxWaitSeconds * 1000L + + var tableActive = false + while (!tableActive && (System.currentTimeMillis() - startTime) < maxWaitMs) { + try { + val describeRequest = DescribeTableRequest.builder() + .tableName(tableName) + .build() + + val response = dynamoDbClient.describeTable(describeRequest) + val status = response.table().tableStatus() + + if (status == TableStatus.ACTIVE) { + tableActive = true + log.info(s"Table $tableName is now ACTIVE") + } else { + log.info(s"Table $tableName status: $status, waiting...") + Thread.sleep(2000) // Wait 2 seconds before checking again + } + } catch { + case NonFatal(ex) => + log.warn(s"Error checking table status for $tableName", ex) + Thread.sleep(2000) + } + } + + if (!tableActive) { + throw new RuntimeException(s"Table $tableName did not become active within $maxWaitSeconds seconds") + } + } } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerDynamoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerDynamoDb.scala index 7ab2b2dd..dd5e7281 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerDynamoDb.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerDynamoDb.scala @@ -23,6 +23,7 @@ import software.amazon.awssdk.services.dynamodb.DynamoDbClient import software.amazon.awssdk.services.dynamodb.model._ import za.co.absa.pramen.api.offset.DataOffset.UncommittedOffset import za.co.absa.pramen.api.offset.{DataOffset, OffsetType, OffsetValue} +import za.co.absa.pramen.core.bookkeeper.BookkeeperDynamoDb.waitForTableActive import za.co.absa.pramen.core.bookkeeper.model._ import java.net.URI @@ -469,45 +470,10 @@ class OffsetManagerDynamoDb( .build() dynamoDbClient.createTable(createRequest) - waitForTableActive(offsetTableFullName) + waitForTableActive(offsetTableFullName, dynamoDbClient) log.info(s"Offset table '$offsetTableFullName' created successfully") } - /** - * Waits for a table to become ACTIVE. - */ - private def waitForTableActive(tableName: String, maxAttempts: Int = 30): Unit = { - var attempts = 0 - var isActive = false - - while (attempts < maxAttempts && !isActive) { - try { - val describeRequest = DescribeTableRequest.builder() - .tableName(tableName) - .build() - - val response = dynamoDbClient.describeTable(describeRequest) - val status = response.table().tableStatus() - - if (status == TableStatus.ACTIVE) { - isActive = true - } else { - Thread.sleep(1000) - attempts += 1 - } - } catch { - case NonFatal(ex) => - log.warn(s"Error waiting for table '$tableName' to become active", ex) - Thread.sleep(1000) - attempts += 1 - } - } - - if (!isActive) { - throw new RuntimeException(s"Table '$tableName' did not become ACTIVE after $maxAttempts attempts") - } - } - /** * Closes the DynamoDB client. */ diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/JournalDynamoDB.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/JournalDynamoDB.scala index b271ab81..94b70c6c 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/JournalDynamoDB.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/JournalDynamoDB.scala @@ -23,6 +23,7 @@ import software.amazon.awssdk.services.dynamodb.DynamoDbClient import software.amazon.awssdk.services.dynamodb.model._ import za.co.absa.pramen.core.app.config.InfoDateConfig import za.co.absa.pramen.core.bookkeeper.BookkeeperDynamoDb +import za.co.absa.pramen.core.bookkeeper.BookkeeperDynamoDb.waitForTableActive import za.co.absa.pramen.core.journal.model.TaskCompleted import java.net.URI @@ -227,45 +228,10 @@ class JournalDynamoDB private ( .build() dynamoDbClient.createTable(createRequest) - waitForTableActive(journalTableFullName) + waitForTableActive(journalTableFullName, dynamoDbClient) log.info(s"Journal table '$journalTableFullName' created successfully") } - /** - * Waits for a table to become ACTIVE. - */ - private def waitForTableActive(tableName: String, maxAttempts: Int = 30): Unit = { - var attempts = 0 - var isActive = false - - while (attempts < maxAttempts && !isActive) { - try { - val describeRequest = DescribeTableRequest.builder() - .tableName(tableName) - .build() - - val response = dynamoDbClient.describeTable(describeRequest) - val status = response.table().tableStatus() - - if (status == TableStatus.ACTIVE) { - isActive = true - } else { - Thread.sleep(1000) - attempts += 1 - } - } catch { - case NonFatal(ex) => - log.warn(s"Error waiting for table '$tableName' to become active", ex) - Thread.sleep(1000) - attempts += 1 - } - } - - if (!isActive) { - throw new RuntimeException(s"Table '$tableName' did not become ACTIVE after $maxAttempts attempts") - } - } - /** * Closes the DynamoDB client. */ diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/lock/TokenLockFactoryDynamoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/lock/TokenLockFactoryDynamoDb.scala index 1ca2564f..513aa74f 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/lock/TokenLockFactoryDynamoDb.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/lock/TokenLockFactoryDynamoDb.scala @@ -23,6 +23,7 @@ import software.amazon.awssdk.services.dynamodb.DynamoDbClient import software.amazon.awssdk.services.dynamodb.model._ import za.co.absa.pramen.api.lock.{TokenLock, TokenLockFactory} import za.co.absa.pramen.core.bookkeeper.BookkeeperDynamoDb +import za.co.absa.pramen.core.bookkeeper.BookkeeperDynamoDb.waitForTableActive import java.net.URI import scala.util.control.NonFatal @@ -144,46 +145,7 @@ class TokenLockFactoryDynamoDb( dynamoDbClient.createTable(createTableRequest) // Wait for table to become active - waitForTableActive(tableName) - } - - /** - * Waits for a table to become active after creation. - * - * @param tableName The name of the table to wait for - * @param maxWaitSeconds Maximum time to wait in seconds (default: 60) - */ - private def waitForTableActive(tableName: String, maxWaitSeconds: Int = 60): Unit = { - val startTime = System.currentTimeMillis() - val maxWaitMs = maxWaitSeconds * 1000L - - var tableActive = false - while (!tableActive && (System.currentTimeMillis() - startTime) < maxWaitMs) { - try { - val describeRequest = DescribeTableRequest.builder() - .tableName(tableName) - .build() - - val response = dynamoDbClient.describeTable(describeRequest) - val status = response.table().tableStatus() - - if (status == TableStatus.ACTIVE) { - tableActive = true - log.debug(s"Table $tableName is now ACTIVE") - } else { - log.debug(s"Table $tableName status: $status, waiting...") - Thread.sleep(2000) // Wait 2 seconds before checking again - } - } catch { - case NonFatal(ex) => - log.warn(s"Error checking table status for $tableName", ex) - Thread.sleep(2000) - } - } - - if (!tableActive) { - throw new RuntimeException(s"Table $tableName did not become active within $maxWaitSeconds seconds") - } + waitForTableActive(tableName, dynamoDbClient) } } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/metadata/MetadataManagerDynamoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/metadata/MetadataManagerDynamoDb.scala index 16d8143d..1207cb3c 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/metadata/MetadataManagerDynamoDb.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/metadata/MetadataManagerDynamoDb.scala @@ -23,6 +23,7 @@ import software.amazon.awssdk.services.dynamodb.DynamoDbClient import software.amazon.awssdk.services.dynamodb.model._ import za.co.absa.pramen.api.MetadataValue import za.co.absa.pramen.core.bookkeeper.BookkeeperDynamoDb +import za.co.absa.pramen.core.bookkeeper.BookkeeperDynamoDb.waitForTableActive import java.net.URI import java.time.{Instant, LocalDate} @@ -225,45 +226,10 @@ class MetadataManagerDynamoDb private ( .build() dynamoDbClient.createTable(createRequest) - waitForTableActive(metadataTableFullName) + waitForTableActive(metadataTableFullName, dynamoDbClient) log.info(s"Metadata table '$metadataTableFullName' created successfully") } - /** - * Waits for a table to become ACTIVE. - */ - private def waitForTableActive(tableName: String, maxAttempts: Int = 30): Unit = { - var attempts = 0 - var isActive = false - - while (attempts < maxAttempts && !isActive) { - try { - val describeRequest = DescribeTableRequest.builder() - .tableName(tableName) - .build() - - val response = dynamoDbClient.describeTable(describeRequest) - val status = response.table().tableStatus() - - if (status == TableStatus.ACTIVE) { - isActive = true - } else { - Thread.sleep(1000) - attempts += 1 - } - } catch { - case NonFatal(ex) => - log.warn(s"Error waiting for table '$tableName' to become active", ex) - Thread.sleep(1000) - attempts += 1 - } - } - - if (!isActive) { - throw new RuntimeException(s"Table '$tableName' did not become ACTIVE after $maxAttempts attempts") - } - } - /** * Closes the DynamoDB client. */ From 5900303a491152ca17a2939e5ef40dfe6d47be08 Mon Sep 17 00:00:00 2001 From: Ruslan Iushchenko Date: Fri, 10 Apr 2026 12:51:09 +0200 Subject: [PATCH 11/13] #181 Enhance error handling in DynamoDB clients by ensuring proper resource closure on exceptions and refactor offset deletion logic for improved pagination. --- .../core/bookkeeper/BookkeeperDynamoDb.scala | 20 ++- .../bookkeeper/OffsetManagerDynamoDb.scala | 152 ++++++++++-------- .../pramen/core/journal/JournalDynamoDB.scala | 16 +- .../core/lock/TokenLockFactoryDynamoDb.scala | 18 ++- .../metadata/MetadataManagerDynamoDb.scala | 16 +- 5 files changed, 129 insertions(+), 93 deletions(-) diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala index f1bf3c9b..97089bfb 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala @@ -474,7 +474,7 @@ class BookkeeperDynamoDb private ( results += s"Deleted $schemaCount schema records for table '$tableName'" // Delete offsets - val offsetResults = OffsetManagerDynamoDb.deleteAllOffsets(tableName, dynamoDbClient) + val offsetResults = getOffsetManager.asInstanceOf[OffsetManagerDynamoDb].deleteAllOffsets(tableName, dynamoDbClient) results += s"Deleted $offsetResults offset records for table '$tableName'" log.info(s"Successfully deleted all records for table '$tableName'") @@ -931,12 +931,18 @@ object BookkeeperDynamoDb { val client = clientBuilder.build() - new BookkeeperDynamoDb( - dynamoDbClient = client, - batchId = actualBatchId, - tableArn = tableArn, - tablePrefix = tablePrefix - ) + try { + new BookkeeperDynamoDb( + dynamoDbClient = client, + batchId = actualBatchId, + tableArn = tableArn, + tablePrefix = tablePrefix + ) + } catch { + case NonFatal(ex) => + client.close() + throw ex + } } } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerDynamoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerDynamoDb.scala index dd5e7281..cbd31209 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerDynamoDb.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerDynamoDb.scala @@ -312,21 +312,29 @@ class OffsetManagerDynamoDb( */ private[core] def getOffsetRecords(table: String, infoDate: LocalDate): Array[OffsetRecord] = { try { - val queryRequest = QueryRequest.builder() - .tableName(offsetTableFullName) - .keyConditionExpression(s"${ATTR_PRAMEN_TABLE_NAME} = :table_name") - .filterExpression(s"${ATTR_INFO_DATE} = :info_date") - .expressionAttributeValues(Map( - ":table_name" -> AttributeValue.builder().s(table).build(), - ":info_date" -> AttributeValue.builder().s(infoDate.toString).build() - ).asJava) - .build() + var allItems = Seq.empty[java.util.Map[String, AttributeValue]] + var lastEvaluatedKey: java.util.Map[String, AttributeValue] = null + val infoDatePrefix = s"${infoDate.toString}#" - val result = dynamoDbClient.query(queryRequest) + do { + val queryRequestBuilder = QueryRequest.builder() + .tableName(offsetTableFullName) + .keyConditionExpression(s"$ATTR_PRAMEN_TABLE_NAME = :table_name AND begins_with($ATTR_COMPOSITE_KEY, :prefix)") + .expressionAttributeValues(Map( + ":table_name" -> AttributeValue.builder().s(table).build(), + ":prefix" -> AttributeValue.builder().s(infoDatePrefix).build() + ).asJava) + + if (lastEvaluatedKey != null) { + queryRequestBuilder.exclusiveStartKey(lastEvaluatedKey) + } + + val result = dynamoDbClient.query(queryRequestBuilder.build()) + allItems = allItems ++ result.items().asScala + lastEvaluatedKey = result.lastEvaluatedKey() + } while (lastEvaluatedKey != null && !lastEvaluatedKey.isEmpty) - result.items().asScala - .map(itemToOffsetRecord) - .toArray + allItems.map(itemToOffsetRecord).toArray } catch { case NonFatal(ex) => throw new RuntimeException(s"Unable to read offset records from the offset table '$offsetTableFullName'.", ex) @@ -487,6 +495,54 @@ class OffsetManagerDynamoDb( log.warn("Error closing DynamoDB client", ex) } } + + /** Deletes all offsets for a given table. */ + private[core] def deleteAllOffsets(tableName: String, dynamoDbClient: DynamoDbClient): Int = { + val log = LoggerFactory.getLogger(this.getClass) + try { + var allItems = Seq.empty[java.util.Map[String, AttributeValue]] + var lastEvaluatedKey: java.util.Map[String, AttributeValue] = null + + // Query all offsets for the table with pagination + do { + val queryRequestBuilder = QueryRequest.builder() + .tableName(offsetTableFullName) + .keyConditionExpression(s"$ATTR_PRAMEN_TABLE_NAME = :table_name") + .expressionAttributeValues(Map( + ":table_name" -> AttributeValue.builder().s(tableName).build() + ).asJava) + + if (lastEvaluatedKey != null) { + queryRequestBuilder.exclusiveStartKey(lastEvaluatedKey) + } + + val result = dynamoDbClient.query(queryRequestBuilder.build()) + allItems = allItems ++ result.items().asScala + lastEvaluatedKey = result.lastEvaluatedKey() + } while (lastEvaluatedKey != null && !lastEvaluatedKey.isEmpty) + + // Delete each item + allItems.foreach { item => + val deleteRequest = DeleteItemRequest.builder() + .tableName(offsetTableFullName) + .key(Map( + ATTR_PRAMEN_TABLE_NAME -> item.get(ATTR_PRAMEN_TABLE_NAME), + ATTR_COMPOSITE_KEY -> item.get(ATTR_COMPOSITE_KEY) + ).asJava) + .build() + + dynamoDbClient.deleteItem(deleteRequest) + } + + val deletedCount = allItems.size + log.info(s"Deleted $deletedCount offset records for table '$tableName'") + deletedCount + } catch { + case NonFatal(ex) => + log.error(s"Error deleting offsets for table '$tableName' from '$offsetTableFullName'", ex) + throw new RuntimeException(s"Unable to delete offsets for table '$tableName' from '$offsetTableFullName'", ex) + } + } } object OffsetManagerDynamoDb { @@ -560,66 +616,22 @@ object OffsetManagerDynamoDb { val client = clientBuilder.build() - new OffsetManagerDynamoDb( - dynamoDbClient = client, - batchId = batchId, - tableArn = tableArn, - tablePrefix = tablePrefix, - closesClient = true - ) + try { + new OffsetManagerDynamoDb( + dynamoDbClient = client, + batchId = batchId, + tableArn = tableArn, + tablePrefix = tablePrefix, + closesClient = true + ) + } catch { + case NonFatal(ex) => + client.close() + throw ex + } } } def builder: OffsetManagerDynamoDbBuilder = new OffsetManagerDynamoDbBuilder - /** Deletes all offsets for a given table. */ - def deleteAllOffsets(tableName: String, dynamoDbClient: DynamoDbClient): Int = { - val log = LoggerFactory.getLogger(this.getClass) - val offsetTableBaseName = s"${DEFAULT_TABLE_PREFIX}_${DEFAULT_OFFSET_TABLE}" - val offsetTableFullName = BookkeeperDynamoDb.getFullTableName(None, offsetTableBaseName) - - try { - var allItems = Seq.empty[java.util.Map[String, AttributeValue]] - var lastEvaluatedKey: java.util.Map[String, AttributeValue] = null - - // Query all offsets for the table with pagination - do { - val queryRequestBuilder = QueryRequest.builder() - .tableName(offsetTableFullName) - .keyConditionExpression(s"$ATTR_PRAMEN_TABLE_NAME = :table_name") - .expressionAttributeValues(Map( - ":table_name" -> AttributeValue.builder().s(tableName).build() - ).asJava) - - if (lastEvaluatedKey != null) { - queryRequestBuilder.exclusiveStartKey(lastEvaluatedKey) - } - - val result = dynamoDbClient.query(queryRequestBuilder.build()) - allItems = allItems ++ result.items().asScala - lastEvaluatedKey = result.lastEvaluatedKey() - } while (lastEvaluatedKey != null && !lastEvaluatedKey.isEmpty) - - // Delete each item - allItems.foreach { item => - val deleteRequest = DeleteItemRequest.builder() - .tableName(offsetTableFullName) - .key(Map( - ATTR_PRAMEN_TABLE_NAME -> item.get(ATTR_PRAMEN_TABLE_NAME), - ATTR_COMPOSITE_KEY -> item.get(ATTR_COMPOSITE_KEY) - ).asJava) - .build() - - dynamoDbClient.deleteItem(deleteRequest) - } - - val deletedCount = allItems.size - log.info(s"Deleted $deletedCount offset records for table '$tableName'") - deletedCount - } catch { - case NonFatal(ex) => - log.error(s"Error deleting offsets for table '$tableName' from '$offsetTableFullName'", ex) - throw new RuntimeException(s"Unable to delete offsets for table '$tableName' from '$offsetTableFullName'", ex) - } - } } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/JournalDynamoDB.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/JournalDynamoDB.scala index 94b70c6c..a9453d3e 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/JournalDynamoDB.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/JournalDynamoDB.scala @@ -331,11 +331,17 @@ object JournalDynamoDB { val client = clientBuilder.build() - new JournalDynamoDB( - dynamoDbClient = client, - tableArn = tableArn, - tablePrefix = tablePrefix - ) + try { + new JournalDynamoDB( + dynamoDbClient = client, + tableArn = tableArn, + tablePrefix = tablePrefix + ) + } catch { + case NonFatal(ex) => + client.close() + throw ex + } } } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/lock/TokenLockFactoryDynamoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/lock/TokenLockFactoryDynamoDb.scala index 513aa74f..75a669c6 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/lock/TokenLockFactoryDynamoDb.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/lock/TokenLockFactoryDynamoDb.scala @@ -38,7 +38,7 @@ import scala.util.control.NonFatal * @param tableArn Optional ARN prefix for the locks table * @param tablePrefix Prefix for the locks table name (default: "pramen") */ -class TokenLockFactoryDynamoDb( +class TokenLockFactoryDynamoDb private( dynamoDbClient: DynamoDbClient, tableArn: Option[String] = None, tablePrefix: String = "pramen" @@ -257,11 +257,17 @@ object TokenLockFactoryDynamoDb { val client = clientBuilder.build() - new TokenLockFactoryDynamoDb( - dynamoDbClient = client, - tableArn = tableArn, - tablePrefix = tablePrefix - ) + try { + new TokenLockFactoryDynamoDb( + dynamoDbClient = client, + tableArn = tableArn, + tablePrefix = tablePrefix + ) + } catch { + case NonFatal(ex) => + client.close() + throw ex + } } } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/metadata/MetadataManagerDynamoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/metadata/MetadataManagerDynamoDb.scala index 1207cb3c..31b8a893 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/metadata/MetadataManagerDynamoDb.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/metadata/MetadataManagerDynamoDb.scala @@ -310,11 +310,17 @@ object MetadataManagerDynamoDb { val client = clientBuilder.build() - new MetadataManagerDynamoDb( - dynamoDbClient = client, - tableArn = tableArn, - tablePrefix = tablePrefix - ) + try { + new MetadataManagerDynamoDb( + dynamoDbClient = client, + tableArn = tableArn, + tablePrefix = tablePrefix + ) + } catch { + case NonFatal(ex) => + client.close() + throw ex + } } } From 1cfbb4fe932d357865cb773c4754b3a1745db9e6 Mon Sep 17 00:00:00 2001 From: Ruslan Yushchenko Date: Fri, 10 Apr 2026 13:25:09 +0200 Subject: [PATCH 12/13] Update pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerDynamoDb.scala Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- .../bookkeeper/OffsetManagerDynamoDb.scala | 37 +++++++++++-------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerDynamoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerDynamoDb.scala index cbd31209..0c5f872d 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerDynamoDb.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerDynamoDb.scala @@ -346,26 +346,31 @@ class OffsetManagerDynamoDb( */ private[core] def getMaximumInfoDate(table: String): Option[LocalDate] = { try { - val queryRequest = QueryRequest.builder() - .tableName(offsetTableFullName) - .keyConditionExpression(s"${ATTR_PRAMEN_TABLE_NAME} = :table_name") - .expressionAttributeValues(Map( - ":table_name" -> AttributeValue.builder().s(table).build() - ).asJava) - .projectionExpression(ATTR_INFO_DATE) - .build() + var allDates = Seq.empty[LocalDate] + var lastEvaluatedKey: java.util.Map[String, AttributeValue] = null - val result = dynamoDbClient.query(queryRequest) + do { + val queryRequestBuilder = QueryRequest.builder() + .tableName(offsetTableFullName) + .keyConditionExpression(s"${ATTR_PRAMEN_TABLE_NAME} = :table_name") + .expressionAttributeValues(Map( + ":table_name" -> AttributeValue.builder().s(table).build() + ).asJava) + .projectionExpression(ATTR_INFO_DATE) - if (result.items().isEmpty) { + if (lastEvaluatedKey != null) { + queryRequestBuilder.exclusiveStartKey(lastEvaluatedKey) + } + + val result = dynamoDbClient.query(queryRequestBuilder.build()) + allDates = allDates ++ result.items().asScala.map(item => LocalDate.parse(item.get(ATTR_INFO_DATE).s())) + lastEvaluatedKey = result.lastEvaluatedKey() + } while (lastEvaluatedKey != null && !lastEvaluatedKey.isEmpty) + + if (allDates.isEmpty) { None } else { - // Use maxBy with compareTo to avoid needing implicit Ordering - val maxInfoDate = result.items().asScala - .map(item => LocalDate.parse(item.get(ATTR_INFO_DATE).s())) - .maxBy(_.toEpochDay) - - Some(maxInfoDate) + Some(allDates.maxBy(_.toEpochDay)) } } catch { case NonFatal(ex) => From dbf924b8687c38bb5e8a29decd39f54c93071ab6 Mon Sep 17 00:00:00 2001 From: Ruslan Iushchenko Date: Fri, 10 Apr 2026 14:01:15 +0200 Subject: [PATCH 13/13] #181 Fix the offset deletion causing ClassCastExcaption. --- .../co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala index 97089bfb..734a9ff2 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala @@ -73,6 +73,8 @@ class BookkeeperDynamoDb private ( new OffsetManagerDynamoDb(dynamoDbClient, batchId, tableArn, tablePrefix, closesClient = false) ) + private val offsetManagementDynamoDB = new OffsetManagerDynamoDb(dynamoDbClient, batchId, tableArn, tablePrefix, closesClient = false) + // Initialize tables on construction init() @@ -474,7 +476,7 @@ class BookkeeperDynamoDb private ( results += s"Deleted $schemaCount schema records for table '$tableName'" // Delete offsets - val offsetResults = getOffsetManager.asInstanceOf[OffsetManagerDynamoDb].deleteAllOffsets(tableName, dynamoDbClient) + val offsetResults = offsetManagementDynamoDB.deleteAllOffsets(tableName, dynamoDbClient) results += s"Deleted $offsetResults offset records for table '$tableName'" log.info(s"Successfully deleted all records for table '$tableName'")