diff --git a/README.md b/README.md index 9564c1af4..584993dfe 100644 --- a/README.md +++ b/README.md @@ -2569,6 +2569,34 @@ pramen { } ``` +### (experimental) DynamoDB database +Here is how you can use a DynamoDB database for storing bookkeeping information: + +```hocon +pramen { + bookkeeping.enabled = "true" + + bookkeeping.dynamodb { + region = "af-south-1" + table.prefix = "pramen_uat" + } +} +``` + +DynamoDB tables are automatically created if they don't exist with default options. Use the prefix to create multiple +Pramen bookeeping environments per AWS account. + +Note that the Pramen project that uses DynamoDB for bookeeping needs to add DynamoDB as a dependency if it is not provided +by the Spark cluster (e.g. EMR). + +```xml + + software.amazon.awssdk + dynamodb + ${aws.sdk.version} + +``` + ### Hadoop (CSV+JSON) This is less recommended way, and is quite slow. But the advantage is that you don't need a database. diff --git a/pramen/api/src/main/scala/za/co/absa/pramen/api/MetadataManager.scala b/pramen/api/src/main/scala/za/co/absa/pramen/api/MetadataManager.scala index 0bd91ed93..333a8c782 100644 --- a/pramen/api/src/main/scala/za/co/absa/pramen/api/MetadataManager.scala +++ b/pramen/api/src/main/scala/za/co/absa/pramen/api/MetadataManager.scala @@ -18,7 +18,7 @@ package za.co.absa.pramen.api import java.time.LocalDate -trait MetadataManager { +trait MetadataManager extends AutoCloseable { /** * Get metadata value for a given table, date and key. 
* diff --git a/pramen/api/src/main/scala/za/co/absa/pramen/api/lock/TokenLockFactory.scala b/pramen/api/src/main/scala/za/co/absa/pramen/api/lock/TokenLockFactory.scala index d0aaeaaf0..11ac6343c 100644 --- a/pramen/api/src/main/scala/za/co/absa/pramen/api/lock/TokenLockFactory.scala +++ b/pramen/api/src/main/scala/za/co/absa/pramen/api/lock/TokenLockFactory.scala @@ -41,6 +41,8 @@ */ package za.co.absa.pramen.api.lock -trait TokenLockFactory { +trait TokenLockFactory extends AutoCloseable { def getLock(token: String): TokenLock + + override def close(): Unit = {} } diff --git a/pramen/core/pom.xml b/pramen/core/pom.xml index fb500d652..54011be0e 100644 --- a/pramen/core/pom.xml +++ b/pramen/core/pom.xml @@ -144,6 +144,13 @@ channel_scala_${scala.compat.version} + + + software.amazon.awssdk + dynamodb + provided + + org.mockito diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/app/AppContext.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/app/AppContext.scala index 3fca0e018..47454157b 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/app/AppContext.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/app/AppContext.scala @@ -21,7 +21,7 @@ import za.co.absa.pramen.core.bookkeeper.Bookkeeper import za.co.absa.pramen.core.journal.Journal import za.co.absa.pramen.core.metastore.Metastore -trait AppContext { +trait AppContext extends AutoCloseable { val appConfig: AppConfig def bookkeeper: Bookkeeper @@ -31,6 +31,4 @@ trait AppContext { def journal: Journal def metastore: Metastore - - def close(): Unit } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/app/config/BookkeeperConfig.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/app/config/BookkeeperConfig.scala index cf0f71080..e5434316a 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/app/config/BookkeeperConfig.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/app/config/BookkeeperConfig.scala @@ -30,7 +30,10 @@ case class 
BookkeeperConfig( bookkeepingJdbcConfig: Option[JdbcConfig], deltaDatabase: Option[String], deltaTablePrefix: Option[String], - temporaryDirectory: Option[String] + temporaryDirectory: Option[String], + dynamoDbRegion: Option[String], + dynamoDbTableArn: Option[String], + dynamoDbTablePrefix: Option[String] ) object BookkeeperConfig { @@ -44,6 +47,9 @@ object BookkeeperConfig { val BOOKKEEPING_DB_NAME = "pramen.bookkeeping.mongodb.database" val BOOKKEEPING_DELTA_DB_NAME = "pramen.bookkeeping.delta.database" val BOOKKEEPING_DELTA_TABLE_PREFIX = "pramen.bookkeeping.delta.table.prefix" + val BOOKKEEPING_DYNAMODB_REGION = "pramen.bookkeeping.dynamodb.region" + val BOOKKEEPING_DYNAMODB_TABLE_ARN = "pramen.bookkeeping.dynamodb.table.arn" + val BOOKKEEPING_DYNAMODB_TABLE_PREFIX = "pramen.bookkeeping.dynamodb.table.prefix" val BOOKKEEPING_TEMPORARY_DIRECTORY_KEY = "pramen.temporary.directory" def fromConfig(conf: Config, allowLocalBookkepingStorage: Boolean = false): BookkeeperConfig = { @@ -56,6 +62,9 @@ object BookkeeperConfig { val temporaryDirectory = ConfigUtils.getOptionString(conf, BOOKKEEPING_TEMPORARY_DIRECTORY_KEY) val deltaDatabase = ConfigUtils.getOptionString(conf, BOOKKEEPING_DELTA_DB_NAME) val deltaTablePrefix = ConfigUtils.getOptionString(conf, BOOKKEEPING_DELTA_TABLE_PREFIX) + val dynamoDbRegion = ConfigUtils.getOptionString(conf, BOOKKEEPING_DYNAMODB_REGION) + val dynamoDbTableArn = ConfigUtils.getOptionString(conf, BOOKKEEPING_DYNAMODB_TABLE_ARN) + val dynamoDbTablePrefix = ConfigUtils.getOptionString(conf, BOOKKEEPING_DYNAMODB_TABLE_PREFIX) if (bookkeepingEnabled && bookkeepingJdbcConfig.isEmpty && bookkeepingHadoopFormat == HadoopFormat.Delta) { if (bookkeepingLocation.isEmpty && deltaTablePrefix.isEmpty) { @@ -63,7 +72,7 @@ object BookkeeperConfig { s"Preferably $BOOKKEEPING_DELTA_DB_NAME should be defined as well for managed Delta Lake tables.") } } else { - if (bookkeepingEnabled && bookkeepingConnectionString.isEmpty && bookkeepingLocation.isEmpty 
&& bookkeepingJdbcConfig.isEmpty) { + if (bookkeepingEnabled && bookkeepingConnectionString.isEmpty && bookkeepingLocation.isEmpty && bookkeepingJdbcConfig.isEmpty && dynamoDbRegion.isEmpty) { if (allowLocalBookkepingStorage) { log.warn("Bookkeeping configuration is missing. Using the default SQLite database 'pramen.sqlite'") return BookkeeperConfig( @@ -78,10 +87,13 @@ object BookkeeperConfig { )), None, None, - temporaryDirectory + temporaryDirectory, + None, + None, + None ) } else { - throw new RuntimeException(s"One of the following should be defined: $BOOKKEEPING_PARENT.jdbc.url, $BOOKKEEPING_CONNECTION_STRING or $BOOKKEEPING_LOCATION" + + throw new RuntimeException(s"One of the following should be defined: $BOOKKEEPING_PARENT.jdbc.url, $BOOKKEEPING_CONNECTION_STRING, $BOOKKEEPING_DYNAMODB_REGION, or $BOOKKEEPING_LOCATION" + s" when bookkeeping is enabled. You can disable bookkeeping by setting $BOOKKEEPING_ENABLED = false.") } } @@ -89,6 +101,10 @@ object BookkeeperConfig { if (bookkeepingConnectionString.isDefined && bookkeepingDbName.isEmpty) { throw new RuntimeException(s"Database name is not defined. Please, define $BOOKKEEPING_DB_NAME.") } + + if (dynamoDbRegion.isDefined && dynamoDbTablePrefix.isEmpty) { + log.warn(s"DynamoDB table prefix is not defined. Using default prefix 'pramen'. 
You can define it with $BOOKKEEPING_DYNAMODB_TABLE_PREFIX.") + } } BookkeeperConfig( @@ -100,7 +116,10 @@ object BookkeeperConfig { bookkeepingJdbcConfig, deltaDatabase, deltaTablePrefix, - temporaryDirectory + temporaryDirectory, + dynamoDbRegion, + dynamoDbTableArn, + dynamoDbTablePrefix ) } } \ No newline at end of file diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/Bookkeeper.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/Bookkeeper.scala index 62eb8d886..8f0602754 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/Bookkeeper.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/Bookkeeper.scala @@ -26,7 +26,7 @@ import za.co.absa.pramen.core.app.config.{BookkeeperConfig, HadoopFormat, Runtim import za.co.absa.pramen.core.bookkeeper.model.DataAvailability import za.co.absa.pramen.core.journal._ import za.co.absa.pramen.core.lock._ -import za.co.absa.pramen.core.metadata.{MetadataManagerJdbc, MetadataManagerNull} +import za.co.absa.pramen.core.metadata.{MetadataManagerDynamoDb, MetadataManagerJdbc, MetadataManagerNull} import za.co.absa.pramen.core.model.DataChunk import za.co.absa.pramen.core.mongo.MongoDbConnection import za.co.absa.pramen.core.rdb.PramenDb @@ -90,6 +90,7 @@ object Bookkeeper { } val hasBookkeepingJdbc = bookkeepingConfig.bookkeepingJdbcConfig.exists(_.primaryUrl.isDefined) + val hasBookkeepingDynamoDb = bookkeepingConfig.dynamoDbRegion.isDefined val dbOpt = if (hasBookkeepingJdbc) { val jdbcConfig = bookkeepingConfig.bookkeepingJdbcConfig.get @@ -101,6 +102,14 @@ object Bookkeeper { if (hasBookkeepingJdbc) { log.info(s"Using RDB for lock management.") new TokenLockFactoryJdbc(dbOpt.get.slickDb, dbOpt.get.slickProfile) + } else if (hasBookkeepingDynamoDb) { + val tablePrefix = bookkeepingConfig.dynamoDbTablePrefix.getOrElse(BookkeeperDynamoDb.DEFAULT_TABLE_PREFIX) + log.info(s"Using DynamoDB for lock management in region 
'${bookkeepingConfig.dynamoDbRegion.get}' with table prefix '$tablePrefix'") + TokenLockFactoryDynamoDb.builder + .withRegion(bookkeepingConfig.dynamoDbRegion.get) + .withTablePrefix(tablePrefix) + .withTableArn(bookkeepingConfig.dynamoDbTableArn) + .build() } else { mongoDbConnection match { case Some(connection) => @@ -129,6 +138,15 @@ object Bookkeeper { new BookkeeperNull() } else if (hasBookkeepingJdbc) { BookkeeperJdbc.fromPramenDb(dbOpt.get, batchId) + } else if (hasBookkeepingDynamoDb) { + val tablePrefix = bookkeepingConfig.dynamoDbTablePrefix.getOrElse(BookkeeperDynamoDb.DEFAULT_TABLE_PREFIX) + log.info(s"Using DynamoDB for bookkeeping in region '${bookkeepingConfig.dynamoDbRegion.get}' with table prefix '$tablePrefix'") + BookkeeperDynamoDb.builder + .withRegion(bookkeepingConfig.dynamoDbRegion.get) + .withBatchId(batchId) + .withTablePrefix(tablePrefix) + .withTableArn(bookkeepingConfig.dynamoDbTableArn) + .build() } else { mongoDbConnection match { case Some(connection) => @@ -161,6 +179,14 @@ object Bookkeeper { } else if (hasBookkeepingJdbc) { log.info(s"Using RDB to keep journal of executed jobs.") new JournalJdbc(dbOpt.get.slickDb, dbOpt.get.slickProfile) + } else if (hasBookkeepingDynamoDb) { + val tablePrefix = bookkeepingConfig.dynamoDbTablePrefix.getOrElse(JournalDynamoDB.DEFAULT_TABLE_PREFIX) + log.info(s"Using DynamoDB for journal in region '${bookkeepingConfig.dynamoDbRegion.get}' with table prefix '$tablePrefix'") + JournalDynamoDB.builder + .withRegion(bookkeepingConfig.dynamoDbRegion.get) + .withTablePrefix(tablePrefix) + .withTableArn(bookkeepingConfig.dynamoDbTableArn) + .build() } else { mongoDbConnection match { case Some(connection) => @@ -194,6 +220,14 @@ object Bookkeeper { } else if (hasBookkeepingJdbc) { log.info(s"Using RDB to keep custom metadata.") new MetadataManagerJdbc(dbOpt.get.slickDb, dbOpt.get.slickProfile) + } else if (hasBookkeepingDynamoDb) { + val tablePrefix = 
bookkeepingConfig.dynamoDbTablePrefix.getOrElse(MetadataManagerDynamoDb.DEFAULT_TABLE_PREFIX) + log.info(s"Using DynamoDB for metadata in region '${bookkeepingConfig.dynamoDbRegion.get}' with table prefix '$tablePrefix'") + MetadataManagerDynamoDb.builder + .withRegion(bookkeepingConfig.dynamoDbRegion.get) + .withTablePrefix(tablePrefix) + .withTableArn(bookkeepingConfig.dynamoDbTableArn) + .build() } else { log.info(s"The custom metadata management is not supported.") new MetadataManagerNull(isPersistenceEnabled = true) @@ -203,6 +237,9 @@ object Bookkeeper { override def close(): Unit = { mongoDbConnection.foreach(_.close()) dbOpt.foreach(_.close()) + tokenFactory.close() + journal.close() + metadataManager.close() } } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala new file mode 100644 index 000000000..734a9ff23 --- /dev/null +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/BookkeeperDynamoDb.scala @@ -0,0 +1,1014 @@ +/* + * Copyright 2022 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package za.co.absa.pramen.core.bookkeeper + +import org.apache.spark.sql.types.StructType +import org.slf4j.LoggerFactory +import software.amazon.awssdk.auth.credentials.AwsCredentialsProvider +import software.amazon.awssdk.regions.Region +import software.amazon.awssdk.services.dynamodb.DynamoDbClient +import software.amazon.awssdk.services.dynamodb.model._ +import za.co.absa.pramen.core.bookkeeper.model.DataAvailability +import za.co.absa.pramen.core.model.{DataChunk, TableSchema} +import za.co.absa.pramen.core.utils.{AlgorithmUtils, TimeUtils} + +import java.net.URI +import java.time.LocalDate +import scala.collection.JavaConverters._ +import scala.util.control.NonFatal + +/** + * DynamoDB-based implementation of the Bookkeeper. + * + * Table schema for bookkeeping: + * - Partition key: tableName (String) + * - Sort key: infoDateSortKey (String in "yyyy-MM-dd#jobFinishedMillis" format) + * The composite sort key allows multiple entries for the same table and date. + * + * Table schema for schemas: + * - Partition key: tableName (String) + * - Sort key: infoDate (String in yyyy-MM-dd format) + * + * @param dynamoDbClient The DynamoDB client to use for operations + * @param batchId The batch ID for this execution + * @param tableArn Optional ARN prefix for DynamoDB tables (e.g., "arn:aws:dynamodb:region:account-id:table/") + * @param tablePrefix Prefix for table names to allow multiple bookkeeping sets in the same account (default: "pramen") + */ +class BookkeeperDynamoDb private ( + dynamoDbClient: DynamoDbClient, + batchId: Long, + tableArn: Option[String] = None, + tablePrefix: String = BookkeeperDynamoDb.DEFAULT_TABLE_PREFIX +) extends BookkeeperBase(isBookkeepingEnabled = true, batchId) { + + import BookkeeperDynamoDb._ + + private val log = LoggerFactory.getLogger(this.getClass) + private val queryWarningTimeoutMs = 10000L + + // Construct table names with prefix + private val bookkeepingTableBaseName = s"${tablePrefix}_$DEFAULT_BOOKKEEPING_TABLE" + 
private val schemaTableBaseName = s"${tablePrefix}_$DEFAULT_SCHEMA_TABLE" + + // Full table names/ARNs + private val bookkeepingTableName = getFullTableName(tableArn, bookkeepingTableBaseName) + private val schemaTableName = getFullTableName(tableArn, schemaTableBaseName) + + // Offset management + private val offsetManagement = new OffsetManagerCached( + new OffsetManagerDynamoDb(dynamoDbClient, batchId, tableArn, tablePrefix, closesClient = false) + ) + + private val offsetManagementDynamoDB = new OffsetManagerDynamoDb(dynamoDbClient, batchId, tableArn, tablePrefix, closesClient = false) + + // Initialize tables on construction + init() + + override val bookkeepingEnabled: Boolean = true + + /** + * Initializes the DynamoDB tables for bookkeeping and schemas. + * Checks if tables exist and creates them if they don't. + */ + def init(): Unit = { + try { + log.info(s"Initializing DynamoDB bookkeeper with tables: bookkeeping='$bookkeepingTableName', schemas='$schemaTableName'") + + // Initialize bookkeeping table + if (!tableExists(bookkeepingTableName)) { + log.info(s"Creating DynamoDB bookkeeping table: $bookkeepingTableName") + createBookkeepingTable(bookkeepingTableName) + log.info(s"Successfully created bookkeeping table: $bookkeepingTableName") + } else { + log.info(s"DynamoDB bookkeeping table already exists: $bookkeepingTableName") + } + + // Initialize schema table + if (!tableExists(schemaTableName)) { + log.info(s"Creating DynamoDB schema table: $schemaTableName") + createSchemaTable(schemaTableName) + log.info(s"Successfully created schema table: $schemaTableName") + } else { + log.info(s"DynamoDB schema table already exists: $schemaTableName") + } + + log.info(s"DynamoDB bookkeeper initialization complete") + } catch { + case NonFatal(ex) => + log.error("Error initializing DynamoDB bookkeeper tables", ex) + throw new RuntimeException("Failed to initialize DynamoDB bookkeeper", ex) + } + } + + /** + * Checks if a DynamoDB table exists. 
+ * + * @param tableName The name of the table to check + * @return true if the table exists, false otherwise + */ + private def tableExists(tableName: String): Boolean = { + try { + val describeRequest = DescribeTableRequest.builder() + .tableName(tableName) + .build() + + dynamoDbClient.describeTable(describeRequest) + true + } catch { + case _: ResourceNotFoundException => false + case NonFatal(ex) => + log.warn(s"Error checking if table exists: $tableName", ex) + throw ex + } + } + + /** + * Creates the bookkeeping table with the appropriate schema. + * Uses a composite sort key (infoDate#jobFinished) to allow multiple entries per table and date. + * + * @param tableName The name of the table to create + */ + private def createBookkeepingTable(tableName: String): Unit = { + val createTableRequest = CreateTableRequest.builder() + .tableName(tableName) + .keySchema( + KeySchemaElement.builder() + .attributeName(ATTR_TABLE_NAME) + .keyType(KeyType.HASH) + .build(), + KeySchemaElement.builder() + .attributeName(ATTR_INFO_DATE_SORT_KEY) + .keyType(KeyType.RANGE) + .build() + ) + .attributeDefinitions( + AttributeDefinition.builder() + .attributeName(ATTR_TABLE_NAME) + .attributeType(ScalarAttributeType.S) + .build(), + AttributeDefinition.builder() + .attributeName(ATTR_INFO_DATE_SORT_KEY) + .attributeType(ScalarAttributeType.S) + .build() + ) + .billingMode(BillingMode.PAY_PER_REQUEST) // On-demand billing + .build() + + dynamoDbClient.createTable(createTableRequest) + + // Wait for table to become active + waitForTableActive(tableName, dynamoDbClient) + } + + /** + * Creates the schema table with the appropriate schema. 
+ * + * @param tableName The name of the table to create + */ + private def createSchemaTable(tableName: String): Unit = { + val createTableRequest = CreateTableRequest.builder() + .tableName(tableName) + .keySchema( + KeySchemaElement.builder() + .attributeName(ATTR_TABLE_NAME) + .keyType(KeyType.HASH) + .build(), + KeySchemaElement.builder() + .attributeName(ATTR_INFO_DATE) + .keyType(KeyType.RANGE) + .build() + ) + .attributeDefinitions( + AttributeDefinition.builder() + .attributeName(ATTR_TABLE_NAME) + .attributeType(ScalarAttributeType.S) + .build(), + AttributeDefinition.builder() + .attributeName(ATTR_INFO_DATE) + .attributeType(ScalarAttributeType.S) + .build() + ) + .billingMode(BillingMode.PAY_PER_REQUEST) // On-demand billing + .build() + + dynamoDbClient.createTable(createTableRequest) + + // Wait for table to become active + waitForTableActive(tableName, dynamoDbClient) + } + + override def getLatestProcessedDateFromStorage(table: String, until: Option[LocalDate]): Option[LocalDate] = { + try { + val queryBuilder = QueryRequest.builder() + .tableName(bookkeepingTableName) + .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName") + .expressionAttributeValues(Map( + ":tableName" -> AttributeValue.builder().s(table).build() + ).asJava) + .scanIndexForward(false) // descending order + + val query = until match { + case Some(endDate) => + val endDateStr = getDateStr(endDate) + // Query using prefix on the sort key since we need items with infoDate <= endDate + queryBuilder + .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName AND $ATTR_INFO_DATE_SORT_KEY <= :endDateMax") + .expressionAttributeValues(Map( + ":tableName" -> AttributeValue.builder().s(table).build(), + // Use max possible value after date to get all entries for that date and before + ":endDateMax" -> AttributeValue.builder().s(s"${endDateStr}#~").build() + ).asJava) + case None => + queryBuilder + } + + val response = dynamoDbClient.query(query.build()) + val items = 
response.items().asScala + + if (items.isEmpty) { + None + } else { + // Find the maximum infoDateEnd + val latestDate = items + .map(item => LocalDate.parse(item.get(ATTR_INFO_DATE_END).s())) + .maxBy(_.toEpochDay) + Some(latestDate) + } + } catch { + case NonFatal(ex) => + log.error(s"Error querying latest processed date for table '$table'", ex) + throw ex + } + } + + override def getLatestDataChunkFromStorage(table: String, infoDate: LocalDate): Option[DataChunk] = { + try { + val dateStr = getDateStr(infoDate) + + val queryRequest = QueryRequest.builder() + .tableName(bookkeepingTableName) + .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName AND begins_with($ATTR_INFO_DATE_SORT_KEY, :infoDatePrefix)") + .expressionAttributeValues(Map( + ":tableName" -> AttributeValue.builder().s(table).build(), + ":infoDatePrefix" -> AttributeValue.builder().s(s"$dateStr#").build() + ).asJava) + .scanIndexForward(false) // descending order by sort key (latest jobFinished first) + .build() + + val response = dynamoDbClient.query(queryRequest) + val items = response.items().asScala + + if (items.isEmpty) { + None + } else { + // Take the first item (already sorted in descending order) + items + .map(itemToDataChunk) + .headOption + } + } catch { + case NonFatal(ex) => + log.error(s"Error getting latest data chunk for table '$table' at $infoDate", ex) + throw ex + } + } + + override def getDataChunksFromStorage(table: String, infoDate: LocalDate, batchIdFilter: Option[Long]): Seq[DataChunk] = { + try { + val dateStr = getDateStr(infoDate) + + val queryBuilder = QueryRequest.builder() + .tableName(bookkeepingTableName) + .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName AND begins_with($ATTR_INFO_DATE_SORT_KEY, :infoDatePrefix)") + .expressionAttributeValues(Map( + ":tableName" -> AttributeValue.builder().s(table).build(), + ":infoDatePrefix" -> AttributeValue.builder().s(s"$dateStr#").build() + ).asJava) + + val query = batchIdFilter match { + case Some(bId) => + 
queryBuilder + .filterExpression(s"$ATTR_BATCH_ID = :batchId") + .expressionAttributeValues(Map( + ":tableName" -> AttributeValue.builder().s(table).build(), + ":infoDatePrefix" -> AttributeValue.builder().s(s"$dateStr#").build(), + ":batchId" -> AttributeValue.builder().n(bId.toString).build() + ).asJava) + case None => + queryBuilder + } + + val response = dynamoDbClient.query(query.build()) + val chunks = response.items().asScala + .map(itemToDataChunk) + .sortBy(_.jobFinished) + .toSeq + + log.debug(s"For $table ($infoDate) : ${chunks.mkString("[ ", ", ", " ]")}") + chunks + } catch { + case NonFatal(ex) => + log.error(s"Error getting data chunks for table '$table' at $infoDate", ex) + throw ex + } + } + + override def getDataChunksCountFromStorage(table: String, dateBeginOpt: Option[LocalDate], dateEndOpt: Option[LocalDate]): Long = { + try { + var count = 0L + var lastEvaluatedKey: java.util.Map[String, AttributeValue] = null + + do { + val queryBuilder = buildQueryForDateRange(table, dateBeginOpt, dateEndOpt) + .select(Select.COUNT) + + if (lastEvaluatedKey != null) { + queryBuilder.exclusiveStartKey(lastEvaluatedKey) + } + + val response = dynamoDbClient.query(queryBuilder.build()) + count += response.count() + lastEvaluatedKey = response.lastEvaluatedKey() + } while (lastEvaluatedKey != null && !lastEvaluatedKey.isEmpty) + + count + } catch { + case NonFatal(ex) => + log.error(s"Error counting data chunks for table '$table'", ex) + throw ex + } + } + + override def getDataAvailabilityFromStorage(table: String, dateBegin: LocalDate, dateEnd: LocalDate): Seq[DataAvailability] = { + try { + val allChunks = getAllChunksInDateRange(table, dateBegin, dateEnd) + + // Group by infoDate and aggregate + val grouped = allChunks.groupBy(_.infoDate) + val availability = grouped.map { case (dateStr, chunks) => + val date = LocalDate.parse(dateStr) + val totalRecords = chunks.map(_.outputRecordCount).sum + DataAvailability(date, chunks.length, totalRecords) + 
}.toSeq.sortBy(_.infoDate.toEpochDay) + + availability + } catch { + case NonFatal(ex) => + log.error(s"Error getting data availability for table '$table'", ex) + throw ex + } + } + + override def saveRecordCountToStorage( + table: String, + infoDate: LocalDate, + inputRecordCount: Long, + outputRecordCount: Long, + recordsAppended: Option[Long], + jobStarted: Long, + jobFinished: Long + ): Unit = { + try { + val dateStr = getDateStr(infoDate) + val sortKey = buildSortKey(dateStr, jobFinished) + + val item = dataChunkToItem( + DataChunk(table, dateStr, dateStr, dateStr, inputRecordCount, outputRecordCount, jobStarted, jobFinished, Some(batchId), recordsAppended), + sortKey + ) + + val putRequest = PutItemRequest.builder() + .tableName(bookkeepingTableName) + .item(item) + .build() + + dynamoDbClient.putItem(putRequest) + log.debug(s"Saved bookkeeping record for table '$table', infoDate='$dateStr', sortKey='$sortKey', batchId=$batchId") + } catch { + case NonFatal(ex) => + log.error(s"Error saving record count for table '$table' at $infoDate", ex) + throw ex + } + } + + override def deleteNonCurrentBatchRecords(table: String, infoDate: LocalDate): Unit = { + try { + val dateStr = getDateStr(infoDate) + + AlgorithmUtils.runActionWithElapsedTimeEvent(queryWarningTimeoutMs) { + // Query all items for this table and date + val queryRequest = QueryRequest.builder() + .tableName(bookkeepingTableName) + .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName AND begins_with($ATTR_INFO_DATE_SORT_KEY, :infoDatePrefix)") + .expressionAttributeValues(Map( + ":tableName" -> AttributeValue.builder().s(table).build(), + ":infoDatePrefix" -> AttributeValue.builder().s(s"$dateStr#").build() + ).asJava) + .build() + + val response = dynamoDbClient.query(queryRequest) + val items = response.items().asScala + + // Filter and delete items with different batchId + items.foreach { item => + val itemBatchId = Option(item.get(ATTR_BATCH_ID)).flatMap(av => + if (av.n() != null) 
Some(av.n().toLong) else None + ) + + if (itemBatchId.exists(_ != batchId)) { + val sortKey = item.get(ATTR_INFO_DATE_SORT_KEY).s() + val deleteRequest = DeleteItemRequest.builder() + .tableName(bookkeepingTableName) + .key(Map( + ATTR_TABLE_NAME -> AttributeValue.builder().s(table).build(), + ATTR_INFO_DATE_SORT_KEY -> AttributeValue.builder().s(sortKey).build() + ).asJava) + .conditionExpression(s"$ATTR_JOB_FINISHED = :jobFinished") + .expressionAttributeValues(Map( + ":jobFinished" -> item.get(ATTR_JOB_FINISHED) + ).asJava) + .build() + + try { + dynamoDbClient.deleteItem(deleteRequest) + } catch { + case _: ConditionalCheckFailedException => + // Item was already modified or deleted, ignore + log.info(s"Could not delete item for table '$table', sortKey '$sortKey' - already modified") + } + } + } + } { actualTimeMs => + val elapsedTime = TimeUtils.prettyPrintElapsedTimeShort(actualTimeMs) + log.warn(s"DynamoDB query took too long ($elapsedTime) while deleting from $bookkeepingTableName, tableName='$table', infoDate='$infoDate', batchId!=$batchId") + } + } catch { + case NonFatal(ex) => + log.error(s"Error deleting non-current batch records for table '$table' at $infoDate", ex) + throw ex + } + } + + override def deleteTable(tableName: String): Seq[String] = { + try { + val results = scala.collection.mutable.ListBuffer[String]() + + // Delete from bookkeeping table + val bookkeepingCount = deleteTableFromBookkeeping(tableName) + results += s"Deleted $bookkeepingCount bookkeeping records for table '$tableName'" + + // Delete from schema table + val schemaCount = deleteTableFromSchemas(tableName) + results += s"Deleted $schemaCount schema records for table '$tableName'" + + // Delete offsets + val offsetResults = offsetManagementDynamoDB.deleteAllOffsets(tableName, dynamoDbClient) + results += s"Deleted $offsetResults offset records for table '$tableName'" + + log.info(s"Successfully deleted all records for table '$tableName'") + results.toSeq + } catch { + case 
NonFatal(ex) => + log.error(s"Error deleting table '$tableName'", ex) + throw ex + } + } + + /** + * Deletes all bookkeeping records for the specified table. + * + * @param tableName The name of the table to delete + * @return The number of records deleted + */ + private def deleteTableFromBookkeeping(tableName: String): Int = { + var deletedCount = 0 + var lastEvaluatedKey: java.util.Map[String, AttributeValue] = null + + do { + val queryBuilder = QueryRequest.builder() + .tableName(bookkeepingTableName) + .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName") + .expressionAttributeValues(Map( + ":tableName" -> AttributeValue.builder().s(tableName).build() + ).asJava) + + if (lastEvaluatedKey != null) { + queryBuilder.exclusiveStartKey(lastEvaluatedKey) + } + + val response = dynamoDbClient.query(queryBuilder.build()) + val items = response.items().asScala + + // Delete each item + items.foreach { item => + val sortKey = item.get(ATTR_INFO_DATE_SORT_KEY).s() + val deleteRequest = DeleteItemRequest.builder() + .tableName(bookkeepingTableName) + .key(Map( + ATTR_TABLE_NAME -> AttributeValue.builder().s(tableName).build(), + ATTR_INFO_DATE_SORT_KEY -> AttributeValue.builder().s(sortKey).build() + ).asJava) + .build() + + try { + dynamoDbClient.deleteItem(deleteRequest) + deletedCount += 1 + } catch { + case NonFatal(ex) => + log.warn(s"Failed to delete bookkeeping item for table '$tableName', sortKey '$sortKey'", ex) + } + } + + lastEvaluatedKey = response.lastEvaluatedKey() + } while (lastEvaluatedKey != null && !lastEvaluatedKey.isEmpty) + + deletedCount + } + + /** + * Deletes all schema records for the specified table. 
+ * + * @param tableName The name of the table to delete + * @return The number of records deleted + */ + private def deleteTableFromSchemas(tableName: String): Int = { + var deletedCount = 0 + var lastEvaluatedKey: java.util.Map[String, AttributeValue] = null + + do { + val queryBuilder = QueryRequest.builder() + .tableName(schemaTableName) + .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName") + .expressionAttributeValues(Map( + ":tableName" -> AttributeValue.builder().s(tableName).build() + ).asJava) + + if (lastEvaluatedKey != null) { + queryBuilder.exclusiveStartKey(lastEvaluatedKey) + } + + val response = dynamoDbClient.query(queryBuilder.build()) + val items = response.items().asScala + + // Delete each item + items.foreach { item => + val infoDate = item.get(ATTR_INFO_DATE).s() + val deleteRequest = DeleteItemRequest.builder() + .tableName(schemaTableName) + .key(Map( + ATTR_TABLE_NAME -> AttributeValue.builder().s(tableName).build(), + ATTR_INFO_DATE -> AttributeValue.builder().s(infoDate).build() + ).asJava) + .build() + + try { + dynamoDbClient.deleteItem(deleteRequest) + deletedCount += 1 + } catch { + case NonFatal(ex) => + log.warn(s"Failed to delete schema item for table '$tableName', infoDate '$infoDate'", ex) + } + } + + lastEvaluatedKey = response.lastEvaluatedKey() + } while (lastEvaluatedKey != null && !lastEvaluatedKey.isEmpty) + + deletedCount + } + + override def getLatestSchema(tableName: String, until: LocalDate): Option[(StructType, LocalDate)] = { + try { + val untilDateStr = until.toString + + val queryRequest = QueryRequest.builder() + .tableName(schemaTableName) + .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName AND $ATTR_INFO_DATE <= :untilDate") + .expressionAttributeValues(Map( + ":tableName" -> AttributeValue.builder().s(tableName).build(), + ":untilDate" -> AttributeValue.builder().s(untilDateStr).build() + ).asJava) + .scanIndexForward(false) // descending order + .limit(1) + .build() + + val response = 
dynamoDbClient.query(queryRequest) + val items = response.items().asScala + + items.headOption.flatMap { item => + val tableSchema = TableSchema( + tableName = item.get(ATTR_TABLE_NAME).s(), + infoDate = item.get(ATTR_INFO_DATE).s(), + schemaJson = item.get(ATTR_SCHEMA_JSON).s() + ) + TableSchema.toSchemaAndDate(tableSchema) + } + } catch { + case NonFatal(ex) => + log.error(s"Error getting latest schema for table '$tableName' until $until", ex) + throw ex + } + } + + private[pramen] override def saveSchema(tableName: String, infoDate: LocalDate, schema: StructType): Unit = { + try { + val item = Map( + ATTR_TABLE_NAME -> AttributeValue.builder().s(tableName).build(), + ATTR_INFO_DATE -> AttributeValue.builder().s(infoDate.toString).build(), + ATTR_SCHEMA_JSON -> AttributeValue.builder().s(schema.json).build() + ).asJava + + val putRequest = PutItemRequest.builder() + .tableName(schemaTableName) + .item(item) + .build() + + dynamoDbClient.putItem(putRequest) + log.debug(s"Saved schema for table '$tableName', infoDate='$infoDate'") + } catch { + case NonFatal(ex) => + log.error(s"Error saving schema for table '$tableName' at $infoDate", ex) + throw ex + } + } + + private[pramen] override def getOffsetManager: OffsetManager = { + offsetManagement + } + + override def close(): Unit = { + try { + // Note: offsetManagement wraps OffsetManagerDynamoDb which shares the same dynamoDbClient, + // so we don't need to close it separately + dynamoDbClient.close() + } catch { + case NonFatal(ex) => + log.warn("Error closing DynamoDB client", ex) + } + } + + private def itemToDataChunk(item: java.util.Map[String, AttributeValue]): DataChunk = { + DataChunk( + tableName = item.get(ATTR_TABLE_NAME).s(), + infoDate = item.get(ATTR_INFO_DATE).s(), + infoDateBegin = item.get(ATTR_INFO_DATE_BEGIN).s(), + infoDateEnd = item.get(ATTR_INFO_DATE_END).s(), + inputRecordCount = item.get(ATTR_INPUT_RECORD_COUNT).n().toLong, + outputRecordCount = item.get(ATTR_OUTPUT_RECORD_COUNT).n().toLong, 
+ jobStarted = item.get(ATTR_JOB_STARTED).n().toLong, + jobFinished = item.get(ATTR_JOB_FINISHED).n().toLong, + batchId = Option(item.get(ATTR_BATCH_ID)).flatMap(av => if (av.n() != null) Some(av.n().toLong) else None), + appendedRecordCount = Option(item.get(ATTR_APPENDED_RECORD_COUNT)).flatMap(av => if (av.n() != null) Some(av.n().toLong) else None) + ) + } + + private def dataChunkToItem(chunk: DataChunk, sortKey: String): java.util.Map[String, AttributeValue] = { + val baseMap = Map( + ATTR_TABLE_NAME -> AttributeValue.builder().s(chunk.tableName).build(), + ATTR_INFO_DATE_SORT_KEY -> AttributeValue.builder().s(sortKey).build(), + ATTR_INFO_DATE -> AttributeValue.builder().s(chunk.infoDate).build(), + ATTR_INFO_DATE_BEGIN -> AttributeValue.builder().s(chunk.infoDateBegin).build(), + ATTR_INFO_DATE_END -> AttributeValue.builder().s(chunk.infoDateEnd).build(), + ATTR_INPUT_RECORD_COUNT -> AttributeValue.builder().n(chunk.inputRecordCount.toString).build(), + ATTR_OUTPUT_RECORD_COUNT -> AttributeValue.builder().n(chunk.outputRecordCount.toString).build(), + ATTR_JOB_STARTED -> AttributeValue.builder().n(chunk.jobStarted.toString).build(), + ATTR_JOB_FINISHED -> AttributeValue.builder().n(chunk.jobFinished.toString).build() + ) + + val withBatchId = chunk.batchId match { + case Some(bid) => baseMap + (ATTR_BATCH_ID -> AttributeValue.builder().n(bid.toString).build()) + case None => baseMap + } + + val withAppendedCount = chunk.appendedRecordCount match { + case Some(count) => withBatchId + (ATTR_APPENDED_RECORD_COUNT -> AttributeValue.builder().n(count.toString).build()) + case None => withBatchId + } + + withAppendedCount.asJava + } + + private def buildQueryForDateRange( + table: String, + dateBeginOpt: Option[LocalDate], + dateEndOpt: Option[LocalDate] + ): QueryRequest.Builder = { + val builder = QueryRequest.builder() + .tableName(bookkeepingTableName) + + (dateBeginOpt, dateEndOpt) match { + case (Some(begin), Some(end)) => + val beginStr = getDateStr(begin) 
+ val endStr = getDateStr(end) + builder + .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName AND $ATTR_INFO_DATE_SORT_KEY BETWEEN :beginDate AND :endDateMax") + .expressionAttributeValues(Map( + ":tableName" -> AttributeValue.builder().s(table).build(), + ":beginDate" -> AttributeValue.builder().s(s"$beginStr#").build(), + ":endDateMax" -> AttributeValue.builder().s(s"$endStr#~").build() + ).asJava) + case (Some(begin), None) => + val beginStr = getDateStr(begin) + builder + .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName AND $ATTR_INFO_DATE_SORT_KEY >= :beginDate") + .expressionAttributeValues(Map( + ":tableName" -> AttributeValue.builder().s(table).build(), + ":beginDate" -> AttributeValue.builder().s(s"$beginStr#").build() + ).asJava) + case (None, Some(end)) => + val endStr = getDateStr(end) + builder + .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName AND $ATTR_INFO_DATE_SORT_KEY <= :endDateMax") + .expressionAttributeValues(Map( + ":tableName" -> AttributeValue.builder().s(table).build(), + ":endDateMax" -> AttributeValue.builder().s(s"$endStr#~").build() + ).asJava) + case (None, None) => + builder + .keyConditionExpression(s"$ATTR_TABLE_NAME = :tableName") + .expressionAttributeValues(Map( + ":tableName" -> AttributeValue.builder().s(table).build() + ).asJava) + } + } + + private def getAllChunksInDateRange(table: String, dateBegin: LocalDate, dateEnd: LocalDate): Seq[DataChunk] = { + val chunks = scala.collection.mutable.ListBuffer[DataChunk]() + var lastEvaluatedKey: java.util.Map[String, AttributeValue] = null + + do { + val queryBuilder = buildQueryForDateRange(table, Some(dateBegin), Some(dateEnd)) + + if (lastEvaluatedKey != null) { + queryBuilder.exclusiveStartKey(lastEvaluatedKey) + } + + val response = dynamoDbClient.query(queryBuilder.build()) + chunks ++= response.items().asScala.map(itemToDataChunk) + lastEvaluatedKey = response.lastEvaluatedKey() + } while (lastEvaluatedKey != null && !lastEvaluatedKey.isEmpty) + + 
chunks.toSeq + } + + /** + * Builds the composite sort key for bookkeeping table: "infoDate#jobFinished" + * + * @param infoDate The information date + * @param jobFinished The job finished timestamp in milliseconds + * @return Composite sort key string + */ + private def buildSortKey(infoDate: String, jobFinished: Long): String = { + s"$infoDate#$jobFinished" + } + + /** + * Extracts the infoDate from a composite sort key. + * + * @param sortKey Composite sort key in format "infoDate#jobFinished" + * @return The infoDate portion + */ + private def extractInfoDate(sortKey: String): String = { + sortKey.split("#").headOption.getOrElse(sortKey) + } +} + +object BookkeeperDynamoDb { + val DEFAULT_BOOKKEEPING_TABLE = "bookkeeping" + val DEFAULT_SCHEMA_TABLE = "schemas" + val DEFAULT_TABLE_PREFIX = "pramen" + + // Attribute names for bookkeeping table + val ATTR_TABLE_NAME = "tableName" + val ATTR_INFO_DATE = "infoDate" + val ATTR_INFO_DATE_SORT_KEY = "infoDateSortKey" // Composite: "infoDate#jobFinished" + val ATTR_INFO_DATE_BEGIN = "infoDateBegin" + val ATTR_INFO_DATE_END = "infoDateEnd" + val ATTR_INPUT_RECORD_COUNT = "inputRecordCount" + val ATTR_OUTPUT_RECORD_COUNT = "outputRecordCount" + val ATTR_JOB_STARTED = "jobStarted" + val ATTR_JOB_FINISHED = "jobFinished" + val ATTR_BATCH_ID = "batchId" + val ATTR_APPENDED_RECORD_COUNT = "appendedRecordCount" + + // Attribute names for schema table + val ATTR_SCHEMA_JSON = "schemaJson" + + val MODEL_VERSION = 1 + + private val log = LoggerFactory.getLogger(this.getClass) + + /** + * Builder for creating BookkeeperDynamoDb instances. + * Provides a fluent API for configuring DynamoDB bookkeeper. 
+ * + * Example: + * {{{ + * val bookkeeper = BookkeeperDynamoDb.builder + * .withRegion("us-east-1") + * .withBatchId(System.currentTimeMillis()) + * .withTablePrefix("my_app") + * .build() + * }}} + */ + class BookkeeperDynamoDbBuilder { + private var region: Option[String] = None + private var batchId: Option[Long] = None + private var tableArn: Option[String] = None + private var tablePrefix: String = DEFAULT_TABLE_PREFIX + private var credentialsProvider: Option[AwsCredentialsProvider] = None + private var endpoint: Option[String] = None + + /** + * Sets the AWS region for the DynamoDB client. + * + * @param region AWS region (e.g., "us-east-1", "eu-west-1") + * @return this builder + */ + def withRegion(region: String): BookkeeperDynamoDbBuilder = { + this.region = Some(region) + this + } + + /** + * Sets the batch ID for this bookkeeper instance. + * + * @param batchId Batch ID (typically timestamp in milliseconds) + * @return this builder + */ + def withBatchId(batchId: Long): BookkeeperDynamoDbBuilder = { + this.batchId = Some(batchId) + this + } + + /** + * Sets the table ARN prefix for cross-account or cross-region access. + * + * @param arn ARN prefix (e.g., "arn:aws:dynamodb:us-east-1:123456789012:table/") + * @return this builder + */ + def withTableArn(arn: String): BookkeeperDynamoDbBuilder = { + this.tableArn = Some(arn) + this + } + + /** + * Sets the table ARN prefix for cross-account or cross-region access. + * + * @param arnOpt ARN prefix (e.g., "arn:aws:dynamodb:us-east-1:123456789012:table/") + * @return this builder + */ + def withTableArn(arnOpt: Option[String]): BookkeeperDynamoDbBuilder = { + this.tableArn = arnOpt + this + } + + /** + * Sets the table name prefix to allow multiple bookkeeping sets in the same account. 
+ * + * @param prefix Table name prefix (default: "pramen") + * @return this builder + */ + def withTablePrefix(prefix: String): BookkeeperDynamoDbBuilder = { + this.tablePrefix = prefix + this + } + + /** + * Sets custom AWS credentials provider. + * + * @param provider AWS credentials provider + * @return this builder + */ + def withCredentialsProvider(provider: AwsCredentialsProvider): BookkeeperDynamoDbBuilder = { + this.credentialsProvider = Some(provider) + this + } + + /** + * Sets a custom DynamoDB endpoint (useful for testing with LocalStack or DynamoDB Local). + * + * @param endpoint Endpoint URI (e.g., "http://localhost:8000") + * @return this builder + */ + def withEndpoint(endpoint: String): BookkeeperDynamoDbBuilder = { + this.endpoint = Some(endpoint) + this + } + + /** + * Builds the BookkeeperDynamoDb instance. + * + * @return Configured BookkeeperDynamoDb instance + * @throws IllegalArgumentException if required parameters are missing + */ + def build(): BookkeeperDynamoDb = { + val actualBatchId = batchId.getOrElse(throw new IllegalArgumentException("BatchId is not supplied when building the instance of BookkeeperDynamoDb")) + + if (region.isEmpty) { + throw new IllegalArgumentException("Either region or dynamoDbClient must be provided") + } + + val clientBuilder = DynamoDbClient.builder() + .region(Region.of(region.get)) + + credentialsProvider.foreach(clientBuilder.credentialsProvider) + + endpoint.foreach { ep => + clientBuilder.endpointOverride(URI.create(ep)) + } + + val client = clientBuilder.build() + + try { + new BookkeeperDynamoDb( + dynamoDbClient = client, + batchId = actualBatchId, + tableArn = tableArn, + tablePrefix = tablePrefix + ) + } catch { + case NonFatal(ex) => + client.close() + throw ex + } + } + } + + def builder: BookkeeperDynamoDbBuilder = new BookkeeperDynamoDbBuilder + + /** + * Constructs the full table name using ARN prefix and table name. 
   * If tableArn is provided, uses it as a prefix, otherwise returns just the table name.
   *
   * @param tableArn Optional ARN prefix for the table
   * @param tableName The table name
   * @return Full table name or ARN
   */
  def getFullTableName(tableArn: Option[String], tableName: String): String = {
    tableArn match {
      case Some(arn) if arn.nonEmpty =>
        // If ARN ends with table/, append the table name, otherwise append /table/tableName
        if (arn.endsWith("table/")) {
          s"$arn$tableName"
        } else if (arn.endsWith("/")) {
          s"${arn}table/$tableName"
        } else {
          s"$arn/table/$tableName"
        }
      case _ => tableName // Empty or absent ARN prefix - use the plain table name
    }
  }

  /**
    * Waits for a table to become active after creation.
    *
    * Polls DescribeTable every 2 seconds until the table reports ACTIVE status
    * or the timeout elapses; transient describe failures are retried.
    *
    * @param tableName The name of the table to wait for
    * @param dynamoDbClient The DynamoDB client used to poll the table status
    * @param maxWaitSeconds Maximum time to wait in seconds (default: 60)
    */
  def waitForTableActive(tableName: String, dynamoDbClient: DynamoDbClient, maxWaitSeconds: Int = 60): Unit = {
    val startTime = System.currentTimeMillis()
    val maxWaitMs = maxWaitSeconds * 1000L

    var tableActive = false
    while (!tableActive && (System.currentTimeMillis() - startTime) < maxWaitMs) {
      try {
        val describeRequest = DescribeTableRequest.builder()
          .tableName(tableName)
          .build()

        val response = dynamoDbClient.describeTable(describeRequest)
        val status = response.table().tableStatus()

        if (status == TableStatus.ACTIVE) {
          tableActive = true
          log.info(s"Table $tableName is now ACTIVE")
        } else {
          log.info(s"Table $tableName status: $status, waiting...")
          Thread.sleep(2000) // Wait 2 seconds before checking again
        }
      } catch {
        case NonFatal(ex) =>
          // Transient describe errors are swallowed and retried until the overall timeout expires
          log.warn(s"Error checking table status for $tableName", ex)
          Thread.sleep(2000)
      }
    }

    if (!tableActive) {
      throw new RuntimeException(s"Table $tableName did not become active within $maxWaitSeconds seconds")
    }
  }
}
diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManager.scala
b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManager.scala index 684825429..39addcd27 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManager.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManager.scala @@ -33,7 +33,7 @@ import java.time.LocalDate * The startWriteOffsets() together with commitOffsets() and rollbackOffsets() provide mechanisms to ensure consistency * with data. */ -trait OffsetManager { +trait OffsetManager extends AutoCloseable { /** * Returns offsets for an information date. * diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerCached.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerCached.scala index 8da69ce30..52f411ab0 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerCached.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerCached.scala @@ -25,21 +25,21 @@ import java.time.LocalDate import scala.collection.mutable /** - * The offset manager decorator handles caching or repeated queries. + * The offset manager decorator handles caching of repeated queries. 
*/ class OffsetManagerCached(offsetManager: OffsetManager) extends OffsetManager { private val log = LoggerFactory.getLogger(this.getClass) private val aggregatedOffsetsCache = new mutable.HashMap[(String, Option[LocalDate]), Option[DataOffsetAggregated]] - def getOffsets(table: String, infoDate: LocalDate): Array[DataOffset] = { + override def getOffsets(table: String, infoDate: LocalDate): Array[DataOffset] = { offsetManager.getOffsets(table, infoDate) } - def getUncommittedOffsets(table: String, onlyForInfoDate: Option[LocalDate]): Array[UncommittedOffset] = { + override def getUncommittedOffsets(table: String, onlyForInfoDate: Option[LocalDate]): Array[UncommittedOffset] = { offsetManager.getUncommittedOffsets(table, onlyForInfoDate) } - def getMaxInfoDateAndOffset(table: String, onlyForInfoDate: Option[LocalDate]): Option[DataOffsetAggregated] = synchronized { + override def getMaxInfoDateAndOffset(table: String, onlyForInfoDate: Option[LocalDate]): Option[DataOffsetAggregated] = synchronized { val tbl = onlyForInfoDate match { case Some(date) => s"'$table' for '$date'" case None => s"'$table'" @@ -57,11 +57,11 @@ class OffsetManagerCached(offsetManager: OffsetManager) extends OffsetManager { } } - def startWriteOffsets(table: String, infoDate: LocalDate, offsetType: OffsetType): DataOffsetRequest = { + override def startWriteOffsets(table: String, infoDate: LocalDate, offsetType: OffsetType): DataOffsetRequest = { offsetManager.startWriteOffsets(table, infoDate, offsetType) } - def commitOffsets(request: DataOffsetRequest, minOffset: OffsetValue, maxOffset: OffsetValue): Unit = { + override def commitOffsets(request: DataOffsetRequest, minOffset: OffsetValue, maxOffset: OffsetValue): Unit = { offsetManager.commitOffsets(request, minOffset, maxOffset) this.synchronized { @@ -69,7 +69,7 @@ class OffsetManagerCached(offsetManager: OffsetManager) extends OffsetManager { } } - def commitRerun(request: DataOffsetRequest, minOffset: OffsetValue, maxOffset: 
OffsetValue): Unit = { + override def commitRerun(request: DataOffsetRequest, minOffset: OffsetValue, maxOffset: OffsetValue): Unit = { this.synchronized { aggregatedOffsetsCache --= aggregatedOffsetsCache.keys.filter(_._1 == request.tableName) } @@ -77,7 +77,7 @@ class OffsetManagerCached(offsetManager: OffsetManager) extends OffsetManager { offsetManager.commitRerun(request, minOffset, maxOffset) } - def postCommittedRecords(commitRequests: Seq[OffsetCommitRequest]): Unit = { + override def postCommittedRecords(commitRequests: Seq[OffsetCommitRequest]): Unit = { offsetManager.postCommittedRecords(commitRequests) val updatedTables = commitRequests.map(_.table).toSet @@ -86,10 +86,14 @@ class OffsetManagerCached(offsetManager: OffsetManager) extends OffsetManager { } } - def rollbackOffsets(request: DataOffsetRequest): Unit = { + override def rollbackOffsets(request: DataOffsetRequest): Unit = { offsetManager.rollbackOffsets(request) } + override def close(): Unit = { + offsetManager.close() + } + private def renderAggregatedOptionalOffset(offsetsOpt: Option[DataOffsetAggregated]): String = { offsetsOpt match { case Some(offsets) => diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerDynamoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerDynamoDb.scala new file mode 100644 index 000000000..0c5f872d8 --- /dev/null +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerDynamoDb.scala @@ -0,0 +1,642 @@ +/* + * Copyright 2022 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.pramen.core.bookkeeper + +import org.slf4j.LoggerFactory +import software.amazon.awssdk.auth.credentials.AwsCredentialsProvider +import software.amazon.awssdk.regions.Region +import software.amazon.awssdk.services.dynamodb.DynamoDbClient +import software.amazon.awssdk.services.dynamodb.model._ +import za.co.absa.pramen.api.offset.DataOffset.UncommittedOffset +import za.co.absa.pramen.api.offset.{DataOffset, OffsetType, OffsetValue} +import za.co.absa.pramen.core.bookkeeper.BookkeeperDynamoDb.waitForTableActive +import za.co.absa.pramen.core.bookkeeper.model._ + +import java.net.URI +import java.time.{Instant, LocalDate} +import scala.collection.JavaConverters._ +import scala.util.control.NonFatal + +/** + * DynamoDB-based offset manager for tracking incremental ingestion offsets. + * + * Table schema for offsets: + * - Partition key: pramenTableName (String) + * - Sort key: compositeKey (String) - format: "infoDate#createdAtMilli" for efficient querying + * + * The composite sort key allows: + * 1. Efficient queries for all offsets of a table+infoDate combination + * 2. Time-ordered offset records + * 3. 
Support for aggregation queries (fetch all offsets for a table+date) + * + * @param dynamoDbClient The DynamoDB client to use + * @param batchId The batch ID for this execution + * @param tableArn Optional ARN prefix for the offset table + * @param tablePrefix Prefix for the offset table name (default: "pramen") + */ +class OffsetManagerDynamoDb( + dynamoDbClient: DynamoDbClient, + batchId: Long, + tableArn: Option[String] = None, + tablePrefix: String = OffsetManagerDynamoDb.DEFAULT_TABLE_PREFIX, + closesClient: Boolean = true +) extends OffsetManager { + + import OffsetManagerDynamoDb._ + + private val log = LoggerFactory.getLogger(this.getClass) + + private val offsetTableBaseName = s"${tablePrefix}_${DEFAULT_OFFSET_TABLE}" + private val offsetTableFullName = BookkeeperDynamoDb.getFullTableName(tableArn, offsetTableBaseName) + + // Initialize table on creation + createOffsetTableIfNotExists() + + override def getOffsets(table: String, infoDate: LocalDate): Array[DataOffset] = { + val offsets = getOffsetRecords(table, infoDate) + + if (offsets.isEmpty) { + return Array.empty + } + + offsets.map(OffsetRecordConverter.toDataOffset) + } + + override def getUncommittedOffsets(table: String, onlyForInfoDate: Option[LocalDate]): Array[UncommittedOffset] = { + try { + onlyForInfoDate match { + case Some(infoDate) => + // Query for specific table and info date + val offsets = getOffsetRecords(table, infoDate) + offsets + .filter(_.committedAtMilli.isEmpty) + .map(record => OffsetRecordConverter.toDataOffset(record).asInstanceOf[UncommittedOffset]) + + case None => + // Query all offsets for this table with pagination + var allItems = Seq.empty[java.util.Map[String, AttributeValue]] + var lastEvaluatedKey: java.util.Map[String, AttributeValue] = null + + do { + val queryRequestBuilder = QueryRequest.builder() + .tableName(offsetTableFullName) + .keyConditionExpression(s"$ATTR_PRAMEN_TABLE_NAME = :table_name") + 
.filterExpression(s"attribute_not_exists($ATTR_COMMITTED_AT)") + .expressionAttributeValues(Map( + ":table_name" -> AttributeValue.builder().s(table).build() + ).asJava) + + if (lastEvaluatedKey != null) { + queryRequestBuilder.exclusiveStartKey(lastEvaluatedKey) + } + + val result = dynamoDbClient.query(queryRequestBuilder.build()) + allItems = allItems ++ result.items().asScala + lastEvaluatedKey = result.lastEvaluatedKey() + } while (lastEvaluatedKey != null && !lastEvaluatedKey.isEmpty) + + allItems + .map(itemToOffsetRecord) + .map(record => OffsetRecordConverter.toDataOffset(record).asInstanceOf[UncommittedOffset]) + .toArray + } + } catch { + case NonFatal(ex) => + throw new RuntimeException(s"Unable to read uncommitted offsets from the offset table '$offsetTableFullName'.", ex) + } + } + + override def getMaxInfoDateAndOffset(table: String, onlyForInfoDate: Option[LocalDate]): Option[DataOffsetAggregated] = { + val maxInfoDateOpt = onlyForInfoDate.orElse(getMaximumInfoDate(table)) + + try { + maxInfoDateOpt.flatMap { infoDate => + getMinMaxOffsets(table, infoDate) + } + } catch { + case NonFatal(ex) => throw new RuntimeException(s"Unable to read from the offset table '$offsetTableFullName'.", ex) + } + } + + override def startWriteOffsets(table: String, infoDate: LocalDate, offsetType: OffsetType): DataOffsetRequest = { + val createdAt = Instant.now() + val createdAtMilli = createdAt.toEpochMilli + val compositeKey = s"${infoDate.toString}#${createdAtMilli}" + + try { + val putRequest = PutItemRequest.builder() + .tableName(offsetTableFullName) + .item(Map( + ATTR_PRAMEN_TABLE_NAME -> AttributeValue.builder().s(table).build(), + ATTR_COMPOSITE_KEY -> AttributeValue.builder().s(compositeKey).build(), + ATTR_INFO_DATE -> AttributeValue.builder().s(infoDate.toString).build(), + ATTR_DATA_TYPE -> AttributeValue.builder().s(offsetType.dataTypeString).build(), + ATTR_MIN_OFFSET -> AttributeValue.builder().s("").build(), + ATTR_MAX_OFFSET -> 
AttributeValue.builder().s("").build(), + ATTR_BATCH_ID -> AttributeValue.builder().n(batchId.toString).build(), + ATTR_CREATED_AT -> AttributeValue.builder().n(createdAtMilli.toString).build() + ).asJava) + .build() + + dynamoDbClient.putItem(putRequest) + + DataOffsetRequest(table, infoDate, batchId, createdAt) + } catch { + case NonFatal(ex) => + throw new RuntimeException(s"Unable to write to the offset table '$offsetTableFullName'.", ex) + } + } + + override def commitOffsets(request: DataOffsetRequest, minOffset: OffsetValue, maxOffset: OffsetValue): Unit = { + val committedAt = Instant.now().toEpochMilli + val compositeKey = s"${request.infoDate.toString}#${request.createdAt.toEpochMilli}" + + try { + val updateRequest = UpdateItemRequest.builder() + .tableName(offsetTableFullName) + .key(Map( + ATTR_PRAMEN_TABLE_NAME -> AttributeValue.builder().s(request.tableName).build(), + ATTR_COMPOSITE_KEY -> AttributeValue.builder().s(compositeKey).build() + ).asJava) + .updateExpression(s"SET ${ATTR_MIN_OFFSET} = :min_offset, ${ATTR_MAX_OFFSET} = :max_offset, ${ATTR_COMMITTED_AT} = :committed_at") + .expressionAttributeValues(Map( + ":min_offset" -> AttributeValue.builder().s(minOffset.valueString).build(), + ":max_offset" -> AttributeValue.builder().s(maxOffset.valueString).build(), + ":committed_at" -> AttributeValue.builder().n(committedAt.toString).build() + ).asJava) + .build() + + dynamoDbClient.updateItem(updateRequest) + } catch { + case NonFatal(ex) => + throw new RuntimeException(s"Unable to commit offsets to the offset table '$offsetTableFullName'.", ex) + } + } + + override def commitRerun(request: DataOffsetRequest, minOffset: OffsetValue, maxOffset: OffsetValue): Unit = { + if (minOffset.compareTo(maxOffset) > 0) { + throw new IllegalArgumentException(s"minOffset is greater than maxOffset: ${minOffset.valueString} > ${maxOffset.valueString}") + } + + val committedAt = Instant.now().toEpochMilli + val compositeKey = 
s"${request.infoDate.toString}#${request.createdAt.toEpochMilli}" + + try { + // First, update the current offset + val updateRequest = UpdateItemRequest.builder() + .tableName(offsetTableFullName) + .key(Map( + ATTR_PRAMEN_TABLE_NAME -> AttributeValue.builder().s(request.tableName).build(), + ATTR_COMPOSITE_KEY -> AttributeValue.builder().s(compositeKey).build() + ).asJava) + .updateExpression(s"SET ${ATTR_MIN_OFFSET} = :min_offset, ${ATTR_MAX_OFFSET} = :max_offset, ${ATTR_COMMITTED_AT} = :committed_at") + .expressionAttributeValues(Map( + ":min_offset" -> AttributeValue.builder().s(minOffset.valueString).build(), + ":max_offset" -> AttributeValue.builder().s(maxOffset.valueString).build(), + ":committed_at" -> AttributeValue.builder().n(committedAt.toString).build() + ).asJava) + .build() + + dynamoDbClient.updateItem(updateRequest) + + // Then, delete all other offsets for this table and info date + val allOffsets = getOffsetRecords(request.tableName, request.infoDate) + allOffsets + .filter(r => r.createdAtMilli != request.createdAt.toEpochMilli) + .foreach { record => + val deleteCompositeKey = s"${record.infoDate}#${record.createdAtMilli}" + val deleteRequest = DeleteItemRequest.builder() + .tableName(offsetTableFullName) + .key(Map( + ATTR_PRAMEN_TABLE_NAME -> AttributeValue.builder().s(request.tableName).build(), + ATTR_COMPOSITE_KEY -> AttributeValue.builder().s(deleteCompositeKey).build() + ).asJava) + .build() + + dynamoDbClient.deleteItem(deleteRequest) + } + } catch { + case NonFatal(ex) => + throw new RuntimeException(s"Unable to commit rerun to the offset table '$offsetTableFullName'.", ex) + } + } + + override def postCommittedRecords(commitRequests: Seq[OffsetCommitRequest]): Unit = { + val committedAt = Instant.now() + val committedAtMilli = committedAt.toEpochMilli + + try { + // Insert all new committed records + commitRequests.foreach { req => + val compositeKey = s"${req.infoDate.toString}#${req.createdAt.toEpochMilli}" + + val putRequest = 
PutItemRequest.builder() + .tableName(offsetTableFullName) + .item(Map( + ATTR_PRAMEN_TABLE_NAME -> AttributeValue.builder().s(req.table).build(), + ATTR_COMPOSITE_KEY -> AttributeValue.builder().s(compositeKey).build(), + ATTR_INFO_DATE -> AttributeValue.builder().s(req.infoDate.toString).build(), + ATTR_DATA_TYPE -> AttributeValue.builder().s(req.minOffset.dataType.dataTypeString).build(), + ATTR_MIN_OFFSET -> AttributeValue.builder().s(req.minOffset.valueString).build(), + ATTR_MAX_OFFSET -> AttributeValue.builder().s(req.maxOffset.valueString).build(), + ATTR_BATCH_ID -> AttributeValue.builder().n(batchId.toString).build(), + ATTR_CREATED_AT -> AttributeValue.builder().n(req.createdAt.toEpochMilli.toString).build(), + ATTR_COMMITTED_AT -> AttributeValue.builder().n(committedAtMilli.toString).build() + ).asJava) + .build() + + dynamoDbClient.putItem(putRequest) + } + + // Delete old offsets for each (table, infoDate) pair + commitRequests.map(r => (r.table, r.infoDate)) + .distinct + .foreach { case (table, infoDate) => + val allOffsets = getOffsetRecords(table, infoDate) + allOffsets + .filter(_.committedAtMilli.exists(_ != committedAtMilli)) + .foreach { record => + val deleteCompositeKey = s"${record.infoDate}#${record.createdAtMilli}" + val deleteRequest = DeleteItemRequest.builder() + .tableName(offsetTableFullName) + .key(Map( + ATTR_PRAMEN_TABLE_NAME -> AttributeValue.builder().s(table).build(), + ATTR_COMPOSITE_KEY -> AttributeValue.builder().s(deleteCompositeKey).build() + ).asJava) + .build() + + dynamoDbClient.deleteItem(deleteRequest) + } + } + } catch { + case NonFatal(ex) => + throw new RuntimeException(s"Unable to post committed records to the offset table '$offsetTableFullName'.", ex) + } + } + + override def rollbackOffsets(request: DataOffsetRequest): Unit = { + val compositeKey = s"${request.infoDate.toString}#${request.createdAt.toEpochMilli}" + + try { + val deleteRequest = DeleteItemRequest.builder() + .tableName(offsetTableFullName) + 
.key(Map( + ATTR_PRAMEN_TABLE_NAME -> AttributeValue.builder().s(request.tableName).build(), + ATTR_COMPOSITE_KEY -> AttributeValue.builder().s(compositeKey).build() + ).asJava) + .build() + + dynamoDbClient.deleteItem(deleteRequest) + } catch { + case NonFatal(ex) => + throw new RuntimeException(s"Unable to rollback offsets in the offset table '$offsetTableFullName'.", ex) + } + } + + /** + * Gets all offset records for a table and info date. + */ + private[core] def getOffsetRecords(table: String, infoDate: LocalDate): Array[OffsetRecord] = { + try { + var allItems = Seq.empty[java.util.Map[String, AttributeValue]] + var lastEvaluatedKey: java.util.Map[String, AttributeValue] = null + val infoDatePrefix = s"${infoDate.toString}#" + + do { + val queryRequestBuilder = QueryRequest.builder() + .tableName(offsetTableFullName) + .keyConditionExpression(s"$ATTR_PRAMEN_TABLE_NAME = :table_name AND begins_with($ATTR_COMPOSITE_KEY, :prefix)") + .expressionAttributeValues(Map( + ":table_name" -> AttributeValue.builder().s(table).build(), + ":prefix" -> AttributeValue.builder().s(infoDatePrefix).build() + ).asJava) + + if (lastEvaluatedKey != null) { + queryRequestBuilder.exclusiveStartKey(lastEvaluatedKey) + } + + val result = dynamoDbClient.query(queryRequestBuilder.build()) + allItems = allItems ++ result.items().asScala + lastEvaluatedKey = result.lastEvaluatedKey() + } while (lastEvaluatedKey != null && !lastEvaluatedKey.isEmpty) + + allItems.map(itemToOffsetRecord).toArray + } catch { + case NonFatal(ex) => + throw new RuntimeException(s"Unable to read offset records from the offset table '$offsetTableFullName'.", ex) + } + } + + /** + * Gets the maximum information date for a table. 
   */
  private[core] def getMaximumInfoDate(table: String): Option[LocalDate] = {
    try {
      var allDates = Seq.empty[LocalDate]
      var lastEvaluatedKey: java.util.Map[String, AttributeValue] = null

      // Paginate through the whole table partition, projecting only the info date attribute
      // to keep the read payload small.
      do {
        val queryRequestBuilder = QueryRequest.builder()
          .tableName(offsetTableFullName)
          .keyConditionExpression(s"${ATTR_PRAMEN_TABLE_NAME} = :table_name")
          .expressionAttributeValues(Map(
            ":table_name" -> AttributeValue.builder().s(table).build()
          ).asJava)
          .projectionExpression(ATTR_INFO_DATE)

        if (lastEvaluatedKey != null) {
          queryRequestBuilder.exclusiveStartKey(lastEvaluatedKey)
        }

        val result = dynamoDbClient.query(queryRequestBuilder.build())
        allDates = allDates ++ result.items().asScala.map(item => LocalDate.parse(item.get(ATTR_INFO_DATE).s()))
        lastEvaluatedKey = result.lastEvaluatedKey()
      } while (lastEvaluatedKey != null && !lastEvaluatedKey.isEmpty)

      if (allDates.isEmpty) {
        None
      } else {
        Some(allDates.maxBy(_.toEpochDay))
      }
    } catch {
      case NonFatal(ex) =>
        throw new RuntimeException(s"Unable to read maximum info date from the offset table '$offsetTableFullName'.", ex)
    }
  }

  /**
    * Gets min/max offsets for a table and info date, with all offset records for that day.
    *
    * Only committed offset records (committedAtMilli present) are taken into account.
    */
  private[core] def getMinMaxOffsets(table: String, infoDate: LocalDate): Option[DataOffsetAggregated] = {
    val offsets = getOffsetRecords(table, infoDate).filter(_.committedAtMilli.nonEmpty)

    if (offsets.isEmpty) {
      return None
    }

    // Fails fast if records for the same table+date carry different offset data types
    validateOffsets(table, infoDate, offsets)

    val (minOffset, maxOffset) = getMinMaxOffsets(offsets)

    Some(DataOffsetAggregated(table, infoDate, minOffset, maxOffset, offsets.map(OffsetRecordConverter.toDataOffset)))
  }

  /**
    * Gets min/max offsets from an array of offset records.
+ */ + private[core] def getMinMaxOffsets(offsets: Array[OffsetRecord]): (OffsetValue, OffsetValue) = { + val offsetDataType = offsets.head.dataType + val minOffset = offsets.flatMap(or => OffsetValue.fromString(offsetDataType, or.minOffset)).min + val maxOffset = offsets.flatMap(or => OffsetValue.fromString(offsetDataType, or.maxOffset)).max + + (minOffset, maxOffset) + } + + /** + * Validates offsets for inconsistencies (e.g., inconsistent offset value types). + */ + private[core] def validateOffsets(table: String, infoDate: LocalDate, offsets: Array[OffsetRecord]): Unit = { + val inconsistentOffsets = offsets.groupBy(_.dataType).keys.toArray.sorted + if (inconsistentOffsets.length > 1) { + throw new RuntimeException(s"Inconsistent offset value types found for $table at $infoDate: ${inconsistentOffsets.mkString(", ")}") + } + } + + /** + * Converts a DynamoDB item to an OffsetRecord. + */ + private def itemToOffsetRecord(item: java.util.Map[String, AttributeValue]): OffsetRecord = { + val pramenTableName = item.get(ATTR_PRAMEN_TABLE_NAME).s() + val infoDate = item.get(ATTR_INFO_DATE).s() + val dataType = item.get(ATTR_DATA_TYPE).s() + val minOffset = item.get(ATTR_MIN_OFFSET).s() + val maxOffset = item.get(ATTR_MAX_OFFSET).s() + val batchId = item.get(ATTR_BATCH_ID).n().toLong + val createdAtMilli = item.get(ATTR_CREATED_AT).n().toLong + val committedAtMilli = Option(item.get(ATTR_COMMITTED_AT)).map(_.n().toLong) + + OffsetRecord(pramenTableName, infoDate, dataType, minOffset, maxOffset, batchId, createdAtMilli, committedAtMilli) + } + + /** + * Creates the offset table if it doesn't exist. 
+ */ + private def createOffsetTableIfNotExists(): Unit = { + try { + val describeRequest = DescribeTableRequest.builder() + .tableName(offsetTableFullName) + .build() + + dynamoDbClient.describeTable(describeRequest) + log.info(s"Offset table '$offsetTableFullName' already exists") + } catch { + case _: ResourceNotFoundException => + log.info(s"Creating offset table '$offsetTableFullName'") + createOffsetTable() + case NonFatal(ex) => + log.error(s"Error checking if offset table exists", ex) + throw ex + } + } + + /** + * Creates the offset table in DynamoDB. + */ + private def createOffsetTable(): Unit = { + val createRequest = CreateTableRequest.builder() + .tableName(offsetTableFullName) + .attributeDefinitions( + AttributeDefinition.builder() + .attributeName(ATTR_PRAMEN_TABLE_NAME) + .attributeType(ScalarAttributeType.S) + .build(), + AttributeDefinition.builder() + .attributeName(ATTR_COMPOSITE_KEY) + .attributeType(ScalarAttributeType.S) + .build() + ) + .keySchema( + KeySchemaElement.builder() + .attributeName(ATTR_PRAMEN_TABLE_NAME) + .keyType(KeyType.HASH) + .build(), + KeySchemaElement.builder() + .attributeName(ATTR_COMPOSITE_KEY) + .keyType(KeyType.RANGE) + .build() + ) + .billingMode(BillingMode.PAY_PER_REQUEST) + .build() + + dynamoDbClient.createTable(createRequest) + waitForTableActive(offsetTableFullName, dynamoDbClient) + log.info(s"Offset table '$offsetTableFullName' created successfully") + } + + /** + * Closes the DynamoDB client. + */ + override def close(): Unit = { + try { + if (closesClient) { + dynamoDbClient.close() + } + } catch { + case NonFatal(ex) => + log.warn("Error closing DynamoDB client", ex) + } + } + + /** Deletes all offsets for a given table. 
*/ + private[core] def deleteAllOffsets(tableName: String, dynamoDbClient: DynamoDbClient): Int = { + val log = LoggerFactory.getLogger(this.getClass) + try { + var allItems = Seq.empty[java.util.Map[String, AttributeValue]] + var lastEvaluatedKey: java.util.Map[String, AttributeValue] = null + + // Query all offsets for the table with pagination + do { + val queryRequestBuilder = QueryRequest.builder() + .tableName(offsetTableFullName) + .keyConditionExpression(s"$ATTR_PRAMEN_TABLE_NAME = :table_name") + .expressionAttributeValues(Map( + ":table_name" -> AttributeValue.builder().s(tableName).build() + ).asJava) + + if (lastEvaluatedKey != null) { + queryRequestBuilder.exclusiveStartKey(lastEvaluatedKey) + } + + val result = dynamoDbClient.query(queryRequestBuilder.build()) + allItems = allItems ++ result.items().asScala + lastEvaluatedKey = result.lastEvaluatedKey() + } while (lastEvaluatedKey != null && !lastEvaluatedKey.isEmpty) + + // Delete each item + allItems.foreach { item => + val deleteRequest = DeleteItemRequest.builder() + .tableName(offsetTableFullName) + .key(Map( + ATTR_PRAMEN_TABLE_NAME -> item.get(ATTR_PRAMEN_TABLE_NAME), + ATTR_COMPOSITE_KEY -> item.get(ATTR_COMPOSITE_KEY) + ).asJava) + .build() + + dynamoDbClient.deleteItem(deleteRequest) + } + + val deletedCount = allItems.size + log.info(s"Deleted $deletedCount offset records for table '$tableName'") + deletedCount + } catch { + case NonFatal(ex) => + log.error(s"Error deleting offsets for table '$tableName' from '$offsetTableFullName'", ex) + throw new RuntimeException(s"Unable to delete offsets for table '$tableName' from '$offsetTableFullName'", ex) + } + } +} + +object OffsetManagerDynamoDb { + val DEFAULT_OFFSET_TABLE = "offsets" + val DEFAULT_TABLE_PREFIX = "pramen" + + // Attribute names for offset table + val ATTR_PRAMEN_TABLE_NAME = "pramenTableName" + val ATTR_COMPOSITE_KEY = "compositeKey" // Format: "infoDate#createdAtMilli" + val ATTR_INFO_DATE = "infoDate" + val ATTR_DATA_TYPE = 
"dataType" + val ATTR_MIN_OFFSET = "minOffset" + val ATTR_MAX_OFFSET = "maxOffset" + val ATTR_BATCH_ID = "batchId" + val ATTR_CREATED_AT = "createdAt" + val ATTR_COMMITTED_AT = "committedAt" + + /** + * Builder for creating OffsetManagerDynamoDb instances. + */ + class OffsetManagerDynamoDbBuilder { + private var region: Option[String] = None + private var tableArn: Option[String] = None + private var tablePrefix: String = DEFAULT_TABLE_PREFIX + private var credentialsProvider: Option[AwsCredentialsProvider] = None + private var endpoint: Option[String] = None + private var batchId: Long = System.currentTimeMillis() + + def withRegion(region: String): OffsetManagerDynamoDbBuilder = { + this.region = Some(region) + this + } + + def withTableArn(arn: String): OffsetManagerDynamoDbBuilder = { + this.tableArn = Some(arn) + this + } + + def withTablePrefix(prefix: String): OffsetManagerDynamoDbBuilder = { + this.tablePrefix = prefix + this + } + + def withCredentialsProvider(provider: AwsCredentialsProvider): OffsetManagerDynamoDbBuilder = { + this.credentialsProvider = Some(provider) + this + } + + def withEndpoint(endpoint: String): OffsetManagerDynamoDbBuilder = { + this.endpoint = Some(endpoint) + this + } + + def withBatchId(batchId: Long): OffsetManagerDynamoDbBuilder = { + this.batchId = batchId + this + } + + def build(): OffsetManagerDynamoDb = { + if (region.isEmpty) { + throw new IllegalArgumentException("Region must be provided") + } + + val clientBuilder = DynamoDbClient.builder() + .region(Region.of(region.get)) + + credentialsProvider.foreach(clientBuilder.credentialsProvider) + endpoint.foreach { ep => + clientBuilder.endpointOverride(URI.create(ep)) + } + + val client = clientBuilder.build() + + try { + new OffsetManagerDynamoDb( + dynamoDbClient = client, + batchId = batchId, + tableArn = tableArn, + tablePrefix = tablePrefix, + closesClient = true + ) + } catch { + case NonFatal(ex) => + client.close() + throw ex + } + } + } + + def builder: 
OffsetManagerDynamoDbBuilder = new OffsetManagerDynamoDbBuilder + +} diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerJdbc.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerJdbc.scala index 368ab026f..b1e164860 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerJdbc.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/bookkeeper/OffsetManagerJdbc.scala @@ -150,6 +150,9 @@ class OffsetManagerJdbc(db: Database, slickProfile: JdbcProfile, offsetTable: Of ).execute() } + /** This class does not own the database connection. It is responsibility of the DB connection owner to close it. */ + override def close(): Unit = {} + private[core] def getMaximumInfoDate(table: String): Option[LocalDate] = { val query = offsetTable.records .filter(r => r.pramenTableName === table) diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/Journal.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/Journal.scala index 0a03c215d..b7d70ff81 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/Journal.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/Journal.scala @@ -23,10 +23,11 @@ import java.time.Instant /** * A journal is responsible of keeping track of all completed tasks. 
*/ -trait Journal { +trait Journal extends AutoCloseable { def addEntry(entry: TaskCompleted): Unit def getEntries(from: Instant, to: Instant): Seq[TaskCompleted] + override def close(): Unit = {} } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/JournalDynamoDB.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/JournalDynamoDB.scala new file mode 100644 index 000000000..a9453d3ed --- /dev/null +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/journal/JournalDynamoDB.scala @@ -0,0 +1,349 @@ +/* + * Copyright 2022 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.pramen.core.journal + +import org.slf4j.LoggerFactory +import software.amazon.awssdk.auth.credentials.AwsCredentialsProvider +import software.amazon.awssdk.regions.Region +import software.amazon.awssdk.services.dynamodb.DynamoDbClient +import software.amazon.awssdk.services.dynamodb.model._ +import za.co.absa.pramen.core.app.config.InfoDateConfig +import za.co.absa.pramen.core.bookkeeper.BookkeeperDynamoDb +import za.co.absa.pramen.core.bookkeeper.BookkeeperDynamoDb.waitForTableActive +import za.co.absa.pramen.core.journal.model.TaskCompleted + +import java.net.URI +import java.time.{Instant, LocalDate} +import scala.collection.JavaConverters._ +import scala.util.control.NonFatal + +/** + * DynamoDB-based journal for tracking completed tasks. 
+ * + * This journal stores task completion records in a DynamoDB table with automatic table creation. + * + * @param dynamoDbClient The DynamoDB client to use + * @param tableArn Optional ARN prefix for the journal table + * @param tablePrefix Prefix for the journal table name (default: "pramen") + */ +class JournalDynamoDB private ( + dynamoDbClient: DynamoDbClient, + tableArn: Option[String] = None, + tablePrefix: String = JournalDynamoDB.DEFAULT_TABLE_PREFIX +) extends Journal { + + private val log = LoggerFactory.getLogger(this.getClass) + private val dateFormatter = InfoDateConfig.defaultDateFormatter + + private val journalTableBaseName = s"${tablePrefix}_${JournalDynamoDB.DEFAULT_JOURNAL_TABLE}" + private val journalTableFullName = BookkeeperDynamoDb.getFullTableName(tableArn, journalTableBaseName) + + // Initialize table on creation + createJournalTableIfNotExists() + + /** + * Add a task completion entry to the journal. + * Failure reason is truncated to 4KB to fit DynamoDB item size limits. + */ + override def addEntry(entry: TaskCompleted): Unit = { + val periodBegin = entry.periodBegin.format(dateFormatter) + val periodEnd = entry.periodEnd.format(dateFormatter) + val infoDate = entry.informationDate.format(dateFormatter) + + // Truncate failure reason to 4KB maximum + val truncatedFailureReason = entry.failureReason.map { reason => + if (reason.length > JournalDynamoDB.MAX_FAILURE_REASON_LENGTH) { + val truncated = reason.substring(0, JournalDynamoDB.MAX_FAILURE_REASON_LENGTH - 20) + truncated + "\n[... 
truncated ...]" + } else { + reason + } + } + + val itemBuilder = Map.newBuilder[String, AttributeValue] + + // Primary key: composite of jobName and finishedAt (for sorting by time) + itemBuilder += (JournalDynamoDB.ATTR_JOB_NAME -> AttributeValue.builder().s(entry.jobName).build()) + itemBuilder += (JournalDynamoDB.ATTR_FINISHED_AT -> AttributeValue.builder().n(entry.finishedAt.toString).build()) + + // Attributes + itemBuilder += (JournalDynamoDB.ATTR_TABLE_NAME -> AttributeValue.builder().s(entry.tableName).build()) + itemBuilder += (JournalDynamoDB.ATTR_PERIOD_BEGIN -> AttributeValue.builder().s(periodBegin).build()) + itemBuilder += (JournalDynamoDB.ATTR_PERIOD_END -> AttributeValue.builder().s(periodEnd).build()) + itemBuilder += (JournalDynamoDB.ATTR_INFO_DATE -> AttributeValue.builder().s(infoDate).build()) + itemBuilder += (JournalDynamoDB.ATTR_INPUT_RECORD_COUNT -> AttributeValue.builder().n(entry.inputRecordCount.getOrElse(-1L).toString).build()) + itemBuilder += (JournalDynamoDB.ATTR_INPUT_RECORD_COUNT_OLD -> AttributeValue.builder().n(entry.inputRecordCountOld.getOrElse(-1L).toString).build()) + + entry.outputRecordCount.foreach(v => itemBuilder += (JournalDynamoDB.ATTR_OUTPUT_RECORD_COUNT -> AttributeValue.builder().n(v.toString).build())) + entry.outputRecordCountOld.foreach(v => itemBuilder += (JournalDynamoDB.ATTR_OUTPUT_RECORD_COUNT_OLD -> AttributeValue.builder().n(v.toString).build())) + entry.appendedRecordCount.foreach(v => itemBuilder += (JournalDynamoDB.ATTR_APPENDED_RECORD_COUNT -> AttributeValue.builder().n(v.toString).build())) + entry.outputSize.foreach(v => itemBuilder += (JournalDynamoDB.ATTR_OUTPUT_SIZE -> AttributeValue.builder().n(v.toString).build())) + + itemBuilder += (JournalDynamoDB.ATTR_STARTED_AT -> AttributeValue.builder().n(entry.startedAt.toString).build()) + itemBuilder += (JournalDynamoDB.ATTR_STATUS -> AttributeValue.builder().s(entry.status).build()) + + truncatedFailureReason.foreach(v => itemBuilder += 
(JournalDynamoDB.ATTR_FAILURE_REASON -> AttributeValue.builder().s(v).build())) + entry.sparkApplicationId.foreach(v => itemBuilder += (JournalDynamoDB.ATTR_SPARK_APP_ID -> AttributeValue.builder().s(v).build())) + entry.pipelineId.foreach(v => itemBuilder += (JournalDynamoDB.ATTR_PIPELINE_ID -> AttributeValue.builder().s(v).build())) + entry.pipelineName.foreach(v => itemBuilder += (JournalDynamoDB.ATTR_PIPELINE_NAME -> AttributeValue.builder().s(v).build())) + entry.environmentName.foreach(v => itemBuilder += (JournalDynamoDB.ATTR_ENVIRONMENT_NAME -> AttributeValue.builder().s(v).build())) + entry.tenant.foreach(v => itemBuilder += (JournalDynamoDB.ATTR_TENANT -> AttributeValue.builder().s(v).build())) + entry.country.foreach(v => itemBuilder += (JournalDynamoDB.ATTR_COUNTRY -> AttributeValue.builder().s(v).build())) + + itemBuilder += (JournalDynamoDB.ATTR_BATCH_ID -> AttributeValue.builder().n(entry.batchId.toString).build()) + + try { + val putRequest = PutItemRequest.builder() + .tableName(journalTableFullName) + .item(itemBuilder.result().asJava) + .build() + + dynamoDbClient.putItem(putRequest) + } catch { + case NonFatal(ex) => + log.error(s"Unable to write to the journal table '$journalTableFullName'.", ex) + } + } + + /** + * Get journal entries within a time range. 
+ */ + override def getEntries(from: Instant, to: Instant): Seq[TaskCompleted] = { + val fromSec = from.getEpochSecond + val toSec = to.getEpochSecond + + try { + val scanRequest = ScanRequest.builder() + .tableName(journalTableFullName) + .filterExpression(s"${JournalDynamoDB.ATTR_FINISHED_AT} >= :from_time AND ${JournalDynamoDB.ATTR_FINISHED_AT} <= :to_time") + .expressionAttributeValues(Map( + ":from_time" -> AttributeValue.builder().n(fromSec.toString).build(), + ":to_time" -> AttributeValue.builder().n(toSec.toString).build() + ).asJava) + .build() + + val result = dynamoDbClient.scan(scanRequest) + + result.items().asScala.map { item => + val getS = (attr: String) => Option(item.get(attr)).map(_.s()) + val getN = (attr: String) => Option(item.get(attr)).map(_.n().toLong) + + val inputRecordCount = getN(JournalDynamoDB.ATTR_INPUT_RECORD_COUNT).flatMap(v => if (v < 0) None else Some(v)) + val inputRecordCountOld = getN(JournalDynamoDB.ATTR_INPUT_RECORD_COUNT_OLD).flatMap(v => if (v < 0) None else Some(v)) + + TaskCompleted( + jobName = item.get(JournalDynamoDB.ATTR_JOB_NAME).s(), + tableName = item.get(JournalDynamoDB.ATTR_TABLE_NAME).s(), + periodBegin = LocalDate.parse(item.get(JournalDynamoDB.ATTR_PERIOD_BEGIN).s(), dateFormatter), + periodEnd = LocalDate.parse(item.get(JournalDynamoDB.ATTR_PERIOD_END).s(), dateFormatter), + informationDate = LocalDate.parse(item.get(JournalDynamoDB.ATTR_INFO_DATE).s(), dateFormatter), + inputRecordCount = inputRecordCount, + inputRecordCountOld = inputRecordCountOld, + outputRecordCount = getN(JournalDynamoDB.ATTR_OUTPUT_RECORD_COUNT), + outputRecordCountOld = getN(JournalDynamoDB.ATTR_OUTPUT_RECORD_COUNT_OLD), + appendedRecordCount = getN(JournalDynamoDB.ATTR_APPENDED_RECORD_COUNT), + outputSize = getN(JournalDynamoDB.ATTR_OUTPUT_SIZE), + startedAt = item.get(JournalDynamoDB.ATTR_STARTED_AT).n().toLong, + finishedAt = item.get(JournalDynamoDB.ATTR_FINISHED_AT).n().toLong, + status = 
item.get(JournalDynamoDB.ATTR_STATUS).s(), + failureReason = getS(JournalDynamoDB.ATTR_FAILURE_REASON), + sparkApplicationId = getS(JournalDynamoDB.ATTR_SPARK_APP_ID), + pipelineId = getS(JournalDynamoDB.ATTR_PIPELINE_ID), + pipelineName = getS(JournalDynamoDB.ATTR_PIPELINE_NAME), + environmentName = getS(JournalDynamoDB.ATTR_ENVIRONMENT_NAME), + tenant = getS(JournalDynamoDB.ATTR_TENANT), + country = getS(JournalDynamoDB.ATTR_COUNTRY), + batchId = getN(JournalDynamoDB.ATTR_BATCH_ID).getOrElse(0L) + ) + }.toSeq + } catch { + case NonFatal(ex) => + log.error(s"Unable to read from the journal table '$journalTableFullName'.", ex) + Seq.empty + } + } + + /** + * Creates the journal table if it doesn't exist. + */ + private def createJournalTableIfNotExists(): Unit = { + try { + val describeRequest = DescribeTableRequest.builder() + .tableName(journalTableFullName) + .build() + + dynamoDbClient.describeTable(describeRequest) + log.info(s"Journal table '$journalTableFullName' already exists") + } catch { + case _: ResourceNotFoundException => + log.info(s"Creating journal table '$journalTableFullName'") + createJournalTable() + case NonFatal(ex) => + log.error(s"Error checking if journal table exists", ex) + throw ex + } + } + + /** + * Creates the journal table in DynamoDB. 
+ */ + private def createJournalTable(): Unit = { + val createRequest = CreateTableRequest.builder() + .tableName(journalTableFullName) + .attributeDefinitions( + AttributeDefinition.builder() + .attributeName(JournalDynamoDB.ATTR_JOB_NAME) + .attributeType(ScalarAttributeType.S) + .build(), + AttributeDefinition.builder() + .attributeName(JournalDynamoDB.ATTR_FINISHED_AT) + .attributeType(ScalarAttributeType.N) + .build() + ) + .keySchema( + KeySchemaElement.builder() + .attributeName(JournalDynamoDB.ATTR_JOB_NAME) + .keyType(KeyType.HASH) + .build(), + KeySchemaElement.builder() + .attributeName(JournalDynamoDB.ATTR_FINISHED_AT) + .keyType(KeyType.RANGE) + .build() + ) + .billingMode(BillingMode.PAY_PER_REQUEST) + .build() + + dynamoDbClient.createTable(createRequest) + waitForTableActive(journalTableFullName, dynamoDbClient) + log.info(s"Journal table '$journalTableFullName' created successfully") + } + + /** + * Closes the DynamoDB client. + */ + override def close(): Unit = { + try { + dynamoDbClient.close() + } catch { + case NonFatal(ex) => + log.warn("Error closing DynamoDB client", ex) + } + } +} + +object JournalDynamoDB { + val DEFAULT_JOURNAL_TABLE = "journal" + val DEFAULT_TABLE_PREFIX = "pramen" + + // Maximum length for failure reason (4KB minus some overhead) + val MAX_FAILURE_REASON_LENGTH = 4000 + + // Attribute names for journal table + val ATTR_JOB_NAME = "jobName" + val ATTR_TABLE_NAME = "tableName" + val ATTR_PERIOD_BEGIN = "periodBegin" + val ATTR_PERIOD_END = "periodEnd" + val ATTR_INFO_DATE = "infoDate" + val ATTR_INPUT_RECORD_COUNT = "inputRecordCount" + val ATTR_INPUT_RECORD_COUNT_OLD = "inputRecordCountOld" + val ATTR_OUTPUT_RECORD_COUNT = "outputRecordCount" + val ATTR_OUTPUT_RECORD_COUNT_OLD = "outputRecordCountOld" + val ATTR_APPENDED_RECORD_COUNT = "appendedRecordCount" + val ATTR_OUTPUT_SIZE = "outputSize" + val ATTR_STARTED_AT = "startedAt" + val ATTR_FINISHED_AT = "finishedAt" + val ATTR_STATUS = "status" + val ATTR_FAILURE_REASON 
= "failureReason" + val ATTR_SPARK_APP_ID = "sparkApplicationId" + val ATTR_PIPELINE_ID = "pipelineId" + val ATTR_PIPELINE_NAME = "pipelineName" + val ATTR_ENVIRONMENT_NAME = "environmentName" + val ATTR_TENANT = "tenant" + val ATTR_COUNTRY = "country" + val ATTR_BATCH_ID = "batchId" + + /** + * Builder for creating JournalDynamoDB instances. + */ + class JournalDynamoDBBuilder { + private var region: Option[String] = None + private var tableArn: Option[String] = None + private var tablePrefix: String = DEFAULT_TABLE_PREFIX + private var credentialsProvider: Option[AwsCredentialsProvider] = None + private var endpoint: Option[String] = None + + def withRegion(region: String): JournalDynamoDBBuilder = { + this.region = Some(region) + this + } + + def withTableArn(arn: String): JournalDynamoDBBuilder = { + this.tableArn = Some(arn) + this + } + + def withTableArn(arnOpt: Option[String]): JournalDynamoDBBuilder = { + this.tableArn = arnOpt + this + } + + def withTablePrefix(prefix: String): JournalDynamoDBBuilder = { + this.tablePrefix = prefix + this + } + + def withCredentialsProvider(provider: AwsCredentialsProvider): JournalDynamoDBBuilder = { + this.credentialsProvider = Some(provider) + this + } + + def withEndpoint(endpoint: String): JournalDynamoDBBuilder = { + this.endpoint = Some(endpoint) + this + } + + def build(): JournalDynamoDB = { + if (region.isEmpty) { + throw new IllegalArgumentException("Region must be provided") + } + + val clientBuilder = DynamoDbClient.builder() + .region(Region.of(region.get)) + + credentialsProvider.foreach(clientBuilder.credentialsProvider) + endpoint.foreach { ep => + clientBuilder.endpointOverride(URI.create(ep)) + } + + val client = clientBuilder.build() + + try { + new JournalDynamoDB( + dynamoDbClient = client, + tableArn = tableArn, + tablePrefix = tablePrefix + ) + } catch { + case NonFatal(ex) => + client.close() + throw ex + } + } + } + + def builder: JournalDynamoDBBuilder = new JournalDynamoDBBuilder +} diff --git 
a/pramen/core/src/main/scala/za/co/absa/pramen/core/lock/TokenLockDynamoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/lock/TokenLockDynamoDb.scala new file mode 100644 index 000000000..454e90900 --- /dev/null +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/lock/TokenLockDynamoDb.scala @@ -0,0 +1,214 @@ +/* + * Copyright 2022 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.pramen.core.lock + +import org.slf4j.LoggerFactory +import software.amazon.awssdk.services.dynamodb.DynamoDbClient +import software.amazon.awssdk.services.dynamodb.model._ +import za.co.absa.pramen.core.lock.model.LockTicket + +import java.time.Instant +import java.time.temporal.ChronoUnit +import scala.collection.JavaConverters._ +import scala.util.control.NonFatal +import scala.util.{Failure, Success, Try} + +object TokenLockDynamoDb { + val DEFAULT_TABLE_NAME = "pramen_locks" + + // Attribute names + val ATTR_TOKEN = "job_token" // 'token' is a reserved word in DynamoDb and can't be used as an attribute + val ATTR_OWNER = "job_owner" // 'owner' is a reserved word in DynamoDb and can't be used as an attribute + val ATTR_EXPIRES = "expiresAt" + val ATTR_CREATED_AT = "createdAt" + + val TICKETS_HARD_EXPIRE_DAYS = 1 +} + +/** + * DynamoDB-based distributed lock implementation. + * + * This lock uses DynamoDB's conditional writes to implement distributed locking. 
+ * The lock is maintained by periodic updates to the expiration time. + * + * Table schema: + * - Partition key: token (String) + * - Attributes: owner (String), expires (Number), createdAt (Number) + * + * @param token The unique identifier for the lock + * @param dynamoDbClient The DynamoDB client to use + * @param tableName The name of the locks table + */ +class TokenLockDynamoDb( + token: String, + dynamoDbClient: DynamoDbClient, + tableName: String = TokenLockDynamoDb.DEFAULT_TABLE_NAME +) extends TokenLockBase(token) { + + import TokenLockDynamoDb._ + + private val log = LoggerFactory.getLogger(this.getClass) + + /** Invoked from a synchronized block. */ + override def tryAcquireGuardLock(retries: Int = 3, thisTry: Int = 0): Boolean = { + def tryAcquireExistingTicket(): Boolean = { + val ticketOpt = getTicket + + if (ticketOpt.isEmpty) { + log.warn(s"No ticket for $escapedToken") + tryAcquireGuardLock(retries - 1, thisTry + 1) + } else { + val ticket = ticketOpt.get + val expires = ticket.expires + val now = Instant.now().getEpochSecond + + if (expires < now) { + log.warn(s"Taking over expired ticket $escapedToken ($expires < $now)") + releaseGuardLock() + tryAcquireGuardLock(retries - 1, thisTry + 1) + } else { + false + } + } + } + + if (retries <= 0) { + log.error(s"Cannot try acquire a lock after $thisTry retries.") + false + } else { + val ok = Try(acquireGuardLock()) + + ok match { + case Success(_) => + true + case Failure(_: ConditionalCheckFailedException) => + // Lock already exists + tryAcquireExistingTicket() + case Failure(ex) => + throw new IllegalStateException(s"Unable to acquire a lock by querying DynamoDB", ex) + } + } + } + + /** Invoked from a synchronized block. 
*/ + override def releaseGuardLock(): Unit = { + try { + val now = Instant.now() + val nowEpoch = now.getEpochSecond + val hardExpireTickets = now.minus(TICKETS_HARD_EXPIRE_DAYS, ChronoUnit.DAYS).getEpochSecond + + // Delete this ticket or any expired tickets + val deleteRequest = DeleteItemRequest.builder() + .tableName(tableName) + .key(Map( + ATTR_TOKEN -> AttributeValue.builder().s(escapedToken).build() + ).asJava) + .conditionExpression(s"$ATTR_OWNER = :jobOwner OR ($ATTR_EXPIRES < :now AND $ATTR_CREATED_AT < :hardExpire)") + .expressionAttributeValues(Map( + ":jobOwner" -> AttributeValue.builder().s(owner).build(), + ":now" -> AttributeValue.builder().n(nowEpoch.toString).build(), + ":hardExpire" -> AttributeValue.builder().n(hardExpireTickets.toString).build() + ).asJava) + .build() + + try { + dynamoDbClient.deleteItem(deleteRequest) + } catch { + case _: ConditionalCheckFailedException => + // Item doesn't match condition, ignore + log.debug(s"Could not delete ticket $escapedToken - condition not met") + } + } catch { + case NonFatal(ex) => + log.error(s"An error occurred when trying to release the lock: $escapedToken.", ex) + } + } + + /** Invoked from a synchronized block. */ + override def updateTicket(): Unit = { + val newTicket = getNewTicket + + try { + log.debug(s"Update $escapedToken to $newTicket") + + val updateRequest = UpdateItemRequest.builder() + .tableName(tableName) + .key(Map( + ATTR_TOKEN -> AttributeValue.builder().s(escapedToken).build() + ).asJava) + .updateExpression(s"SET $ATTR_EXPIRES = :expires") + .expressionAttributeValues(Map( + ":expires" -> AttributeValue.builder().n(newTicket.toString).build() + ).asJava) + .build() + + dynamoDbClient.updateItem(updateRequest) + } catch { + case NonFatal(ex) => + log.error(s"An error occurred when trying to update the lock: $escapedToken.", ex) + } + } + + /** Invoked from a synchronized block. 
*/ + private def getTicket: Option[LockTicket] = { + try { + val getRequest = GetItemRequest.builder() + .tableName(tableName) + .key(Map( + ATTR_TOKEN -> AttributeValue.builder().s(escapedToken).build() + ).asJava) + .build() + + val response = dynamoDbClient.getItem(getRequest) + + if (response.hasItem && !response.item().isEmpty) { + val item = response.item() + Some(LockTicket( + token = item.get(ATTR_TOKEN).s(), + owner = item.get(ATTR_OWNER).s(), + expires = item.get(ATTR_EXPIRES).n().toLong, + createdAt = Option(item.get(ATTR_CREATED_AT)).map(_.n().toLong) + )) + } else { + None + } + } catch { + case NonFatal(ex) => + log.error(s"Error getting ticket for $escapedToken", ex) + None + } + } + + /** Invoked from a synchronized block. */ + private def acquireGuardLock(): Unit = { + val now = Instant.now().getEpochSecond + val item = Map( + ATTR_TOKEN -> AttributeValue.builder().s(escapedToken).build(), + ATTR_OWNER -> AttributeValue.builder().s(owner).build(), + ATTR_EXPIRES -> AttributeValue.builder().n(getNewTicket.toString).build(), + ATTR_CREATED_AT -> AttributeValue.builder().n(now.toString).build() + ).asJava + + val putRequest = PutItemRequest.builder() + .tableName(tableName) + .item(item) + .conditionExpression(s"attribute_not_exists($ATTR_TOKEN)") + .build() + + dynamoDbClient.putItem(putRequest) + } +} diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/lock/TokenLockFactoryDynamoDb.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/lock/TokenLockFactoryDynamoDb.scala new file mode 100644 index 000000000..75a669c65 --- /dev/null +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/lock/TokenLockFactoryDynamoDb.scala @@ -0,0 +1,280 @@ +/* + * Copyright 2022 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.pramen.core.lock + +import org.slf4j.LoggerFactory +import software.amazon.awssdk.auth.credentials.AwsCredentialsProvider +import software.amazon.awssdk.regions.Region +import software.amazon.awssdk.services.dynamodb.DynamoDbClient +import software.amazon.awssdk.services.dynamodb.model._ +import za.co.absa.pramen.api.lock.{TokenLock, TokenLockFactory} +import za.co.absa.pramen.core.bookkeeper.BookkeeperDynamoDb +import za.co.absa.pramen.core.bookkeeper.BookkeeperDynamoDb.waitForTableActive + +import java.net.URI +import scala.util.control.NonFatal + +/** + * Factory for creating DynamoDB-based distributed locks. + * + * This factory creates and manages a DynamoDB table for storing lock tickets. + * The table is created automatically if it doesn't exist. 
/**
  * DynamoDB-based token lock factory.
  *
  * Locks are stored as items in a DynamoDB table that is created automatically
  * on first use (with on-demand billing).
  *
  * @param dynamoDbClient The DynamoDB client to use
  * @param tableArn       Optional ARN prefix for the locks table
  * @param tablePrefix    Prefix for the locks table name (default: "pramen")
  */
class TokenLockFactoryDynamoDb private(
  dynamoDbClient: DynamoDbClient,
  tableArn: Option[String] = None,
  // Use the shared constant so the class default and the builder default cannot drift apart.
  tablePrefix: String = BookkeeperDynamoDb.DEFAULT_TABLE_PREFIX
) extends TokenLockFactory with AutoCloseable {
  import TokenLockFactoryDynamoDb._

  import TokenLockDynamoDb._

  private val log = LoggerFactory.getLogger(this.getClass)

  // Construct table name with prefix, optionally qualified by an ARN prefix.
  private val locksTableBaseName = s"${tablePrefix}_locks"
  private val locksTableName = BookkeeperDynamoDb.getFullTableName(tableArn, locksTableBaseName)

  // Ensure the locks table exists before any lock can be requested.
  init()

  override def getLock(token: String): TokenLock = {
    new TokenLockDynamoDb(token, dynamoDbClient, locksTableName)
  }

  /**
    * Closes the DynamoDB client.
    * Should be called when the lock factory is no longer needed.
    */
  override def close(): Unit = {
    try {
      dynamoDbClient.close()
    } catch {
      case NonFatal(ex) =>
        // Closing is best-effort; a failure here should not fail the pipeline.
        log.warn("Error closing DynamoDB client", ex)
    }
  }

  /**
    * Initializes the DynamoDB locks table.
    * Checks if the table exists and creates it if it doesn't.
    */
  private def init(): Unit = {
    try {
      log.info(s"Initializing DynamoDB lock factory with table: '$locksTableName'")

      if (!tableExists(locksTableName)) {
        log.info(s"Creating DynamoDB locks table: $locksTableName")
        createLocksTable(locksTableName)
        log.info(s"Successfully created locks table: $locksTableName")
      } else {
        log.info(s"DynamoDB locks table already exists: $locksTableName")
      }

      log.info(s"DynamoDB lock factory initialization complete")
    } catch {
      case NonFatal(ex) =>
        log.error("Error initializing DynamoDB lock factory", ex)
        throw new RuntimeException("Failed to initialize DynamoDB lock factory", ex)
    }
  }

  /**
    * Checks if a DynamoDB table exists.
    *
    * @param tableName The name of the table to check
    * @return true if the table exists, false otherwise
    */
  private def tableExists(tableName: String): Boolean = {
    try {
      val describeRequest = DescribeTableRequest.builder()
        .tableName(tableName)
        .build()

      dynamoDbClient.describeTable(describeRequest)
      true
    } catch {
      // "Not found" is the expected negative outcome, not an error.
      case _: ResourceNotFoundException => false
      case NonFatal(ex) =>
        log.warn(s"Error checking if table exists: $tableName", ex)
        throw ex
    }
  }

  /**
    * Creates the locks table with the appropriate schema.
    *
    * @param tableName The name of the table to create
    */
  private def createLocksTable(tableName: String): Unit = {
    val createTableRequest = CreateTableRequest.builder()
      .tableName(tableName)
      .keySchema(
        KeySchemaElement.builder()
          .attributeName(ATTR_TOKEN)
          .keyType(KeyType.HASH)
          .build()
      )
      .attributeDefinitions(
        AttributeDefinition.builder()
          .attributeName(ATTR_TOKEN)
          .attributeType(ScalarAttributeType.S)
          .build()
      )
      .billingMode(BillingMode.PAY_PER_REQUEST) // On-demand billing
      .build()

    dynamoDbClient.createTable(createTableRequest)

    // Table creation is asynchronous; wait until it is usable.
    waitForTableActive(tableName, dynamoDbClient)
  }
}

object TokenLockFactoryDynamoDb {
  /**
    * Builder for creating TokenLockFactoryDynamoDb instances.
    * Provides a fluent API for configuring DynamoDB lock factory.
    *
    * Example:
    * {{{
    * val lockFactory = TokenLockFactoryDynamoDb.builder
    *   .withRegion("us-east-1")
    *   .withTablePrefix("my_app")
    *   .build()
    * }}}
    */
  class TokenLockFactoryDynamoDbBuilder {
    private var region: Option[String] = None
    private var tableArn: Option[String] = None
    private var tablePrefix: String = BookkeeperDynamoDb.DEFAULT_TABLE_PREFIX
    private var credentialsProvider: Option[AwsCredentialsProvider] = None
    private var endpoint: Option[String] = None

    /**
      * Sets the AWS region for the DynamoDB client.
      *
      * @param region AWS region (e.g., "us-east-1", "eu-west-1")
      * @return this builder
      */
    def withRegion(region: String): TokenLockFactoryDynamoDbBuilder = {
      this.region = Some(region)
      this
    }

    /**
      * Sets the table ARN prefix for cross-account or cross-region access.
      *
      * @param arn ARN prefix (e.g., "arn:aws:dynamodb:us-east-1:123456789012:table/")
      * @return this builder
      */
    def withTableArn(arn: String): TokenLockFactoryDynamoDbBuilder = {
      this.tableArn = Some(arn)
      this
    }

    /**
      * Sets the table ARN prefix as an option, for configurations where the ARN
      * may or may not be defined. Passing `None` keeps the default (no ARN prefix).
      *
      * @param arnOpt Optional ARN prefix (e.g., "arn:aws:dynamodb:us-east-1:123456789012:table/")
      * @return this builder
      */
    def withTableArn(arnOpt: Option[String]): TokenLockFactoryDynamoDbBuilder = {
      this.tableArn = arnOpt
      this
    }

    /**
      * Sets the table name prefix to allow multiple lock tables in the same account.
      *
      * @param prefix Table name prefix (default: "pramen")
      * @return this builder
      */
    def withTablePrefix(prefix: String): TokenLockFactoryDynamoDbBuilder = {
      this.tablePrefix = prefix
      this
    }

    /**
      * Sets custom AWS credentials provider.
      *
      * @param provider AWS credentials provider
      * @return this builder
      */
    def withCredentialsProvider(provider: AwsCredentialsProvider): TokenLockFactoryDynamoDbBuilder = {
      this.credentialsProvider = Some(provider)
      this
    }

    /**
      * Sets a custom DynamoDB endpoint (useful for testing with LocalStack or DynamoDB Local).
      *
      * @param endpoint Endpoint URI (e.g., "http://localhost:8000")
      * @return this builder
      */
    def withEndpoint(endpoint: String): TokenLockFactoryDynamoDbBuilder = {
      this.endpoint = Some(endpoint)
      this
    }

    /**
      * Builds the TokenLockFactoryDynamoDb instance.
      *
      * @return Configured TokenLockFactoryDynamoDb instance
      * @throws IllegalArgumentException if required parameters are missing
      */
    def build(): TokenLockFactoryDynamoDb = {
      if (region.isEmpty) {
        throw new IllegalArgumentException("Region must be provided")
      }

      val clientBuilder = DynamoDbClient.builder()
        .region(Region.of(region.get))

      credentialsProvider.foreach(clientBuilder.credentialsProvider)

      endpoint.foreach { ep =>
        clientBuilder.endpointOverride(URI.create(ep))
      }

      val client = clientBuilder.build()

      try {
        new TokenLockFactoryDynamoDb(
          dynamoDbClient = client,
          tableArn = tableArn,
          tablePrefix = tablePrefix
        )
      } catch {
        // The factory constructor creates tables and may fail; do not leak the client.
        case NonFatal(ex) =>
          client.close()
          throw ex
      }
    }
  }

  /**
    * Creates a new builder for TokenLockFactoryDynamoDb.
    *
    * @return A new builder instance
    */
  def builder: TokenLockFactoryDynamoDbBuilder = new TokenLockFactoryDynamoDbBuilder
}
package za.co.absa.pramen.core.metadata

import org.slf4j.LoggerFactory
import software.amazon.awssdk.auth.credentials.AwsCredentialsProvider
import software.amazon.awssdk.regions.Region
import software.amazon.awssdk.services.dynamodb.DynamoDbClient
import software.amazon.awssdk.services.dynamodb.model._
import za.co.absa.pramen.api.MetadataValue
import za.co.absa.pramen.core.bookkeeper.BookkeeperDynamoDb
import za.co.absa.pramen.core.bookkeeper.BookkeeperDynamoDb.waitForTableActive

import java.net.URI
import java.time.{Instant, LocalDate}
import scala.collection.JavaConverters._
import scala.collection.mutable.ListBuffer
import scala.util.control.NonFatal

/**
  * DynamoDB-based metadata manager for storing custom metadata.
  *
  * This manager stores metadata key-value pairs in a DynamoDB table with automatic table creation.
  * Items are keyed by a composite partition key `tableName#infoDate` and a metadata-key sort key.
  *
  * @param dynamoDbClient The DynamoDB client to use
  * @param tableArn       Optional ARN prefix for the metadata table
  * @param tablePrefix    Prefix for the metadata table name (default: "pramen")
  */
class MetadataManagerDynamoDb private (
  dynamoDbClient: DynamoDbClient,
  tableArn: Option[String] = None,
  tablePrefix: String = MetadataManagerDynamoDb.DEFAULT_TABLE_PREFIX
) extends MetadataManagerBase(true) with AutoCloseable {

  private val log = LoggerFactory.getLogger(this.getClass)

  private val metadataTableBaseName = s"${tablePrefix}_${MetadataManagerDynamoDb.DEFAULT_METADATA_TABLE}"
  private val metadataTableFullName = BookkeeperDynamoDb.getFullTableName(tableArn, metadataTableBaseName)

  // Initialize table on creation
  createMetadataTableIfNotExists()

  override def getMetadataFromStorage(tableName: String, infoDate: LocalDate, key: String): Option[MetadataValue] = {
    try {
      val getRequest = GetItemRequest.builder()
        .tableName(metadataTableFullName)
        .key(Map(
          MetadataManagerDynamoDb.ATTR_COMPOSITE_KEY -> AttributeValue.builder().s(getCompositeKey(tableName, infoDate)).build(),
          MetadataManagerDynamoDb.ATTR_METADATA_KEY -> AttributeValue.builder().s(key).build()
        ).asJava)
        .build()

      val result = dynamoDbClient.getItem(getRequest)

      if (result.hasItem) {
        Some(itemToMetadataValue(result.item()))
      } else {
        None
      }
    } catch {
      case NonFatal(ex) =>
        throw new RuntimeException(s"Unable to read from the metadata table '$metadataTableFullName'.", ex)
    }
  }

  override def getMetadataFromStorage(tableName: String, infoDate: LocalDate): Map[String, MetadataValue] = {
    try {
      // Follows DynamoDB pagination so that result sets bigger than 1 MB are not truncated.
      queryAllItems(getCompositeKey(tableName, infoDate)).map { item =>
        val key = item.get(MetadataManagerDynamoDb.ATTR_METADATA_KEY).s()
        key -> itemToMetadataValue(item)
      }.toMap
    } catch {
      case NonFatal(ex) =>
        throw new RuntimeException(s"Unable to read from the metadata table '$metadataTableFullName'.", ex)
    }
  }

  override def setMetadataToStorage(tableName: String, infoDate: LocalDate, key: String, metadata: MetadataValue): Unit = {
    try {
      val putRequest = PutItemRequest.builder()
        .tableName(metadataTableFullName)
        .item(Map(
          MetadataManagerDynamoDb.ATTR_COMPOSITE_KEY -> AttributeValue.builder().s(getCompositeKey(tableName, infoDate)).build(),
          MetadataManagerDynamoDb.ATTR_METADATA_KEY -> AttributeValue.builder().s(key).build(),
          MetadataManagerDynamoDb.ATTR_METADATA_VALUE -> AttributeValue.builder().s(metadata.value).build(),
          MetadataManagerDynamoDb.ATTR_LAST_UPDATED -> AttributeValue.builder().n(metadata.lastUpdated.getEpochSecond.toString).build(),
          // Duplicated as plain attributes to support filtering/scans by table or date.
          MetadataManagerDynamoDb.ATTR_TABLE_NAME -> AttributeValue.builder().s(tableName).build(),
          MetadataManagerDynamoDb.ATTR_INFO_DATE -> AttributeValue.builder().s(infoDate.toString).build()
        ).asJava)
        .build()

      dynamoDbClient.putItem(putRequest)
    } catch {
      case NonFatal(ex) =>
        throw new RuntimeException(s"Unable to write to the metadata table '$metadataTableFullName'.", ex)
    }
  }

  override def deleteMetadataFromStorage(tableName: String, infoDate: LocalDate, key: String): Unit = {
    try {
      deleteItem(getCompositeKey(tableName, infoDate), key)
    } catch {
      case NonFatal(ex) =>
        throw new RuntimeException(s"Unable to delete from the metadata table '$metadataTableFullName'.", ex)
    }
  }

  override def deleteMetadataFromStorage(tableName: String, infoDate: LocalDate): Unit = {
    try {
      val compositeKey = getCompositeKey(tableName, infoDate)

      // Query all items (following pagination), then delete each one directly so a
      // failure surfaces as a single wrapped exception rather than nested wrappers.
      queryAllItems(compositeKey).foreach { item =>
        val key = item.get(MetadataManagerDynamoDb.ATTR_METADATA_KEY).s()
        deleteItem(compositeKey, key)
      }
    } catch {
      case NonFatal(ex) =>
        throw new RuntimeException(s"Unable to delete from the metadata table '$metadataTableFullName'.", ex)
    }
  }

  /**
    * Builds the composite partition key.
    *
    * Note: assumes table names do not contain '#'; otherwise keys could collide.
    */
  private def getCompositeKey(tableName: String, infoDate: LocalDate): String = s"$tableName#$infoDate"

  /** Converts a DynamoDB item into a [[MetadataValue]]. */
  private def itemToMetadataValue(item: java.util.Map[String, AttributeValue]): MetadataValue = {
    val value = item.get(MetadataManagerDynamoDb.ATTR_METADATA_VALUE).s()
    val lastUpdated = Instant.ofEpochSecond(item.get(MetadataManagerDynamoDb.ATTR_LAST_UPDATED).n().toLong)
    MetadataValue(value, lastUpdated)
  }

  /**
    * Queries all items for the given composite key, following DynamoDB result pagination
    * (a single query response is limited to 1 MB of data).
    */
  private def queryAllItems(compositeKey: String): Seq[java.util.Map[String, AttributeValue]] = {
    val items = ListBuffer.empty[java.util.Map[String, AttributeValue]]
    var startKey: java.util.Map[String, AttributeValue] = null

    do {
      val requestBuilder = QueryRequest.builder()
        .tableName(metadataTableFullName)
        .keyConditionExpression(s"${MetadataManagerDynamoDb.ATTR_COMPOSITE_KEY} = :composite_key")
        .expressionAttributeValues(Map(
          ":composite_key" -> AttributeValue.builder().s(compositeKey).build()
        ).asJava)

      if (startKey != null) {
        requestBuilder.exclusiveStartKey(startKey)
      }

      val response = dynamoDbClient.query(requestBuilder.build())
      items ++= response.items().asScala
      startKey = if (response.hasLastEvaluatedKey) response.lastEvaluatedKey() else null
    } while (startKey != null)

    items.toList
  }

  /** Deletes a single metadata item by its full primary key. */
  private def deleteItem(compositeKey: String, key: String): Unit = {
    val deleteRequest = DeleteItemRequest.builder()
      .tableName(metadataTableFullName)
      .key(Map(
        MetadataManagerDynamoDb.ATTR_COMPOSITE_KEY -> AttributeValue.builder().s(compositeKey).build(),
        MetadataManagerDynamoDb.ATTR_METADATA_KEY -> AttributeValue.builder().s(key).build()
      ).asJava)
      .build()

    dynamoDbClient.deleteItem(deleteRequest)
  }

  /**
    * Creates the metadata table if it doesn't exist.
    */
  private def createMetadataTableIfNotExists(): Unit = {
    try {
      val describeRequest = DescribeTableRequest.builder()
        .tableName(metadataTableFullName)
        .build()

      dynamoDbClient.describeTable(describeRequest)
      log.info(s"Metadata table '$metadataTableFullName' already exists")
    } catch {
      case _: ResourceNotFoundException =>
        log.info(s"Creating metadata table '$metadataTableFullName'")
        createMetadataTable()
      case NonFatal(ex) =>
        log.error(s"Error checking if metadata table exists", ex)
        throw ex
    }
  }

  /**
    * Creates the metadata table in DynamoDB.
    */
  private def createMetadataTable(): Unit = {
    val createRequest = CreateTableRequest.builder()
      .tableName(metadataTableFullName)
      .attributeDefinitions(
        AttributeDefinition.builder()
          .attributeName(MetadataManagerDynamoDb.ATTR_COMPOSITE_KEY)
          .attributeType(ScalarAttributeType.S)
          .build(),
        AttributeDefinition.builder()
          .attributeName(MetadataManagerDynamoDb.ATTR_METADATA_KEY)
          .attributeType(ScalarAttributeType.S)
          .build()
      )
      .keySchema(
        KeySchemaElement.builder()
          .attributeName(MetadataManagerDynamoDb.ATTR_COMPOSITE_KEY)
          .keyType(KeyType.HASH)
          .build(),
        KeySchemaElement.builder()
          .attributeName(MetadataManagerDynamoDb.ATTR_METADATA_KEY)
          .keyType(KeyType.RANGE)
          .build()
      )
      .billingMode(BillingMode.PAY_PER_REQUEST)
      .build()

    dynamoDbClient.createTable(createRequest)
    // Table creation is asynchronous; wait until it is usable.
    waitForTableActive(metadataTableFullName, dynamoDbClient)
    log.info(s"Metadata table '$metadataTableFullName' created successfully")
  }

  /**
    * Closes the DynamoDB client.
    */
  override def close(): Unit = {
    try {
      dynamoDbClient.close()
    } catch {
      case NonFatal(ex) =>
        log.warn("Error closing DynamoDB client", ex)
    }
  }
}

object MetadataManagerDynamoDb {
  val DEFAULT_METADATA_TABLE = "metadata"
  val DEFAULT_TABLE_PREFIX = "pramen"

  // Attribute names for metadata table
  val ATTR_COMPOSITE_KEY = "compositeKey" // tableName#infoDate
  val ATTR_METADATA_KEY = "metadataKey"
  val ATTR_METADATA_VALUE = "metadataValue"
  val ATTR_LAST_UPDATED = "lastUpdated"
  val ATTR_TABLE_NAME = "tableName" // For filtering/queries
  val ATTR_INFO_DATE = "infoDate" // For filtering/queries

  /**
    * Builder for creating MetadataManagerDynamoDb instances.
    */
  class MetadataManagerDynamoDbBuilder {
    private var region: Option[String] = None
    private var tableArn: Option[String] = None
    private var tablePrefix: String = DEFAULT_TABLE_PREFIX
    private var credentialsProvider: Option[AwsCredentialsProvider] = None
    private var endpoint: Option[String] = None

    /** Sets the AWS region for the DynamoDB client (required). */
    def withRegion(region: String): MetadataManagerDynamoDbBuilder = {
      this.region = Some(region)
      this
    }

    /** Sets the table ARN prefix for cross-account or cross-region access. */
    def withTableArn(arn: String): MetadataManagerDynamoDbBuilder = {
      this.tableArn = Some(arn)
      this
    }

    /** Sets the table ARN prefix as an option; `None` keeps the default (no ARN prefix). */
    def withTableArn(arnOpt: Option[String]): MetadataManagerDynamoDbBuilder = {
      this.tableArn = arnOpt
      this
    }

    /** Sets the table name prefix to allow multiple metadata tables per account. */
    def withTablePrefix(prefix: String): MetadataManagerDynamoDbBuilder = {
      this.tablePrefix = prefix
      this
    }

    /** Sets a custom AWS credentials provider. */
    def withCredentialsProvider(provider: AwsCredentialsProvider): MetadataManagerDynamoDbBuilder = {
      this.credentialsProvider = Some(provider)
      this
    }

    /** Sets a custom DynamoDB endpoint (useful for LocalStack or DynamoDB Local). */
    def withEndpoint(endpoint: String): MetadataManagerDynamoDbBuilder = {
      this.endpoint = Some(endpoint)
      this
    }

    /**
      * Builds the MetadataManagerDynamoDb instance.
      *
      * @return Configured MetadataManagerDynamoDb instance
      * @throws IllegalArgumentException if the region is missing
      */
    def build(): MetadataManagerDynamoDb = {
      if (region.isEmpty) {
        throw new IllegalArgumentException("Region must be provided")
      }

      val clientBuilder = DynamoDbClient.builder()
        .region(Region.of(region.get))

      credentialsProvider.foreach(clientBuilder.credentialsProvider)
      endpoint.foreach { ep =>
        clientBuilder.endpointOverride(URI.create(ep))
      }

      val client = clientBuilder.build()

      try {
        new MetadataManagerDynamoDb(
          dynamoDbClient = client,
          tableArn = tableArn,
          tablePrefix = tablePrefix
        )
      } catch {
        // The constructor creates tables and may fail; do not leak the client.
        case NonFatal(ex) =>
          client.close()
          throw ex
      }
    }
  }

  /** Creates a new builder for MetadataManagerDynamoDb. */
  def builder: MetadataManagerDynamoDbBuilder = new MetadataManagerDynamoDbBuilder
}
*/ + override def close(): Unit = {} } diff --git a/pramen/core/src/main/scala/za/co/absa/pramen/core/metadata/MetadataManagerNull.scala b/pramen/core/src/main/scala/za/co/absa/pramen/core/metadata/MetadataManagerNull.scala index 3b91bd9e0..dda48941f 100644 --- a/pramen/core/src/main/scala/za/co/absa/pramen/core/metadata/MetadataManagerNull.scala +++ b/pramen/core/src/main/scala/za/co/absa/pramen/core/metadata/MetadataManagerNull.scala @@ -42,4 +42,6 @@ class MetadataManagerNull(isPersistenceEnabled: Boolean) extends MetadataManager def deleteMetadataFromStorage(tableName: String, infoDate: LocalDate): Unit = { throw new UnsupportedOperationException(errorMessage) } + + override def close(): Unit = {} } diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/BookkeepingConfigFactory.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/BookkeepingConfigFactory.scala index 6fc1f2623..17ca5bc9f 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/BookkeepingConfigFactory.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/BookkeepingConfigFactory.scala @@ -28,7 +28,10 @@ object BookkeepingConfigFactory { bookkeepingJdbcConfig: Option[JdbcConfig] = None, deltaDatabase: Option[String] = None, deltaTablePrefix: Option[String] = None, - temporaryDirectory: Option[String] = None): BookkeeperConfig = { + temporaryDirectory: Option[String] = None, + dynamoDbRegion: Option[String] = None, + dynamoDbTableArn: Option[String] = None, + dynamoDbTablePrefix: Option[String] = None): BookkeeperConfig = { BookkeeperConfig( bookkeepingEnabled, bookkeepingLocation, @@ -38,7 +41,10 @@ object BookkeepingConfigFactory { bookkeepingJdbcConfig, deltaDatabase, deltaTablePrefix, - temporaryDirectory + temporaryDirectory, + dynamoDbRegion, + dynamoDbTableArn, + dynamoDbTablePrefix ) } diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/metadata/MetadataManagerSpy.scala 
b/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/metadata/MetadataManagerSpy.scala index 997db6c6b..812ba06ca 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/metadata/MetadataManagerSpy.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/mocks/metadata/MetadataManagerSpy.scala @@ -65,4 +65,6 @@ class MetadataManagerSpy(isPersistent: Boolean) extends MetadataManagerBase(isPe metadataLocalStore.remove(MetadataTableKey(tableName, infoDate)) } + + override def close(): Unit = {} } diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/model/QueryBuilderSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/model/QueryBuilderSuite.scala index c5f4fa71c..fc8de3c0b 100644 --- a/pramen/core/src/test/scala/za/co/absa/pramen/core/model/QueryBuilderSuite.scala +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/model/QueryBuilderSuite.scala @@ -51,31 +51,31 @@ class QueryBuilderSuite extends AnyWordSpec { "throw an exception when no query configuration is specified" in { val conf = ConfigFactory.parseString("") - val exception = intercept[IllegalArgumentException] { + val ex = intercept[IllegalArgumentException] { QueryBuilder.fromConfig(conf, "", "") } - assert(exception.getMessage == "No options are specified for the query. Usually, it is one of: 'sql', 'path', 'table', 'db.table', 'topic'.") + assert(ex.getMessage == "No options are specified for the query. Usually, it is one of: 'sql', 'path', 'table', 'db.table', 'topic'.") } "throw an exception when the prefix is empty" in { val conf = ConfigFactory.parseString("data = /tmp") - val exception = intercept[IllegalArgumentException] { + val ex = intercept[IllegalArgumentException] { QueryBuilder.fromConfig(conf, "input", "") } - assert(exception.getMessage == "No options are specified for the 'input' query. 
Usually, it is one of: 'input.sql', 'input.path', 'input.table', 'input.db.table', 'input.topic'.") + assert(ex.getMessage == "No options are specified for the 'input' query. Usually, it is one of: 'input.sql', 'input.path', 'input.table', 'input.db.table', 'input.topic'.") } "throw an exception when the prefix is empty and parent is specified" in { val conf = ConfigFactory.parseString("data = /tmp") - val exception = intercept[IllegalArgumentException] { + val ex = intercept[IllegalArgumentException] { QueryBuilder.fromConfig(conf, "input", "my.parent") } - assert(exception.getMessage == "No options are specified for the 'input' query. Usually, it is one of: 'input.sql', 'input.path', 'input.table', 'input.db.table', 'input.topic' at my.parent.") + assert(ex.getMessage == "No options are specified for the 'input' query. Usually, it is one of: 'input.sql', 'input.path', 'input.table', 'input.db.table', 'input.topic' at my.parent.") } } } diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperDynamoDbBuilderSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperDynamoDbBuilderSuite.scala new file mode 100644 index 000000000..d0dbdf0e8 --- /dev/null +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/bookkeeper/BookkeeperDynamoDbBuilderSuite.scala @@ -0,0 +1,154 @@ +/* + * Copyright 2022 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
package za.co.absa.pramen.core.tests.bookkeeper

import org.scalatest.wordspec.AnyWordSpec
import software.amazon.awssdk.auth.credentials.{AwsBasicCredentials, StaticCredentialsProvider}
import za.co.absa.pramen.core.bookkeeper.BookkeeperDynamoDb

/**
  * Unit tests for the fluent builder of `BookkeeperDynamoDb` and for table name resolution.
  *
  * These tests do not connect to DynamoDB; they verify the fluent API shape,
  * `build()` validation, and the pure `getFullTableName` helper.
  */
class BookkeeperDynamoDbBuilderSuite extends AnyWordSpec {

  "BookkeeperDynamoDbBuilder" should {
    "use default table prefix" in {
      val builder = BookkeeperDynamoDb.builder
        .withRegion("us-east-1")
        .withBatchId(123456789L)

      // We can't instantiate without valid DynamoDB connection,
      // but we can verify the builder returns itself (fluent API)
      assert(builder.isInstanceOf[BookkeeperDynamoDb.BookkeeperDynamoDbBuilder])
    }

    "allow setting region" in {
      val builder = BookkeeperDynamoDb.builder
        .withRegion("eu-west-1")
        .withBatchId(123456789L)

      assert(builder.isInstanceOf[BookkeeperDynamoDb.BookkeeperDynamoDbBuilder])
    }

    "allow setting table ARN" in {
      val builder = BookkeeperDynamoDb.builder
        .withRegion("us-east-1")
        .withTableArn("arn:aws:dynamodb:us-east-1:123456789012:table/")
        .withBatchId(123456789L)

      assert(builder.isInstanceOf[BookkeeperDynamoDb.BookkeeperDynamoDbBuilder])
    }

    "allow setting table prefix" in {
      val builder = BookkeeperDynamoDb.builder
        .withRegion("us-east-1")
        .withTablePrefix("test_pramen")
        .withBatchId(123456789L)

      assert(builder.isInstanceOf[BookkeeperDynamoDb.BookkeeperDynamoDbBuilder])
    }

    "allow setting credentials provider" in {
      val credentials = AwsBasicCredentials.create("accessKey", "secretKey")
      val credentialsProvider = StaticCredentialsProvider.create(credentials)

      val builder = BookkeeperDynamoDb.builder
        .withRegion("us-east-1")
        .withCredentialsProvider(credentialsProvider)
        .withBatchId(123456789L)

      assert(builder.isInstanceOf[BookkeeperDynamoDb.BookkeeperDynamoDbBuilder])
    }

    "allow setting endpoint" in {
      val builder = BookkeeperDynamoDb.builder
        .withRegion("us-east-1")
        .withEndpoint("http://localhost:8000")
        .withBatchId(123456789L)

      assert(builder.isInstanceOf[BookkeeperDynamoDb.BookkeeperDynamoDbBuilder])
    }

    "allow setting batch ID" in {
      val batchId = 987654321L
      val builder = BookkeeperDynamoDb.builder
        .withRegion("us-east-1")
        .withBatchId(batchId)

      assert(builder.isInstanceOf[BookkeeperDynamoDb.BookkeeperDynamoDbBuilder])
    }

    "support fluent API chaining" in {
      val credentials = AwsBasicCredentials.create("accessKey", "secretKey")
      val credentialsProvider = StaticCredentialsProvider.create(credentials)

      val builder = BookkeeperDynamoDb.builder
        .withRegion("ap-southeast-2")
        .withTableArn("arn:aws:dynamodb:ap-southeast-2:123456789012:table/")
        .withTablePrefix("prod_pramen")
        .withCredentialsProvider(credentialsProvider)
        .withEndpoint("http://localhost:8000")
        .withBatchId(111222333L)

      assert(builder.isInstanceOf[BookkeeperDynamoDb.BookkeeperDynamoDbBuilder])
    }

    "throw IllegalArgumentException when region is not set" in {
      val builder = BookkeeperDynamoDb.builder
        .withBatchId(123456789L)

      val ex = intercept[IllegalArgumentException] {
        builder.build()
      }

      assert(ex.getMessage.contains("region"))
    }

    "throw IllegalArgumentException when batch ID is not set" in {
      val builder = BookkeeperDynamoDb.builder
        .withRegion("us-east-1")

      val ex = intercept[IllegalArgumentException] {
        builder.build()
      }

      assert(ex.getMessage.contains("BatchId is not supplied"))
    }
  }

  "BookkeeperDynamoDb.getFullTableName" should {
    "return table name when no ARN is provided" in {
      val result = BookkeeperDynamoDb.getFullTableName(None, "test_table")
      assert(result == "test_table")
    }

    "append 'table/' and the table name when the ARN ends with a slash but not with 'table/'" in {
      val arn = "arn:aws:dynamodb:us-east-1:123456789012:path/"
      val result = BookkeeperDynamoDb.getFullTableName(Some(arn), "test_table")
      assert(result == s"${arn}table/test_table")
    }

    "append only the table name when the ARN already ends with 'table/'" in {
      val arn = "arn:aws:dynamodb:us-east-1:123456789012:table/"
      val result = BookkeeperDynamoDb.getFullTableName(Some(arn), "test_table")
      assert(result == s"${arn}test_table")
    }

    "handle ARN without trailing slash by adding /table/" in {
      val arn = "arn:aws:dynamodb:eu-west-1:987654321098"
      val result = BookkeeperDynamoDb.getFullTableName(Some(arn), "my_table")
      assert(result == s"$arn/table/my_table")
    }
  }
}
package za.co.absa.pramen.core.tests.bookkeeper

import org.scalatest.wordspec.AnyWordSpec
import software.amazon.awssdk.auth.credentials.{AwsBasicCredentials, StaticCredentialsProvider}
import za.co.absa.pramen.core.bookkeeper.OffsetManagerDynamoDb

/**
  * Unit tests for the fluent builder of `OffsetManagerDynamoDb`.
  *
  * These tests do not connect to DynamoDB; they only verify that each builder
  * option is accepted by the fluent API and that `build()` validates required fields.
  */
class OffsetManagerDynamoDbBuilderSuite extends AnyWordSpec {

  "OffsetManagerDynamoDbBuilder" should {
    "use default table prefix when not specified" in {
      val builder = OffsetManagerDynamoDb.builder
        .withRegion("us-east-1")
        .withBatchId(123456789L)

      // The builder cannot be built without a live DynamoDB connection,
      // so only the fluent API shape is verified.
      assert(builder.isInstanceOf[OffsetManagerDynamoDb.OffsetManagerDynamoDbBuilder])
    }

    "allow setting region" in {
      val builder = OffsetManagerDynamoDb.builder
        .withRegion("eu-central-1")
        .withBatchId(123456789L)

      assert(builder.isInstanceOf[OffsetManagerDynamoDb.OffsetManagerDynamoDbBuilder])
    }

    "allow setting table ARN" in {
      val builder = OffsetManagerDynamoDb.builder
        .withRegion("us-west-2")
        .withTableArn("arn:aws:dynamodb:us-west-2:123456789012:table/")
        .withBatchId(123456789L)

      assert(builder.isInstanceOf[OffsetManagerDynamoDb.OffsetManagerDynamoDbBuilder])
    }

    "allow setting table prefix" in {
      val builder = OffsetManagerDynamoDb.builder
        .withRegion("ap-northeast-1")
        .withTablePrefix("staging_pramen")
        .withBatchId(123456789L)

      assert(builder.isInstanceOf[OffsetManagerDynamoDb.OffsetManagerDynamoDbBuilder])
    }

    "allow setting credentials provider" in {
      val credentials = AwsBasicCredentials.create("testAccessKey", "testSecretKey")
      val credentialsProvider = StaticCredentialsProvider.create(credentials)

      val builder = OffsetManagerDynamoDb.builder
        .withRegion("us-east-1")
        .withCredentialsProvider(credentialsProvider)
        .withBatchId(123456789L)

      assert(builder.isInstanceOf[OffsetManagerDynamoDb.OffsetManagerDynamoDbBuilder])
    }

    "allow setting endpoint for local testing" in {
      val builder = OffsetManagerDynamoDb.builder
        .withRegion("us-east-1")
        .withEndpoint("http://localhost:4566")
        .withBatchId(123456789L)

      assert(builder.isInstanceOf[OffsetManagerDynamoDb.OffsetManagerDynamoDbBuilder])
    }

    "allow setting batch ID" in {
      val batchId = 1234567890123L
      val builder = OffsetManagerDynamoDb.builder
        .withRegion("us-east-1")
        .withBatchId(batchId)

      assert(builder.isInstanceOf[OffsetManagerDynamoDb.OffsetManagerDynamoDbBuilder])
    }

    "support fluent API with all parameters" in {
      val credentials = AwsBasicCredentials.create("key", "secret")
      val credentialsProvider = StaticCredentialsProvider.create(credentials)
      val batchId = System.currentTimeMillis()

      val builder = OffsetManagerDynamoDb.builder
        .withRegion("sa-east-1")
        .withTableArn("arn:aws:dynamodb:sa-east-1:999888777666:table/")
        .withTablePrefix("dev_pramen")
        .withCredentialsProvider(credentialsProvider)
        .withEndpoint("http://dynamodb.local:8000")
        .withBatchId(batchId)

      assert(builder.isInstanceOf[OffsetManagerDynamoDb.OffsetManagerDynamoDbBuilder])
    }

    "throw IllegalArgumentException when region is missing" in {
      val builder = OffsetManagerDynamoDb.builder
        .withBatchId(123456789L)

      val ex = intercept[IllegalArgumentException] {
        builder.build()
      }

      assert(ex.getMessage.contains("Region"))
    }
  }
}
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.pramen.core.tests.journal + +import org.scalatest.wordspec.AnyWordSpec +import software.amazon.awssdk.auth.credentials.{AwsBasicCredentials, StaticCredentialsProvider} +import za.co.absa.pramen.core.journal.JournalDynamoDB + +class JournalDynamoDBBuilderSuite extends AnyWordSpec { + + "JournalDynamoDBBuilder" should { + "use default table prefix when not specified" in { + val builder = JournalDynamoDB.builder + .withRegion("us-east-1") + + assert(builder.isInstanceOf[JournalDynamoDB.JournalDynamoDBBuilder]) + } + + "allow setting region" in { + val builder = JournalDynamoDB.builder + .withRegion("eu-west-2") + + assert(builder.isInstanceOf[JournalDynamoDB.JournalDynamoDBBuilder]) + } + + "allow setting table ARN" in { + val builder = JournalDynamoDB.builder + .withRegion("us-west-1") + .withTableArn("arn:aws:dynamodb:us-west-1:111222333444:table/") + + assert(builder.isInstanceOf[JournalDynamoDB.JournalDynamoDBBuilder]) + } + + "allow setting table prefix" in { + val builder = JournalDynamoDB.builder + .withRegion("ap-south-1") + .withTablePrefix("qa_pramen") + + assert(builder.isInstanceOf[JournalDynamoDB.JournalDynamoDBBuilder]) + } + + "allow setting credentials provider" in { + val credentials = AwsBasicCredentials.create("myAccessKey", "mySecretKey") + val credentialsProvider = StaticCredentialsProvider.create(credentials) + + val builder = JournalDynamoDB.builder + .withRegion("us-east-2") + .withCredentialsProvider(credentialsProvider) + + assert(builder.isInstanceOf[JournalDynamoDB.JournalDynamoDBBuilder]) + } + + 
"allow setting endpoint for local development" in { + val builder = JournalDynamoDB.builder + .withRegion("local") + .withEndpoint("http://localhost:8000") + + assert(builder.isInstanceOf[JournalDynamoDB.JournalDynamoDBBuilder]) + } + + "support fluent API chaining" in { + val credentials = AwsBasicCredentials.create("testKey", "testSecret") + val credentialsProvider = StaticCredentialsProvider.create(credentials) + + val builder = JournalDynamoDB.builder + .withRegion("ca-central-1") + .withTableArn("arn:aws:dynamodb:ca-central-1:555666777888:table/") + .withTablePrefix("prod_journal") + .withCredentialsProvider(credentialsProvider) + .withEndpoint("http://dynamodb-local:8000") + + assert(builder.isInstanceOf[JournalDynamoDB.JournalDynamoDBBuilder]) + } + + "throw IllegalArgumentException when region is not set" in { + val builder = JournalDynamoDB.builder + + val ex = intercept[IllegalArgumentException] { + builder.build() + } + + assert(ex.getMessage.contains("Region")) + } + } +} diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/lock/TokenLockFactoryDynamoDbBuilderSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/lock/TokenLockFactoryDynamoDbBuilderSuite.scala new file mode 100644 index 000000000..a653303b0 --- /dev/null +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/lock/TokenLockFactoryDynamoDbBuilderSuite.scala @@ -0,0 +1,99 @@ +/* + * Copyright 2022 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package za.co.absa.pramen.core.tests.lock + +import org.scalatest.wordspec.AnyWordSpec +import software.amazon.awssdk.auth.credentials.{AwsBasicCredentials, StaticCredentialsProvider} +import za.co.absa.pramen.core.lock.TokenLockFactoryDynamoDb + +class TokenLockFactoryDynamoDbBuilderSuite extends AnyWordSpec { + + "TokenLockFactoryDynamoDbBuilder" should { + "use default table prefix when not set" in { + val builder = TokenLockFactoryDynamoDb.builder + .withRegion("us-east-1") + + assert(builder.isInstanceOf[TokenLockFactoryDynamoDb.TokenLockFactoryDynamoDbBuilder]) + } + + "allow setting region" in { + val builder = TokenLockFactoryDynamoDb.builder + .withRegion("eu-west-3") + + assert(builder.isInstanceOf[TokenLockFactoryDynamoDb.TokenLockFactoryDynamoDbBuilder]) + } + + "allow setting table ARN" in { + val builder = TokenLockFactoryDynamoDb.builder + .withRegion("ap-east-1") + .withTableArn("arn:aws:dynamodb:ap-east-1:999888777666:table/") + + assert(builder.isInstanceOf[TokenLockFactoryDynamoDb.TokenLockFactoryDynamoDbBuilder]) + } + + "allow setting table prefix" in { + val builder = TokenLockFactoryDynamoDb.builder + .withRegion("sa-east-1") + .withTablePrefix("lock_pramen") + + assert(builder.isInstanceOf[TokenLockFactoryDynamoDb.TokenLockFactoryDynamoDbBuilder]) + } + + "allow setting credentials provider" in { + val credentials = AwsBasicCredentials.create("lockKey", "lockSecret") + val credentialsProvider = StaticCredentialsProvider.create(credentials) + + val builder = TokenLockFactoryDynamoDb.builder + .withRegion("us-west-1") + .withCredentialsProvider(credentialsProvider) + + assert(builder.isInstanceOf[TokenLockFactoryDynamoDb.TokenLockFactoryDynamoDbBuilder]) + } + + "allow setting endpoint for testing" in { + val builder = TokenLockFactoryDynamoDb.builder + .withRegion("local") + .withEndpoint("http://dynamodb.local:8888") + + 
assert(builder.isInstanceOf[TokenLockFactoryDynamoDb.TokenLockFactoryDynamoDbBuilder]) + } + + "support full fluent API" in { + val credentials = AwsBasicCredentials.create("fluentKey", "fluentSecret") + val credentialsProvider = StaticCredentialsProvider.create(credentials) + + val builder = TokenLockFactoryDynamoDb.builder + .withRegion("cn-north-1") + .withTableArn("arn:aws-cn:dynamodb:cn-north-1:123456789012:table/") + .withTablePrefix("distributed_locks") + .withCredentialsProvider(credentialsProvider) + .withEndpoint("http://private-dynamodb:8000") + + assert(builder.isInstanceOf[TokenLockFactoryDynamoDb.TokenLockFactoryDynamoDbBuilder]) + } + + "throw IllegalArgumentException when region is not provided" in { + val builder = TokenLockFactoryDynamoDb.builder + + val ex = intercept[IllegalArgumentException] { + builder.build() + } + + assert(ex.getMessage.contains("Region")) + } + } +} diff --git a/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/metadata/MetadataManagerDynamoDbBuilderSuite.scala b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/metadata/MetadataManagerDynamoDbBuilderSuite.scala new file mode 100644 index 000000000..84e294732 --- /dev/null +++ b/pramen/core/src/test/scala/za/co/absa/pramen/core/tests/metadata/MetadataManagerDynamoDbBuilderSuite.scala @@ -0,0 +1,99 @@ +/* + * Copyright 2022 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package za.co.absa.pramen.core.tests.metadata + +import org.scalatest.wordspec.AnyWordSpec +import software.amazon.awssdk.auth.credentials.{AwsBasicCredentials, StaticCredentialsProvider} +import za.co.absa.pramen.core.metadata.MetadataManagerDynamoDb + +class MetadataManagerDynamoDbBuilderSuite extends AnyWordSpec { + + "MetadataManagerDynamoDbBuilder" should { + "use default table prefix" in { + val builder = MetadataManagerDynamoDb.builder + .withRegion("us-east-1") + + assert(builder.isInstanceOf[MetadataManagerDynamoDb.MetadataManagerDynamoDbBuilder]) + } + + "allow setting region" in { + val builder = MetadataManagerDynamoDb.builder + .withRegion("eu-north-1") + + assert(builder.isInstanceOf[MetadataManagerDynamoDb.MetadataManagerDynamoDbBuilder]) + } + + "allow setting table ARN" in { + val builder = MetadataManagerDynamoDb.builder + .withRegion("ap-southeast-1") + .withTableArn("arn:aws:dynamodb:ap-southeast-1:123123123123:table/") + + assert(builder.isInstanceOf[MetadataManagerDynamoDb.MetadataManagerDynamoDbBuilder]) + } + + "allow setting table prefix" in { + val builder = MetadataManagerDynamoDb.builder + .withRegion("me-south-1") + .withTablePrefix("test_metadata") + + assert(builder.isInstanceOf[MetadataManagerDynamoDb.MetadataManagerDynamoDbBuilder]) + } + + "allow setting credentials provider" in { + val credentials = AwsBasicCredentials.create("access", "secret") + val credentialsProvider = StaticCredentialsProvider.create(credentials) + + val builder = MetadataManagerDynamoDb.builder + .withRegion("af-south-1") + .withCredentialsProvider(credentialsProvider) + + assert(builder.isInstanceOf[MetadataManagerDynamoDb.MetadataManagerDynamoDbBuilder]) + } + + "allow setting endpoint" in { + val builder = MetadataManagerDynamoDb.builder + .withRegion("us-west-2") + .withEndpoint("http://localstack:4566") + + assert(builder.isInstanceOf[MetadataManagerDynamoDb.MetadataManagerDynamoDbBuilder]) + } + + "support complete fluent API chain" in { + val 
credentials = AwsBasicCredentials.create("myKey", "mySecret") + val credentialsProvider = StaticCredentialsProvider.create(credentials) + + val builder = MetadataManagerDynamoDb.builder + .withRegion("ap-northeast-2") + .withTableArn("arn:aws:dynamodb:ap-northeast-2:444333222111:table/") + .withTablePrefix("metadata_manager") + .withCredentialsProvider(credentialsProvider) + .withEndpoint("http://custom-endpoint:9000") + + assert(builder.isInstanceOf[MetadataManagerDynamoDb.MetadataManagerDynamoDbBuilder]) + } + + "throw IllegalArgumentException when region is missing" in { + val builder = MetadataManagerDynamoDb.builder + + val ex = intercept[IllegalArgumentException] { + builder.build() + } + + assert(ex.getMessage.contains("Region")) + } + } +} diff --git a/pramen/examples/dynamodb_bookkeeping/README.md b/pramen/examples/dynamodb_bookkeeping/README.md new file mode 100644 index 000000000..4e0f567c6 --- /dev/null +++ b/pramen/examples/dynamodb_bookkeeping/README.md @@ -0,0 +1,531 @@ +# DynamoDB Bookkeeping Example + +This example demonstrates how to configure Pramen to use AWS DynamoDB for bookkeeping instead of MongoDB, JDBC databases, or Hadoop-based storage. + +## Overview + +DynamoDB bookkeeping provides a serverless, fully managed solution for tracking pipeline state, record counts, and data availability in Pramen pipelines. 
+ +### Benefits + +- **Serverless**: No database servers to manage or maintain +- **Auto-scaling**: Automatically scales to handle workload +- **Pay-per-request**: No fixed costs, pay only for what you use +- **High Availability**: Built-in replication across AWS availability zones +- **Multi-environment**: Easy separation via table prefixes +- **Automatic Table Creation**: Tables are created automatically on first run + +## Configuration + +### Minimal Configuration + +```hocon +pramen.bookkeeping { + enabled = true + dynamodb.region = "us-east-1" +} +``` + +This creates tables: +- `pramen_bookkeeping` - Data availability and record counts +- `pramen_schemas` - Table schema evolution +- `pramen_locks` - Distributed locking (if locks enabled) +- `pramen_journal` - Task completion history +- `pramen_metadata` - Custom metadata key-value pairs +- `pramen_offsets` - Incremental ingestion offset tracking + +### Production Configuration + +```hocon +pramen.bookkeeping { + enabled = true + dynamodb.region = "us-east-1" + dynamodb.table.prefix = "pramen_production" +} +``` + +This creates tables: +- `pramen_production_bookkeeping` - Data availability and record counts +- `pramen_production_schemas` - Table schema evolution +- `pramen_production_locks` - Distributed locking (if locks enabled) +- `pramen_production_journal` - Task completion history +- `pramen_production_metadata` - Custom metadata key-value pairs +- `pramen_production_offsets` - Incremental ingestion offset tracking + +### Multi-Environment Configuration + +**Development:** +```hocon +pramen.bookkeeping { + enabled = true + dynamodb.region = "us-east-1" + dynamodb.table.prefix = "pramen_dev" +} +``` + +**Staging:** +```hocon +pramen.bookkeeping { + enabled = true + dynamodb.region = "us-east-1" + dynamodb.table.prefix = "pramen_staging" +} +``` + +**Production:** +```hocon +pramen.bookkeeping { + enabled = true + dynamodb.region = "us-east-1" + dynamodb.table.prefix = "pramen_production" +} +``` + +### 
Cross-Account Configuration + +If DynamoDB tables are in a different AWS account: + +```hocon +pramen.bookkeeping { + enabled = true + dynamodb.region = "us-west-2" + dynamodb.table.arn = "arn:aws:dynamodb:us-west-2:987654321098:table/" + dynamodb.table.prefix = "shared_pramen" +} +``` + +## AWS Setup + +### 1. AWS Credentials + +Pramen uses the AWS SDK's `DefaultCredentialsProvider`, which loads credentials from: + +1. **Environment Variables**: + ```bash + export AWS_ACCESS_KEY_ID=your_access_key + export AWS_SECRET_ACCESS_KEY=your_secret_key + export AWS_REGION=us-east-1 + ``` + +2. **AWS Credentials File** (`~/.aws/credentials`): + ```ini + [default] + aws_access_key_id = your_access_key + aws_secret_access_key = your_secret_key + region = us-east-1 + ``` + +3. **IAM Role** (recommended for EC2, ECS, EMR, etc.): + - No credentials needed in configuration + - Automatically uses the instance/task role + +### 2. Required IAM Permissions + +Create an IAM policy with these permissions: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "dynamodb:CreateTable", + "dynamodb:DescribeTable", + "dynamodb:Query", + "dynamodb:PutItem", + "dynamodb:DeleteItem", + "dynamodb:GetItem", + "dynamodb:Scan" + ], + "Resource": [ + "arn:aws:dynamodb:us-east-1:*:table/pramen_*" + ] + } + ] +} +``` + +**Note**: Adjust the region and table name pattern based on your configuration. + +### 3. 
Table Structure + +Tables are automatically created with the following schema: + +#### Bookkeeping Table (`{prefix}_bookkeeping`) +- **Partition Key**: `tableName` (String) +- **Sort Key**: `infoDate` (String, format: yyyy-MM-dd) +- **Billing Mode**: PAY_PER_REQUEST (on-demand) +- **Attributes**: + - `tableName`: Name of the metastore table + - `infoDate`: Information date + - `infoDateBegin`: Start of date range + - `infoDateEnd`: End of date range + - `inputRecordCount`: Number of input records + - `outputRecordCount`: Number of output records + - `jobStarted`: Job start timestamp (milliseconds) + - `jobFinished`: Job finish timestamp (milliseconds) + - `batchId`: Batch execution ID + - `appendedRecordCount`: Records appended (optional) + +#### Schema Table (`{prefix}_schemas`) +- **Partition Key**: `tableName` (String) +- **Sort Key**: `infoDate` (String) +- **Billing Mode**: PAY_PER_REQUEST (on-demand) +- **Attributes**: + - `tableName`: Name of the metastore table + - `infoDate`: Date when schema was recorded + - `schemaJson`: Spark schema in JSON format + +#### Offset Table (`{prefix}_offsets`) +- **Partition Key**: `pramenTableName` (String) +- **Sort Key**: `compositeKey` (String, format: "infoDate#createdAtMilli") +- **Billing Mode**: PAY_PER_REQUEST (on-demand) +- **Attributes**: + - `pramenTableName`: Name of the metastore table + - `compositeKey`: Composite key for efficient querying (infoDate#createdAtMilli) + - `infoDate`: Information date + - `dataType`: Offset data type (e.g., "IntegralType", "StringType") + - `minOffset`: Minimum offset value for this batch + - `maxOffset`: Maximum offset value for this batch + - `batchId`: Batch execution ID + - `createdAt`: Timestamp when offset was created (milliseconds) + - `committedAt`: Timestamp when offset was committed (milliseconds, optional) + +## Running the Example + +1. **Configure AWS credentials** (see above) + +2. 
**Update the configuration file**: + ```bash + vi examples/dynamodb_bookkeeping/dynamodb_bookkeeping.conf + ``` + +3. **Run Pramen**: + ```bash + spark-submit \ + --class za.co.absa.pramen.runner.PipelineRunner \ + --master local[*] \ + pramen-runner_2.12-1.13.10.jar \ + --config examples/dynamodb_bookkeeping/dynamodb_bookkeeping.conf \ + --date 2024-01-15 + ``` + +4. **Verify tables were created**: + ```bash + aws dynamodb list-tables --region us-east-1 + ``` + + You should see: + - `pramen_production_bookkeeping` + - `pramen_production_schemas` + - `pramen_production_offsets` (if using incremental ingestion) + - `pramen_production_locks` (if locks enabled) + - `pramen_production_journal` (if journal enabled) + - `pramen_production_metadata` (if metadata enabled) + +5. **Query bookkeeping data**: + ```bash + aws dynamodb query \ + --table-name pramen_production_bookkeeping \ + --key-condition-expression "tableName = :table" \ + --expression-attribute-values '{":table":{"S":"example_table"}}' \ + --region us-east-1 + ``` + +## Cost Considerations + +DynamoDB uses pay-per-request billing: + +- **On-demand mode** (default): + - Write: $1.25 per million write requests + - Read: $0.25 per million read requests + - Storage: $0.25 per GB-month + +- **Typical Pramen workload**: + - Small pipelines: < $1/month + - Medium pipelines: $5-20/month + - Large pipelines: $50-100/month + +**Cost optimization tips**: +1. Use table prefixes to separate environments (avoid duplicating production data) +2. Archive old bookkeeping data periodically +3. 
Monitor usage via AWS Cost Explorer + +## Troubleshooting + +### Issue: "Access Denied" error + +**Cause**: Missing IAM permissions + +**Solution**: Verify IAM policy includes all required DynamoDB permissions + +### Issue: "Region not found" error + +**Cause**: Invalid AWS region specified + +**Solution**: Check region name in configuration matches AWS region codes +(e.g., `us-east-1`, `eu-west-1`, `ap-southeast-1`) + +### Issue: Tables not created automatically + +**Cause**: Missing `dynamodb:CreateTable` permission + +**Solution**: Add CreateTable permission to IAM policy, or manually create tables: + +```bash +# Create bookkeeping table +aws dynamodb create-table \ + --table-name pramen_production_bookkeeping \ + --attribute-definitions \ + AttributeName=tableName,AttributeType=S \ + AttributeName=infoDate,AttributeType=S \ + --key-schema \ + AttributeName=tableName,KeyType=HASH \ + AttributeName=infoDate,KeyType=RANGE \ + --billing-mode PAY_PER_REQUEST \ + --region us-east-1 + +# Create schema table +aws dynamodb create-table \ + --table-name pramen_production_schemas \ + --attribute-definitions \ + AttributeName=tableName,AttributeType=S \ + AttributeName=infoDate,AttributeType=S \ + --key-schema \ + AttributeName=tableName,KeyType=HASH \ + AttributeName=infoDate,KeyType=RANGE \ + --billing-mode PAY_PER_REQUEST \ + --region us-east-1 +``` + +### Issue: Slow queries + +**Cause**: Large number of bookkeeping records + +**Solution**: +1. Use date range filters in queries +2. Consider implementing table retention policy +3. 
Archive old data to S3 + +## Comparison with Other Bookkeeping Options + +| Feature | DynamoDB | JDBC | MongoDB | Hadoop/Delta | +|---------|----------|------|---------|--------------| +| Setup Complexity | Low | Medium | Medium | Low | +| Maintenance | None | High | Medium | Low | +| Cost (small) | Very Low | Medium | Medium | Very Low | +| Cost (large) | Medium | High | Medium | Low | +| Scaling | Automatic | Manual | Manual | Automatic | +| Multi-region | Yes | No | Yes | Yes | +| Query Performance | Fast | Fast | Fast | Slower | +| Incremental Support | Yes | Yes | No | No | + +## Distributed Locking with DynamoDB + +When DynamoDB is configured for bookkeeping, Pramen automatically uses it for distributed locking to prevent concurrent pipeline runs. This ensures data consistency in multi-instance deployments. + +### How It Works + +1. **Automatic Lock Table Creation**: A locks table is created automatically using a builder pattern: + - Table name: `{prefix}_locks` (e.g., `pramen_production_locks`) + - Schema: `token` (partition key), `owner`, `expires`, `createdAt` + - Created via `TokenLockFactoryDynamoDb.builder` + +2. **Lock Acquisition**: Uses DynamoDB conditional writes (`attribute_not_exists`) for atomic lock operations + +3. **Lock Renewal**: Active pipelines automatically renew their locks every 2 minutes + +4. **Lock Expiration**: Locks expire after 10 minutes of inactivity and can be taken over + +5. **Hard Expiration**: Stale locks are cleaned up after 1 day + +6. 
**Builder Pattern**: Lock factory is created using a fluent builder API for flexible configuration + +### Configuration + +Enable locking along with DynamoDB bookkeeping: + +```hocon +pramen { + # Enable distributed locking + runtime.use.locks = true + + bookkeeping { + enabled = true + dynamodb.region = "us-east-1" + dynamodb.table.prefix = "pramen_production" + } +} +``` + +This creates six tables: +- `pramen_production_bookkeeping` - Bookkeeping data +- `pramen_production_schemas` - Table schemas +- `pramen_production_locks` - Distributed locks +- `pramen_production_journal` - Task completion history +- `pramen_production_metadata` - Custom metadata +- `pramen_production_offsets` - Incremental ingestion offsets + +See `dynamodb_with_locks.conf` for a complete example. + +### IAM Permissions for Locks + +Add the locks table to your IAM policy: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "dynamodb:CreateTable", + "dynamodb:DescribeTable", + "dynamodb:PutItem", + "dynamodb:GetItem", + "dynamodb:DeleteItem", + "dynamodb:UpdateItem" + ], + "Resource": [ + "arn:aws:dynamodb:*:*:table/pramen_production_bookkeeping", + "arn:aws:dynamodb:*:*:table/pramen_production_schemas", + "arn:aws:dynamodb:*:*:table/pramen_production_locks", + "arn:aws:dynamodb:*:*:table/pramen_production_journal", + "arn:aws:dynamodb:*:*:table/pramen_production_metadata", + "arn:aws:dynamodb:*:*:table/pramen_production_offsets" + ] + } + ] +} +``` + +### Programmatic Usage + +You can also create lock factories programmatically using the builder pattern: + +```scala +import za.co.absa.pramen.core.lock.TokenLockFactoryDynamoDb + +// Basic usage +val lockFactory = TokenLockFactoryDynamoDb.builder + .withRegion("us-east-1") + .withTablePrefix("my_app") + .build() + +try { + val lock = lockFactory.getLock("my_pipeline") + + if (lock.tryAcquire()) { + try { + // Run your pipeline + } finally { + lock.release() + } + } +} finally { + 
lockFactory.close() +} + +// Testing with DynamoDB Local +val testFactory = TokenLockFactoryDynamoDb.builder + .withRegion("us-east-1") + .withEndpoint("http://localhost:8000") + .build() +``` + + +### Lock Behavior + +**Scenario 1: Single Pipeline Run** +- Pipeline acquires lock → processes data → releases lock + +**Scenario 2: Concurrent Pipeline Runs** +- Instance A acquires lock → starts processing +- Instance B tries to acquire same lock → blocked (lock already held) +- Instance A completes → releases lock +- Instance B can now acquire lock (if still attempting) + +**Scenario 3: Pipeline Crash** +- Pipeline acquires lock → crashes +- Lock expires after 10 minutes (no renewal) +- New pipeline run can take over expired lock + +### Monitoring Locks + +Query active locks: + +```bash +aws dynamodb scan \ + --table-name pramen_production_locks \ + --region us-east-1 +``` + +Check specific lock: + +```bash +aws dynamodb get-item \ + --table-name pramen_production_locks \ + --key '{"token":{"S":"my_pipeline_lock"}}' \ + --region us-east-1 +``` + +Manually release stuck lock (use with caution): + +```bash +aws dynamodb delete-item \ + --table-name pramen_production_locks \ + --key '{"token":{"S":"my_pipeline_lock"}}' \ + --region us-east-1 +``` + +### Lock Cost + +Lock operations add minimal cost: +- Lock acquisition: 1 write request (~$0.00000125) +- Lock renewal (every 2 min): 1 write request per renewal +- Lock release: 1 delete request (~$0.00000125) +- Total per pipeline run: ~$0.00001 (for 10-minute pipeline) + +## Advanced Topics + +### Using DynamoDB Local for Development + +For local development/testing, use DynamoDB Local: + +1. **Start DynamoDB Local**: + ```bash + docker run -p 8000:8000 amazon/dynamodb-local + ``` + +2. 
**Configure endpoint** (requires code modification): + ```scala + val client = DynamoDbClient.builder() + .endpointOverride(new URI("http://localhost:8000")) + .region(Region.US_EAST_1) + .build() + ``` + +### Table Backup and Restore + +Use AWS Backup or DynamoDB point-in-time recovery: + +```bash +# Enable point-in-time recovery +aws dynamodb update-continuous-backups \ + --table-name pramen_production_bookkeeping \ + --point-in-time-recovery-specification PointInTimeRecoveryEnabled=true +``` + +### Monitoring + +Monitor DynamoDB metrics in CloudWatch: +- `UserErrors` - Check for configuration issues +- `ConsumedReadCapacityUnits` / `ConsumedWriteCapacityUnits` - Monitor costs +- `SystemErrors` - Check for service issues + +## References + +- [AWS DynamoDB Documentation](https://docs.aws.amazon.com/dynamodb/) +- [AWS SDK for Java Documentation](https://docs.aws.amazon.com/sdk-for-java/) +- [DynamoDB Best Practices](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/best-practices.html) diff --git a/pramen/examples/dynamodb_bookkeeping/dynamodb_bookkeeping.conf b/pramen/examples/dynamodb_bookkeeping/dynamodb_bookkeeping.conf new file mode 100644 index 000000000..bd36630b4 --- /dev/null +++ b/pramen/examples/dynamodb_bookkeeping/dynamodb_bookkeeping.conf @@ -0,0 +1,123 @@ +# Copyright 2022 ABSA Group Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# ============================================================================= +# Example Configuration: DynamoDB Bookkeeping +# ============================================================================= +# +# This example shows how to configure Pramen to use AWS DynamoDB for +# bookkeeping instead of MongoDB, JDBC, or Hadoop-based storage. +# +# DynamoDB bookkeeping provides: +# - Serverless, fully managed storage +# - Pay-per-request billing (no fixed costs) +# - Automatic scaling +# - High availability across AWS regions +# - Multi-environment support via table prefixes +# + +# General options +pramen { + environment.name = "Production" + pipeline.name = "DynamoDB Bookkeeping Example" + + # Enable bookkeeping with DynamoDB + bookkeeping { + enabled = true + + # ======================================================================= + # DynamoDB Configuration + # ======================================================================= + + # AWS Region where DynamoDB tables will be created/accessed + # REQUIRED when using DynamoDB bookkeeping + dynamodb.region = "af-south-1" + + # Table prefix for multi-environment/multi-tenant deployments + # OPTIONAL - defaults to "pramen" if not specified + # Creates tables: {prefix}_bookkeeping and {prefix}_schemas + dynamodb.table.prefix = "pramen_production" + + # Table ARN prefix for cross-account or resource-based policy access + # OPTIONAL - only needed for advanced scenarios + # Format: arn:aws:dynamodb:region:account-id:table/ + # dynamodb.table.arn = "arn:aws:dynamodb:us-east-1:123456789012:table/" + + # ======================================================================= + # Notes on DynamoDB Configuration + # ======================================================================= + # + # 1. 
AWS Credentials: + # - Pramen uses the AWS SDK's DefaultCredentialsProvider + # - Credentials are loaded from: + # a) Environment variables (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY) + # b) AWS credentials file (~/.aws/credentials) + # c) IAM role (when running on EC2, ECS, Lambda, etc.) + # + # 2. Required IAM Permissions: + # Your AWS credentials/role must have permissions for: + # - dynamodb:CreateTable (for automatic table creation) + # - dynamodb:DescribeTable + # - dynamodb:Query + # - dynamodb:PutItem + # - dynamodb:DeleteItem + # + # 3. Table Structure: + # Tables are automatically created with: + # - Bookkeeping: Partition key=tableName, Sort key=infoDate + # - Schemas: Partition key=tableName, Sort key=infoDate + # - Billing mode: PAY_PER_REQUEST (on-demand) + # + # 4. Multi-Environment Setup: + # Use different table prefixes for different environments: + # - Dev: dynamodb.table.prefix = "pramen_dev" + # - Staging: dynamodb.table.prefix = "pramen_staging" + # - Production: dynamodb.table.prefix = "pramen_production" + # + # 5. 
Cross-Account Access: + # If tables are in a different AWS account, use the table ARN: + # dynamodb.table.arn = "arn:aws:dynamodb:us-west-2:987654321098:table/" + # + + # Hadoop format is required even when using DynamoDB + # (legacy requirement, set to any valid value) + hadoop.format = "delta" + } + + # Temporary directory (optional) + temporary.directory = "/tmp" +} + +# Metastore configuration +pramen.metastore { + tables = [ + { + name = "example_table" + format = "delta" + path = "/data/lake/example_table" + } + ] +} + +# Operations +pramen.operations = [ + { + name = "Example Operation" + type = "transformation" + schedule.type = "daily" + + transformer.class = "za.co.absa.pramen.core.transformers.IdentityTransformer" + input.table = "example_table" + output.table = "example_table" + } +] diff --git a/pramen/examples/dynamodb_bookkeeping/dynamodb_with_locks.conf b/pramen/examples/dynamodb_bookkeeping/dynamodb_with_locks.conf new file mode 100644 index 000000000..0ec56e079 --- /dev/null +++ b/pramen/examples/dynamodb_bookkeeping/dynamodb_with_locks.conf @@ -0,0 +1,153 @@ +# DynamoDB Bookkeeping with Distributed Locking Example +# +# This configuration demonstrates how to use DynamoDB for both bookkeeping AND distributed locking. +# When DynamoDB is configured for bookkeeping, Pramen automatically uses it for locks as well. 
+ +pramen { + # Enable locking to prevent concurrent pipeline runs + runtime.use.locks = true + + # Bookkeeping configuration + bookkeeping { + enabled = true + + # DynamoDB Configuration + # ===================== + + # AWS region where DynamoDB tables will be created (REQUIRED) + dynamodb.region = "us-east-1" + + # Table prefix for all Pramen tables (OPTIONAL, default: "pramen") + # This creates: {prefix}_bookkeeping, {prefix}_schemas, {prefix}_locks + dynamodb.table.prefix = "pramen_production" + + # Table ARN prefix for cross-account or cross-region access (OPTIONAL) + # dynamodb.table.arn = "arn:aws:dynamodb:us-east-1:123456789012:table/" + + # Legacy required field (not used for DynamoDB but must be set) + hadoop.format = "delta" + } + + # Pipeline configuration + environment.name = "production" + pipeline.name = "example_pipeline" + + # Metastore configuration (example) + metastore { + tables = [ + { + name = "customer_data" + format = "parquet" + path = "/data/customers" + } + ] + } + + # Example operations + operations = [ + { + name = "ingest_customers" + type = "ingestion" + schedule.type = "daily" + + source { + factory.class = "za.co.absa.pramen.core.source.JdbcSource" + jdbc.url = "jdbc:postgresql://localhost:5432/source_db" + jdbc.user = "reader" + jdbc.password = "secret" + query = "SELECT * FROM customers WHERE date = :infoDate" + } + + metastore.table = "customer_data" + } + ] +} + +# ============================================================================ +# How This Configuration Works +# ============================================================================ +# +# 1. BOOKKEEPING TABLES: +# - pramen_production_bookkeeping: Stores data chunk metadata +# - pramen_production_schemas: Stores table schemas +# +# 2. LOCK TABLE: +# - pramen_production_locks: Stores distributed locks +# +# 3. JOURNAL TABLE: +# - pramen_production_journal: Stores task completion records +# +# 4. 
METADATA TABLE: +# - pramen_production_metadata: Stores custom metadata key-value pairs +# +# 5. AUTOMATIC TABLE CREATION: +# All tables are created automatically on first run with proper schema: +# - Partition keys and sort keys configured +# - PAY_PER_REQUEST billing mode (on-demand) +# - Six tables total: bookkeeping, schemas, locks, journal, metadata, offsets +# +# 6. LOCK BEHAVIOR: +# When a pipeline runs: +# - Acquires a lock by writing to pramen_production_locks table +# - Uses DynamoDB conditional writes (attribute_not_exists) for atomicity +# - Lock ticket expires after 10 minutes (automatically renewed) +# - If another instance tries to run, it will be blocked +# - Lock is released when pipeline completes or fails +# +# 7. MULTI-ENVIRONMENT SETUP: +# Use different table prefixes for different environments: +# - Dev: pramen_dev_* +# - Staging: pramen_staging_* +# - Production: pramen_production_* +# +# 8. AWS CREDENTIALS: +# Pramen uses AWS DefaultCredentialsProvider which loads from: +# - Environment variables (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY) +# - AWS credentials file (~/.aws/credentials) +# - IAM role (EC2, ECS, EMR, Lambda, Glue) +# +# 9. IAM PERMISSIONS REQUIRED: +# { +# "Version": "2012-10-17", +# "Statement": [ +# { +# "Effect": "Allow", +# "Action": [ +# "dynamodb:CreateTable", +# "dynamodb:DescribeTable", +# "dynamodb:PutItem", +# "dynamodb:GetItem", +# "dynamodb:DeleteItem", +# "dynamodb:UpdateItem", +# "dynamodb:Query", +# "dynamodb:Scan" +# ], +# "Resource": [ +# "arn:aws:dynamodb:*:*:table/pramen_production_bookkeeping", +# "arn:aws:dynamodb:*:*:table/pramen_production_schemas", +# "arn:aws:dynamodb:*:*:table/pramen_production_locks", +# "arn:aws:dynamodb:*:*:table/pramen_production_journal", +# "arn:aws:dynamodb:*:*:table/pramen_production_metadata", +# "arn:aws:dynamodb:*:*:table/pramen_production_offsets" +# ] +# } +# ] +# } +# +# 10. 
TESTING: +# spark-submit --class za.co.absa.pramen.runner.PipelineRunner \ +# pramen-runner_2.12-1.13.10.jar \ +# --config dynamodb_with_locks.conf \ +# --date 2024-01-15 +# +# 11. COST OPTIMIZATION: +# - PAY_PER_REQUEST billing: $1.25 per million writes, $0.25 per million reads +# - Typical pipeline: ~10-20 requests per run +# - Cost per run: < $0.001 +# - Monthly cost for 100 daily runs: ~$3 +# +# 12. LOCK EXPIRATION: +# - Lock tickets expire after 10 minutes of inactivity +# - Active pipelines renew their lock every 2 minutes +# - Expired locks can be taken over by new pipeline runs +# - Hard expiration after 1 day (cleanup of stale locks) diff --git a/pramen/extras/src/main/scala/za/co/absa/pramen/extras/notification/mq/SingleMessageProducer.scala b/pramen/extras/src/main/scala/za/co/absa/pramen/extras/notification/mq/SingleMessageProducer.scala index 95c92d0af..3c3c4d458 100644 --- a/pramen/extras/src/main/scala/za/co/absa/pramen/extras/notification/mq/SingleMessageProducer.scala +++ b/pramen/extras/src/main/scala/za/co/absa/pramen/extras/notification/mq/SingleMessageProducer.scala @@ -16,10 +16,8 @@ package za.co.absa.pramen.extras.notification.mq -trait SingleMessageProducer { +trait SingleMessageProducer extends AutoCloseable { def send(topic: String, message: String, numberOrRetries: Int = 3): Unit def connect(): Unit - - def close(): Unit } diff --git a/pramen/pom.xml b/pramen/pom.xml index 3bdeff7e5..f3c429dbf 100644 --- a/pramen/pom.xml +++ b/pramen/pom.xml @@ -143,6 +143,7 @@ 1.1.4 1.10.3 0-10 + 2.42.23 true @@ -372,6 +373,13 @@ 0.8.0 + + + software.amazon.awssdk + dynamodb + ${aws.sdk.version} + + org.scalatest diff --git a/pramen/project/Dependencies.scala b/pramen/project/Dependencies.scala index b382b0da7..572b2da9a 100644 --- a/pramen/project/Dependencies.scala +++ b/pramen/project/Dependencies.scala @@ -29,6 +29,7 @@ object Dependencies { def CoreDependencies(scalaVersion: String, isDeltaCompile: Boolean): Seq[ModuleID] = Seq( "org.apache.spark" 
%% "spark-sql" % sparkVersion(scalaVersion) % Provided, + "software.amazon.awssdk" % "dynamodb" % awsSdkVersion % Provided, "org.mongodb.scala" %% "mongo-scala-driver" % mongoDbScalaDriverVersion, "com.typesafe.slick" %% "slick" % slickVersion, "com.typesafe.slick" %% "slick-hikaricp" % slickVersion, diff --git a/pramen/project/Versions.scala b/pramen/project/Versions.scala index 86725e468..64e7c57c1 100644 --- a/pramen/project/Versions.scala +++ b/pramen/project/Versions.scala @@ -37,6 +37,7 @@ object Versions { val scalatestVersion = "3.2.14" val mockitoVersion = "2.28.2" val httpClientVersion = "4.5.14" + val awsSdkVersion = "2.42.23" def sparkFallbackVersion(scalaVersion: String): String = { if (scalaVersion.startsWith("2.11.")) {