Skip to content

Commit

Permalink
registerTable needs to retry more times before giving up (#3642)
Browse files Browse the repository at this point in the history
Right after an upgrade, if a large number of tables are
opened in parallel, the chances of collisions increase
manyfold, since many tables may share the same protobuf schemas
under the hood.
  • Loading branch information
hisundar committed Jun 6, 2023
1 parent f2ec6e1 commit 2e87474
Showing 1 changed file with 17 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import org.corfudb.runtime.collections.TableOptions;
import org.corfudb.runtime.collections.TableParameters;
import org.corfudb.runtime.collections.streaming.StreamingManager;
import org.corfudb.runtime.exceptions.AbortCause;
import org.corfudb.runtime.exceptions.SerializerException;
import org.corfudb.runtime.exceptions.TransactionAbortedException;
import org.corfudb.runtime.object.transactions.TransactionType;
Expand Down Expand Up @@ -247,7 +248,12 @@ void registerTable(@Nonnull String namespace,
}
TableMetadata tableMetadata = metadataBuilder.build();

int numRetries = 9; // Since this is an internal transaction, retry a few times before giving up.
// Since this is an internal transaction, retry a few times before giving up.
final int minRetryCount = 16;
// Some clients open a large number of tables in parallel using ForkJoin thread pools
// greatly increasing the chances of collisions and transaction aborts.
// So set the number of retries as a factor of the number of cores in the system.
int numRetries = Math.max(minRetryCount, Runtime.getRuntime().availableProcessors());
while (numRetries-- > 0) {
// Schema validation to ensure that there is either a proper modification of the schema across open calls
// or no modification to the protobuf files at all.
Expand Down Expand Up @@ -293,7 +299,17 @@ void registerTable(@Nonnull String namespace,
this.runtime.getObjectsView().TXEnd();
break;
} catch (TransactionAbortedException txAbort) {
if (txAbort.getAbortCause() == AbortCause.CONFLICT &&
txAbort.getConflictStream().equals(protobufDescriptorTable.getCorfuStreamID())) {
// Updates to protobuf descriptor tables are internal so conflicts hit here
// should not count towards the normal retry count.
log.info("registerTable {}${} failed due to conflict in protobuf descriptors. Retrying",
namespace, tableName);
numRetries++;
continue;
}
if (numRetries <= 0) {
log.error("registerTable failed. Retries exhausted. Cause {}", numRetries, txAbort);
throw txAbort;
}
log.info("registerTable: commit failed. Will retry {} times. Cause {}", numRetries, txAbort);
Expand Down

0 comments on commit 2e87474

Please sign in to comment.