Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

EVA-3543 check if rs with same hash already exist before merging #441

Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
import uk.ac.ebi.eva.accession.clustering.metric.ClusteringMetric;
import uk.ac.ebi.eva.accession.core.model.IClusteredVariant;
import uk.ac.ebi.eva.accession.core.model.ISubmittedVariant;
import uk.ac.ebi.eva.accession.core.model.dbsnp.DbsnpClusteredVariantEntity;
import uk.ac.ebi.eva.accession.core.model.dbsnp.DbsnpSubmittedVariantEntity;
import uk.ac.ebi.eva.accession.core.model.dbsnp.DbsnpSubmittedVariantOperationEntity;
import uk.ac.ebi.eva.accession.core.model.eva.ClusteredVariantEntity;
Expand Down Expand Up @@ -192,6 +193,18 @@ public void writeRSMerge(SubmittedVariantOperationEntity currentOperation)
getMergeDestinationAndMergees(mergeCandidates);
ClusteredVariantEntity mergeDestination = mergeDestinationAndMergees.getLeft();

// check if any variant with same hash as mergeDestination already exist in DB
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This section is only relevant if the list of mergees has more than 2 elements. Can we test this before going through this ?

Copy link
Contributor

@apriltuesday apriltuesday Apr 25, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does it also work (and maybe make more sense) to actually just extract this removal into a method that's called before the for-loop, and modify it so it checks for all mergee accessions instead of just one?

I'm sure this implementation works, but I'm wary of essentially redo-ing merge detection and prioritisation logic rather than just trusting the candidates we have.

ClusteredVariantEntity existingClusteredVariantEntity = getClusteredVariantEntityWithHash(mergeDestination);
if (existingClusteredVariantEntity != null && existingClusteredVariantEntity.getAccession()!=mergeDestination.getAccession()) {
if (mergeDestination.getAccession() == ClusteredVariantMergingPolicy.prioritise(mergeDestination.getAccession(),
existingClusteredVariantEntity.getAccession()).accessionToKeep) {
merge(mergeDestination, existingClusteredVariantEntity, currentOperation);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The existingClusteredVariantEntity is also part of the mergees so we will likely have to remove it from the list

} else {
merge(existingClusteredVariantEntity, mergeDestination, currentOperation);
mergeDestination = existingClusteredVariantEntity;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this part is necessary. If the existing cluster variant entity document is the merge destination then the norm code bellow would work.

}
}

List<ClusteredVariantEntity> mergees = mergeDestinationAndMergees.getRight();
for (ClusteredVariantEntity mergee: mergees) {
logger.info("RS merge operation: Merging rs{} to rs{} due to hash collision...",
Expand All @@ -200,6 +213,19 @@ public void writeRSMerge(SubmittedVariantOperationEntity currentOperation)
}
}

private ClusteredVariantEntity getClusteredVariantEntityWithHash(ClusteredVariantEntity cve) {
ClusteredVariantEntity existingClusteredVariantEntity = null;
existingClusteredVariantEntity = mongoTemplate.findOne(query(where(ID_ATTRIBUTE).is(cve.getHashedMessage())),
ClusteredVariantEntity.class);
if (existingClusteredVariantEntity != null) {
return existingClusteredVariantEntity;
} else {
existingClusteredVariantEntity = mongoTemplate.findOne(query(where(ID_ATTRIBUTE).is(cve.getHashedMessage())),
DbsnpClusteredVariantEntity.class);
}
return existingClusteredVariantEntity;
}

private ImmutablePair<String, Long> getHashedMessageAndAccessionForSVIE(SubmittedVariantInactiveEntity svie) {
return new ImmutablePair<>(svie.getHashedMessage(), svie.getAccession());
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -432,6 +432,82 @@ public void testMultiLevelRSMerges() throws Exception {
assertRSAssociatedWithSS(1L, ss4);
}

@Test
@DirtiesContext
public void testRSWithSameHashAsMergeDestinationAlreadyExistsInDBWithHigherAccession() throws Exception {
ss1 = createSS(1L, 1L, 100L, "C", "T");
ss2 = createSS(2L, 2L, 100L, "C", "A");
ss3 = createSS(3L, 3L, 100L, "A", "G");
this.mongoTemplate.insert(Arrays.asList(ss1, ss2, ss3), DBSNP_SUBMITTED_VARIANT_COLLECTION);

DbsnpClusteredVariantEntity rs2 = this.createRS(ss2);
this.mongoTemplate.insert(rs2);

SubmittedVariantOperationEntity mergeOperation1 = new SubmittedVariantOperationEntity();
mergeOperation1.fill(RSMergeAndSplitCandidatesReaderConfiguration.MERGE_CANDIDATES_EVENT_TYPE,
ss1.getAccession(), null, "Different RS with matching loci",
Stream.of(ss1, ss2, ss3).map(SubmittedVariantInactiveEntity::new).collect(Collectors.toList()));
mergeOperation1.setId(ClusteringWriter.getMergeCandidateId(mergeOperation1));
this.mongoTemplate.insert(Arrays.asList(mergeOperation1), SUBMITTED_VARIANT_OPERATION_COLLECTION);

List<SubmittedVariantOperationEntity> submittedVariantOperationEntities = new ArrayList<>();
SubmittedVariantOperationEntity temp;
rsMergeCandidatesReader.open(new ExecutionContext());
while ((temp = rsMergeCandidatesReader.read()) != null) {
submittedVariantOperationEntities.add(temp);
}

//Perform merge
rsMergeWriter.write(submittedVariantOperationEntities);

// Check rs2,rs3 merged into to rs1
assertMergeOp(ss2.getClusteredVariantAccession(), ss1.getClusteredVariantAccession(), ASSEMBLY);
assertMergeOp(ss3.getClusteredVariantAccession(), ss1.getClusteredVariantAccession(), ASSEMBLY);

// Check rs1 to rs3 all have same rs
assertRSAssociatedWithSS(1L, ss1);
assertRSAssociatedWithSS(1L, ss2);
assertRSAssociatedWithSS(1L, ss3);
}

@Test
@DirtiesContext
public void testRSWithSameHashAsMergeDestinationAlreadyExistsInDBWithLowerAccession() throws Exception {
ss1 = createSS(1L, 1L, 100L, "C", "T");
ss2 = createSS(2L, 2L, 100L, "C", "A");
ss3 = createSS(3L, 3L, 100L, "A", "G");
this.mongoTemplate.insert(Arrays.asList(ss1, ss2, ss3), DBSNP_SUBMITTED_VARIANT_COLLECTION);

DbsnpClusteredVariantEntity rs1 = this.createRS(ss1);
this.mongoTemplate.insert(rs1);

SubmittedVariantOperationEntity mergeOperation1 = new SubmittedVariantOperationEntity();
mergeOperation1.fill(RSMergeAndSplitCandidatesReaderConfiguration.MERGE_CANDIDATES_EVENT_TYPE,
ss2.getAccession(), null, "Different RS with matching loci",
Stream.of(ss2, ss3).map(SubmittedVariantInactiveEntity::new).collect(Collectors.toList()));
mergeOperation1.setId(ClusteringWriter.getMergeCandidateId(mergeOperation1));
this.mongoTemplate.insert(Arrays.asList(mergeOperation1), SUBMITTED_VARIANT_OPERATION_COLLECTION);

List<SubmittedVariantOperationEntity> submittedVariantOperationEntities = new ArrayList<>();
SubmittedVariantOperationEntity temp;
rsMergeCandidatesReader.open(new ExecutionContext());
while ((temp = rsMergeCandidatesReader.read()) != null) {
submittedVariantOperationEntities.add(temp);
}

//Perform merge
rsMergeWriter.write(submittedVariantOperationEntities);

// Check rs2,rs3 merged into to rs1
assertMergeOp(ss2.getClusteredVariantAccession(), ss1.getClusteredVariantAccession(), ASSEMBLY);
assertMergeOp(ss3.getClusteredVariantAccession(), ss1.getClusteredVariantAccession(), ASSEMBLY);

// Check rs1 to rs3 all have same rs
assertRSAssociatedWithSS(1L, ss1);
assertRSAssociatedWithSS(1L, ss2);
assertRSAssociatedWithSS(1L, ss3);
}

private void assertMergeOp(Long mergee, Long merger, String assemblyToUse) {
assertEquals(0, this.clusteredVariantAccessioningService
.getAllActiveByAssemblyAndAccessionIn(assemblyToUse, Arrays.asList(mergee)).size());
Expand Down