diff --git a/qannotate/src/au/edu/qimr/qannotate/nanno/Annotate.java b/qannotate/src/au/edu/qimr/qannotate/nanno/Annotate.java index 5b2a96645..da4d1a054 100644 --- a/qannotate/src/au/edu/qimr/qannotate/nanno/Annotate.java +++ b/qannotate/src/au/edu/qimr/qannotate/nanno/Annotate.java @@ -23,6 +23,13 @@ public class Annotate { static final List SEARCH_TERM_VARIETIES = Arrays.asList(">", "->", "-->", "/"); + // Define the set of standard GRCh38 contigs (no "chr" prefix version) + private static final Set STANDARD_GRCH38_CONTIGS = Set.of( + "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", + "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", + "21", "22", "X", "Y", "MT", "M" + ); + static Comparator CUSTOM_COMPARATOR; static QLogger logger; @@ -71,70 +78,77 @@ public int engage() throws Exception { ChrPosition lastCP = null; try ( - VcfFileReader reader = new VcfFileReader(inputFile)) { + VcfFileReader reader = new VcfFileReader(inputFile)) { logger.info("VcfFileReader has been setup"); int vcfCount = 0; + int nonStandardContigCount = 0; for (VcfRecord vcf : reader) { vcfCount++; ChrPosition thisVcfsCP = vcf.getChrPositionRefAlt(); logger.debug("thisVcfsCP: " + thisVcfsCP.toIGVString()); + boolean isStandardContig = isStandardContig(thisVcfsCP); + if (isStandardContig) { - /* - * check that this CP is "after" the last CP - */ - int compare = null != lastCP ? ((ChrPositionRefAlt) thisVcfsCP).compareTo((ChrPositionRefAlt) lastCP) : 0; - if (compare < 0) { - throw new IllegalArgumentException("Incorrect order of vcf records in input vcf file! 
this vcf: " + thisVcfsCP.toIGVString() + ", last vcf: " + lastCP.toIGVString()); - } - - String alt = ((ChrPositionRefAlt) thisVcfsCP).getAlt(); - String gatkAD = VcfUtils.getFormatField(vcf.getFormatFields(), "AD", 0); - String gatkGT = VcfUtils.getFormatField(vcf.getFormatFields(), "GT", 0); - - if (alt.contains(",")) { - logger.info("alt has comma: " + thisVcfsCP); /* - * split record, create new ChrPositions for each + * check that this CP is "after" the last CP */ - String[] altArray = alt.split(","); - List splitVcfs = new ArrayList<>(); - for (String thisAlt : altArray) { - if (thisAlt.equals("*")) { - /* - * ignore - */ - } else { - VcfRecord newVcf = VcfUtils.cloneWithNewAlt(vcf, thisAlt); - splitVcfs.add(newVcf); - } + int compare = null != lastCP ? ((ChrPositionRefAlt) thisVcfsCP).compareTo((ChrPositionRefAlt) lastCP) : 0; + if (compare < 0) { + throw new IllegalArgumentException("Incorrect order of vcf records in input vcf file! this vcf: " + thisVcfsCP.toIGVString() + ", last vcf: " + lastCP.toIGVString()); } - if (splitVcfs.size() > 1) { + + + String alt = ((ChrPositionRefAlt) thisVcfsCP).getAlt(); + String gatkAD = VcfUtils.getFormatField(vcf.getFormatFields(), "AD", 0); + String gatkGT = VcfUtils.getFormatField(vcf.getFormatFields(), "GT", 0); + + if (alt.contains(",")) { + logger.info("alt has comma: " + thisVcfsCP); /* - * sort + * split record, create new ChrPositions for each */ - splitVcfs.sort(null); - } - for (VcfRecord splitVcf : splitVcfs) { - List annotations = new ArrayList<>(getAnnotationsForPosition(splitVcf.getChrPositionRefAlt(), annotationSources, executor)); - queue.add(new ChrPositionAnnotations(splitVcf.getChrPositionRefAlt(), annotations, gatkAD, gatkGT, alt)); - } + String[] altArray = alt.split(","); + List splitVcfs = new ArrayList<>(); + for (String thisAlt : altArray) { + if (thisAlt.equals("*")) { + /* + * ignore + */ + } else { + VcfRecord newVcf = VcfUtils.cloneWithNewAlt(vcf, thisAlt); + splitVcfs.add(newVcf); + } + } + 
if (splitVcfs.size() > 1) { + /* + * sort + */ + splitVcfs.sort(null); + } + for (VcfRecord splitVcf : splitVcfs) { + List annotations = new ArrayList<>(getAnnotationsForPosition(splitVcf.getChrPositionRefAlt(), annotationSources, executor)); + queue.add(new ChrPositionAnnotations(splitVcf.getChrPositionRefAlt(), annotations, gatkAD, gatkGT, alt)); + } - } else { + } else { - logger.debug("about to get annotations for: " + thisVcfsCP.toIGVString()); - List annotations = getAnnotationsForPosition(thisVcfsCP, annotationSources, executor); - logger.debug("got annotations for: " + thisVcfsCP.toIGVString() + " - adding to queue"); - queue.add(new ChrPositionAnnotations(thisVcfsCP, annotations, gatkAD, gatkGT, alt)); + logger.debug("about to get annotations for: " + thisVcfsCP.toIGVString()); + List annotations = getAnnotationsForPosition(thisVcfsCP, annotationSources, executor); + logger.debug("got annotations for: " + thisVcfsCP.toIGVString() + " - adding to queue"); + queue.add(new ChrPositionAnnotations(thisVcfsCP, annotations, gatkAD, gatkGT, alt)); - } + } - lastCP = thisVcfsCP; + lastCP = thisVcfsCP; + } else { + nonStandardContigCount++; + } } - logger.info("# of vcf records: " + vcfCount); + logger.info("# of vcf records: " + vcfCount + ", # of non-standard contigs: " + nonStandardContigCount); } finally { /* * count down the count down latch @@ -147,6 +161,10 @@ public int engage() throws Exception { return exitStatus; } + private boolean isStandardContig(ChrPosition thisVcfsCP) { + return thisVcfsCP.getChromosome().startsWith("chr") ? STANDARD_GRCH38_CONTIGS.contains(thisVcfsCP.getChromosome().substring(3)) : STANDARD_GRCH38_CONTIGS.contains(thisVcfsCP.getChromosome()); + } + private static List getAnnotationsForPosition(ChrPosition cp, List annotationSources, Executor executor) { long contigAndPosition = ((ChrPositionUtils.convertContigAndPositionToLong(cp.getChromosome().startsWith("chr") ? 
cp.getChromosome().substring(3) : cp.getChromosome(), cp.getStartPosition()))); diff --git a/qcommon/src/org/qcmg/common/util/ChrPositionUtils.java b/qcommon/src/org/qcmg/common/util/ChrPositionUtils.java index 5edc5642e..b60cb17a6 100644 --- a/qcommon/src/org/qcmg/common/util/ChrPositionUtils.java +++ b/qcommon/src/org/qcmg/common/util/ChrPositionUtils.java @@ -64,11 +64,14 @@ public static int convertContigNameToInt(String contigName) { if (null == contigName || contigName.isEmpty()) { throw new IllegalArgumentException("null or empty contig name supplied to convertContigNameToInt"); } - int i = Character.isDigit(contigName.charAt(0)) ? Integer.parseInt(contigName) : -1; - if (i > -1) { - return i; + // a contig name made up entirely of digits (e.g. "22") is returned as that int + // names that merely start with a digit (e.g. "22_KI270739v1_random") are not parsed as numbers; + // they fall through to the "chr"-prefix and named-contig handling below + if (isDigits(contigName)) { + return Integer.parseInt(contigName); } + if (contigName.length() > 3 && contigName.startsWith("chr")) { return convertContigNameToInt(contigName.substring(3)); } @@ -81,6 +84,10 @@ public static int convertContigNameToInt(String contigName) { }; } + public static boolean isDigits(String str) { + return str != null && !str.isEmpty() && str.chars().allMatch(Character::isDigit); + } + /** * Checks if two ChrPosition objects overlap with a buffer. 
* diff --git a/qcommon/test/org/qcmg/common/util/ChrPositionUtilsTest.java b/qcommon/test/org/qcmg/common/util/ChrPositionUtilsTest.java index 9c5f47b95..5f2f39cdd 100644 --- a/qcommon/test/org/qcmg/common/util/ChrPositionUtilsTest.java +++ b/qcommon/test/org/qcmg/common/util/ChrPositionUtilsTest.java @@ -35,6 +35,61 @@ public void testDelta() { assertTrue(ChrPositionUtils.arePositionsWithinDelta(cp1, cp2, 4)); } + @Test + public void testConvertContigNameToInt_NumericContig() { + assertEquals(1, ChrPositionUtils.convertContigNameToInt("1")); + assertEquals(22, ChrPositionUtils.convertContigNameToInt("22")); + } + + @Test + public void testConvertContigNameToInt_ChromosomeWithChrPrefix() { + assertEquals(1, ChrPositionUtils.convertContigNameToInt("chr1")); + assertEquals(22, ChrPositionUtils.convertContigNameToInt("chr22")); + } + + @Test + public void testConvertContigNameToInt_SexChromosomes() { + assertEquals(23, ChrPositionUtils.convertContigNameToInt("X")); + assertEquals(24, ChrPositionUtils.convertContigNameToInt("Y")); + } + + @Test + public void testConvertContigNameToInt_Mitochondrial() { + assertEquals(25, ChrPositionUtils.convertContigNameToInt("M")); + assertEquals(25, ChrPositionUtils.convertContigNameToInt("MT")); + } + + @Test + public void testConvertContigNameToInt_ChromosomeWithChrPrefixSpecialCases() { + assertEquals(23, ChrPositionUtils.convertContigNameToInt("chrX")); + assertEquals(24, ChrPositionUtils.convertContigNameToInt("chrY")); + assertEquals(25, ChrPositionUtils.convertContigNameToInt("chrM")); + } + + @Test + public void testConvertContigNameToInt_AltChromosome() { + assertEquals("22_KI270739v1_random".hashCode(), ChrPositionUtils.convertContigNameToInt("chr22_KI270739v1_random")); + assertEquals("Y_KI270740v1_random".hashCode(), ChrPositionUtils.convertContigNameToInt("chrY_KI270740v1_random")); + assertEquals("Un_KI270302v1".hashCode(), ChrPositionUtils.convertContigNameToInt("chrUn_KI270302v1")); + } + + @Test + public void 
testConvertContigNameToInt_OtherValues() { + // For other values, it should return hashCode + String contig = "other"; + assertEquals(contig.hashCode(), ChrPositionUtils.convertContigNameToInt(contig)); + } + + @Test(expected = IllegalArgumentException.class) + public void testConvertContigNameToInt_NullInput() { + ChrPositionUtils.convertContigNameToInt(null); + } + + @Test(expected = IllegalArgumentException.class) + public void testConvertContigNameToInt_EmptyInput() { + ChrPositionUtils.convertContigNameToInt(""); + } + @Test public void testConvertChrPositionToLong() { long expected = ((long) 4 << 32) + 9;