diff --git a/qannotate/src/au/edu/qimr/qannotate/modes/IndelConfidenceMode.java b/qannotate/src/au/edu/qimr/qannotate/modes/IndelConfidenceMode.java index 8403fdbfc..7d2048fc7 100644 --- a/qannotate/src/au/edu/qimr/qannotate/modes/IndelConfidenceMode.java +++ b/qannotate/src/au/edu/qimr/qannotate/modes/IndelConfidenceMode.java @@ -34,7 +34,7 @@ /** * @author christix - * annotate whether indel is high, low or zero confidence + * annotate whether indel is high, low, or zero confidence * */ public class IndelConfidenceMode extends AbstractMode{ @@ -47,10 +47,12 @@ public class IndelConfidenceMode extends AbstractMode{ private static final float DEFAULT_SSOI = 0.2f; public static final int DEFAULT_HOMN = 6; private final Map mask = new HashMap<>(); + + private int homopolymerCutoff = IndelConfidenceMode.DEFAULT_HOMN; //filters private static final String FILTER_REPEAT = "REPEAT"; - private static final String DESCRITPION_INFO_CONFIDENCE = "set to HIGH if the variants passed all filter, " + private static final String DESCRIPTION_INFO_CONFIDENCE = "set to HIGH if the variants passed all filter, " + "nearby homopolymer sequence base less than six and less than 10% reads contains nearby indel; set to Zero if " + "coverage more than 1000, or fallen in repeat region; set to LOW for reminding variants"; @@ -68,36 +70,42 @@ public IndelConfidenceMode(Options options) throws Exception{ input = options.getInputFileName(); output = options.getOutputFileName(); commandLine = options.getCommandLine(); + options.getHomoplymersCutoff().ifPresent(i -> homopolymerCutoff = i); logger.tool("input: " + options.getInputFileName()); logger.tool("mask File: " + options.getDatabaseFileName() ); logger.tool("output annotated records: " + options.getOutputFileName()); logger.tool("logger file " + options.getLogFileName()); logger.tool("logger level " + (options.getLogLevel() == null ? QLoggerFactory.DEFAULT_LEVEL.getName() : options.getLogLevel())); + logger.tool("Homopolymer cutoff (will add to filter if value is greater than or equal to cutoff): " + homopolymerCutoff); addAnnotation(options.getDatabaseFileName() ); } /** - * load repeat region into RAM - * @param dbfile - * @throws Exception - */ + * Loads repeat region data from the specified file and populates a BitSet for masking. + * Each line in the file is processed to extract chromosome, start, and end positions, + * which are then used to update the mask data structure. + * + * @param dbfile the path to the file containing repeat regions. The file should consist of lines + * where each line contains chromosome, start, and end positions of the repeat regions. + * @throws IOException if an I/O error occurs while reading the file. + */ private void loadMask(String dbfile) throws IOException { //load repeat region to bitset try(BufferedReader reader = new BufferedReader(new FileReader(dbfile))){ String line; while (( line = reader.readLine()) != null) { - if ( ! line.startsWith("geno")) { - String[] array = line.split(" "); - //int no = Integer.parseInt(array[0]) - 1; - String chr = IndelUtils.getFullChromosome(array[0]); - - int start = Integer.parseInt(array[1]); - int end = Integer.parseInt(array[2]); - - mask.computeIfAbsent(chr, (v) -> new BitSet()).set(start,end); - } + if ( ! line.startsWith("geno")) { + String[] array = line.split(" "); + //int no = Integer.parseInt(array[0]) - 1; + String chr = IndelUtils.getFullChromosome(array[0]); + + int start = Integer.parseInt(array[1]); + int end = Integer.parseInt(array[2]); + + mask.computeIfAbsent(chr, (v) -> new BitSet()).set(start,end); + } } } } @@ -117,11 +125,11 @@ MafConfidence getConfidence(VcfRecord vcf){ float ssoi = (info.getField(VcfHeaderUtils.INFO_SOMATIC) != null) ? 1 : StringUtils.string2Number(vcf.getInfoRecord().getField(IndelUtils.INFO_SSOI), Float.class); - //check homoplymers + //check homopolymers int lhomo = (info.getField(VcfHeaderUtils.INFO_HOM) == null)? 1 : StringUtils.string2Number(info.getField(VcfHeaderUtils.INFO_HOM).split(",")[0], Integer.class); - if(nioc <= DEFAULT_NIOC && lhomo <= DEFAULT_HOMN && ssoi >= DEFAULT_SSOI) return MafConfidence.HIGH; + if(nioc <= DEFAULT_NIOC && lhomo <= homopolymerCutoff && ssoi >= DEFAULT_SSOI) return MafConfidence.HIGH; } else if (filter.equals(IndelUtils.FILTER_HCOVN) || filter.equals(IndelUtils.FILTER_HCOVT) || filter.equals(FILTER_REPEAT) || filter.contains(Constants.SEMI_COLON + FILTER_REPEAT + Constants.SEMI_COLON) || @@ -134,7 +142,16 @@ MafConfidence getConfidence(VcfRecord vcf){ } - private boolean isRepeat(VcfRecord vcf){ + /** + * Determines if the given VCF record falls within a repeat region. + * It checks the specified chromosome and position range in the associated + * BitSet mask for repeat regions. + * + * @param vcf a {@code VcfRecord} representing the variant call to check for repeat regions. + * This includes details such as chromosome, position, and other metadata. + * @return {@code true} if the variant falls within a repeat region; {@code false} otherwise. + */ + private boolean isRepeat(VcfRecord vcf){ String chr = IndelUtils.getFullChromosome(vcf.getChromosome()); BitSet chrMask = mask.get(chr); @@ -158,14 +175,14 @@ void addAnnotation(String dbfile) throws IOException { long count = 0; long repeatCount = 0; - HashSet posCheck = new HashSet(); + HashSet posCheck = new HashSet<>(); try (VcfFileReader reader = new VcfFileReader(input) ; RecordWriter writer = new RecordWriter<>(new File(output )) ) { //reheader VcfHeader hd = reader.getVcfHeader(); hd.addFilter(FILTER_REPEAT, DESCRIPTION_FILTER_REPEAT ); - hd.addInfo(VcfHeaderUtils.INFO_CONFIDENCE, "1", "String", DESCRITPION_INFO_CONFIDENCE); + hd.addInfo(VcfHeaderUtils.INFO_CONFIDENCE, "1", "String", DESCRIPTION_INFO_CONFIDENCE); hd = reheader(hd, commandLine ,input); for(final VcfHeaderRecord record: hd) @@ -183,8 +200,8 @@ void addAnnotation(String dbfile) throws IOException { writer.add(vcf); } } - logger.info(String.format("outputed %d VCF record, happend on %d variants location.", count , posCheck.size())); - logger.info("number of variants fallen into repeat region is " + repeatCount); + logger.info(String.format("outputted %d VCF records in %d locations.", count , posCheck.size())); + logger.info("number of variants in repeat regions: " + repeatCount); } diff --git a/qannotate/test/au/edu/qimr/qannotate/modes/IndelConfidenceModeTest.java b/qannotate/test/au/edu/qimr/qannotate/modes/IndelConfidenceModeTest.java index 8d7c21ce4..030c54e19 100644 --- a/qannotate/test/au/edu/qimr/qannotate/modes/IndelConfidenceModeTest.java +++ b/qannotate/test/au/edu/qimr/qannotate/modes/IndelConfidenceModeTest.java @@ -1,9 +1,5 @@ package au.edu.qimr.qannotate.modes; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; -import static org.junit.Assert.fail; - import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; @@ -21,6 +17,8 @@ import au.edu.qimr.qannotate.Main; +import static org.junit.Assert.*; + public class IndelConfidenceModeTest { public static String dbMask = "repeat.mask"; public static String input = "input.vcf"; @@ -37,7 +35,7 @@ public void clear(){ } @Test - public void InfoTest(){ + public void infoTest(){ IndelConfidenceMode mode = new IndelConfidenceMode(); @@ -45,63 +43,71 @@ public void InfoTest(){ // String str = "chr1 11303744 . C CA 37.73 HOM5 SOMATIC;HOMTXT=AGCCTGTCTCaAAAAAAAAAA;NIOC=0.087;SVTYPE=INS;END=11303745 GT:AD:DP:GQ:PL:ACINDEL .:.:.:.:.:0,39,36,0[0,0],0,4,4 0/1:30,10:40:75:75,0,541:7,80,66,8[4,4],1,7,8"; VcfRecord vcf = new VcfRecord(str.split("\\t")); - assertTrue(mode.getConfidence(vcf) == MafConfidence.HIGH); + assertSame(MafConfidence.HIGH, mode.getConfidence(vcf)); //HOMCNTXT is no longer checked vcf.setInfo("SOMATIC;HOMCNTXT=9,AGCCTGTCTCaAAAAAAAAAA;SVTYPE=INS;END=11303745"); - assertTrue(mode.getConfidence(vcf) == MafConfidence.HIGH); - vcf.setInfo("SOMATIC;HOM=9,AGCCTGTCTCaAAAAAAAAAA;SVTYPE=INS;END=11303745"); - assertTrue(mode.getConfidence(vcf) == MafConfidence.LOW); + assertSame(MafConfidence.HIGH, mode.getConfidence(vcf)); + vcf.setInfo("SOMATIC;HOM=9,AGCCTGTCTCaAAAAAAAAAA;SVTYPE=INS;END=11303745"); + assertSame(MafConfidence.LOW, mode.getConfidence(vcf)); //no homopolymers (repeat) vcf = new VcfRecord(str.split("\\t")); vcf.setInfo("SOMATIC;NIOC=0.087;SVTYPE=INS;END=11303745"); - assertTrue(mode.getConfidence(vcf) == MafConfidence.HIGH); + assertSame(MafConfidence.HIGH, mode.getConfidence(vcf)); //somatic SSOI vcf.setInfo("SOMATIC;NIOC=0.087;SSOI=0.1;SVTYPE=INS;END=11303745"); - assertTrue(mode.getConfidence(vcf) == MafConfidence.HIGH); + assertSame(MafConfidence.HIGH, mode.getConfidence(vcf)); //germline SSOI high vcf.setInfo("NIOC=0.087;SSOI=0.2;SVTYPE=INS;END=11303745"); - assertTrue(mode.getConfidence(vcf) == MafConfidence.HIGH); + assertSame(MafConfidence.HIGH, mode.getConfidence(vcf)); //germline SSOI low vcf.setInfo("NIOC=0.087;SSOI=0.1;SVTYPE=INS;END=11303745"); - assertTrue(mode.getConfidence(vcf) == MafConfidence.LOW); + assertSame(MafConfidence.LOW, mode.getConfidence(vcf)); //9 base repeat //vcf.setFilter(IndelUtils.FILTER_HOM + "9"); vcf.setInfo("SOMATIC;HOM=9,AGCCTGTCTCaAAAAAAAAAA;NIOC=0.087;SVTYPE=INS;END=11303745"); - assertTrue(mode.getConfidence(vcf) == MafConfidence.LOW); + assertSame(MafConfidence.LOW, mode.getConfidence(vcf)); //no repeat but too many nearby indel vcf.setInfo("SOMATIC;HOM=5,AGCCTGTCTCaAAAAAAAAAA;NIOC=0.187;SVTYPE=INS;END=11303745"); - assertTrue(mode.getConfidence(vcf) == MafConfidence.LOW); + assertSame(MafConfidence.LOW, mode.getConfidence(vcf)); //vcf.setFilter(IndelUtils.FILTER_HOM + "3" ); vcf.setInfo("SOMATIC;HOM=3,AGCCTGTCTCaAAAAAAAAAA;NIOC=0.087;SVTYPE=INS;END=11303745"); - assertTrue(mode.getConfidence(vcf) == MafConfidence.HIGH); + assertSame(MafConfidence.HIGH, mode.getConfidence(vcf)); vcf.setFilter(IndelUtils.FILTER_MIN ); - assertTrue(mode.getConfidence(vcf) == MafConfidence.LOW); + assertSame(MafConfidence.LOW, mode.getConfidence(vcf)); } @Test - public void confidenceRealLifeIndel() throws Exception { + public void confidenceRealLifeIndel() { //chr2 29432822 . C CGCGAATT . HCOVT AC=2;AF=1.00;AN=2;DP=1174;FS=0.000;MLEAC=2;MLEAF=1.00;MQ=59.98;QD=7.19;SOR=11.114;HOM=0,GGATTTTATCgcgaattTCCAAGCTGA;CONF=ZERO GT:GD:AD:DP:GQ:PL .:.:.:.:.:. 1/1:CGCGAATT/CGCGAATT:0,258:258:99:8481,606,0 VcfRecord vcf1 = new VcfRecord(new String[]{"chr2","29432822",".","C","CGCGAATT",".","HCOVT","AC=2;AF=1.00;AN=2;DP=1174;FS=0.000;MLEAC=2;MLEAF=1.00;MQ=59.98;QD=7.19;SOR=11.114;HOM=0,GGATTTTATCgcgaattTCCAAGCTGA","GT:GD:AD:DP:GQ:PL"," .:.:.:.:.:.","1/1:CGCGAATT/CGCGAATT:0,258:258:99:8481,606,0"}); - IndelConfidenceMode cm =new IndelConfidenceMode(); - assertTrue(cm.getConfidence(vcf1) == MafConfidence.ZERO); + IndelConfidenceMode cm = new IndelConfidenceMode(); + assertSame(MafConfidence.ZERO, cm.getConfidence(vcf1)); } + @Test + public void confidenceRealLifeIndel2() { + //chr2 29432822 . C CGCGAATT . HCOVT AC=2;AF=1.00;AN=2;DP=1174;FS=0.000;MLEAC=2;MLEAF=1.00;MQ=59.98;QD=7.19;SOR=11.114;HOM=0,GGATTTTATCgcgaattTCCAAGCTGA;CONF=ZERO GT:GD:AD:DP:GQ:PL .:.:.:.:.:. 1/1:CGCGAATT/CGCGAATT:0,258:258:99:8481,606,0 + VcfRecord vcf1 = new VcfRecord(new String[]{"chr2","29432822",".","C","CGCGAATT",".","HCOVT","AC=2;AF=1.00;AN=2;DP=1174;FS=0.000;MLEAC=2;MLEAF=1.00;MQ=59.98;QD=7.19;SOR=11.114;HOM=0,GGATTTTATCgcgaattTCCAAGCTGA","GT:GD:AD:DP:GQ:PL"," .:.:.:.:.:.","1/1:CGCGAATT/CGCGAATT:0,258:258:99:8481,606,0"}); + + IndelConfidenceMode cm = new IndelConfidenceMode(); + assertSame(MafConfidence.ZERO, cm.getConfidence(vcf1)); + } @Test - public void MaskTest() throws Exception{ + public void maskTest() { try{ createMask(); @@ -118,44 +124,87 @@ public void MaskTest() throws Exception{ i ++; if(i == 1) //chr1 53741 . CTT C - assertTrue( re.getInfoRecord().getField(VcfHeaderUtils.INFO_CONFIDENCE).equals( MafConfidence.HIGH.name())); + assertEquals(re.getInfoRecord().getField(VcfHeaderUtils.INFO_CONFIDENCE), MafConfidence.HIGH.name()); else if(i == 2) //chr1 53742 . C CA - assertTrue( re.getInfoRecord().getField(VcfHeaderUtils.INFO_CONFIDENCE).equals( MafConfidence.HIGH.name())); + assertEquals(re.getInfoRecord().getField(VcfHeaderUtils.INFO_CONFIDENCE), MafConfidence.HIGH.name()); else if(i == 3) //chr1 53742 . CTT C - assertTrue( re.getInfoRecord().getField(VcfHeaderUtils.INFO_CONFIDENCE).equals( MafConfidence.ZERO.name())); + assertEquals(re.getInfoRecord().getField(VcfHeaderUtils.INFO_CONFIDENCE), MafConfidence.ZERO.name()); else if(i == 4) //chr1 53743 . C CA //the insertion happened bwt 53743 and 53744, so it is before repeat region - assertTrue( re.getInfoRecord().getField(VcfHeaderUtils.INFO_CONFIDENCE).equals( MafConfidence.HIGH.name())); + assertEquals(re.getInfoRecord().getField(VcfHeaderUtils.INFO_CONFIDENCE), MafConfidence.HIGH.name()); else if(i == 5) //chr1 53744 . CTT C - assertTrue( re.getInfoRecord().getField(VcfHeaderUtils.INFO_CONFIDENCE).equals( MafConfidence.ZERO.name())); + assertEquals(re.getInfoRecord().getField(VcfHeaderUtils.INFO_CONFIDENCE), MafConfidence.ZERO.name()); } assertEquals(5, i); } - }catch(Exception e){ + } catch(Exception e){ System.out.println(e.getMessage()); - - fail( "My method throw expection" ); + fail( "My method threw an exception" ); } } + + @Test + public void homopolymerOptionTest() { + try{ + createMask(); + createHomVcf(); + + String[] args ={"--mode","IndelConfidence", "-i", new File(input).getAbsolutePath(), "-o", new File(output).getAbsolutePath(), + "-d", new File(dbMask).getAbsolutePath(), "--log", new File(log).getAbsolutePath()}; + Main.main(args); + + try (VcfFileReader reader = new VcfFileReader(new File(output))) { + int i = 0; + for ( VcfRecord re : reader) { + if (++i == 1) { + assertEquals(MafConfidence.LOW.name(), re.getInfoRecord().getField(VcfHeaderUtils.INFO_CONFIDENCE)); + } else if (i == 2) { + assertEquals(MafConfidence.HIGH.name(), re.getInfoRecord().getField(VcfHeaderUtils.INFO_CONFIDENCE)); + } + } + assertEquals(2, i); + } + + args = new String[]{"--mode", "IndelConfidence", "-i", new File(input).getAbsolutePath(), "-o", new File(output).getAbsolutePath(), + "-d", new File(dbMask).getAbsolutePath(), "--log", new File(log).getAbsolutePath(), "--homCutoff", "8"}; + Main.main(args); + + try (VcfFileReader reader = new VcfFileReader(new File(output))) { + int i = 0; + for ( VcfRecord re : reader) { + if (++i == 1) { + assertEquals(MafConfidence.HIGH.name(), re.getInfoRecord().getField(VcfHeaderUtils.INFO_CONFIDENCE)); + } else if (i == 2) { + assertEquals(MafConfidence.HIGH.name(), re.getInfoRecord().getField(VcfHeaderUtils.INFO_CONFIDENCE)); + } + } + assertEquals(2, i); + } + + } catch(Exception e){ + System.out.println(e.getMessage()); + fail( "My method threw an exception" ); + } + } public static void createMask() throws IOException{ - final List data = new ArrayList(); + final List data = new ArrayList<>(); data.add("genoName genoStart genoEnd repClass repFamily"); data.add("1 53744 53780 Low_complexity Low_complexity"); data.add("chr1 54712 54820 Simple_repeat Simple_repeat"); - try(BufferedWriter out = new BufferedWriter(new FileWriter(dbMask));) { + try(BufferedWriter out = new BufferedWriter(new FileWriter(dbMask))) { for (final String line : data) out.write(line +"\n"); } } public static void createVcf() throws IOException{ - final List data = new ArrayList(); + final List data = new ArrayList<>(); data.add("##fileformat=VCFv4.0"); data.add(VcfHeaderUtils.STANDARD_UUID_LINE + "=abcd_12345678_xzy_999666333"); data.add("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"); @@ -166,9 +215,21 @@ public static void createVcf() throws IOException{ data.add("chr1 53743 . C CA 37.73 PASS SOMATIC;HOMCNTXT=5,AGCCTGTCTCaAAAAAAAAAA;NIOC=0.087"); data.add("chr1 53744 . CTT C 37.73 PASS SOMATIC;HOMCNTXT=5,AGCCTGTCTCaAAAAAAAAAA;NIOC=0.087"); - try(BufferedWriter out = new BufferedWriter(new FileWriter(input));) { + try(BufferedWriter out = new BufferedWriter(new FileWriter(input))) { for (final String line : data) out.write(line +"\n"); } } + public static void createHomVcf() throws IOException{ + final List data = new ArrayList<>(); + data.add("##fileformat=VCFv4.0"); + data.add(VcfHeaderUtils.STANDARD_UUID_LINE + "=abcd_12345678_xzy_999666333"); + data.add("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t29ab944c-f37f-4531-95ac-57c52c53be6f\t78ab1438-d237-4cff-a76a-835320a5fb0f"); + data.add("chr1\t897580\trs397863599\tAT\tA\t404.73\tPASS\tBaseQRankSum=1.304;ClippingRankSum=0.000;DP=50;ExcessHet=3.0103;FS=2.463;MQ=60.00;MQRankSum=0.000;QD=8.26;ReadPosRankSum=-0.193;SOR=0.751;NIOC=0;SSOI=0.286;SVTYPE=DEL;END=897581;IN=1;VLD;DB;VAF=0.696;HOM=8,GACTCACTGA_TTTTTTTCTA\tGT:AD:DP:G;D:GQ:PL:ACINDEL\t0/1:29,20:49:AT/A:99:442,0,732:14,50,49,14[7,7],18[18],0,0,1\t0/1:10,51:61:AT/A:99:1367,0,121:43,61,61,48[21,27],48[43],0,0,1"); + data.add("chr1\t1303675\trs60404130\tC\tCCCT\t1037.73\tPASS\tBaseQRankSum=1.750;ClippingRankSum=0.000;DP=44;ExcessHet=3.0103;FS=13.869;MQ=59.81;MQRankSum=0.000;QD=30.90;ReadPosRankSum=2.125;SOR=1.634;NIOC=0.030;SSOI=0.758;SVTYPE=INS;END=1303676;IN=1;VLD;DB;VAF=0.86;HOM=5,CCTGTTGCCCcctCCTCTTTCTC\tGT:AD:DP:GD:GQ:PL:ACINDEL\t1/1:3,24:27:CCCT/CCCT:72:1075,72,0:23,33,33,25[6,19],28[25],0,1,2\t1/1:5,17:22:CCCT/CCCT:23:537,23,0:18,34,33,18[10,8],21[21],1,1,4"); + try(BufferedWriter out = new BufferedWriter(new FileWriter(input))) { + for (final String line : data) out.write(line +"\n"); + } + + } }