Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 40 additions & 23 deletions qannotate/src/au/edu/qimr/qannotate/modes/IndelConfidenceMode.java
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@

/**
* @author christix
* annotate whether indel is high, low or zero confidence
* annotate whether indel is high, low, or zero confidence
*
*/
public class IndelConfidenceMode extends AbstractMode{
Expand All @@ -47,10 +47,12 @@ public class IndelConfidenceMode extends AbstractMode{
private static final float DEFAULT_SSOI = 0.2f;
public static final int DEFAULT_HOMN = 6;
private final Map<String,BitSet> mask = new HashMap<>();

private int homopolymerCutoff = IndelConfidenceMode.DEFAULT_HOMN;

//filters
private static final String FILTER_REPEAT = "REPEAT";
private static final String DESCRITPION_INFO_CONFIDENCE = "set to HIGH if the variants passed all filter, "
private static final String DESCRIPTION_INFO_CONFIDENCE = "set to HIGH if the variants passed all filter, "
+ "nearby homopolymer sequence base less than six and less than 10% reads contains nearby indel; set to Zero if "
+ "coverage more than 1000, or fallen in repeat region; set to LOW for reminding variants";

Expand All @@ -68,36 +70,42 @@ public IndelConfidenceMode(Options options) throws Exception{
input = options.getInputFileName();
output = options.getOutputFileName();
commandLine = options.getCommandLine();
options.getHomoplymersCutoff().ifPresent(i -> homopolymerCutoff = i);

logger.tool("input: " + options.getInputFileName());
logger.tool("mask File: " + options.getDatabaseFileName() );
logger.tool("output annotated records: " + options.getOutputFileName());
logger.tool("logger file " + options.getLogFileName());
logger.tool("logger level " + (options.getLogLevel() == null ? QLoggerFactory.DEFAULT_LEVEL.getName() : options.getLogLevel()));
logger.tool("Homopolymer cutoff (will add to filter if value is greater than or equal to cutoff): " + homopolymerCutoff);

addAnnotation(options.getDatabaseFileName() );
}

/**
* load repeat region into RAM
* @param dbfile
* @throws Exception
*/
* Loads repeat region data from the specified file and populates a BitSet for masking.
* Each line in the file is processed to extract chromosome, start, and end positions,
* which are then used to update the mask data structure.
*
* @param dbfile the path to the file containing repeat regions. The file should consist of lines
* where each line contains chromosome, start, and end positions of the repeat regions.
* @throws IOException if an I/O error occurs while reading the file.
*/
private void loadMask(String dbfile) throws IOException {
//load repeat region to bitset
try(BufferedReader reader = new BufferedReader(new FileReader(dbfile))){
String line;
while (( line = reader.readLine()) != null) {
if ( ! line.startsWith("geno")) {
String[] array = line.split(" ");
//int no = Integer.parseInt(array[0]) - 1;
String chr = IndelUtils.getFullChromosome(array[0]);
int start = Integer.parseInt(array[1]);
int end = Integer.parseInt(array[2]);
mask.computeIfAbsent(chr, (v) -> new BitSet()).set(start,end);
}
if ( ! line.startsWith("geno")) {
String[] array = line.split(" ");
//int no = Integer.parseInt(array[0]) - 1;
String chr = IndelUtils.getFullChromosome(array[0]);

int start = Integer.parseInt(array[1]);
int end = Integer.parseInt(array[2]);

mask.computeIfAbsent(chr, (v) -> new BitSet()).set(start,end);
}
}
}
}
Expand All @@ -117,11 +125,11 @@ MafConfidence getConfidence(VcfRecord vcf){
float ssoi = (info.getField(VcfHeaderUtils.INFO_SOMATIC) != null) ?
1 : StringUtils.string2Number(vcf.getInfoRecord().getField(IndelUtils.INFO_SSOI), Float.class);

//check homoplymers
//check homopolymers
int lhomo = (info.getField(VcfHeaderUtils.INFO_HOM) == null)? 1 :
StringUtils.string2Number(info.getField(VcfHeaderUtils.INFO_HOM).split(",")[0], Integer.class);

if(nioc <= DEFAULT_NIOC && lhomo <= DEFAULT_HOMN && ssoi >= DEFAULT_SSOI) return MafConfidence.HIGH;
if(nioc <= DEFAULT_NIOC && lhomo <= homopolymerCutoff && ssoi >= DEFAULT_SSOI) return MafConfidence.HIGH;

} else if (filter.equals(IndelUtils.FILTER_HCOVN) || filter.equals(IndelUtils.FILTER_HCOVT) ||
filter.equals(FILTER_REPEAT) || filter.contains(Constants.SEMI_COLON + FILTER_REPEAT + Constants.SEMI_COLON) ||
Expand All @@ -134,7 +142,16 @@ MafConfidence getConfidence(VcfRecord vcf){
}


private boolean isRepeat(VcfRecord vcf){
/**
* Determines if the given VCF record falls within a repeat region.
* It checks the specified chromosome and position range in the associated
* BitSet mask for repeat regions.
*
* @param vcf a {@code VcfRecord} representing the variant call to check for repeat regions.
* This includes details such as chromosome, position, and other metadata.
* @return {@code true} if the variant falls within a repeat region; {@code false} otherwise.
*/
private boolean isRepeat(VcfRecord vcf){

String chr = IndelUtils.getFullChromosome(vcf.getChromosome());
BitSet chrMask = mask.get(chr);
Expand All @@ -158,14 +175,14 @@ void addAnnotation(String dbfile) throws IOException {

long count = 0;
long repeatCount = 0;
HashSet<ChrPosition> posCheck = new HashSet<ChrPosition>();
HashSet<ChrPosition> posCheck = new HashSet<>();
try (VcfFileReader reader = new VcfFileReader(input) ;
RecordWriter<VcfRecord> writer = new RecordWriter<>(new File(output )) ) {

//reheader
VcfHeader hd = reader.getVcfHeader();
hd.addFilter(FILTER_REPEAT, DESCRIPTION_FILTER_REPEAT );
hd.addInfo(VcfHeaderUtils.INFO_CONFIDENCE, "1", "String", DESCRITPION_INFO_CONFIDENCE);
hd.addInfo(VcfHeaderUtils.INFO_CONFIDENCE, "1", "String", DESCRIPTION_INFO_CONFIDENCE);
hd = reheader(hd, commandLine ,input);

for(final VcfHeaderRecord record: hd)
Expand All @@ -183,8 +200,8 @@ void addAnnotation(String dbfile) throws IOException {
writer.add(vcf);
}
}
logger.info(String.format("outputed %d VCF record, happend on %d variants location.", count , posCheck.size()));
logger.info("number of variants fallen into repeat region is " + repeatCount);
logger.info(String.format("outputted %d VCF records in %d locations.", count , posCheck.size()));
logger.info("number of variants in repeat regions: " + repeatCount);

}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,5 @@
package au.edu.qimr.qannotate.modes;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
Expand All @@ -21,6 +17,8 @@

import au.edu.qimr.qannotate.Main;

import static org.junit.Assert.*;

public class IndelConfidenceModeTest {
public static String dbMask = "repeat.mask";
public static String input = "input.vcf";
Expand All @@ -37,71 +35,79 @@ public void clear(){
}

@Test
public void InfoTest(){
public void infoTest(){

IndelConfidenceMode mode = new IndelConfidenceMode();

String str = "chr1 11303744 . C CA 37.73 PASS SOMATIC;HOM=5,AGCCTGTCTCaAAAAAAAAAA;NIOC=0.087;SVTYPE=INS;END=11303745 GT:AD:DP:GQ:PL:ACINDEL .:.:.:.:.:0,39,36,0[0,0],0,4,4 0/1:30,10:40:75:75,0,541:7,80,66,8[4,4],1,7,8";
// String str = "chr1 11303744 . C CA 37.73 HOM5 SOMATIC;HOMTXT=AGCCTGTCTCaAAAAAAAAAA;NIOC=0.087;SVTYPE=INS;END=11303745 GT:AD:DP:GQ:PL:ACINDEL .:.:.:.:.:0,39,36,0[0,0],0,4,4 0/1:30,10:40:75:75,0,541:7,80,66,8[4,4],1,7,8";

VcfRecord vcf = new VcfRecord(str.split("\\t"));
assertTrue(mode.getConfidence(vcf) == MafConfidence.HIGH);
assertSame(MafConfidence.HIGH, mode.getConfidence(vcf));

//HOMCNTXT is no longer checked
vcf.setInfo("SOMATIC;HOMCNTXT=9,AGCCTGTCTCaAAAAAAAAAA;SVTYPE=INS;END=11303745");
assertTrue(mode.getConfidence(vcf) == MafConfidence.HIGH);
vcf.setInfo("SOMATIC;HOM=9,AGCCTGTCTCaAAAAAAAAAA;SVTYPE=INS;END=11303745");
assertTrue(mode.getConfidence(vcf) == MafConfidence.LOW);
assertSame(MafConfidence.HIGH, mode.getConfidence(vcf));
vcf.setInfo("SOMATIC;HOM=9,AGCCTGTCTCaAAAAAAAAAA;SVTYPE=INS;END=11303745");
assertSame(MafConfidence.LOW, mode.getConfidence(vcf));


//no homopolymers (repeat)
vcf = new VcfRecord(str.split("\\t"));
vcf.setInfo("SOMATIC;NIOC=0.087;SVTYPE=INS;END=11303745");
assertTrue(mode.getConfidence(vcf) == MafConfidence.HIGH);
assertSame(MafConfidence.HIGH, mode.getConfidence(vcf));

//somatic SSOI
vcf.setInfo("SOMATIC;NIOC=0.087;SSOI=0.1;SVTYPE=INS;END=11303745");
assertTrue(mode.getConfidence(vcf) == MafConfidence.HIGH);
assertSame(MafConfidence.HIGH, mode.getConfidence(vcf));

//germline SSOI high
vcf.setInfo("NIOC=0.087;SSOI=0.2;SVTYPE=INS;END=11303745");
assertTrue(mode.getConfidence(vcf) == MafConfidence.HIGH);
assertSame(MafConfidence.HIGH, mode.getConfidence(vcf));

//germline SSOI low
vcf.setInfo("NIOC=0.087;SSOI=0.1;SVTYPE=INS;END=11303745");
assertTrue(mode.getConfidence(vcf) == MafConfidence.LOW);
assertSame(MafConfidence.LOW, mode.getConfidence(vcf));

//9 base repeat
//vcf.setFilter(IndelUtils.FILTER_HOM + "9");
vcf.setInfo("SOMATIC;HOM=9,AGCCTGTCTCaAAAAAAAAAA;NIOC=0.087;SVTYPE=INS;END=11303745");
assertTrue(mode.getConfidence(vcf) == MafConfidence.LOW);
assertSame(MafConfidence.LOW, mode.getConfidence(vcf));

//no repeat but too many nearby indel
vcf.setInfo("SOMATIC;HOM=5,AGCCTGTCTCaAAAAAAAAAA;NIOC=0.187;SVTYPE=INS;END=11303745");
assertTrue(mode.getConfidence(vcf) == MafConfidence.LOW);
assertSame(MafConfidence.LOW, mode.getConfidence(vcf));


//vcf.setFilter(IndelUtils.FILTER_HOM + "3" );
vcf.setInfo("SOMATIC;HOM=3,AGCCTGTCTCaAAAAAAAAAA;NIOC=0.087;SVTYPE=INS;END=11303745");
assertTrue(mode.getConfidence(vcf) == MafConfidence.HIGH);
assertSame(MafConfidence.HIGH, mode.getConfidence(vcf));

vcf.setFilter(IndelUtils.FILTER_MIN );
assertTrue(mode.getConfidence(vcf) == MafConfidence.LOW);
assertSame(MafConfidence.LOW, mode.getConfidence(vcf));

}


@Test
public void confidenceRealLifeIndel() throws Exception {
public void confidenceRealLifeIndel() {
//chr2 29432822 . C CGCGAATT . HCOVT AC=2;AF=1.00;AN=2;DP=1174;FS=0.000;MLEAC=2;MLEAF=1.00;MQ=59.98;QD=7.19;SOR=11.114;HOM=0,GGATTTTATCgcgaattTCCAAGCTGA;CONF=ZERO GT:GD:AD:DP:GQ:PL .:.:.:.:.:. 1/1:CGCGAATT/CGCGAATT:0,258:258:99:8481,606,0
VcfRecord vcf1 = new VcfRecord(new String[]{"chr2","29432822",".","C","CGCGAATT",".","HCOVT","AC=2;AF=1.00;AN=2;DP=1174;FS=0.000;MLEAC=2;MLEAF=1.00;MQ=59.98;QD=7.19;SOR=11.114;HOM=0,GGATTTTATCgcgaattTCCAAGCTGA","GT:GD:AD:DP:GQ:PL"," .:.:.:.:.:.","1/1:CGCGAATT/CGCGAATT:0,258:258:99:8481,606,0"});

IndelConfidenceMode cm =new IndelConfidenceMode();
assertTrue(cm.getConfidence(vcf1) == MafConfidence.ZERO);
IndelConfidenceMode cm = new IndelConfidenceMode();
assertSame(MafConfidence.ZERO, cm.getConfidence(vcf1));
}
@Test
public void confidenceRealLifeIndel2() {
//chr2 29432822 . C CGCGAATT . HCOVT AC=2;AF=1.00;AN=2;DP=1174;FS=0.000;MLEAC=2;MLEAF=1.00;MQ=59.98;QD=7.19;SOR=11.114;HOM=0,GGATTTTATCgcgaattTCCAAGCTGA;CONF=ZERO GT:GD:AD:DP:GQ:PL .:.:.:.:.:. 1/1:CGCGAATT/CGCGAATT:0,258:258:99:8481,606,0
VcfRecord vcf1 = new VcfRecord(new String[]{"chr2","29432822",".","C","CGCGAATT",".","HCOVT","AC=2;AF=1.00;AN=2;DP=1174;FS=0.000;MLEAC=2;MLEAF=1.00;MQ=59.98;QD=7.19;SOR=11.114;HOM=0,GGATTTTATCgcgaattTCCAAGCTGA","GT:GD:AD:DP:GQ:PL"," .:.:.:.:.:.","1/1:CGCGAATT/CGCGAATT:0,258:258:99:8481,606,0"});

IndelConfidenceMode cm = new IndelConfidenceMode();
assertSame(MafConfidence.ZERO, cm.getConfidence(vcf1));
}

@Test
public void MaskTest() throws Exception{
public void maskTest() {

try{
createMask();
Expand All @@ -118,44 +124,87 @@ public void MaskTest() throws Exception{
i ++;
if(i == 1)
//chr1 53741 . CTT C
assertTrue( re.getInfoRecord().getField(VcfHeaderUtils.INFO_CONFIDENCE).equals( MafConfidence.HIGH.name()));
assertEquals(re.getInfoRecord().getField(VcfHeaderUtils.INFO_CONFIDENCE), MafConfidence.HIGH.name());
else if(i == 2)
//chr1 53742 . C CA
assertTrue( re.getInfoRecord().getField(VcfHeaderUtils.INFO_CONFIDENCE).equals( MafConfidence.HIGH.name()));
assertEquals(re.getInfoRecord().getField(VcfHeaderUtils.INFO_CONFIDENCE), MafConfidence.HIGH.name());
else if(i == 3)
//chr1 53742 . CTT C
assertTrue( re.getInfoRecord().getField(VcfHeaderUtils.INFO_CONFIDENCE).equals( MafConfidence.ZERO.name()));
assertEquals(re.getInfoRecord().getField(VcfHeaderUtils.INFO_CONFIDENCE), MafConfidence.ZERO.name());
else if(i == 4)
//chr1 53743 . C CA
//the insertion happened bwt 53743 and 53744, so it is before repeat region
assertTrue( re.getInfoRecord().getField(VcfHeaderUtils.INFO_CONFIDENCE).equals( MafConfidence.HIGH.name()));
assertEquals(re.getInfoRecord().getField(VcfHeaderUtils.INFO_CONFIDENCE), MafConfidence.HIGH.name());
else if(i == 5)
//chr1 53744 . CTT C
assertTrue( re.getInfoRecord().getField(VcfHeaderUtils.INFO_CONFIDENCE).equals( MafConfidence.ZERO.name()));
assertEquals(re.getInfoRecord().getField(VcfHeaderUtils.INFO_CONFIDENCE), MafConfidence.ZERO.name());
}
assertEquals(5, i);
}

}catch(Exception e){
} catch(Exception e){
System.out.println(e.getMessage());

fail( "My method throw expection" );
fail( "My method threw an exception" );
}

}

@Test
public void homopolymerOptionTest() {
try{
createMask();
createHomVcf();

String[] args ={"--mode","IndelConfidence", "-i", new File(input).getAbsolutePath(), "-o", new File(output).getAbsolutePath(),
"-d", new File(dbMask).getAbsolutePath(), "--log", new File(log).getAbsolutePath()};
Main.main(args);

try (VcfFileReader reader = new VcfFileReader(new File(output))) {
int i = 0;
for ( VcfRecord re : reader) {
if (++i == 1) {
assertEquals(MafConfidence.LOW.name(), re.getInfoRecord().getField(VcfHeaderUtils.INFO_CONFIDENCE));
} else if (i == 2) {
assertEquals(MafConfidence.HIGH.name(), re.getInfoRecord().getField(VcfHeaderUtils.INFO_CONFIDENCE));
}
}
assertEquals(2, i);
}

args = new String[]{"--mode", "IndelConfidence", "-i", new File(input).getAbsolutePath(), "-o", new File(output).getAbsolutePath(),
"-d", new File(dbMask).getAbsolutePath(), "--log", new File(log).getAbsolutePath(), "--homCutoff", "8"};
Main.main(args);

try (VcfFileReader reader = new VcfFileReader(new File(output))) {
int i = 0;
for ( VcfRecord re : reader) {
if (++i == 1) {
assertEquals(MafConfidence.HIGH.name(), re.getInfoRecord().getField(VcfHeaderUtils.INFO_CONFIDENCE));
} else if (i == 2) {
assertEquals(MafConfidence.HIGH.name(), re.getInfoRecord().getField(VcfHeaderUtils.INFO_CONFIDENCE));
}
}
assertEquals(2, i);
}

} catch(Exception e){
System.out.println(e.getMessage());
fail( "My method threw an exception" );
}
}

public static void createMask() throws IOException{
final List<String> data = new ArrayList<String>();
final List<String> data = new ArrayList<>();
data.add("genoName genoStart genoEnd repClass repFamily");
data.add("1 53744 53780 Low_complexity Low_complexity");
data.add("chr1 54712 54820 Simple_repeat Simple_repeat");
try(BufferedWriter out = new BufferedWriter(new FileWriter(dbMask));) {
try(BufferedWriter out = new BufferedWriter(new FileWriter(dbMask))) {
for (final String line : data) out.write(line +"\n");
}
}

public static void createVcf() throws IOException{
final List<String> data = new ArrayList<String>();
final List<String> data = new ArrayList<>();
data.add("##fileformat=VCFv4.0");
data.add(VcfHeaderUtils.STANDARD_UUID_LINE + "=abcd_12345678_xzy_999666333");
data.add("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO");
Expand All @@ -166,9 +215,21 @@ public static void createVcf() throws IOException{
data.add("chr1 53743 . C CA 37.73 PASS SOMATIC;HOMCNTXT=5,AGCCTGTCTCaAAAAAAAAAA;NIOC=0.087");
data.add("chr1 53744 . CTT C 37.73 PASS SOMATIC;HOMCNTXT=5,AGCCTGTCTCaAAAAAAAAAA;NIOC=0.087");

try(BufferedWriter out = new BufferedWriter(new FileWriter(input));) {
try(BufferedWriter out = new BufferedWriter(new FileWriter(input))) {
for (final String line : data) out.write(line +"\n");
}

}
public static void createHomVcf() throws IOException{
final List<String> data = new ArrayList<>();
data.add("##fileformat=VCFv4.0");
data.add(VcfHeaderUtils.STANDARD_UUID_LINE + "=abcd_12345678_xzy_999666333");
data.add("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t29ab944c-f37f-4531-95ac-57c52c53be6f\t78ab1438-d237-4cff-a76a-835320a5fb0f");
data.add("chr1\t897580\trs397863599\tAT\tA\t404.73\tPASS\tBaseQRankSum=1.304;ClippingRankSum=0.000;DP=50;ExcessHet=3.0103;FS=2.463;MQ=60.00;MQRankSum=0.000;QD=8.26;ReadPosRankSum=-0.193;SOR=0.751;NIOC=0;SSOI=0.286;SVTYPE=DEL;END=897581;IN=1;VLD;DB;VAF=0.696;HOM=8,GACTCACTGA_TTTTTTTCTA\tGT:AD:DP:G;D:GQ:PL:ACINDEL\t0/1:29,20:49:AT/A:99:442,0,732:14,50,49,14[7,7],18[18],0,0,1\t0/1:10,51:61:AT/A:99:1367,0,121:43,61,61,48[21,27],48[43],0,0,1");
data.add("chr1\t1303675\trs60404130\tC\tCCCT\t1037.73\tPASS\tBaseQRankSum=1.750;ClippingRankSum=0.000;DP=44;ExcessHet=3.0103;FS=13.869;MQ=59.81;MQRankSum=0.000;QD=30.90;ReadPosRankSum=2.125;SOR=1.634;NIOC=0.030;SSOI=0.758;SVTYPE=INS;END=1303676;IN=1;VLD;DB;VAF=0.86;HOM=5,CCTGTTGCCCcctCCTCTTTCTC\tGT:AD:DP:GD:GQ:PL:ACINDEL\t1/1:3,24:27:CCCT/CCCT:72:1075,72,0:23,33,33,25[6,19],28[25],0,1,2\t1/1:5,17:22:CCCT/CCCT:23:537,23,0:18,34,33,18[10,8],21[21],1,1,4");
try(BufferedWriter out = new BufferedWriter(new FileWriter(input))) {
for (final String line : data) out.write(line +"\n");
}

}
}