From 417a8f1c2628b41be364d7c924d589d8530a8ea6 Mon Sep 17 00:00:00 2001 From: earonesty Date: Tue, 16 Sep 2014 15:54:28 +0000 Subject: [PATCH] important: cap lookbehind since we're using a vector... maybe use a list instead? typically it's rather sparse --- clipper/varcall.cpp | 119 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 80 insertions(+), 39 deletions(-) diff --git a/clipper/varcall.cpp b/clipper/varcall.cpp index ebb403d..63173c0 100644 --- a/clipper/varcall.cpp +++ b/clipper/varcall.cpp @@ -87,6 +87,7 @@ FILE *stat_fout=NULL; #define T_CNT 8 #define b2i(c) ((c)=='A'?0:(c)=='a'?0:(c)=='C'?1:(c)=='c'?1:(c)=='G'?2:(c)=='g'?2:(c)=='T'?3:(c)=='t'?3:(c)=='*'?4:(c)=='-'?5:(c)=='+'?6:7) #define i2b(i) (i==0?'A':i==1?'C':i==2?'G':i==3?'T':i==4?'*':i==5?'-':i==6?'+':'?') +#define toupper(c) ((c)>='a' && (c)<='z' ? ((c)-32) : (c)) // die unless it's a number int ok_atoi(const char *s) { @@ -104,6 +105,7 @@ int rand_round(double x); // basic utils std::vector split(char* str, char delim); +int split(char **buf, char* str, char delim); std::string string_format(const std::string &fmt, ...); void to_upper(const std::string str); void rename_tmp(std::string f); @@ -288,7 +290,7 @@ friend class PileupSubscriber; class VarStatVisitor : public PileupSubscriber { public: - VarStatVisitor() : PileupSubscriber() {tot_locii=0; tot_depth=0; num_reads=0;}; + VarStatVisitor() : PileupSubscriber() {tot_locii=0; tot_depth=0; num_reads=0; stats.reserve(1000000); ins_stats.reserve(1000000); del_stats.reserve(1000000);}; VarStatVisitor(PileupManager &man) : PileupSubscriber(man) {tot_locii=0; tot_depth=0; num_reads=0;}; void Visit(PileupSummary &dat); @@ -345,8 +347,8 @@ int debug_xpos=0; int debug_level=0; int min_depth=1; int min_mapq=0; -double min_diversity=0.25; // only skip huge piles in one spot... even a little diversity is OK -double min_agreement=0.25; // only skip when like-depth variation wildly disagrees +double min_diversity=0.15; // only skip huge piles in one spot... even a little diversity is OK +double min_agreement=0.15; // only really relevent when depths are high ... this is a bare minimum score int min_qual=3; int repeat_filter=7; double artifact_filter=1; @@ -942,23 +944,22 @@ typedef struct { } ChrRange; -void PileupSummary::Parse(char *line, PileupReads &rds, tidx *adex, char atype) { +char *_dat[256]; +inline void PileupSummary::Parse(char *line, PileupReads &rds, tidx *adex, char atype) { - vector d=split(line, '\t'); + int dsize=split(_dat, line, '\t'); - if (d.size() < 6) { - warn("Can't read pileup : %d fields, need 6 columns, line %d\n", (int) d.size(), g_lineno); + if (dsize < 6) { + warn("Can't read pileup : %d fields, need 6 columns, line %d\n", (int) dsize, g_lineno); exit(1); } - const char * p_qual=d[5]; + const char * p_qual=_dat[5]; - Calls.resize(0); - - Chr=d[0]; - Pos=atoi(d[1]); - Base=*(d[2]); - Depth = atoi(d[3]); + Chr=_dat[0]; + Pos=atoi(_dat[1]); + Base=*(_dat[2]); + Depth = atoi(_dat[3]); SkipDupReads = 0; SkipN = 0; SkipAmp = 0; @@ -972,7 +973,7 @@ void PileupSummary::Parse(char *line, PileupReads &rds, tidx *adex, char atype) int i; - const char *cur_p = d[4]; + const char *cur_p = _dat[4]; list::iterator read_i = rds.ReadList.begin(); @@ -1017,9 +1018,13 @@ void PileupSummary::Parse(char *line, PileupReads &rds, tidx *adex, char atype) } } - int maxdepthbypos = rds.MeanReadLen() <= 0 ? 10 : max(10, round(10 * artifact_filter * (Depth/rds.MeanReadLen()))); + int meanreadlen = rds.MeanReadLen(); + int maxdepthbypos = meanreadlen <= 0 ? 10 : max(10, round(10 * artifact_filter * (Depth/meanreadlen))); + + Calls.clear(); int j; + int max_pia=-1; for (i=0;i max_pia) + max_pia=pia; + + depthbypos[pia]++; if (sor) ++NumReads; @@ -1076,9 +1087,9 @@ void PileupSummary::Parse(char *line, PileupReads &rds, tidx *adex, char atype) if (!ampok) { for (j=0;jPos + rds.MeanReadLen() + 1; - int bpos = read_i->Pos + rds.MeanReadLen(); - int cpos = read_i->Pos + rds.MeanReadLen() - 1; + int apos = read_i->Pos + meanreadlen + 1; + int bpos = read_i->Pos + meanreadlen; + int cpos = read_i->Pos + meanreadlen - 1; if (apos == amps[j].End || bpos == amps[j].End || cpos == amps[j].End) { ampok=1; } @@ -1099,6 +1110,11 @@ void PileupSummary::Parse(char *line, PileupReads &rds, tidx *adex, char atype) } */ +// before dup filtering + + int call_index = b2i(c); + depthbyposbycall[pia].call[call_index]++; + if (!ampok) { // warn("SKIP: %d-%d, %c\n", read_i->Pos, (int)(read_i->Pos+rds.MeanReadLen()), o); ++SkipAmp; @@ -1107,22 +1123,19 @@ void PileupSummary::Parse(char *line, PileupReads &rds, tidx *adex, char atype) ++SkipN; skip=1; // ok, instead of rolling a random number, even things out - } else if (artifact_filter > 0 && (((10*(depthbypos[pia]+1))+(i%10)) > maxdepthbypos )) { - ++SkipDupReads; - skip=1; } else if (mq < min_mapq) { ++SkipMinMapq; skip=1; } else if (q < min_qual) { ++SkipMinQual; skip=1; + } else if (artifact_filter > 0 && (((10*(depthbypos[pia]+1))+(i%10)) > maxdepthbypos )) { + ++SkipDupReads; + skip=1; } else { - int j = b2i(c); - - depthbypos[pia]++; - depthbyposbycall[pia].call[j]++; + int j = call_index; - if (j >= Calls.size()) { + if (call_index >= Calls.size()) { int was = Calls.size(); Calls.resize(j+1); int t; for (t=was;t<=j;++t) { @@ -1213,6 +1226,7 @@ void PileupSummary::Parse(char *line, PileupReads &rds, tidx *adex, char atype) } // printf("%d\t%s\n", read_i->MapQ, read_i->Seq.c_str()); read_i=rds.ReadList.erase(read_i); + meanreadlen = rds.MeanReadLen(); --read_i; ++cur_p; ++eor; @@ -1232,7 +1246,7 @@ void PileupSummary::Parse(char *line, PileupReads &rds, tidx *adex, char atype) } if (*cur_p) { - warn("Failed to parse pileup %s\n", d[4]); + warn("Failed to parse pileup %s\n", _dat[4]); exit(1); } @@ -1246,7 +1260,7 @@ void PileupSummary::Parse(char *line, PileupReads &rds, tidx *adex, char atype) fprintf(stderr,"xpos-max-per-pos\t%f\n", maxdepthbypos/10.0); fprintf(stderr,"xpos-mean-readlen\t%f\n", rds.MeanReadLen()); fprintf(stderr,"xpos-depth-list\t"); - for (i=0;i meanreadlen*2) + max_pia=meanreadlen*2; + // bool quit=0; for (i=0;i < Calls.size();++i) { // total depth (exclude inserts for tot depth, otherwise they are double-counted) if (Calls[i].depth()>0) { - double expected=Calls[i].depth()/rds.MeanReadLen(); + double expected=Calls[i].depth()/meanreadlen; double all_pct=Calls[i].depth()/(double)Depth; - double p2=(Calls[i].depth()+2*all_pct*depthbypos.size())/(double)(Depth+2*depthbypos.size()); + double p2=(Calls[i].depth()+2*all_pct*max_pia)/(double)(Depth+2*max_pia); double total_v=0; double diff; double moment_den=0; @@ -1294,7 +1311,7 @@ void PileupSummary::Parse(char *line, PileupReads &rds, tidx *adex, char atype) */ int poscnt=0; - for (j=0;j 2) warn("base:%c, cube_v:%g, deno:%g\n", Calls[i].base, cube_v, ((double)(Depth+2*depthbypos.size()))); + if (debug_level > 2) warn("base:%c, cube_v:%g, deno:%g\n", Calls[i].base, cube_v, ((double)(Depth+2*max_pia))); fprintf(stderr,"xpos-agree-%c\t%g\n",Calls[i].base, Calls[i].agreement); fprintf(stderr,"xpos-diver-%c\t%g\n",Calls[i].base, Calls[i].diversity); } @@ -1328,7 +1345,7 @@ void PileupSummary::Parse(char *line, PileupReads &rds, tidx *adex, char atype) /* if(Depth>100 && Calls[i].depth() < 100) { - printf("num %g, den %g\n", (p2*(1-p2)*(Depth+2*depthbypos.size())), moment_den); + printf("num %g, den %g\n", (p2*(1-p2)*(Depth+2*max_pia)), moment_den); printf(", Agree: %g", Calls[i].agreement); printf(", Divers: %g\n", Calls[i].diversity); quit=1; @@ -1346,7 +1363,7 @@ void PileupSummary::Parse(char *line, PileupReads &rds, tidx *adex, char atype) PileupSummary JunkSummary; -void PileupManager::Parse(char *dat) { +inline void PileupManager::Parse(char *dat) { Pileup.Parse(dat, Reads, UseAnnot ? &AnnotDex : NULL, AnnotType); Visit(Pileup); } @@ -2211,10 +2228,13 @@ std::string string_format(const std::string &fmt, ...) { } void to_upper(const std::string str) { - std::string::iterator it; - int i; + static int i; + static char c; for ( i=0;i= 'a' && c <= 'z') { + ((char *)(void *)str.data())[i]=c-32; + } } } @@ -2285,6 +2305,27 @@ std::vector split(char* str, char delim) return result; } +int split(char **buf, char* str, char delim) +{ + char **bb=buf; + char *p=strchr(str,delim); + std::vector result; + while(p != NULL) + { + *p='\0'; + *bb=str; + ++bb; + str=p+1; + p=strchr(str,delim); + } + if (*str) { + *bb=str; + bb++; + } + *bb=NULL; + return bb-buf; +} + int rand_round(double x) { return floor(x)+((rand()>(x-int(x))) ? 1 : 0); //warn("rr:%f=%d\n",x);