Permalink
Browse files

important: cap lookbehind since we're using a vector...

maybe use a list instead? typically it's rather sparse
  • Loading branch information...
1 parent 76701b3 commit 417a8f1c2628b41be364d7c924d589d8530a8ea6 earonesty committed Sep 16, 2014
Showing with 80 additions and 39 deletions.
  1. +80 −39 clipper/varcall.cpp
View
@@ -87,6 +87,7 @@ FILE *stat_fout=NULL;
#define T_CNT 8
#define b2i(c) ((c)=='A'?0:(c)=='a'?0:(c)=='C'?1:(c)=='c'?1:(c)=='G'?2:(c)=='g'?2:(c)=='T'?3:(c)=='t'?3:(c)=='*'?4:(c)=='-'?5:(c)=='+'?6:7)
#define i2b(i) (i==0?'A':i==1?'C':i==2?'G':i==3?'T':i==4?'*':i==5?'-':i==6?'+':'?')
+#define toupper(c) ((c)>='a' && (c)<='z' ? ((c)-32) : (c))
// die unless it's a number
int ok_atoi(const char *s) {
@@ -104,6 +105,7 @@ int rand_round(double x);
// basic utils
std::vector<char *> split(char* str, char delim);
+int split(char **buf, char* str, char delim);
std::string string_format(const std::string &fmt, ...);
void to_upper(const std::string str);
void rename_tmp(std::string f);
@@ -288,7 +290,7 @@ friend class PileupSubscriber;
class VarStatVisitor : public PileupSubscriber {
public:
- VarStatVisitor() : PileupSubscriber() {tot_locii=0; tot_depth=0; num_reads=0;};
+ VarStatVisitor() : PileupSubscriber() {tot_locii=0; tot_depth=0; num_reads=0; stats.reserve(1000000); ins_stats.reserve(1000000); del_stats.reserve(1000000);};
VarStatVisitor(PileupManager &man) : PileupSubscriber(man) {tot_locii=0; tot_depth=0; num_reads=0;};
void Visit(PileupSummary &dat);
@@ -345,8 +347,8 @@ int debug_xpos=0;
int debug_level=0;
int min_depth=1;
int min_mapq=0;
-double min_diversity=0.25; // only skip huge piles in one spot... even a little diversity is OK
-double min_agreement=0.25; // only skip when like-depth variation wildly disagrees
+double min_diversity=0.15; // only skip huge piles in one spot... even a little diversity is OK
+double min_agreement=0.15; // only really relevent when depths are high ... this is a bare minimum score
int min_qual=3;
int repeat_filter=7;
double artifact_filter=1;
@@ -942,23 +944,22 @@ typedef struct {
} ChrRange;
-void PileupSummary::Parse(char *line, PileupReads &rds, tidx *adex, char atype) {
+char *_dat[256];
+inline void PileupSummary::Parse(char *line, PileupReads &rds, tidx *adex, char atype) {
- vector<char *> d=split(line, '\t');
+ int dsize=split(_dat, line, '\t');
- if (d.size() < 6) {
- warn("Can't read pileup : %d fields, need 6 columns, line %d\n", (int) d.size(), g_lineno);
+ if (dsize < 6) {
+ warn("Can't read pileup : %d fields, need 6 columns, line %d\n", (int) dsize, g_lineno);
exit(1);
}
- const char * p_qual=d[5];
+ const char * p_qual=_dat[5];
- Calls.resize(0);
-
- Chr=d[0];
- Pos=atoi(d[1]);
- Base=*(d[2]);
- Depth = atoi(d[3]);
+ Chr=_dat[0];
+ Pos=atoi(_dat[1]);
+ Base=*(_dat[2]);
+ Depth = atoi(_dat[3]);
SkipDupReads = 0;
SkipN = 0;
SkipAmp = 0;
@@ -972,7 +973,7 @@ void PileupSummary::Parse(char *line, PileupReads &rds, tidx *adex, char atype)
int i;
- const char *cur_p = d[4];
+ const char *cur_p = _dat[4];
list<Read>::iterator read_i = rds.ReadList.begin();
@@ -1017,9 +1018,13 @@ void PileupSummary::Parse(char *line, PileupReads &rds, tidx *adex, char atype)
}
}
- int maxdepthbypos = rds.MeanReadLen() <= 0 ? 10 : max(10, round(10 * artifact_filter * (Depth/rds.MeanReadLen())));
+ int meanreadlen = rds.MeanReadLen();
+ int maxdepthbypos = meanreadlen <= 0 ? 10 : max(10, round(10 * artifact_filter * (Depth/meanreadlen)));
+
+ Calls.clear();
int j;
+ int max_pia=-1;
for (i=0;i<Depth;++i,++read_i) {
bool sor=0;
@@ -1034,6 +1039,7 @@ void PileupSummary::Parse(char *line, PileupReads &rds, tidx *adex, char atype)
++read_i;
}
read_i=rds.ReadList.insert(read_i,x);
+ meanreadlen = rds.MeanReadLen();
}
if (read_i == rds.ReadList.end()) {
@@ -1042,6 +1048,7 @@ void PileupSummary::Parse(char *line, PileupReads &rds, tidx *adex, char atype)
x.MapQ = 0;
x.Pos = -1;
read_i=rds.ReadList.insert(read_i,x);
+ meanreadlen = rds.MeanReadLen();
}
@@ -1051,6 +1058,10 @@ void PileupSummary::Parse(char *line, PileupReads &rds, tidx *adex, char atype)
depthbypos.resize(pia+1);
depthbyposbycall.resize(pia+1);
}
+ if (pia > max_pia)
+ max_pia=pia;
+
+ depthbypos[pia]++;
if (sor)
++NumReads;
@@ -1076,9 +1087,9 @@ void PileupSummary::Parse(char *line, PileupReads &rds, tidx *adex, char atype)
if (!ampok) {
for (j=0;j<amps.size();++j) {
- int apos = read_i->Pos + rds.MeanReadLen() + 1;
- int bpos = read_i->Pos + rds.MeanReadLen();
- int cpos = read_i->Pos + rds.MeanReadLen() - 1;
+ int apos = read_i->Pos + meanreadlen + 1;
+ int bpos = read_i->Pos + meanreadlen;
+ int cpos = read_i->Pos + meanreadlen - 1;
if (apos == amps[j].End || bpos == amps[j].End || cpos == amps[j].End) {
ampok=1;
}
@@ -1099,6 +1110,11 @@ void PileupSummary::Parse(char *line, PileupReads &rds, tidx *adex, char atype)
}
*/
+// before dup filtering
+
+ int call_index = b2i(c);
+ depthbyposbycall[pia].call[call_index]++;
+
if (!ampok) {
// warn("SKIP: %d-%d, %c\n", read_i->Pos, (int)(read_i->Pos+rds.MeanReadLen()), o);
++SkipAmp;
@@ -1107,22 +1123,19 @@ void PileupSummary::Parse(char *line, PileupReads &rds, tidx *adex, char atype)
++SkipN;
skip=1;
// ok, instead of rolling a random number, even things out
- } else if (artifact_filter > 0 && (((10*(depthbypos[pia]+1))+(i%10)) > maxdepthbypos )) {
- ++SkipDupReads;
- skip=1;
} else if (mq < min_mapq) {
++SkipMinMapq;
skip=1;
} else if (q < min_qual) {
++SkipMinQual;
skip=1;
+ } else if (artifact_filter > 0 && (((10*(depthbypos[pia]+1))+(i%10)) > maxdepthbypos )) {
+ ++SkipDupReads;
+ skip=1;
} else {
- int j = b2i(c);
-
- depthbypos[pia]++;
- depthbyposbycall[pia].call[j]++;
+ int j = call_index;
- if (j >= Calls.size()) {
+ if (call_index >= Calls.size()) {
int was = Calls.size();
Calls.resize(j+1);
int t; for (t=was;t<=j;++t) {
@@ -1213,6 +1226,7 @@ void PileupSummary::Parse(char *line, PileupReads &rds, tidx *adex, char atype)
}
// printf("%d\t%s\n", read_i->MapQ, read_i->Seq.c_str());
read_i=rds.ReadList.erase(read_i);
+ meanreadlen = rds.MeanReadLen();
--read_i;
++cur_p;
++eor;
@@ -1232,7 +1246,7 @@ void PileupSummary::Parse(char *line, PileupReads &rds, tidx *adex, char atype)
}
if (*cur_p) {
- warn("Failed to parse pileup %s\n", d[4]);
+ warn("Failed to parse pileup %s\n", _dat[4]);
exit(1);
}
@@ -1246,7 +1260,7 @@ void PileupSummary::Parse(char *line, PileupReads &rds, tidx *adex, char atype)
fprintf(stderr,"xpos-max-per-pos\t%f\n", maxdepthbypos/10.0);
fprintf(stderr,"xpos-mean-readlen\t%f\n", rds.MeanReadLen());
fprintf(stderr,"xpos-depth-list\t");
- for (i=0;i<depthbypos.size();++i) {
+ for (i=0;i<max_pia;++i) {
fprintf(stderr,"%d,", depthbypos[i]);
}
fprintf(stderr,"\n");
@@ -1265,12 +1279,15 @@ void PileupSummary::Parse(char *line, PileupReads &rds, tidx *adex, char atype)
}
}
+ if (max_pia > meanreadlen*2)
+ max_pia=meanreadlen*2;
+
// bool quit=0;
for (i=0;i < Calls.size();++i) { // total depth (exclude inserts for tot depth, otherwise they are double-counted)
if (Calls[i].depth()>0) {
- double expected=Calls[i].depth()/rds.MeanReadLen();
+ double expected=Calls[i].depth()/meanreadlen;
double all_pct=Calls[i].depth()/(double)Depth;
- double p2=(Calls[i].depth()+2*all_pct*depthbypos.size())/(double)(Depth+2*depthbypos.size());
+ double p2=(Calls[i].depth()+2*all_pct*max_pia)/(double)(Depth+2*max_pia);
double total_v=0;
double diff;
double moment_den=0;
@@ -1294,7 +1311,7 @@ void PileupSummary::Parse(char *line, PileupReads &rds, tidx *adex, char atype)
*/
int poscnt=0;
- for (j=0;j<depthbyposbycall.size();++j) {
+ for (j=0;j<max_pia;++j) {
// diversity calc
diff=floor(fabs(depthbyposbycall[j].call[i]-expected));
total_v+=diff*diff;
@@ -1315,20 +1332,20 @@ void PileupSummary::Parse(char *line, PileupReads &rds, tidx *adex, char atype)
Calls[i].diversity = max(0,1-shift_v/max(1,(pow(Calls[i].depth()-expected,2)-2*Calls[i].depth())));
if (poscnt==1) Calls[i].diversity = 0;
- double wt4_od=pow(cube_v/((double)(Depth+2*depthbypos.size())),1.0/3.0)/all_pct;
+ double wt4_od=pow(cube_v/((double)(Depth+2*max_pia)),1.0/3.0)/all_pct;
Calls[i].agreement = max(0,1-wt4_od);
if (debug_xpos) {
if (Pos == debug_xpos && !strcmp(debug_xchr,Chr.data())) {
- if (debug_level > 2) warn("base:%c, cube_v:%g, deno:%g\n", Calls[i].base, cube_v, ((double)(Depth+2*depthbypos.size())));
+ if (debug_level > 2) warn("base:%c, cube_v:%g, deno:%g\n", Calls[i].base, cube_v, ((double)(Depth+2*max_pia)));
fprintf(stderr,"xpos-agree-%c\t%g\n",Calls[i].base, Calls[i].agreement);
fprintf(stderr,"xpos-diver-%c\t%g\n",Calls[i].base, Calls[i].diversity);
}
}
/*
if(Depth>100 && Calls[i].depth() < 100) {
- printf("num %g, den %g\n", (p2*(1-p2)*(Depth+2*depthbypos.size())), moment_den);
+ printf("num %g, den %g\n", (p2*(1-p2)*(Depth+2*max_pia)), moment_den);
printf(", Agree: %g", Calls[i].agreement);
printf(", Divers: %g\n", Calls[i].diversity);
quit=1;
@@ -1346,7 +1363,7 @@ void PileupSummary::Parse(char *line, PileupReads &rds, tidx *adex, char atype)
PileupSummary JunkSummary;
-void PileupManager::Parse(char *dat) {
+inline void PileupManager::Parse(char *dat) {
Pileup.Parse(dat, Reads, UseAnnot ? &AnnotDex : NULL, AnnotType);
Visit(Pileup);
}
@@ -2211,10 +2228,13 @@ std::string string_format(const std::string &fmt, ...) {
}
void to_upper(const std::string str) {
- std::string::iterator it;
- int i;
+ static int i;
+ static char c;
for ( i=0;i<str.size();++i ) {
- ((char *)(void *)str.data())[i]=toupper(((char *)str.data())[i]);
+ c=((char *)(void *)str.data())[i];
+ if (c >= 'a' && c <= 'z') {
+ ((char *)(void *)str.data())[i]=c-32;
+ }
}
}
@@ -2285,6 +2305,27 @@ std::vector<char *> split(char* str, char delim)
return result;
}
+int split(char **buf, char* str, char delim)
+{
+ char **bb=buf;
+ char *p=strchr(str,delim);
+ std::vector<char *> result;
+ while(p != NULL)
+ {
+ *p='\0';
+ *bb=str;
+ ++bb;
+ str=p+1;
+ p=strchr(str,delim);
+ }
+ if (*str) {
+ *bb=str;
+ bb++;
+ }
+ *bb=NULL;
+ return bb-buf;
+}
+
int rand_round(double x) {
return floor(x)+((rand()>(x-int(x))) ? 1 : 0);
//warn("rr:%f=%d\n",x);

0 comments on commit 417a8f1

Please sign in to comment.