Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

Already on GitHub? Sign in to your account

Develop #54

Merged
3 commits merged on Apr 7, 2017
View
@@ -31,7 +31,7 @@ See "void usage" below for usage.
#include "fastq-lib.h"
-#define VERSION "1.04.807"
+#define VERSION "1.05"
#define MAX_ADAPTER_NUM 1000
#define SCANLEN 15
@@ -174,7 +174,7 @@ class inbuffer {
fq->qual.s[--fq->qual.n] = '\0';
}
- return fq->qual.n > 0;
+ return fq->qual.n >= 0; // github issue 46, 53
} else {
return ::read_fq(fin, rno, fq, name);
}
@@ -220,6 +220,8 @@ int main (int argc, char **argv) {
int ilv3 = -1;
int duplen = 0;
int dupskip = 0;
+ int min_start_trim = 0;
+ int min_end_trim = 0;
bool noexec = 0;
bool hompol_filter = 0;
bool lowcom_filter = 0;
@@ -258,6 +260,8 @@ int main (int argc, char **argv) {
{"mate-min-len", 1, 0, 0},
{"homopolymer-pct", 1, 0, 0},
{"lowcomplex-pct", 1, 0, 0},
+ {"min-start-trim", 1, 0, 0},
+ {"min-end-trim", 1, 0, 0},
{0, 0, 0, 0}
};
@@ -275,6 +279,10 @@ int main (int argc, char **argv) {
keeponlyclip=1;
} else if(!strcmp(oname, "mate-qual-mean")) {
qf2_mean=atoi(optarg);
+ } else if (!strcmp(oname, "min-start-trim")) {
+ min_start_trim = atoi(optarg);
+ } else if (!strcmp(oname, "min-end-trim")) {
+ min_end_trim = atoi(optarg);
} else if(!strcmp(oname, "homopolymer-pct")) {
hompol_pct=atof(optarg)/100.0;
hompol_filter=1;
@@ -631,6 +639,10 @@ int main (int argc, char **argv) {
--nq; --ns; // don't count newline for read len
// skip poor quals/lots of N's when doing sampling (otherwise you'll miss some)
+ if (ns == 0) { // github issue 46, 53
+ ++skipped;
+ continue;
+ }
if ((st.st_size > (sampcnt * 500)) && (skipped < sampcnt) && poorqual(i, ns, s, q)) {
++skipped;
continue;
@@ -970,6 +982,15 @@ int main (int argc, char **argv) {
}
}
++nrec;
+ if (fq[0].qual.n == 0) { // github issue 46, 53
+ ++nfiltered;
+ continue;
+ } else if (i_n > 1) {
+ if (fq[1].qual.n == 0) {
+ ++nfiltered;
+ continue;
+ }
+ }
if (read_ok < 0) {
++nerr;
continue;
@@ -1003,6 +1024,9 @@ int main (int argc, char **argv) {
for (f=0;f<i_n;++f) {
dotrim[f][0] = sktrim[f][0]; // default, trim to detected skew levels
dotrim[f][1] = sktrim[f][1];
+ // trim to minimum, if specified
+ dotrim[f][0] = max(dotrim[f][0], min_start_trim);
+ dotrim[f][1] = max(dotrim[f][1], min_end_trim);
if (avgns[f] < 11)
// reads of avg length < 11 ? barcode lane, skip it
continue;
@@ -1493,6 +1517,10 @@ void usage(FILE *f, const char *msg) {
" -C N Number of reads to use for subsampling (300k)\n"
" -d Output lots of random debugging stuff\n"
"\n"
+"Minimum trimming options:\n"
+" --min-start-trim NUM Always trim at least NUM bases from start\n"
+" --min-end-trim NUM Always trim at least NUM bases from end\n"
+"\n"
"Quality adjustment options:\n"
" --cycle-adjust CYC,AMT Adjust cycle CYC (negative = offset from end) by amount AMT\n"
" --phred-adjust SCORE,AMT Adjust score SCORE by amount AMT\n"
View
@@ -34,7 +34,7 @@ See "void usage" below for usage.
#define THFIXFACTOR 20
#define endstr(e) (e=='e'?"end":e=='b'?"start":"n/a")
-const char * VERSION = "1.02.772";
+const char * VERSION = "1.03";
// barcode
struct bc {
@@ -116,6 +116,7 @@ int main (int argc, char **argv) {
const char* list=NULL; // use a barcode master list
char verify='\0';
bool noexec = false;
+ bool seqnames = false;
const char *group = NULL;
bool usefile1 = false;
int phred = 33;
@@ -125,7 +126,7 @@ int main (int argc, char **argv) {
int i;
bool omode = false;
char *bfil = NULL;
- while ( (c = getopt (argc, argv, "-DzxnHhbeov:m:B:g:L:l:G:q:d:t:")) != -1) {
+ while ( (c = getopt (argc, argv, "-DzxnHhbeosv:m:B:g:L:l:G:q:d:t:")) != -1) {
switch (c) t:{
case '\1':
if (omode) {
@@ -143,6 +144,7 @@ int main (int argc, char **argv) {
}
break;
case 'o': omode=true; break;
+ case 's': seqnames=true; break;
case 'v':
if (strlen(optarg)>1) {
fprintf(stderr, "Option -v requires a single character argument");
@@ -801,18 +803,34 @@ int main (int argc, char **argv) {
// TODO: output barcode read ...but only for unmatched?
int b;
for (b=0;b<=bcnt;++b) {
+ size_t nameseq_len = strlen(bc[b].id.s);
+ if ((b < bcnt) && seqnames) {
+ nameseq_len = strlen(bc[b].seq.s);
+ if (bc[b].dual)
+ nameseq_len += bc[b].dual_n + 1;
+ }
+
for (i=0;i<f_n;++i) {
if (!strcasecmp(out[i],"n/a") || !strcasecmp(out[i],"/dev/null")) {
bc[b].out[i] = NULL;
bc[b].fout[i] = NULL;
continue;
}
const char *p=strchr(out[i],'%');
- if (!p) fail("Each output file name must contain a '%%' sign, which is replaced by the barcode id\n");
- bc[b].out[i]=(char *) malloc(strlen(out[i])+strlen(bc[b].id.s)+100);
+ if (!p) fail("Each output file name must contain a '%%' sign, which is replaced by the barcode id or sequence\n");
+ bc[b].out[i]=(char *) malloc(strlen(out[i])+nameseq_len+100);
strncpy(bc[b].out[i], out[i], p-out[i]);
bc[b].out[i][p-out[i]]='\0';
- strcat(bc[b].out[i], bc[b].id.s);
+ if (seqnames && (b < bcnt)) {
+ strcat(bc[b].out[i], bc[b].seq.s);
+ if (bc[b].dual) {
+ strcat(bc[b].out[i], "-");
+ strcat(bc[b].out[i], bc[b].dual);
+ }
+ }
+ else {
+ strcat(bc[b].out[i], bc[b].id.s);
+ }
strcat(bc[b].out[i], p+1);
if (!(bc[b].fout[i]=gzopen(bc[b].out[i], "w", &bc[b].gzout[i]))) {
fprintf(stderr, "Error opening output file '%s': %s\n",bc[b].out[i], strerror(errno));
@@ -1011,7 +1029,7 @@ int main (int argc, char **argv) {
if (!f) continue;
if (!trimmed) {
// todo: capture always, not just when trim is off
- *strrchr(fq[i].id.s, '\n') = '\0';
+ if (!debug) *strrchr(fq[i].id.s, '\n') = '\0';
fputs(fq[i].id.s,f);
fputc(' ', f);
fputs(fq[0].seq.s,f);
@@ -1145,6 +1163,7 @@ void usage(FILE *f) {
"-L BCFIL Determine barcodes from <read1.fq>, using BCFIL as a master list\n"
"-B BCFIL Use barcodes from BCFIL, no determination step, codes in <read1.fq>\n"
"-H Use barcodes from illumina's header, instead of a read\n"
+"-s Substitute barcode sequence instead of barcode label into output file names\n"
"-b Force beginning of line (5') for barcode matching\n"
"-e Force end of line (3') for batcode matching\n"
"-t NUM Divide threshold for auto-determine by factor NUM (1), > 1 = more sensitive\n"