Permalink
Switch branches/tags
Nothing to show
Find file
Fetching contributors…
Cannot retrieve contributors at this time
588 lines (481 sloc) 15 KB
#include <iostream>
#include <sparsehash/sparse_hash_map>
#include "fastq-lib.h"
#include <time.h>
#include <string>
#include <vector>
#include <stdio.h>
#include <getopt.h>
#include <stdarg.h>
#define meminit(l) (memset(&l,0,sizeof(l)))
#define VERSION "0.5"
FILE *zopenordie(const char *path, const char *mode);
FILE *openordie(const char *path, const char *mode);
FILE *popenordie(const char *path, const char *mode);
std::string string_format(const std::string &fmt, ...);
using namespace std;
void seq2char(uint32_t s, int k, char *p);
// true if different
void compare_files(const char *f1, const char *f2);
template <class vtype> double quantile(const vtype &vec, double p);
#define AS_UINT(x) ((int *)(void *)(x))
// main
unsigned long long basemap[256];
unsigned long long basemaprc[256];
bool lowcomplex(char *s, int n, int max);
void usage(FILE *f);
// coefficients used to produce the blended similarity score (range 0 to 1, where 1 is identical)
double scoef=.75;
double qcoef=350;
int main(int argc, char **argv) {
int i;
clock_t clock1;
clock1=clock();
meminit(basemap);
for (i=0;i<4;++i) {
basemap[("acgt")[i]]=i;
basemap[("ACGT")[i]]=i;
basemaprc[("tgca")[i]]=i;
basemaprc[("TGCA")[i]]=i;
}
// max 16... lower values result in better sensitivity, lower specificity expecially with blended/mixed samples
int k=12;
int x=200;
int sampsize=500000;
float lowcom_pct=0.40;
double ftop=.99;
double fdel=.015;
double fmin=.50;
// run comparison on args?
bool docompare=0;
char c;
char *endp;
char **inf;
int inc=0;
char *contampath=NULL;
while ( (c = getopt_long(argc, argv, "chk:x:n:l:t:d:m:r:S:Q:",NULL,NULL)) != -1) {
switch (c) {
case 'h': usage(stdout); return 0;
case 'c': docompare=1; break;
case 'x': x=atoi(optarg); break;
case 'k': k=atoi(optarg); if(k>16) fail("Max k is 16\n"); break;
case 'n': sampsize=atoi(optarg); break;
case 'l': lowcom_pct=strtod(optarg,&endp); break;
case 't': ftop=strtod(optarg,&endp); break;
case 'S': scoef=strtod(optarg,&endp); break;
case 'Q': qcoef=strtod(optarg,&endp); break;
case 'r': contampath=optarg; break;
case 'd': fdel=strtod(optarg,&endp); break;
case 'm': fmin=strtod(optarg,&endp); break;
case '\1': inf[++inc]=optarg; break;
case '?':
if (!optopt) {
usage(stdout); return 0;
} else if (optopt && strchr("kxnltdmrSQ", optopt))
fprintf (stderr, "Option -%c requires an argument.\n", optopt);
else if (isprint(optopt))
fprintf (stderr, "Unknown option `-%c'.\n", optopt);
else
fprintf (stderr, "Unknown option character `\\x%x'.\n", optopt);
usage(stderr);
return 1;
}
}
inf=&(argv[optind]);
inc=argc-optind;
if (!inc) {
usage(stderr);
return 1;
}
if (!x || !k || !sampsize || ftop<.8 || ftop > 1 || fdel > ftop || fmin > ftop || inc < 1) {
fail("Bad params, quitting\n");
}
if (docompare) {
compare_files(inf[0],inf[1]);
exit(0);
}
char*in=inf[0];
FILE *fin;
if (contampath) {
if (!strcmp(fext(in),".dz")||!strcmp(fext(in),".dsrc")) {
fin=popenordie(string_format("dsrc d -s %s | bowtie -p 3 -S %s -q - | samtools view -S -f 4 - 2> /dev/null", in, contampath).c_str(), "r");
} else {
fin=popenordie(string_format("bowtie -p 3 -S %s -q %s | samtools view -S -f 4 - 2> /dev/null", contampath, in).c_str(), "r");
}
} else {
fin=zopenordie(in,"r");
}
struct fq fq;
meminit(fq);
int n=0;
google::sparse_hash_map<uint32_t, int> kh;
std::string mer, scnt;
mer.resize(k);
int *pdat;
int kcnt=0;
unsigned long long lm; //, lmrc;
unsigned long long kmask=~(((unsigned long long) ~0) << (k*2));
// printf("kmask:%llx\n",kmask);
// minimum # of low-complexity positions in 32 bases to be called "low complexity"
int lowcom_count = 32 * lowcom_pct;
int skiplow=0;
while (contampath?read_fq_sam(fin,n,&fq):read_fq(fin, n, &fq)) {
if (fq.seq.n > 32) // only first 32 bases considered
fq.seq.n = 32;
// sequence as 64-bit value
lm=0;
// lmrc=0;
if (lowcomplex(fq.seq.s, fq.seq.n, lowcom_count)) {
++skiplow;
continue;
}
// printf("s:%s\n", fq.seq.s);
// skip too short
if (fq.seq.n < k) {
++skiplow;
continue;
}
++n;
for(i=0;i<fq.seq.n;++i) { // map to 64 bit sequence
lm=lm|(basemap[fq.seq.s[i]]<<(i*2));
// lmrc=(lmrc<<2)|basemaprc[fq.seq.s[i]];
}
// printf("orig: %s\n", string(fq.seq.s,32).c_str());
// printf("s:%s,lm:%llx\n",fq.seq.s,lm);
for (i=0;i<fq.seq.n-k+1;++i) { // create unint_16_t kmers
uint32_t v=lm&kmask;
if (!kh[v]) ++kcnt;
kh[v]++;
// printf("i:%d,v:%hx\n",i,v);
lm=lm>>2; // remove first base
// reverse
// char t[33];
// seq2char(v, k, t);
// printf("orig: %s, lm: %llx, asv: %hx, mapped: %s\n", string(fq.seq.s+i,k).c_str(), lm, v, t);
}
if (n>=sampsize) break;
}
if (n==0) {
fail("Insufficient sequence content\n");
}
// printf("entries: %d\n", kcnt);
struct {uint32_t s; int v;} topx[x]; meminit(topx);
google::sparse_hash_map<uint32_t,int>::iterator it = kh.begin();
vector<int> dist;
while(it!=kh.end()) {
// *not* making the distinction between 2 singleton mappings and 1 paired here
int val = it->second;
dist.push_back(val);
if (val > topx[x-1].v) {
for (i=0;i<x;++i) {
if (val > topx[i].v) {
break;
}
}
int found=i;
// shift remaining down (max 8 shifts)
for (i=x-1;i>found;--i) {
topx[i]=topx[i-1];
}
topx[found].v=val;
topx[found].s=it->first;
}
++it;
}
sort(dist.begin(),dist.end());
double qtop=quantile(dist,ftop);
printf("k\t%d\n", k);
printf("top-x\t%d\n", x);
printf("q-start\t%.3f\n", ftop);
printf("q-delta\t%.3f\n", fdel);
printf("q-min\t%.3f\n", fmin);
printf("reads-used\t%d\n", n);
printf("lowcom-pct\t%.3f\n", lowcom_pct*100);
printf("quantile-top\t%f\n", qtop);
printf("skip-low\t%d\n", skiplow);
printf("run-time\t%f\n", (clock()-clock1)/(double)CLOCKS_PER_SEC);
printf("--------\n");
int j;
char buf[k+1];
for (i=0;i<x;++i) {
seq2char(topx[i].s, k, buf);
printf("%s %.3f\n", buf, topx[i].v/qtop);
}
printf("--------\n");
double f;
vector<double>dsig;
for (f=ftop-fdel;f>=fmin;f-=fdel) {
double r=quantile(dist, f)/qtop;
dsig.push_back(r);
printf("%.6f\n",r);
}
}
FILE *zopenordie(const char *path, const char *mode) {
bool isgz;
FILE *f=gzopen(path, mode, &isgz);
if (!f) {
warn("Can't open-%s '%s': %s\n", mode, path, strerror(errno));
exit(1);
}
return f;
}
FILE *openordie(const char *path, const char *mode) {
bool isgz;
FILE *f=fopen(path, mode);
if (!f) {
warn("Can't open-%s '%s': %s\n", mode, path, strerror(errno));
exit(1);
}
return f;
}
FILE *popenordie(const char *cmd, const char *mode) {
FILE *f=popen(cmd, mode);
if (!f) {
warn("Can't open-%s '%s': %s\n", mode, cmd, strerror(errno));
exit(1);
}
return f;
}
void seq2char(uint32_t s, int k, char *p) {
int i;
for(i=0;i<k;++i) {
*p++=("ACGT")[(s>>(i<<1))&0x3];
}
*p='\0';
}
bool lowcomplex(char *s, int n, int lowcom_thresh) {
int lc=0;
int i;
for (i=1; i<n; ++i) {
// N's always match everything
if (s[i] == 'N' || (s[i] == s[i-1])) {
lc+=1;
} else if (i >= 3 && (s[i] == s[i-2] && s[i-1] == s[i-3])) {
lc+=1;
} else if (i >= 5 && (s[i] == s[i-3] && s[i-1] == s[i-4] && s[i-2] == s[i-5])) {
lc+=1;
} else if (i >= 7 && (s[i] == s[i-4] && s[i-1] == s[i-5] && s[i-2] == s[i-6] && s[i-3] == s[i-7])) {
lc+=1;
}
}
// enough lc to pass?
return lc > lowcom_thresh;
}
uint32_t char2seq(char *p, int k) {
int i;
uint32_t r=0;
for(i=0;i<k;++i) { // map to 64 bit sequence
r=r|(basemap[p[i]]<<(i*2));
}
return r;
}
uint32_t char2seqrc(char *p, int k) {
int i;
uint32_t r=0;
for(i=0;i<k;++i) { // map to 64 bit sequence
r=r<<2|(basemaprc[p[i]]);
}
return r;
}
class sig {
public:
int k; // kmer size
int svx; // number of entries in sequence vector
int n; // sample size
double ftop; // top quantile
double fdel; // delta
double fmin; // min quality
double lowcom_pct; // complexity filter used
string fn; // file read from (if any)
int qvx; // number of entries in quantile vector
void read(const char *file);
typedef struct {
uint32_t seq; // sequence
uint32_t rcseq; // reverse complement of same
int lev;
} ent;
typedef struct {
float sdist; // s distance
float qdist; // q distance
float score; // blended score
} comp;
ent *svec;
double *qvec;
sig() {init();};
~sig() {clear();};
void init() {svec=NULL;svx=qvx=n=k=0;qvec=NULL;};
void clear() {if(svec) free(svec); if (qvec) free(qvec); init();}
sig::comp compare(sig &other);
};
void sig::read(const char *file) {
clear();
FILE *f;
bool isop=0;
if (!strcmp(file,"-")) {
f=stdin;
} else {
f=openordie(file,"r");
isop=1;
}
fn=file;
char *s=NULL; size_t a; int l;
char *p, *e;
double v;
while((l=getline(&s, &a, f))>1) {
if(p=strchr(s,'\t')) {
*p++='\0';
v=strtod(p, &e);
} else {
break;
}
if(p=strchr(s,':')) // colon ignored
*p='\0';
if (!strcmp(s, "k")) {
k=v;
} else if (!strcmp(s, "top-x")) {
svx=v;
} else if (!strcmp(s, "q-start")) {
ftop=v;
} else if (!strcmp(s, "q-delta")) {
fdel=v;
} else if (!strcmp(s, "q-min")) {
fmin=v;
} else if (!strcmp(s, "lowcom-pct")) {
lowcom_pct=v/100;
} else if (!strcmp(s, "reads-used")) {
n=v;
}
}
if (!k || !svx || ftop==0 || fdel==0 || fmin == 0 || n ==0) {
fail("File '%s': Invalid header.\n", file);
}
svec = (sig::ent *)malloc(sizeof(*svec)*svx);
int i = 0;
while(i<svx && ( (l=getline(&s, &a, f)) > 1 )) {
s[k]='\0';
svec[i].seq=char2seq(s,k);
svec[i].rcseq=char2seqrc(s,k);
svec[i].lev=atoi(s+k+1);
++i;
}
// number of quantile vector entries
qvx = (ftop-fmin-fdel)/fdel;
// discard separator
l=getline(&s, &a, f);
qvec = (double *)malloc(sizeof(*qvec)*qvx);
i = 0;
while(i<qvx && ( (l=getline(&s, &a, f)) > 1 )) {
qvec[i]=strtod(s,&e);
++i;
}
if (isop) fclose(f);
}
sig::comp sig::compare(sig &o) {
if (svx!=o.svx || k!=o.k || fmin != o.fmin || fdel != o.fdel || ftop != o.ftop || lowcom_pct != o.lowcom_pct) {
fail("Can't compare sigs with different parameters\n");
}
// influences certainty
double diff=(double)abs(n-o.n)/(double(n+o.n)/2);
// top x sequence vector comparison
int i, j;
int matches=0;
double sdistance=0;
for (i=0;i<svx;++i) {
bool found=0;
for (j=0;j<svx;++j) {
// cap distance at 1000
if (svec[i].seq==o.svec[j].seq || svec[i].seq==o.svec[j].rcseq) {
found=1;
matches+=1;
// the max range on these is about 1000
sdistance+=min(1000,fabs(svec[i].lev-o.svec[j].lev));
break;
}
}
if (!found) {
// 10x worse than finding it at all...
sdistance+=10000;
}
}
sdistance=(sdistance/(double)svx)/10000;
// top x quantile vector comparison
double qdistance=0;
for (i=1;i<qvx;++i) {
// the biggest this can be is 1, but realistically, it will be very, very small
qdistance+=fabs(fabs(qvec[i]-qvec[i-1])-fabs(o.qvec[i]-o.qvec[i-1]));
}
qdistance = qdistance/qvx;
sig::comp c;
c.sdist=sdistance;
c.qdist=qdistance;
// blended score - based on a fit to a model of expected outputs which included some mixtures
c.score=max(0,min(1,1-scoef*sdistance-qcoef*qdistance));
return c;
}
void compare_files(const char *i1, const char *i2) {
sig s1;
sig s2;
s1.read(i1);
s2.read(i2);
sig::comp c = s1.compare(s2);
printf("sequence distance\t%f\n", c.sdist);
printf("quantile distance\t%f\n", c.qdist);
printf("similarity\t%f\n", c.score);
}
template <class vtype>
double quantile(const vtype &vec, double p) {
int l = vec.size();
if (!l) return 0;
double t = ((double)l-1)*p;
int it = (int) t;
int v=vec[it];
if (t > (double)it) {
return (v + (t-it) * (vec[it+1] - v));
} else {
return v;
}
}
void usage(FILE *f) {
fprintf(f,
"Usage: seqsig [options] file1\n"
" or: seqsig -c file1 file2\n"
"Version: %s\n"
"\n"
"Generate qsig files for fastqs, or compare them\n"
"\n"
"Sig generation options:\n"
" -k INT kmer size (12)\n"
" -x INT number of top kmers to output (200)\n"
" -n INT number of reads to use (500000)\n"
" -l REAL low complexity filter (0.40), where 1 is no filter\n"
" -t REAL top quantile to use (0.90)\n"
" -d REAL quantile delta to iterate (0.15)\n"
" -m REAL minimum quantile to output/test (0.50)\n"
" -r FASTA bowtie indexed file of contamination/spike-ins to ignore\n"
"\n"
"Comparison options:\n"
" -Q REAL quantile coefficient for score output (350)\n"
" -S REAL sequence coefficient for score output (0.75)\n"
"\n"
"Misc options:\n"
" -h this help\n"
"\n"
,VERSION);
}
std::string string_format(const std::string &fmt, ...) {
int n, size=100;
std::string str;
va_list ap;
while (1) {
str.resize(size);
va_start(ap, fmt);
int n = vsnprintf((char *)str.c_str(), size, fmt.c_str(), ap);
va_end(ap);
if (n > -1 && n < size)
return str;
if (n > -1)
size=n+1;
else
size*=2;
}
}