Updated to excise the biogo dependency.

BurntSushi · Jul 1, 2013 · 5a5a669 · 5a5a669
1 parent b905aea
commit 5a5a669
Show file tree

Hide file tree

Showing 7 changed files with 63 additions and 56 deletions.
diff --git a/cmd/cablastp-compress/main.go b/cmd/cablastp-compress/main.go
@@ -27,7 +27,7 @@ var (
 	ignoredResidues = []byte{'J', 'O', 'U'}
 
 	// A default configuration.
-	dbConf = cablastp.DefaultDBConf
+	dbConf = &cablastp.DefaultDBConf
 
 	// Flags that affect the higher level operation of compression.
 	// Flags that control algorithmic parameters are stored in `dbConf`.
@@ -161,7 +161,7 @@ func main() {
 
 	// Create a new database for writing. If we're appending, we load
 	// the coarse database into memory, and setup the database for writing.
-	db, err := cablastp.NewWriteDB(flagAppend, dbConf, flag.Arg(0))
+	db, err := cablastp.NewWriteDB(flagAppend, *dbConf, flag.Arg(0))
 	if err != nil {
 		fatalf("%s\n", err)
 	}
@@ -170,6 +170,7 @@ func main() {
 	pool := startCompressWorkers(db)
 	orgSeqId := db.ComDB.NumSequences()
 	mainQuit := make(chan struct{}, 0)
+	totalResidues := dbConf.BlastDBSize
 
 	// If the process is killed, try to clean up elegantly.
 	// The idea is to preserve the integrity of the database.
@@ -203,6 +204,7 @@ func main() {
 			if readSeq.Err != nil {
 				log.Fatal(err)
 			}
+			totalResidues += readSeq.Seq.Len()
 			orgSeqId = pool.compress(orgSeqId, readSeq.Seq)
 			verboseOutput(db, orgSeqId)
 			if flagMaxSeedsGB > 0 && orgSeqId%10000 == 0 {
@@ -213,6 +215,8 @@ func main() {
 	cablastp.Vprintln("\n")
 	cablastp.Vprintf("Wrote %s.\n", cablastp.FileCompressed)
 	cablastp.Vprintf("Wrote %s.\n", cablastp.FileIndex)
+
+	db.BlastDBSize = totalResidues
 	cleanup(db, &pool)
 }
 

diff --git a/cmd/cablastp-compress/nw.go b/cmd/cablastp-compress/nw.go
@@ -2,30 +2,19 @@ package main
 
 import (
 	"github.com/BurntSushi/cablastp/blosum"
-
-	"code.google.com/p/biogo/align/nw"
-	"code.google.com/p/biogo/util"
 )
 
 var (
-	nwLookUpP util.CTL
-	aligner   = &nw.Aligner{
-		Matrix:  blosum.Matrix62,
-		LookUp:  nwLookUpP,
-		GapChar: '-',
-	}
+	nwLookUpP map[byte]int
 )
 
 // Initialize the alignment lookup table. (i.e., translate ASCII residue
 // characters to BLOSUM62 matrix indices.)
 func init() {
-	m := make(map[int]int)
-	for i, v := range blosum.Alphabet62 {
-		m[int(v)] = i
+	nwLookUpP = make(map[byte]int)
+	for i := 0; i < len(blosum.Alphabet62); i++ {
+		nwLookUpP[blosum.Alphabet62[i]] = i
 	}
-	nwLookUpP = *util.NewCTL(m)
-
-	aligner.LookUp = nwLookUpP
 }
 
 // appendOne appends a single byte to a byte slice and only allocates if it
@@ -53,7 +42,7 @@ func appendOne(slice []byte, b byte) []byte {
 // programming to only allow a limited number of gaps proportional to the
 // length of the larger of rseq and oseq.
 func nwAlign(rseq, oseq []byte, mem *memory) [2][]byte {
-	gap := len(aligner.Matrix) - 1
+	gap := len(blosum.Matrix62) - 1
 	r, c := len(rseq)+1, len(oseq)+1
 	off := 0
 
@@ -75,9 +64,9 @@ func nwAlign(rseq, oseq []byte, mem *memory) [2][]byte {
 	}
 
 	var sdiag, sup, sleft, rVal, oVal int
-	valToCode := aligner.LookUp.ValueToCode
-	gapChar := aligner.GapChar
-	matrix := aligner.Matrix
+	valToCode := nwLookUpP
+	gapChar := byte('-')
+	matrix := blosum.Matrix62
 
 	var i2, i3 int
 	for i := 1; i < r; i++ {

diff --git a/cmd/cablastp-decompress/main.go b/cmd/cablastp-decompress/main.go
@@ -9,8 +9,7 @@ import (
 	"runtime"
 	"runtime/pprof"
 
-	"code.google.com/p/biogo/io/seqio/fasta"
-	"code.google.com/p/biogo/seq"
+	"github.com/TuftsBCB/io/fasta"
 
 	"github.com/BurntSushi/cablastp"
 )
@@ -59,7 +58,8 @@ func main() {
 	if err != nil {
 		fatalf("Could not write to '%s': %s\n", flag.Arg(1), err)
 	}
-	fastaWriter := fasta.NewWriter(outFasta, 60)
+	fastaWriter := fasta.NewWriter(outFasta)
+	fastaWriter.Asterisk = true
 
 	// Create a new database for writing. If we're appending, we load
 	// the coarse database into memory, and setup the database for writing.
@@ -80,16 +80,20 @@ func main() {
 
 	numSeqs := db.ComDB.NumSequences()
 	for orgSeqId := 0; orgSeqId < numSeqs; orgSeqId++ {
-		oseq, err := db.ComDB.ReadNextSeq(db.CoarseDB, orgSeqId)
+		oseq, err := db.ComDB.ReadSeq(db.CoarseDB, orgSeqId)
 		if err != nil {
-			fatalf("%s\n", err)
+			fatalf("Error reading seq id '%d': %s\n", orgSeqId, err)
+		}
+		if err := fastaWriter.Write(oseq.FastaSeq()); err != nil {
+			cablastp.Vprintf("Error writing seq '%s': %s\n", oseq.Name, err)
 		}
-		fastaWriter.Write(
-			seq.New(oseq.Name, append(oseq.Residues, '*'), nil))
 	}
 
 	cleanup(db)
-	if err = fastaWriter.Close(); err != nil {
+	if err = fastaWriter.Flush(); err != nil {
+		fatalf("%s\n", err)
+	}
+	if err = outFasta.Close(); err != nil {
 		fatalf("%s\n", err)
 	}
 }

diff --git a/dbconf.go b/dbconf.go
@@ -42,7 +42,7 @@ var DefaultDBConf = DBConf{
 	SavePlain:           false,
 	ReadOnly:            true,
 	BlastMakeBlastDB:    "makeblastdb",
-	BlastDBSize:         20000000,
+	BlastDBSize:         0,
 }
 
 func LoadDBConf(r io.Reader) (conf DBConf, err error) {

diff --git a/fasta.go b/fasta.go
@@ -2,8 +2,9 @@ package cablastp
 
 import (
 	"io"
+	"os"
 
-	"code.google.com/p/biogo/io/seqio/fasta"
+	"github.com/TuftsBCB/io/fasta"
 )
 
 // ReadOriginalSeq is the value sent over `chan ReadOriginalSeq` when a new
@@ -15,17 +16,20 @@ type ReadOriginalSeq struct {
 
 // ReadOriginalSeqs reads a FASTA formatted file and returns a channel that
 // each new sequence is sent to.
-func ReadOriginalSeqs(fileName string,
-	ignore []byte) (chan ReadOriginalSeq, error) {
-
-	reader, err := fasta.NewReaderName(fileName)
+func ReadOriginalSeqs(
+	fileName string,
+	ignore []byte,
+) (chan ReadOriginalSeq, error) {
+	f, err := os.Open(fileName)
 	if err != nil {
 		return nil, err
 	}
+
+	reader := fasta.NewReader(f)
 	seqChan := make(chan ReadOriginalSeq, 200)
 	go func() {
 		for i := 0; true; i++ {
-			seq, err := reader.Read()
+			sequence, err := reader.Read()
 			if err == io.EOF {
 				close(seqChan)
 				break
@@ -38,16 +42,16 @@ func ReadOriginalSeqs(fileName string,
 				close(seqChan)
 				break
 			}
-			for i, residue := range seq.Seq {
+			for i, residue := range sequence.Residues {
 				for _, toignore := range ignore {
-					if toignore == residue {
-						seq.Seq[i] = 'X'
+					if toignore == byte(residue) {
+						sequence.Residues[i] = 'X'
 						break
 					}
 				}
 			}
 			seqChan <- ReadOriginalSeq{
-				Seq: NewBiogoOriginalSeq(i, seq),
+				Seq: NewFastaOriginalSeq(i, sequence),
 				Err: nil,
 			}
 		}

diff --git a/io.go b/io.go
@@ -11,7 +11,7 @@ import (
 	"strconv"
 	"time"
 
-	"code.google.com/p/biogo/io/seqio/fasta"
+	"github.com/TuftsBCB/io/fasta"
 )
 
 func (coarsedb *CoarseDB) readFasta() error {
@@ -27,7 +27,7 @@ func (coarsedb *CoarseDB) readFasta() error {
 		if err != nil {
 			return err
 		}
-		coarsedb.Seqs = append(coarsedb.Seqs, NewBiogoCoarseSeq(i, seq))
+		coarsedb.Seqs = append(coarsedb.Seqs, NewFastaCoarseSeq(i, seq))
 	}
 	coarsedb.seqsRead = len(coarsedb.Seqs)
 
@@ -332,19 +332,21 @@ func (comdb *CompressedDB) ReadSeq(
 			fmt.Errorf("Tried to seek to offset %d in the compressed "+
 				"database, but seeked to %d instead.", off, newOff)
 	}
-
 	return comdb.ReadNextSeq(coarsedb, orgSeqId)
 }
 
 func (comdb *CompressedDB) ReadNextSeq(
 	coarsedb *CoarseDB, orgSeqId int) (OriginalSeq, error) {
 
 	csvReader := csv.NewReader(comdb.File)
+	csvReader.LazyQuotes = true
 	csvReader.Comma = ','
 	csvReader.FieldsPerRecord = -1
 
 	record, err := csvReader.Read()
-	if err != nil {
+	if err == io.EOF && len(record) == 0 {
+		return OriginalSeq{}, fmt.Errorf("[csv reader]: id out of range")
+	} else if err != nil && err != io.EOF {
 		return OriginalSeq{}, fmt.Errorf("[csv reader]: %s", err)
 	}
 

diff --git a/seq.go b/seq.go
@@ -6,7 +6,7 @@ import (
 	"strings"
 	"sync"
 
-	"code.google.com/p/biogo/seq"
+	"github.com/TuftsBCB/seq"
 )
 
 // SeqIdentity computes the sequence identity of two byte slices.
@@ -91,10 +91,10 @@ func newSeq(id int, name string, residues []byte) *sequence {
 	}
 }
 
-// newBiogoSeq creates a new *sequence value from biogo's Seq type, and ensures
-// that all residues in the sequence are upper cased.
-func newBiogoSeq(id int, s *seq.Seq) *sequence {
-	return newSeq(id, s.ID, s.Seq)
+// newFastaSeq creates a new *sequence value from seq's Sequence type, and
+// ensures that all residues in the sequence are upper cased.
+func newFastaSeq(id int, s seq.Sequence) *sequence {
+	return newSeq(id, s.Name, s.Bytes())
 }
 
 // newSubSequence returns a new *sequence value that corresponds to a
@@ -110,9 +110,13 @@ func (seq *sequence) newSubSequence(start, end uint) *sequence {
 	return s
 }
 
-// BiogoSeq returns a new *seq.Seq from biogo.
-func (s *sequence) BiogoSeq() *seq.Seq {
-	return seq.New(s.Name, s.Residues, nil)
+// FastaSeq returns a new seq.Sequence from TuftsBCB/seq.
+func (s *sequence) FastaSeq() seq.Sequence {
+	rs := make([]seq.Residue, len(s.Residues))
+	for i := range s.Residues {
+		rs[i] = seq.Residue(s.Residues[i])
+	}
+	return seq.Sequence{Name: s.Name, Residues: rs}
+}
 
 // Len returns the number of residues in this sequence.
@@ -149,8 +153,8 @@ func NewCoarseSeq(id int, name string, residues []byte) *CoarseSeq {
 	}
 }
 
-func NewBiogoCoarseSeq(id int, seq *seq.Seq) *CoarseSeq {
-	return NewCoarseSeq(id, seq.ID, seq.Seq)
+func NewFastaCoarseSeq(id int, s seq.Sequence) *CoarseSeq {
+	return NewCoarseSeq(id, s.Name, s.Bytes())
 }
 
 func (rseq *CoarseSeq) NewSubSequence(start, end uint) *CoarseSeq {
@@ -188,8 +192,8 @@ func NewOriginalSeq(id int, name string, residues []byte) *OriginalSeq {
 	return &OriginalSeq{sequence: newSeq(id, name, residues)}
 }
 
-func NewBiogoOriginalSeq(id int, seq *seq.Seq) *OriginalSeq {
-	return &OriginalSeq{sequence: newBiogoSeq(id, seq)}
+func NewFastaOriginalSeq(id int, s seq.Sequence) *OriginalSeq {
+	return &OriginalSeq{sequence: newFastaSeq(id, s)}
 }
 
 func (oseq *OriginalSeq) NewSubSequence(start, end uint) *OriginalSeq {