Skip to content

Commit

Permalink
Updated to excise the biogo dependency.
Browse files Browse the repository at this point in the history
  • Loading branch information
BurntSushi committed Jul 1, 2013
1 parent b905aea commit 5a5a669
Show file tree
Hide file tree
Showing 7 changed files with 63 additions and 56 deletions.
8 changes: 6 additions & 2 deletions cmd/cablastp-compress/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ var (
ignoredResidues = []byte{'J', 'O', 'U'}

// A default configuration.
dbConf = cablastp.DefaultDBConf
dbConf = &cablastp.DefaultDBConf

// Flags that affect the higher level operation of compression.
// Flags that control algorithmic parameters are stored in `dbConf`.
Expand Down Expand Up @@ -161,7 +161,7 @@ func main() {

// Create a new database for writing. If we're appending, we load
// the coarse database into memory, and setup the database for writing.
db, err := cablastp.NewWriteDB(flagAppend, dbConf, flag.Arg(0))
db, err := cablastp.NewWriteDB(flagAppend, *dbConf, flag.Arg(0))
if err != nil {
fatalf("%s\n", err)
}
Expand All @@ -170,6 +170,7 @@ func main() {
pool := startCompressWorkers(db)
orgSeqId := db.ComDB.NumSequences()
mainQuit := make(chan struct{}, 0)
totalResidues := dbConf.BlastDBSize

// If the process is killed, try to clean up elegantly.
// The idea is to preserve the integrity of the database.
Expand Down Expand Up @@ -203,6 +204,7 @@ func main() {
if readSeq.Err != nil {
log.Fatal(err)
}
totalResidues += readSeq.Seq.Len()
orgSeqId = pool.compress(orgSeqId, readSeq.Seq)
verboseOutput(db, orgSeqId)
if flagMaxSeedsGB > 0 && orgSeqId%10000 == 0 {
Expand All @@ -213,6 +215,8 @@ func main() {
cablastp.Vprintln("\n")
cablastp.Vprintf("Wrote %s.\n", cablastp.FileCompressed)
cablastp.Vprintf("Wrote %s.\n", cablastp.FileIndex)

db.BlastDBSize = totalResidues
cleanup(db, &pool)
}

Expand Down
27 changes: 8 additions & 19 deletions cmd/cablastp-compress/nw.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,30 +2,19 @@ package main

import (
"github.com/BurntSushi/cablastp/blosum"

"code.google.com/p/biogo/align/nw"
"code.google.com/p/biogo/util"
)

var (
nwLookUpP util.CTL
aligner = &nw.Aligner{
Matrix: blosum.Matrix62,
LookUp: nwLookUpP,
GapChar: '-',
}
nwLookUpP map[byte]int
)

// Initialize the alignment lookup table. (i.e., translate ASCII residue
// characters to BLOSUM62 matrix indices.)
func init() {
m := make(map[int]int)
for i, v := range blosum.Alphabet62 {
m[int(v)] = i
nwLookUpP := make(map[byte]int)
for i := 0; i < len(blosum.Alphabet62); i++ {
nwLookUpP[blosum.Alphabet62[i]] = i
}
nwLookUpP = *util.NewCTL(m)

aligner.LookUp = nwLookUpP
}

// appendOne appends a single byte to a byte slice and only allocates if it
Expand Down Expand Up @@ -53,7 +42,7 @@ func appendOne(slice []byte, b byte) []byte {
// programming to only allow a limited number of gaps proportional to the
// length of the larger of rseq and oseq.
func nwAlign(rseq, oseq []byte, mem *memory) [2][]byte {
gap := len(aligner.Matrix) - 1
gap := len(blosum.Matrix62) - 1
r, c := len(rseq)+1, len(oseq)+1
off := 0

Expand All @@ -75,9 +64,9 @@ func nwAlign(rseq, oseq []byte, mem *memory) [2][]byte {
}

var sdiag, sup, sleft, rVal, oVal int
valToCode := aligner.LookUp.ValueToCode
gapChar := aligner.GapChar
matrix := aligner.Matrix
valToCode := nwLookUpP
gapChar := byte('-')
matrix := blosum.Matrix62

var i2, i3 int
for i := 1; i < r; i++ {
Expand Down
20 changes: 12 additions & 8 deletions cmd/cablastp-decompress/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,7 @@ import (
"runtime"
"runtime/pprof"

"code.google.com/p/biogo/io/seqio/fasta"
"code.google.com/p/biogo/seq"
"github.com/TuftsBCB/io/fasta"

"github.com/BurntSushi/cablastp"
)
Expand Down Expand Up @@ -59,7 +58,8 @@ func main() {
if err != nil {
fatalf("Could not write to '%s': %s\n", flag.Arg(1), err)
}
fastaWriter := fasta.NewWriter(outFasta, 60)
fastaWriter := fasta.NewWriter(outFasta)
fastaWriter.Asterisk = true

// Create a new database for writing. If we're appending, we load
// the coarse database into memory, and setup the database for writing.
Expand All @@ -80,16 +80,20 @@ func main() {

numSeqs := db.ComDB.NumSequences()
for orgSeqId := 0; orgSeqId < numSeqs; orgSeqId++ {
oseq, err := db.ComDB.ReadNextSeq(db.CoarseDB, orgSeqId)
oseq, err := db.ComDB.ReadSeq(db.CoarseDB, orgSeqId)
if err != nil {
fatalf("%s\n", err)
fatalf("Error reading seq id '%d': %s\n", orgSeqId, err)
}
if err := fastaWriter.Write(oseq.FastaSeq()); err != nil {
cablastp.Vprintf("Error writing seq '%s': %s\n", oseq.Name, err)
}
fastaWriter.Write(
seq.New(oseq.Name, append(oseq.Residues, '*'), nil))
}

cleanup(db)
if err = fastaWriter.Close(); err != nil {
if err = fastaWriter.Flush(); err != nil {
fatalf("%s\n", err)
}
if err = outFasta.Close(); err != nil {
fatalf("%s\n", err)
}
}
Expand Down
2 changes: 1 addition & 1 deletion dbconf.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ var DefaultDBConf = DBConf{
SavePlain: false,
ReadOnly: true,
BlastMakeBlastDB: "makeblastdb",
BlastDBSize: 20000000,
BlastDBSize: 0,
}

func LoadDBConf(r io.Reader) (conf DBConf, err error) {
Expand Down
24 changes: 14 additions & 10 deletions fasta.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@ package cablastp

import (
"io"
"os"

"code.google.com/p/biogo/io/seqio/fasta"
"github.com/TuftsBCB/io/fasta"
)

// ReadOriginalSeq is the value sent over `chan ReadOriginalSeq` when a new
Expand All @@ -15,17 +16,20 @@ type ReadOriginalSeq struct {

// ReadOriginalSeqs reads a FASTA formatted file and returns a channel that
// each new sequence is sent to.
func ReadOriginalSeqs(fileName string,
ignore []byte) (chan ReadOriginalSeq, error) {

reader, err := fasta.NewReaderName(fileName)
func ReadOriginalSeqs(
fileName string,
ignore []byte,
) (chan ReadOriginalSeq, error) {
f, err := os.Open(fileName)
if err != nil {
return nil, err
}

reader := fasta.NewReader(f)
seqChan := make(chan ReadOriginalSeq, 200)
go func() {
for i := 0; true; i++ {
seq, err := reader.Read()
sequence, err := reader.Read()
if err == io.EOF {
close(seqChan)
break
Expand All @@ -38,16 +42,16 @@ func ReadOriginalSeqs(fileName string,
close(seqChan)
break
}
for i, residue := range seq.Seq {
for i, residue := range sequence.Residues {
for _, toignore := range ignore {
if toignore == residue {
seq.Seq[i] = 'X'
if toignore == byte(residue) {
sequence.Residues[i] = 'X'
break
}
}
}
seqChan <- ReadOriginalSeq{
Seq: NewBiogoOriginalSeq(i, seq),
Seq: NewFastaOriginalSeq(i, sequence),
Err: nil,
}
}
Expand Down
10 changes: 6 additions & 4 deletions io.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import (
"strconv"
"time"

"code.google.com/p/biogo/io/seqio/fasta"
"github.com/TuftsBCB/io/fasta"
)

func (coarsedb *CoarseDB) readFasta() error {
Expand All @@ -27,7 +27,7 @@ func (coarsedb *CoarseDB) readFasta() error {
if err != nil {
return err
}
coarsedb.Seqs = append(coarsedb.Seqs, NewBiogoCoarseSeq(i, seq))
coarsedb.Seqs = append(coarsedb.Seqs, NewFastaCoarseSeq(i, seq))
}
coarsedb.seqsRead = len(coarsedb.Seqs)

Expand Down Expand Up @@ -332,19 +332,21 @@ func (comdb *CompressedDB) ReadSeq(
fmt.Errorf("Tried to seek to offset %d in the compressed "+
"database, but seeked to %d instead.", off, newOff)
}

return comdb.ReadNextSeq(coarsedb, orgSeqId)
}

func (comdb *CompressedDB) ReadNextSeq(
coarsedb *CoarseDB, orgSeqId int) (OriginalSeq, error) {

csvReader := csv.NewReader(comdb.File)
csvReader.LazyQuotes = true
csvReader.Comma = ','
csvReader.FieldsPerRecord = -1

record, err := csvReader.Read()
if err != nil {
if err == io.EOF && len(record) == 0 {
return OriginalSeq{}, fmt.Errorf("[csv reader]: id out of range")
} else if err != nil && err != io.EOF {
return OriginalSeq{}, fmt.Errorf("[csv reader]: %s", err)
}

Expand Down
28 changes: 16 additions & 12 deletions seq.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import (
"strings"
"sync"

"code.google.com/p/biogo/seq"
"github.com/TuftsBCB/seq"
)

// SeqIdentity computes the sequence identity of two byte slices.
Expand Down Expand Up @@ -91,10 +91,10 @@ func newSeq(id int, name string, residues []byte) *sequence {
}
}

// newBiogoSeq creates a new *sequence value from biogo's Seq type, and ensures
// that all residues in the sequence are upper cased.
func newBiogoSeq(id int, s *seq.Seq) *sequence {
return newSeq(id, s.ID, s.Seq)
// newFastaSeq builds a *sequence from a seq.Sequence (TuftsBCB/seq) by passing
// the sequence's name and its residues, as bytes, to newSeq.
// NOTE(review): the upper-casing of residues mentioned in the previous
// comment would happen inside newSeq, which is not visible here — confirm.
func newFastaSeq(id int, s seq.Sequence) *sequence {
	name := s.Name
	residues := s.Bytes()
	return newSeq(id, name, residues)
}

// newSubSequence returns a new *sequence value that corresponds to a
Expand All @@ -110,9 +110,13 @@ func (seq *sequence) newSubSequence(start, end uint) *sequence {
return s
}

// BiogoSeq returns a new *seq.Seq from biogo.
func (s *sequence) BiogoSeq() *seq.Seq {
return seq.New(s.Name, s.Residues, nil)
// FastaSeq converts this sequence into a seq.Sequence from TuftsBCB/seq.
//
// Each residue byte is converted to a seq.Residue and stored in a freshly
// allocated slice, so the returned value does not share the backing array of
// s.Residues.
func (s *sequence) FastaSeq() seq.Sequence {
	rs := make([]seq.Residue, len(s.Residues))
	for i, r := range s.Residues {
		rs[i] = seq.Residue(r)
	}
	// Keyed fields keep this literal valid if seq.Sequence gains or reorders
	// fields, and satisfy go vet's check on unkeyed imported-struct literals.
	return seq.Sequence{Name: s.Name, Residues: rs}
}

// Len returns the number of residues in this sequence.
Expand Down Expand Up @@ -149,8 +153,8 @@ func NewCoarseSeq(id int, name string, residues []byte) *CoarseSeq {
}
}

func NewBiogoCoarseSeq(id int, seq *seq.Seq) *CoarseSeq {
return NewCoarseSeq(id, seq.ID, seq.Seq)
// NewFastaCoarseSeq creates a *CoarseSeq with the given id from a
// seq.Sequence, using the sequence's name and its residues as bytes.
func NewFastaCoarseSeq(id int, s seq.Sequence) *CoarseSeq {
	name := s.Name
	residues := s.Bytes()
	return NewCoarseSeq(id, name, residues)
}

func (rseq *CoarseSeq) NewSubSequence(start, end uint) *CoarseSeq {
Expand Down Expand Up @@ -188,8 +192,8 @@ func NewOriginalSeq(id int, name string, residues []byte) *OriginalSeq {
return &OriginalSeq{sequence: newSeq(id, name, residues)}
}

func NewBiogoOriginalSeq(id int, seq *seq.Seq) *OriginalSeq {
return &OriginalSeq{sequence: newBiogoSeq(id, seq)}
// NewFastaOriginalSeq creates an *OriginalSeq with the given id whose
// embedded sequence is built from a seq.Sequence via newFastaSeq.
func NewFastaOriginalSeq(id int, s seq.Sequence) *OriginalSeq {
	inner := newFastaSeq(id, s)
	return &OriginalSeq{sequence: inner}
}

func (oseq *OriginalSeq) NewSubSequence(start, end uint) *OriginalSeq {
Expand Down

0 comments on commit 5a5a669

Please sign in to comment.