Skip to content

Commit

Permalink
Rework Tagged Sequence and Phylogenetic Tree types/modules
Browse files Browse the repository at this point in the history
  • Loading branch information
kMutagene committed May 10, 2021
1 parent beb4158 commit aec8232
Show file tree
Hide file tree
Showing 11 changed files with 288 additions and 129 deletions.
104 changes: 104 additions & 0 deletions BioFSharp.sln
Expand Up @@ -26,6 +26,44 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = ".build", ".build", "{352487
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "docs", "docs", "{236C409B-8B38-4393-8E23-4E6844EA97FD}"
ProjectSection(SolutionItems) = preProject
docs\_template.html = docs\_template.html
docs\_template.ipynb = docs\_template.ipynb
docs\Alignment.fsx = docs\Alignment.fsx
docs\AminoProperties.fsx = docs\AminoProperties.fsx
docs\BioCollections.fsx = docs\BioCollections.fsx
docs\BioContainers.fsx = docs\BioContainers.fsx
docs\BioContainers_TargetP.fsx = docs\BioContainers_TargetP.fsx
docs\BioContainersDesignGuide.fsx = docs\BioContainersDesignGuide.fsx
docs\BioDB.fsx = docs\BioDB.fsx
docs\BioID.fsx = docs\BioID.fsx
docs\BioItem.fsx = docs\BioItem.fsx
docs\BioTools-tmhmm.fsx = docs\BioTools-tmhmm.fsx
docs\BlastWrapper.fsx = docs\BlastWrapper.fsx
docs\Clustal.fsx = docs\Clustal.fsx
docs\ClustalOWrapper.fsx = docs\ClustalOWrapper.fsx
docs\CSV.fsx = docs\CSV.fsx
docs\FastA.fsx = docs\FastA.fsx
docs\FastQ.fsx = docs\FastQ.fsx
docs\Formula.fsx = docs\Formula.fsx
docs\FSIPrinters.fsx = docs\FSIPrinters.fsx
docs\GenBank.fsx = docs\GenBank.fsx
docs\GFF3.fsx = docs\GFF3.fsx
docs\GSEA.fsx = docs\GSEA.fsx
docs\index.fsx = docs\index.fsx
docs\Introduction.fsx = docs\Introduction.fsx
docs\MAF.fsx = docs\MAF.fsx
docs\MoleculeFinding.fsx = docs\MoleculeFinding.fsx
docs\MotiveSearch.fsx = docs\MotiveSearch.fsx
docs\Newick.fsx = docs\Newick.fsx
docs\Obo.fsx = docs\Obo.fsx
docs\PetideClassification.fsx = docs\PetideClassification.fsx
docs\Readers.fsx = docs\Readers.fsx
docs\SOFT.fsx = docs\SOFT.fsx
docs\StringMatching.fsx = docs\StringMatching.fsx
docs\tutorial.fsx = docs\tutorial.fsx
docs\WebLogo.fsx = docs\WebLogo.fsx
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = ".ci", ".ci", "{A5C98CA2-8C64-4684-BF9A-6D76033BF822}"
EndProject
Expand All @@ -34,6 +72,68 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "actions", "actions", "{C28E
.github\workflows\build-test.yml = .github\workflows\build-test.yml
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "img", "img", "{13D91212-A78F-4643-8F5C-F168D309D9E0}"
ProjectSection(SolutionItems) = preProject
docs\img\BioContainers_Overview.png = docs\img\BioContainers_Overview.png
docs\img\Core.png = docs\img\Core.png
docs\img\favicon.ico = docs\img\favicon.ico
docs\img\GFF3.png = docs\img\GFF3.png
docs\img\logo-template.pdn = docs\img\logo-template.pdn
docs\img\logo.png = docs\img\logo.png
docs\img\Logo.svg = docs\img\Logo.svg
docs\img\Logo_large.png = docs\img\Logo_large.png
docs\img\Logo_large.svg = docs\img\Logo_large.svg
docs\img\MakeBlastDBParams.png = docs\img\MakeBlastDBParams.png
docs\img\Nucleotides.png = docs\img\Nucleotides.png
docs\img\Nucleotides.svg = docs\img\Nucleotides.svg
docs\img\release-notes.md = docs\img\release-notes.md
docs\img\SourceCode.png = docs\img\SourceCode.png
docs\img\SourceCode1.png = docs\img\SourceCode1.png
docs\img\TG1.jpg = docs\img\TG1.jpg
docs\img\TG1.png = docs\img\TG1.png
docs\img\Tree.png = docs\img\Tree.png
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "reference", "reference", "{86538483-B249-41CA-A1C7-21F00EA314A2}"
ProjectSection(SolutionItems) = preProject
docs\reference\_template.html = docs\reference\_template.html
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "content", "content", "{4D7E91D8-E01D-40AE-9DCB-B82754E4ECA2}"
ProjectSection(SolutionItems) = preProject
docs\content\fsdocs-custom.css = docs\content\fsdocs-custom.css
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "data", "data", "{5E7A59A6-CB05-4496-99D2-E5C6810819B9}"
ProjectSection(SolutionItems) = preProject
docs\data\alignment.maf = docs\data\alignment.maf
docs\data\blastTestOutput.csv = docs\data\blastTestOutput.csv
docs\data\Chlamy_Cp.fastA = docs\data\Chlamy_Cp.fastA
docs\data\Chlamy_Cp.fastA.gz = docs\data\Chlamy_Cp.fastA.gz
docs\data\Chlamy_Cp.fastA.phr = docs\data\Chlamy_Cp.fastA.phr
docs\data\Chlamy_Cp.fastA.pin = docs\data\Chlamy_Cp.fastA.pin
docs\data\Chlamy_Cp.fastA.psq = docs\data\Chlamy_Cp.fastA.psq
docs\data\clustalExample.asn = docs\data\clustalExample.asn
docs\data\clustalOutputExample.asn = docs\data\clustalOutputExample.asn
docs\data\example.mgf = docs\data\example.mgf
docs\data\FastQtest.fastq = docs\data\FastQtest.fastq
docs\data\gff3Example.gff = docs\data\gff3Example.gff
docs\data\GPL15922_family.soft = docs\data\GPL15922_family.soft
docs\data\GSE71469_family.soft = docs\data\GSE71469_family.soft
docs\data\irisData.csv = docs\data\irisData.csv
docs\data\ms.obo = docs\data\ms.obo
docs\data\ms1Example.mgf = docs\data\ms1Example.mgf
docs\data\ms1ExampleN15.mgf = docs\data\ms1ExampleN15.mgf
docs\data\ms2Example.mgf = docs\data\ms2Example.mgf
docs\data\ms2ExampleN15.mgf = docs\data\ms2ExampleN15.mgf
docs\data\outputTree.txt = docs\data\outputTree.txt
docs\data\Psi-MS.obo = docs\data\Psi-MS.obo
docs\data\sequence.gb = docs\data\sequence.gb
docs\data\Sequence_Ontology_Terms_2_5_3.txt = docs\data\Sequence_Ontology_Terms_2_5_3.txt
docs\data\testTerm.obo = docs\data\testTerm.obo
docs\data\treeExample.txt = docs\data\treeExample.txt
EndProjectSection
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Expand Down Expand Up @@ -123,6 +223,10 @@ Global
GlobalSection(NestedProjects) = preSolution
{F9E53B05-C5B7-4F90-A446-045079B9013D} = {653DC881-9E1C-490D-A7F2-1CFE5D78D3FE}
{C28E101C-CFED-47EE-923F-63299DC60E95} = {A5C98CA2-8C64-4684-BF9A-6D76033BF822}
{13D91212-A78F-4643-8F5C-F168D309D9E0} = {236C409B-8B38-4393-8E23-4E6844EA97FD}
{86538483-B249-41CA-A1C7-21F00EA314A2} = {236C409B-8B38-4393-8E23-4E6844EA97FD}
{4D7E91D8-E01D-40AE-9DCB-B82754E4ECA2} = {236C409B-8B38-4393-8E23-4E6844EA97FD}
{5E7A59A6-CB05-4496-99D2-E5C6810819B9} = {236C409B-8B38-4393-8E23-4E6844EA97FD}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {0A2ED902-6B1D-4BA1-966D-A325EE1BB479}
Expand Down
5 changes: 2 additions & 3 deletions src/BioFSharp.IO/Clustal.fs
Expand Up @@ -4,7 +4,6 @@ open FSharpAux
open System.Text
open System.IO
open BioFSharp
open BioFSharp.TaggedSequence
open BioFSharp.BioID

///Contains functions for reading clustal alignment files
Expand Down Expand Up @@ -92,7 +91,7 @@ module Clustal =
Sequences =
[
for kv in sequences do
yield createTaggedSequence kv.Key kv.Value
yield TaggedSequence.create kv.Key kv.Value
]
}

Expand Down Expand Up @@ -133,7 +132,7 @@ module Clustal =
let s = sb.ToString()
sb.Clear() |> ignore
s
createTaggedSequence "" alignment.MetaData.ConservationInfo
TaggedSequence.create "" alignment.MetaData.ConservationInfo
|> Seq.appendSingleton alignment.Sequences
|> Seq.map (fun x ->
addEmpty x.Tag,
Expand Down
2 changes: 1 addition & 1 deletion src/BioFSharp.IO/ClustalOWrapper.fs
Expand Up @@ -3,7 +3,6 @@
///Wrapper and its helpers for Clustal Omega multiple alignment tools
module ClustalOWrapper =

open BioFSharp.TaggedSequence
open FSharpAux

///Contains modifier parameter type for Clustal Omega wrapper
Expand Down Expand Up @@ -248,6 +247,7 @@ module ClustalOWrapper =
open System
open System.Diagnostics
open System.IO
open BioFSharp
open BioFSharp.BioID

let private tsToFasta (ts:TaggedSequence<string,char>) =
Expand Down
3 changes: 1 addition & 2 deletions src/BioFSharp.IO/FSIPrinters.fs
Expand Up @@ -6,7 +6,6 @@ module FSIPrinters =
open BioFSharp
open BioFSharp.Alignment
open BioFSharp.BioID
open BioFSharp.TaggedSequence
open BioFSharp.IO
open BioFSharp.IO.Clustal
open BioFSharp.IO.GFF3
Expand Down Expand Up @@ -174,7 +173,7 @@ module FSIPrinters =
let s = sb.ToString()
sb.Clear() |> ignore
s
createTaggedSequence "" alignment.MetaData.ConservationInfo
TaggedSequence.create "" alignment.MetaData.ConservationInfo
|> Seq.appendSingleton alignment.Sequences
|> Seq.map (fun x ->
addEmpty x.Tag,
Expand Down
22 changes: 15 additions & 7 deletions src/BioFSharp.IO/FastA.fs
Expand Up @@ -3,7 +3,7 @@
open System
open FSharpAux
open FSharpAux.IO

module FastA =
open System.IO

Expand All @@ -13,6 +13,8 @@ module FastA =
Sequence : 'a;
}

/// Converts a FastaItem into a TaggedSequence by handing the item's header (first argument)
/// and its sequence (second argument) to BioFSharp.TaggedSequence.create.
let toTaggedSequence (fsa:FastaItem<'S>) =
    (fsa.Header, fsa.Sequence)
    ||> BioFSharp.TaggedSequence.create

/// Creates a FastaItem from a header line and a sequence.
let createFastaItem header sequence =
Expand Down Expand Up @@ -58,6 +60,9 @@ module FastA =


/// Writes FastaItem to stream. Converter determines type of sequence by converting type -> char
/// The passed stream stays open and is not disposed after writing to it.
/// If you want to reuse the stream (e.g. you are not writing to a file stream but a memory stream that gets used afterwards)
/// you have to reset the position with `stream.Seek(0L, SeekOrigin.Begin)`
let writeToStream (toString:'T -> char) (stream:Stream) (data:seq<FastaItem<#seq<'T>>>) =
let toChunks (w:System.IO.StreamWriter) (length:int) (source: seq<'T>) =
use ie = source.GetEnumerator()
Expand All @@ -83,22 +88,25 @@ module FastA =
w.Flush()

loop ()
use sWriter = new System.IO.StreamWriter(stream,Text.UTF8Encoding(false,true),1024,true)
use sWriter = new System.IO.StreamWriter(stream,Text.UTF8Encoding(false,true),4096,true)
data
|> Seq.iter (fun (i:FastaItem<_>) ->
sWriter.WriteLine(">" + i.Header)
toChunks sWriter 80 i.Sequence)
sWriter.WriteLine(">" + i.Header)
toChunks sWriter 80 i.Sequence
)


/// Writes FastaItems to the file at `filePath` by delegating to `writeToStream`.
/// `toString` is the converter from a sequence element to the char representing it.
/// If the file already exists, its content is overwritten (FileMode.Create).
let write (toString:'T -> char) (filePath:string) (data:seq<FastaItem<#seq<'T>>>) =
    // `use` (rather than `let` + a trailing `file.Dispose()`) guarantees the file handle is
    // released even when `writeToStream` raises while enumerating `data`.
    use file = new FileStream(filePath,FileMode.Create)
    writeToStream toString file data

/// Appends FastaItems to the file at `filePath` by delegating to `writeToStream`.
/// `toString` is the converter from a sequence element to the char representing it.
/// If the file already exists, the data is appended to it (FileMode.Append).
let writeAndAppend (toString:'T -> char) (filePath:string) (data:seq<FastaItem<#seq<'T>>>) =
    // `use` (rather than `let` + a trailing `file.Dispose()`) guarantees the file handle is
    // released even when `writeToStream` raises while enumerating `data`.
    use file = new FileStream(filePath,FileMode.Append)
    writeToStream toString file data

/// Converts FastaItem to string. Converter determines type of sequence by converting type -> char
let toString (toString:'T -> char) (data:seq<FastaItem<#seq<'T>>>) =
Expand Down
22 changes: 11 additions & 11 deletions src/BioFSharp.IO/Newick.fs
Expand Up @@ -56,12 +56,12 @@ module Newick =

(* Parser *)
///Parses a seq of tokens to a PhylogeneticTree
let private parser (converter : string -> 'Distance) (input:seq<Token>) : PhylTree.Node<string*'Distance> =
let private parser (converter : string -> 'Distance) (input:seq<Token>) : PhylogeneticTree<string*'Distance> =
let en = input.GetEnumerator()
let sbID,sbDist = StringBuilder(),StringBuilder()
///Reduces tree to a tuple of its info
let cutDown (tree:PhylTree.Node<string*'Distance>) =
match tree with | PhylTree.Branch (x,y) -> x
let cutDown (tree:PhylogeneticTree<string*'Distance>) =
match tree with | PhylogeneticTree.Branch (x,y) -> x
let mutable iOpen,iClosed = 0,0
let rec loop() =
///This function is called when a new branch is opened, it recursively creates a list of nodes until the corresponding ')' is reached
Expand All @@ -83,7 +83,7 @@ module Newick =
iOpen <- iOpen + 1
let children = createBranch []
let treeInfo,isFinished = loop()
PhylTree.Branch(cutDown treeInfo, children),isFinished
PhylogeneticTree.Branch(cutDown treeInfo, children),isFinished
//distancevalue is added to stringbuilder, iteration is continued
| Distance c ->
sbDist.Append(c) |> ignore
Expand All @@ -97,17 +97,17 @@ module Newick =
iClosed <- iClosed + 1
let dist,id = sbDist.ToString(),sbID.ToString()
(sbDist.Clear(),sbID.Clear()) |> ignore
PhylTree.Branch((id,converter dist),[]),true
PhylogeneticTree.Branch((id,converter dist),[]),true
//name and distance are read from their stringbuilders (the distance is converted); a tree is built from this info and returned together with the branch-closed boolean set to false
| NextNode ->
let dist,id = sbDist.ToString(),sbID.ToString()
(sbDist.Clear(),sbID.Clear()) |> ignore
PhylTree.Branch((id,converter dist),[]),false
PhylogeneticTree.Branch((id,converter dist),[]),false
//name and distance are read from their stringbuilders (the distance is converted); a tree is built from this info and returned together with the branch-closed boolean set to true
| EndTree ->
let dist,id = sbDist.ToString(),sbID.ToString()
(sbDist.Clear(),sbID.Clear()) |> ignore
PhylTree.Branch((id,converter dist),[]),true
PhylogeneticTree.Branch((id,converter dist),[]),true
//ignored
| Separator ->
loop()
Expand All @@ -117,7 +117,7 @@ module Newick =
fst (loop())

///Returns the PhylogeneticTree read from the file at the given path. The converter is used to create a distance value from a string
let ofFile (converter : string -> 'Distance) (path: string) : PhylTree.Node<string*'Distance> =
let ofFile (converter : string -> 'Distance) (path: string) : PhylogeneticTree<string*'Distance> =
path
|> readFile
|> tokenizer
Expand All @@ -126,14 +126,14 @@ module Newick =
//---Writer---//

///Creates a Newick tree file from a PhylogeneticTree. nodeConverter is used to split the info of a node into its name and its distance, because they are handled separately. The first element of the resulting tuple is the name, the second is the distance.
let toFile (nodeConverter: 'T -> string * string) (path:string) (tree: PhylTree.Node<'T>) =
let toFile (nodeConverter: 'T -> string * string) (path:string) (tree: PhylogeneticTree<'T>) =
let rec loop tree =
seq {
match tree with
| PhylTree.Branch ((nodeInfo),[]) ->
| PhylogeneticTree.Branch ((nodeInfo),[]) ->
let name,distance = nodeConverter nodeInfo
yield name + ":" + (distance)
| PhylTree.Branch ((nodeInfo), nl) ->
| PhylogeneticTree.Branch ((nodeInfo), nl) ->
let nodeInfo =
match nodeConverter nodeInfo with
| (name, "") -> name
Expand Down
1 change: 1 addition & 0 deletions src/BioFSharp/BioFSharp.fsproj
Expand Up @@ -107,6 +107,7 @@
<None Include="Playground\WorkflowLanguage.fsx" />
</ItemGroup>
<ItemGroup>
<PackageReference Include="FSharp.Stats" Version="0.4.1" />
<PackageReference Include="FSharpAux" Version="1.0.0" />
<PackageReference Include="FSharpAux.IO" Version="1.0.0" />
<PackageReference Include="Microsoft.SourceLink.GitHub" Version="1.0.0" PrivateAssets="All" />
Expand Down

0 comments on commit aec8232

Please sign in to comment.