From aec8232dd7725cdbb99a038ebe0c99f26922f31a Mon Sep 17 00:00:00 2001 From: Kevin Schneider Date: Mon, 10 May 2021 11:45:53 +0200 Subject: [PATCH] Rework Tagged Sequence and Phylogenetic Tree types/modules --- BioFSharp.sln | 104 +++++++++++ src/BioFSharp.IO/Clustal.fs | 5 +- src/BioFSharp.IO/ClustalOWrapper.fs | 2 +- src/BioFSharp.IO/FSIPrinters.fs | 3 +- src/BioFSharp.IO/FastA.fs | 22 ++- src/BioFSharp.IO/Newick.fs | 22 +-- src/BioFSharp/BioFSharp.fsproj | 1 + src/BioFSharp/PhylTree.fs | 163 +++++++++++------- src/BioFSharp/TaggedSequence.fs | 34 ++-- .../BioFSharp.Tests/BioFSharp/BioItemTests.fs | 8 +- .../BioFSharp/PhylTreeTests.fs | 53 +++--- 11 files changed, 288 insertions(+), 129 deletions(-) diff --git a/BioFSharp.sln b/BioFSharp.sln index 7b1ac072..6a09616e 100644 --- a/BioFSharp.sln +++ b/BioFSharp.sln @@ -26,6 +26,44 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = ".build", ".build", "{352487 EndProjectSection EndProject Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "docs", "docs", "{236C409B-8B38-4393-8E23-4E6844EA97FD}" + ProjectSection(SolutionItems) = preProject + docs\_template.html = docs\_template.html + docs\_template.ipynb = docs\_template.ipynb + docs\Alignment.fsx = docs\Alignment.fsx + docs\AminoProperties.fsx = docs\AminoProperties.fsx + docs\BioCollections.fsx = docs\BioCollections.fsx + docs\BioContainers.fsx = docs\BioContainers.fsx + docs\BioContainers_TargetP.fsx = docs\BioContainers_TargetP.fsx + docs\BioContainersDesignGuide.fsx = docs\BioContainersDesignGuide.fsx + docs\BioDB.fsx = docs\BioDB.fsx + docs\BioID.fsx = docs\BioID.fsx + docs\BioItem.fsx = docs\BioItem.fsx + docs\BioTools-tmhmm.fsx = docs\BioTools-tmhmm.fsx + docs\BlastWrapper.fsx = docs\BlastWrapper.fsx + docs\Clustal.fsx = docs\Clustal.fsx + docs\ClustalOWrapper.fsx = docs\ClustalOWrapper.fsx + docs\CSV.fsx = docs\CSV.fsx + docs\FastA.fsx = docs\FastA.fsx + docs\FastQ.fsx = docs\FastQ.fsx + docs\Formula.fsx = docs\Formula.fsx + docs\FSIPrinters.fsx = 
docs\FSIPrinters.fsx + docs\GenBank.fsx = docs\GenBank.fsx + docs\GFF3.fsx = docs\GFF3.fsx + docs\GSEA.fsx = docs\GSEA.fsx + docs\index.fsx = docs\index.fsx + docs\Introduction.fsx = docs\Introduction.fsx + docs\MAF.fsx = docs\MAF.fsx + docs\MoleculeFinding.fsx = docs\MoleculeFinding.fsx + docs\MotiveSearch.fsx = docs\MotiveSearch.fsx + docs\Newick.fsx = docs\Newick.fsx + docs\Obo.fsx = docs\Obo.fsx + docs\PetideClassification.fsx = docs\PetideClassification.fsx + docs\Readers.fsx = docs\Readers.fsx + docs\SOFT.fsx = docs\SOFT.fsx + docs\StringMatching.fsx = docs\StringMatching.fsx + docs\tutorial.fsx = docs\tutorial.fsx + docs\WebLogo.fsx = docs\WebLogo.fsx + EndProjectSection EndProject Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = ".ci", ".ci", "{A5C98CA2-8C64-4684-BF9A-6D76033BF822}" EndProject @@ -34,6 +72,68 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "actions", "actions", "{C28E .github\workflows\build-test.yml = .github\workflows\build-test.yml EndProjectSection EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "img", "img", "{13D91212-A78F-4643-8F5C-F168D309D9E0}" + ProjectSection(SolutionItems) = preProject + docs\img\BioContainers_Overview.png = docs\img\BioContainers_Overview.png + docs\img\Core.png = docs\img\Core.png + docs\img\favicon.ico = docs\img\favicon.ico + docs\img\GFF3.png = docs\img\GFF3.png + docs\img\logo-template.pdn = docs\img\logo-template.pdn + docs\img\logo.png = docs\img\logo.png + docs\img\Logo.svg = docs\img\Logo.svg + docs\img\Logo_large.png = docs\img\Logo_large.png + docs\img\Logo_large.svg = docs\img\Logo_large.svg + docs\img\MakeBlastDBParams.png = docs\img\MakeBlastDBParams.png + docs\img\Nucleotides.png = docs\img\Nucleotides.png + docs\img\Nucleotides.svg = docs\img\Nucleotides.svg + docs\img\release-notes.md = docs\img\release-notes.md + docs\img\SourceCode.png = docs\img\SourceCode.png + docs\img\SourceCode1.png = docs\img\SourceCode1.png + docs\img\TG1.jpg = docs\img\TG1.jpg + 
docs\img\TG1.png = docs\img\TG1.png + docs\img\Tree.png = docs\img\Tree.png + EndProjectSection +EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "reference", "reference", "{86538483-B249-41CA-A1C7-21F00EA314A2}" + ProjectSection(SolutionItems) = preProject + docs\reference\_template.html = docs\reference\_template.html + EndProjectSection +EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "content", "content", "{4D7E91D8-E01D-40AE-9DCB-B82754E4ECA2}" + ProjectSection(SolutionItems) = preProject + docs\content\fsdocs-custom.css = docs\content\fsdocs-custom.css + EndProjectSection +EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "data", "data", "{5E7A59A6-CB05-4496-99D2-E5C6810819B9}" + ProjectSection(SolutionItems) = preProject + docs\data\alignment.maf = docs\data\alignment.maf + docs\data\blastTestOutput.csv = docs\data\blastTestOutput.csv + docs\data\Chlamy_Cp.fastA = docs\data\Chlamy_Cp.fastA + docs\data\Chlamy_Cp.fastA.gz = docs\data\Chlamy_Cp.fastA.gz + docs\data\Chlamy_Cp.fastA.phr = docs\data\Chlamy_Cp.fastA.phr + docs\data\Chlamy_Cp.fastA.pin = docs\data\Chlamy_Cp.fastA.pin + docs\data\Chlamy_Cp.fastA.psq = docs\data\Chlamy_Cp.fastA.psq + docs\data\clustalExample.asn = docs\data\clustalExample.asn + docs\data\clustalOutputExample.asn = docs\data\clustalOutputExample.asn + docs\data\example.mgf = docs\data\example.mgf + docs\data\FastQtest.fastq = docs\data\FastQtest.fastq + docs\data\gff3Example.gff = docs\data\gff3Example.gff + docs\data\GPL15922_family.soft = docs\data\GPL15922_family.soft + docs\data\GSE71469_family.soft = docs\data\GSE71469_family.soft + docs\data\irisData.csv = docs\data\irisData.csv + docs\data\ms.obo = docs\data\ms.obo + docs\data\ms1Example.mgf = docs\data\ms1Example.mgf + docs\data\ms1ExampleN15.mgf = docs\data\ms1ExampleN15.mgf + docs\data\ms2Example.mgf = docs\data\ms2Example.mgf + docs\data\ms2ExampleN15.mgf = docs\data\ms2ExampleN15.mgf + docs\data\outputTree.txt = 
docs\data\outputTree.txt + docs\data\Psi-MS.obo = docs\data\Psi-MS.obo + docs\data\sequence.gb = docs\data\sequence.gb + docs\data\Sequence_Ontology_Terms_2_5_3.txt = docs\data\Sequence_Ontology_Terms_2_5_3.txt + docs\data\testTerm.obo = docs\data\testTerm.obo + docs\data\treeExample.txt = docs\data\treeExample.txt + EndProjectSection +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -123,6 +223,10 @@ Global GlobalSection(NestedProjects) = preSolution {F9E53B05-C5B7-4F90-A446-045079B9013D} = {653DC881-9E1C-490D-A7F2-1CFE5D78D3FE} {C28E101C-CFED-47EE-923F-63299DC60E95} = {A5C98CA2-8C64-4684-BF9A-6D76033BF822} + {13D91212-A78F-4643-8F5C-F168D309D9E0} = {236C409B-8B38-4393-8E23-4E6844EA97FD} + {86538483-B249-41CA-A1C7-21F00EA314A2} = {236C409B-8B38-4393-8E23-4E6844EA97FD} + {4D7E91D8-E01D-40AE-9DCB-B82754E4ECA2} = {236C409B-8B38-4393-8E23-4E6844EA97FD} + {5E7A59A6-CB05-4496-99D2-E5C6810819B9} = {236C409B-8B38-4393-8E23-4E6844EA97FD} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {0A2ED902-6B1D-4BA1-966D-A325EE1BB479} diff --git a/src/BioFSharp.IO/Clustal.fs b/src/BioFSharp.IO/Clustal.fs index 47be5bf6..c5ba5de8 100644 --- a/src/BioFSharp.IO/Clustal.fs +++ b/src/BioFSharp.IO/Clustal.fs @@ -4,7 +4,6 @@ open FSharpAux open System.Text open System.IO open BioFSharp -open BioFSharp.TaggedSequence open BioFSharp.BioID ///Contains functions for reading clustal alignment files @@ -92,7 +91,7 @@ module Clustal = Sequences = [ for kv in sequences do - yield createTaggedSequence kv.Key kv.Value + yield TaggedSequence.create kv.Key kv.Value ] } @@ -133,7 +132,7 @@ module Clustal = let s = sb.ToString() sb.Clear() |> ignore s - createTaggedSequence "" alignment.MetaData.ConservationInfo + TaggedSequence.create "" alignment.MetaData.ConservationInfo |> Seq.appendSingleton alignment.Sequences |> Seq.map (fun x -> addEmpty x.Tag, diff --git a/src/BioFSharp.IO/ClustalOWrapper.fs 
b/src/BioFSharp.IO/ClustalOWrapper.fs index 81f0a95f..5ab180de 100644 --- a/src/BioFSharp.IO/ClustalOWrapper.fs +++ b/src/BioFSharp.IO/ClustalOWrapper.fs @@ -3,7 +3,6 @@ ///Wrapper and its helpers for Clustal Omega multiple alignment tools module ClustalOWrapper = - open BioFSharp.TaggedSequence open FSharpAux ///Contains modifier parameter type for Clustal Omega wrapper @@ -248,6 +247,7 @@ module ClustalOWrapper = open System open System.Diagnostics open System.IO + open BioFSharp open BioFSharp.BioID let private tsToFasta (ts:TaggedSequence) = diff --git a/src/BioFSharp.IO/FSIPrinters.fs b/src/BioFSharp.IO/FSIPrinters.fs index bbff7604..8d8a062c 100644 --- a/src/BioFSharp.IO/FSIPrinters.fs +++ b/src/BioFSharp.IO/FSIPrinters.fs @@ -6,7 +6,6 @@ module FSIPrinters = open BioFSharp open BioFSharp.Alignment open BioFSharp.BioID - open BioFSharp.TaggedSequence open BioFSharp.IO open BioFSharp.IO.Clustal open BioFSharp.IO.GFF3 @@ -174,7 +173,7 @@ module FSIPrinters = let s = sb.ToString() sb.Clear() |> ignore s - createTaggedSequence "" alignment.MetaData.ConservationInfo + TaggedSequence.create "" alignment.MetaData.ConservationInfo |> Seq.appendSingleton alignment.Sequences |> Seq.map (fun x -> addEmpty x.Tag, diff --git a/src/BioFSharp.IO/FastA.fs b/src/BioFSharp.IO/FastA.fs index 467e3d7a..d3a6f294 100644 --- a/src/BioFSharp.IO/FastA.fs +++ b/src/BioFSharp.IO/FastA.fs @@ -3,7 +3,7 @@ open System open FSharpAux open FSharpAux.IO - + module FastA = open System.IO @@ -13,6 +13,8 @@ module FastA = Sequence : 'a; } + let toTaggedSequence (fsa:FastaItem<'S>) = + BioFSharp.TaggedSequence.create fsa.Header fsa.Sequence /// Creates with header line and sequence. let createFastaItem header sequence = @@ -58,6 +60,9 @@ module FastA = /// Writes FastaItem to stream. Converter determines type of sequence by converting type -> char + /// The passed stream stays open and is not disposed after writing to it. + /// If you want to reuse the stream (e.g. 
you are not writing to a file stream but a memory stream that gets used afterwards) + /// you have to reset the position with `stream.Seek(0L, SeekOrigin.Begin)` let writeToStream (toString:'T -> char) (stream:Stream) (data:seq>>) = let toChunks (w:System.IO.StreamWriter) (length:int) (source: seq<'T>) = use ie = source.GetEnumerator() @@ -83,22 +88,25 @@ module FastA = w.Flush() loop () - use sWriter = new System.IO.StreamWriter(stream,Text.UTF8Encoding(false,true),1024,true) + use sWriter = new System.IO.StreamWriter(stream,Text.UTF8Encoding(false,true),4096,true) data |> Seq.iter (fun (i:FastaItem<_>) -> - sWriter.WriteLine(">" + i.Header) - toChunks sWriter 80 i.Sequence) + sWriter.WriteLine(">" + i.Header) + toChunks sWriter 80 i.Sequence + ) /// Writes FastaItem to file. Converter determines type of sequence by converting type -> char. If file already exists the data is overwritten. let write (toString:'T -> char) (filePath:string) (data:seq>>) = - use file = new FileStream(filePath,FileMode.Create) - writeToStream toString file data + use file = new FileStream(filePath,FileMode.Create) + writeToStream toString file data + file.Dispose() /// Writes FastaItem to file. Converter determines type of sequence by converting type -> char. If file already exists the data is appended. let writeAndAppend (toString:'T -> char) (filePath:string) (data:seq>>) = - use file = new FileStream(filePath,FileMode.Append) + use file = new FileStream(filePath,FileMode.Append) writeToStream toString file data + file.Dispose() /// Converts FastaItem to string. 
Converter determines type of sequence by converting type -> char let toString (toString:'T -> char) (data:seq>>) = diff --git a/src/BioFSharp.IO/Newick.fs b/src/BioFSharp.IO/Newick.fs index 1c8f9827..e41407ed 100644 --- a/src/BioFSharp.IO/Newick.fs +++ b/src/BioFSharp.IO/Newick.fs @@ -56,12 +56,12 @@ module Newick = (* Parser *) ///Parses a seq of tokens to a PhylTree - let private parser (converter : string -> 'Distance) (input:seq) : PhylTree.Node = + let private parser (converter : string -> 'Distance) (input:seq) : PhylogeneticTree = let en = input.GetEnumerator() let sbID,sbDist = StringBuilder(),StringBuilder() ///Reduces tree to a tuple of its info - let cutDown (tree:PhylTree.Node) = - match tree with | PhylTree.Branch (x,y) -> x + let cutDown (tree:PhylogeneticTree) = + match tree with | PhylogeneticTree.Branch (x,y) -> x let mutable iOpen,iClosed = 0,0 let rec loop() = ///This function is called when a new branch is opened, it recursively creates a list of nodes until the corresponding ')' is reached @@ -83,7 +83,7 @@ module Newick = iOpen <- iOpen + 1 let children = createBranch [] let treeInfo,isFinished = loop() - PhylTree.Branch(cutDown treeInfo, children),isFinished + PhylogeneticTree.Branch(cutDown treeInfo, children),isFinished //distancevalue is added to stringbuilder, iteration is continued | Distance c -> sbDist.Append(c) |> ignore @@ -97,17 +97,17 @@ module Newick = iClosed <- iClosed + 1 let dist,id = sbDist.ToString(),sbID.ToString() (sbDist.Clear(),sbID.Clear()) |> ignore - PhylTree.Branch((id,converter dist),[]),true + PhylogeneticTree.Branch((id,converter dist),[]),true //name is obtained from stringbuilder, distance is obtained from stringbuilder and converted; tree is built from these info and branchclosed boolean false is returned | NextNode -> let dist,id = sbDist.ToString(),sbID.ToString() (sbDist.Clear(),sbID.Clear()) |> ignore - PhylTree.Branch((id,converter dist),[]),false + PhylogeneticTree.Branch((id,converter dist),[]),false 
//name is obtained from stringbuilder, distance is obtained from stringbuilder and converted; tree is built from these infos and branchclosed boolean true is returned | EndTree -> let dist,id = sbDist.ToString(),sbID.ToString() (sbDist.Clear(),sbID.Clear()) |> ignore - PhylTree.Branch((id,converter dist),[]),true + PhylogeneticTree.Branch((id,converter dist),[]),true //ignored | Separator -> loop() @@ -117,7 +117,7 @@ module Newick = fst (loop()) ///Returns a PhylTree of file. Converter is used to create a distancevalue of a string - let ofFile (converter : string -> 'Distance) (path: string) : PhylTree.Node = + let ofFile (converter : string -> 'Distance) (path: string) : PhylogeneticTree = path |> readFile |> tokenizer @@ -126,14 +126,14 @@ module Newick = //---Writer---// ///Creates a NewickTree file of PhylTree. nodeConverter is used to split the distanceInfo and the name of a node, because they are parsed separately. First result of the tuple is name, second is distance. - let toFile (nodeConverter: 'T -> string * string) (path:string) (tree: PhylTree.Node<'T>) = + let toFile (nodeConverter: 'T -> string * string) (path:string) (tree: PhylogeneticTree<'T>) = let rec loop tree = seq { match tree with - | PhylTree.Branch ((nodeInfo),[]) -> + | PhylogeneticTree.Branch ((nodeInfo),[]) -> let name,distance = nodeConverter nodeInfo yield name + ":" + (distance) - | PhylTree.Branch ((nodeInfo), nl) -> + | PhylogeneticTree.Branch ((nodeInfo), nl) -> let nodeInfo = match nodeConverter nodeInfo with | (name, "") -> name diff --git a/src/BioFSharp/BioFSharp.fsproj b/src/BioFSharp/BioFSharp.fsproj index e9f25d38..59a6458e 100644 --- a/src/BioFSharp/BioFSharp.fsproj +++ b/src/BioFSharp/BioFSharp.fsproj @@ -107,6 +107,7 @@ + diff --git a/src/BioFSharp/PhylTree.fs b/src/BioFSharp/PhylTree.fs index d35628d8..8a1e267f 100644 --- a/src/BioFSharp/PhylTree.fs +++ b/src/BioFSharp/PhylTree.fs @@ -1,76 +1,119 @@ namespace BioFSharp +open FSharp.Stats.ML.Unsupervised 
-///Phyologenetic Tree and functions -module PhylTree = - - ///Recursive type representing a phylogenetic tree - type Node<'n> = - ///Can be internal node or leaf node, depending on wether the list is empty or not. Match accordingly - | Branch of 'n * List> +/// Recursive representation of a phylogenetic tree +type PhylogeneticTree<'T> = + ///Can be internal node or leaf node, depending on whether the list is empty or not. Match accordingly + | Branch of 'T * List> + + /// converts the input hierarchical clustering to a phylogenetic tree and conserves the distance information. + /// In contrast to the clustering result, the distance value of a Branch represents the distance to its Parent, + /// not the distance that all children have to this Branch. + static member ofHierarchicalCluster (branchTag:'T) (distanceConverter: float -> 'Distance) (hCluster:HierarchicalClustering.Cluster<'T>) : PhylogeneticTree<'T * 'Distance>= + let rec loop distance (c: HierarchicalClustering.Cluster<'T>) = + match c with + | HierarchicalClustering.Cluster.Node (cIndex, distance, lCount, left, right) -> + PhylogeneticTree.Branch ((branchTag, distanceConverter distance), [loop distance left; loop distance right]) + | HierarchicalClustering.Cluster.Leaf (id, lCount, tag) -> PhylogeneticTree.Branch((tag, distanceConverter distance),[]) + loop 0. hCluster + + /// Performs hierarchical clustering of the input TaggedSequences using the provided distance function and linker. Returns the result as a Phylogenetic tree. + /// a tag to give the inferred common ancestor branches (these are not tagged in contrast to the input sequence.) + /// a converter function for the distance between nodes of the tree. Usually, a conversion to a string makes sense for downstream conversion to Newick format + /// a function that determines the distance between two sequences e.g. 
evolutionary distance based on a substitution model + /// the linker function to join clusters with + /// the input TaggedSequences + static member ofTaggedSequencesWithLinker (branchTag:'T) (distanceConverter: float -> 'Distance) (distanceFunction: seq<'S> -> seq<'S> -> float) linker (sequences: seq>) = + sequences + |> HierarchicalClustering.generate + (fun a b -> distanceFunction a.Sequence b.Sequence) + linker + |> PhylogeneticTree.ofHierarchicalCluster (TaggedSequence.create branchTag Seq.empty) distanceConverter + + + /// Performs hierarchical clustering of the input TaggedSequences using the provided distance function. Returns the result as a Phylogenetic tree. + /// a function that determines the distance between two sequences e.g. evolutionary distance based on a substitution model + /// the input TaggedSequences + static member ofTaggedBioSequences (distanceFunction: seq<#IBioItem> -> seq<#IBioItem> -> float) (sequences: seq>) = + sequences + |> PhylogeneticTree.ofTaggedSequencesWithLinker + "Ancestor" + string + distanceFunction + HierarchicalClustering.Linker.upgmaLwLinker ///Iterates trough a tree and transforms all nodes by applying a mapping function on them - let rec map (mapping: Node<'n> -> 't) (tree:Node<'n>) = - let treeMapper tree = map mapping tree - match tree with - | Branch (_,nl) -> Branch (mapping tree, List.map treeMapper nl) + static member map (mapping: PhylogeneticTree<'T> -> 't) (tree:PhylogeneticTree<'T>) = + let rec loop (mapping: PhylogeneticTree<'T> -> 't) (tree:PhylogeneticTree<'T>) = + let treeMapper tree = loop mapping tree + match tree with + | Branch (_,nl) -> Branch (mapping tree, List.map treeMapper nl) + loop mapping tree ///Iterates trough a tree and performs a action on every node - let rec iter (action: Node<'n> -> unit) (tree:Node<'n>) = - let treeIterer tree = iter action tree - match tree with - | Branch (_,nl) -> - action tree - List.iter treeIterer nl + static member iter (action: PhylogeneticTree<'T> -> unit) 
(tree:PhylogeneticTree<'T>) = + let rec loop (action: PhylogeneticTree<'T> -> unit) (tree:PhylogeneticTree<'T>) = + let treeIterer tree = loop action tree + match tree with + | Branch (_,nl) -> + action tree + List.iter treeIterer nl + loop action tree ///Iterates through a tree and accumulates a value by applying the folder to it and every node of the tree - let rec fold (acc: 'State) (folder: 'State -> Node<'n> -> 'State) (tree:Node<'n>) = - match tree with - | Branch (_,nl) -> - folder - (List.fold - (fun acc n -> fold acc folder n) - acc - nl) - tree + static member fold (acc: 'State) (folder: 'State -> PhylogeneticTree<'T> -> 'State) (tree:PhylogeneticTree<'T>) = + let rec loop (acc: 'State) (folder: 'State -> PhylogeneticTree<'T> -> 'State) (tree:PhylogeneticTree<'T>) = + match tree with + | Branch (_,nl) -> + folder + (List.fold (fun acc n -> loop acc folder n) acc nl) + tree + loop acc folder tree ///Iterates through a tree and accumulates a value by applying the folder to it and every mapped node of the tree - let rec mapFold (acc: 'State) (mapping: Node<'n> -> 't) (folder: 'State -> 't -> 'State) (tree:Node<'n>) = - match tree with - | Branch (_,nl) -> - folder - (List.fold - (fun acc n -> mapFold acc mapping folder n) - acc - nl) - (mapping tree) + static member mapFold (acc: 'State) (mapping: PhylogeneticTree<'T> -> 't) (folder: 'State -> 't -> 'State) (tree:PhylogeneticTree<'T>) = + let rec loop (acc: 'State) (mapping: PhylogeneticTree<'T> -> 't) (folder: 'State -> 't -> 'State) (tree:PhylogeneticTree<'T>) = + match tree with + | Branch (_,nl) -> + folder + (List.fold + (fun acc n -> loop acc mapping folder n) + acc + nl) + (mapping tree) + loop acc mapping folder tree /// Returns the count of nodes containing no subtrees - let countLeafs (tree:Node<'n>) = - fold 0 (fun x y -> x + (match y with | Branch (n,[]) -> 1 | _ -> 0)) tree + static member countLeafs (tree:PhylogeneticTree<'T>) = + PhylogeneticTree.fold 0 (fun x y -> x + 
(match y with | Branch (n,[]) -> 1 | _ -> 0)) tree ///Returns the most top level element for which the condition returns true - let rec tryGetNodeBy (condition: Node<'n> -> bool) (tree:Node<'n>) = - let rec loopList nl = - match nl with - | n :: tail -> - match tryGetNodeBy condition n with - | Some x -> Some x - | None -> loopList tail - | [] -> None - match tree with - | Branch _ when condition tree -> - Some tree - | Branch (_,nl) -> loopList nl + static member tryGetNodeBy (condition: PhylogeneticTree<'T> -> bool) (tree:PhylogeneticTree<'T>) = + let rec loop (condition: PhylogeneticTree<'T> -> bool) (tree:PhylogeneticTree<'T>) = + let rec loopList nl = + match nl with + | n :: tail -> + match loop condition n with + | Some x -> Some x + | None -> loopList tail + | [] -> None + match tree with + | Branch _ when condition tree -> + Some tree + | Branch (_,nl) -> loopList nl + loop condition tree ///Adds a child Node to the nodes for which the condition returns true - let rec addChildToNodes (condition: Node<'n> -> bool) (child: Node<'n>) (tree:Node<'n>) : Node<'n>= - let mapper = addChildToNodes (condition: Node<'n> -> bool) (child: Node<'n>) - let rec loop tree= - match tree with - | Branch (n,nl) when condition tree -> - loop (Branch(n,(child :: (List.map mapper nl)))) - | Branch (_,[]) -> - tree - | Branch (n,nl) -> - loop (Branch(n,List.map mapper nl)) - loop tree \ No newline at end of file + static member addChildToNodes (condition: PhylogeneticTree<'T> -> bool) (child: PhylogeneticTree<'T>) (tree:PhylogeneticTree<'T>) : PhylogeneticTree<'T>= + let rec loop (condition: PhylogeneticTree<'T> -> bool) (child: PhylogeneticTree<'T>) (tree:PhylogeneticTree<'T>) : PhylogeneticTree<'T>= + let mapper = loop (condition: PhylogeneticTree<'T> -> bool) (child: PhylogeneticTree<'T>) + let rec loopInner tree= + match tree with + | Branch (n,nl) when condition tree -> + loopInner (Branch(n,(child :: (List.map mapper nl)))) + | Branch (_,[]) -> + tree + | Branch (n,nl) -> + 
loopInner (Branch(n,List.map mapper nl)) + loopInner tree + loop condition child tree \ No newline at end of file diff --git a/src/BioFSharp/TaggedSequence.fs b/src/BioFSharp/TaggedSequence.fs index 328a9fbf..b3d08099 100644 --- a/src/BioFSharp/TaggedSequence.fs +++ b/src/BioFSharp/TaggedSequence.fs @@ -2,21 +2,27 @@ open System -module TaggedSequence = +/// Record of a sequence and its tag +type TaggedSequence<'T,'S> = + { + Tag: 'T; + Sequence: seq<'S> + } + with - /// record of a sequence and its tag - type TaggedSequence<'a,'b> ={ - Tag: 'a; - Sequence: seq<'b>} + /// Creates a tagged sequence + static member create (tag:'T) (sequence:seq<'S>) = + {Tag = tag; Sequence = sequence} - /// Creates a tagged sequence - let createTaggedSequence tag sequence = - {Tag = tag; Sequence = sequence} + /// Maps tag of tagged sequence + static member mapTag (mapping:'T->'U) (ts:TaggedSequence<'T,'S>) : TaggedSequence<'U,'S> = + TaggedSequence.create + (mapping ts.Tag) + ts.Sequence - /// Maps tag of tagged sequence - let mapTag (mapping:'a->'c) (ts:TaggedSequence<'a,'b>) = - {Tag = mapping ts.Tag; Sequence = ts.Sequence} + /// Maps sequence of tagged sequence + static member mapSequence (mapping:seq<'S>->seq<'M>) (ts:TaggedSequence<'T,'S>) = + TaggedSequence.create + ts.Tag + (mapping ts.Sequence) - /// Maps sequence of tagged sequence - let mapSequence (mapping:seq<'b>->seq<'c>) (ts:TaggedSequence<'a,'b>) = - {Tag = ts.Tag; Sequence = mapping ts.Sequence} \ No newline at end of file diff --git a/tests/BioFSharp.Tests/BioFSharp/BioItemTests.fs b/tests/BioFSharp.Tests/BioFSharp/BioItemTests.fs index 9531a930..58daee9b 100644 --- a/tests/BioFSharp.Tests/BioFSharp/BioItemTests.fs +++ b/tests/BioFSharp.Tests/BioFSharp/BioItemTests.fs @@ -467,7 +467,7 @@ let testIBioItem = Expect.isFasterThan f1 f2 "" ] ] -open BioFSharp.TaggedSequence +open BioFSharp [] let testTaggedSequence = @@ -479,17 +479,17 @@ let testTaggedSequence = yield testCase "create" <| fun () -> - let ts' = 
createTaggedSequence t s + let ts' = TaggedSequence.create t s Expect.equal ts' ts "Record initialization via function differs from initialization via record expression. Check parameter order of 'create'" yield testCase "test_mapTag" <| fun () -> let t' = ts.Tag.ToLower() - let ts' = mapTag (fun (t:string) -> t.ToLower() ) ts + let ts' = TaggedSequence.mapTag (fun (t:string) -> t.ToLower() ) ts Expect.equal ts'.Tag t' "'mapTag' does not alter the value of the field 'Tag' as expected." yield testCase "test_mapSequence" <| fun () -> let t' = ts.Sequence |> Seq.map ((*) (-1)) - let ts' = mapSequence (Seq.map ((*) (-1))) ts + let ts' = TaggedSequence.mapSequence (Seq.map ((*) (-1))) ts Expect.sequenceEqual ts'.Sequence t' "'mapSequence' does not alter the value of the field 'Sequence' as expected." ] open IsotopicDistribution diff --git a/tests/BioFSharp.Tests/BioFSharp/PhylTreeTests.fs b/tests/BioFSharp.Tests/BioFSharp/PhylTreeTests.fs index 67b4d275..337fb1b9 100644 --- a/tests/BioFSharp.Tests/BioFSharp/PhylTreeTests.fs +++ b/tests/BioFSharp.Tests/BioFSharp/PhylTreeTests.fs @@ -1,46 +1,45 @@ module PhylTreeTests open BioFSharp -open PhylTree open BioList open Nucleotides open Expecto -let testPhylTree_oneGen = Node.Branch("1", []) +let testPhylTree_oneGen = Branch("1", []) let testPhylTree_threeGens_string = - Node.Branch("ACTG",[ - Node.Branch("ACTT", [ - Node.Branch("ACTC", []) + Branch("ACTG",[ + Branch("ACTT", [ + Branch("ACTC", []) ]) - Node.Branch("ACGG", [ - Node.Branch("ACCG", []) + Branch("ACGG", [ + Branch("ACCG", []) ]) - Node.Branch("GCTG", [ - Node.Branch("TCTG", []) + Branch("GCTG", [ + Branch("TCTG", []) ]) ]) let testPhylTree_threeGens_BioList = - Node.Branch(BioList.ofNucleotideString "ACTG",[ - Node.Branch(BioList.ofNucleotideString "ACTT", [ - Node.Branch(BioList.ofNucleotideString "ACTC", []) + Branch(BioList.ofNucleotideString "ACTG",[ + Branch(BioList.ofNucleotideString "ACTT", [ + Branch(BioList.ofNucleotideString "ACTC", []) ]) - 
Node.Branch(BioList.ofNucleotideString "ACGG", [ - Node.Branch(BioList.ofNucleotideString "ACCG", []) + Branch(BioList.ofNucleotideString "ACGG", [ + Branch(BioList.ofNucleotideString "ACCG", []) ]) - Node.Branch(BioList.ofNucleotideString "GCTG", [ - Node.Branch(BioList.ofNucleotideString "TCTG", []) + Branch(BioList.ofNucleotideString "GCTG", [ + Branch(BioList.ofNucleotideString "TCTG", []) ]) ]) -let testFoldFun (acc: string) (node: Node<'n>) = - match node with +let testFoldFun (acc: string) (tree: PhylogeneticTree<'n>) = + match tree with Branch(s, nl) -> (s + "; " + acc) -let testMappingFun (n: Node<'n>) = - match n with +let testMappingFun (tree: PhylogeneticTree<'n>) = + match tree with Branch(s, nl) -> s |> BioList.ofNucleotideString [] @@ -48,16 +47,16 @@ let phylTreeTests = testList "PhylTree" [ testCase "map" (fun() -> Expect.equal - (PhylTree.map testMappingFun testPhylTree_threeGens_string) + (PhylogeneticTree.map testMappingFun testPhylTree_threeGens_string) testPhylTree_threeGens_BioList "PhylTree.map did not return correct Node<'t>" ) testCase "iter" (fun () -> let mutable testList = [] - let testIterFun (node: Node<'n>) = + let testIterFun (node: PhylogeneticTree<'n>) = match node with Branch (s, nl) -> do (testList <- testList @ [s]) - PhylTree.iter testIterFun testPhylTree_threeGens_string + PhylogeneticTree.iter testIterFun testPhylTree_threeGens_string Expect.equal testList ["ACTG"; "ACTT"; "ACTC"; "ACGG"; "ACCG"; "GCTG"; "TCTG"] @@ -66,22 +65,22 @@ let phylTreeTests = testCase "fold" (fun () -> let testAcc = "" Expect.equal - (PhylTree.fold testAcc testFoldFun testPhylTree_threeGens_string) + (PhylogeneticTree.fold testAcc testFoldFun testPhylTree_threeGens_string) "ACTG; GCTG; TCTG; ACGG; ACCG; ACTT; ACTC; " "PhylTree.fold did not return correct accumulated value." 
) testCase "countLeafs" (fun () -> Expect.equal - (testPhylTree_threeGens_string |> PhylTree.countLeafs) + (testPhylTree_threeGens_string |> PhylogeneticTree.countLeafs) 3 "PhylTree.countLeafs did not return the correct number of leaves" ) testCase "tryGetNodeBy" (fun () -> - let testConditionFun (node: Node<'n>) = + let testConditionFun (node: PhylogeneticTree<'n>) = match node with Branch(s, _) -> s = "ACTG" Expect.equal - (PhylTree.tryGetNodeBy testConditionFun testPhylTree_threeGens_string) + (PhylogeneticTree.tryGetNodeBy testConditionFun testPhylTree_threeGens_string) (Some testPhylTree_threeGens_string) "PhylTree.tryGetNodeBy did not return the correct Node<'n> for the given condition." )