In [321]:
# Allow src folder to be imported from this notebook
import sys
from pathlib import Path

# Directory one level above the notebook's working directory; it is put on the
# import path so that the `src` package can be imported from here.
module_path = str(Path.cwd().parents[0])
already_importable = any(entry == module_path for entry in sys.path)
if not already_importable:
    sys.path.append(module_path)

In [322]:
from src.graph.bandage_labels_from_gfa import bandage_labels
from Bio import AlignIO
from collections import defaultdict

# Base name of the alignment under study; the same name is reused further down
# to locate the matching GFA file and to name the exported label table.
NAME_MSA = "toyexample"
# Location of the alignment, relative to the notebook's working directory.
path_msa = f"../msas-didelot/{NAME_MSA}.fa"
# Parse the FASTA file as a single multiple sequence alignment.
msa=AlignIO.read(path_msa,"fasta")


In [323]:
def load_gfa(path):
    """Read the segment (S) and link (L) records of a tab-separated GFA file.

    Parameters
    ----------
    path : str or path-like
        Location of the GFA file.

    Returns
    -------
    nodes : dict
        Maps each integer segment id to ``{"label": <upper-cased sequence>}``.
    edges : list of tuple
        One ``(start_node, end_node)`` pair per link record, in file order.
    paths : dict
        Always empty: no record type populates it.
    in_nodes : defaultdict(list)
        For each node, the start nodes of the links that end in it.
    out_nodes : defaultdict(list)
        For each node, the end nodes of the links that start from it.

    Notes
    -----
    Segment ids are converted with ``int`` and therefore must be numeric.
    The orientation columns of link records are not read, and every record
    type other than ``S`` and ``L`` is skipped.
    """
    nodes = {}
    edges = []
    paths = {}
    in_nodes = defaultdict(list)
    out_nodes = defaultdict(list)

    with open(path, "r") as gfa_file:
        for raw_line in gfa_file:
            fields = raw_line.replace("\n", "").split("\t")
            record_type = fields[0]

            if record_type == "S":
                # Segment record: <S> <id> <sequence> ...
                segment_id = int(fields[1])
                segment_label = fields[2].upper()
                nodes[segment_id] = {"label": segment_label}
                continue

            if record_type != "L":
                continue

            # Link record: <L> <from> <from-orient> <to> <to-orient> ...
            tail = int(fields[1])
            head = int(fields[3])
            in_nodes[head].append(tail)
            out_nodes[tail].append(head)
            edges.append((tail, head))

    return nodes, edges, paths, in_nodes, out_nodes

# Alternative input (make_prg graph); uncomment to use it instead of the line below:
# path_gfa = f"../output-didelot/output-pandora/coli27-86.gfa"
path_gfa = f"../experiment-didelot/gfa/{NAME_MSA}.gfa" # pangeblocks graph built for NAME_MSA
# Parse the graph; the traversal cells below read `nodes` and `out_nodes`.
nodes, edges, paths, in_nodes, out_nodes = load_gfa(path_gfa)


# Export the node labels of the graph to a CSV file.
# NOTE(review): presumably this table is meant to be loaded in Bandage - confirm
# against src.graph.bandage_labels_from_gfa. The CSV is written under
# output-pandora even when the graph comes from experiment-didelot.
bandage_labels(path_gfa, f"../output-didelot/output-pandora/{NAME_MSA}.csv")

___

In [326]:
# Query sequence to look for in the graph.
# NOTE(review): the row taken from the MSA is replaced straight away by the
# hard-coded toy sequence, so only "CGATGA" reaches the traversal.
seq = msa[2].seq
seq = "CGATGA"

# Terminal nodes of the graph being traversed (toy example).
# For the make_prg coli27-86 graph use source_node=0 and sink_node=975 instead.
source_node, sink_node = 90, 100

# Per-node bookkeeping, seeded with the source node.
_visited = defaultdict(bool, {source_node: True})
_match = defaultdict(bool)  # whether a node matches the sequence or not
_current_pos_seq = defaultdict(int, {source_node: 0})  # index into seq reached at each node

# Worklist of nodes whose successors still have to be examined.
nodes_to_visit = [source_node]

In [327]:
# Depth-first search for a path source_node -> sink_node whose concatenated
# node labels spell `seq` exactly.
#
# The worklist is rebuilt here instead of consuming a list created by another
# cell, so re-running this cell repeats the search instead of finding an empty
# worklist. Each entry carries its own position in `seq`: a node can be
# reached through different paths after spelling a different number of
# characters, and one shared per-node position would let a later arrival
# overwrite the position of an entry that is still waiting on the stack.
nodes_to_visit = [(source_node, 0)]
path_found = False

node_id = -1 # reset node_id
print(node_id, sink_node)
while nodes_to_visit and not path_found:
    # Show only the node ids, so the trace keeps its original format.
    print([pending_node for pending_node, _ in nodes_to_visit])
    current_node, current_pos_seq = nodes_to_visit.pop()

    for node_id in out_nodes[current_node]:

        if node_id == sink_node:
            # Reaching the sink only counts once the whole sequence has been
            # spelt; otherwise the path covers just a prefix of `seq`.
            # NOTE(review): the sink's own label is never compared (nor is the
            # source's), i.e. both are treated as label-less terminal nodes -
            # confirm this also holds for make_prg graphs.
            if current_pos_seq == len(seq):
                path_found = True
                break # path exists
            continue

        label = nodes[node_id]["label"].upper() # string node
        subseq = str(seq[current_pos_seq:current_pos_seq+len(label)]).upper() # string sequence

        if label == "*": # case make_prg: the position in seq is left unchanged
            _current_pos_seq[node_id] = current_pos_seq
            nodes_to_visit.append((node_id, current_pos_seq))

        elif subseq == label: # there is a match
            print(subseq, label)
            next_pos_seq = current_pos_seq + len(label)
            # Kept for inspection only; the search itself uses the position
            # stored alongside the node on the stack.
            _current_pos_seq[node_id] = next_pos_seq
            nodes_to_visit.append((node_id, next_pos_seq))

        else: # no match
            _visited[node_id] = True

print(path_found)

-1 100
[90]
C C
[5]
GAT GAT
[3]
G G
[0]
A A
[4]
True
