In [367]:
# Allow src folder to be imported from this notebook
import sys
from pathlib import Path

# The notebook lives one level below the project root; expose that root on
# sys.path (once) so that `src` can be imported from here.
project_root = Path.cwd().parents[0]
module_path = str(project_root)
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)

In [368]:
from src.graph.bandage_labels_from_gfa import bandage_labels
from Bio import AlignIO
from collections import defaultdict

# Base name of the MSA under study; it is interpolated into the file paths
# built in this notebook.
NAME_MSA = "coli27-86"
# FASTA file holding the alignment, relative to the notebook's working directory.
path_msa = f"../msas-didelot/{NAME_MSA}.fa"
# Parse the alignment with Biopython; rows are indexed below as msa[i].
msa=AlignIO.read(path_msa,"fasta")


In [381]:
def load_gfa(path):
    """Read the segments (S) and links (L) of a tab-separated GFA file.

    Parameters
    ----------
    path : str or path-like
        Location of the GFA file.

    Returns
    -------
    nodes : dict
        ``{segment_id (int): {"label": upper-cased segment sequence}}``.
    edges : list
        ``(from_id, to_id)`` tuples, one per L record, in file order.
    paths : dict
        Always empty: no record type fills it in this function.
    in_nodes : defaultdict(list)
        Predecessor ids of every node that is the target of some link.
    out_nodes : defaultdict(list)
        Successor ids of every node that is the origin of some link.

    Notes
    -----
    Segment ids are converted with ``int``; the orientation columns of L
    records are not read. Records whose first field is neither ``S`` nor
    ``L`` are skipped.
    """
    nodes = {}
    edges = []
    paths = {}
    in_nodes = defaultdict(list)
    out_nodes = defaultdict(list)

    with open(path, "r") as handle:
        for raw in handle:
            fields = raw.replace("\n", "").split("\t")
            record_type = fields[0]

            if record_type == "S":
                # segment record: id, then sequence
                segment_id = int(fields[1])
                segment_label = fields[2].upper()
                nodes[segment_id] = {"label": segment_label}
            elif record_type == "L":
                # link record: origin id in column 1, target id in column 3
                tail, head = int(fields[1]), int(fields[3])
                in_nodes[head].append(tail)
                out_nodes[tail].append(head)
                edges.append((tail, head))

    return nodes, edges, paths, in_nodes, out_nodes

# Graph to analyse. The file name is derived from NAME_MSA so that the GFA that
# is loaded and the CSV written next to it always refer to the same alignment
# (previously the GFA name was hardcoded while the CSV name followed NAME_MSA).
path_gfa = f"../output-didelot/output-pandora/{NAME_MSA}.gfa" # make prg
# path_gfa = f"../experiment-didelot/gfa/{NAME_MSA}.gfa" # pangeblock /home/avila/pangeblocks/experiment-didelot/gfa/toyexample.gfa
nodes, edges, paths, in_nodes, out_nodes = load_gfa(path_gfa)

# NOTE(review): presumably writes a Bandage-compatible label table for the
# graph to the given CSV path -- confirm against src.graph.bandage_labels_from_gfa.
bandage_labels(path_gfa, f"../output-didelot/output-pandora/{NAME_MSA}.csv")

___

In [385]:
# Endpoints of the walk through the graph.
#   toy example        : source_node = 90, sink_node = 100
#   make_prg coli27-86 : the values below
source_node = 0
sink_node = 975

# Query sequence: one row of the MSA (toy alternative: seq = "CGATGA")
seq = msa[26].seq

# Search bookkeeping, seeded with the source node:
#   _visited         -> per-node boolean flag
#   _current_pos_seq -> offset in `seq` reached when arriving at a node, i.e.
#                       how much of `seq` the path spelt so far has matched
#   nodes_to_visit   -> stack of nodes still to expand
_visited = defaultdict(bool, {source_node: True})
_current_pos_seq = defaultdict(int, {source_node: 0})
nodes_to_visit = [source_node]

In [386]:
# Depth-first search for a walk from source_node to sink_node whose node labels
# spell `seq`. Nodes labelled "*" consume nothing from the sequence.
#
# The offset reached in `seq` is kept on a stack parallel to nodes_to_visit.
# Storing it only in _current_pos_seq (one slot per node) loses information
# when a node is pushed again through another path before its first entry is
# popped: both entries would resume from the later offset and a valid path
# could be missed.
#
# NOTE(review): reaching sink_node is reported as success without checking that
# all of `seq` has been consumed -- confirm this is the intended criterion.
node_id = -1 # reset node_id
path_found = False

# Offsets for whatever is already on the stack (just the source on a fresh run).
positions_to_visit = [_current_pos_seq[pending] for pending in nodes_to_visit]
# (node, offset) states already scheduled; each one needs exploring only once.
seen_states = set(zip(nodes_to_visit, positions_to_visit))

print(node_id, sink_node)
while nodes_to_visit and not path_found:
    print(nodes_to_visit)
    current_node = nodes_to_visit.pop()
    current_pos_seq = positions_to_visit.pop()

    for node_id in out_nodes[current_node]:

        if node_id == sink_node:
            path_found = True
            break # path exists

        label = nodes[node_id]["label"].upper() # string node
        subseq = str(seq[current_pos_seq:current_pos_seq+len(label)]).upper() # string sequence

        if label == "*": # case make_prg: nothing is consumed from the sequence
            next_pos_seq = current_pos_seq
        elif subseq == label: # there is a match
            print(subseq, label)
            next_pos_seq = current_pos_seq + len(label)
        else: # no match
            # kept for inspection only; the search itself does not read this flag
            _visited[node_id] = True
            continue

        state = (node_id, next_pos_seq)
        if state in seen_states:
            continue
        seen_states.add(state)

        # last offset at which node_id was reached (for inspection in later cells)
        _current_pos_seq[node_id] = next_pos_seq
        nodes_to_visit.append(node_id)
        positions_to_visit.append(next_pos_seq)

print(path_found)

-1 975
[0]
GTGATTGGCGATCGCG GTGATTGGCGATCGCG
[1]
CGACCGA CGACCGA
[4]
[5]
C C
[6]
ATTCAACTGGC ATTCAACTGGC
[8]
G G
[9]
[11]
GAAAACATGGG GAAAACATGGG
[17]
[18, 48]
[18, 49]
CATTAA CATTAA
[18, 51]
[18]
CATTAA CATTAA
[19]
TGGTTTACGCTA TGGTTTACGCTA
[22]
CGACCGTGAAATCCTGAACTGGCCG CGACCGTGAAATCCTGAACTGGCCG
[23]
ATGATTGG ATGATTGG
[32]
TGAGCAACTCACT TGAGCAACTCACT
[33]
AAACGAGACCGTTACGCCCATGTAGTGCG AAACGAGACCGTTACGCCCATGTAGTGCG
[39]
C C
[40]
AACACCAAAGAGACGCA AACACCAAAGAGACGCA
[42]
AATTGACGTCCAGGTG AATTGACGTCCAGGTG
[43]
[47]
TGGCTGGA TGGCTGGA
[73]
[74]
TCGCGAGGGTGG TCGCGAGGGTGG
[75]
CAGCAAGATTAA CAGCAAGATTAA
[79]
CACCGGC CACCGGC
[80]
[84]
GTTGGCTTCTTTGA GTTGGCTTCTTTGA
[86]
TCAC TCAC
[87]
ATGCTGGATCA ATGCTGGATCA
[90]
GATCGCTACCCAC GATCGCTACCCAC
[91]
GGCGGTTT GGCGGTTT
[96]
[97]
CCGTATGGAAA CCGTATGGAAA
[98]
TCAACGTCAAAGGCGACCTCTATATC TCAACGTCAAAGGCGACCTCTATATC
[101]
GACGATCACCACAC GACGATCACCACAC
[103]
CGTC CGTC
[104]
GAAGATACCGG GAAGATACCGG
[107]
CCTGG CCTGG
[108]
CGCTGGG CGCTGGG
[111]
[112]
G G
[113

In [387]:
# Inspect the offset in `seq` recorded for every node the search has reached.
_current_pos_seq

defaultdict(int,
            {0: 0,
             1: 16,
             4: 23,
             5: 23,
             6: 24,
             8: 35,
             9: 36,
             11: 36,
             17: 47,
             18: 47,
             48: 47,
             49: 47,
             51: 53,
             19: 53,
             22: 65,
             23: 90,
             32: 98,
             33: 111,
             39: 140,
             40: 141,
             42: 158,
             43: 174,
             47: 174,
             73: 182,
             74: 182,
             75: 194,
             79: 206,
             80: 213,
             84: 213,
             86: 227,
             87: 231,
             90: 242,
             91: 255,
             96: 263,
             97: 263,
             98: 274,
             101: 300,
             103: 314,
             104: 318,
             107: 329,
             108: 334,
             111: 341,
             112: 341,
             113: 342,
             116: 356,
         

In [374]:
# Peek at positions 10..15 of the query sequence, upper-cased.
seq[10:16].upper()

Seq('ATCGCG')

In [376]:
# Show the node label compared last and the slice of `seq` it was compared with.
label, subseq

('CT', 'GT')

In [377]:
# Show the nodes still pending on the search stack (empty once the search is exhausted).
nodes_to_visit

[]

In [380]:
# List the successors of the node that was expanded last.
out_nodes[current_node]

[2157]