## Check if sequences are spelled in the GFAs

In [1]:
import sys
from pathlib import Path

# Put the parent of the current working directory on the import path
# (presumably where the project-local `utils` package lives -- confirm).
parent_dir = Path.cwd().parents[0]
module_path = str(parent_dir)
if module_path not in sys.path:
    sys.path.append(module_path)

import pandas as pd
from Bio import SeqIO
from utils.load_gfa import load_gfa

## Didelot

In [2]:
# Folder with the input FASTA files, one file per dataset
PATH_SEQS = "/data/didelot"
dir_seqs = Path(PATH_SEQS)
paths_seqs = list(dir_seqs.glob("*.fasta"))

# dataset names are the FASTA file stems, listed alphabetically
names_seqs = sorted(fasta.stem for fasta in paths_seqs)
names_seqs

['coli27-86', 'slpa-basis', 'slpa-real', 'slpa-simu']

### pggb 

In [3]:
PATH_GFA_PGGB = "../didelot-pggb"

# paths to gfa generated by pggb
gfas = list(Path(PATH_GFA_PGGB).rglob("*smooth.fix.gfa"))
seqs_in_gfa_pggb = []

for name_seq in names_seqs:

    # load gfa info: first pggb graph whose path contains the dataset name
    path_gfa = [p for p in gfas if name_seq in str(p)][0]
    nodes, edges, paths = load_gfa(path_gfa)

    # load sequences info: first FASTA whose path contains the dataset name;
    # sequences are upper-cased before being compared with the gfa paths
    path_seq = [seq for seq in paths_seqs if name_seq in str(seq)][0]
    input_seqs = {record.id: str(record.seq).upper() for record in SeqIO.parse(path_seq, "fasta")}

    # Sequences spelled by the gfa paths, collected once per graph so that each
    # membership test below is a hash lookup instead of a scan over all paths.
    # NOTE(review): assumes the values of `paths` are plain strings -- confirm in utils.load_gfa
    spelled_by_paths = set(paths.values())

    # check if sequences are spelt in any of the gfa paths.
    # Deliberately no `paths[seqid]` lookup: it was unused, and a FASTA id that is
    # not a path name in the gfa would abort the whole check with a KeyError
    # instead of being reported as in_gfa=False.
    for seqid, label in input_seqs.items():
        in_gfa = label in spelled_by_paths
        seqs_in_gfa_pggb.append(
            {
                "tool": "pggb",
                "seqid": seqid,
                "in_gfa": in_gfa,
                "gfa_id": name_seq,
                "path_seqs": str(path_seq),
                "path_gfa": str(path_gfa),
            }
        )

### PanPA

In [4]:
PATH_GFA_PANPA = "../didelot-PanPA"

# every gfa generated by PanPA, searched recursively
gfas = list(Path(PATH_GFA_PANPA).rglob("*.gfa"))

seqs_in_gfa_panpa = []
for path_gfa in gfas:

    # dataset name: gfa file name up to its first "."
    name_seq = path_gfa.stem.partition(".")[0]
    nodes, edges, paths = load_gfa(path_gfa)

    # input sequences: first FASTA whose path contains the dataset name,
    # read as {record id: upper-cased sequence}
    candidates = [seq for seq in paths_seqs if name_seq in str(seq)]
    path_seq = candidates[0]
    input_seqs = {}
    for record in SeqIO.parse(path_seq, "fasta"):
        input_seqs[record.id] = str(record.seq).upper()

    # one row per input sequence: is it equal to one of the values in `paths`?
    for seqid, label in input_seqs.items():
        in_gfa = label in paths.values()
        row = dict(
            tool="PanPA",
            seqid=seqid,
            in_gfa=in_gfa,
            gfa_id=name_seq,
            path_seqs=str(path_seq),
            path_gfa=str(path_gfa),
        )
        seqs_in_gfa_panpa.append(row)

### pangeblocks

In [5]:
# PATH_SEQS = "/data/msas-pangeblocks/msa-coli"
# paths_seqs = list(Path(PATH_SEQS).glob("*.fasta"))
# names_seqs = [seq.stem for seq in paths_seqs]
# names_seqs.sort()
# names_seqs

In [6]:
PATH_GFA_PANGEBLOCKS = "../didelot-pangeblocks"

# paths to gfa generated by pangeblocks: only files with "unchop" in their path are checked
# (a previous filter, marked "gfa with indels", additionally required "nodes" and "coli" in the path)
dir_pangeblocks = Path(PATH_GFA_PANGEBLOCKS)
gfas = [p for p in dir_pangeblocks.rglob("*.gfa") if "unchop" in str(p)]
# gfas= [p for p in Path(PATH_GFA_PANGEBLOCKS).rglob("*.gfa") if "gfa-post" in str(p) ] # gfa without indels
# gfas= [p for p in Path(PATH_GFA_PANGEBLOCKS).rglob("*.gfa") if "unchop" in str(p) ]   # unchop by vg

seqs_in_gfa_pangeblocks = []
for path_gfa in gfas:

    # dataset name (gfa file name up to its first ".") and gfa content
    name_seq = path_gfa.stem.partition(".")[0]
    nodes, edges, paths = load_gfa(path_gfa)

    # input sequences: first FASTA whose path contains the dataset name;
    # "-" characters are removed and the sequence upper-cased before comparison
    matching_fastas = [seq for seq in paths_seqs if name_seq in str(seq)]
    path_seq = matching_fastas[0]
    input_seqs = {}
    for record in SeqIO.parse(path_seq, "fasta"):
        input_seqs[record.id] = str(record.seq).replace("-", "").upper()

    # record, for each input sequence, whether it equals one of the values in `paths`
    for seqid, label in input_seqs.items():
        in_gfa = label in paths.values()
        seqs_in_gfa_pangeblocks += [
            {
                "tool": "pangeblocks",
                "seqid": seqid,
                "in_gfa": in_gfa,
                "gfa_id": name_seq,
                "path_seqs": str(path_seq),
                "path_gfa": str(path_gfa),
            }
        ]

In [7]:
# Sanity check: show the pangeblocks records whose sequence was not found among
# the gfa paths (an empty list means every input sequence was found).
[record for record in seqs_in_gfa_pangeblocks if not record["in_gfa"]]

[]

In [8]:
# Pool the per-tool records (pggb, PanPA, pangeblocks) into one list, in that order
data = [*seqs_in_gfa_pggb, *seqs_in_gfa_panpa, *seqs_in_gfa_pangeblocks]

In [9]:
# Write the pooled records to a CSV file in the current working directory
# (default `to_csv` settings, so the DataFrame index is written as well)
df_seqs_in_gfa = pd.DataFrame(data)
df_seqs_in_gfa.to_csv("didelot-seqs_in_gfa.csv")