## Check if sequences are spelled in the GFAs

In [30]:
import sys
from pathlib import Path

# Make the parent of the current working directory importable
# (presumably so that the local `utils` package imported below resolves).
module_path = str(Path.cwd().parents[0])
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

import pandas as pd
from Bio import SeqIO
from utils.load_gfa import load_gfa


# HLA-zoo

In [31]:
# Folder holding one FASTA file (*.fa) per gene
PATH_SEQS = "../sequences/HLA-zoo/"

# FASTA paths as found on disk, and their stems in alphabetical order
paths_seqs = [fasta for fasta in Path(PATH_SEQS).glob("*.fa")]
names_seqs = sorted(fasta.stem for fasta in paths_seqs)
names_seqs

['A-3105',
 'B-3106',
 'C-3107',
 'DMA-3108',
 'DMB-3109',
 'DOA-3111',
 'DOB-3112',
 'DPA1-3113',
 'DPB1-3115',
 'DQA1-3117',
 'DQB1-3119',
 'DRA-3122',
 'DRB1-3123',
 'DRB3-3125',
 'DRB4-3126',
 'DRB5-3127',
 'E-3133',
 'F-3134',
 'G-3135',
 'H-3136',
 'J-3137',
 'K-3138',
 'L-3139',
 'MICA-100507436',
 'MICB-4277',
 'TAP1-6890',
 'TAP2-6891',
 'V-352962']

## pggb

In [32]:
PATH_GFA_PGGB = "../HLA-zoo-pggb/"

# Genes to check for pggb. Only DMA-3108 is analysed here; the restriction is
# kept in its own list so that `names_seqs` (all genes found in the FASTA
# folder) is not overwritten for the rest of the notebook.
# TODO(review): confirm whether the other genes should be checked for pggb too.
names_seqs_pggb = ["DMA-3108"]

# one record per input sequence: is it spelt by some path of the pggb GFA?
seqs_in_gfa_pggb = []
for name_seq in names_seqs_pggb:

    # files matching *smooth.final.gfa under the gene folder; sorted so that
    # the choice below does not depend on filesystem order
    gfas = sorted(Path(PATH_GFA_PGGB).joinpath(name_seq).rglob("*smooth.final.gfa"))
    candidates_gfa = [p for p in gfas if name_seq in str(p)]
    if not candidates_gfa:
        raise FileNotFoundError(
            f"no '*smooth.final.gfa' found for {name_seq} under {PATH_GFA_PGGB}"
        )
    path_gfa = candidates_gfa[0]

    # load gfa info; the values of `paths` are compared with the input
    # sequences below
    nodes, edges, paths = load_gfa(path_gfa)

    # FASTA with the input sequences of this gene (exact match on the file stem)
    candidates_seq = [seq for seq in paths_seqs if seq.stem == name_seq]
    if not candidates_seq:
        raise FileNotFoundError(f"no input FASTA found for {name_seq}")
    path_seq = candidates_seq[0]
    input_seqs = {
        record.id: str(record.seq).upper()
        for record in SeqIO.parse(path_seq, "fasta")
    }

    # a sequence counts as spelt if it equals the label of any GFA path
    for seqid, label in input_seqs.items():
        in_gfa = label in paths.values()
        seqs_in_gfa_pggb.append(
            {
                "tool": "pggb",
                "seqid": seqid,
                "in_gfa": in_gfa,
                "gfa_id": name_seq,
                "path_seqs": str(path_seq),
                "path_gfa": str(path_gfa),
            }
        )

In [33]:
# GFA files sitting directly in the folder of the last gene processed above
# (`name_seq` is the loop variable left over from the previous cell)
list((Path(PATH_GFA_PGGB) / name_seq).glob("*.gfa"))

[PosixPath('../HLA-zoo-pggb/DMA-3108/DMA-3108.fa.gz.8ffb64a.417fcdf.1e4a7df.smooth.final.gfa')]

In [34]:
# per-sequence pggb results as a table
pd.DataFrame(data=seqs_in_gfa_pggb)

Unnamed: 0,tool,seqid,in_gfa,gfa_id,path_seqs,path_gfa
0,pggb,gi|568815592:32948613-32953121,True,DMA-3108,../sequences/HLA-zoo/DMA-3108.fa,../HLA-zoo-pggb/DMA-3108/DMA-3108.fa.gz.8ffb64...
1,pggb,gi|568815454:4253460-4257971,True,DMA-3108,../sequences/HLA-zoo/DMA-3108.fa,../HLA-zoo-pggb/DMA-3108/DMA-3108.fa.gz.8ffb64...
2,pggb,gi|568815529:4360810-4365321,True,DMA-3108,../sequences/HLA-zoo/DMA-3108.fa,../HLA-zoo-pggb/DMA-3108/DMA-3108.fa.gz.8ffb64...
3,pggb,gi|568815551:4192143-4196654,True,DMA-3108,../sequences/HLA-zoo/DMA-3108.fa,../HLA-zoo-pggb/DMA-3108/DMA-3108.fa.gz.8ffb64...
4,pggb,gi|568815561:4367981-4372490,True,DMA-3108,../sequences/HLA-zoo/DMA-3108.fa,../HLA-zoo-pggb/DMA-3108/DMA-3108.fa.gz.8ffb64...
5,pggb,gi|568815564:4247658-4252169,True,DMA-3108,../sequences/HLA-zoo/DMA-3108.fa,../HLA-zoo-pggb/DMA-3108/DMA-3108.fa.gz.8ffb64...
6,pggb,gi|568815567:4142920-4147431,True,DMA-3108,../sequences/HLA-zoo/DMA-3108.fa,../HLA-zoo-pggb/DMA-3108/DMA-3108.fa.gz.8ffb64...
7,pggb,gi|568815569:4347848-4352359,True,DMA-3108,../sequences/HLA-zoo/DMA-3108.fa,../HLA-zoo-pggb/DMA-3108/DMA-3108.fa.gz.8ffb64...
8,pggb,gi|528476637:32917185-32921696,True,DMA-3108,../sequences/HLA-zoo/DMA-3108.fa,../HLA-zoo-pggb/DMA-3108/DMA-3108.fa.gz.8ffb64...
9,pggb,gi|157734152:32658919-32663430,True,DMA-3108,../sequences/HLA-zoo/DMA-3108.fa,../HLA-zoo-pggb/DMA-3108/DMA-3108.fa.gz.8ffb64...


In [35]:
# number of input sequences per pggb GFA, split by whether they are spelt
(
    pd.DataFrame(seqs_in_gfa_pggb)
    .groupby(["path_gfa", "in_gfa"])
    .size()
)

path_gfa                                                                          in_gfa
../HLA-zoo-pggb/DMA-3108/DMA-3108.fa.gz.8ffb64a.417fcdf.1e4a7df.smooth.final.gfa  True      10
dtype: int64

## PanPA

In [36]:
PATH_GFA_PANPA = "../HLA-zoo-PanPA"

# GFAs generated by PanPA. Sorted because directory traversal order is not
# guaranteed and the order of `gfas` becomes the row order of the exported CSV.
gfas = sorted(Path(PATH_GFA_PANPA).rglob("*.gfa"))

# one record per (GFA, input sequence): is the sequence spelt by some GFA path?
seqs_in_gfa_panpa = []
for path_gfa in gfas:

    # the GFA file is named after the gene (e.g. A-3105.gfa)
    name_seq = path_gfa.stem
    nodes, edges, paths = load_gfa(path_gfa)

    # FASTA with the input sequences of the same gene
    candidates_seq = [seq for seq in paths_seqs if name_seq in str(seq)]
    if not candidates_seq:
        raise FileNotFoundError(f"no input FASTA matches GFA {path_gfa}")
    path_seq = candidates_seq[0]
    input_seqs = {
        record.id: str(record.seq).upper()
        for record in SeqIO.parse(path_seq, "fasta")
    }

    # a sequence counts as spelt if it equals the label of any GFA path
    for seqid, label in input_seqs.items():
        in_gfa = label in paths.values()
        seqs_in_gfa_panpa.append(
            {
                "tool": "PanPA",
                "seqid": seqid,
                "in_gfa": in_gfa,
                "gfa_id": name_seq,
                "path_seqs": str(path_seq),
                "path_gfa": str(path_gfa),
            }
        )

In [37]:
# number of input sequences per PanPA GFA, split by whether they are spelt
(
    pd.DataFrame(seqs_in_gfa_panpa)
    .groupby(["path_gfa", "in_gfa"])
    .size()
)

path_gfa                                                          in_gfa
../HLA-zoo-PanPA/output-HLA-zoo-mafft.op1.53-ep0/A-3105.gfa       True      11
../HLA-zoo-PanPA/output-HLA-zoo-mafft.op1.53-ep0/B-3106.gfa       True       8
../HLA-zoo-PanPA/output-HLA-zoo-mafft.op1.53-ep0/C-3107.gfa       True       9
../HLA-zoo-PanPA/output-HLA-zoo-mafft.op1.53-ep0/DMA-3108.gfa     True      10
../HLA-zoo-PanPA/output-HLA-zoo-mafft.op1.53-ep0/DMB-3109.gfa     True      10
                                                                            ..
../HLA-zoo-PanPA/output-HLA-zoo-mafft.op5-ep0/MICA-100507436.gfa  True       8
../HLA-zoo-PanPA/output-HLA-zoo-mafft.op5-ep0/MICB-4277.gfa       True      11
../HLA-zoo-PanPA/output-HLA-zoo-mafft.op5-ep0/TAP1-6890.gfa       True      10
../HLA-zoo-PanPA/output-HLA-zoo-mafft.op5-ep0/TAP2-6891.gfa       True      10
../HLA-zoo-PanPA/output-HLA-zoo-mafft.op5-ep0/V-352962.gfa        True      10
Length: 84, dtype: int64

## pangeblocks

In [38]:
PATH_GFA_PANGEBLOCKS = "../HLA-zoo-pangeblocks"

# GFAs generated by pangeblocks; only those whose path contains "unchop" are
# kept. Sorted because directory traversal order is not guaranteed and the
# order of `gfas` becomes the row order of the exported CSV.
gfas = sorted(
    gfa
    for gfa in Path(PATH_GFA_PANGEBLOCKS).rglob("*.gfa")
    if "unchop" in str(gfa)
)

# one record per (GFA, input sequence): is the sequence spelt by some GFA path?
seqs_in_gfa_pangeblocks = []
for path_gfa in gfas:

    # the GFA stem is used to find the FASTA of the same gene
    # (assumes the stem is the gene name, as for the other tools - TODO confirm)
    name_seq = path_gfa.stem
    nodes, edges, paths = load_gfa(path_gfa)

    # FASTA with the input sequences of the same gene
    candidates_seq = [seq for seq in paths_seqs if name_seq in str(seq)]
    if not candidates_seq:
        raise FileNotFoundError(f"no input FASTA matches GFA {path_gfa}")
    path_seq = candidates_seq[0]
    input_seqs = {
        record.id: str(record.seq).upper()
        for record in SeqIO.parse(path_seq, "fasta")
    }

    # a sequence counts as spelt if it equals the label of any GFA path
    for seqid, label in input_seqs.items():
        in_gfa = label in paths.values()
        seqs_in_gfa_pangeblocks.append(
            {
                "tool": "pangeblocks",
                "seqid": seqid,
                "in_gfa": in_gfa,
                "gfa_id": name_seq,
                "path_seqs": str(path_seq),
                "path_gfa": str(path_gfa),
            }
        )

In [39]:
# Gather the results collected so far (pggb, PanPA, pangeblocks) and export
# them; the same file is written again once the vg results are available.
data = [*seqs_in_gfa_pggb, *seqs_in_gfa_panpa, *seqs_in_gfa_pangeblocks]
pd.DataFrame(data).to_csv("HLA-zoo-seqs_in_gfa.csv")

In [40]:
# pangeblocks records whose input sequence is NOT spelt by any GFA path
[record for record in seqs_in_gfa_pangeblocks if record["in_gfa"] is False]

[]

## vg

In [41]:
PATH_GFA_VG = "../HLA-zoo-vg"

# GFAs generated by vg. Sorted because directory traversal order is not
# guaranteed and the order of `gfas` becomes the row order of the exported CSV.
gfas = sorted(Path(PATH_GFA_VG).rglob("*.gfa"))

# one record per (GFA, input sequence): is the sequence spelt by some GFA path?
seqs_in_gfa_vg = []
for path_gfa in gfas:

    # the GFA stem is used to find the FASTA of the same gene
    # (assumes the stem is the gene name, as for the other tools - TODO confirm)
    name_seq = path_gfa.stem
    nodes, edges, paths = load_gfa(path_gfa)

    # FASTA with the input sequences of the same gene
    candidates_seq = [seq for seq in paths_seqs if name_seq in str(seq)]
    if not candidates_seq:
        raise FileNotFoundError(f"no input FASTA matches GFA {path_gfa}")
    path_seq = candidates_seq[0]
    input_seqs = {
        record.id: str(record.seq).upper()
        for record in SeqIO.parse(path_seq, "fasta")
    }

    # a sequence counts as spelt if it equals the label of any GFA path
    for seqid, label in input_seqs.items():
        in_gfa = label in paths.values()
        seqs_in_gfa_vg.append(
            {
                "tool": "vg",
                "seqid": seqid,
                "in_gfa": in_gfa,
                "gfa_id": name_seq,
                "path_seqs": str(path_seq),
                "path_gfa": str(path_gfa),
            }
        )

In [42]:
# Final export: results of all four tools (pggb, PanPA, pangeblocks, vg)
data = [
    *seqs_in_gfa_pggb,
    *seqs_in_gfa_panpa,
    *seqs_in_gfa_pangeblocks,
    *seqs_in_gfa_vg,
]
pd.DataFrame(data).to_csv("HLA-zoo-seqs_in_gfa.csv")

In [44]:
# overall count of records by whether the input sequence is spelt in its GFA
(
    pd.DataFrame(data)
    .groupby("in_gfa")
    .size()
)

in_gfa
True    9817
dtype: int64