## Check if sequences are spell in the GFAs

In [1]:
import sys
from pathlib import Path

# add parent folder to the path
module_path = str(Path.cwd().parents[0])
if module_path not in sys.path:
    sys.path.append(module_path)

import pandas as pd
from Bio import SeqIO
from utils.load_gfa import load_gfa


# HLA-zoo

In [2]:
PATH_SEQS = "/home/avila/HLA-zoo/seqs"
paths_seqs = list(Path(PATH_SEQS).glob("*.fa"))
names_seqs = [seq.stem for seq in paths_seqs]
names_seqs.sort()

### pggb

In [3]:
PATH_GFA_PGGB = "../HLA-zoo-pggb/"

# paths to gfa generated by pggb
gfas = list(Path(PATH_GFA_PGGB).glob("*smooth.fix.gfa"))
seqs_in_gfa_pggb = []

for name_seq in names_seqs:
    
    # load gfa info
    path_gfa= [p for p in gfas if name_seq in str(p)][0]
    nodes, edges, paths = load_gfa(path_gfa)

    # load sequences info
    path_seq = [seq for seq in paths_seqs if name_seq in str(seq)][0]
    input_seqs = {record.id: str(record.seq).upper() for record in  SeqIO.parse(path_seq, "fasta")}
    
    # check if sequences are spelt in any of the gfa paths
    for seqid, label in input_seqs.items():
        label_gfa = paths[seqid]
        # print(seqid, label in paths.values())
        in_gfa=label in paths.values()
        seqs_in_gfa_pggb.append(
            {
                "tool": "pggb",
                "seqid": seqid, 
                "in_gfa": in_gfa, 
                "name_seq": name_seq,
                "path_seqs": str(path_seq), 
                "path_gfa": str(path_gfa)
            }
        )

In [4]:
paths

{'gi|568815592:29791752-29792749': 'TCTAGAAGAGTCCACGGGGACAGGTAAGGAGTAGGAGGCAGGGAGTCCAGTTCTGGGACGGGGATTCCGTGATGCAAAGTGAAGAGAGAGGGACGGGGCCCATTCCGAGGGTTTCTCCCTGGTTTCTCAGACAGCTCCTGGGCCAAGACTCAGGGAAACATTGAGACAGAGCGCTTGGCACAGAAGTAGCGGGGTCAGGGCGAAGTCCCAGGGCCTCAGGCGTGGCTCTCAGGATCTCAGGCCCCAAAGGCGGTGTATGGATTGGGGAGGCCCAGCGCTGGGCATTCCCCATCTTTGCAGGGTTTCTCTTCTCCCTCTCCCAACCTGTGTCGGGTCCTTCTTCCTGGGTACTCACCGGGCTGCCCCAGTTCTCACTCCCATTGAGTGTCGGGTTTCTAGAGAAGCCAATCAATGTAGCCGCGGTCCCGGTTCTAAAGTTCCCACGCACCCACCGGGACTCCGATTCTTCCCAGTCGCCGAGGATGGTGTCATGGCGCCCCGAACCCTGCTTCTGCTGCTCTCGGGGGCCCTGGTCCTGACCCAGACCTGGGCAGGTGAGTGCGGGGTCGGGAGGGAAACGGCGTCTGTGGGGAGTAGCTAGGGGCCTGCCCGGCGGGGGCGCAGGAACCCGGTTGCGGTGCCGGGAGGAGGGTCGGGAGGGTCTCAGCCCCCTCCTTGCTCCCAGGCTTCCACTCCTTGAGGTATTTCCACACCACCATGTCCCGGCCCGGCCGCGCGGATCCCCGCTTCCTCTCCGTGGGCGACGTGGACGACACGCAGTGCGTGCGGCTCGACAGCGACGCCACGAGTCCCAGGATGGAGCCGCGGGCGCCGTGGATGGAGCAGGAGGGGCCGGAATATTGGGAAGAGGAGACAGGGACCGCCAAGGCCAAAGCACAGTTTTACCGAGTGAACCTGCGGACCCTGAGCGGCTACTACAACCAGAGTGAGGCCTGTGAGTGAC

In [5]:
pd.DataFrame(seqs_in_gfa_pggb).groupby(["path_gfa","in_gfa"]).size()

path_gfa                                                                  in_gfa
../HLA-zoo-pggb/A-3105.fa.353ea42.34ee7b1.1576367.smooth.fix.gfa          True      11
../HLA-zoo-pggb/B-3106.fa.12f909e.34ee7b1.b34f00e.smooth.fix.gfa          True       9
../HLA-zoo-pggb/C-3107.fa.51de401.34ee7b1.3dbd3cb.smooth.fix.gfa          True      10
../HLA-zoo-pggb/DMA-3108.fa.353ea42.34ee7b1.1576367.smooth.fix.gfa        True      11
../HLA-zoo-pggb/DMB-3109.fa.51de401.34ee7b1.3dbd3cb.smooth.fix.gfa        True      10
../HLA-zoo-pggb/DOA-3111.fa.51de401.34ee7b1.3dbd3cb.smooth.fix.gfa        True      10
../HLA-zoo-pggb/DOB-3112.fa.51de401.34ee7b1.3dbd3cb.smooth.fix.gfa        True      10
../HLA-zoo-pggb/DPA1-3113.fa.353ea42.34ee7b1.1576367.smooth.fix.gfa       True      11
../HLA-zoo-pggb/DPB1-3115.fa.353ea42.34ee7b1.1576367.smooth.fix.gfa       True      11
../HLA-zoo-pggb/DQA1-3117.fa.51de401.34ee7b1.3dbd3cb.smooth.fix.gfa       True      10
../HLA-zoo-pggb/DQB1-3119.fa.51de401.34ee7b1.3dbd

## PanPA

In [6]:
PATH_GFA_PANPA = "../HLA-zoo-PanPA"

# paths to gfa generated by PanPA
gfas= list(Path(PATH_GFA_PANPA).rglob("*.gfa"))

seqs_in_gfa_panpa=[]
for path_gfa in gfas:
# for name_seq in names_seqs:
    
    # load gfa info
    # path_gfa= [p for p in gfas if name_seq in str(p)][0]
    name_seq = path_gfa.stem
    nodes, edges, paths = load_gfa(path_gfa)

    # load sequences info
    path_seq = [seq for seq in paths_seqs if name_seq in str(seq)][0]
    input_seqs = {record.id: str(record.seq).upper() for record in  SeqIO.parse(path_seq, "fasta")}
    
    # check if sequences are spelt in any of the gfa paths
    for seqid, label in input_seqs.items():
        # label_gfa = paths[seqid]
        # print(seqid, label in paths.values())
        in_gfa=label in paths.values()
        seqs_in_gfa_panpa.append(
            {
                "tool": "PanPA",
                "seqid": seqid, 
                "in_gfa": in_gfa, 
                "name_seq": name_seq,
                "path_seqs": str(path_seq), 
                "path_gfa": str(path_gfa)
            }
        )

In [11]:
pd.DataFrame(seqs_in_gfa_panpa).groupby(["path_gfa","in_gfa"]).size()

path_gfa                                                          in_gfa
../HLA-zoo-PanPA/output-HLA-zoo-mafft.op1.53-ep0/A-3105.gfa       True      11
../HLA-zoo-PanPA/output-HLA-zoo-mafft.op1.53-ep0/B-3106.gfa       False      1
                                                                  True       8
../HLA-zoo-PanPA/output-HLA-zoo-mafft.op1.53-ep0/C-3107.gfa       True      10
../HLA-zoo-PanPA/output-HLA-zoo-mafft.op1.53-ep0/DMA-3108.gfa     True      11
                                                                            ..
../HLA-zoo-PanPA/output-HLA-zoo-mafft.op5-ep0/MICA-100507436.gfa  True       8
../HLA-zoo-PanPA/output-HLA-zoo-mafft.op5-ep0/MICB-4277.gfa       True      11
../HLA-zoo-PanPA/output-HLA-zoo-mafft.op5-ep0/TAP1-6890.gfa       True      11
../HLA-zoo-PanPA/output-HLA-zoo-mafft.op5-ep0/TAP2-6891.gfa       True      11
../HLA-zoo-PanPA/output-HLA-zoo-mafft.op5-ep0/V-352962.gfa        True      10
Length: 86, dtype: int64