Remove sequences from the HLA-zoo dataset that are labeled as reverse complements by pggb.

The files and the IDs of the sequences that must be removed from each of them are listed in `HLA-zoo.to_remove.txt`.

In [65]:
from pathlib import Path
from collections import defaultdict
from Bio import SeqIO
from shutil import copyfile

In [66]:
# Destination folder for the filtered dataset; created (with parents) if missing.
OUTDIR = Path("/data/HLA-zoo/no-reverse-complement")
OUTDIR.mkdir(parents=True, exist_ok=True)

# Every FASTA file (*.fa) found in the original HLA-zoo sequence folder.
PATH_SEQS = [*Path("/home/avila/HLA-zoo/seqs").glob("*.fa")]

In [67]:
# Build the mapping {FASTA id: [sequence ids to remove]} from the removal list.
# Each non-empty line must hold exactly three comma-separated fields, unpacked
# below as: path_gfa_pggb (not used afterwards), id_fasta, id_to_remove.
# id_fasta is the key later looked up with the stem of each FASTA file.
to_remove = defaultdict(list)
with open("../HLA-zoo.to_remove.txt") as fp:
    # Iterate the file object directly; no need to load all lines up front.
    for line in fp:
        # Drop the newline and any surrounding whitespace, which would
        # otherwise stay attached to the last field and prevent ID matches.
        line = line.strip()
        if not line:
            # A blank line (e.g. a trailing one) would make the 3-way unpack
            # below raise ValueError, so skip it.
            continue
        path_gfa_pggb, id_fasta, id_to_remove = line.split(",")
        to_remove[id_fasta].append(id_to_remove)

In [68]:
# Display the mapping: key = FASTA id, value = list of sequence ids to remove.
to_remove

defaultdict(list,
            {'B-3106': ['gi|299782605:5000-8340'],
             'C-3107': ['gi|342187247:4995-8382'],
             'DMA-3108': ['gi|236459249:5000-9508'],
             'DOA-3111': ['gi|236459287:5000-10430'],
             'DOB-3112': ['gi|236459349:5000-9285',
              'gi|530354716:26722-31007'],
             'DPA1-3113': ['gi|501355759:5000-21209'],
             'DQB1-3119': ['gi|345525393:5000-12600'],
             'DRA-3122': ['gi|568815572:9686-14890'],
             'DRB1-3123': ['gi|345525392:5000-18402'],
             'TAP1-6890': ['gi|226246635:5000-13762'],
             'TAP2-6891': ['gi|530354716:4947-21937']})

In [69]:
# Write one output file per input FASTA into OUTDIR: files with listed IDs are
# rewritten without those records, all other files are copied as they are.
for fasta in PATH_SEQS:
    # Same stem as the input, always with a ".fa" extension.
    target = OUTDIR / (fasta.stem + ".fa")
    unwanted_ids = to_remove.get(fasta.stem, [])

    if not unwanted_ids:
        # Nothing listed for this file: copy it to the new folder unchanged.
        copyfile(fasta, target)
        continue

    # Show which IDs are dropped, then keep only the remaining records.
    print(unwanted_ids)
    kept_records = [
        record
        for record in SeqIO.parse(fasta, "fasta")
        if record.id not in unwanted_ids
    ]
    SeqIO.write(kept_records, target, "fasta")

['gi|530354716:4947-21937']
['gi|299782605:5000-8340']
['gi|342187247:4995-8382']
['gi|236459349:5000-9285', 'gi|530354716:26722-31007']
['gi|345525393:5000-12600']
['gi|501355759:5000-21209']
['gi|345525392:5000-18402']
['gi|226246635:5000-13762']
['gi|236459249:5000-9508']
['gi|236459287:5000-10430']
['gi|568815572:9686-14890']
