In [2]:
from Bio import GenBank
from pathlib import Path
import pandas as pd

In [3]:
# Directory that holds the GenBank *.seq input files; every CSV/FASTA output
# produced by this notebook is written into the same directory.
data_path = "/data/hiv/data"
# Lower-case gene names whose CDS translations are extracted by parse_record.
# Only 'pol' is active; the remaining names are left commented out.
GENES = ['pol']#, 'env', 'gag', 'vpr', 'vif', 'tat', 'rev', 'vpu', 'nef']

In [None]:
def _strip_quotes(value):
    """Return ``value`` with every single and double quote character removed."""
    return value.replace('"', '').replace("'", "")


def parse_record(record, genes=None):
    """Flatten one parsed GenBank record into a plain dict.

    Parameters
    ----------
    record
        Record object as yielded by ``GenBank.parse``. Only its ``source``,
        ``accession``, ``sequence`` and ``features`` attributes are read.
    genes
        Lower-case gene names whose CDS translation should be kept.
        Defaults to the module-level ``GENES`` list.

    Returns
    -------
    dict or None
        ``None`` when ``record.source`` does not contain ``"HIV"`` or when no
        CDS feature carries a translation for one of ``genes``. Otherwise a
        dict with ``accession`` (first accession of the record), ``length``,
        ``sequence``, ``country`` (only if the source feature has a
        ``/country=`` qualifier) and one ``<gene>`` key per matching CDS.

    Notes
    -----
    The previous implementation decided "no gene found" by counting dict keys
    (``<= 4``). That also rejected records that had a wanted gene but no
    ``/country=`` qualifier. Gene presence is now tracked explicitly, so such
    records are returned without a ``country`` key.
    """
    if genes is None:
        genes = GENES

    if 'HIV' not in record.source:
        return None

    obj = {
        "accession": record.accession[0],
        'length': len(record.sequence),
        'sequence': record.sequence,
    }
    found_gene = False

    # Example feature list:
    # [Feature(key='source', location='1..1686'), Feature(key='gene', location='<1..>1686'), Feature(key='CDS', location='<1..>1686')]
    for feature in record.features:
        if feature.key == "source":
            # Example qualifiers:
            # [Qualifier(key='/organism=', value='"Human immunodeficiency virus 1"'), ..., Qualifier(key='/country=', value='"Spain"')]
            for qualifier in feature.qualifiers:
                if qualifier.key == "/country=":
                    obj["country"] = _strip_quotes(qualifier.value)

        elif feature.key == "CDS":
            gene = None
            protein = None

            for qualifier in feature.qualifiers:
                if qualifier.key == "/gene=":
                    gene = _strip_quotes(qualifier.value).lower()
                elif qualifier.key == "/translation=":
                    protein = _strip_quotes(qualifier.value)

            # Keep the translation only for a named gene that was asked for.
            if protein and gene and gene in genes:
                obj[gene] = protein
                found_gene = True

    # A record without any wanted gene carries nothing useful downstream.
    return obj if found_gene else None
    
    
def parse_file(filename):
    """Parse every GenBank record in ``filename`` and collect the usable ones.

    Records are pulled from ``GenBank.parse`` one at a time. Any exception
    raised while fetching or converting a record is printed and the loop goes
    on to the next pull; ``StopIteration`` ends the loop.

    Returns the list of truthy results of ``parse_record``, in file order.
    """
    parsed = []
    with open(filename, "r") as handle:
        records = GenBank.parse(handle)
        while True:
            try:
                record = next(records)
                obj = parse_record(record)
            except StopIteration:
                break
            except Exception as exc:
                # Best effort: report the problem and keep going.
                print(exc)
                continue

            if obj:
                parsed.append(obj)

    return parsed

In [None]:
# Parse every *.seq file under data_path in sorted order. Each file that yields
# records gets its own CSV next to it, and all records together are written
# to 1-full.csv.
data = []
for filename in sorted(Path(data_path).glob("*.seq")):
    print(f"Parsing {filename}")
    sub_data = parse_file(filename)
    if not sub_data:
        print("No useful data")
        continue

    data.extend(sub_data)
    sub_df = pd.DataFrame(sub_data)
    sub_df.to_csv(filename.with_suffix(".csv"))
    print("Done")

df = pd.DataFrame(data)
df.to_csv(Path(data_path) / "1-full.csv", index=False)

In [None]:
# Keep the first occurrence of every (sequence, pol) pair and drop later repeats.
# NOTE(review): the following cell re-reads 1-full.csv, which was saved before
# this step, so in a top-to-bottom run this de-duplicated frame is overwritten.
# Confirm the intended cell order.
df = df.drop_duplicates(subset=['sequence', 'pol'], keep='first')

In [4]:
# Load the combined table that was saved as 1-full.csv under data_path.
df = pd.read_csv(Path(data_path) / "1-full.csv")

In [5]:
# Show the frame that was just read back from 1-full.csv.
df

Unnamed: 0,accession,length,sequence,country,pol
0,AB074263,205,CCACAAGGGAAGGCCAGGGAACTTCCTTCAGAGCAGACCAGAACCA...,Japan,PQGKARELPSEQTRTNSPTSTANSPTGRELQVWGRDNNSLSEAGDD...
1,AB074264,193,CCACAAGGGAAGGCCAGGGAATTTTCTTCAGTGCAGACCAGAGCCA...,Japan,PQGKAREFSSVQTRANSPDSPARGELQVWGRDSNSLSEAGTDRQGS...
2,AB074265,190,CAACAAGGGGAGGCCAGGGAATTTTTTTCAGAGCAGACCAGAGCCA...,Japan,QQGEAREFFSEQTRANSPAPSGRELQVWGRDNNSPSEAGADRQGTV...
3,AB074266,193,CCACAAGGGAAGGCCAGGGAATTTTCTCCAGAGCAGGCCAGAGCCA...,Japan,PQGKAREFSPEQARAISSTSPTRRELQVWGGDNNSLSEAGADGQGT...
4,AB074267,202,CCACAAGGGGAGGCCAGGGAATTTTCTTCAGAGCAGACCAGAGCCA...,Japan,PQGEAREFSSEQTRANSPTRANSPTRGELQVWGRDNSSISEAGADR...
...,...,...,...,...,...
462010,MZ468890,1302,CCTCAAATCACTCTTTGGCAACGACCCCTCGTCACAGTAAAGATAG...,Poland,PQITLWQRPLVTVKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...
462011,MZ468891,1302,CCTCAAATCACTCTTTGGCAACGACCCATCGTCACAGTAAGGATAG...,Poland,PQITLWQRPIVTVRIEGQLKDALLDTGADDTVLEDMTLPGRWKPKM...
462012,MZ468892,1302,CCTCAGATCACTCTTTGGCAACGACCCCTCGTCACAATAAAGATAG...,Poland,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...
462013,MZ468893,1302,CCTCAGATCACTCTTTGGCAACGAYCCATCGTCACAGTAARGGTAG...,Poland,PQITLWQRXIVTVXVGGQLKEALLDTGADDTVLEDMNLQGKWKPKM...


In [24]:
# bad_letters = ["J", "X"]
# df = df[~df['pol'].str.contains("|".join(bad_letters))]

In [6]:
# Keep only rows whose pol translation contains the motif "PISPIET".
# regex=False matches the motif as plain text rather than as a pattern.
# na=False counts a missing pol value as "no match", so the mask stays purely
# boolean and can always be used for indexing.
df = df[df['pol'].str.contains("PISPIET", regex=False, na=False)]

In [7]:
# Show the rows left after keeping only pol translations that contain "PISPIET".
df

Unnamed: 0,accession,length,sequence,country,pol
42,AB098332,9629,TGGATGGGTTAATTTACTCCAGGAAAAGACAAGAAATCCTTGATCT...,Uganda,FFRENLAFQQGEARKFPSEQTGANSPTSRDLWNGGRDSLPSEAGAE...
43,AB098333,9635,TGGATGGGTTAATTTACTCCAGGAAAAGACAAGAAATCCTTGATCT...,Uganda,FFRENLAFQQGEARKFPSEQTGANSPTSRDLWNGGRDSLPSEAGAE...
88,AB221125,9664,TGGATGGGTTAATTTACTCCCAGAAAAGAAAAGATATCCTTGATCT...,Japan,FFRENLAFQQGEAREFPSEQTRANSPTRRELQVWGEDNNSLSEAGA...
89,AB221126,9664,TGGAAGGGTTAATTTACTCCCAGAAAAGAAAAGATATCCTTGATCT...,Japan,FFRENLAFQQGEAREFPSEQTRANSPTRRELQVWGEDNNSLSEAGA...
90,AB231893,9724,TGGATGGGCTAATTTACTCCAAGAAAAGACAAGAGATCCTTGATCT...,Ghana:Accura,FFRENLAFQQGEAREFSSEQARAISSTSRELRVREGDSPLPEAGAE...
...,...,...,...,...,...
462010,MZ468890,1302,CCTCAAATCACTCTTTGGCAACGACCCCTCGTCACAGTAAAGATAG...,Poland,PQITLWQRPLVTVKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...
462011,MZ468891,1302,CCTCAAATCACTCTTTGGCAACGACCCATCGTCACAGTAAGGATAG...,Poland,PQITLWQRPIVTVRIEGQLKDALLDTGADDTVLEDMTLPGRWKPKM...
462012,MZ468892,1302,CCTCAGATCACTCTTTGGCAACGACCCCTCGTCACAATAAAGATAG...,Poland,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...
462013,MZ468893,1302,CCTCAGATCACTCTTTGGCAACGAYCCATCGTCACAGTAARGGTAG...,Poland,PQITLWQRXIVTVXVGGQLKEALLDTGADDTVLEDMNLQGKWKPKM...


In [8]:
# Export every selected gene twice: as a two-column CSV (accession, translation)
# and as a FASTA file whose headers are the accessions.
for gene in GENES:
    # Rows with no translation for this gene would end up in the FASTA file as
    # the literal text "nan", so they are dropped from both exports.
    sub_df = df[["accession", gene]].dropna(subset=[gene])
    sub_df.to_csv(Path(data_path).joinpath(f"1-{gene}-PISPIET.csv"), index=False)

    with open(Path(data_path).joinpath(f"1-{gene}-PISPIET.fasta"), "w") as fasta_file:
        # Zipping the two columns avoids building a Series per row as iterrows() does.
        fasta_file.writelines(
            f">{accession}\n{protein}\n"
            for accession, protein in zip(sub_df["accession"], sub_df[gene])
        )
    

In [5]:
!jupyter nbconvert --to script 01-build-dataset.ipynb --no-prompt

[NbConvertApp] Converting notebook 01-build-dataset.ipynb to script
[NbConvertApp] Writing 3533 bytes to 01-build-dataset.py
