In [1]:
#!python3 -m pip install biopython

In [1]:
from Bio import GenBank
from pathlib import Path
import pandas as pd

In [2]:
data_path = "/data/hiv/data"
GENES = ['pol', 'gag-pol']#, 'env', 'gag', 'vpr', 'vif', 'tat', 'rev', 'vpu', 'nef']

In [4]:
def parse_record(record):
    obj = {}
    
    if 'HIV' in record.source:
        
#         if len(record.sequence) <= 200:
#             return None

        obj = {
            "accession": record.accession[0], 
            'length': len(record.sequence),
            'sequence': record.sequence       
        }

        #[Feature(key='source', location='1..1686'), Feature(key='gene', location='<1..>1686'), Feature(key='CDS', location='<1..>1686')]
        for feature in record.features:
            if feature.key == "source":
                # [Qualifier(key='/organism=', value='"Human immunodeficiency virus 1"'), Qualifier(key='/proviral', value=''), Qualifier(key='/mol_type=', value='"genomic DNA"'), Qualifier(key='/db_xref=', value='"taxon:11676"'), Qualifier(key='/country=', value='"Spain"')]
                for qualifier in feature.qualifiers:
                    if qualifier.key == "/country=":
                        obj["country"] = qualifier.value.replace('"','').replace("'", "")

            if feature.key == "gene":
                pass

            if feature.key == "CDS":
                gene = None
                sequence = None
                
                for qualifier in feature.qualifiers:
                    if qualifier.key == "/gene=":
                        gene = qualifier.value.replace('"','').replace("'", "").lower()

                    if qualifier.key == "/translation=":
                        sequence = qualifier.value.replace('"','').replace("'", "")
                    
                if sequence and gene and gene in GENES:
                    if gene in obj:
                        print(f"{gene} already is parsed")
                    obj[gene] = sequence    
#                     obj[f"{gene}_loc"] = feature.location.replace("<", "").replace(">", "")
    
    if len(obj.keys()) <= 4:
        # accession, length, sequence, country
        return None
    
    return obj 
    
    
def parse_file(filename):
    data = []
    with open(filename, "r") as handle:
        records = GenBank.parse(handle)
        while True:
            try:
                obj = parse_record(records.__next__())
                if obj:
                    data.append(obj)
            
            except StopIteration as e:
                break
                    
            except Exception as exc:
                print(exc)
                
    return data                

In [5]:
data = []
for filename in sorted(list(Path(data_path).glob("*.seq"))):
    print(f"Parsing {filename}")
    sub_data = parse_file(filename)
    if len(sub_data) > 0:
        data.extend(sub_data)
        sub_df = pd.DataFrame(sub_data)
        sub_df.to_csv(filename.with_suffix(".csv"))
        print("Done")
    else:
        print("No useful data")
        

df = pd.DataFrame(data)
df.to_csv(Path(data_path).joinpath("1-full.csv"), index = False)    

Parsing /data/hiv/data/gbvrl1.seq
Done
Parsing /data/hiv/data/gbvrl10.seq
pol already is parsed
Done
Parsing /data/hiv/data/gbvrl100.seq
Done
Parsing /data/hiv/data/gbvrl101.seq
No useful data
Parsing /data/hiv/data/gbvrl102.seq
Done
Parsing /data/hiv/data/gbvrl103.seq
Done
Parsing /data/hiv/data/gbvrl104.seq
Done
Parsing /data/hiv/data/gbvrl105.seq
No useful data
Parsing /data/hiv/data/gbvrl106.seq
Done
Parsing /data/hiv/data/gbvrl107.seq
Done
Parsing /data/hiv/data/gbvrl108.seq
No useful data
Parsing /data/hiv/data/gbvrl109.seq
Done
Parsing /data/hiv/data/gbvrl11.seq
Done
Parsing /data/hiv/data/gbvrl110.seq
No useful data
Parsing /data/hiv/data/gbvrl111.seq
Done
Parsing /data/hiv/data/gbvrl112.seq
No useful data
Parsing /data/hiv/data/gbvrl113.seq
No useful data
Parsing /data/hiv/data/gbvrl114.seq
Done
Parsing /data/hiv/data/gbvrl115.seq
No useful data
Parsing /data/hiv/data/gbvrl116.seq
No useful data
Parsing /data/hiv/data/gbvrl117.seq
No useful data
Parsing /data/hiv/data/gbvrl118

pol already is parsed
pol already is parsed
pol already is parsed
pol already is parsed
pol already is parsed
pol already is parsed
pol already is parsed
pol already is parsed
pol already is parsed
Done
Parsing /data/hiv/data/gbvrl20.seq
Done
Parsing /data/hiv/data/gbvrl200.seq
No useful data
Parsing /data/hiv/data/gbvrl201.seq
Done
Parsing /data/hiv/data/gbvrl202.seq
No useful data
Parsing /data/hiv/data/gbvrl203.seq
No useful data
Parsing /data/hiv/data/gbvrl204.seq
No useful data
Parsing /data/hiv/data/gbvrl205.seq
No useful data
Parsing /data/hiv/data/gbvrl206.seq
No useful data
Parsing /data/hiv/data/gbvrl207.seq
No useful data
Parsing /data/hiv/data/gbvrl208.seq
No useful data
Parsing /data/hiv/data/gbvrl209.seq
No useful data
Parsing /data/hiv/data/gbvrl21.seq
Done
Parsing /data/hiv/data/gbvrl210.seq
No useful data
Parsing /data/hiv/data/gbvrl211.seq
No useful data
Parsing /data/hiv/data/gbvrl212.seq
No useful data
Parsing /data/hiv/data/gbvrl213.seq
No useful data
Parsing /data

Done
Parsing /data/hiv/data/gbvrl280.seq
No useful data
Parsing /data/hiv/data/gbvrl281.seq
No useful data
Parsing /data/hiv/data/gbvrl282.seq
No useful data
Parsing /data/hiv/data/gbvrl283.seq
No useful data
Parsing /data/hiv/data/gbvrl284.seq
No useful data
Parsing /data/hiv/data/gbvrl285.seq
No useful data
Parsing /data/hiv/data/gbvrl286.seq
No useful data
Parsing /data/hiv/data/gbvrl287.seq
Done
Parsing /data/hiv/data/gbvrl288.seq
No useful data
Parsing /data/hiv/data/gbvrl289.seq
No useful data
Parsing /data/hiv/data/gbvrl29.seq
Done
Parsing /data/hiv/data/gbvrl290.seq
No useful data
Parsing /data/hiv/data/gbvrl291.seq
No useful data
Parsing /data/hiv/data/gbvrl292.seq
No useful data
Parsing /data/hiv/data/gbvrl293.seq
No useful data
Parsing /data/hiv/data/gbvrl294.seq
No useful data
Parsing /data/hiv/data/gbvrl295.seq
No useful data
Parsing /data/hiv/data/gbvrl296.seq
Done
Parsing /data/hiv/data/gbvrl297.seq
No useful data
Parsing /data/hiv/data/gbvrl3.seq
Done
Parsing /data/hiv/

Done
Parsing /data/hiv/data/gbvrl99.seq
Done


In [6]:
df

Unnamed: 0,accession,length,sequence,country,gag-pol,pol
0,AB032740,9427,GGGTCTATAATACACAAGGCTTTTTCCCTGATTGGCAAAACTACAC...,Thailand,MGARASILSGGKLDAWEKIRLRPGGRKKYRMKHLVWASRELERFAL...,
1,AB032741,9430,GGGTCTATAATACACAAGGCTTCTTTCCTGATTGGCAAAACTACAC...,Thailand,MGAKASVLSGGKLDAWEKIRLRPGGRKKYHLKHIVWASRELERFAL...,
2,AB052995,9720,TGGATGGGCTAATTTACTCCAAGAAAAGACAAGAGATCCTTGACTT...,Japan,MGARASVLSGGKLDAWEKIRLRPGGKKKYRMKHLVWASRELERFAL...,
3,AB070352,9731,TGGATGGGCTAATTTACTCCAAGAGATGACAAGAGATCCTTGACTT...,Japan,MGARASVLSGGKLDAWEKIRLRPGGKKKYQLKHVVWASRELERFAL...,
4,AB070353,9720,TGGATGGGCTAATTTACTCCAAGAAAAGACAAGAGATCCTTGACTT...,Japan,MGARASVLSGGKLDAWEKIRLRPGGKKKYRMKHLVWASRELERFAL...,
...,...,...,...,...,...,...
464141,MZ468890,1302,CCTCAAATCACTCTTTGGCAACGACCCCTCGTCACAGTAAAGATAG...,Poland,,PQITLWQRPLVTVKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...
464142,MZ468891,1302,CCTCAAATCACTCTTTGGCAACGACCCATCGTCACAGTAAGGATAG...,Poland,,PQITLWQRPIVTVRIEGQLKDALLDTGADDTVLEDMTLPGRWKPKM...
464143,MZ468892,1302,CCTCAGATCACTCTTTGGCAACGACCCCTCGTCACAATAAAGATAG...,Poland,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...
464144,MZ468893,1302,CCTCAGATCACTCTTTGGCAACGAYCCATCGTCACAGTAARGGTAG...,Poland,,PQITLWQRXIVTVXVGGQLKEALLDTGADDTVLEDMNLQGKWKPKM...


In [3]:
# df = pd.read_csv(Path(data_path).joinpath("1-full.csv"))

  df = pd.read_csv(Path(data_path).joinpath("1-full.csv"))


In [8]:
# df = df[df['pol'].str.contains("PISPIET")]

In [4]:
df

Unnamed: 0,accession,length,sequence,country,gag-pol,pol
0,AB032740,9427,GGGTCTATAATACACAAGGCTTTTTCCCTGATTGGCAAAACTACAC...,Thailand,MGARASILSGGKLDAWEKIRLRPGGRKKYRMKHLVWASRELERFAL...,
1,AB032741,9430,GGGTCTATAATACACAAGGCTTCTTTCCTGATTGGCAAAACTACAC...,Thailand,MGAKASVLSGGKLDAWEKIRLRPGGRKKYHLKHIVWASRELERFAL...,
2,AB052995,9720,TGGATGGGCTAATTTACTCCAAGAAAAGACAAGAGATCCTTGACTT...,Japan,MGARASVLSGGKLDAWEKIRLRPGGKKKYRMKHLVWASRELERFAL...,
3,AB070352,9731,TGGATGGGCTAATTTACTCCAAGAGATGACAAGAGATCCTTGACTT...,Japan,MGARASVLSGGKLDAWEKIRLRPGGKKKYQLKHVVWASRELERFAL...,
4,AB070353,9720,TGGATGGGCTAATTTACTCCAAGAAAAGACAAGAGATCCTTGACTT...,Japan,MGARASVLSGGKLDAWEKIRLRPGGKKKYRMKHLVWASRELERFAL...,
...,...,...,...,...,...,...
464141,MZ468890,1302,CCTCAAATCACTCTTTGGCAACGACCCCTCGTCACAGTAAAGATAG...,Poland,,PQITLWQRPLVTVKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...
464142,MZ468891,1302,CCTCAAATCACTCTTTGGCAACGACCCATCGTCACAGTAAGGATAG...,Poland,,PQITLWQRPIVTVRIEGQLKDALLDTGADDTVLEDMTLPGRWKPKM...
464143,MZ468892,1302,CCTCAGATCACTCTTTGGCAACGACCCCTCGTCACAATAAAGATAG...,Poland,,PQITLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMNLPGRWKPKM...
464144,MZ468893,1302,CCTCAGATCACTCTTTGGCAACGAYCCATCGTCACAGTAARGGTAG...,Poland,,PQITLWQRXIVTVXVGGQLKEALLDTGADDTVLEDMNLQGKWKPKM...


In [11]:
gag_pol = df[["accession", gene]].dropna()

In [13]:
# gag_pol.hist()

In [7]:
for gene in GENES:
    gene_df = df[["accession", gene]]
    # drop duplicates
    gene_df = gene_df[~gene_df.duplicated([gene], keep='first')]
    gene_df.to_csv(Path(data_path).joinpath(f"1-{gene}.csv"), index=False)   
    
    with open(Path(data_path).joinpath(f"1-{gene}.fasta"), "w") as fasta_file:
        for _, row in gene_df.iterrows():
            fasta_file.write(f">{row['accession']}\n")
            fasta_file.write(f"{row[gene]}\n")
    

In [None]:
!jupyter nbconvert --to script 01-build-dataset.ipynb --no-prompt