In [1]:
#!python3 -m pip install biopython

In [1]:
from Bio import GenBank
from pathlib import Path
import pandas as pd

In [2]:
# Directory that is globbed for GenBank flat files (*.seq) further down;
# the combined CSV is written into this same directory.
data_path = "/data/hiv/data"

# Gene names of interest (not referenced by the cells visible in this notebook).
# Currently disabled entries: 'env', 'gag', 'vpr', 'vif', 'tat', 'rev', 'vpu', 'nef'
GENES = [
    "pol",
    "gag-pol",
]

In [26]:
def _strip_quotes(value):
    """Return ``value`` with every single and double quote character removed."""
    return value.replace('"', '').replace("'", "")


def parse_record(record):
    """Extract integrase protein translations from one GenBank record.

    Only records whose ``source`` text contains ``'HIV'`` are considered, and
    only CDS features whose ``/product=`` qualifier is exactly ``integrase``
    (after quote stripping) and that carry a non-empty ``/translation=``
    contribute to the result.

    Parameters
    ----------
    record
        Object exposing ``source``, ``accession`` (indexable), ``sequence``
        and ``features``. Each feature exposes ``key`` and ``qualifiers``;
        each qualifier exposes ``key`` and ``value``.

    Returns
    -------
    dict or None
        ``None`` when the record is not HIV or has no integrase translation.
        Otherwise a dict with ``accession``, ``length``, ``sequence``,
        optionally ``country``, and one entry per integrase CDS keyed by the
        lower-cased ``/gene=`` value (``"integrase"`` when the CDS has no
        ``/gene=`` qualifier).

    Raises
    ------
    IndexError
        If ``record.accession`` is empty.
    """
    if 'HIV' not in record.source:
        return None

    obj = {
        "accession": record.accession[0],
        'length': len(record.sequence),
        'sequence': record.sequence,
    }
    # Explicit flag instead of counting dict keys: the key count depends on
    # whether the optional country is present, which used to discard
    # integrase records that had no /country= qualifier.
    found_integrase = False

    # Example feature list:
    # [Feature(key='source', location='1..1686'), Feature(key='gene', location='<1..>1686'), Feature(key='CDS', location='<1..>1686')]
    for feature in record.features:
        if feature.key == "source":
            # Example qualifier: Qualifier(key='/country=', value='"Spain"')
            for qualifier in feature.qualifiers:
                if qualifier.key == "/country=":
                    obj["country"] = _strip_quotes(qualifier.value)

        elif feature.key == "CDS":
            gene = None
            sequence = None
            integrase = False

            for qualifier in feature.qualifiers:
                if qualifier.key == "/gene=":
                    gene = _strip_quotes(qualifier.value).lower()
                elif qualifier.key == "/translation=":
                    sequence = _strip_quotes(qualifier.value)
                elif qualifier.key == "/product=":
                    if _strip_quotes(qualifier.value) == "integrase":
                        integrase = True

            if sequence and integrase:
                # A CDS without /gene= would otherwise be stored under the
                # key None; fall back to the product name instead.
                key = gene if gene is not None else "integrase"
                obj[key] = sequence
                found_integrase = True

    if not found_integrase:
        return None

    return obj
    
    
def parse_file(filename):
    """Collect the useful records of one GenBank file.

    Opens ``filename`` for reading, walks the iterator returned by
    ``GenBank.parse`` and passes each record to ``parse_record``. Truthy
    results are collected; falsy results (e.g. ``None``) are skipped.

    Any exception other than ``StopIteration`` raised while fetching or
    parsing a record is printed and the loop carries on with the next
    record (best effort).

    Returns
    -------
    list
        The truthy ``parse_record`` results, in file order.
    """
    kept = []
    with open(filename, "r") as handle:
        records = GenBank.parse(handle)
        exhausted = False
        while not exhausted:
            try:
                parsed = parse_record(next(records))
            except StopIteration:
                # No more records in this file.
                exhausted = True
            except Exception as exc:
                # Report the problem and move on to the next record.
                print(exc)
            else:
                if parsed:
                    kept.append(parsed)

    return kept

In [27]:
# Parse every *.seq file under data_path. Each file with at least one useful
# record gets a sibling .csv (written with the default index column); all
# records are also accumulated for one combined CSV.
data = []
for filename in sorted(Path(data_path).glob("*.seq")):
    print(f"Parsing {filename}")
    sub_data = parse_file(filename)
    if not sub_data:
        print("No useful data")
        continue

    data += sub_data
    sub_df = pd.DataFrame(sub_data)
    sub_df.to_csv(filename.with_suffix(".csv"))
    print("Done")

# Combined table across all parsed files, written without the index column.
df = pd.DataFrame(data)
df.to_csv(Path(data_path) / "2-integrase.csv", index=False)

Parsing /data/hiv/data/gbvrl1.seq
Done


In [28]:
# Display the combined table that was just written to 2-integrase.csv.
df

Unnamed: 0,accession,length,sequence,country,pol
0,AB716142,864,TTTTTAGATGGGATAGAKAAGGCTCAAGAAGAACATGAAAGATATC...,Iran:Tehran,FLDGIXKAQEEHERYHSNWRAMASDFNLPPIVAKEIVANCDKCQLK...
1,AB716143,864,TTCTTAGATGGAATAGATAAGGCTCAAGAAGAACATGAAAGATATC...,Iran:Tehran,FLDGIDKAQEEHERYHSNWRAMASDFNLPPVVAKQIVANCDKCQLK...
2,AB716144,864,TTTCTAGATGGAATAGATAAGGCTCAAGAAGACCATGAAAAATATC...,Iran:Tehran,FLDGIDKAQEDHEKYHSNWRAMASDFNLPPIVAKEIVANCDKCQLK...
3,AB716145,864,TTTTTAGATGGAATAGATAAGGCTCAAGAAGAACATGAAAGATATC...,Iran:Tehran,FLDGIDKAQEEHERYHSNWRAMASDFNLPPVIAKEIVASCNKCQJK...
4,AB716146,864,TTTTTAGATGGAATAGATAAGGCTCAAGAAGAACATGAAAAATATC...,Iran:Tehran,FLDGIDKAQEEHEKYHNNWRAMASDFNLPPVVAKQIVANCDKCQLK...
...,...,...,...,...,...
2393,AF120201,288,AATTCCAGCAGAAACAGGACAGGAGACAGCATACTTTATACTAAAG...,Republic of the Congo,IPAETGQETAYFILKLAGRWPVKVIHTDNGSNFTSAAVKAACWWAN...
2394,AF203329,933,CCGCGGGAATCAGGAAAGTACTATTTTTAGATGGAATAGATAAGGC...,USA,AGIRKVLFLDGIDKAQEDHERYHSNWRAMASDFNIPAVVAKEIVAS...
2395,AF203330,933,CCGCGGGAATCAGGAAAGTACTATTTTTAGATGGAATAGATAAGGC...,USA,AGIRKVLFLDGIDKAQEDHEKYHTNWRAMASDFNLPPVVAKEIMAS...
2396,AF203331,933,CCGCGGGAATCAGGAAAGTACTGTTTTTAGATGGAATAGATAAGGC...,USA,AGIRKVLFLDGIDKAQEEHEKYHSNWRAMASDFNLPPVVAKEIVAS...


In [None]:
# Export 01-build-dataset.ipynb (presumably this notebook - confirm) as a script,
# omitting the In/Out prompts.
!jupyter nbconvert --to script 01-build-dataset.ipynb --no-prompt