In [1]:
#!python3 -m pip install biopython

In [2]:
from Bio import GenBank
from pathlib import Path
import pandas as pd

In [3]:
# Directory that holds the GenBank `*.seq` flat files scanned later in this
# notebook; the de-duplicated integrase CSV/FASTA exports are written to the
# same directory.
data_path = "/data/hiv/data"

In [4]:
# Load the whole reference file as one string. The path is relative, so it is
# resolved against the notebook's current working directory.
reference_seq = Path("reference.seq").read_text()

In [5]:
def _strip_quotes(text):
    """Return ``text`` with every double and single quote character removed."""
    return text.replace('"', '').replace("'", "")


def parse_record(record):
    """Turn one GenBank record into a flat dict, keeping only HIV records.

    Args:
        record: An object exposing ``source`` (str), ``accession`` (sequence
            of str), ``sequence`` (str) and ``features``; each feature has a
            ``key`` and a list of ``qualifiers`` with ``key``/``value``
            attributes (the shape of the records produced by
            ``GenBank.parse`` in ``parse_file``).

    Returns:
        ``{}`` when ``'HIV'`` does not occur in ``record.source``. Otherwise a
        dict with ``accession`` (first accession), ``length`` and
        ``sequence``, plus:

        * ``country`` - value of the ``/country=`` qualifier of a ``source``
          feature, quotes removed (present only if such a qualifier exists);
        * ``gene`` - the ``/translation=`` value of a ``CDS`` feature whose
          ``/product=`` is exactly ``integrase`` (present only if found and
          non-empty).

        When several features qualify, the last one visited wins.
    """
    if 'HIV' not in record.source:
        return {}

    parsed = {
        "accession": record.accession[0],
        'length': len(record.sequence),
        'sequence': record.sequence,
    }

    # Example feature list:
    #   [Feature(key='source', location='1..1686'),
    #    Feature(key='gene',   location='<1..>1686'),
    #    Feature(key='CDS',    location='<1..>1686')]
    for feat in record.features:
        if feat.key == "source":
            # Example qualifiers:
            #   Qualifier(key='/organism=', value='"Human immunodeficiency virus 1"')
            #   Qualifier(key='/country=',  value='"Spain"')
            for qual in feat.qualifiers:
                if qual.key == "/country=":
                    parsed["country"] = _strip_quotes(qual.value)

        elif feat.key == "CDS":
            translation = None
            is_integrase = False

            for qual in feat.qualifiers:
                if qual.key == "/translation=":
                    translation = _strip_quotes(qual.value)
                elif qual.key == "/product=":
                    if _strip_quotes(qual.value) == "integrase":
                        is_integrase = True

            # Only keep the protein sequence of CDS features annotated as integrase.
            if translation and is_integrase:
                parsed['gene'] = translation

    return parsed
    
    
def parse_file(filename):
    """Parse every record of one GenBank flat file with ``parse_record``.

    Args:
        filename: Path of the file to open in text mode and hand to
            ``GenBank.parse``.

    Returns:
        A list with the non-empty dicts returned by ``parse_record``, in file
        order.

    Any exception other than ``StopIteration`` raised while fetching or
    converting a record is printed and the loop then asks the parser for the
    next record; ``StopIteration`` ends the loop.
    """
    parsed_records = []
    with open(filename, "r") as handle:
        record_iter = GenBank.parse(handle)
        while True:
            try:
                parsed = parse_record(next(record_iter))
            except StopIteration:
                # Parser is exhausted: nothing more to read from this file.
                break
            except Exception as exc:
                # Best effort: report the problem and keep going.
                print(exc)
            else:
                if parsed:
                    parsed_records.append(parsed)

    return parsed_records

In [6]:
# Parse every `*.seq` file under `data_path` (in lexicographic path order) and
# pool the per-record dicts from all files into `data`.
data = []
seq_files = sorted(Path(data_path).glob("*.seq"))
for filename in seq_files:
    print(f"Parsing {filename}")
    sub_data = parse_file(filename)
    if not sub_data:
        print("No useful data")
        continue
    data.extend(sub_data)
    print("Done")

Parsing /data/hiv/data/gbvrl1.seq
Done
Parsing /data/hiv/data/gbvrl10.seq
Done
Parsing /data/hiv/data/gbvrl100.seq
Done
Parsing /data/hiv/data/gbvrl101.seq
No useful data
Parsing /data/hiv/data/gbvrl102.seq
Done
Parsing /data/hiv/data/gbvrl103.seq
Done
Parsing /data/hiv/data/gbvrl104.seq
Done
Parsing /data/hiv/data/gbvrl105.seq
No useful data
Parsing /data/hiv/data/gbvrl106.seq
Done
Parsing /data/hiv/data/gbvrl107.seq
Done
Parsing /data/hiv/data/gbvrl108.seq
No useful data
Parsing /data/hiv/data/gbvrl109.seq
Done
Parsing /data/hiv/data/gbvrl11.seq
Done
Parsing /data/hiv/data/gbvrl110.seq
No useful data
Parsing /data/hiv/data/gbvrl111.seq
Done
Parsing /data/hiv/data/gbvrl112.seq
No useful data
Parsing /data/hiv/data/gbvrl113.seq
No useful data
Parsing /data/hiv/data/gbvrl114.seq
Done
Parsing /data/hiv/data/gbvrl115.seq
No useful data
Parsing /data/hiv/data/gbvrl116.seq
No useful data
Parsing /data/hiv/data/gbvrl117.seq
No useful data
Parsing /data/hiv/data/gbvrl118.seq
Done
Parsing /dat

Done
Parsing /data/hiv/data/gbvrl250.seq
Done
Parsing /data/hiv/data/gbvrl251.seq
No useful data
Parsing /data/hiv/data/gbvrl252.seq
No useful data
Parsing /data/hiv/data/gbvrl253.seq
No useful data
Parsing /data/hiv/data/gbvrl254.seq
No useful data
Parsing /data/hiv/data/gbvrl255.seq
Done
Parsing /data/hiv/data/gbvrl256.seq
No useful data
Parsing /data/hiv/data/gbvrl257.seq
Done
Parsing /data/hiv/data/gbvrl258.seq
No useful data
Parsing /data/hiv/data/gbvrl259.seq
No useful data
Parsing /data/hiv/data/gbvrl26.seq
'Record' object has no attribute 'name'
'Record' object has no attribute 'name'
'Record' object has no attribute 'name'
'Record' object has no attribute 'name'
'Record' object has no attribute 'name'
'Record' object has no attribute 'name'
'Record' object has no attribute 'name'
'Record' object has no attribute 'name'
'Record' object has no attribute 'name'
Done
Parsing /data/hiv/data/gbvrl260.seq
No useful data
Parsing /data/hiv/data/gbvrl261.seq
No useful data
Parsing /data

Done
Parsing /data/hiv/data/gbvrl46.seq
Done
Parsing /data/hiv/data/gbvrl47.seq
Done
Parsing /data/hiv/data/gbvrl48.seq
Done
Parsing /data/hiv/data/gbvrl49.seq
Done
Parsing /data/hiv/data/gbvrl5.seq
No useful data
Parsing /data/hiv/data/gbvrl50.seq
Done
Parsing /data/hiv/data/gbvrl51.seq
Done
Parsing /data/hiv/data/gbvrl52.seq
Done
Parsing /data/hiv/data/gbvrl53.seq
Done
Parsing /data/hiv/data/gbvrl54.seq
No useful data
Parsing /data/hiv/data/gbvrl55.seq
Done
Parsing /data/hiv/data/gbvrl56.seq
Done
Parsing /data/hiv/data/gbvrl57.seq
Done
Parsing /data/hiv/data/gbvrl58.seq
Done
Parsing /data/hiv/data/gbvrl59.seq
Done
Parsing /data/hiv/data/gbvrl6.seq
No useful data
Parsing /data/hiv/data/gbvrl60.seq
Done
Parsing /data/hiv/data/gbvrl61.seq
Done
Parsing /data/hiv/data/gbvrl62.seq
Done
Parsing /data/hiv/data/gbvrl63.seq
Done
Parsing /data/hiv/data/gbvrl64.seq
No useful data
Parsing /data/hiv/data/gbvrl65.seq
No useful data
Parsing /data/hiv/data/gbvrl66.seq
Done
Parsing /data/hiv/data/gbvr

In [7]:
# Build a single table from the per-record dicts gathered while parsing.
# Records that lack a key (e.g. 'country' or 'gene') appear as missing values
# in that column.
df = pd.DataFrame(data)
df

Unnamed: 0,accession,length,sequence,country,gene
0,AB000071,129,GTACAAATTAATTGTACAAGACCCAACAACAATACAAGAAAAGGTA...,,
1,AB000072,129,GTAGAAATTAATTGTACAAGACCCAGCAATAATACAAGAAAAAGCA...,,
2,AB000073,129,GTAGAAATTAATTGTACAAGACCCAACAATAATACAAGAAAAAGCA...,,
3,AB000074,129,GTAGAAATTAATTGTACAAGACCCAGCAATAATACAAGAAAAAGTA...,,
4,AB000075,129,GTAGAAATTAATTGTACAAGACCCAACAATAATACAAGAAGAAGTA...,,
...,...,...,...,...,...
985917,MZ468890,1302,CCTCAAATCACTCTTTGGCAACGACCCCTCGTCACAGTAAAGATAG...,Poland,
985918,MZ468891,1302,CCTCAAATCACTCTTTGGCAACGACCCATCGTCACAGTAAGGATAG...,Poland,
985919,MZ468892,1302,CCTCAGATCACTCTTTGGCAACGACCCCTCGTCACAATAAAGATAG...,Poland,
985920,MZ468893,1302,CCTCAGATCACTCTTTGGCAACGAYCCATCGTCACAGTAARGGTAG...,Poland,


In [8]:
# Keep the first record for each distinct 'gene' value, restrict the table to
# accession + gene, and drop rows where either is missing.
is_repeat = df.duplicated(['gene'], keep='first')
df = df.loc[~is_repeat, ["accession", 'gene']].dropna()

output_dir = Path(data_path)
df.to_csv(output_dir / "1-integrase.csv", index=False)

# Write the same rows as FASTA: a '>accession' header line followed by the
# integrase protein sequence.
with open(output_dir / "1-integrase.fasta", "w") as fasta_file:
    for accession, gene in zip(df["accession"], df["gene"]):
        fasta_file.write(f">{accession}\n")
        fasta_file.write(f"{gene}\n")

In [9]:
# Display the de-duplicated accession/gene table produced by the previous cell.
df

Unnamed: 0,accession,gene
8184,AB716142,FLDGIXKAQEEHERYHSNWRAMASDFNLPPIVAKEIVANCDKCQLK...
8185,AB716143,FLDGIDKAQEEHERYHSNWRAMASDFNLPPVVAKQIVANCDKCQLK...
8186,AB716144,FLDGIDKAQEDHEKYHSNWRAMASDFNLPPIVAKEIVANCDKCQLK...
8187,AB716145,FLDGIDKAQEEHERYHSNWRAMASDFNLPPVIAKEIVASCNKCQJK...
8188,AB716146,FLDGIDKAQEEHEKYHNNWRAMASDFNLPPVVAKQIVANCDKCQLK...
...,...,...
967061,EU517892,FLDGIDKAQEEHEKYHSNWRAMASDFNLPPVVAKEIVASCDKCQLK...
967063,EU517894,FLDGIDKAQEEHEKYHSNWRAMASDFNLPPVVAKEIVASCDKCQLK...
967064,EU517895,FLDGIDKAQEEHEKYHSNWRAMASDFNLPPVVAKEIVASCDKCQLK...
967065,EU517896,FLDGIDKAQEEHEKYHSNWRAMASDFNLPPVVAKEIVASCDKCQLK...
