In [1]:
from Bio import GenBank
from pathlib import Path
import pandas as pd

In [2]:
# Directory that is scanned for GenBank flat files (*.seq); the generated CSV files
# (one per input file, plus all.csv) are written into this same directory.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
data_path = "/data/hiv/data"

In [20]:
def _strip_quotes(value):
    """Return ``value`` with every single and double quote character removed."""
    return value.replace('"', '').replace("'", "")


def parse_record(record, genes=("pol",)):
    """Flatten one GenBank record into a dict, or return None if it is not useful.

    Only records whose SOURCE line contains 'HIV' are considered, e.g.
        SOURCE      Human immunodeficiency virus 1 (HIV-1)

    Parameters
    ----------
    record
        Object exposing ``source`` (str), ``accession`` (sequence whose first item
        is used), ``sequence`` and ``features``. Each feature has a ``key`` and a
        list of ``qualifiers``; each qualifier has a ``key`` and a ``value``.
    genes
        Lower-case gene names whose CDS translation should be extracted.
        A single string is treated as one gene name. Defaults to ``("pol",)``.

    Returns
    -------
    dict or None
        ``accession``, ``length`` and ``sequence`` are always present; ``country``
        is added when the source feature carries a ``/country=`` qualifier; one key
        per extracted gene holds its protein translation. ``None`` is returned when
        the record is not HIV or when none of the requested genes was found.

    Raises
    ------
    IndexError
        If ``record.accession`` is empty.
    """
    if 'HIV' not in record.source:
        return None

    # A bare string would turn the membership test below into a substring match.
    if isinstance(genes, str):
        genes = (genes,)
    wanted = {name.lower() for name in genes}

    obj = {
        "accession": record.accession[0],
        'length': len(record.sequence),
        'sequence': record.sequence,
    }
    # Tracked explicitly: counting dict keys would wrongly discard a record that
    # has a wanted gene but no /country= qualifier.
    found_gene = False

    # Example feature list:
    # [Feature(key='source', location='1..1686'), Feature(key='gene', location='<1..>1686'), Feature(key='CDS', location='<1..>1686')]
    for feature in record.features:
        if feature.key == "source":
            # Example qualifiers:
            # [Qualifier(key='/organism=', value='"Human immunodeficiency virus 1"'), Qualifier(key='/proviral', value=''), Qualifier(key='/mol_type=', value='"genomic DNA"'), Qualifier(key='/db_xref=', value='"taxon:11676"'), Qualifier(key='/country=', value='"Spain"')]
            for qualifier in feature.qualifiers:
                if qualifier.key == "/country=":
                    obj["country"] = _strip_quotes(qualifier.value)

        elif feature.key == "CDS":
            gene = None
            protein = None

            for qualifier in feature.qualifiers:
                if qualifier.key == "/gene=":
                    gene = _strip_quotes(qualifier.value).lower()
                elif qualifier.key == "/translation=":
                    protein = _strip_quotes(qualifier.value)

            # Gene names in the data contain quite a lot of junk, mixed case, etc.
            # Values observed so far:
            # ['pol', 'env', 'gag',
            #  'vpr', 'vif', 'tat', 'rev', 'vpu', 'nef', 'gag-pol', 'rev1', 'gp120',
            #  'reverse transcriptase', 'HIV-1 protease', 'RT', 'protease', 'PR',
            #  'p24', 'envelope', 'gp160', 'tat/rev', 'vpu*', 'rt', 'ORF',
            #  'v-1 PROTEASE', 'Gag', 'Pol', 'Vif', 'Vpr', 'Tat', 'Rev', 'Vpu', 'Env',
            #  'Nef', 'pro', 'as', 'v-1 reverse transcriptase', 'v-1 protease', 'Pro',
            #  'V-1 protease', 'V-1 reverse transcriptase', 'polyprotein', 'rnv', 'gg',
            #  'env V3', 'gp41', 'GP160', 'GP120', 'GP41', 'env gene', 'RAK alpha',
            #  'V3', 'c2v3', 'vpx']
            # Only exact (lower-cased) matches against the requested genes are kept;
            # if several CDS features name the same gene, the last one wins.
            if protein and gene and gene in wanted:
                obj[gene] = protein
                found_gene = True

    if not found_gene:
        return None

    return obj
    
    
def parse_file(filename):
    """Parse one GenBank flat file and collect the useful records.

    Every record produced by ``GenBank.parse`` is passed to ``parse_record``;
    truthy results are gathered into a list. A record that raises while being
    read or converted is reported with ``print`` and skipped, so one bad record
    does not abort the rest of the file.

    Parameters
    ----------
    filename
        Path of the GenBank file to read (opened in text mode).

    Returns
    -------
    list
        The truthy ``parse_record`` results, in file order.
    """
    collected = []
    with open(filename, "r") as handle:
        record_iter = GenBank.parse(handle)
        exhausted = False
        while not exhausted:
            try:
                parsed = parse_record(next(record_iter))
            except StopIteration:
                # No more records in this file.
                exhausted = True
            except Exception as exc:
                # Best effort: log the problem and move on to the next record.
                print(exc)
            else:
                if parsed:
                    collected.append(parsed)

    return collected
                

In [21]:
# Parse every *.seq file under data_path, writing one CSV next to each input file
# that yielded records, and accumulate all records for the combined table.
data = []
for filename in sorted(Path(data_path).glob("*.seq")):
    print(f"Parsing {filename}")
    sub_data = parse_file(filename)
    if not sub_data:
        print("No useful data")
        continue

    data += sub_data
    sub_df = pd.DataFrame(sub_data)
    sub_df.to_csv(filename.with_suffix(".csv"))
    print("Done")


# Combined table of all records from all files.
df = pd.DataFrame(data)
df.to_csv(Path(data_path) / "all.csv")

Parsing /data/hiv/data/gbvrl1.seq
Done
Parsing /data/hiv/data/gbvrl10.seq
Done
Parsing /data/hiv/data/gbvrl100.seq
Done
Parsing /data/hiv/data/gbvrl101.seq
No useful data
Parsing /data/hiv/data/gbvrl102.seq
Done
Parsing /data/hiv/data/gbvrl103.seq
Done
Parsing /data/hiv/data/gbvrl104.seq
Done
Parsing /data/hiv/data/gbvrl105.seq
No useful data
Parsing /data/hiv/data/gbvrl106.seq
Done
Parsing /data/hiv/data/gbvrl107.seq
Done
Parsing /data/hiv/data/gbvrl108.seq
No useful data
Parsing /data/hiv/data/gbvrl109.seq
Done
Parsing /data/hiv/data/gbvrl11.seq
Done
Parsing /data/hiv/data/gbvrl110.seq
No useful data
Parsing /data/hiv/data/gbvrl111.seq
Done
Parsing /data/hiv/data/gbvrl112.seq
No useful data
Parsing /data/hiv/data/gbvrl113.seq
No useful data
Parsing /data/hiv/data/gbvrl114.seq
Done
Parsing /data/hiv/data/gbvrl115.seq
No useful data
Parsing /data/hiv/data/gbvrl116.seq
No useful data
Parsing /data/hiv/data/gbvrl117.seq
No useful data
Parsing /data/hiv/data/gbvrl118.seq
No useful data
Pa

No useful data
Parsing /data/hiv/data/gbvrl25.seq
Done
Parsing /data/hiv/data/gbvrl250.seq
No useful data
Parsing /data/hiv/data/gbvrl251.seq
No useful data
Parsing /data/hiv/data/gbvrl252.seq
No useful data
Parsing /data/hiv/data/gbvrl253.seq
No useful data
Parsing /data/hiv/data/gbvrl254.seq
No useful data
Parsing /data/hiv/data/gbvrl255.seq
Done
Parsing /data/hiv/data/gbvrl256.seq
No useful data
Parsing /data/hiv/data/gbvrl257.seq
Done
Parsing /data/hiv/data/gbvrl258.seq
No useful data
Parsing /data/hiv/data/gbvrl259.seq
No useful data
Parsing /data/hiv/data/gbvrl26.seq
'Record' object has no attribute 'name'
'Record' object has no attribute 'name'
'Record' object has no attribute 'name'
'Record' object has no attribute 'name'
'Record' object has no attribute 'name'
'Record' object has no attribute 'name'
'Record' object has no attribute 'name'
'Record' object has no attribute 'name'
'Record' object has no attribute 'name'
Done
Parsing /data/hiv/data/gbvrl260.seq
No useful data
Pars

Done
Parsing /data/hiv/data/gbvrl44.seq
Done
Parsing /data/hiv/data/gbvrl45.seq
Done
Parsing /data/hiv/data/gbvrl46.seq
Done
Parsing /data/hiv/data/gbvrl47.seq
Done
Parsing /data/hiv/data/gbvrl48.seq
Done
Parsing /data/hiv/data/gbvrl49.seq
Done
Parsing /data/hiv/data/gbvrl5.seq
No useful data
Parsing /data/hiv/data/gbvrl50.seq
Done
Parsing /data/hiv/data/gbvrl51.seq
Done
Parsing /data/hiv/data/gbvrl52.seq
Done
Parsing /data/hiv/data/gbvrl53.seq
Done
Parsing /data/hiv/data/gbvrl54.seq
No useful data
Parsing /data/hiv/data/gbvrl55.seq
Done
Parsing /data/hiv/data/gbvrl56.seq
Done
Parsing /data/hiv/data/gbvrl57.seq
Done
Parsing /data/hiv/data/gbvrl58.seq
Done
Parsing /data/hiv/data/gbvrl59.seq
No useful data
Parsing /data/hiv/data/gbvrl6.seq
No useful data
Parsing /data/hiv/data/gbvrl60.seq
Done
Parsing /data/hiv/data/gbvrl61.seq
No useful data
Parsing /data/hiv/data/gbvrl62.seq
No useful data
Parsing /data/hiv/data/gbvrl63.seq
Done
Parsing /data/hiv/data/gbvrl64.seq
No useful data
Parsing

In [None]:
# Display the combined table built from all parsed files.
df

In [None]:
# Number of non-null entries in each column, i.e. how many records carry each field.
df.notnull().sum()

In [None]:
# !jupyter nbconvert --to script 1-build-dataset.ipynb