#### Genbank: 
https://www.ncbi.nlm.nih.gov/genbank/

In [121]:
import numpy as np
import pandas as pd
import re

In [122]:
file = 'ptenSequence.gb'

# The GenBank flat file is structured roughly like this:
"""
LOCUS       ...
DEFINITION  ...
ACCESSION   ...
VERSION     ...
DBLINKS     ...
KEYWORDS    ...
SOURCE      ...
REFERENCE   ...
    CONSRTM
    TITLE
    JOURNAL
x3
COMMENT     ...
FEATURES             Location/Qualifiers
    source           1..206
    misc_feature
    regulatory
    gene
    mRNA
    CDS
    exon
    ................
ORIGIN
    1 atgcgtaa ...
    61 ggtttgcgac ...
    121 ggggggttta ...   // Step of 60
"""

# The best way to organize the data is probably nested dictionaries.
# Loop through the lines, check each one for a keyword, and collect the lines that follow until the next keyword appears (or the indented continuation lines end).

'\nLOCUS       ...\nDEFINITION  ...\nACCESSION   ...\nVERSION     ...\nDBLINKS     ...\nKEYWORDS    ...\nSOURCE      ...\nREFERENCE   ...\n    CONSRTM\n    TITLE\n    JOURNAL\nx3\nCOMMENT     ...\nFEATURES             Location/Qualifiers\n    source           1..206\n    misc_feature\n    regulatory\n    gene\n    mRNA\n    CDS\n    exon\n    ................\nORIGIN\n    1 atgcgtaa ...\n    61 ggtttgcgac ...\n    121 ggggggttta ...   // Step of 60\n'

In [123]:
def readFile(file):
    """Read a text file and return its lines.

    Parameters
    ----------
    file : str or path-like
        Path of the file to read.

    Returns
    -------
    list[str] or None
        Every line of the file (line endings preserved), or None when the
        file does not exist. In that case a message is printed instead of
        raising.
    """
    try:
        with open(file) as handle:
            # Iterating a text handle yields the same lines as readlines().
            return list(handle)
    except FileNotFoundError:
        print(f"File {file} not found.")
    return None
    
data = readFile(file)

In [124]:
# ===== Helper Functions =====

def hasMainKeyword(line):
    """Detect a top-level keyword at the very start of ``line``.

    A main keyword is a run of uppercase letters beginning in column 0 that
    is followed by whitespace or by the end of the line. Any text after the
    keyword is returned as its content.

    Parameters
    ----------
    line : str
        One line of the file, with or without its trailing newline.

    Returns
    -------
    dict or None
        ``{'keyword': str, 'content': str, 'indentation': 0}`` when the line
        starts with a keyword, otherwise None. ``content`` is the empty
        string for a bare keyword line such as ``"ORIGIN\\n"``.
    """
    # The lookahead keeps words like "ABCdef" from matching, while still
    # accepting a keyword that has nothing after it. Previously a bare
    # keyword line was not recognised, so the lines below it were attributed
    # to the preceding section.
    findMainKeyandContent = r'^([A-Z]+)(?=\s|$)\s*(.*)'
    match = re.search(findMainKeyandContent, line)
    if match:
        return {
            'keyword': match.group(1).strip(),
            'content': match.group(2).strip(),
            'indentation': 0
        }
    return None

def hasSubKeyword(line):
    """Detect an indented uppercase keyword followed by content.

    Parameters
    ----------
    line : str
        One line of the file.

    Returns
    -------
    dict or None
        ``{'keyword', 'content', 'indentation'}`` when the line starts with
        whitespace, then a run of uppercase letters, then whitespace and at
        least one more character; otherwise None. ``indentation`` is the
        number of space characters in the leading whitespace (tabs are not
        counted).
    """
    found = re.match(r'^(\s+)([A-Z]+)\s+(.+)', line)
    if found is None:
        return None

    leading, name, rest = found.groups()
    return {
        'keyword': name.strip(),
        'content': rest.strip(),
        'indentation': leading.count(' '),
    }

def hasOnlyContent(line):
    """Detect an indented line and return its text.

    Parameters
    ----------
    line : str
        One line of the file.

    Returns
    -------
    dict or None
        ``{'content': str}`` with surrounding whitespace stripped when the
        line begins with whitespace followed by at least one more character
        on the same line; otherwise None. Indented keyword lines match too,
        since only the leading whitespace is checked.
    """
    found = re.match(r'^\s+(.+)', line)
    return None if found is None else {'content': found.group(1).strip()}

def findParentKeyword(line,data,index=106):
    """Find the keyword that ``line`` belongs to by scanning ``data`` upwards.

    Parameters
    ----------
    line : str
        The line whose parent is wanted.
    data : list[str]
        All lines of the file.
    index : int, optional
        Position in ``data`` where the backward search starts (normally the
        position of ``line`` itself). Values beyond the end of ``data`` are
        clamped to the last line instead of raising IndexError.

    Returns
    -------
    str or None
        * For a sub-keyword line: the keyword of the nearest line above it
          that carries a main or sub keyword with strictly smaller
          indentation.
        * For any other indented line: the keyword of the nearest main or
          sub keyword line at or above ``index``.
        * None when no parent exists, when ``line`` is not indented, or when
          ``data`` is empty.
    """
    if not data:
        return None
    # Clamp so the default / a stale index cannot run off the end of the list.
    start = min(index, len(data) - 1)

    subKeyword = hasSubKeyword(line)
    if subKeyword:
        for i in range(start, -1, -1):
            candidate = hasMainKeyword(data[i]) or hasSubKeyword(data[i])
            if candidate and candidate['indentation'] < subKeyword['indentation']:
                return candidate['keyword']
        # No less-indented keyword above. Do not fall through to the
        # content branch: it would start at the line itself and report the
        # line's own keyword as its parent.
        return None

    if hasOnlyContent(line):
        for i in range(start, -1, -1):
            candidate = hasMainKeyword(data[i]) or hasSubKeyword(data[i])
            if candidate:
                return candidate['keyword']
    return None


# findParentKeyword('            CM000672.2.',data)


In [128]:
# Parse the GenBank lines into a nested dict:
#   fixed_data[MAIN] = {'content': str, 'subKeywords': {SUB: str, ...}}
#
# The parser walks the file once and remembers which main keyword / sub-keyword
# it is currently inside. Looking the parent up again for every line dropped
# continuation lines of sub-keywords (their nearest keyword is the sub-keyword
# itself, which is never a key of fixed_data), dropped sub-keywords nested
# deeper than an earlier sibling, and re-scanned the file for every line.
#
# NOTE: a main keyword that occurs more than once (e.g. several REFERENCE
# records) replaces the earlier entry, so only the last occurrence is kept.
fixed_data = {}
last_main = None   # main keyword currently being filled
last_sub = {}      # main keyword -> most recent sub-keyword seen under it (None if none yet)

for line in data:
    mainKeyword = hasMainKeyword(line)
    subKeyword = hasSubKeyword(line)
    onlyContent = hasOnlyContent(line)

    if mainKeyword:
        key = mainKeyword['keyword']
        fixed_data[key] = {
            'content': mainKeyword['content'],
            'subKeywords': {}
        }
        last_main = key
        last_sub[key] = None

    elif last_main is None:
        # Indented or blank lines before the first main keyword have no owner.
        continue

    elif subKeyword:
        sub_key = subKeyword['keyword']
        fixed_data[last_main]['subKeywords'][sub_key] = subKeyword['content']
        last_sub[last_main] = sub_key

    elif onlyContent:
        sub_key = last_sub.get(last_main)
        if sub_key:
            # Continuation of the most recent sub-keyword.
            fixed_data[last_main]['subKeywords'][sub_key] += '\n' + onlyContent['content']
        else:
            # Continuation of the main keyword's own content.
            fixed_data[last_main]['content'] += '\n' + onlyContent['content']


# One row per main keyword; columns are 'content' and 'subKeywords'.
df = pd.DataFrame.from_dict(fixed_data, orient='index')


In [137]:
df

# get subkeywords of REFERENCE
# df.loc['REFERENCE', 'subKeywords']

# Single label-based lookup (row 'FEATURES', column 'content') instead of
# chaining two [] selections.
df.loc['FEATURES', 'content']

'Location/Qualifiers\nsource          1..108306\n/organism="Homo sapiens"\n/mol_type="genomic DNA"\n/db_xref="taxon:9606"\n/chromosome="10"\nmisc_feature    <1..65\n/standard_name="ATAC-STARR-seq lymphoblastoid silent\nregion 2585"\n/note="Region: biological region; Derived by automated\ncomputational analysis using gene prediction method:\nRefSeqFE."\n/db_xref="GeneID:130004273"\nregulatory      <1..65\n/regulatory_class="silencer"\n/experiment="EXISTENCE:reporter gene assay evidence\n[ECO:0000049][PMID:35858748]"\n/note="silent region_2585"\n/function="represses an Ori minimal core promoter by\nATAC-STARR-seq in GM12878 lymphoblastoid cells\n{active_cell/tissue: GM12878}"\n/db_xref="GeneID:130004273"\ngene            1..108306\n/gene="PTEN"\n/gene_synonym="10q23del; BZS; CWS1; DEC; GLM2; MHAM;\nMMAC1; PTEN1; PTENbeta; PTENgama; TEP1"\n/note="phosphatase and tensin homolog; Derived by\nautomated computational analysis using gene prediction\nmethod: BestRefSeq."\n/db_xref="GeneID:5728"

In [138]:
# Save the parsed table as CSV in the working directory; index=True writes the
# DataFrame index as the first column.
df.to_csv('parsed_genbank.csv', index=True)

In [None]:
class GenBank:
    """Holds the lines of the GenBank file named by the module-level ``file`` variable."""
    def __init__(self):
        # Lines returned by readFile(); None when the file could not be found.
        self.data = readFile(file)
        # Placeholders: nothing in this block assigns them a value yet.
        self.orgName = None
        self.geneName = None
    