In [1]:
import os
import re
import sqlite3

import pandas as pd
import xmltodict
from tqdm import tqdm_notebook
from tqdm.auto import tqdm

# Open (or create) the SQLite database file that the parsed tables are appended to.
conn = sqlite3.connect("human_protein.db")

In [2]:
# Sanity check that the UniProt export is present before the slow parse below.
# The path is built with os.path.join so the check does not rely on the
# Windows-only backslash separator.
os.path.exists(os.path.join('data', 'reviewed_human_protein.xml'))

True

In [3]:
# This takes a couple minutes
# Decode explicitly as UTF-8 (the encoding UniProt XML exports use); without it
# open() falls back to the platform locale (e.g. cp1252 on Windows) and can
# garble or reject non-ASCII characters.
with open('data/reviewed_human_protein.xml', encoding='utf-8') as f:
    human_protein_dict = xmltodict.parse(f.read())

In [4]:
# Inspect the top-level keys of the parsed document.
human_protein_dict.keys()

odict_keys(['uniprot'])

In [5]:
# Inspect the keys found under the 'uniprot' root element.
human_protein_dict['uniprot'].keys()

odict_keys(['@xmlns', '@xmlns:xsi', '@xsi:schemaLocation', 'entry', 'copyright'])

In [6]:
# Take the first entry as a sample and list the fields available on it.
entry = human_protein_dict['uniprot']['entry'][0]
entry.keys()

odict_keys(['@dataset', '@created', '@modified', '@version', 'accession', 'name', 'protein', 'gene', 'organism', 'reference', 'comment', 'dbReference', 'proteinExistence', 'keyword', 'feature', 'evidence', 'sequence'])

In [7]:
# Count the entries in the parsed file.
len(human_protein_dict['uniprot']['entry'])

20394

In [8]:
def commentAlgo(comment, locs):
    """Placeholder for handling values that are sometimes lists and sometimes flat entries.

    Not implemented: both arguments are accepted and ignored, and the
    function returns None.
    """
    return None

def _as_list(node):
    """Normalise an xmltodict child to a list: None -> [], a lone item -> [item]."""
    if node is None:
        return []
    if isinstance(node, list):
        return node
    return [node]


def _node_text(node):
    """Return the text of a node that is either a plain string or a dict carrying '#text'."""
    if isinstance(node, str):
        return node
    if isinstance(node, dict):
        return node.get("#text")
    return None


def entryCommentParser(entry_name, comments):
    """Split an entry's comments into function, disease, tissue and subcellular rows.

    Parameters
    ----------
    entry_name : value placed in the first column of every row.
    comments : a single comment dict, a list of comment dicts, or None.

    Returns
    -------
    (function_list, disease_list, tissue_list, subcellular_list)
        function_list    rows of [entry_name, comment type, text] for the
                         'function', 'pathway', 'activity regulation' and
                         'similarity' comment types
        disease_list     rows of [entry_name, disease name]
        tissue_list      rows of [entry_name, text]
        subcellular_list rows of [entry_name, location], de-duplicated within
                         each comment

    Text nodes may be a plain string or a dict with a '#text' key; both forms
    are recorded. Comments whose text/name/location is missing are skipped
    rather than raising or producing a row containing None.
    """
    function_list = []
    disease_list = []
    tissue_list = []
    subcellular_list = []

    for comment in _as_list(comments):
        if not isinstance(comment, dict):
            continue
        comm_type = comment.get("@type")

        if comm_type in ('function', 'pathway', 'activity regulation', 'similarity'):
            value = _node_text(comment.get("text"))
            if value is not None:
                function_list.append([entry_name, comm_type, value])

        elif comm_type == 'disease':
            # Some disease comments carry only a free-text note and no <disease> child.
            disease = comment.get('disease')
            if isinstance(disease, dict):
                name = _node_text(disease.get("name"))
                if name is not None:
                    disease_list.append([entry_name, name])

        elif comm_type == 'tissue specificity':
            value = _node_text(comment.get('text'))
            if value is not None:
                tissue_list.append([entry_name, value])

        # This is a really messily annotated section in these entries: both
        # 'subcellularLocation' and its 'location' child may be a single node
        # or a list, and each location may be a string or a dict.
        elif comm_type == 'subcellular location':
            seen = set()
            for loc_head in _as_list(comment.get('subcellularLocation')):
                if not isinstance(loc_head, dict):
                    continue
                for loc in _as_list(loc_head.get('location')):
                    value = _node_text(loc)
                    if value is None or value in seen:
                        continue
                    seen.add(value)
                    subcellular_list.append([entry_name, value])

    return function_list, disease_list, tissue_list, subcellular_list

def featureParser(entry_name, features):
    """Extract secondary-structure and residue-modification features.

    Right now I only want features that are annotations of;

    -strand (beta sheet)
    -helix  (alpha helix)
    -turn   (highly structured secondary structure)

    plus the residue-level annotations 'modified residue',
    'non-standard residue', 'lipidation', 'glycosylation', 'disulfide bond'
    and 'cross-link'.

    Parameters
    ----------
    entry_name : value placed in the first column of every row.
    features : a single feature dict, a list of feature dicts, or None.

    Returns
    -------
    (sec_struc_list, modified_residues)
        sec_struc_list    rows of [entry_name, type, begin, end]
        modified_residues rows of [entry_name, type, description, position],
                          where position is "begin-end" for a range, the one
                          known end when the other is missing, or the single
                          '@position' for point features (None if unknown)
    """
    # A lone feature arrives as a dict rather than a one-element list; wrap it
    # so it is parsed instead of being iterated key by key.
    if features is None:
        features = []
    elif not isinstance(features, list):
        features = [features]

    sec_struc_list = []
    modified_residues = []
    for feature in features:
        if not isinstance(feature, dict):
            continue
        feat_type = feature.get("@type")
        location = feature.get("location") or {}

        if feat_type in ('strand', 'helix', 'turn'):
            begin = (location.get('begin') or {}).get('@position')
            end = (location.get('end') or {}).get('@position')
            sec_struc_list.append([entry_name, feat_type, begin, end])

        elif feat_type in ('modified residue', 'non-standard residue', 'lipidation',
                           'glycosylation', 'disulfide bond', 'cross-link'):
            if 'begin' in location:
                begin = (location.get('begin') or {}).get('@position')
                end = (location.get('end') or {}).get('@position')
                if begin is None:
                    # Also covers both ends missing: position stays None.
                    position = end
                elif end is None:
                    position = begin
                else:
                    position = begin + "-" + end
            else:
                position = (location.get('position') or {}).get('@position')
            modified_residues.append(
                [entry_name, feat_type, feature.get("@description"), position])

    return sec_struc_list, modified_residues

def keywordParser(keyword):
    """Join an entry's keywords into one comma-separated string.

    `keyword` may be None, a single keyword or a list of keywords, where each
    keyword is either a plain string or a dict carrying the label under
    '#text'. Items with no usable text are skipped. Returns "" when there are
    no keywords.
    """
    if keyword is None:
        return ""
    # A lone keyword arrives as a bare dict/str; wrap it so it is not iterated
    # key by key (dict) or character by character (str).
    if not isinstance(keyword, list):
        keyword = [keyword]

    text = []
    for x in keyword:
        if isinstance(x, str):
            text.append(x)
        elif isinstance(x, dict):
            label = x.get('#text')
            if label is not None:
                text.append(label)
    return ', '.join(text)

def entryParser(entry, conn, print_option=False):
    """Parse one UniProt entry into per-topic tables and append them to SQLite.

    Parameters
    ----------
    entry : dict for a single entry, as produced by xmltodict.
    conn : database connection handed to `DataFrame.to_sql`.
    print_option : when True, print a heading and display each non-empty table
        before anything is written.

    Returns None. Every non-empty table is written with if_exists='append', so
    parsing the same entry twice stores its rows twice.
    """
    # Get name
    entry_name = entry.get('name')
    # Get sequence
    entry_seq = entry.get('sequence').get('#text').replace("\n", "").strip()
    # Parse keywords
    entry_keywords = ""
    if entry.get('keyword') is not None:
        entry_keywords = keywordParser(entry.get('keyword'))
    # Parse features; an entry with no features yields None, which cannot be iterated
    entry_features = entry.get('feature') or []
    sec_struc_list, modified_residues = featureParser(entry_name, entry_features)
    # Parse comments
    entry_comments = entry.get('comment', None)
    comm_list, disease_list, tissue_list, subcellular_list = [], [], [], []
    if entry_comments is not None:
        comm_list, disease_list, tissue_list, subcellular_list \
            = entryCommentParser(entry_name, entry_comments)

    # (heading shown when print_option is set, SQLite table name, frame)
    tables = [
        # One row each protein: protein name, sequence and keyword string
        ("Protein DataFrame", 'protein',
         pd.DataFrame([[entry_name, entry_seq, entry_keywords]],
                      columns=["protein", "sequence", "keywords"])),
        # Multiple rows each protein for secondary structure: protein name,
        # secondary structure type, where it starts, where it ends
        ("Secondary structure DataFrame", 'protein_secondary_structure',
         pd.DataFrame(sec_struc_list,
                      columns=["protein", "sec_struc_type", "begin", "end"])),
        # Multiple rows each protein: protein name, modification type,
        # description of modification, position of modification
        ("Amino Acid Modifications", 'protein_amino_acid_modifications',
         pd.DataFrame(modified_residues,
                      columns=['protein', 'modification', 'description', 'position'])),
        # Multiple rows each protein for comments: protein name, comment type, comment value
        ("Comment DataFrame", 'protein_comments',
         pd.DataFrame(comm_list, columns=["protein", "comm_type", "value"])),
        # Multiple rows each protein for disease associations: protein name, disease name
        ("Disease DataFrame", 'protein_diseases',
         pd.DataFrame(disease_list, columns=["protein", "disease"])),
        # Multiple rows each protein for tissue expression: protein name, tissue.
        # The second column was previously mislabelled 'disease'; a database
        # created with that old column name must be rebuilt (or the column
        # renamed) before new rows can be appended.
        ("Tissue DataFrame", 'protein_tissue_expression',
         pd.DataFrame(tissue_list, columns=['protein', 'tissue'])),
        # Multiple rows each protein for subcellular localization: protein name, subcellular loc
        ("Subcellular Location DataFrame", 'protein_subcellular_localization',
         pd.DataFrame(subcellular_list, columns=['protein', 'subcellular_loc'])),
    ]

    if print_option:
        for title, _, frame in tables:
            if not frame.empty:
                print(title)
                display(frame)

    # Record entries in sqlite
    for _, table_name, frame in tables:
        if not frame.empty:
            frame.to_sql(table_name, conn, if_exists='append', index=False)


In [9]:
# Takes a little over 15 minutes
# NOTE: entryParser appends to the tables, so running this cell again against
# the same database file stores every row a second time.
# tqdm.auto picks the notebook widget when available and falls back to the
# console bar; it replaces the deprecated top-level tqdm_notebook.
for entry in tqdm(human_protein_dict['uniprot']['entry']):
    entryParser(entry, conn)

HBox(children=(IntProgress(value=0, max=20394), HTML(value='')))


