In [1]:
import requests
import pandas as pd
import numpy as np

In [2]:
def Query_Uniprot(Uniprot_ID_query):
    """Search UniProtKB and return the hits that carry a sequence.

    Parameters
    ----------
    Uniprot_ID_query : str
        Query expression appended verbatim to the search URL, so it must
        already be URL-encoded (e.g. terms joined with ``%20OR%20``).

    Returns
    -------
    pandas.DataFrame
        One row per element of the JSON ``results`` array whose ``sequence``
        field is not missing. A zero-row frame is returned when the search
        yields no results or no result has a ``sequence`` field.

    Raises
    ------
    requests.HTTPError
        If UniProt answers with an HTTP error status.
    requests.Timeout
        If the server does not respond within the timeout.
    """
    # The search endpoint is paginated (25 entries per page by default).
    # Ask for the maximum page size of 500 so that queries OR-ing together
    # many IDs are not silently truncated to the first page.
    query_uniprot = (
        'https://rest.uniprot.org/uniprotkb/search?size=500&query='
        + Uniprot_ID_query
    )

    response_uniprot = requests.get(query_uniprot, timeout=60)
    # Fail loudly on HTTP errors instead of with an opaque KeyError below.
    response_uniprot.raise_for_status()

    df_uniprot = pd.DataFrame(response_uniprot.json().get('results', []))

    # No hits (or no hit with a sequence) -> there is no 'sequence' column
    # to filter on; hand back a zero-row frame with whatever columns exist.
    if 'sequence' not in df_uniprot.columns:
        return df_uniprot.iloc[0:0]

    ## Remove entries without sequence
    df_uniprot_filtered = df_uniprot[df_uniprot['sequence'].notna()]
    ## Optional: remove unreviewed entries (currently disabled)
    # df_uniprot_filtered = df_uniprot_filtered[df_uniprot_filtered['entryType'] == 'UniProtKB reviewed (Swiss-Prot)']

    return df_uniprot_filtered

In [3]:
## Accumulators filled by the eggNOG -> Uniprot lookup loop:
##   found_list     - IDs for which Uniprot entries were retained
##   not_found_list - IDs for which no Uniprot entry was retained
##   result_dict    - ID -> list of 'UniRef90_'-prefixed accessions
found_list, not_found_list = list(), list()
result_dict = dict()

In [4]:
## eggNOG v3.0 group IDs to look up
eggnog_ids = [
    "bactNOG00014", "bactNOG00441", "bactNOG00852", "bactNOG00947", "bactNOG01259",
    "bactNOG01662", "bactNOG01793", "bactNOG01844", "bactNOG02159", "bactNOG02194",
    "bactNOG02239", "bactNOG02592", "bactNOG02881", "bactNOG02937", "bactNOG03269",
    "bactNOG03766", "bactNOG04578", "bactNOG05056", "bactNOG05829", "bactNOG06811",
    "bactNOG07153", "bactNOG07881", "bactNOG08340", "bactNOG10247", "bactNOG10565",
    "bactNOG12453", "bactNOG12836", "bactNOG14098", "bactNOG15341", "bactNOG16627",
    "bactNOG18192", "bactNOG18519", "bactNOG18630", "bactNOG20249", "bactNOG24711",
    "bactNOG26316", "bactNOG27043", "bactNOG28876", "bactNOG30464", "bactNOG31116",
    "bactNOG36984", "bactNOG48307", "bactNOG51678", "bactNOG62245", "bactNOG69335",
    "bactNOG82617", "firmNOG00626", "firmNOG04290", "NOG132553", "NOG133663",
    "proNOG30191",
]
## Maximum number of 'id:' terms OR-ed together in one Uniprot query
uniprot_chunk_size = 500

for ID in eggnog_ids:

    ## Fetch the member table of this eggNOG group
    query_string = "http://eggnog.embl.de/version_3.0/cgi/members.py?search_term=" + ID + "&group=" + ID
    response = requests.get(query_string)
    # First HTML table of the page; the code below relies on its 'Uniprot' column
    df = pd.read_html(response.text)[0]

    ## Remove those proteins without Uniprot ID
    filtered_df = df[df['Uniprot'].notna()]

    ## Print warning and skip the ID if no entries can be found for it
    if len(filtered_df) == 0:
        print('Nothing found for ' +  ID + ' in EggnogV3.0!!!')
        continue

    ## One 'id:<Uniprot ID>' search term per distinct ID (sorted so the
    ## queries are built in a reproducible order)
    Uniprot_ID = ['id:' + x for x in sorted(set(filtered_df['Uniprot']))]

    ## Query Uniprot chunk by chunk. Slicing past the end of a list simply
    ## yields a shorter final chunk, so one loop covers 1, <500 and >=500 IDs.
    df_list_tmp = []
    for chunk_index in range(0, len(Uniprot_ID), uniprot_chunk_size):
        chunked_Uniprot_ID = Uniprot_ID[chunk_index:chunk_index + uniprot_chunk_size]
        Uniprot_ID_query = '%20OR%20'.join(chunked_Uniprot_ID)
        df_list_tmp.append(Query_Uniprot(Uniprot_ID_query))
    uniprot_df = pd.concat(df_list_tmp, ignore_index=True)

    ## Filter non-Bacteria entries: keep rows whose organism lineage
    ## contains 'Bacteria'. A row-wise boolean mask is used so that an empty
    ## frame stays a zero-row frame instead of losing its columns.
    if 'organism' in uniprot_df.columns:
        is_bacteria = uniprot_df['organism'].apply(
            lambda organism: isinstance(organism, dict)
            and 'Bacteria' in organism.get('lineage', [])
        )
        uniprot_df = uniprot_df[is_bacteria.astype(bool)]
    else:
        # Nothing came back from Uniprot, so there is nothing to keep
        uniprot_df = uniprot_df.iloc[0:0]

    if uniprot_df.shape[0] == 0:
        print("No protein found for " + ID + ' in Uniprot!!!')
        not_found_list.append(ID)
    else:
        print("Protein sequences found for " + ID + " !!!!")
        found_list.append(ID)

    ## Always store a list (possibly empty): mixing a scalar NaN with lists
    ## would break the later DataFrame.from_dict(..., orient='index') call.
    if 'primaryAccession' in uniprot_df.columns:
        result_dict[ID] = ['UniRef90_' + x for x in uniprot_df['primaryAccession'].to_list()]
    else:
        result_dict[ID] = []

Protein sequences found for bactNOG00014 !!!!
Protein sequences found for bactNOG00441 !!!!
Protein sequences found for bactNOG00852 !!!!
Protein sequences found for bactNOG00947 !!!!
Protein sequences found for bactNOG01259 !!!!
Protein sequences found for bactNOG01662 !!!!
Protein sequences found for bactNOG01793 !!!!
Protein sequences found for bactNOG01844 !!!!
Protein sequences found for bactNOG02159 !!!!
Protein sequences found for bactNOG02194 !!!!
Protein sequences found for bactNOG02239 !!!!
Protein sequences found for bactNOG02592 !!!!
Protein sequences found for bactNOG02881 !!!!
Protein sequences found for bactNOG02937 !!!!
Protein sequences found for bactNOG03269 !!!!
Protein sequences found for bactNOG03766 !!!!
Protein sequences found for bactNOG04578 !!!!
Protein sequences found for bactNOG05056 !!!!
Protein sequences found for bactNOG05829 !!!!
Protein sequences found for bactNOG06811 !!!!
Protein sequences found for bactNOG07153 !!!!
Protein sequences found for bactNO

In [5]:
## Summarise how many eggNOG IDs ended up with / without retained Uniprot entries.
## NOTE(review): the messages say "reviewed", but the Swiss-Prot-only filter in
## Query_Uniprot is commented out - confirm the wording matches the intent.
print(str(len(found_list)) + ' entries could be found with reviewed protein sequences on uniprot')
print(str(len(not_found_list)) + ' entries could not be found with reviewed protein sequences on uniprot')

51entries could be found with reviewd protein sequences on uniprot
0entries could not be found with reviewd protein sequences on uniprot


In [6]:
## Turn the ID -> accession-list mapping into a table with one row per key
## of result_dict (orient='index' makes the dict keys the row labels).
pseudo_regroup = pd.DataFrame.from_dict(
    data=result_dict,
    orient='index',
)

In [7]:
## Write the table tab-separated, keeping the row labels and omitting the header row.
pseudo_regroup.to_csv(
    './Pseudo_GBM_EGGNOG_to_UniRef90.csv',
    header=False,
    sep='\t',
)