In [1]:
import pandas as pd
import json
import timeit
import seaborn as sns
from modlamp.descriptors import PeptideDescriptor, GlobalDescriptor

In [2]:
# Restore `total_res` from IPython's %store database into this kernel.
# NOTE(review): the cell that saved it (`%store total_res`) is not in this
# notebook section — presumably it was stored by another notebook/session.
%store -r total_res

## Dataframe con todos los resultados

In [4]:
def res_to_pandas(total_res):
    """
    Flatten the nested results of a multiple sequence search into a dataframe.

    Each key of ``total_res`` is a query sequence and each value is a list of
    hit dictionaries. The output has one row per hit: the query sequence in
    the ``query_sequences`` column, followed by the fields of the hit minus
    the columns listed in ``unwanted_columns``.

    :param dict total_res: nested dictionary from multiple sequence search
        (``query sequence -> list of hit dicts``)
    :return res_df: Pandas dataframe with one row per (query, hit) pair; an
        empty dataframe holding only ``query_sequences`` when there are no hits
    """
    unwanted_columns = ['chain_id', 'entity_id', 'entry_entity', 'result_sequence']

    # Walk the mapping once so the query labels and the hit records are
    # guaranteed to stay aligned row by row.
    queries = []
    hits = []
    for query, query_hits in total_res.items():
        for hit in query_hits:
            queries.append(query)
            hits.append(hit)

    # Nothing to flatten: return an empty frame instead of failing with a
    # KeyError when dropping columns that do not exist.
    if not hits:
        return pd.DataFrame(columns=['query_sequences'])

    # errors='ignore' keeps this working when the hits lack one of the
    # columns we only wanted to discard anyway.
    hits_df = pd.DataFrame(hits).drop(columns=unwanted_columns, errors='ignore')
    queries_df = pd.DataFrame({'query_sequences': queries})

    # Both frames share the same default 0..n-1 index, so a column-wise
    # concat lines every query up with its hit.
    return pd.concat([queries_df, hits_df], axis=1)

In [5]:
# Flatten the restored search results into one row per (query sequence, hit).
res_df=res_to_pandas(total_res)

In [6]:
# Display the combined table (the notebook shows a truncated preview of the rows).
res_df

Unnamed: 0,query_sequences,molecule_name,pdb_id,e_value,percentage_identity
0,GLPRKILCAIAKKKGKCKGPLKLVCKC,[Lasiocepsin],2mbd,1.100000e-13,100.0
1,GFGCPGDAYQCSEHCRALGGGRTGGYCAGPWYLGHPTCTCSF,[Fungal defensin eurocin],2lt8,5.700000e-22,100.0
2,GFGCPGDAYQCSEHCRALGGGRTGGYCAGPWYLGHPTCTCSF,[Defensin MGD-1],1fjn,3.900000e-06,52.5
3,GFGCPGDAYQCSEHCRALGGGRTGGYCAGPWYLGHPTCTCSF,[micasin],2lr5,6.500000e-05,50.0
4,GFGCPGDAYQCSEHCRALGGGRTGGYCAGPWYLGHPTCTCSF,[INVERT_DEFENSINS domain-containing protein],2ru0,1.600000e-04,47.5
...,...,...,...,...,...
10013,TFFRLFNR,[Competence stimulating protein],2i2j,5.500000e-02,100.0
10014,RRSVQWCA,[Lactoferricin-H],1z6w,2.800000e-02,100.0
10015,RRSVQWCA,[Lactoferricin-H],1z6v,2.800000e-02,100.0
10016,FKCRRWQWR,[Lactoferricin-B],1lfc,1.200000e-04,100.0


In [4]:
# NOTE(review): `total_res` is already restored by an identical cell near the
# top of the notebook, so this repeat looks redundant on a top-to-bottom run.
%store -r total_res

## PDB sequence

In [None]:
import re
import requests

# Root of the PDBe site; kept with a trailing slash so paths can be appended directly.
base_url = "https://www.ebi.ac.uk/pdbe/"

# Root of the REST API (also ends with a slash).
api_base = base_url + "api/"

# Molecules endpoint; the PDB ID is appended to it by the caller.
# No leading slash here: api_base already ends with one, and a second slash
# would produce ".../api//pdb/entry/molecules/".
molecules_url = api_base + 'pdb/entry/molecules/'

In [6]:
def make_request(url, mode, pdb_id):
    """
    This function can make GET and POST requests to
    the PDBe API and returns the decoded JSON response.

    :param url: String, endpoint URL; for GET requests ``pdb_id`` is
        appended to it
    :param mode: String, either "get" or "post"
    :param pdb_id: String, appended to ``url`` for GET or sent as the
        request body for POST
    :return: JSON or None (None when the response status is not 200; the
        status code and response text are printed in that case)
    :raises ValueError: if ``mode`` is neither "get" nor "post"
    """
    if mode == "get":
        response = requests.get(url=url+pdb_id)
    elif mode == "post":
        response = requests.post(url, data=pdb_id)
    else:
        # Without this branch ``response`` would be unbound below and the
        # caller would get a confusing UnboundLocalError instead.
        raise ValueError("Unsupported mode %r: expected 'get' or 'post'" % (mode,))

    if response.status_code == 200:
        return response.json()

    print("[No data retrieved - %s] %s" % (response.status_code, response.text))
    return None

In [7]:
def get_pdb_sequence(pdb_id_list):
    """
    Get sequences associated with pdb ids

    Every ID is looked up on the molecules endpoint through ``make_request``
    and the ``sequence`` field of the first returned molecule is collected.
    Progress is printed after each ID and the total run time at the end.

    :param pdb_id_list: list of PDB ID strings
    :return: list with one entry per input ID, in the same order; the entry
        is None when no sequence could be retrieved for that ID
    """
    start = timeit.default_timer()
    total = len(pdb_id_list)
    # Remember every ID already looked up so that a repeated ID does not
    # trigger another request.
    fetched = {}
    res = []
    for count, pdb_id in enumerate(pdb_id_list, start=1):
        if pdb_id not in fetched:
            data = make_request(molecules_url, 'get', pdb_id)
            # make_request returns None on a non-200 response, and the entry
            # for this ID may be missing or empty: record None instead of
            # crashing and losing everything fetched so far.
            molecules = data.get(pdb_id) if data else None
            # NOTE(review): only the first molecule of the entry is used —
            # confirm it is the entity that matched the query.
            fetched[pdb_id] = molecules[0].get('sequence') if molecules else None
        res.append(fetched[pdb_id])
        print(str(count)+" sequences of "+str(total))

    stop = timeit.default_timer()
    temp = stop-start
    print("Done!")
    # Break the elapsed seconds down into hours, minutes and seconds
    hours = temp//3600
    temp = temp - 3600*hours
    minutes = temp//60
    seconds = temp - 60*minutes
    print("Run time: " + '%d:%d:%d' %(hours,minutes,seconds))

    return res

In [84]:
# PDB IDs of every hit, as a plain list in the row order of res_df.
pdbid_list = res_df['pdb_id'].tolist()

In [None]:
# Fetch the sequence for each collected PDB ID from the PDBe API.
# This performs network requests and prints a progress line per ID, so it
# can take a long time for a long list.
pdb_sequences=get_pdb_sequence(pdbid_list)

In [55]:
# Attach the fetched sequences as a new column. This relies on pdb_sequences
# having one entry per row of res_df, in the same order as res_df['pdb_id'].
res_df['pdb_sequence']=pdb_sequences

In [None]:
# Add the length of every query sequence as a new column, using modlamp's
# GlobalDescriptor: length() is called for its effect on the object and the
# values are then read back from its `descriptor` attribute.
# NOTE(review): presumably `descriptor` holds one length per input sequence,
# in input order — confirm against the modlamp documentation.
sec_query=pd.Series.tolist(res_df["query_sequences"])
globaldesc=GlobalDescriptor(sec_query)
globaldesc.length()
lon=globaldesc.descriptor
res_df["query_sequence_length"]=lon

In [None]:
# Same computation as for the query sequences, applied to the sequences
# fetched from PDBe; note that `globaldesc` is rebound to a new descriptor here.
# NOTE(review): presumably `descriptor` holds one length per input sequence,
# in input order — confirm against the modlamp documentation.
sec_pdb=pd.Series.tolist(res_df["pdb_sequence"])
globaldesc=GlobalDescriptor(sec_pdb)
globaldesc.length()
pdb_lon=globaldesc.descriptor
res_df["pdb_sequence_length"]=pdb_lon

In [None]:
# Reorder the columns: each sequence next to its length, then the hit metadata.
res_df = res_df[
    [
        "query_sequences",
        "query_sequence_length",
        "pdb_sequence",
        "pdb_sequence_length",
        "pdb_id",
        "e_value",
        "percentage_identity",
        "molecule_name",
    ]
]

In [1]:
# NOTE(review): `-r` restores `res_df` from IPython's %store database into the
# kernel. If the intent was to save the frame built above, the command would
# be `%store res_df` — confirm which one is meant.
%store -r res_df