# Extracting organism information

In [None]:
import requests
import pickle
import multiprocessing
import pandas as pd

# Obtain the source organism ncbi scientific name from its pdb code using the PDB API
def query_org(pdb, timeout=30):
    """Look up the source organism of a PDB entry through the RCSB data API.

    The entry record is fetched first to learn how many polymer entities the
    entry has; the entities are then queried in order (1..N) until one of
    them yields an organism name.

    Args:
        pdb: PDB code of the entry to look up.
        timeout: Seconds to wait on each HTTP request before ``requests``
            raises a timeout error. Without a timeout a stalled connection
            would block the calling (worker) process indefinitely.

    Returns:
        One of:
        - the ``ncbi_scientific_name`` string of the first entity whose
          first source-organism record carries one;
        - ``["Error type 1"]`` as soon as an entity without an
          ``rcsb_entity_source_organism`` record is reached (later entities
          are not inspected);
        - ``["Error type 2"]`` when the entry record lacks
          ``rcsb_entry_info`` / ``polymer_entity_count``;
        - ``None`` when every entity has a source-organism record but none
          of them names the organism.
    """
    # Parse each response body once instead of on every key lookup.
    entry = requests.get(
        f'https://data.rcsb.org/rest/v1/core/entry/{pdb}', timeout=timeout
    ).json()
    if "rcsb_entry_info" not in entry or "polymer_entity_count" not in entry["rcsb_entry_info"]:
        # Missing the info about the number of entities in the database for this pdb code
        return ["Error type 2"]

    number_entities = entry["rcsb_entry_info"]["polymer_entity_count"]
    for entity in range(1, number_entities + 1):
        entity_info = requests.get(
            f'https://data.rcsb.org/rest/v1/core/polymer_entity/{pdb}/{entity}',
            timeout=timeout,
        ).json()
        if "rcsb_entity_source_organism" not in entity_info:
            # Missing source organism in the database for this pdb code
            return ["Error type 1"]
        organisms = entity_info["rcsb_entity_source_organism"]
        # An empty organism list is handled like a record without a name:
        # move on to the next entity instead of failing on organisms[0].
        if organisms and "ncbi_scientific_name" in organisms[0]:
            return organisms[0]["ncbi_scientific_name"]

    # Every entity had an organism record, but none carried a scientific name.
    return None

#Filter only the pdb codes related to E.coli source organism
def coli_query_org(pdb):
    """Return *pdb* when query_org reports "Escherichia coli" for it, else None."""
    organism = query_org(pdb)
    return pdb if organism == "Escherichia coli" else None

#Filter only the pdb codes related to human source organism
def human_query_org(pdb):
    """Return *pdb* when query_org reports "Homo sapiens" for it, else None."""
    organism = query_org(pdb)
    return pdb if organism == "Homo sapiens" else None

#Filter only the pdb codes related to T.maritima source organism
def thermo_query_org(pdb):
    """Return *pdb* when query_org reports "Thermotoga maritima" for it, else None."""
    organism = query_org(pdb)
    return pdb if organism == "Thermotoga maritima" else None

###########
# Part 1 - Create one pickle file per organism: a header string followed by several batch lists, where each entry is the pdb code if it matches the organism and None otherwise
###########

# Start each per-organism batch file with a descriptive header string.
# Opening in "wb" mode also discards whatever an earlier run left in the file.

coli_pdb_lst = "This is the header of the pickle file of the pdb codes for ecoli source organism proteins"
human_pdb_lst = "This is the header of the pickle file of the pdb codes for human source organism proteins"
thermo_pdb_lst = "This is the header of the pickle file of the pdb codes for thermotoga maritima source organism proteins"

for header_file_name, header_text in (
    ("ecoli_pdb_lsts.pickle", coli_pdb_lst),
    ("human_pdb_lsts.pickle", human_pdb_lst),
    ("thermo_pdb_lsts.pickle", thermo_pdb_lst),
):
    with open(header_file_name, "wb") as pickle_output:
        pickle.dump(header_text, pickle_output)

# Create a unique list of all the pdb codes found in DE-STRESS: pdb_list
# NOTE: pickle.load can execute arbitrary code stored in the file; only load
# data files that you produced yourself or otherwise trust.
with open("DE-STRESS_data.pickle", "rb") as data_output:
    lst_dics = pickle.load(data_output)

# The input file is closed here; it is not needed during the network queries.
pdblist = [dic['PDB Code'].upper() for dic in lst_dics if dic is not None]
# dict.fromkeys keeps the first occurrence of every code in its original
# order, without relying on pd.unique accepting a plain list.
pdb_list = list(dict.fromkeys(pdblist))

# Query the organism of every pdb code in batches of 100 and append, per
# batch, one list to each organism file. An entry of such a list is the pdb
# code when its organism matches the file's organism and None otherwise.
if __name__ == "__main__":
    batch_size = 100
    organism_files = (
        ("Escherichia coli", "ecoli_pdb_lsts.pickle"),
        ("Homo sapiens", "human_pdb_lsts.pickle"),
        ("Thermotoga maritima", "thermo_pdb_lsts.pickle"),
    )
    # One pool serves all batches, and each pdb code is looked up once; the
    # result is then classified for all three organisms instead of repeating
    # the same API requests in three separate passes.
    with multiprocessing.Pool(7) as p:
        for i, start in enumerate(range(0, len(pdb_list), batch_size)):
            print(f"Round number {i+1}")
            pdb_sublst = pdb_list[start:start + batch_size]
            organisms = p.map(query_org, pdb_sublst)
            for organism_name, file_name in organism_files:
                data = [
                    pdb if organism == organism_name else None
                    for pdb, organism in zip(pdb_sublst, organisms)
                ]
                with open(file_name, "ab") as pickle_output:
                    pickle.dump(data, pickle_output)


###########
# Part 2 - join all the small lists into one big one
############

#For E.coli: concatenate every batch list stored in ecoli_pdb_lsts.pickle
lst = []
count = 0
with open("ecoli_pdb_lsts.pickle", "rb") as batches_file:
    more_records = True
    while more_records:
        count += 1
        try:
            data = pickle.load(batches_file)
        except EOFError:
            # No pickled object is left in the file.
            more_records = False
        else:
            # The first object in the file is skipped; all later ones are merged.
            if count > 1:
                lst.extend(data)

with open("ecoli_pdb_lst.pickle", "wb") as merged_file:
    pickle.dump(lst, merged_file)

#For humans: concatenate every batch list stored in human_pdb_lsts.pickle
lst = []
count = 0
with open("human_pdb_lsts.pickle", "rb") as batches_file:
    more_records = True
    while more_records:
        count += 1
        try:
            data = pickle.load(batches_file)
        except EOFError:
            # No pickled object is left in the file.
            more_records = False
        else:
            # The first object in the file is skipped; all later ones are merged.
            if count > 1:
                lst.extend(data)

with open("human_pdb_lst.pickle", "wb") as merged_file:
    pickle.dump(lst, merged_file)

#For T.maritima: concatenate every batch list stored in thermo_pdb_lsts.pickle
lst = []
count = 0
with open("thermo_pdb_lsts.pickle", "rb") as batches_file:
    more_records = True
    while more_records:
        count += 1
        try:
            data = pickle.load(batches_file)
        except EOFError:
            # No pickled object is left in the file.
            more_records = False
        else:
            # The first object in the file is skipped; all later ones are merged.
            if count > 1:
                lst.extend(data)

with open("thermo_pdb_lst.pickle", "wb") as merged_file:
    pickle.dump(lst, merged_file)


###########
# Part 3 - Remove None objects from list
#############

#For E.coli
# Load the merged list, drop its None entries and store what is left.
with open("ecoli_pdb_lst.pickle", "rb") as pickle_output:
    data = pickle.load(pickle_output)

# Identity test: None is a singleton, so "is not None" is the reliable check.
lst = [element for element in data if element is not None]

with open("ECOLI_PDBs.pickle", "wb") as pickle_input:
    pickle.dump(lst, pickle_input)

#For humans
# Load the merged list, drop its None entries and store what is left.
with open("human_pdb_lst.pickle", "rb") as pickle_output:
    data = pickle.load(pickle_output)

# Identity test: None is a singleton, so "is not None" is the reliable check.
lst = [element for element in data if element is not None]

with open("HUMAN_PDBs.pickle", "wb") as pickle_input:
    pickle.dump(lst, pickle_input)

#For T.maritima
# Load the merged list, drop its None entries and store what is left.
with open("thermo_pdb_lst.pickle", "rb") as pickle_output:
    data = pickle.load(pickle_output)

# Identity test: None is a singleton, so "is not None" is the reliable check.
lst = [element for element in data if element is not None]

with open("THERMO_PDBs.pickle", "wb") as pickle_input:
    pickle.dump(lst, pickle_input)


############
# Part 4 - Create a list of dictionaries, one per peptide chain whose pdb code belongs to the source organism
############

# Locations of the input pickles (placeholders to be replaced by real paths).
# Raw strings keep the backslashes literal: "\E", "\H", "\T" and "\D" are not
# valid escape sequences and trigger a warning in ordinary string literals.
Ecoli_path = r"path_of_file\ECOLI_PDBs.pickle"
Human_path = r"path_of_file\HUMAN_PDBs.pickle"
Thermo_path = r"path_of_file\THERMO_PDBs.pickle"
DESTRESS_path = r"path_of_file\DE-STRESS_data.pickle"

#Create list of dictionaries for E.coli proteins
# NOTE: pickle.load can execute arbitrary code stored in the file; only load
# files that you produced yourself or otherwise trust.
with open(Ecoli_path, "rb") as pickle_output:
    ecoli_pdb_lst = pickle.load(pickle_output)

with open(DESTRESS_path, 'rb') as pickle_output:
    dic = pickle.load(pickle_output)

# A set makes each membership test O(1) instead of scanning the whole list
# for every chain (assumes the loaded pdb codes are strings - TODO confirm).
ecoli_pdb_codes = set(ecoli_pdb_lst)

# Fields copied from every matching DE-STRESS record, in output order.
ecoli_fields = (
    "Chain ID", "Mean Pack Density", "Chain Length", "Rotatory Bonds",
    "Rosetta Energy", "PDB Code", "Exp Method", "Bude Score",
    "Evo Score", "DFire Score", "Aggre Score",
)

Ecoli_lst_of_dics = []
for record in dic:
    # Records that are None carry no chain information and are skipped.
    if record is None:
        continue
    if record["PDB Code"].upper() in ecoli_pdb_codes:
        Ecoli_lst_of_dics.append({field: record[field] for field in ecoli_fields})

with open("Ecoli_Data.pickle", "wb") as pickle_input:
    pickle.dump(Ecoli_lst_of_dics, pickle_input)


#Create list of dictionaries for human proteins
# NOTE: pickle.load can execute arbitrary code stored in the file; only load
# files that you produced yourself or otherwise trust.
with open(Human_path, "rb") as pickle_output:
    Human_pdb_lst = pickle.load(pickle_output)

with open(DESTRESS_path, 'rb') as pickle_output:
    dic = pickle.load(pickle_output)

# A set makes each membership test O(1) instead of scanning the whole list
# for every chain (assumes the loaded pdb codes are strings - TODO confirm).
human_pdb_codes = set(Human_pdb_lst)

# Fields copied from every matching DE-STRESS record, in output order.
human_fields = (
    "Chain ID", "Mean Pack Density", "Chain Length", "Rotatory Bonds",
    "Rosetta Energy", "PDB Code", "Exp Method", "Bude Score",
    "Evo Score", "DFire Score", "Aggre Score",
)

Human_lst_of_dics = []
for record in dic:
    # Records that are None carry no chain information and are skipped.
    if record is None:
        continue
    if record["PDB Code"].upper() in human_pdb_codes:
        Human_lst_of_dics.append({field: record[field] for field in human_fields})

with open("Human_Data.pickle", "wb") as pickle_input:
    pickle.dump(Human_lst_of_dics, pickle_input)


#Create list of dictionaries for T.maritima proteins
# NOTE: pickle.load can execute arbitrary code stored in the file; only load
# files that you produced yourself or otherwise trust.
with open(Thermo_path, "rb") as pickle_output:
    thermo_pdb_lst = pickle.load(pickle_output)

with open(DESTRESS_path, 'rb') as pickle_output:
    dic = pickle.load(pickle_output)

# A set makes each membership test O(1) instead of scanning the whole list
# for every chain (assumes the loaded pdb codes are strings - TODO confirm).
thermo_pdb_codes = set(thermo_pdb_lst)

# Fields copied from every matching DE-STRESS record, in output order.
thermo_fields = (
    "Chain ID", "Mean Pack Density", "Chain Length", "Rotatory Bonds",
    "Rosetta Energy", "PDB Code", "Exp Method", "Bude Score",
    "Evo Score", "DFire Score", "Aggre Score",
)

Thermo_lst_of_dics = []
for record in dic:
    # Records that are None carry no chain information and are skipped.
    if record is None:
        continue
    if record["PDB Code"].upper() in thermo_pdb_codes:
        Thermo_lst_of_dics.append({field: record[field] for field in thermo_fields})

with open("Thermo_Data.pickle", "wb") as pickle_input:
    pickle.dump(Thermo_lst_of_dics, pickle_input)
