# Extracting Protein stability data for *de novo* designs


First, the stability_scores.tar.gz archive contained in aan0693_si_datasets.tar.gz.zip was downloaded from the Supplementary Materials of the article by G. J. Rocklin et al. (see reference below). From this archive, four files were extracted:

- rd1_stability_scores  
- rd2_stability_scores  
- rd3_stability_scores 
- rd4_stability_scores

Reference:

G. J. Rocklin et al., “Global analysis of protein folding using massively parallel design, synthesis, and testing,” Science, vol. 357, no. 6347, pp. 168–175, Jul. 2017, doi: 10.1126/SCIENCE.AAN0693/SUPPL_FILE/AAN0693_SI_DATASETS.TAR.GZ.ZIP.

Then, the data was further processed and analysed:

## Extracting and saving the design information specific to each topology

In [None]:
import pickle
import re

import pandas as pd

from Rotatory_library import rotatory, rot_density


# Locations of the four extracted stability-score tables (one per design round).
# "path_to_file" is a placeholder directory; the separator is an escaped backslash.
RD1_file = "path_to_file\\rd1_stability_scores.txt"
RD2_file = "path_to_file\\rd2_stability_scores.txt"
RD3_file = "path_to_file\\rd3_stability_scores.txt"
RD4_file = "path_to_file\\rd4_stability_scores.txt"

# All input tables, in round order; iterated when the per-topology lists are built.
files = [
    RD1_file,
    RD2_file,
    RD3_file,
    RD4_file,
]

# Build one dictionary per EEHEE design found in one of the extracted RD1-4 .txt files
def EEHEE_import_to_lst(file, design_round_number):
    """Return the EEHEE designs of one design round as a list of dictionaries.

    Parameters:
        file: path of a tab-separated table that has the columns "name",
            "sequence" and "stabilityscore".
        design_round_number: round tag used in the design names (e.g. "rd1");
            it is lower-cased before being matched against the names.

    Returns:
        A list with one dictionary per selected row, holding the keys "Code",
        "Topology", "Round", "Sequence", "Stability_Score", "Rotatable_Bonds"
        and "Rotatable_Bonds_Density". The last two are whatever
        rotatory(sequence) and rot_density(sequence) return.
    """
    table = pd.read_csv(file, sep='\t')
    names = table["name"]

    # Keep the rows whose name contains the topology/round tag ...
    keep = names.str.contains(f"EEHEE_{design_round_number.lower()}")
    # ... and drop every row whose name carries one of these markers.
    for marker in ("hp", "random", "PG_hp", "buryD"):
        keep = keep & ~names.str.contains(marker)

    # Only the name, sequence and stability score of each design are needed.
    selected = table.loc[keep, ["name", "sequence", "stabilityscore"]]

    records = []
    for name, raw_sequence, score in selected.itertuples(index=False, name=None):
        # The name is read as "<topology>_<round>_<code...>".
        parts = name.split("_")
        code = int(parts[2][:4])
        topology = parts[0]
        design_round = parts[1].upper()
        # Remove the "GS" padding once; the same stripped sequence is stored
        # and passed to both rotatable-bond helpers.
        sequence = raw_sequence[2:] if raw_sequence[:2] == "GS" else raw_sequence
        records.append({
            "Code": code,
            "Topology": topology,
            "Round": design_round,
            "Sequence": sequence,
            "Stability_Score": float(score),
            "Rotatable_Bonds": rotatory(sequence),
            "Rotatable_Bonds_Density": rot_density(sequence),
        })
    return records

def HHH_import_to_lst(file, design_round_number):
    """Return the HHH designs of one design round as a list of dictionaries.

    Parameters:
        file: path of a tab-separated table that has the columns "name",
            "sequence" and "stabilityscore".
        design_round_number: round tag used in the design names (e.g. "rd1");
            it is lower-cased before being matched against the names.

    Returns:
        A list with one dictionary per selected row, holding the keys "Code",
        "Topology", "Round", "Sequence", "Stability_Score", "Rotatable_Bonds"
        and "Rotatable_Bonds_Density". The sequence is stored exactly as read
        (no prefix is removed in this function).
    """
    table = pd.read_csv(file, sep='\t')
    names = table["name"]

    # Rows of this topology/round, minus every name carrying an excluded marker.
    keep = names.str.contains(f"HHH_{design_round_number.lower()}")
    for marker in ("hp", "random", "PG_hp", "buryD"):
        keep = keep & ~names.str.contains(marker)
    selected = table.loc[keep, ["name", "sequence", "stabilityscore"]]

    records = []
    for name, sequence, score in selected.itertuples(index=False, name=None):
        # The name is read as "<topology>_<round>_<code...>".
        parts = name.split("_")
        code = int(parts[2][:4])
        topology = parts[0]
        design_round = parts[1].upper()
        records.append({
            "Code": code,
            "Topology": topology,
            "Round": design_round,
            "Sequence": sequence,
            "Stability_Score": float(score),
            "Rotatable_Bonds": rotatory(sequence),
            "Rotatable_Bonds_Density": rot_density(sequence),
        })
    return records

def HEEH_import_to_lst(file, design_round_number):
    """Return the HEEH designs of one design round as a list of dictionaries.

    Parameters:
        file: path of a tab-separated table that has the columns "name",
            "sequence" and "stabilityscore".
        design_round_number: round tag used in the design names (e.g. "rd1");
            it is lower-cased before being matched against the names.

    Returns:
        A list with one dictionary per selected row, holding the keys "Code",
        "Topology", "Round", "Sequence", "Stability_Score", "Rotatable_Bonds"
        and "Rotatable_Bonds_Density". The sequence is stored exactly as read
        (no prefix is removed in this function).
    """
    table = pd.read_csv(file, sep='\t')
    names = table["name"]

    # Start from the rows tagged with this topology and round.
    keep = names.str.contains(f"HEEH_{design_round_number.lower()}")
    # Successively remove rows whose name contains an excluded marker.
    for marker in ("hp", "random", "PG_hp", "buryD"):
        keep = keep & ~names.str.contains(marker)
    selected = table.loc[keep, ["name", "sequence", "stabilityscore"]]

    records = []
    for name, sequence, score in selected.itertuples(index=False, name=None):
        # The name is read as "<topology>_<round>_<code...>".
        parts = name.split("_")
        code = int(parts[2][:4])
        topology = parts[0]
        design_round = parts[1].upper()
        records.append({
            "Code": code,
            "Topology": topology,
            "Round": design_round,
            "Sequence": sequence,
            "Stability_Score": float(score),
            "Rotatable_Bonds": rotatory(sequence),
            "Rotatable_Bonds_Density": rot_density(sequence),
        })
    return records

def EHEE_import_to_lst(file, design_round_number):
    """Return the EHEE designs of one design round as a list of dictionaries.

    Parameters:
        file: path of a tab-separated table that has the columns "name",
            "sequence" and "stabilityscore".
        design_round_number: round tag used in the design names (e.g. "rd1");
            it is lower-cased before being matched against the names.

    Returns:
        A list with one dictionary per selected row, holding the keys "Code",
        "Topology", "Round", "Sequence", "Stability_Score", "Rotatable_Bonds"
        and "Rotatable_Bonds_Density". The last two are whatever
        rotatory(sequence) and rot_density(sequence) return.
    """
    table = pd.read_csv(file, sep='\t')
    names = table["name"]

    # The tag must be matched at the START of the name: "EHEE_rdN" is a
    # substring of "EEHEE_rdN", so a plain substring search would also pull in
    # every EEHEE design (and record it with Topology "EEHEE"). Anchoring is
    # consistent with the parsing below, which reads the topology as the first
    # "_"-separated field of the name.
    keep = names.str.startswith(f"EHEE_{design_round_number.lower()}")
    # Drop every row whose name carries one of these markers.
    for marker in ("hp", "random", "PG_hp", "buryD"):
        keep = keep & ~names.str.contains(marker)

    # Only the name, sequence and stability score of each design are needed.
    selected = table.loc[keep, ["name", "sequence", "stabilityscore"]]

    records = []
    for name, raw_sequence, score in selected.itertuples(index=False, name=None):
        # The name is read as "<topology>_<round>_<code...>".
        parts = name.split("_")
        code = int(parts[2][:4])
        topology = parts[0]
        design_round = parts[1].upper()
        # Remove the "GS" padding once; the same stripped sequence is stored
        # and passed to both rotatable-bond helpers.
        sequence = raw_sequence[2:] if raw_sequence[:2] == "GS" else raw_sequence
        records.append({
            "Code": code,
            "Topology": topology,
            "Round": design_round,
            "Sequence": sequence,
            "Stability_Score": float(score),
            "Rotatable_Bonds": rotatory(sequence),
            "Rotatable_Bonds_Density": rot_density(sequence),
        })
    return records


# One accumulator per topology; each collects the designs of all four rounds.
EEHEE_Lst_of_dic_all_files = []
HHH_Lst_of_dic_all_files = []
EHEE_Lst_of_dic_all_files = []
HEEH_Lst_of_dic_all_files = []
for file in files:
    # Read the round tag ("rd1" ... "rd4") from the file name itself. A fixed
    # slice such as file[-7:-4] only works for names of the form "rdN.txt";
    # for "rdN_stability_scores.txt" it yields "res", which matches no design
    # name and silently leaves every list empty.
    base_name = re.split(r"[\\/]", file)[-1]  # paths may use "\" or "/"
    round_match = re.search(r"rd\d+", base_name, flags=re.IGNORECASE)
    if round_match is None:
        raise ValueError(
            f"Cannot determine the design round from file name: {file!r}"
        )
    design_round = round_match.group(0)

    EEHEE_Lst_of_dic_all_files += EEHEE_import_to_lst(file, design_round)
    HHH_Lst_of_dic_all_files += HHH_import_to_lst(file, design_round)
    EHEE_Lst_of_dic_all_files += EHEE_import_to_lst(file, design_round)
    HEEH_Lst_of_dic_all_files += HEEH_import_to_lst(file, design_round)


# Save each per-topology list in its own .pickle file (written to the current
# working directory, in the order EEHEE, HHH, EHEE, HEEH).
for pickle_name, topology_designs in (
    ("Lst_Dcs_EEHEE.pickle", EEHEE_Lst_of_dic_all_files),
    ("Lst_Dcs_HHH.pickle", HHH_Lst_of_dic_all_files),
    ("Lst_Dcs_EHEE.pickle", EHEE_Lst_of_dic_all_files),
    ("Lst_Dcs_HEEH.pickle", HEEH_Lst_of_dic_all_files),
):
    with open(pickle_name, "wb") as pickle_input:
        pickle.dump(topology_designs, pickle_input)

## Merge the per-topology files into a single file containing all topologies

In [None]:
# Locations of the per-topology pickle files ("file_path" is a placeholder
# directory). The backslash is escaped: a bare "\L" is an invalid escape
# sequence, which newer Python versions warn about. The resulting strings are
# unchanged.
EEHEE_path = "file_path\\Lst_Dcs_EEHEE.pickle"
HHH_path = "file_path\\Lst_Dcs_HHH.pickle"
EHEE_path = "file_path\\Lst_Dcs_EHEE.pickle"
HEEH_path = "file_path\\Lst_Dcs_HEEH.pickle"

# Load each list in its own `with` block so that every input file is closed as
# soon as it has been read, instead of staying open until the merged file is
# written.
# NOTE: pickle.load must only be used on trusted files (such as the ones
# produced by the previous cell); unpickling can execute arbitrary code.
with open(HHH_path, "rb") as pickle_output:
    HHH = pickle.load(pickle_output)
with open(EHEE_path, "rb") as pickle_output:
    EHEE = pickle.load(pickle_output)
with open(HEEH_path, "rb") as pickle_output:
    HEEH = pickle.load(pickle_output)
with open(EEHEE_path, "rb") as pickle_output:
    EEHEE = pickle.load(pickle_output)

# Concatenate the four lists in the order HHH, EHEE, HEEH, EEHEE.
lst_variables = [HHH, EHEE, HEEH, EEHEE]
lst_all_topologies = []
for variable in lst_variables:
    lst_all_topologies += variable

# Store the merged list in the current working directory.
with open("Lst_Dcs_All_topologies.pickle", "wb") as pickle_input:
    pickle.dump(lst_all_topologies, pickle_input)