In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
os.chdir(os.path.expanduser(
    '~/wcEcoli/models/ecoli/analysis/local_notebooks/C_limited_PDR_analyses_with_published_paper_data/Clim_sorting_combos/Clim1/CLClim1NE'))
import plotly.graph_objects as go
import glob
import plotly.express as px

# C-lim half life experimental data (from first Clim sort, using values from the 2023 bioRxiv preprint) 

The data used for Clim rates in this analysis (the first analysis of the Clim rates) comes from this preprint: https://www.biorxiv.org/content/10.1101/2022.08.01.502339v2. The paper has been published, and can be found here: Gupta, et al., 2024: https://www.nature.com/articles/s41467-024-49920-8#Sec20

Specifically, this data comes from Supplementary Table 1: "TableS1 - Half-lives for 3262 proteins across 13 growth conditions >>", which contains the half-lives for proteins in E. coli under carbon-limited conditions. This is the data we are interested in comparing to our model predictions, as the conditions are most similar to our base case.

NOTE that this data has some "ceiling" values that proteins are automatically assigned to (as defined in the published paper). In the original Clim sort done using the bioRxiv preprint (which will be analyzed here), we were not aware that these were ceiling values. Thus, they are included in this version, but will likely be removed in later Clim rate sorts for half-life value extractions. Also, any protein that was missing a reported value for at least one of its six doubling-time replicates was excluded from this sort (leaving roughly 1912 proteins assigned to Clim rates in this CLClimNE sort, 7 assigned to CL, and 2391 to NE).  

ALSO NOTE: This data is generated from the new EcoCyc update and WCM update, so the sort itself doesn't actually change from Clim0; only the model version used for the run is different.




## Find essential genes in the Clim1 data

Doing this by first finding the gene names for each of the monomer ids in the Clim1 data, and then comparing these to the essential genes list from the model.

In [None]:
# FUNCTIONS

# get the gene IDs for each monomer IDs:
def get_gene_ids_for_monomer_ids():
    """Build a monomer ID -> gene ID lookup from the model's ``rnas.tsv`` flat file.

    (Parsing approach adapted from convert_to_flat.py.)

    Returns:
        dict: for every data row, the first entry of the ``monomer_ids`` field
        mapped to that row's ``gene_id``. If several rows produce the same key,
        the last row read wins.
    """
    import io
    from wholecell.io import tsv
    from wholecell.utils.filepath import ROOT_PATH

    rnas_path = os.path.join(ROOT_PATH, 'reconstruction', 'ecoli', 'flat', 'rnas.tsv')
    with io.open(rnas_path, 'rb') as rnas_file:
        rows = tsv.reader(rnas_file, delimiter='\t')

        # Step over leading '#' comment rows until the column-name row is reached.
        header = next(rows)
        while header[0].startswith('#'):
            header = next(rows)

        gene_col = header.index('gene_id')
        monomer_col = header.index('monomer_ids')

        # [2:-2] drops the first and last two characters of the field (presumably the
        # '["' / '"]' wrapper of a list literal -- confirm); only the first listed ID is kept.
        lookup = {
            row[monomer_col][2:-2].split('", "')[0]: row[gene_col]
            for row in rows
        }

    return lookup
    
def get_gene_symbols_for_monomer_ids():
    """Build a monomer ID -> gene symbol lookup from the model's ``rnas.tsv`` flat file.

    (Parsing approach adapted from convert_to_flat.py.)

    Returns:
        dict: for every data row, the first entry of the ``monomer_ids`` field
        mapped to that row's ``common_name``. If several rows produce the same
        key, the last row read wins.
    """
    import io
    from wholecell.io import tsv
    from wholecell.utils.filepath import ROOT_PATH

    rnas_path = os.path.join(ROOT_PATH, 'reconstruction', 'ecoli', 'flat', 'rnas.tsv')
    with io.open(rnas_path, 'rb') as rnas_file:
        rows = tsv.reader(rnas_file, delimiter='\t')

        # Step over leading '#' comment rows until the column-name row is reached.
        header = next(rows)
        while header[0].startswith('#'):
            header = next(rows)

        symbol_col = header.index('common_name')
        monomer_col = header.index('monomer_ids')

        # [2:-2] drops the first and last two characters of the field (presumably the
        # '["' / '"]' wrapper of a list literal -- confirm); only the first listed ID is kept.
        lookup = {
            row[monomer_col][2:-2].split('", "')[0]: row[symbol_col]
            for row in rows
        }

    return lookup
    
    
# convert gene IDs to monomer IDs:
def get_monomer_ids_for_gene_ids():
    """Build a gene ID -> monomer ID lookup from the model's ``rnas.tsv`` flat file.

    (Parsing approach adapted from convert_to_flat.py.)

    Returns:
        dict: each row's ``gene_id`` mapped to the first entry of that row's
        ``monomer_ids`` field. If a gene ID occurs on several rows, the last
        row read wins.
    """
    import io
    from wholecell.io import tsv
    from wholecell.utils.filepath import ROOT_PATH

    rnas_path = os.path.join(ROOT_PATH, 'reconstruction', 'ecoli', 'flat', 'rnas.tsv')
    with io.open(rnas_path, 'rb') as rnas_file:
        rows = tsv.reader(rnas_file, delimiter='\t')

        # Step over leading '#' comment rows until the column-name row is reached.
        header = next(rows)
        while header[0].startswith('#'):
            header = next(rows)

        gene_col = header.index('gene_id')
        monomer_col = header.index('monomer_ids')

        # [2:-2] drops the first and last two characters of the field (presumably the
        # '["' / '"]' wrapper of a list literal -- confirm); only the first listed ID is kept.
        lookup = {
            row[gene_col]: row[monomer_col][2:-2].split('", "')[0]
            for row in rows
        }

    return lookup
    
    
# recall the genes defined as "essential" in the model, and return them as a list of gene names:
def get_essential_genes():
    """Read the ``FrameID`` column of the validation ``essential_genes.tsv`` file.

    (Parsing approach adapted from convert_to_flat.py.)

    Returns:
        list: the ``FrameID`` value of every data row, in file order.
    """
    import io
    from wholecell.io import tsv
    from wholecell.utils.filepath import ROOT_PATH

    genes_path = os.path.join(ROOT_PATH, 'validation', 'ecoli', 'flat', 'essential_genes.tsv')
    with io.open(genes_path, 'rb') as genes_file:
        rows = tsv.reader(genes_file, delimiter='\t')

        # Step over leading '#' comment rows until the column-name row is reached.
        header = next(rows)
        while header[0].startswith('#'):
            header = next(rows)

        frame_id_col = header.index('FrameID')
        frame_ids = [row[frame_id_col] for row in rows]

    return frame_ids
    


In [None]:
# Find the essential genes among the proteins that received a Clim-derived rate constant.
# NOTE: the variables keep the "Clim0" prefix used throughout this notebook, although the file
# is read from the CLClim1NE folder.
Clim0_data = pd.read_csv('CLClim1NE_Parca_rate_constants/Clim_rate_constants.csv')
monomer_ids = Clim0_data['Protein ID']

# Build each lookup ONCE. Every call to these helpers re-opens and re-parses a flat file, so
# calling them inside the loop (as this cell used to) repeated that work for every protein.
monomer_to_gene_id = get_gene_ids_for_monomer_ids()
monomer_to_gene_symbol = get_gene_symbols_for_monomer_ids()
essential_gene_ids = set(get_essential_genes())  # set -> constant-time membership checks

Clim0_essential_gene_symbols_found = []
Clim0_essential_monomer_ids_found = []

for monomer in monomer_ids:
    # a monomer ID that is absent from the lookup still raises KeyError, exactly as before
    gene_id = monomer_to_gene_id[monomer]
    if gene_id in essential_gene_ids:
        Clim0_essential_gene_symbols_found.append(monomer_to_gene_symbol[monomer])
        Clim0_essential_monomer_ids_found.append(monomer)

print("Essential Genes Found in Clim0 (" +str(len(Clim0_essential_gene_symbols_found))+"): ", Clim0_essential_gene_symbols_found)

# to free up memory:
del Clim0_data, monomer_ids

In [None]:
# For reference, also count the essential genes in the CL and NE groups of this sort.

# Build each lookup ONCE: every call to these helpers re-parses a flat file, so they must not be
# called inside the per-protein loops.
monomer_to_gene_id = get_gene_ids_for_monomer_ids()
monomer_to_gene_symbol = get_gene_symbols_for_monomer_ids()
essential_gene_ids = set(get_essential_genes())  # set -> constant-time membership checks

# --- CL group ---
CL_data = pd.read_csv('CLClim1NE_Parca_rate_constants/CL_rate_constants.csv')
CL_monomer_ids = CL_data['Protein ID']

CL_essential_gene_symbols_found = []
CL_essential_monomer_ids_found = []
for monomer in CL_monomer_ids:
    # a monomer ID that is absent from the lookup still raises KeyError, exactly as before
    gene_id = monomer_to_gene_id[monomer]
    if gene_id in essential_gene_ids:
        CL_essential_gene_symbols_found.append(monomer_to_gene_symbol[monomer])
        CL_essential_monomer_ids_found.append(monomer)

print("Essential Genes Found in CL (" +str(len(CL_essential_gene_symbols_found))+"): ", CL_essential_gene_symbols_found)

# --- NE group ---
NE_data = pd.read_csv('CLClim1NE_Parca_rate_constants/NE_rate_constants.csv')
NE_monomer_ids = NE_data['Protein ID']

NE_essential_gene_symbols_found = []
NE_essential_monomer_ids_found = []
for monomer in NE_monomer_ids:
    gene_id = monomer_to_gene_id[monomer]
    if gene_id in essential_gene_ids:
        NE_essential_gene_symbols_found.append(monomer_to_gene_symbol[monomer])
        NE_essential_monomer_ids_found.append(monomer)

print("Essential Genes Found in NE (" +str(len(NE_essential_gene_symbols_found))+"): ", NE_essential_gene_symbols_found)

# to free up memory:
del CL_data, NE_data, CL_monomer_ids, NE_monomer_ids

In [None]:
# I would like to see if Lon or Clp are in this out of curiosity:
# List the gene symbol of every essential gene in the model (gene ID -> monomer ID -> symbol).

# Build both lookups ONCE; calling the helpers inside the loop re-parsed rnas.tsv twice per gene.
gene_id_to_monomer = get_monomer_ids_for_gene_ids()
monomer_to_gene_symbol = get_gene_symbols_for_monomer_ids()

essential_gene_symbols = []
for gene in get_essential_genes():
    # a gene ID / monomer ID that is absent from a lookup still raises KeyError, as before
    essential_gene_symbols.append(monomer_to_gene_symbol[gene_id_to_monomer[gene]])

print("All Essential Genes in the Model (" +str(len(essential_gene_symbols))+"): ", essential_gene_symbols)

del essential_gene_symbols

# Determining which proteins are assigned to a protease (according to Gupta et al., 2024) and are essential in the CLClim1NE sorted model

This data is from a rendition of supplementary table #2 in the published paper ("TableS2 - Assignment of protein substrates to proteases >>"). This table contains the proteins that are assigned to each of the proteases in the proteins they measured in the paper. This supplementary table has been adapted for use in this analysis, so the full detail about each data column is not included here (but can be found in the original paper). 


# NOTE: THE GENE SYMBOLS IN THE PROTEASE DATA ARE NOT THE SAME AS ALL THE GENE SYMBOLS IN THE MODEL. COME BACK AND MANUALLY EDIT THE PROTEASE DATA TO MATCH THE MODEL GENE SYMBOLS.

In [None]:
# Determine which proteins in each group (Clim, CL, NE) are assigned to a protease, regardless of
# whether they are essential.
#
# NOTE(review): `protease_data` is not created by any earlier cell visible in this notebook; it
# must be loaded (a table with a 'Gene name' column) before this cell can run on a fresh kernel.

# Build the lookups ONCE. The helper re-parses rnas.tsv on every call, and `x in <array>` scans
# the whole column, so neither belongs inside the per-protein loops.
monomer_to_gene_symbol = get_gene_symbols_for_monomer_ids()
protease_gene_names = set(protease_data['Gene name'].values)

# --- Clim group ---
Clim0_data = pd.read_csv('CLClim1NE_Parca_rate_constants/Clim_rate_constants.csv')
monomer_ids = Clim0_data['Protein ID']

Clim0_protease_substrate_gene_symbols = []
Clim0_protease_substrate_monomer_ids = []
for monomer in monomer_ids:
    # a monomer ID that is absent from the lookup still raises KeyError, exactly as before
    gene_symbol = monomer_to_gene_symbol[monomer]
    if gene_symbol in protease_gene_names:
        Clim0_protease_substrate_gene_symbols.append(gene_symbol)
        Clim0_protease_substrate_monomer_ids.append(monomer)

print("Total Protease Substrates in Clim0 (" +str(len(Clim0_protease_substrate_gene_symbols))+"): ", Clim0_protease_substrate_gene_symbols)

# to free up memory:
del Clim0_data, monomer_ids

# --- CL group ---
CL_data = pd.read_csv('CLClim1NE_Parca_rate_constants/CL_rate_constants.csv')
CL_monomer_ids = CL_data['Protein ID']

CL_protease_substrate_gene_symbols = []
CL_protease_substrate_monomer_ids = []
for monomer in CL_monomer_ids:
    gene_symbol = monomer_to_gene_symbol[monomer]
    if gene_symbol in protease_gene_names:
        CL_protease_substrate_gene_symbols.append(gene_symbol)
        CL_protease_substrate_monomer_ids.append(monomer)

print("Total Protease Substrates in CL (" +str(len(CL_protease_substrate_gene_symbols))+"): ", CL_protease_substrate_gene_symbols)

# to free up memory:
del CL_data, CL_monomer_ids

# --- NE group ---
NE_data = pd.read_csv('CLClim1NE_Parca_rate_constants/NE_rate_constants.csv')
NE_monomer_ids = NE_data['Protein ID']

NE_protease_substrate_gene_symbols = []
NE_protease_substrate_monomer_ids = []
for monomer in NE_monomer_ids:
    gene_symbol = monomer_to_gene_symbol[monomer]
    if gene_symbol in protease_gene_names:
        NE_protease_substrate_gene_symbols.append(gene_symbol)
        NE_protease_substrate_monomer_ids.append(monomer)

print("Total Protease Substrates in NE (" +str(len(NE_protease_substrate_gene_symbols))+"): ", NE_protease_substrate_gene_symbols)

# to free up memory:
del NE_data, NE_monomer_ids
# note that for CLClim0NE, the breakdown appears to be as follows: Clim0 (273), CL (0), NE (25). There are 308 proteins assigned to proteases in the Gupta et al., 2024 data, and 298 of these appear in the whole cell model.

In [None]:
# Essential genes (with Clim0 half-lives) whose gene symbol also appears in the protease table.
essential_substrate_gene_symbols = [
    symbol
    for symbol in Clim0_essential_gene_symbols_found
    if symbol in protease_data['Gene name'].values
]

n_essential_substrates = str(len(essential_substrate_gene_symbols))
print("Essential Genes with Clim0 half-lives that are assigned to a protease (" +n_essential_substrates+"): ", essential_substrate_gene_symbols)

In [None]:
# Essential genes in the CL group whose gene symbol also appears in the protease table.
CL_essential_substrate_gene_symbols = [
    symbol
    for symbol in CL_essential_gene_symbols_found
    if symbol in protease_data['Gene name'].values
]

n_CL_essential_substrates = str(len(CL_essential_substrate_gene_symbols))
print("Essential Genes with CL half-lives that are assigned to a protease (" +n_CL_essential_substrates+"): ", CL_essential_substrate_gene_symbols)

# Same check for the essential genes in the NE group.
NE_essential_substrate_gene_symbols = [
    symbol
    for symbol in NE_essential_gene_symbols_found
    if symbol in protease_data['Gene name'].values
]

n_NE_essential_substrates = str(len(NE_essential_substrate_gene_symbols))
print("Essential Genes with NE half-lives that are assigned to a protease (" +n_NE_essential_substrates+"): ", NE_essential_substrate_gene_symbols)

next items to do with this info: 
- plot the essential genes in the Clim0 data
- plot the outliers and highlight the essential genes 
- plot the outliers and highlight the protease substrates
- plot of the outliers and highlight the type of protease substrate

# Plot the essential genes in CLClim0NE

In [None]:
# FUNCTIONS:

# convert the data from the ParCa translation.py raw HL saves: 
def convert_HL_data(dataframe):
    """Rename the rate-constant column and add a half-life column, in place.

    The 'Rate Constant' column is renamed to 'rate constant (s^1)' and a new
    'half life (min)' column is added, computed as ln(2) / (60 * rate constant).

    Args:
        dataframe: pandas DataFrame holding a 'Rate Constant' column (or an
            already-renamed 'rate constant (s^1)' column).

    Returns:
        The same DataFrame object that was passed in (it is modified in place).
    """
    rate_col = 'rate constant (s^1)'  # column label for the rate constant
    half_life_col = 'half life (min)'  # column label for the half-life

    dataframe.rename(columns={'Rate Constant': rate_col}, inplace=True)

    # rate * 60 / ln(2) is the reciprocal of the half-life in minutes
    inverse_half_life = dataframe[rate_col] * 60 * (1/np.log(2))
    dataframe[half_life_col] = 1 / inverse_half_life
    return dataframe


# convert each data source in the saved HL data file folders 
def convert_data_and_add_HL_source(dataframe, HL_source):
    """Rename the rate-constant column with a source prefix and add a half-life column, in place.

    The 'Rate Constant' column is renamed to '<HL_source> rate constant (s^1)'
    and a new '<HL_source> half life (min)' column is added, computed as
    ln(2) / (60 * rate constant).

    Args:
        dataframe: pandas DataFrame holding a 'Rate Constant' column.
        HL_source: string prefixed (followed by a space) to both column labels.

    Returns:
        The same DataFrame object that was passed in (it is modified in place).
    """
    rate_col = HL_source + ' rate constant (s^1)'  # column label for the rate constant
    half_life_col = HL_source + ' half life (min)'  # column label for the half-life

    dataframe.rename(columns={'Rate Constant': rate_col}, inplace=True)

    # rate * 60 / ln(2) is the reciprocal of the half-life in minutes
    inverse_half_life = dataframe[rate_col] * 60 * (1/np.log(2))
    dataframe[half_life_col] = 1 / inverse_half_life
    return dataframe

# CLMLNE_ML_df = convert_data(CLMLNE_ML, "CLMLNE ML")
# CLMLNE_CL_df = convert_data(CLMLNE_CL, "CLMLNE CL")
# CLMLNE_NE_df = convert_data(CLMLNE_NE, "CLMLNE NE")
# CLNE_CL_df = convert_data(CLNE_CL, "CLNE CL")
# CLNE_NE_df = convert_data(CLNE_NE, "CLNE NE")

In [None]:
# Split each rate-constant table (Clim, CL, NE) into its essential-gene rows and the remaining
# rows (it does not matter whether the proteins are assigned to proteases or not).
#
# Paths are relative to the working directory set by os.chdir() in the first cell, matching the
# earlier cells, instead of a user-specific absolute '/Users/...' path.
rate_constants_dir = 'CLClim1NE_Parca_rate_constants'

# --- Clim group ---
Clim0_data = pd.read_csv(os.path.join(rate_constants_dir, 'Clim_rate_constants.csv'))
Clim0_data['HL Source'] = "Clim0" # add a column with the half life source
monomer_ids = Clim0_data['Protein ID']

# rows whose monomer is in Clim0_essential_monomer_ids_found go to their own, relabelled frame;
# every other row goes to Clim0_data_remaining
essential_monomer_idxs = monomer_ids.isin(Clim0_essential_monomer_ids_found)
Clim0_essential_monomers = Clim0_data[essential_monomer_idxs].copy()
Clim0_essential_monomers['HL Source'] = "Clim0 (essential gene)"
Clim0_data_remaining = Clim0_data[~essential_monomer_idxs].copy()

# --- CL group ---
CL_data = pd.read_csv(os.path.join(rate_constants_dir, 'CL_rate_constants.csv'))
CL_data['HL Source'] = "CL" # add a column with the half life source
CL_monomer_ids = CL_data['Protein ID']

# rows whose monomer is in CL_essential_monomer_ids_found go to their own, relabelled frame
CL_essential_monomer_idxs = CL_monomer_ids.isin(CL_essential_monomer_ids_found)
CL_essential_monomers = CL_data[CL_essential_monomer_idxs].copy()
CL_essential_monomers['HL Source'] = "CL (essential gene)"
CL_data_remaining = CL_data[~CL_essential_monomer_idxs].copy()

# --- NE group ---
NE_data = pd.read_csv(os.path.join(rate_constants_dir, 'NE_rate_constants.csv'))
NE_data['HL Source'] = "NE" # add a column with the half life source
NE_monomer_ids = NE_data['Protein ID']

# rows whose monomer is in NE_essential_monomer_ids_found go to their own, relabelled frame
NE_essential_monomer_idxs = NE_monomer_ids.isin(NE_essential_monomer_ids_found)
NE_essential_monomers = NE_data[NE_essential_monomer_idxs].copy()
NE_essential_monomers['HL Source'] = "NE (essential gene)"
NE_data_remaining = NE_data[~NE_essential_monomer_idxs].copy()



In [None]:
# USER INPUTS: define the name of the combo in use and the special Clim name
# current_sequence is inserted into the plot titles and the output .html file names below.
# NOTE(review): the rate constants in this notebook are read from the CLClim1NE folder, yet this
# label says "CLClim0NE", so the saved plots may be confused with (or overwrite) those of a
# Clim0 notebook -- confirm whether this should be "CLClim1NE".
current_sequence = "CLClim0NE"
# Clim_name must equal the 'HL Source' label given to the Clim rows ("Clim0"), because the
# plotting cells filter on it.
Clim_name = 'Clim0'

Plot #1: A plot of all the protein half-lives in the combo (no specific order, but sorted by source)

In [None]:
# Plot #1: stack the CL, Clim and NE tables row-wise, then convert the rate constants to half-lives.
CLClim0NE_df_full = convert_HL_data(
    pd.concat([CL_data, Clim0_data, NE_data], ignore_index=True)
)

# one small marker per protein, coloured by where its half-life came from
fig = px.scatter(
    CLClim0NE_df_full,
    x="Protein ID",
    y=CLClim0NE_df_full["half life (min)"],
    color="HL Source",
)
fig.update_traces(marker={"size": 3})

# plot specs (the x tick labels are hidden: there is one tick per protein)
fig.update_xaxes(visible=False)
fig.update_layout(
    title='The Half Life Values for Proteins in the ' + current_sequence + ' Sorted Combo<br>',
    xaxis_title='Protein ID',
    yaxis_title='Half Life (min)',
    legend_title_text="Half Life Source",
)

# the figure will not open in PyCharm, so save it as an html file (creating the folder if needed):
out_pth = os.path.expanduser("~/wcEcoli/out/random_plotlys/" + current_sequence + "_partitioned.html")
output_dir = os.path.dirname(out_pth)
os.makedirs(output_dir, exist_ok=True)
fig.write_html(out_pth, auto_open=True)

Plot #2: A plot of the proteins' half life assignment, ordered by value

In [None]:
# Plot #2: plot in order! (plotly does not naturally plot in order while merging over the
# different subsets globally)

# Build the symbol lookup ONCE: the helper re-parses rnas.tsv on every call, so calling it inside
# .apply() (as this cell used to) re-read the file for every row of the table.
monomer_to_gene_symbol = get_gene_symbols_for_monomer_ids()
# indexing (rather than .map) keeps the original behaviour of raising KeyError for unknown IDs
CLClim0NE_df_full['gene symbol'] = CLClim0NE_df_full['Protein ID'].apply(lambda monomer_id: monomer_to_gene_symbol[monomer_id]) # this is to add to the hovertext
ordered_df = CLClim0NE_df_full.sort_values(by=["half life (min)"], ascending=True)

# create a dataframe for the sources of different rates:
ordered_CL_df = ordered_df[ordered_df['HL Source'] == "CL"]
ordered_Clim_df = ordered_df[ordered_df['HL Source'] == Clim_name]
ordered_NE_df = ordered_df[ordered_df['HL Source'] == "NE"]


# generate the figure in the background first (this faint base trace fixes the x-axis order):
CLClim0NE_df_full_sorted = CLClim0NE_df_full.sort_values(by=["half life (min)"], ascending=True)
fig = px.scatter(CLClim0NE_df_full_sorted, x="Protein ID", y="half life (min)", )
fig.update_traces(marker_size=.5, opacity=.3)

# overlay the individual rates
fig.add_trace(go.Scatter(x=(ordered_Clim_df['Protein ID']), y=(ordered_Clim_df["half life (min)"]), mode='markers', name=Clim_name, hovertext=ordered_Clim_df['gene symbol'], marker_size=5, marker=dict(color="darkorange")))
fig.add_trace(go.Scatter(x=(ordered_NE_df['Protein ID']), y=(ordered_NE_df["half life (min)"]), mode='markers', name='NE', hovertext=ordered_NE_df['gene symbol'], marker_size=5, marker=dict(color="lightseagreen")))
fig.add_trace(go.Scatter(x=(ordered_CL_df['Protein ID']), y=(ordered_CL_df["half life (min)"]), mode='markers', name='CL', hovertext=ordered_CL_df['gene symbol'], marker_size=7, marker=dict(color="deeppink"))) # putting this last so that I can see it stacked on top of the others

# plot specs
fig.update_xaxes(visible=False)
fig.update_layout(legend_title_text='Half Life Source')
fig.update_layout(title='The Half Life Values for Proteins in the ' + current_sequence + ' Sorted Combo<br>', xaxis_title='Protein ID', yaxis_title='Half Life (min)')

# the figure will not open in PyCharm, so save it as an html file:
out_pth = "~/wcEcoli/out/random_plotlys/" + current_sequence + "_ordered.html"
out_pth = os.path.expanduser(out_pth)
output_dir = os.path.dirname(out_pth)
os.makedirs(output_dir, exist_ok=True)  # Create the directory if it does not exist
fig.write_html(out_pth, auto_open=True) # displays the figure in the default web browser

Plot #3: Graph of the proteins with their essential genes highlighted

In [None]:
# Plot #3: half-lives in ascending order, with the essential genes of each source highlighted.

# combine all the data frames
dfs = [Clim0_essential_monomers, Clim0_data_remaining, CL_essential_monomers, CL_data_remaining, NE_essential_monomers, NE_data_remaining]
CLClim0NE_df_full_withEGs = pd.concat(dfs, ignore_index=True) # concatenate row-wise

# plot in order! (plotly does not naturally plot in order while merging over the different subsets globally)
CLClim0NE_df_full_withEGs = convert_HL_data(CLClim0NE_df_full_withEGs)

# Build the symbol lookup ONCE: the helper re-parses rnas.tsv on every call, so calling it inside
# .apply() (as this cell used to) re-read the file for every row of the table.
monomer_to_gene_symbol = get_gene_symbols_for_monomer_ids()
# indexing (rather than .map) keeps the original behaviour of raising KeyError for unknown IDs
CLClim0NE_df_full_withEGs['gene symbol'] = CLClim0NE_df_full_withEGs['Protein ID'].apply(lambda monomer_id: monomer_to_gene_symbol[monomer_id]) # this is to add to the hovertext
ordered_df = CLClim0NE_df_full_withEGs.sort_values(by=["half life (min)"], ascending=True)

# create a dataframe for the sources of different rates:
ordered_CL_df = ordered_df[ordered_df['HL Source'] == "CL"]
ordered_Clim_df = ordered_df[ordered_df['HL Source'] == Clim_name]
ordered_NE_df = ordered_df[ordered_df['HL Source'] == "NE"]
ordered_CL_EG_df = ordered_df[ordered_df['HL Source'] == "CL (essential gene)"]
ordered_Clim_EG_df = ordered_df[ordered_df['HL Source'] == Clim_name + " (essential gene)"]
ordered_NE_EG_df = ordered_df[ordered_df['HL Source'] == "NE (essential gene)"]

# generate the figure in the background first (this faint base trace fixes the x-axis order):
fig = px.scatter(ordered_df, x="Protein ID", y="half life (min)", )
fig.update_traces(marker_size=.5, opacity=.1)

# overlay the individual rates: (subset, legend label, marker size, colour, marker symbol).
# Traces are added in list order, so later entries are drawn on top of earlier ones.
overlay_specs = [
    (ordered_Clim_df, Clim_name + ', ', 5, "darkorange", None),
    (ordered_NE_df, 'NE, ', 5, "lightseagreen", None),
    (ordered_CL_df, 'CL, ', 7, "deeppink", None),
    (ordered_Clim_EG_df, Clim_name + ' (essential genes), ', 3, "lime", 'diamond'),
    (ordered_NE_EG_df, 'NE (essential genes), ', 3, "yellow", 'star'),
    (ordered_CL_EG_df, 'CL (essential genes), ', 3, "deepskyblue", 'square'),
]
for subset_df, label, size, color, symbol in overlay_specs:
    marker_style = dict(color=color, size=size)
    if symbol is not None:
        marker_style['symbol'] = symbol
    fig.add_trace(go.Scatter(
        x=subset_df['Protein ID'],
        y=subset_df["half life (min)"],
        mode='markers',
        name=label + str(len(subset_df)),  # the legend entry ends with the number of proteins
        hovertext=subset_df['gene symbol'],
        marker=marker_style,
    ))

# plot specs
fig.update_xaxes(visible=False)
fig.update_layout(legend_title_text='Half Life Source')
fig.update_layout(title='The Half Life Values for Proteins in the ' + current_sequence + ' Sorted Combo<br>', xaxis_title='Protein ID', yaxis_title='Half Life (min)')

# the figure will not open in PyCharm, so save it as an html file:
out_pth = "~/wcEcoli/out/random_plotlys/" + current_sequence + "_ordered_with_essential_genes.html"
out_pth = os.path.expanduser(out_pth)
output_dir = os.path.dirname(out_pth)
os.makedirs(output_dir, exist_ok=True)  # Create the directory if it does not exist
fig.write_html(out_pth, auto_open=True) # displays the figure in the default web browser

In [None]:
# Display the essential-gene rows of the Clim subset (already sorted by half-life in the Plot #3 cell).
ordered_Clim_EG_df

# Plots with the proteins assigned to proteases

In [None]:
# Drop the essential-gene masks and frames so the same variable names can be reused for the
# protease-substrate versions of the plots below.
del (
    essential_monomer_idxs, Clim0_essential_monomers, Clim0_data_remaining,
    CL_essential_monomer_idxs, CL_essential_monomers, CL_data_remaining,
    NE_essential_monomer_idxs, NE_essential_monomers, NE_data_remaining,
)

In [None]:
# Split each rate-constant table (Clim, CL, NE) into its protease-substrate rows and the remaining
# rows (it does not matter whether the proteins are essential or not).
#
# NOTE: the "*_essential_*" variable names are reused from the essential-gene section so that the
# plotting code can stay the same; in this cell they hold PROTEASE SUBSTRATES.
#
# The files are read from 'CLClim1NE_Parca_rate_constants', the same folder used by every other
# cell (including the one that built the *_protease_substrate_monomer_ids lists), rather than the
# 'ParCa_rate_constants' folder name this cell previously pointed at.
rate_constants_dir = 'CLClim1NE_Parca_rate_constants'

# --- Clim group ---
Clim0_data = pd.read_csv(os.path.join(rate_constants_dir, 'Clim_rate_constants.csv'))
Clim0_data['HL Source'] = "Clim0" # add a column with the half life source
monomer_ids = Clim0_data['Protein ID']

# rows whose monomer is in Clim0_protease_substrate_monomer_ids go to their own, relabelled frame;
# every other row goes to Clim0_data_remaining
essential_monomer_idxs = monomer_ids.isin(Clim0_protease_substrate_monomer_ids)
Clim0_essential_monomers = Clim0_data[essential_monomer_idxs].copy()
Clim0_essential_monomers['HL Source'] = "Clim0 (protease substrate)"
Clim0_data_remaining = Clim0_data[~essential_monomer_idxs].copy()

# --- CL group ---
CL_data = pd.read_csv(os.path.join(rate_constants_dir, 'CL_rate_constants.csv'))
CL_data['HL Source'] = "CL" # add a column with the half life source
CL_monomer_ids = CL_data['Protein ID']

# rows whose monomer is in CL_protease_substrate_monomer_ids go to their own, relabelled frame
CL_essential_monomer_idxs = CL_monomer_ids.isin(CL_protease_substrate_monomer_ids)
CL_essential_monomers = CL_data[CL_essential_monomer_idxs].copy()
CL_essential_monomers['HL Source'] = "CL (protease substrate)"
CL_data_remaining = CL_data[~CL_essential_monomer_idxs].copy()

# --- NE group ---
NE_data = pd.read_csv(os.path.join(rate_constants_dir, 'NE_rate_constants.csv'))
NE_data['HL Source'] = "NE" # add a column with the half life source
NE_monomer_ids = NE_data['Protein ID']

# rows whose monomer is in NE_protease_substrate_monomer_ids go to their own, relabelled frame
NE_essential_monomer_idxs = NE_monomer_ids.isin(NE_protease_substrate_monomer_ids)
NE_essential_monomers = NE_data[NE_essential_monomer_idxs].copy()
NE_essential_monomers['HL Source'] = "NE (protease substrate)"
NE_data_remaining = NE_data[~NE_essential_monomer_idxs].copy()



Plot #4: Similar to plot 3, but instead of the essential genes, the proteins that are protease substrates (i.e., assigned to a protease) are highlighted!

In [None]:
# Plot #4: half-lives in ascending order, with the protease substrates of each source highlighted.
# (The "*_essential_monomers" frames hold protease substrates at this point in the notebook.)

# combine all the data frames
dfs = [Clim0_essential_monomers, Clim0_data_remaining, CL_essential_monomers, CL_data_remaining, NE_essential_monomers, NE_data_remaining]
CLClim0NE_df_full_withPSs = pd.concat(dfs, ignore_index=True) # concatenate row-wise

# plot in order! (plotly does not naturally plot in order while merging over the different subsets globally)
CLClim0NE_df_full_withPSs = convert_HL_data(CLClim0NE_df_full_withPSs)

# Build the symbol lookup ONCE: the helper re-parses rnas.tsv on every call, so calling it inside
# .apply() (as this cell used to) re-read the file for every row of the table.
monomer_to_gene_symbol = get_gene_symbols_for_monomer_ids()
# indexing (rather than .map) keeps the original behaviour of raising KeyError for unknown IDs
CLClim0NE_df_full_withPSs['gene symbol'] = CLClim0NE_df_full_withPSs['Protein ID'].apply(lambda monomer_id: monomer_to_gene_symbol[monomer_id]) # this is to add to the hovertext
ordered_df = CLClim0NE_df_full_withPSs.sort_values(by=["half life (min)"], ascending=True)

# create a dataframe for the sources of different rates:
ordered_CL_df = ordered_df[ordered_df['HL Source'] == "CL"]
ordered_Clim_df = ordered_df[ordered_df['HL Source'] == Clim_name]
ordered_NE_df = ordered_df[ordered_df['HL Source'] == "NE"]
ordered_CL_PS_df = ordered_df[ordered_df['HL Source'] == "CL (protease substrate)"]
ordered_Clim_PS_df = ordered_df[ordered_df['HL Source'] == Clim_name + " (protease substrate)"]
ordered_NE_PS_df = ordered_df[ordered_df['HL Source'] == "NE (protease substrate)"]

# generate the figure in the background first (this faint base trace fixes the x-axis order):
fig = px.scatter(ordered_df, x="Protein ID", y="half life (min)")
fig.update_traces(marker_size=.5, opacity=.1)

# overlay the individual rates: (subset, legend label, marker size, colour, marker symbol).
# Traces are added in list order, so later entries are drawn on top of earlier ones.
overlay_specs = [
    (ordered_Clim_df, Clim_name + ', ', 5, "darkorange", None),
    (ordered_NE_df, 'NE, ', 5, "lightseagreen", None),
    (ordered_CL_df, 'CL, ', 7, "deeppink", None),
    (ordered_Clim_PS_df, Clim_name + ' (protease substrates), ', 3, "lime", 'diamond'),
    (ordered_NE_PS_df, 'NE (protease substrates), ', 3, "yellow", 'star'),
    (ordered_CL_PS_df, 'CL (protease substrates), ', 3, "deepskyblue", 'square'),
]
for subset_df, label, size, color, symbol in overlay_specs:
    marker_style = dict(color=color, size=size)
    if symbol is not None:
        marker_style['symbol'] = symbol
    fig.add_trace(go.Scatter(
        x=subset_df['Protein ID'],
        y=subset_df["half life (min)"],
        mode='markers',
        name=label + str(len(subset_df)),  # the legend entry ends with the number of proteins
        hovertext=subset_df['gene symbol'],
        marker=marker_style,
    ))


# plot specs
fig.update_xaxes(visible=False)
fig.update_layout(legend_title_text='Half Life Source')
fig.update_layout(title='4: The Half Life Values for Proteins in the ' + current_sequence + ' Sorted Combo<br>', xaxis_title='Protein ID', yaxis_title='Half Life (min)')

# the figure will not open in PyCharm, so save it as an html file:
out_pth = "~/wcEcoli/out/random_plotlys/" + current_sequence + "_ordered_with_protein_substrates.html"
out_pth = os.path.expanduser(out_pth)
output_dir = os.path.dirname(out_pth)
os.makedirs(output_dir, exist_ok=True)  # Create the directory if it does not exist
fig.write_html(out_pth, auto_open=True) # displays the figure in the default web browser

Plot #5: a plot of just the Clim0 half-lives and the ones that are protease substrates

In [None]:
# first, prepare the data for merging:

# Reload the Clim table and label its protease substrates.
# The path is relative to the working directory set by os.chdir() in the first cell, matching the
# earlier cells, instead of a user-specific absolute '/Users/...' path.
Clim0_data = pd.read_csv('CLClim1NE_Parca_rate_constants/Clim_rate_constants.csv')
Clim0_data['HL Source'] = "Clim0" # add a column with the half life source
monomer_ids = Clim0_data["Protein ID"]

# rows whose monomer is in Clim0_protease_substrate_monomer_ids get the substrate label;
# every other row keeps the plain "Clim0" label
Clim0_protease_substrate_idxs = monomer_ids.isin(Clim0_protease_substrate_monomer_ids)
Clim0_protease_substrates = Clim0_data[Clim0_protease_substrate_idxs].copy()
Clim0_protease_substrates['HL Source'] = "Clim0 (protease substrate)"
Clim0_data_remaining = Clim0_data[~Clim0_protease_substrate_idxs].copy()

# merge it back together momentarily (substrate rows first, then the rest):
Clim0_data = pd.concat([Clim0_protease_substrates, Clim0_data_remaining], ignore_index=True)

# load in the "N-end" rule rates from the CLNE model (CL is the exact same between both):
CLNE_NE_data = pd.read_excel('~/wcEcoli/models/ecoli/analysis/local_notebooks/C_limited_PDR_analyses_with_published_paper_data/previous_models/CLNE_files/CLNE_full.xlsx')
# the second column of the spreadsheet is treated as the rate constant
CLNE_NE_data.rename(columns={CLNE_NE_data.columns[1]: "Rate Constant"}, inplace=True)

# convert both datasets (each gets source-prefixed rate-constant and half-life columns):
CLNE_NE_data = convert_data_and_add_HL_source(CLNE_NE_data, "CLNE NE")
CLClim0NE_Clim0_data = convert_data_and_add_HL_source(Clim0_data, "CLClim0NE Clim0")

In [None]:
# second, merge the data:

# keep only the proteins present in both datasets (inner join on the protein ID)
df = pd.merge(CLClim0NE_Clim0_data, CLNE_NE_data, on='Protein ID', how='inner')

# Select the proteins whose CLNE N-end rule half life is above 10 minutes. These
# are the proteins the figure below describes as "assigned to a half life of
# 10 hours" in the CLNE model, hence the `_10hrs` suffix.
df_10hrs = df[df['CLNE NE half life (min)'] > 10].copy()
monomer_ids_10hrs = df_10hrs['Protein ID'] # todo: do I need this??

# Fetch the monomer ID -> gene symbol mapping once instead of once per row.
# (Assumes the helper returns the same mapping on every call - TODO confirm.)
gene_symbols_by_monomer_id = get_gene_symbols_for_monomer_ids()
df_10hrs['gene symbol'] = df_10hrs['Protein ID'].apply(lambda x: gene_symbols_by_monomer_id[x])

# only the Clim0 half life (plus IDs, source and gene symbol) is needed from here on
df_10hrs = df_10hrs.drop(columns=["CLNE NE rate constant (s^1)", "CLNE NE half life (min)", "CLClim0NE Clim0 rate constant (s^1)"])
df_10hrs

In [None]:
# third, order the dataframe, then resplit the data

# sort the 10-hour proteins by their Clim0 half life (shortest first)
ordered_df = df_10hrs.sort_values("CLClim0NE Clim0 half life (min)")

# split the ordered frame by its half-life source label:
hl_source_10hrs = ordered_df['HL Source']
ordered_Clim_df = ordered_df[hl_source_10hrs.eq(Clim_name)]
ordered_Clim_PS_df = ordered_df[hl_source_10hrs.eq(Clim_name + " (protease substrate)")]

In [None]:
# display the ordered rows whose 'HL Source' equals Clim_name
ordered_Clim_df

In [None]:
# fourth, generate the graph

# Faint background layer with every protein. px.scatter already returns a new
# figure, so no separate go.Figure() has to be created first.
fig = px.scatter(ordered_df, x=ordered_df["Protein ID"], y=ordered_df["CLClim0NE Clim0 half life (min)"])
fig.update_traces(marker_size=.5, opacity=.1)

# overlay the two half-life sources; each legend entry ends with the number of proteins in that group
fig.add_trace(go.Scatter(
    x=ordered_Clim_df['Protein ID'],
    y=ordered_Clim_df["CLClim0NE Clim0 half life (min)"],
    mode='markers',
    name=Clim_name + ', ' + str(np.shape(ordered_Clim_df['Protein ID'])[0]),
    hovertext=ordered_Clim_df['gene symbol'],
    marker_size=3,
    marker=dict(color="lightseagreen"),
    yaxis="y1"))
fig.add_trace(go.Scatter(
    x=ordered_Clim_PS_df['Protein ID'],
    y=ordered_Clim_PS_df["CLClim0NE Clim0 half life (min)"],
    mode='markers',
    name=Clim_name + ' (protease substrates), ' + str(np.shape(ordered_Clim_PS_df['Protein ID'])[0]),
    hovertext=ordered_Clim_PS_df['gene symbol'],
    marker_size=5,
    marker=dict(color="orange", symbol='star'),
    yaxis="y1"))

# Plot Specs (title typo "assinged" corrected)
fig.update_layout(title=f'The C-limited (Clim) half life values for the {str(np.shape(df_10hrs)[0])} proteins in the ' + current_sequence + ' PDR combo<br> that were assigned to a half life of 10 hours in the 2020 model (CLNE)', xaxis_title='Protein ID', yaxis_title='Clim Half Life (min)')
fig.update_xaxes(visible=False)  # one tick per protein ID would be unreadable
fig.update_layout(autosize=False, width=1200, height=700, showlegend=True)
fig.update_layout(legend_title_text='Half Life Source')

# The figure won't open inside PyCharm, so save it as a standalone HTML file instead:
out_pth = "~/wcEcoli/out/random_plotlys/" + current_sequence + "_Clim0_ordered_with_protein_substrates.html"
out_pth = os.path.expanduser(out_pth)
output_dir = os.path.dirname(out_pth)
os.makedirs(output_dir, exist_ok=True)  # Create the directory if it does not exist
fig.write_html(out_pth, auto_open=True) # writes the HTML file and opens it in the default web browser


# for proposal: 

10 hours 

Plot #6: a plot of just the Clim0 half-lives, with the protease substrates colored by their protease assignment
Part A: determine the proteins:

In [None]:
# first find the monomer IDs that are substrates of the proteases in the CLClim0NE model:
protease_data = pd.read_excel('~/wcEcoli/models/ecoli/analysis/local_notebooks/C_limited_PDR_analyses_with_published_paper_data/supplementary_data/Gupta_et_al_2024_ST2_protease_assignment_data.xlsx')

# Start from the ordered 10-hour frame (it already carries the gene symbols) and
# default every protein to "NA", i.e. no protease assignment found.
ordered_protease_df = ordered_df.copy()
ordered_protease_df['protease assignement'] = "NA"

# determine which proteins are substrates of the proteases, matching on gene symbol:
gene_ids = ordered_protease_df['gene symbol']
protease_gene_names = protease_data['Gene name'].values  # extracted once, not once per protein

Clim0_protease_substrate_gene_symbols = []
for gene_id in gene_ids:
    if gene_id in protease_gene_names:
        Clim0_protease_substrate_gene_symbols.append(gene_id)
        protease_row = protease_data[protease_data['Gene name'] == gene_id]
        # if a gene is listed more than once, the first listed assignment is used
        protease = protease_row['Protease assignment'].values[0]
        # Build the row mask from the frame being written to. The original used
        # ordered_df here, which only worked because the two frames share an index.
        ordered_protease_df.loc[ordered_protease_df['gene symbol'] == gene_id, 'protease assignement'] = protease

# split the proteins by their assignment label (labels as spelled in the spreadsheet):
protease_assignment = ordered_protease_df['protease assignement']

# figure out which proteins were assigned to lon in the model:
lon_proteins = ordered_protease_df[protease_assignment == 'Lon only']

# figure out which proteins were assigned to clp in the model:
clp_proteins = ordered_protease_df[protease_assignment == 'ClpP only']

hslv_proteins = ordered_protease_df[protease_assignment == 'HslV only']

# figure out which proteins are additive in the model:
additive_proteins = ordered_protease_df[protease_assignment == 'Additive']

# figure out which proteins are redundant in the model:
redundant_proteins = ordered_protease_df[protease_assignment == 'Redundant']

# figure out which proteins are assigned as "unexplained" in the model:
unexplained_proteins = ordered_protease_df[protease_assignment == 'Actively degrading in Tripple KO']

# figure out which proteins were not assigned to a protease in the model:
no_protease_proteins = ordered_protease_df[protease_assignment == 'NA']

# todo: note that these numbers will not add up to those in the graph below bc some proteins in the CLClim0NE model are not in the CLNE model
#298 total
# (the trailing numbers are the counts noted by the author on an earlier run)
print("Lon Proteins(" +str(len(lon_proteins))+"): ", list(lon_proteins['gene symbol'])) #14
print("ClpP Proteins(" +str(len(clp_proteins))+"): ", list(clp_proteins['gene symbol'])) #63
print("HslV Proteins(" +str(len(hslv_proteins))+"): ", list(hslv_proteins['gene symbol'])) #1
print("Additive Proteins(" +str(len(additive_proteins))+"): ", list(additive_proteins['gene symbol'])) #79
print("Redundant Proteins(" +str(len(redundant_proteins))+"): ", list(redundant_proteins['gene symbol'])) #39
print("Unexplained Proteins(" +str(len(unexplained_proteins))+"): ", list(unexplained_proteins['gene symbol'])) #102
#print("No Protease Proteins(" +str(len(no_protease_proteins))+"): ", list(no_protease_proteins['gene symbol'])) # 4012


Part B: generate the graph

In [None]:
# Plot #6: Clim0 half lives of the 10-hour proteins, colored by protease assignment.

# Faint background layer with every protein. px.scatter already returns a new
# figure, so no separate go.Figure() has to be created first.
fig = px.scatter(ordered_protease_df, x=ordered_protease_df["Protein ID"], y=ordered_protease_df["CLClim0NE Clim0 half life (min)"])
fig.update_traces(marker_size=.01, opacity=.1)

# figure out slices of the data:
protease_df = ordered_protease_df.copy()
# Unfiltered per-category copies (kept under their original names in case other cells rely on them):
CLClimNE_Clim0_log_data_with_Lon_proteases = protease_df.copy()
CLClimNE_Clim0_log_data_with_ClpP_proteases = protease_df.copy()
CLClimNE_Clim0_log_data_with_HslV_proteases = protease_df.copy()
CLClimNE_Clim0_log_data_with_Additive_proteases = protease_df.copy()
CLClimNE_Clim0_log_data_with_Redundant_proteases = protease_df.copy()
CLClimNE_Clim0_log_data_with_Unexplained_proteases = protease_df.copy()
CLClimNE_Clim0_log_data_with_no_protease_proteases = protease_df.copy()

# keep, per category, only the proteins carrying that protease assignment:
_assignment = protease_df['protease assignement']
CLClimNE_log_data_with_Lon_proteases = protease_df[_assignment == 'Lon only']
CLClimNE_log_data_with_ClpP_proteases = protease_df[_assignment == 'ClpP only']
CLClimNE_log_data_with_HslV_proteases = protease_df[_assignment == 'HslV only']
CLClimNE_log_data_with_Additive_proteases = protease_df[_assignment == 'Additive']
CLClimNE_log_data_with_Redundant_proteases = protease_df[_assignment == 'Redundant']
CLClimNE_log_data_with_Unexplained_proteases = protease_df[_assignment == 'Actively degrading in Tripple KO']
CLClimNE_log_data_with_no_protease_proteases = protease_df[_assignment == 'NA']

# One overlay trace per category, drawn in this order (later traces sit on top).
# Columns: (frame, legend prefix, marker_size, marker dict).
# NOTE(review): both marker_size and marker['size'] are passed, exactly as in the
# original calls - confirm which of the two plotly ends up applying.
_trace_specs = [
    (CLClimNE_log_data_with_no_protease_proteases, "NA ", 2, dict(color="lightseagreen", size=3, opacity=.6)),
    (CLClimNE_log_data_with_Unexplained_proteases, "Unexplained ", 3, dict(size=3, opacity=.6)),
    (CLClimNE_log_data_with_Redundant_proteases, "Redundant  ", 3, dict(color="orange", size=3, opacity=.6)),
    (CLClimNE_log_data_with_Additive_proteases, "Additive  ", 4, dict(size=3, opacity=.6)),
    (CLClimNE_log_data_with_Lon_proteases, "Lon  ", 5, dict(size=3, opacity=.6)),
    (CLClimNE_log_data_with_ClpP_proteases, "ClpP  ", 5, dict(size=3, opacity=.6)),
    (CLClimNE_log_data_with_HslV_proteases, "HslV  ", 5, dict(size=3, opacity=.6)),
]
for _frame, _legend_prefix, _marker_size, _marker in _trace_specs:
    fig.add_trace(go.Scatter(
        x=_frame['Protein ID'],
        y=_frame["CLClim0NE Clim0 half life (min)"],
        mode='markers',
        # each legend entry ends with the number of proteins in the category
        name=_legend_prefix + str(np.shape(_frame['Protein ID'])[0]),
        hovertext=_frame['gene symbol'],
        marker_size=_marker_size,
        marker=_marker,
        yaxis="y1"))

# Plot Specs (title typo "assinged" corrected)
fig.update_layout(title=dict(text='The C-limited (Clim) half life values for the New WCM proteins assigned to Clim <br> half-lives that were assigned to a half-life of 10 hours in the original model ', font=dict(size=15)), xaxis_title='Protein ID', yaxis_title='Clim Half Life (min)')
fig.update_xaxes(visible=False)  # one tick per protein ID would be unreadable
fig.update_layout(autosize=False, width=700, height=600, showlegend=True)

# The figure won't open inside PyCharm, so save it as a standalone HTML file instead:
out_pth = "~/wcEcoli/out/random_plotlys/" + current_sequence + "_Clim0_ordered_with_protein_substrates_and_colored_proposal.html"
out_pth = os.path.expanduser(out_pth)
output_dir = os.path.dirname(out_pth)
os.makedirs(output_dir, exist_ok=True)  # Create the directory if it does not exist
fig.write_html(out_pth, auto_open=True) # writes the HTML file and opens it in the default web browser


In [None]:
# load in the protease-substrate assignment datatable:
protease_data = pd.read_excel('~/wcEcoli/models/ecoli/analysis/local_notebooks/C_limited_PDR_analyses_with_published_paper_data/supplementary_data/Gupta_et_al_2024_ST2_protease_assignment_data.xlsx')

# Fetch the helper mappings once instead of up to four times per essential gene.
# (Assumes the helpers return the same mapping on every call - TODO confirm.)
gene_symbols_by_monomer_id = get_gene_symbols_for_monomer_ids()
monomer_ids_by_gene_id = get_monomer_ids_for_gene_ids()
protease_gene_names = protease_data['Gene name'].values

# determine if any of the proteins assigned to a protease are considered an essential gene in the model:
essential_substrate_gene_symbols = []
for gene in get_essential_genes():
    # essential gene ID -> monomer ID -> gene symbol
    gene_symbol = gene_symbols_by_monomer_id[monomer_ids_by_gene_id[gene]]
    if gene_symbol in protease_gene_names:
        essential_substrate_gene_symbols.append(gene_symbol)

print("There are " + str(len(protease_data['Gene name'].values)) + " proteins assigned to a protease in the Gupta et al., 2024 data.")
print("Essential Genes in Model that are Assigned to a Protease (" +str(len(essential_substrate_gene_symbols))+"): ", essential_substrate_gene_symbols)

# Figure out which priority proteins are in the model overall:

In [None]:
# display the full CLClim0NE table (created outside this section of the notebook)
CLClim0NE_df_full

In [None]:
# NOTE(review): ordered_CLClim0NE_df_full is assigned in the cell that follows this one;
# unless it is also defined earlier in the notebook, this display fails on a fresh
# kernel (Restart & Run All) - confirm the intended cell order.
ordered_CLClim0NE_df_full

In [None]:
# figure out the priority proteins in the model:
priority_protease_data = pd.read_excel('~/wcEcoli/models/ecoli/analysis/local_notebooks/C_limited_PDR_analyses_with_published_paper_data/supplementary_data/ST2_prioritysort2_substrates.xlsx')

# sort the full table by half life, then work on a copy in which every protein
# defaults to "NA", i.e. no protease assignment found
CLClim0NE_df_full = CLClim0NE_df_full.sort_values(by=["half life (min)"])
ordered_CLClim0NE_df_full = CLClim0NE_df_full.copy()
ordered_CLClim0NE_df_full['protease assignement'] = "NA"

# Add the gene symbols. The monomer ID -> gene symbol mapping is fetched once
# instead of once per row. (Assumes the helper returns the same mapping on every
# call - TODO confirm.)
gene_symbols_by_monomer_id = get_gene_symbols_for_monomer_ids()
ordered_CLClim0NE_df_full['gene symbol'] = ordered_CLClim0NE_df_full['Protein ID'].apply(lambda x: gene_symbols_by_monomer_id[x])

# determine which proteins are priority protease substrates, matching on gene symbol:
gene_ids = ordered_CLClim0NE_df_full['gene symbol']
priority_gene_names = priority_protease_data['Gene name'].values  # extracted once, not once per protein

CLClim0NE_df_full_protease_substrate_gene_symbols = []
for gene_id in gene_ids:
    if gene_id in priority_gene_names:
        CLClim0NE_df_full_protease_substrate_gene_symbols.append(gene_id)
        protease_row = priority_protease_data[priority_protease_data['Gene name'] == gene_id]
        # if a gene is listed more than once, the first listed assignment is used
        protease = protease_row['Protease assignment'].values[0]
        ordered_CLClim0NE_df_full.loc[ordered_CLClim0NE_df_full['gene symbol'] == gene_id, 'protease assignement'] = protease

# split the proteins by their assignment label (labels as spelled in the spreadsheet):
_full_assignment = ordered_CLClim0NE_df_full['protease assignement']

# figure out which proteins were assigned to lon in the model:
lon_proteins = ordered_CLClim0NE_df_full[_full_assignment == 'Lon only']

# figure out which proteins were assigned to clp in the model:
clp_proteins = ordered_CLClim0NE_df_full[_full_assignment == 'ClpP only']

hslv_proteins = ordered_CLClim0NE_df_full[_full_assignment == 'HslV only']

# figure out which proteins are additive in the model:
additive_proteins = ordered_CLClim0NE_df_full[_full_assignment == 'Additive']

# figure out which proteins are redundant in the model:
redundant_proteins = ordered_CLClim0NE_df_full[_full_assignment == 'Redundant']

# figure out which proteins are assigned as "unexplained" in the model:
unexplained_proteins = ordered_CLClim0NE_df_full[_full_assignment == 'Actively degrading in Tripple KO']

# figure out which proteins were not assigned to a protease in the model:
no_protease_proteins = ordered_CLClim0NE_df_full[_full_assignment == 'NA']

# todo: note that these numbers will not add up to those in the graph below bc some proteins in the CLClim0NE model are not in the CLNE model
#298 total
# (the trailing numbers are the counts noted by the author on an earlier run)
print("Lon Proteins(" +str(len(lon_proteins))+"): ", list(lon_proteins['gene symbol'])) #6
print("ClpP Proteins(" +str(len(clp_proteins))+"): ", list(clp_proteins['gene symbol'])) #40
print("HslV Proteins(" +str(len(hslv_proteins))+"): ", list(hslv_proteins['gene symbol'])) #1
print("Additive Proteins(" +str(len(additive_proteins))+"): ", list(additive_proteins['gene symbol'])) #33
print("Redundant Proteins(" +str(len(redundant_proteins))+"): ", list(redundant_proteins['gene symbol']))
print("Unexplained Proteins(" +str(len(unexplained_proteins))+"): ", list(unexplained_proteins['gene symbol']))

# figure out slices of the data:
ordered_proteases_CLClim0NE_df_full = ordered_CLClim0NE_df_full.copy()
# These three are left as unfiltered copies of the full frame, exactly as before:
CLClimNE_full_log_data_with_Redundant_proteases = ordered_proteases_CLClim0NE_df_full.copy()
CLClimNE_full_log_data_with_Unexplained_proteases = ordered_proteases_CLClim0NE_df_full.copy()
CLClimNE_full_log_data_with_no_protease_proteases = ordered_proteases_CLClim0NE_df_full.copy()

# keep, per priority category, only the proteins carrying that protease assignment:
_priority_assignment = ordered_proteases_CLClim0NE_df_full['protease assignement']
CLClimNE_full_log_data_with_Lon_proteases = ordered_proteases_CLClim0NE_df_full[_priority_assignment == 'Lon only']
CLClimNE_full_log_data_with_ClpP_proteases = ordered_proteases_CLClim0NE_df_full[_priority_assignment == 'ClpP only']
CLClimNE_full_log_data_with_HslV_proteases = ordered_proteases_CLClim0NE_df_full[_priority_assignment == 'HslV only']
CLClimNE_full_log_data_with_Additive_proteases = ordered_proteases_CLClim0NE_df_full[_priority_assignment == 'Additive']

priority_substrates_in_the_full_model = [CLClimNE_full_log_data_with_Lon_proteases, CLClimNE_full_log_data_with_ClpP_proteases, CLClimNE_full_log_data_with_HslV_proteases, CLClimNE_full_log_data_with_Additive_proteases]

# concatenate the dataframes (Lon, ClpP, HslV, then Additive):
priority_substrates_in_the_full_model_df = pd.concat(priority_substrates_in_the_full_model)
priority_substrates_in_the_full_model_df

In [None]:
# Debugging check: list the rows of priority_protease_data whose gene name does not
# appear among the gene symbols of priority_substrates_in_the_full_model_df
# (originally two were missing).
priority_protease_data[~priority_protease_data['Gene name'].isin(priority_substrates_in_the_full_model_df['gene symbol'])]
# Cause found: yedW is the same as hprR in our model and csiR is the same as glaR. The fix chosen was to manually change the gene names of these two in the Excel file of priority substrates.
# Edit: that change has been made, so this output should be empty now.

In [None]:
# check which of the priority substrates are essential in the model:
priority_protease_substrate_EG_gene_symbols = []
priority_protease_substrate_EG_monomer_ids = []

# Fetch the helper mappings once instead of up to three times per essential gene.
# (Assumes the helpers return the same mapping on every call - TODO confirm.)
gene_symbols_by_monomer_id = get_gene_symbols_for_monomer_ids()
monomer_ids_by_gene_id = get_monomer_ids_for_gene_ids()
priority_gene_names = priority_protease_data['Gene name'].values

for gene in get_essential_genes():
    # essential gene ID -> monomer ID -> gene symbol
    monomer_id = monomer_ids_by_gene_id[gene]
    gene_symbol = gene_symbols_by_monomer_id[monomer_id]
    if gene_symbol in priority_gene_names:
        priority_protease_substrate_EG_gene_symbols.append(gene_symbol)
        priority_protease_substrate_EG_monomer_ids.append(monomer_id)

print("Total Essential Genes in the priority substrates (" +str(len(priority_protease_substrate_EG_gene_symbols))+"): ", priority_protease_substrate_EG_gene_symbols)

## same plot as above but with the priority protease substrates only

Plot 7: a plot of just the Clim0 half-lives, highlighting the ones that are PRIORITY protease substrates
PART A: figure out which proteins in Clim0 (those originally assigned a 10-hour half-life) are priority protease substrates

In [None]:
# Load the priority protease-substrate assignments:
priority_protease_data = pd.read_excel('~/wcEcoli/models/ecoli/analysis/local_notebooks/C_limited_PDR_analyses_with_published_paper_data/supplementary_data/ST2_prioritysort2_substrates.xlsx')

# Work on a copy of the ordered 10-hour frame; every protein starts out as "NA"
# (no protease assignment found).
ordered_priority_protease_df = ordered_df.copy()
ordered_priority_protease_df['protease assignement'] = "NA"

# Match on gene symbol and copy over the first listed assignment for each match:
gene_ids = ordered_priority_protease_df['gene symbol']

Clim0_protease_substrate_gene_symbols = []
for gene_id in gene_ids:
    if gene_id not in priority_protease_data['Gene name'].values:
        continue  # not a priority substrate, keep "NA"
    Clim0_protease_substrate_gene_symbols.append(gene_id)
    protease_row = priority_protease_data[priority_protease_data['Gene name'] == gene_id]
    protease = protease_row['Protease assignment'].values[0]
    rows_for_gene = ordered_priority_protease_df['gene symbol'] == gene_id
    ordered_priority_protease_df.loc[rows_for_gene, 'protease assignement'] = protease

# One frame per assignment label (labels as spelled in the spreadsheet):
priority_assignment = ordered_priority_protease_df['protease assignement']
lon_proteins = ordered_priority_protease_df[priority_assignment.eq('Lon only')]
clp_proteins = ordered_priority_protease_df[priority_assignment.eq('ClpP only')]
hslv_proteins = ordered_priority_protease_df[priority_assignment.eq('HslV only')]
additive_proteins = ordered_priority_protease_df[priority_assignment.eq('Additive')]
redundant_proteins = ordered_priority_protease_df[priority_assignment.eq('Redundant')]
# "unexplained" = still actively degrading in the triple knockout
unexplained_proteins = ordered_priority_protease_df[priority_assignment.eq('Actively degrading in Tripple KO')]
# proteins without any protease assignment
no_protease_proteins = ordered_priority_protease_df[priority_assignment.eq('NA')]

# todo: note that these numbers will not add up to those in the graph below bc some proteins in the CLClim0NE model are not in the CLNE model
# Report the size and gene symbols of each category (the "NA" group is not printed).
for category_label, category_df in [
    ("Lon", lon_proteins),
    ("ClpP", clp_proteins),
    ("HslV", hslv_proteins),
    ("Additive", additive_proteins),
    ("Redundant", redundant_proteins),
    ("Unexplained", unexplained_proteins),
]:
    print(f"{category_label} Proteins({len(category_df)}): ", list(category_df['gene symbol']))

# worth noting: the "Additive Proteins" were 35 when the priority proteases in the whole model were found. only 31 are found in the Clim0 here

PART B: generate the graph

In [None]:
# make the snake graph with the priority protease data:

# Faint background layer with every protein. Both axes are now read from
# ordered_priority_protease_df; the original took the y values from
# ordered_protease_df, a leftover from the Plot #6 cell. px.scatter already
# returns a new figure, so no separate go.Figure() is created first.
fig = px.scatter(ordered_priority_protease_df, x=ordered_priority_protease_df["Protein ID"], y=ordered_priority_protease_df["CLClim0NE Clim0 half life (min)"])
fig.update_traces(marker_size=.01, opacity=.1)

# figure out slices of the data:
priority_protease_df = ordered_priority_protease_df.copy()
# Unfiltered per-category copies (kept under their original names; a later cell
# reads CLClimNE_Clim0_log_data_with_Additive_proteases):
CLClimNE_Clim0_log_data_with_Lon_proteases = priority_protease_df.copy()
CLClimNE_Clim0_log_data_with_ClpP_proteases = priority_protease_df.copy()
CLClimNE_Clim0_log_data_with_HslV_proteases = priority_protease_df.copy()
CLClimNE_Clim0_log_data_with_Additive_proteases = priority_protease_df.copy()
CLClimNE_Clim0_log_data_with_no_protease_proteases = priority_protease_df.copy()

# keep, per category, only the proteins carrying that protease assignment:
_priority_assignment = priority_protease_df['protease assignement']
CLClimNE_log_data_with_Lon_proteases = priority_protease_df[_priority_assignment == 'Lon only']
CLClimNE_log_data_with_ClpP_proteases = priority_protease_df[_priority_assignment == 'ClpP only']
CLClimNE_log_data_with_HslV_proteases = priority_protease_df[_priority_assignment == 'HslV only']
CLClimNE_log_data_with_Additive_proteases = priority_protease_df[_priority_assignment == 'Additive']
CLClimNE_log_data_with_no_protease_proteases = priority_protease_df[_priority_assignment == 'NA']

# One overlay trace per category, drawn in this order (later traces sit on top).
# Columns: (frame, legend prefix, marker_size, marker dict).
# NOTE(review): both marker_size and marker['size'] are passed, exactly as in the
# original calls - confirm which of the two plotly ends up applying.
_trace_specs = [
    (CLClimNE_log_data_with_no_protease_proteases, "NA ", 2, dict(color="lightseagreen", size=3, opacity=.6)),
    (CLClimNE_log_data_with_Additive_proteases, "Additive  ", 4, dict(color='deeppink', size=3, opacity=.6)),
    (CLClimNE_log_data_with_Lon_proteases, "Lon  ", 5, dict(size=3, opacity=.6)),
    (CLClimNE_log_data_with_ClpP_proteases, "ClpP  ", 5, dict(size=3, opacity=.6)),
    (CLClimNE_log_data_with_HslV_proteases, "HslV  ", 5, dict(size=3, opacity=.6)),
]
for _frame, _legend_prefix, _marker_size, _marker in _trace_specs:
    fig.add_trace(go.Scatter(
        x=_frame['Protein ID'],
        y=_frame["CLClim0NE Clim0 half life (min)"],
        mode='markers',
        # each legend entry ends with the number of proteins in the category
        name=_legend_prefix + str(np.shape(_frame['Protein ID'])[0]),
        hovertext=_frame['gene symbol'],
        marker_size=_marker_size,
        marker=_marker,
        yaxis="y1"))

# Plot Specs (title typo "assinged" corrected)
fig.update_layout(title=dict(text='The C-limited (Clim) half life values for the New WCM proteins assigned to Clim <br> half-lives that were assigned to a half-life of 10 hours in the original model ', font=dict(size=15)), xaxis_title='Protein ID', yaxis_title='Clim Half Life (min)')
fig.update_xaxes(visible=False)  # one tick per protein ID would be unreadable
fig.update_layout(autosize=False, width=700, height=600, showlegend=True)

# The figure won't open inside PyCharm, so save it as a standalone HTML file instead.
# The file name now contains "priority": the original reused the exact Plot #6 file
# name, so running this cell overwrote the Plot #6 HTML.
out_pth = "~/wcEcoli/out/random_plotlys/" + current_sequence + "_Clim0_ordered_with_priority_protein_substrates_and_colored_proposal.html"
out_pth = os.path.expanduser(out_pth)
output_dir = os.path.dirname(out_pth)
os.makedirs(output_dir, exist_ok=True)  # Create the directory if it does not exist
fig.write_html(out_pth, auto_open=True) # writes the HTML file and opens it in the default web browser


In [None]:
# Show the full-model 'Additive' priority substrates whose gene symbol does not occur in
# CLClimNE_Clim0_log_data_with_Additive_proteases.
# NOTE(review): as assigned in the Plot 7 cell, that frame is the unfiltered copy of
# priority_protease_df (the 'Additive'-only slice is CLClimNE_log_data_with_Additive_proteases),
# so this compares against every 10-hour protein - confirm that is the intended comparison.
CLClimNE_full_log_data_with_Additive_proteases[~CLClimNE_full_log_data_with_Additive_proteases['gene symbol'].isin(CLClimNE_Clim0_log_data_with_Additive_proteases['gene symbol'])]
# todo: Note to self: in the final sort method, make sure these proteins are included/reassigned to Clim0 rates! 

# Other interesting things: 

In [None]:
# Flag which priority substrates are essential genes.
# NOTE(review): the frame built in the "priority proteins" cell is named
# priority_substrates_in_the_full_model_df; confirm that
# priority_substrates_in_the_model_df is defined elsewhere and is the intended frame.
priority_substrates_in_the_model_df["essential gene"] = 'no'
protein_ids = priority_substrates_in_the_model_df['Protein ID']
# determine which priority substrates are essential genes:
PS_essential_gene_symbols_found = []
PS_essential_monomer_ids_found = []
PS_HL_under_44_mins = []

# Run-time fix: fetch each helper result once instead of on every loop iteration,
# and test membership against a set. (Assumes the helpers return the same data on
# every call and that gene IDs are hashable - TODO confirm.)
gene_ids_by_monomer_id = get_gene_ids_for_monomer_ids()
gene_symbols_by_monomer_id = get_gene_symbols_for_monomer_ids()
essential_gene_ids = set(get_essential_genes())

# find the essential genes among the priority substrates:
for protein in protein_ids:
    # get the gene ID for the monomer ID:
    gene_id = gene_ids_by_monomer_id[protein]
    if gene_id in essential_gene_ids:
        PS_essential_gene_symbols_found.append(gene_symbols_by_monomer_id[protein])
        PS_essential_monomer_ids_found.append(protein)

# mark every essential protein in one vectorized assignment
priority_substrates_in_the_model_df.loc[protein_ids.isin(PS_essential_monomer_ids_found), 'essential gene'] = 'yes'

print("Essential Genes Found in Clim0 (" +str(len(PS_essential_gene_symbols_found))+"): ", PS_essential_gene_symbols_found)


2 mins

In [None]:
# Select the proteins whose CLNE N-end rule half life is BELOW 10 minutes. These
# are the proteins the 2-minute figure describes as "assigned to a half-life of
# 2 minutes in the original model", hence the `_2mins` suffix. (The earlier
# comment said "greater than", which contradicted the code.)
df_2mins = df[df['CLNE NE half life (min)'] < 10].copy()
monomer_ids_2mins = df_2mins['Protein ID'] # todo: do I need this??

# Fetch the monomer ID -> gene symbol mapping once instead of once per row.
# (Assumes the helper returns the same mapping on every call - TODO confirm.)
gene_symbols_by_monomer_id = get_gene_symbols_for_monomer_ids()
df_2mins['gene symbol'] = df_2mins['Protein ID'].apply(lambda x: gene_symbols_by_monomer_id[x])

# only the Clim0 half life (plus IDs, source and gene symbol) is needed from here on
df_2mins = df_2mins.drop(columns=["CLNE NE rate constant (s^1)", "CLNE NE half life (min)", "CLClim0NE Clim0 rate constant (s^1)"])
df_2mins

In [None]:
# order the 2-minute dataframe, then resplit the data

# sort the 2-minute proteins by their Clim0 half life (shortest first)
ordered_2min_df = df_2mins.sort_values("CLClim0NE Clim0 half life (min)")

# split the ordered frame by its half-life source label:
hl_source_2min = ordered_2min_df['HL Source']
ordered_Clim_2min_df = ordered_2min_df[hl_source_2min.eq(Clim_name)]
ordered_Clim_2min_PS_df = ordered_2min_df[hl_source_2min.eq(Clim_name + " (protease substrate)")]

In [None]:
# finally, generate the graph for the 2-minute proteins

# Faint background layer with every protein. px.scatter already returns a new
# figure, so no separate go.Figure() has to be created first.
fig = px.scatter(ordered_2min_df, x=ordered_2min_df["Protein ID"], y=ordered_2min_df["CLClim0NE Clim0 half life (min)"])
fig.update_traces(marker_size=.5, opacity=.1)

# overlay the two half-life sources
# NOTE(review): both marker_size and marker['size'] are passed, exactly as in the
# original calls - confirm which of the two plotly ends up applying.
fig.add_trace(go.Scatter(
    x=ordered_Clim_2min_df['Protein ID'],
    y=ordered_Clim_2min_df["CLClim0NE Clim0 half life (min)"],
    mode='markers',
    name=Clim_name + ', ' + str(np.shape(ordered_Clim_2min_df['Protein ID'])[0]),
    hovertext=ordered_Clim_2min_df['gene symbol'],
    marker_size=8,
    marker=dict(color="lightseagreen", size=3, opacity=.6),
    yaxis="y1"))
fig.add_trace(go.Scatter(
    x=ordered_Clim_2min_PS_df['Protein ID'],
    y=ordered_Clim_2min_PS_df["CLClim0NE Clim0 half life (min)"],
    mode='markers',
    name=Clim_name + ' (protease substrates), ' + str(np.shape(ordered_Clim_2min_PS_df['Protein ID'])[0]),
    hovertext=ordered_Clim_2min_PS_df['gene symbol'],
    marker_size=8,
    marker=dict(color="orange", size=3, opacity=.6),
    yaxis="y1"))

# add a line at 600 minutes:
#fig.add_shape(type="line", x0=0, x1=1653, y0=600, y1=600, line=dict(color="black", width=.5,))

# Plot Specs (title typo "assinged" corrected); the legend is switched off for this figure
fig.update_layout(title=dict(text='The C-limited (Clim) half life values for the New WCM proteins assigned to Clim <br> half-lives that were assigned to a half-life of 2 minutes in the original model ', font=dict(size=15)), xaxis_title='Protein ID', yaxis_title='Clim Half Life (min)')
fig.update_xaxes(visible=False)  # one tick per protein ID would be unreadable
fig.update_layout(autosize=False, width=700, height=600, showlegend=False)

# The figure won't open inside PyCharm, so save it as a standalone HTML file instead:
out_pth = "~/wcEcoli/out/random_plotlys/" + current_sequence + "_Clim0_ordered_with_protein_substrates_proposal_2mins.html"
out_pth = os.path.expanduser(out_pth)
output_dir = os.path.dirname(out_pth)
os.makedirs(output_dir, exist_ok=True)  # Create the directory if it does not exist
fig.write_html(out_pth, auto_open=True) # writes the HTML file and opens it in the default web browser


## Change in stability


In [None]:
# Count how many proteins in df_10hrs now have a Clim half life above / below
# 10 hours (600 minutes). Comparisons are vectorized and strict, so a half life
# exactly equal to the threshold is counted in neither group.
more_stable_10_hrs = (df_10hrs['CLClim0NE Clim0 half life (min)'] > 600).sum()
less_stable_10_hrs = (df_10hrs['CLClim0NE Clim0 half life (min)'] < 600).sum()
print(f"More stable than 10 hours: {more_stable_10_hrs}, less stable than 10 hours: {less_stable_10_hrs}")

# Same count for the proteins in df_2mins, against a 2 minute threshold:
more_stable_2_mins = (df_2mins['CLClim0NE Clim0 half life (min)'] > 2).sum()
less_stable_2_mins = (df_2mins['CLClim0NE Clim0 half life (min)'] < 2).sum()
print(f"More stable than 2 minutes: {more_stable_2_mins}, less stable than 2 minutes: {less_stable_2_mins}")



In [None]:
# Label every protein by whether its Clim half life is above the threshold of
# its group: 600 minutes (10 hours) for df_10hrs, 2 minutes for df_2mins.
# Anything that is not strictly above the threshold (including a value equal
# to it, or a missing value) is labelled "Less Stable".
df_10hrs['Stability'] = np.where(df_10hrs['CLClim0NE Clim0 half life (min)'] > 600, "More Stable", "Less Stable")
df_2mins['Stability'] = np.where(df_2mins['CLClim0NE Clim0 half life (min)'] > 2, "More Stable", "Less Stable")

# Stack the two labelled dataframes into one:
df_stability = pd.concat([df_10hrs, df_2mins], ignore_index=True)

# Save the dataframe to a csv file.
# NOTE(review): this writes into the Clim0_using_biorxiv_2023_data/CLClim0NE
# folder -- confirm that is the intended destination for this notebook.
out_pth = "~/wcEcoli/models/ecoli/analysis/local_notebooks/C_limited_PDR_analyses_with_published_paper_data/Clim_sorting_combos/Clim0_using_biorxiv_2023_data/CLClim0NE/Clim_stability_change.csv"
out_pth = os.path.expanduser(out_pth)
os.makedirs(os.path.dirname(out_pth), exist_ok=True)  # to_csv fails if the folder does not exist
df_stability.to_csv(out_pth, index=False)


# Random stuff

In [None]:
# Print the lengths of the three df_CLClim0NE_* frames, then of df_CLClim0NE itself.
print(*map(len, (df_CLClim0NE_CL, df_CLClim0NE_Clim, df_CLClim0NE_NE, df_CLClim0NE)))

In [None]:
# Load the full CLNE table. The path is expanded from the home directory
# (like the other paths in this notebook) instead of hard-coding one user's
# absolute path.
CLNE_full_df = pd.read_excel(os.path.expanduser('~/wcEcoli/models/ecoli/analysis/local_notebooks/C_limited_PDR_analyses_with_published_paper_data/previous_models/CLNE_files/CLNE_full.xlsx'))

# Count how many rows of CLNE_full_df have a Protein ID that also appears in
# df_CLClim0NE. Printed explicitly: a bare expression in the middle of a cell
# is not displayed.
n_shared_protein_ids = CLNE_full_df['Protein ID'].isin(df_CLClim0NE['Protein ID']).sum()
print(f"Protein IDs in CLNE_full_df that are also in df_CLClim0NE: {n_shared_protein_ids}")

# Show the rows of df_CLClim0NE whose Protein ID is NOT present in CLNE_full_df:
df_CLClim0NE[~df_CLClim0NE['Protein ID'].isin(CLNE_full_df['Protein ID'])]


In [None]:
# Display the CLNE table that was read from CLNE_full.xlsx.
CLNE_full_df

