In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
os.chdir(os.path.expanduser(
    '~/wcEcoli/models/ecoli/analysis/local_notebooks/C_limited_PDR_analyses_with_published_paper_data/Clim_sorting_combos/Clim0_using_biorxiv_2023_data/CLClim0NE/'))
import plotly.graph_objects as go
import glob
import plotly.express as px

# C-lim half life experimental data (from first Clim sort, using values from the 2023 bioRxiv preprint) 

The data used for the Clim rates in this analysis (the first analysis of the Clim rates) comes from this preprint: https://www.biorxiv.org/content/10.1101/2022.08.01.502339v2. The paper has since been published (Gupta et al., 2024) and can be found here: https://www.nature.com/articles/s41467-024-49920-8#Sec20

Specifically, this data comes from Supplementary Table 1: "TableS1 - Half-lives for 3262 proteins across 13 growth conditions >>", which contains the half-lives for proteins in E. coli under carbon-limited conditions. This is the data we are interested in comparing to our model predictions, as the conditions are most similar to our base case.

NOTE that this data has some "ceiling" values that proteins are automatically assigned to (as defined in the published paper). In the original Clim sort done using the bioRxiv preprint (which is the sort analyzed here), we were not aware that these were ceiling values. Thus, they are included in this version, but will likely be removed in later Clim rate sorts for half-life value extractions. Also, any protein that was missing a reported value for any one of its doubling time replicates (six total) was excluded from this sort (leaving roughly 1912 proteins assigned to Clim rates in this CLClim0NE sort, 7 assigned to CL, and 2391 to NE).

ALSO NOTE: even though this file is in the published paper notebook folder, it is still based on the initial bioRxiv preprint data sort. Future sorts will be done with the new published paper data.


## Find essential genes in the Clim0 data

This is done by first finding the gene ID for each of the monomer IDs in the Clim0 data, and then comparing these gene IDs to the essential genes list from the model.

In [20]:
# FUNCTIONS

# get the gene IDs for each monomer IDs:
def get_gene_ids_for_monomer_ids():
    """
    Build a lookup from monomer ID to gene ID using the model's rnas.tsv flat file.

    Every data row contributes one entry: the first ID found in the row's
    'monomer_ids' field is mapped to the row's 'gene_id' value. If the same
    key occurs on several rows, the last row wins.

    Returns:
        dict mapping monomer ID -> gene ID
    """
    # code adapted from convert_to_flat.py
    import io
    from wholecell.io import tsv
    from wholecell.utils.filepath import ROOT_PATH

    rnas_path = os.path.join(ROOT_PATH, 'reconstruction', 'ecoli', 'flat', 'rnas.tsv')
    with io.open(rnas_path, 'rb') as rnas_file:
        rows = tsv.reader(rnas_file, delimiter='\t')

        # skip any leading '#' comment rows; the first other row holds the column names
        column_names = next(rows)
        while column_names[0].startswith('#'):
            column_names = next(rows)

        gene_col = column_names.index('gene_id')
        monomer_col = column_names.index('monomer_ids')

        # The [2:-2] slice drops two characters from each end of the field before
        # splitting on '", "' -- presumably the '["' / '"]' wrapper of a JSON-style
        # list of quoted IDs (TODO confirm against rnas.tsv).
        # NOTE(review): a field with no ID in it (e.g. '[]') yields '' as the key.
        lookup = {
            row[monomer_col][2:-2].split('", "')[0]: row[gene_col]
            for row in rows
        }

    return lookup
    
def get_gene_symbols_for_monomer_ids():
    """
    Build a lookup from monomer ID to gene symbol using the model's rnas.tsv flat file.

    Every data row contributes one entry: the first ID found in the row's
    'monomer_ids' field is mapped to the row's 'common_name' value. If the
    same key occurs on several rows, the last row wins.

    Returns:
        dict mapping monomer ID -> gene symbol (the 'common_name' column)
    """
    # code adapted from convert_to_flat.py
    import io
    from wholecell.io import tsv
    from wholecell.utils.filepath import ROOT_PATH

    rnas_path = os.path.join(ROOT_PATH, 'reconstruction', 'ecoli', 'flat', 'rnas.tsv')
    with io.open(rnas_path, 'rb') as rnas_file:
        rows = tsv.reader(rnas_file, delimiter='\t')

        # skip any leading '#' comment rows; the first other row holds the column names
        column_names = next(rows)
        while column_names[0].startswith('#'):
            column_names = next(rows)

        symbol_col = column_names.index('common_name')
        monomer_col = column_names.index('monomer_ids')

        # The [2:-2] slice drops two characters from each end of the field before
        # splitting on '", "' -- presumably the '["' / '"]' wrapper of a JSON-style
        # list of quoted IDs (TODO confirm against rnas.tsv).
        # NOTE(review): a field with no ID in it (e.g. '[]') yields '' as the key.
        lookup = {
            row[monomer_col][2:-2].split('", "')[0]: row[symbol_col]
            for row in rows
        }

    return lookup
    
    
# convert gene IDs to monomer IDs:
def get_monomer_ids_for_gene_ids():
    """
    Build a lookup from gene ID to monomer ID using the model's rnas.tsv flat file.

    Every data row contributes one entry: the row's 'gene_id' value is mapped
    to the first ID found in the row's 'monomer_ids' field. If the same gene ID
    occurs on several rows, the last row wins.

    Returns:
        dict mapping gene ID -> monomer ID
    """
    # code adapted from convert_to_flat.py
    import io
    from wholecell.io import tsv
    from wholecell.utils.filepath import ROOT_PATH

    rnas_path = os.path.join(ROOT_PATH, 'reconstruction', 'ecoli', 'flat', 'rnas.tsv')
    with io.open(rnas_path, 'rb') as rnas_file:
        rows = tsv.reader(rnas_file, delimiter='\t')

        # skip any leading '#' comment rows; the first other row holds the column names
        column_names = next(rows)
        while column_names[0].startswith('#'):
            column_names = next(rows)

        gene_col = column_names.index('gene_id')
        monomer_col = column_names.index('monomer_ids')

        # The [2:-2] slice drops two characters from each end of the field before
        # splitting on '", "' -- presumably the '["' / '"]' wrapper of a JSON-style
        # list of quoted IDs (TODO confirm against rnas.tsv).
        # NOTE(review): a field with no ID in it (e.g. '[]') maps the gene ID to ''.
        lookup = {
            row[gene_col]: row[monomer_col][2:-2].split('", "')[0]
            for row in rows
        }

    return lookup
    
    
# read the genes defined as "essential" in the model's validation data, and return them as a list of gene IDs (the 'FrameID' column):
def get_essential_genes():
    """
    Read the essential genes listed in the model's essential_genes.tsv validation file.

    Returns:
        list with the 'FrameID' value of every data row, in file order
    """
    # code adapted from convert_to_flat.py
    import io
    from wholecell.io import tsv
    from wholecell.utils.filepath import ROOT_PATH

    genes_path = os.path.join(ROOT_PATH, 'validation', 'ecoli', 'flat', 'essential_genes.tsv')
    with io.open(genes_path, 'rb') as genes_file:
        rows = tsv.reader(genes_file, delimiter='\t')

        # skip any leading '#' comment rows; the first other row holds the column names
        column_names = next(rows)
        while column_names[0].startswith('#'):
            column_names = next(rows)

        frame_id_col = column_names.index('FrameID')
        frame_ids = [row[frame_id_col] for row in rows]

    return frame_ids
    


In [24]:
# load in the Clim0 data:
Clim0_data = pd.read_csv('Clim_rate_constants.csv')
monomer_ids = Clim0_data['Protein ID']

# Build each lookup once, before the loop. Every call to one of these helpers
# re-reads its flat file from disk, so calling them once per monomer (as this
# cell previously did) dominated the run time.
monomer_ids_to_gene_ids = get_gene_ids_for_monomer_ids()
monomer_ids_to_gene_symbols = get_gene_symbols_for_monomer_ids()
# a set gives constant-time membership tests instead of scanning a list
essential_gene_ids = set(get_essential_genes())

Clim0_essential_gene_symbols_found = []
Clim0_essential_monomer_ids_found = []

# find the essential genes in the Clim0 data:
for monomer in monomer_ids:
    # get the gene ID for the monomer ID:
    gene_id = monomer_ids_to_gene_ids[monomer]
    if gene_id in essential_gene_ids:
        Clim0_essential_gene_symbols_found.append(monomer_ids_to_gene_symbols[monomer])
        Clim0_essential_monomer_ids_found.append(monomer)

print("Essential Genes Found in Clim0 (" +str(len(Clim0_essential_gene_symbols_found))+"): ", Clim0_essential_gene_symbols_found)

# to free up memory:
del Clim0_data, monomer_ids

Essential Genes Found in Clim0 (341):  ['leuA', 'ubiE', 'ubiB', 'panB', 'leuB', 'fabG', 'ubiA', 'bioF', 'argE', 'adk', 'purA', 'cysC', 'purH', 'purM', 'cysJ', 'trpE', 'argS', 'argH', 'argG', 'aroA', 'aroB', 'aroC', 'aroE', 'purB', 'asnS', 'asd', 'pyrB', 'thrA', 'metL', 'aspS', 'atpB', 'atpC', 'hisG', 'accB', 'cysI', 'accC', 'bioB', 'birA', 'ilvE', 'accA', 'accD', 'carB', 'pheA', 'tyrA', 'gltA', 'kdsB', 'pyrG', 'hisF', 'cydA', 'cydC', 'cysN', 'cysS', 'metC', 'bioA', 'lysA', 'dapA', 'folA', 'pyrC', 'pyrD', 'dapB', 'ilvD', 'ribB', 'tmk', 'dut', 'dxr', 'dxs', 'bioH', 'cca', 'lnt', 'dicA', 'dnaB', 'dnaC', 'dnaE', 'dnaN', 'era', 'ffh', 'frr', 'ftsA', 'ftsI', 'ftsY', 'ftsZ', 'fusA', 'ispG', 'grpE', 'gyrA', 'gyrB', 'infA', 'infC', 'lepB', 'ligA', 'map', 'minD', 'minE', 'groS', 'mrdA', 'mrdB', 'mreB', 'mreC', 'msbA', 'mukB', 'nusA', 'nusG', 'parE', 'prfA', 'pth', 'lptE', 'rnc', 'rne', 'rplB', 'rplC', 'rplD', 'rplE', 'rplF', 'rplJ', 'rplL', 'rplM', 'rplN', 'rplO', 'rplP', 'rplQ', 'rplR', 'rplS',

In [23]:
# for my own reference, find the number of essential genes in the NE and CL groups in CLClim0NE too:

# Build each lookup once and share it between the CL and NE loops. Every call to
# one of these helpers re-reads its flat file from disk, so calling them once
# per monomer is needlessly slow.
monomer_ids_to_gene_ids = get_gene_ids_for_monomer_ids()
monomer_ids_to_gene_symbols = get_gene_symbols_for_monomer_ids()
# a set gives constant-time membership tests instead of scanning a list
essential_gene_ids = set(get_essential_genes())

CL_data = pd.read_csv('CL_rate_constants.csv')
CL_monomer_ids = CL_data['Protein ID']

CL_essential_gene_symbols_found = []
for monomer in CL_monomer_ids:
    # get the gene ID for the monomer ID:
    gene_id = monomer_ids_to_gene_ids[monomer]
    if gene_id in essential_gene_ids:
        CL_essential_gene_symbols_found.append(monomer_ids_to_gene_symbols[monomer])

print("Essential Genes Found in CL (" +str(len(CL_essential_gene_symbols_found))+"): ", CL_essential_gene_symbols_found)

NE_data = pd.read_csv('NE_rate_constants.csv')
NE_monomer_ids = NE_data['Protein ID']

NE_essential_gene_symbols_found = []
for monomer in NE_monomer_ids:
    # get the gene ID for the monomer ID:
    gene_id = monomer_ids_to_gene_ids[monomer]
    if gene_id in essential_gene_ids:
        NE_essential_gene_symbols_found.append(monomer_ids_to_gene_symbols[monomer])

print("Essential Genes Found in NE (" +str(len(NE_essential_gene_symbols_found))+"): ", NE_essential_gene_symbols_found)

# to free up memory:
del CL_data, NE_data, CL_monomer_ids, NE_monomer_ids

Essential Genes Found in CL (3):  ['carA', 'cdsA', 'bioD']
Essential Genes Found in NE (60):  ['plsC', 'trpD', 'panD', 'yagG', 'dfp', 'bioC', 'ftsQ', 'ftsW', 'lspA', 'mazE', 'mreD', 'rnpA', 'rpmC', 'rpmH', 'ftsL', 'secM', 'yihA', 'chpS', 'leuL', 'ydfB', 'tadA', 'holB', 'ispF', 'alsK', 'lapC', 'lgt', 'yhhQ', 'prmC', 'yabQ', 'yefM', 'entD', 'wzyE', 'murJ', 'yceQ', 'ymfK', 'racR', 'ydiL', 'exoX', 'zipA', 'ftsB', 'yqgD', 'mqsA', 'tdcF', 'ubiV', 'igaA', 'lptF', 'hemA', 'folK', 'metA', 'nadK', 'pabA', 'coaD', 'mraY', 'pgsA', 'rpoH', 'secE', 'lpxK', 'tnaB', 'ispU', 'lolD']


In [26]:
# I would like to see if Lon or Clp are in this out of curiosity:

# Build the two lookups once; each helper call re-reads rnas.tsv from disk, so
# calling them once per essential gene is needlessly slow.
gene_ids_to_monomer_ids = get_monomer_ids_for_gene_ids()
monomer_ids_to_gene_symbols = get_gene_symbols_for_monomer_ids()

# translate each essential gene ID -> monomer ID -> gene symbol
essential_gene_symbols = []
for gene in get_essential_genes():
    essential_gene_symbols.append(monomer_ids_to_gene_symbols[gene_ids_to_monomer_ids[gene]])

print("Essential Genes in Model (" +str(len(essential_gene_symbols))+"): ", essential_gene_symbols)

del essential_gene_symbols

Essential Genes in Model:  ['argG', 'bioA', 'bioB', 'bioC', 'bioD', 'bioF', 'bioH', 'ilvE', 'cysG', 'ubiE', 'exoX', 'hflD', 'argA', 'argE', 'argH', 'aroA', 'aroB', 'aroC', 'aroE', 'cysC', 'cysE', 'cysH', 'cysI', 'cysJ', 'cysN', 'glyA', 'hisA', 'hisB', 'hisC', 'hisD', 'hisF', 'hisG', 'hisH', 'hisI', 'ilvA', 'ilvC', 'ilvD', 'leuA', 'leuB', 'leuC', 'leuD', 'lysA', 'metA', 'metB', 'metC', 'metF', 'metL', 'pabA', 'pabB', 'pheA', 'proA', 'proB', 'proC', 'serA', 'serB', 'serC', 'thrA', 'thrB', 'thrC', 'trpA', 'trpB', 'trpC', 'trpD', 'trpE', 'tyrA', 'carA', 'carB', 'guaA', 'guaB', 'purA', 'purC', 'purD', 'purE', 'purF', 'purH', 'purK', 'purL', 'purM', 'pyrB', 'pyrC', 'pyrD', 'pyrE', 'pyrF', 'thyA', 'iscS', 'nadA', 'nadB', 'nadC', 'panB', 'panC', 'panD', 'pdxA', 'pdxB', 'pdxH', 'pdxJ', 'cysB', 'leuL', 'metR', 'ptsI', 'atpB', 'atpC', 'glnA', 'gltA', 'icd', 'ppc', 'panZ', 'ribF', 'lspA', 'ispH', 'dapB', 'folA', 'lptD', 'yabQ', 'ftsL', 'ftsI', 'murE', 'murF', 'mraY', 'murD', 'ftsW', 'murG', 'murC'

# Determining which proteins are assigned to a protease (according to Gupta et al., 2024) and are essential in the CLClim0NE sorted model

This data is from an adapted version of Supplementary Table 2 in the published paper ("TableS2 - Assignment of protein substrates to proteases >>"). Of the proteins measured in the paper, this table lists those that were assigned to each of the proteases. The supplementary table has been adapted for use in this analysis, so the full detail about each data column is not included here (but can be found in the original paper).


In [36]:
# load in the protease-substrate assignment datatable: 
protease_data = pd.read_excel('~/wcEcoli/models/ecoli/analysis/local_notebooks/C_limited_PDR_analyses_with_published_paper_data/supplementary_data/Gupta_et_al_2024_ST2_protease_assignment_data.xlsx')

# Build the lookups once; each helper call re-reads rnas.tsv from disk, so
# calling them once per essential gene is needlessly slow.
gene_ids_to_monomer_ids = get_monomer_ids_for_gene_ids()
monomer_ids_to_gene_symbols = get_gene_symbols_for_monomer_ids()
# a set of the assigned gene names gives constant-time membership tests
protease_gene_names = set(protease_data['Gene name'].values)

# determine if any of the proteins assigned to a protease are considered an essential gene in the model: 
essential_substrate_gene_symbols = []
for gene in get_essential_genes():
    # translate the essential gene ID -> monomer ID -> gene symbol
    gene_symbol = monomer_ids_to_gene_symbols[gene_ids_to_monomer_ids[gene]]
    if gene_symbol in protease_gene_names:
        essential_substrate_gene_symbols.append(gene_symbol)

print("There are " + str(len(protease_data['Gene name'].values)) + " proteins assigned to a protease in the Gupta et al., 2024 data.")
print("Essential Genes in Model that are Assigned to a Protease (" +str(len(essential_substrate_gene_symbols))+"): ", essential_substrate_gene_symbols)

There are 308 proteins assigned to a protease in the Gupta et al., 2024 data.
Essential Genes in Model that are Assigned to a Protease (57):  ['bioB', 'ilvE', 'argA', 'aroE', 'cysH', 'cysI', 'cysJ', 'cysN', 'leuA', 'leuC', 'leuD', 'lysA', 'thrA', 'trpB', 'tyrA', 'iscS', 'nadA', 'nadB', 'metR', 'ispH', 'ftsI', 'ftsQ', 'ftsZ', 'lpxC', 'erpA', 'frr', 'dxr', 'ispU', 'dnaE', 'thiL', 'dnaX', 'ftsK', 'rpsA', 'mukB', 'acpP', 'minE', 'hemA', 'folE', 'ligA', 'dapA', 'ispG', 'era', 'ffh', 'metK', 'parE', 'ribB', 'obgE', 'def', 'rplX', 'dnaA', 'birA', 'rpoB', 'rpoC', 'ubiA', 'lexA', 'dnaB', 'dnaC']


In [35]:
# determine which proteins are assigned to proteases in the Clim0, CL, and NE data (does not matter if they are essential or not):

def _find_protease_substrates(rate_constants_csv, monomer_ids_to_gene_symbols, protease_gene_names):
    """
    Find the proteins in one rate-constant file that are assigned to a protease.

    Args:
        rate_constants_csv: path to a CSV file with a 'Protein ID' column of monomer IDs
        monomer_ids_to_gene_symbols: dict mapping monomer ID -> gene symbol
        protease_gene_names: container of the gene names assigned to a protease

    Returns:
        (gene symbols, monomer IDs) of the matching proteins, as two parallel
        lists in the order the monomers appear in the file

    Raises:
        KeyError: if a monomer ID in the file is missing from the lookup
    """
    # the loaded table is local to this call, so it is released on return
    monomer_ids = pd.read_csv(rate_constants_csv)['Protein ID']
    substrate_gene_symbols = []
    substrate_monomer_ids = []
    for monomer in monomer_ids:
        # get the gene symbol for the monomer ID:
        gene_symbol = monomer_ids_to_gene_symbols[monomer]
        if gene_symbol in protease_gene_names:
            substrate_gene_symbols.append(gene_symbol)
            substrate_monomer_ids.append(monomer)
    return substrate_gene_symbols, substrate_monomer_ids


# Build the lookups once and share them across the three data sets. Each call to
# get_gene_symbols_for_monomer_ids() re-reads rnas.tsv from disk, so calling it
# once per monomer is needlessly slow.
monomer_ids_to_gene_symbols = get_gene_symbols_for_monomer_ids()
# a set of the assigned gene names gives constant-time membership tests
protease_gene_names = set(protease_data['Gene name'].values)

Clim0_protease_substrate_gene_symbols, Clim0_protease_substrate_monomer_ids = _find_protease_substrates(
    'Clim_rate_constants.csv', monomer_ids_to_gene_symbols, protease_gene_names)

print("Total Protease Substrates in Clim0 (" +str(len(Clim0_protease_substrate_gene_symbols))+"): ", Clim0_protease_substrate_gene_symbols)

# repeat for CL
CL_protease_substrate_gene_symbols, CL_protease_substrate_monomer_ids = _find_protease_substrates(
    'CL_rate_constants.csv', monomer_ids_to_gene_symbols, protease_gene_names)

print("Total Protease Substrates in CL (" +str(len(CL_protease_substrate_gene_symbols))+"): ", CL_protease_substrate_gene_symbols)

# repeat for NE
NE_protease_substrate_gene_symbols, NE_protease_substrate_monomer_ids = _find_protease_substrates(
    'NE_rate_constants.csv', monomer_ids_to_gene_symbols, protease_gene_names)

print("Total Protease Substrates in NE (" +str(len(NE_protease_substrate_gene_symbols))+"): ", NE_protease_substrate_gene_symbols)

# note that for CLClim0NE, the breakdown appears to be as follows: Clim0 (273), CL (0), NE (25). There are 308 proteins assigned to proteases in the Gupta et al., 2024 data, and 298 of these appear in the whole cell model.

Total Protease Substrates in Clim0 (273):  ['leuA', 'ubiA', 'ackA', 'acnB', 'fadE', 'glpD', 'cysJ', 'uxaA', 'astA', 'aroD', 'aroE', 'aroG', 'aroK', 'aroL', 'asnA', 'aspA', 'thrA', 'cysI', 'bioB', 'birA', 'ilvE', 'chbB', 'cfa', 'tyrA', 'ubiC', 'cycA', 'cysD', 'cysN', 'dadA', 'lysA', 'dapA', 'nfsB', 'ribB', 'cobT', 'dxr', 'yaaA', 'bisC', 'clpA', 'clpX', 'dedA', 'dedD', 'dksA', 'dnaB', 'dnaC', 'dnaE', 'dnaK', 'dnaQ', 'era', 'ffh', 'frr', 'ftsI', 'ftsZ', 'ispG', 'mnmG', 'greA', 'helD', 'hflX', 'hmp', 'hsdM', 'hypE', 'ligA', 'mazG', 'minE', 'mukB', 'mutS', 'nfo', 'parC', 'parE', 'pcnB', 'priC', 'pspA', 'recA', 'recD', 'recF', 'recJ', 'rhlB', 'rplX', 'rpsA', 'ruvC', 'sbcC', 'selD', 'prlF', 'srmB', 'tonB', 'topB', 'uvrB', 'uvrD', 'ispH', 'sbcD', 'ybaB', 'relE', 'yebC', 'yecA', 'uvrY', 'yibA', 'yidA', 'mioC', 'ybhA', 'rlmD', 'mazF', 'radA', 'rng', 'lipA', 'dinG', 'ahpC', 'ahpF', 'ychF', 'yjgA', 'dps', 'def', 'yigI', 'ycgB', 'ibpA', 'greB', 'grcA', 'yihD', 'yihI', 'yiiQ', 'rsuA', 'radD', 'ccmH'

In [33]:
# check if any of the essential genes in the Clim0 data are assigned to a protease:

# extract the assigned gene names once; a set gives constant-time membership tests
protease_gene_names = set(protease_data['Gene name'].values)

# named with the Clim0_ prefix to match the CL_ / NE_ results for the other groups
Clim0_essential_substrate_gene_symbols = [
    gene for gene in Clim0_essential_gene_symbols_found if gene in protease_gene_names
]

# NOTE: the un-prefixed name is kept so anything that still reads it keeps working,
# but it is also the name the model-wide protease check (the cell that loads
# protease_data) stores its result under, so running this cell overwrites that
# result. Prefer Clim0_essential_substrate_gene_symbols in new code.
essential_substrate_gene_symbols = Clim0_essential_substrate_gene_symbols

print("Essential Genes with Clim0 half-lives that are assigned to a protease (" +str(len(essential_substrate_gene_symbols))+"): ", essential_substrate_gene_symbols)

Essential Genes in Clim0 that are assigned to a protease (54):  ['leuA', 'ubiA', 'cysJ', 'aroE', 'thrA', 'cysI', 'bioB', 'birA', 'ilvE', 'tyrA', 'cysN', 'lysA', 'dapA', 'ribB', 'dxr', 'dnaB', 'dnaC', 'dnaE', 'era', 'ffh', 'frr', 'ftsI', 'ftsZ', 'ispG', 'ligA', 'minE', 'mukB', 'parE', 'rplX', 'rpsA', 'ispH', 'def', 'erpA', 'acpP', 'ftsK', 'iscS', 'obgE', 'folE', 'nadB', 'leuC', 'leuD', 'dnaX', 'argA', 'cysH', 'lexA', 'dnaA', 'metR', 'nadA', 'rpoB', 'rpoC', 'metK', 'thiL', 'trpB', 'lpxC']


In [34]:
# extract the assigned gene names once and share them between the CL and NE checks;
# a set gives constant-time membership tests
protease_gene_names = set(protease_data['Gene name'].values)

# check if any of the essential genes in the CL data are assigned to a protease:
CL_essential_substrate_gene_symbols = [
    gene for gene in CL_essential_gene_symbols_found if gene in protease_gene_names
]

print("Essential Genes with CL half-lives that are assigned to a protease (" +str(len(CL_essential_substrate_gene_symbols))+"): ", CL_essential_substrate_gene_symbols)

# check if any of the essential genes in the NE data are assigned to a protease:
NE_essential_substrate_gene_symbols = [
    gene for gene in NE_essential_gene_symbols_found if gene in protease_gene_names
]

print("Essential Genes with NE half-lives that are assigned to a protease (" +str(len(NE_essential_substrate_gene_symbols))+"): ", NE_essential_substrate_gene_symbols)

Essential Genes with CL half-lives that are assigned to a protease (0):  []
Essential Genes with NE half-lives that are assigned to a protease (3):  ['ftsQ', 'hemA', 'ispU']


next items to do with this info: 
- plot the essential genes in the Clim0 data
- plot the outliers and highlight the essential genes 
- plot the outliers and highlight the protease substrates