In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

import pickle
import os

from models.ecoli.analysis import parcaAnalysisPlot
from wholecell.analysis.analysis_tools import exportFigure
from wholecell.utils import constants
from wholecell.utils import units

In [2]:
# load in the rates:
NE_path ='~/wcEcoli/models/ecoli/analysis/local_notebooks/C_limited_PDR_analyses_with_published_paper_data/previous_models/CLNE_files/CLNE_NE_rates_only.csv'
NE_rates_df = pd.read_csv(NE_path)
CL_path ='~/wcEcoli/models/ecoli/analysis/local_notebooks/C_limited_PDR_analyses_with_published_paper_data/previous_models/CLNE_files/CLNE_CL_rates_only.csv'
CL_rates_df = pd.read_csv(CL_path)

In [3]:
# load in the supplemental data tabel S1 from Gupta et al. 2024:
s1_path ='~/wcEcoli/models/ecoli/analysis/local_notebooks/C_limited_PDR_analyses_with_published_paper_data/supplementary_data/41467_2024_49920_MOESM4_ESM_ST1.xlsx'
table_s1_df = pd.read_excel(s1_path, skiprows=[0, 1, 2, 3])
df = pd.read_excel(s1_path, nrows=2, skiprows=[0, 1, 2], header = None)
media_condition_dict = dict(zip(df.iloc[1], df.iloc[0]))

# create a copy of the table with all the protein names and gene names used in the paper
full_table = table_s1_df[['Protein ID', 'Gene names ']] 

# Create a deep copy
full_list_df= full_table.copy(deep=True)

In [4]:
full_list_df

Unnamed: 0,Protein ID,Gene names
0,sp|A5A614|YCIZ_ECOLI,yciZ
1,sp|O32583|THIS_ECOLI,thiS
2,sp|P00350|6PGD_ECOLI,gnd
3,sp|P00363|FRDA_ECOLI,frdA
4,sp|P00370|DHE4_ECOLI,gdhA
...,...,...
3257,sp|P0AAD4|TYRP_ECOLI,tyrP
3258,sp|P42592|YGJK_ECOLI,ygjK
3259,sp|P69330|CITD_ECOLI,citD
3260,sp|P77294|YDER_ECOLI,ydeR


In [5]:
# Isolate the UniProt ID and gene name from the Protein ID column
full_list_df['UniProt id'] = full_list_df['Protein ID'].str.split('|', expand = True)[1]
full_list_df['UniProt gene name'] = full_list_df['Protein ID'].str.split('|', expand = True)[2]

In [6]:
full_list_df

Unnamed: 0,Protein ID,Gene names,UniProt id,UniProt gene name
0,sp|A5A614|YCIZ_ECOLI,yciZ,A5A614,YCIZ_ECOLI
1,sp|O32583|THIS_ECOLI,thiS,O32583,THIS_ECOLI
2,sp|P00350|6PGD_ECOLI,gnd,P00350,6PGD_ECOLI
3,sp|P00363|FRDA_ECOLI,frdA,P00363,FRDA_ECOLI
4,sp|P00370|DHE4_ECOLI,gdhA,P00370,DHE4_ECOLI
...,...,...,...,...
3257,sp|P0AAD4|TYRP_ECOLI,tyrP,P0AAD4,TYRP_ECOLI
3258,sp|P42592|YGJK_ECOLI,ygjK,P42592,YGJK_ECOLI
3259,sp|P69330|CITD_ECOLI,citD,P69330,CITD_ECOLI
3260,sp|P77294|YDER_ECOLI,ydeR,P77294,YDER_ECOLI


In [7]:
# Now, find the corresponding monomer ID for each UniProt ID using an interactive function
import requests

# Create a session
s = requests.Session() 

# Post login credentials to session:
s.post('https://websvc.biocyc.org/credentials/login/', data={'email':'', 'password':''})

<Response [401]>

In [8]:
def get_ecocyc_id(uniprot_id):
    monomer_id = None
    """
    Fetches the EcoCyc ID for a given gene name.

    Args:
        gene_name: The gene name to search for.

    Returns:
        The EcoCyc Monomer ID, or None if not found.
    """
    # Issue web service request:
    url = f"https://websvc.biocyc.org/ECOLI/foreignid?ids=UniProt:{uniprot_id}"
    response = s.get(url).text.split('\t')
    if response[1] == '1':
        monomer_id = response[2].replace("\n", "")
    return monomer_id

In [9]:
# find the monomer id for each UniProt ID: 
full_list_df['Monomer ID'] = full_list_df['UniProt id'].apply(get_ecocyc_id)
full_list_df

KeyboardInterrupt: 

In [12]:
# save the table to a csv file
#full_list_df.to_csv('~/wcEcoli/reconstruction/ecoli/scripts/protein_half_lives/Gupta_et_al_half_lives_with_EcoCyc_Monomer_IDs/Clim_EcoCyc_Monomer_ID_Matches_11152024.csv')

# load the table from the csv file
full_list_df = pd.read_csv('~/wcEcoli/reconstruction/ecoli/scripts/protein_half_lives/Gupta_et_al_half_lives_with_EcoCyc_Monomer_IDs/Clim_EcoCyc_Monomer_ID_Matches_11152024.csv')
full_list_df

Unnamed: 0.1,Unnamed: 0,Protein ID,Gene names,UniProt id,UniProt gene name,Monomer ID
0,0,sp|A5A614|YCIZ_ECOLI,yciZ,A5A614,YCIZ_ECOLI,MONOMER0-2820
1,1,sp|O32583|THIS_ECOLI,thiS,O32583,THIS_ECOLI,THIS-MONOMER
2,2,sp|P00350|6PGD_ECOLI,gnd,P00350,6PGD_ECOLI,6PGLUCONDEHYDROG-MONOMER
3,3,sp|P00363|FRDA_ECOLI,frdA,P00363,FRDA_ECOLI,FUM-FLAVO
4,4,sp|P00370|DHE4_ECOLI,gdhA,P00370,DHE4_ECOLI,GDHA-MONOMER
...,...,...,...,...,...,...
3257,3257,sp|P0AAD4|TYRP_ECOLI,tyrP,P0AAD4,TYRP_ECOLI,TYRP-MONOMER
3258,3258,sp|P42592|YGJK_ECOLI,ygjK,P42592,YGJK_ECOLI,G7599-MONOMER
3259,3259,sp|P69330|CITD_ECOLI,citD,P69330,CITD_ECOLI,ACPSUB-MONOMER
3260,3260,sp|P77294|YDER_ECOLI,ydeR,P77294,YDER_ECOLI,G6793-MONOMER


In [35]:
# get the common name from rnas.tsv for each monomer ID
rnas = pd.read_csv('~/wcEcoli/reconstruction/ecoli/flat/rnas.tsv', sep='\t',
                   skiprows=[0, 1, 2, 3])
rnas_data = rnas[['common_name', 'gene_id', 'monomer_ids']]
rnas_common_names =  rnas[['monomer_ids','common_name']]

import ast
# Function to evaluate strings as lists
def parse_monomer_ids(row):
    try:
        return ast.literal_eval(row)  # Convert the string to a Python list
    except (ValueError, SyntaxError):
        return []  # Return an empty list if the conversion fails

# make a copy of rnas_common_names
rnas_common_names_copy = rnas_common_names.copy(deep=True)

# Apply the parsing function to the monomer_ids column
rnas_common_names_copy['monomer_ids'] = rnas_common_names_copy['monomer_ids'].apply(parse_monomer_ids)

# Now, monomer_ids is a proper list for each row
rnas_common_names_copy


Unnamed: 0,monomer_ids,common_name
0,[ALARACEBIOSYN-MONOMER],alr
1,[MODB-MONOMER],modB
2,[EG10003-MONOMER],cysZ
3,[EG10004-MONOMER],dfp
4,[DCUB-MONOMER],dcuB
...,...,...
4742,[MHPDHYDROL-MONOMER],mhpD
4743,[MHPF-MONOMER],mhpF
4744,[HCAA2-MONOMER],hcaF
4745,[EG10923-MONOMER],ruvA


In [37]:
# make a separate list of just the monomer_ids that have multiple monomers
common_names_with_multiple_monomer_ids = []
multiple_monomer_ids = []
for i in range(len(rnas_common_names_copy)):
    if len(rnas_common_names_copy.iloc[i]['monomer_ids']) > 1:
        multiple_monomer_ids.append(rnas_common_names_copy.iloc[i]['monomer_ids'])
        common_names_with_multiple_monomer_ids.append(rnas_common_names_copy.iloc[i]['common_name'])
        
multiple_monomer_ids_df = pd.DataFrame({'common_name': common_names_with_multiple_monomer_ids, 'monomer_ids': multiple_monomer_ids})
multiple_monomer_ids_df

Unnamed: 0,common_name,monomer_ids
0,cheA,"[PROTEIN-CHEA, CHEA-SMALL]"
1,clpB,"[MONOMER0-4519, EG10157-MONOMER]"
2,dnaX,"[MONOMER0-2383, EG10245-MONOMER]"
3,infB,"[MONOMER0-4539, MONOMER0-4538, EG10505-MONOMER]"
4,mcrB,"[MONOMER0-4524, EG10574-MONOMER]"
5,mrcB,"[MONOMER0-4521, EG10605-MONOMER]"
6,yibX,"[MONOMER0-4509, MONOMER0-4508]"
7,copA,"[MONOMER0-4382, G6260-MONOMER]"
8,cobB,"[MONOMER0-4534, G6577-MONOMER]"


In [44]:
# loop through each monomer_id in the rnas_common_names table and if it exits in the full_list_df table, add the common name to the full_list_df table

# make a full_list_df copy:
full_list_df_copy = full_list_df.copy(deep=True)

# add a common name column to the full_list_df_copy table
full_list_df_copy['common_name'] = None

# save which monomer_ids have multiple monomers and their gene names: 
multiple_monomer_ids_with_gene_names = []
gene_names_for_multiple_monomer_ids = []

for i in range(len(rnas_common_names_copy['monomer_ids'])):
    monomer_id = rnas_common_names_copy.iloc[i]['monomer_ids']
    if len(monomer_id) > 1: 
        # check if the monomer_id is in the full_list_df_copy table
        for j in range(len(monomer_id)):
            if monomer_id[j] in full_list_df_copy['Monomer ID'].values:
                common_name = rnas_common_names.iloc[i]['common_name']
                full_list_df_copy.loc[full_list_df_copy['Monomer ID'] == monomer_id[j], 'common_name']  = common_name
                multiple_monomer_ids_with_gene_names.append(monomer_id[j])
                gene_names_for_multiple_monomer_ids.append(rnas_common_names_copy.iloc[i]['common_name'])
    elif len(monomer_id) == 0: 
        # some monomer_ids have no common name, so just skip them
        pass
    else: 
        monomer_id = monomer_id[0]
        common_name = rnas_common_names.iloc[i]['common_name']
        if monomer_id in full_list_df_copy['Monomer ID'].values: 
            full_list_df_copy.loc[full_list_df_copy['Monomer ID'] == monomer_id, 'common_name']  = common_name

multiple_monomer_ids_with_gene_names_df = pd.DataFrame({'monomer_ids': multiple_monomer_ids_with_gene_names, 'common_name': gene_names_for_multiple_monomer_ids})            
full_list_df_copy     

['PROTEIN-CHEA', 'CHEA-SMALL']
['MONOMER0-4519', 'EG10157-MONOMER']
['MONOMER0-2383', 'EG10245-MONOMER']
['MONOMER0-4539', 'MONOMER0-4538', 'EG10505-MONOMER']
['MONOMER0-4524', 'EG10574-MONOMER']
['MONOMER0-4521', 'EG10605-MONOMER']
['MONOMER0-4509', 'MONOMER0-4508']
['MONOMER0-4382', 'G6260-MONOMER']
['MONOMER0-4534', 'G6577-MONOMER']


Unnamed: 0.1,Unnamed: 0,Protein ID,Gene names,UniProt id,UniProt gene name,Monomer ID,common_name
0,0,sp|A5A614|YCIZ_ECOLI,yciZ,A5A614,YCIZ_ECOLI,MONOMER0-2820,yciZ
1,1,sp|O32583|THIS_ECOLI,thiS,O32583,THIS_ECOLI,THIS-MONOMER,thiS
2,2,sp|P00350|6PGD_ECOLI,gnd,P00350,6PGD_ECOLI,6PGLUCONDEHYDROG-MONOMER,gnd
3,3,sp|P00363|FRDA_ECOLI,frdA,P00363,FRDA_ECOLI,FUM-FLAVO,frdA
4,4,sp|P00370|DHE4_ECOLI,gdhA,P00370,DHE4_ECOLI,GDHA-MONOMER,gdhA
...,...,...,...,...,...,...,...
3257,3257,sp|P0AAD4|TYRP_ECOLI,tyrP,P0AAD4,TYRP_ECOLI,TYRP-MONOMER,tyrP
3258,3258,sp|P42592|YGJK_ECOLI,ygjK,P42592,YGJK_ECOLI,G7599-MONOMER,ygjK
3259,3259,sp|P69330|CITD_ECOLI,citD,P69330,CITD_ECOLI,ACPSUB-MONOMER,citD
3260,3260,sp|P77294|YDER_ECOLI,ydeR,P77294,YDER_ECOLI,G6793-MONOMER,ydeR


In [45]:
multiple_monomer_ids_with_gene_names_df

Unnamed: 0,monomer_ids,common_name
0,CHEA-SMALL,cheA
1,EG10157-MONOMER,clpB
2,EG10505-MONOMER,infB
3,EG10574-MONOMER,mcrB
4,EG10605-MONOMER,mrcB


In [58]:
# check if any common_names in full_list_df_copy show up for more than one Monomer ID
#common_names = full_list_df_copy['common_name'].unique() # this is likely not 3262 because some are still "None"

# check how many are not unique: 
non_unique_common_names = []
for common_name in range(len(full_list_df_copy['common_name'])): 
    common_name = full_list_df_copy.iloc[common_name]['common_name']
    if len(full_list_df_copy[full_list_df_copy['common_name'] == common_name]) > 1: 
        non_unique_common_names.append(common_name)
print(len(non_unique_common_names)) # 0

# check how many are unique:
unique_common_names = []
for common_name in range(len(full_list_df_copy['common_name'])): 
    common_name = full_list_df_copy.iloc[common_name]['common_name']
    if len(full_list_df_copy[full_list_df_copy['common_name'] == common_name]) == 1: 
        unique_common_names.append(common_name)
print(len(unique_common_names)) # 2354

# check how many are still "None":
none_common_names = 0
for common_name in range(len(full_list_df_copy['common_name'])):
    common_name = full_list_df_copy.iloc[common_name]['common_name']
    if common_name == None: 
        none_common_names += 1

print(none_common_names) # 8

# perfect, adds to 3262

0
3254
8
