In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

import plotly.graph_objects as go
import glob
import plotly.express as px

In [18]:
# Some gene IDs used in the data (Gupta et al., 2024) do not match the ones used by the model.
# This notebook works out which of them may need to be reassigned manually.

# "Sort file": the input that was fed into convert to flat (should be 2431 genes long).
sort_file_path = '~/wcEcoli/reconstruction/ecoli/scripts/protein_half_lives/protein_half_lives_Clim2.tsv'
sort_file = pd.read_csv(
    sort_file_path,
    sep='\t',
)

# "Flat file": the result of the conversion to a flat file (should be about 2306 proteins long).
# The first line of this file is skipped when reading it.
flat_file_path = '~/wcEcoli/reconstruction/ecoli/flat/protein_half_lives_Clim2.tsv'
flat_file = pd.read_csv(
    flat_file_path,
    sep='\t',
    skiprows=[0],
)
flat_file

Unnamed: 0,id,half_life (units.min)
0,1-PFK-MONOMER,766.3
1,2-ISOPROPYLMALATESYN-MONOMER,276.9
2,2-OCTAPRENYL-METHOXY-BENZOQ-METH-MONOMER,493.4
3,2-OCTAPRENYLPHENOL-HYDROX-MONOMER,796.0
4,3-CH3-2-OXOBUTANOATE-OH-CH3-XFER-MONOMER,532.7
...,...,...
2301,YRBF-MONOMER,823.8
2302,YTFQ-MONOMER,790.4
2303,YTFR-MONOMER,654.1
2304,ZNUA-MONOMER,638.1


In [19]:
# convert the proteins in flat_file to gene ids:

# functions:
# get the gene ID for each monomer ID:
def _map_monomer_ids_to_rnas_column(column_name):
    """Map the first monomer ID of every row of rnas.tsv to that row's ``column_name`` value.

    Shared implementation behind get_gene_ids_for_monomer_ids() and
    get_gene_symbols_for_monomer_ids(), which differ only in the column they read.

    Args:
        column_name: header of the rnas.tsv column whose values become the
            dictionary values (e.g. 'gene_id' or 'common_name').

    Returns:
        dict mapping monomer ID (str) to the value found in ``column_name``.
        Only the first monomer ID listed in a row's 'monomer_ids' cell is used
        as a key; if the same key occurs in several rows, the last row wins.

    Raises:
        ValueError: if ``column_name`` or 'monomer_ids' is not a header of the file.
    """
    # code adapted from convert_to_flat.py
    import io
    from wholecell.io import tsv
    from wholecell.utils.filepath import ROOT_PATH

    rnas_file = os.path.join(ROOT_PATH, 'reconstruction', 'ecoli', 'flat', 'rnas.tsv')
    with io.open(rnas_file, 'rb') as f:
        reader = tsv.reader(f, delimiter='\t')

        # Skip the leading '#' comment lines; the first other line holds the headers.
        headers = next(reader)
        while headers[0].startswith('#'):
            headers = next(reader)

        value_index = headers.index(column_name)
        monomer_ids_index = headers.index('monomer_ids')

        monomer_ids_to_values = {}
        for line in reader:
            # The cell looks like '["ID-1", "ID-2"]': strip the outer '["' and '"]',
            # split on the '", "' separators and keep the first ID only.
            first_monomer_id = line[monomer_ids_index][2:-2].split('", "')[0]
            monomer_ids_to_values[first_monomer_id] = line[value_index]

    return monomer_ids_to_values


def get_gene_ids_for_monomer_ids():
    """Return a dict mapping each monomer ID in rnas.tsv to its 'gene_id' value."""
    return _map_monomer_ids_to_rnas_column('gene_id')


def get_gene_symbols_for_monomer_ids():
    """Return a dict mapping each monomer ID in rnas.tsv to its 'common_name' (gene symbol)."""
    return _map_monomer_ids_to_rnas_column('common_name')
    
# Extract the protein (monomer) IDs from flat_file and look up the gene symbol of each one.
monomer_ids = flat_file['id']

# Build the lookup table once, outside the loop: every call to
# get_gene_symbols_for_monomer_ids() opens and parses rnas.tsv again.
monomer_ids_to_gene_symbols = get_gene_symbols_for_monomer_ids()

# Plain dict indexing (not .get) so that a monomer ID missing from rnas.tsv
# still raises a KeyError instead of silently producing a gap.
gene_symbols = [monomer_ids_to_gene_symbols[monomer] for monomer in monomer_ids]
    


In [20]:
# Attach the gene symbol found for every monomer as a new column of the flat file.
# The column is named 'Gene id' (although it holds gene symbols) because it is
# compared against sort_file['Gene id'] further down.
flat_file = flat_file.assign(**{'Gene id': gene_symbols})
flat_file

Unnamed: 0,id,half_life (units.min),Gene id
0,1-PFK-MONOMER,766.3,fruK
1,2-ISOPROPYLMALATESYN-MONOMER,276.9,leuA
2,2-OCTAPRENYL-METHOXY-BENZOQ-METH-MONOMER,493.4,ubiE
3,2-OCTAPRENYLPHENOL-HYDROX-MONOMER,796.0,ubiB
4,3-CH3-2-OXOBUTANOATE-OH-CH3-XFER-MONOMER,532.7,panB
...,...,...,...
2301,YRBF-MONOMER,823.8,mlaF
2302,YTFQ-MONOMER,790.4,ytfQ
2303,YTFR-MONOMER,654.1,ytfR
2304,ZNUA-MONOMER,638.1,znuA


In [21]:
# Gene names used by the sort file vs. the ones recovered for the flat file.
sort_file_genes = sort_file['Gene id']
flat_file_genes = flat_file['Gene id']

# Keep only the sort-file genes that have no counterpart in the flat file.
missing_from_flat = ~sort_file_genes.isin(flat_file_genes)
genes_in_sort_not_flat = sort_file_genes.loc[missing_from_flat]
genes_in_sort_not_flat  # 125 genes are in the sort file but not in the flat file

15      yliE
24      yifL
31      hexR
33      yliF
37      csiD
        ... 
2391     gmr
2396    ylaB
2409    yciV
2423    yqiC
2424    ygbT
Name: Gene id, Length: 125, dtype: object

In [22]:
# Read the model's rnas.tsv, skipping its first four lines.
# The default integer index is kept on purpose: the cells below look rows up by position.
rnas_path = '~/wcEcoli/reconstruction/ecoli/flat/rnas.tsv'
rnas = pd.read_csv(
    rnas_path,
    sep='\t',
    skiprows=[0, 1, 2, 3],
)
rnas

Unnamed: 0,id,common_name,synonyms,type,modified_forms,gene_id,monomer_ids,anticodon,coding_segments
0,EG10001_RNA,alr,"[""alr"", ""alr5"", ""b4053"", ""ECK4045""]",mRNA,[],EG10001,"[""ALARACEBIOSYN-MONOMER""]",,[]
1,EG10002_RNA,modB,"[""modB"", ""chlJ"", ""tslJ"", ""b0764"", ""ECK0753""]",mRNA,[],EG10002,"[""MODB-MONOMER""]",,[]
2,EG10003_RNA,cysZ,"[""cysZ"", ""b2413"", ""ECK2408""]",mRNA,[],EG10003,"[""EG10003-MONOMER""]",,[]
3,EG10004_RNA,dfp,"[""dfp"", ""coaBC"", ""b3639"", ""ECK3629""]",mRNA,[],EG10004,"[""EG10004-MONOMER""]",,[]
4,EG10006_RNA,dcuB,"[""dcuB"", ""genF"", ""b4123"", ""ECK4116""]",mRNA,[],EG10006,"[""DCUB-MONOMER""]",,[]
...,...,...,...,...,...,...,...,...,...
4742,M013_RNA,mhpD,"[""mhpD"", ""mhpS"", ""b0350"", ""ECK0347""]",mRNA,[],M013,"[""MHPDHYDROL-MONOMER""]",,[]
4743,M014_RNA,mhpF,"[""mhpF"", ""b0351"", ""ECK0348""]",mRNA,[],M014,"[""MHPF-MONOMER""]",,[]
4744,M015_RNA,hcaF,"[""hcaF"", ""yfhV"", ""hcaA"", ""phdC2"", ""digB"", ""hca...",mRNA,[],M015,"[""HCAA2-MONOMER""]",,[]
4745,RUVA_RNA,ruvA,"[""ruvA"", ""b1861"", ""ECK1862""]",mRNA,[],RUVA,"[""EG10923-MONOMER""]",,[]


In [23]:
# Check whether the mismatched sort-file genes appear in rnas.tsv as synonyms.
# Each entry of rnas['synonyms'] is the raw text of a list of double-quoted names,
# e.g. '["pdeI", "yliE", "b0833", "ECK0823"]'.
rnas_synonyms = rnas['synonyms']
rnas_common_names = rnas['common_name']

# One entry per gene of genes_in_sort_not_flat, in the same order:
synonyms = []     # synonyms text of the matching row, or 0 when nothing matched
model_value = []  # common_name the model uses for that row, or "None"
index_value = []  # position of the matching row in rnas, or "None"

for gene in genes_in_sort_not_flat:
    quoted_gene = '"' + gene + '"'
    match_index = None
    for index, synonym in enumerate(rnas_synonyms):
        if quoted_gene in synonym:
            # Whole-name match: the gene is listed exactly as one of the synonyms.
            # This always wins over a partial match found in an earlier row.
            match_index = index
            break
        if match_index is None and gene in synonym:
            # Partial match (the name is part of a longer synonym, e.g. hexR in
            # "hexR(P.a.)"): remember the first one as a fallback only.
            match_index = index

    if match_index is None:
        synonyms.append(0)
        model_value.append("None")
        index_value.append("None")
    else:
        # enumerate() yields positions, so select by position rather than by label.
        synonyms.append(rnas_synonyms.iloc[match_index])
        model_value.append(rnas_common_names.iloc[match_index])
        index_value.append(match_index)
    

In [24]:
# Summarise every mismatched gene next to the synonym row that was found for it.
# genes_in_sort_not_flat is a Series, so the resulting frame carries its index
# (the row numbers of the sort file); the three lists are aligned with it by position.
mismatch_columns = {
    'Sorted file Gene id': genes_in_sort_not_flat,
    'Synonyms': synonyms,
    'Model Gene id': model_value,
    'rnas dataset index': index_value,
}
mismatched_genes = pd.DataFrame(mismatch_columns)
mismatched_genes

Unnamed: 0,Sorted file Gene id,Synonyms,Model Gene id,rnas dataset index
15,yliE,"[""pdeI"", ""yliE"", ""b0833"", ""ECK0823""]",pdeI,3335
24,yifL,"[""lptM"", ""yifL"", ""b4558"", ""ECK3803""]",lptM,2607
31,hexR,"[""yebK"", ""hexR(P.a.)"", ""b1853"", ""ECK1854""]",yebK,2400
33,yliF,"[""dgcI"", ""yliF"", ""b0834"", ""ECK0824""]",dgcI,3336
37,csiD,"[""glaH"", ""csi-12"", ""ygaT"", ""csiD"", ""b2659"", ""E...",glaH,4244
...,...,...,...,...
2391,gmr,"[""pdeR"", ""yciR"", ""gmr"", ""b1285"", ""ECK1280""]",pdeR,3534
2396,ylaB,"[""pdeB"", ""ylaB"", ""b0457"", ""ECK0451""]",pdeB,3168
2409,yciV,"[""rnm"", ""trpH"", ""yciV"", ""b1266"", ""ECK1260""]",rnm,3529
2423,yqiC,"[""ubiK"", ""yqiC"", ""b3042"", ""ECK3033""]",ubiK,4418


In [25]:
# Same synonym lookup as before, but this time the matches are also written back:
# every mismatched gene that has a match is renamed to the model's common_name
# in a copy of the sort file.
# Each entry of rnas['synonyms'] is the raw text of a list of double-quoted names,
# e.g. '["pdeI", "yliE", "b0833", "ECK0823"]'.
rnas_synonyms = rnas['synonyms']
rnas_common_names = rnas['common_name']

# Work on a copy so that sort_file itself stays as loaded.
sort_file_copy = sort_file.copy()

# One entry per gene of genes_in_sort_not_flat, in the same order:
synonyms = []     # synonyms text of the matching row, or 0 when nothing matched
model_value = []  # common_name the model uses for that row, or "None"
index_value = []  # position of the matching row in rnas, or "None"

for gene in genes_in_sort_not_flat:
    quoted_gene = '"' + gene + '"'
    match_index = None
    for index, synonym in enumerate(rnas_synonyms):
        if quoted_gene in synonym:
            # Whole-name match: the gene is listed exactly as one of the synonyms.
            # This always wins over a partial match found in an earlier row.
            match_index = index
            break
        if match_index is None and gene in synonym:
            # Partial match (the name is part of a longer synonym, e.g. hexR in
            # "hexR(P.a.)"): remember the first one as a fallback only.
            match_index = index

    if match_index is None:
        synonyms.append(0)
        model_value.append("None")
        index_value.append("None")
        continue

    # enumerate() yields positions, so select by position rather than by label.
    model_name = rnas_common_names.iloc[match_index]
    synonyms.append(rnas_synonyms.iloc[match_index])
    model_value.append(model_name)
    index_value.append(match_index)

    # Replace the gene name in the copied sort file with the name the model uses.
    sort_file_copy.loc[sort_file_copy['Gene id'] == gene, 'Gene id'] = model_name
    

In [26]:
# Original sort file, shown for comparison: the gene renaming above is applied to sort_file_copy only.
sort_file

Unnamed: 0,Gene id,half_life (units.min)
0,deaD,98.538280
1,cspA,107.599391
2,ilvM,74.662317
3,fliD,89.145652
4,intA,78.816019
...,...,...
2426,csdA,646.660475
2427,cbl,69.778293
2428,tauA,683.375221
2429,yafV,704.811442


In [29]:
# Copy of the sort file in which every mismatched gene with a synonym match carries the model's common_name.
sort_file_copy

Unnamed: 0,Gene id,half_life (units.min)
0,deaD,98.538280
1,cspA,107.599391
2,ilvM,74.662317
3,fliD,89.145652
4,intA,78.816019
...,...,...
2426,csdA,646.660475
2427,cbl,69.778293
2428,tauA,683.375221
2429,yafV,704.811442


In [31]:
# save the new sort file:
#sort_file_copy.to_csv('~/wcEcoli/reconstruction/ecoli/scripts/protein_half_lives/protein_half_lives_Clim2_corrected.tsv', sep='\t', index=False)    
# NOTE: this file was tested with convert to flat and it worked! The resulting protein_half_lives_Clim2_corrected.tsv file in the flat files folder now has length 2423 (only 8 fewer than the original sort file in the scripts folder).

In [28]:
# Repeat the sort-file vs. flat-file comparison, this time with the renamed genes.
sort_file_genes = sort_file_copy['Gene id']
flat_file_genes = flat_file['Gene id']

# Genes of the corrected sort file that are still absent from the flat file.
# The count is expected to stay close to the original one: flat_file has not been
# regenerated from sort_file_copy, so the renamed genes cannot show up in it yet.
still_missing_from_flat = ~sort_file_genes.isin(flat_file_genes)
genes_in_sort_not_flat_new = sort_file_genes.loc[still_missing_from_flat]

# Original mismatches that no longer appear under their old name in the new list.
# Only this last expression is displayed as the cell output.
no_longer_listed = ~genes_in_sort_not_flat.isin(genes_in_sort_not_flat_new)
genes_in_sort_not_flat.loc[no_longer_listed]

15      yliE
24      yifL
31      hexR
33      yliF
37      csiD
        ... 
2391     gmr
2396    ylaB
2409    yciV
2423    yqiC
2424    ygbT
Name: Gene id, Length: 116, dtype: object

In [30]:
# Count the mismatched genes for which no synonym match was found ("None" placeholder).
model_value_df = pd.DataFrame(model_value)
without_match = model_value_df[0] == "None"
# 9 rows: together with the 116 genes listed by the previous comparison this adds up to 125
model_value_df.loc[without_match]


Unnamed: 0,0
8,
9,
20,
44,
53,
58,
60,
88,
96,
