In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
os.chdir(os.path.expanduser('~/wcEcoli/out/saved_PDRs/'))
import plotly.graph_objects as go
import glob
import plotly.express as px

In [None]:
# Simulation condition labels: five media/environment conditions followed by
# one '<TF id>__active' / '<TF id>__inactive' pair per transcription factor.
conditions = [
    'basal', 'with_aa', 'acetate', 'succinate', 'no_oxygen',
    'CPLX-125__active', 'CPLX-125__inactive', 'CPLX-172__active', 'CPLX-172__inactive',
    'CPLX0-226__active', 'CPLX0-226__inactive', 'CPLX0-228__active', 'CPLX0-228__inactive',
    'CPLX0-7669__active', 'CPLX0-7669__inactive', 'CPLX0-7705__active', 'CPLX0-7705__inactive',
    'CPLX0-7740__active', 'CPLX0-7740__inactive', 'CPLX0-7796__active', 'CPLX0-7796__inactive',
    'CPLX0-7916__active', 'CPLX0-7916__inactive', 'EG12123-MONOMER__active', 'EG12123-MONOMER__inactive',
    'FNR-4FE-4S-CPLX__active', 'FNR-4FE-4S-CPLX__inactive', 'MONOMER0-155__active', 'MONOMER0-155__inactive',
    'MONOMER0-160__active', 'MONOMER0-160__inactive', 'MONOMER0-162__active', 'MONOMER0-162__inactive',
    'PC00010__active', 'PC00010__inactive', 'PC00027__active', 'PC00027__inactive', 'PD00288__active',
    'PD00288__inactive', 'PD00519__active', 'PD00519__inactive', 'PHOSPHO-ARCA__active', 'PHOSPHO-ARCA__inactive',
    'PHOSPHO-BAER__active', 'PHOSPHO-BAER__inactive', 'PHOSPHO-DCUR__active', 'PHOSPHO-DCUR__inactive',
    'PHOSPHO-NARL__active', 'PHOSPHO-NARL__inactive', 'PUTA-CPLX__active', 'PUTA-CPLX__inactive'
]

# Strip the '__active' / '__inactive' suffix; labels without '__' are unchanged.
cleaned_conditions = [condition.split('__')[0] for condition in conditions]

# Pick the TF entries by their '__' marker rather than by a hard-coded
# position in the list, so reordering or extending `conditions` cannot
# silently leak a media condition into the TF set.
tf_condition_ids = [
    condition.split('__')[0] for condition in conditions if '__' in condition
]

# Sorted array of unique TF identifiers (each TF appears twice above).
active_TFs = np.unique(tf_condition_ids)

In [None]:
# NOTE: get_symbols_for_monomer_ids is defined in a later cell of this
# notebook; that cell has to be executed before this one.
# Read the monomer-ID -> gene-ID table once instead of re-reading the TSV
# (twice) for every transcription factor.
monomer_ids_to_gene_ids = get_symbols_for_monomer_ids()

# todo: change this to be the Clim proteins of interest
# Look up the gene ID for every TF identifier; an identifier missing from
# the table raises KeyError.
gene_id_list = [monomer_ids_to_gene_ids[monomer_id] for monomer_id in active_TFs]

In [None]:
# Gene common names to translate into monomer IDs below (23 entries;
# presumably the symbols of the transcription factors of interest -- TODO confirm).
hi = [
    "trpR", "araC", "crp", "argR", "argP", "fis",
    "cytR", "metJ", "bglJ", "lrhA", "fnr", "lrp",
    "dnaA", "tyrR", "lexA", "ihfA", "hns", "leuO",
    "arcA", "baeR", "dcuR", "narL", "putA",
]

# Build a lookup from monomer (protein) ID to the gene's common name.
def get_common_name_for_monomer_ids():
    """Return a dict mapping monomer ID -> common name, read from rnas.tsv.

    The table is read from
    ``ROOT_PATH/reconstruction/ecoli/flat/rnas.tsv`` (code adapted from
    convert_to_flat.py). Leading rows whose first cell starts with ``#``
    are skipped; the next row is taken as the header and must contain the
    columns ``common_name`` and ``monomer_ids``.

    Only the first entry of each row's ``monomer_ids`` cell is used as the
    key. A row with an empty list (``[]``) therefore lands under the key
    ``''``, and when several rows share a key the last row wins.

    Returns:
        dict[str, str]: first monomer ID of each row -> that row's common name.

    Raises:
        ValueError: if a required column is missing from the header.
    """
    # Project imports are kept local so the notebook's import cell is untouched.
    from wholecell.io import tsv
    from wholecell.utils.filepath import ROOT_PATH

    RNAS_FILE = os.path.join(ROOT_PATH, 'reconstruction', 'ecoli', 'flat', 'rnas.tsv')
    with open(RNAS_FILE, 'rb') as f:
        reader = tsv.reader(f, delimiter='\t')

        # Skip comment rows that precede the real header row.
        headers = next(reader)
        while headers[0].startswith('#'):
            headers = next(reader)

        common_name_index = headers.index('common_name')
        protein_id_index = headers.index('monomer_ids')

        monomer_ids_to_common_names = {}
        for line in reader:
            # The cell looks like '["ID-A", "ID-B"]': drop the outer '["' and
            # '"]', split on the '", "' separator and keep the first ID.
            protein_id = line[protein_id_index][2:-2].split('", "')[0]
            monomer_ids_to_common_names[protein_id] = line[common_name_index]

        return monomer_ids_to_common_names



# Read the monomer-ID -> common-name table once; calling the loader inside
# the loop re-parsed rnas.tsv for every gene in `hi`.
monomer_ids_to_common_names = get_common_name_for_monomer_ids()

# now, get the monomer id for each gene id:
# For each gene (in the order given by `hi`) collect every monomer ID whose
# common name equals that gene. Genes with no match contribute nothing.
monomer_id_list = [
    monomer_id
    for gene in hi
    for monomer_id, common_name in monomer_ids_to_common_names.items()
    if common_name == gene
]

print(monomer_id_list)

In [None]:
# Build a lookup from monomer (protein) ID to the value of the gene_id column.
def get_symbols_for_monomer_ids():
    """Return a dict mapping monomer ID -> gene ID, read from rnas.tsv.

    Reads ``ROOT_PATH/reconstruction/ecoli/flat/rnas.tsv`` (code adapted
    from convert_to_flat.py). Rows whose first cell starts with ``#`` are
    skipped until the header row is reached; the header must contain the
    columns ``gene_id`` and ``monomer_ids``.

    Only the first entry of each ``monomer_ids`` cell becomes a key, so a
    row with an empty list is stored under ``''`` and later rows overwrite
    earlier ones that share a key.

    Returns:
        dict[str, str]: first monomer ID of each row -> that row's gene_id.
    """
    from wholecell.io import tsv
    from wholecell.utils.filepath import ROOT_PATH

    rnas_path = os.path.join(ROOT_PATH, 'reconstruction', 'ecoli', 'flat', 'rnas.tsv')
    with open(rnas_path, 'rb') as handle:
        rows = tsv.reader(handle, delimiter='\t')

        # Advance past any comment rows to the header row.
        header = next(rows)
        while header[0].startswith('#'):
            header = next(rows)

        gene_col = header.index('gene_id')
        monomer_col = header.index('monomer_ids')

        lookup = {}
        for row in rows:
            gene_id = row[gene_col]
            # Cell looks like '["ID-A", "ID-B"]'; keep only the first ID.
            first_monomer = row[monomer_col][2:-2].split('", "')[0]
            lookup[first_monomer] = gene_id

        return lookup
    
# Read the monomer-ID -> gene-ID table once instead of re-reading the TSV
# (twice) for every protein.
monomer_ids_to_gene_ids = get_symbols_for_monomer_ids()

# todo: change this to be the Clim proteins of interest
# NOTE(review): df_Clim is not created in the cells shown here; it must
# already exist in the kernel before this cell runs.
# The last three characters of each 'Protein ID' are dropped before the
# lookup (presumably a compartment tag such as "[c]" -- TODO confirm).
# A monomer ID that is missing from the table raises KeyError.
gene_id_list = [
    monomer_ids_to_gene_ids[protein_id[:-3]]
    for protein_id in df_Clim['Protein ID']
]
    
# Load the list of essential gene IDs from the validation data.
def get_essential_genes():
    """Return the ``FrameID`` column of essential_genes.tsv as a list.

    Reads ``ROOT_PATH/validation/ecoli/flat/essential_genes.tsv`` (code
    adapted from convert_to_flat.py). Rows whose first cell starts with
    ``#`` are skipped until the header row is reached; the header must
    contain a ``FrameID`` column.

    Returns:
        list[str]: one ``FrameID`` value per data row, in file order.
    """
    from wholecell.io import tsv
    from wholecell.utils.filepath import ROOT_PATH

    genes_path = os.path.join(ROOT_PATH, 'validation', 'ecoli', 'flat', 'essential_genes.tsv')
    with open(genes_path, 'rb') as handle:
        rows = tsv.reader(handle, delimiter='\t')

        # Advance past any comment rows to the header row.
        header = next(rows)
        while header[0].startswith('#'):
            header = next(rows)

        frame_id_col = header.index('FrameID')
        return [row[frame_id_col] for row in rows]
    
# get the essential genes:
# Load the essential-gene list once (the loader re-reads the TSV on every
# call) and use a set so each membership check is constant-time.
essential_gene_ids = set(get_essential_genes())

# Keep the genes from gene_id_list that are essential, preserving their
# order and any duplicates.
essential_genes_found = [gene for gene in gene_id_list if gene in essential_gene_ids]

print("Essential Genes Found (" +str(len(essential_genes_found))+"): ", essential_genes_found)

test for nora 

3600.0