# Adding Checkpoint 4 -5 Easy Fixes

Description
 This script will use the basal simulation after cp4 and the implementation of microarray 2 as the base (which incorporates checkpoint 3 and microarray 1 too), and runs FBA externally to the model to see if model solves and see if there is growth (needs definition) and analyze gene usage. It will also implement some of the easy fixes that were identified in all checkpoint 5 and onward!

Below are the specific easy gene fixes that were implemented in this notebook:
<br>
mtlA-EG10615, mmuP-G6135, panF-EG10685, frlA-EG12908, psuK-EG11646, shiA-G7067, thiK-G6566, gmd-EG11787, rhtB-EG11469, codB-EG11327, gltS-EG10406
<br>

See an upward trend in the gene usage (from 44.10% --> 45.87%) and new gene usage (from 32.89% --> 39.08%) from this new implementation.


In [1]:
from html.parser import interesting_normal

import numpy as np
import ast
import pandas as pd
import os
import matplotlib.pyplot as plt
import dill
import requests
import xmltodict
import cvxpy as cp
import itertools
import networkx as nx
import plotly.express as px

from scipy.special import logsumexp

%matplotlib inline
# sns.set(style='dar|kgrid', palette='viridis', context='talk')

os.chdir(os.path.expanduser('~/vEcoli')) #import repo

from ecoli.processes.metabolism_redux_classic import NetworkFlowModel, FlowResult
from ecoli.library.schema import numpy_schema, bulk_name_to_idx, listener_schema, counts
from ecoli.processes.registries import topology_registry
TOPOLOGY = topology_registry.access("ecoli-metabolism-redux")
import plotly.express as px

In [2]:
# load checkpoint 2 model
time = '400'
date = '2025-05-15'
experiment = 'NEW_NewGenes_checkpoint2'
condition = 'basal'
entry = f'{experiment}_{time}_{date}'
folder = f'out/metabolism-comparison/{condition}/{entry}/'

output = np.load(folder + '0_output.npy',allow_pickle='TRUE').item()
# output = np.load(r"out/geneRxnVerifData/output_glc.npy", allow_pickle=True, encoding='ASCII').tolist()
output = output['agents']['0']
fba = output['listeners']['fba_results']
bulk = pd.DataFrame(output['bulk'])
f = open(folder + 'agent_steps.pkl', 'rb')
agent = dill.load(f)
f.close()

In [3]:
# get commonly stored variables
metabolism = agent['ecoli-metabolism-redux-classic']
stoichiometry = metabolism.stoichiometry.copy()
reaction_names = metabolism.reaction_names
fba_new_reaction_ids = metabolism.parameters["fba_new_reaction_ids"]
fba_reaction_ids_to_base_reaction_ids = metabolism.parameters['fba_reaction_ids_to_base_reaction_ids']
metabolites = metabolism.metabolite_names.copy()
binary_kinetic_idx = metabolism.binary_kinetic_idx
exchange_molecules = metabolism.exchange_molecules

S = stoichiometry .copy()
S = pd.DataFrame(S, index=metabolites , columns=reaction_names )
homeostatic_count = pd.DataFrame(fba["homeostatic_metabolite_counts"], columns=metabolism.homeostatic_metabolites).loc[24, :]
homeostatic = pd.DataFrame(fba["target_homeostatic_dmdt"], columns=metabolism.homeostatic_metabolites).loc[24, :]
maintenance = pd.DataFrame(fba["maintenance_target"][1:], columns=['maintenance_reaction']).iat[24, 0]
kinetic = pd.DataFrame(fba["target_kinetic_fluxes"], columns=metabolism.kinetic_constraint_reactions).loc[24, :].copy()

In [4]:
# parameters that are the same across the two simulation
kinetic_reaction_ids = metabolism.kinetic_constraint_reactions
allowed_exchange_uptake = metabolism.allowed_exchange_uptake
FREE_RXNS = ["TRANS-RXN-145", "TRANS-RXN0-545", "TRANS-RXN0-474"]
ADDED_RXNS = ['HS-TRANSPORT-RXN-CPD0-1202', 'HS-TRANSPORT-RXN-CPD0-1202 (reverse)',
                   'HS-TRANSPORT-RXN[CCO-OUTER-MEM]-OXAMATE', 'HS-TRANSPORT-RXN[CCO-OUTER-MEM]-OXAMATE (reverse)',
                   'HS-TRANSPORT-RXN[CCO-PM-BAC-NEG]-OXAMATE', 'HS-TRANSPORT-RXN[CCO-PM-BAC-NEG]-OXAMATE (reverse)',
                   'HS-BETA-GLUCURONID-RXN_CPD-3611//METOH', 'HS-SPONTANEOUS-TRANSPORT[CCO-OUTER-MEM]-HCN', 'HS-SPONTANEOUS-TRANSPORT[CCO-OUTER-MEM]-HCN (reverse)',
                   'HS-SPONTANEOUS-TRANSPORT[CCO-PM-BAC-NEG]-HCN','HS-SPONTANEOUS-TRANSPORT[CCO-PM-BAC-NEG]-HCN (reverse)']

# Define Functions

In [5]:
def get_subset_S(S, met_of_interest):
    S_met = S.loc[met_of_interest, :]
    S_met = S_met.loc[:,~np.all(S_met == 0, axis=0)]
    return S_met, S_met.columns

def get_keys(dict, value):
    return [key for key in dict if dict[key] == value]

def test_NetworkFlowModel(objective_weights,
                          uptake_addition = set([]), uptake_removal = set([]), new_exchange_molecules = set([]),
                          add_metabolite = None, add_homeostatic_demand = None, add_reaction = None, add_kinetic = None, remove_reaction = None, force_reaction = None):
    # update exchanges
    uptake = metabolism.allowed_exchange_uptake.copy()
    uptake = set(uptake)
    uptake = uptake | uptake_addition
    uptake = uptake - uptake_removal

    exchange_molecules = metabolism.exchange_molecules.copy()
    exchange_molecules = exchange_molecules | new_exchange_molecules

    # update stoichiometry
    reaction_names = metabolism.reaction_names.copy()
    kinetic_reaction_ids = metabolism.kinetic_constraint_reactions.copy()
    kinetic = pd.DataFrame(fba["target_kinetic_fluxes"], columns=metabolism.kinetic_constraint_reactions).loc[24, :].copy()
    metabolites = metabolism.metabolite_names.copy()
    homeostatic = pd.DataFrame(fba["target_homeostatic_dmdt"], columns=metabolism.homeostatic_metabolites).loc[24, :].copy()
    homeostatic_counts = homeostatic_count.copy() * metabolism.counts_to_molar.asNumber()

    S_new = stoichiometry.copy()

    if add_metabolite is not None: # add to metabolites list because they are currently not included in the model
        for m in add_metabolite:
            if m not in metabolites:
                metabolites.append(m)
        # append rows of zeros to S_new of length add_metabolite
        S_new = np.concatenate((S_new, np.zeros((len(add_metabolite), S_new.shape[1]))), axis=0)

    if add_reaction is not None:
        # assert add_reaction is a dictionary
        assert isinstance(add_reaction, dict)

        for r,s in add_reaction.items():
            if r not in reaction_names:
                reaction_names.append(r)
            # append columns of reaction stoich to S_new of length add_reaction
            new_reaction = np.zeros((S_new.shape[0], 1))
            for m, v in s.items():
                new_reaction[metabolites.index(m), 0] = v
            S_new = np.concatenate((S_new, new_reaction), axis=1)

    if add_kinetic is not None:
        # assert add_kinetic is a dictionary
        assert isinstance(add_kinetic, dict)

        for r, v in add_kinetic.items():
            if r not in kinetic_reaction_ids:
                kinetic_reaction_ids.append(r)
                kinetic[r] = v

    if remove_reaction is not None:
        for r in remove_reaction:
            r_idx = reaction_names.index(r)
            S_new = np.delete(S_new, r_idx, axis=1)
            reaction_names.remove(r)
            if r in kinetic_reaction_ids:
                kinetic_reaction_ids.remove(r)
                del kinetic[r]

    if force_reaction is not None:
        force_reaction_idx = np.array([reaction_names.index(r) for r in force_reaction])
    else:
        force_reaction_idx = force_reaction

    if add_homeostatic_demand is not None:
        # assert add_homeostatic_demand is a set
        assert isinstance(add_homeostatic_demand, list)

        for met in add_homeostatic_demand:
            homeostatic[met] = 100
            homeostatic_counts[met] = 1

    # Solve NetworkFlowModel
    model = NetworkFlowModel(
            stoich_arr=S_new,
            metabolites=metabolites,
            reactions=reaction_names,
            homeostatic_metabolites=list(dict(homeostatic).keys()),
            kinetic_reactions=kinetic_reaction_ids,
            free_reactions=FREE_RXNS)
    model.set_up_exchanges(exchanges=exchange_molecules, uptakes=uptake)
    solution: FlowResult = model.solve(
            homeostatic_concs=homeostatic_counts, # in conc
            homeostatic_dm_targets=np.array(list(dict(homeostatic).values())), # *10^7
            maintenance_target=maintenance, # *10^6 ish
            kinetic_targets=np.array(list(dict(kinetic).values())), # *10^6 ish
            # binary_kinetic_idx=binary_kinetic_idx, #7646
            binary_kinetic_idx=None,
            force_flow_idx=force_reaction_idx,
            objective_weights=objective_weights, #same
            upper_flux_bound= 1000000000, # increase to 10^9 because notebook runs FlowResult using Counts, WC runs using conc.
            solver=cp.GLOP) #SCS. ECOS, MOSEK
    return solution.objective, solution.velocities, reaction_names, S_new, metabolites, kinetic

# Create DataFrames to Store Results

In [6]:
# all reactions
sim = pd.DataFrame(fba["estimated_fluxes"], columns= reaction_names).mean(axis=0).copy()
# kinetic reactions
kc_target_cp2 = pd.DataFrame(fba["target_kinetic_fluxes"], columns= kinetic_reaction_ids).mean(axis=0).copy()

In [7]:
#accumulating the simulations
df_all = sim.copy()
df_all.columns = ['sim_cp2_basal']

## Implement Checkpoints 5 (including some from checkpoint 3 that were not added) Easy Fixes

In [8]:
#checkpoint conditions added that were easy fixes
conditions = {
     'EG10615-D-Mannitol and D-Sorbitol':{
        'Add': set(['MANNITOL[e]', 'SORBITOL[e]']),
        'Remove': set([]),
        'Add Demand': None,
    },
    'G6135-S-methyl-L-methionine':{
        'Add': set(['CPD-397[e]']),
        'Remove': set([]),
        'Add Demand': None,
    },
    'EG10685-(R)-pantothenate':{
        'Add': set(['PANTOTHENATE[p]']),
        'Remove': set([]),
        'Add Demand': None,
    },
    'EG12908-N6-(D-psicosyl)-L-lysine and N6-(1-deoxy-D-fructos-1-yl)-L-lysine':{
        'Add': set(['PSICOSELYSINE[c]', 'FRUCTOSELYSINE[c]']),
        'Remove': set([]),
        'Add Demand': None,
    },
    'EG11646-pseudouridine':{
        'Add': set(['CPD-497[e]']),
        'Remove': set([]),
        'Add Demand': None,
    },
    'G7067-shikimate':{
        'Add': set(['SHIKIMATE[e]']),
        'Remove': set([]),
        'Add Demand': None,

    },
    'G6566-thiamine':{
        'Add': set(['THIAMINE[e]']),
        'Remove': set([]),
        'Add Demand': None,
    },
    'EG11787-4,6-pyr-α-D-Gal-(1→4)-β-D-GlcA-(1→3)-2-O-Ac-α-D-Gal-(1→3)-α-L-Fuc-(1→4)-3-O-Ac-α-L-Fuc-(1→3)-α-D-Glc-PP-Und':{
        'Add': set([]),
        'Remove': set([]),
        'Add Demand': ['CPD-24196[e]'],
    },
    'EG11469-L-homoserine lactone':{
        'Add': set([]),
        'Remove': set([]),
        'Add Demand': ['CPD-15554[e]'],
    },
    'EG11327-cytosine':{
        'Add': set(['CYTOSINE[e]']),
        'Remove': set([]),
        'Add Demand': None,
    },
    'EG10406-L-glutamate':{
        'Add': set(['GLT[p]']),
        'Remove': set(['GLC[p]']),
        'Remove Reaction': ['TRANS-RXN-261', 'TRANS-RXN-162'],
        'Add Demand': None,
    },

}


In [9]:
#add all of the conditions from above into the df_all
condition_names = []
for condition_name, condition in conditions.items():
    
    if 'Remove Reaction' in condition:
        remove_reaction = condition['Remove Reaction']
    else:
        remove_reaction = None

    
    objective_weights = {'secretion': 0.01, 'efficiency': 0.000001, 'kinetics': 0.000001}
    
    _, solution_flux, test_reaction_names, S_new, test_metabolites, test_kinetic = test_NetworkFlowModel(
                                        objective_weights, uptake_addition=condition['Add'],
                                        uptake_removal=condition['Remove'], add_homeostatic_demand=condition['Add Demand'],
                                        remove_reaction=remove_reaction,

    )    
    
    # get the fluxes
    sim_flux = pd.DataFrame({f'sim_cp5+_{condition_name}': solution_flux}, index = test_reaction_names)
    condition_names.append(f'sim_cp5+_{condition_name}')
    df_all = pd.concat([df_all, sim_flux], axis=1)
    
    print(f"""Finished enviornment: {condition_name}""")

Finished enviornment: EG10615-D-Mannitol and D-Sorbitol
Finished enviornment: G6135-S-methyl-L-methionine
Finished enviornment: EG10685-(R)-pantothenate
Finished enviornment: EG12908-N6-(D-psicosyl)-L-lysine and N6-(1-deoxy-D-fructos-1-yl)-L-lysine
Finished enviornment: EG11646-pseudouridine
Finished enviornment: G7067-shikimate
Finished enviornment: G6566-thiamine
Finished enviornment: EG11787-4,6-pyr-α-D-Gal-(1→4)-β-D-GlcA-(1→3)-2-O-Ac-α-D-Gal-(1→3)-α-L-Fuc-(1→4)-3-O-Ac-α-L-Fuc-(1→3)-α-D-Glc-PP-Und
Finished enviornment: EG11469-L-homoserine lactone
Finished enviornment: EG11327-cytosine
Finished enviornment: EG10406-L-glutamate


In [10]:
df_all['is_new'] = 'Old Reactions'
df_all.loc[fba_new_reaction_ids, 'is_new'] = 'New Reactions'
df_all.loc[ADDED_RXNS, 'is_new'] = 'Heena\'s Reactions'

df_all['kinetic'] = [kinetic[r] if r in kinetic_reaction_ids else False for r in df_all.index]


# Calculate % Gene Usage with Checkpoint 5 and Microarray 3 and 4

In [13]:
%store -r df_gene_usage genes_to_enzymes df_all_gene_usage new_genes df_all_gene_usage_cp4_w_cp3 df_all_gene_usage_microarray3

In [16]:
# add checkpoint 5 to microarray 3 and 4
df_all_gene_usage_cp5_w_m3 = df_all_gene_usage_microarray3.copy()
reaction_catalysts = metabolism.parameters["reaction_catalysts"]
conditions_previous = ['basal', 'acetate', 'rich', 'anaerobic_basal', 'anaerobic_acetate', 'anaerobic_rich']
conditions_all = conditions_previous + condition_names
conditions_all = conditions_all + condition_names

for condition in condition_names:
    df_all_gene_usage_cp5_w_m3[condition] = 0
    new_reaction_usage = df_all.loc[fba_new_reaction_ids, condition]
    for rxn in fba_new_reaction_ids:
        is_used = new_reaction_usage.loc[rxn]
        enzymes = reaction_catalysts.get(rxn,[])
        for enz in enzymes:
            enz = enz[:-3]
            genes = get_keys(genes_to_enzymes, enz)
            for gene in genes:
                if is_used:
                    df_all_gene_usage_cp5_w_m3.loc[gene, condition] += 1
                if rxn in kinetic_reaction_ids:
                    df_all_gene_usage_cp5_w_m3.loc[gene, 'has_kinetic'] = True


# tally usage
df_all_gene_usage_cp5_w_m3['is_used'] = np.any(df_all_gene_usage_cp5_w_m3[conditions_all] > 0, axis=1)
df_gene_usage_cp5_w_m3 = df_all_gene_usage_cp5_w_m3.loc[new_genes,:]

In [17]:
%store -r df_all_gene_usage df_all_gene_usage_cp1 df_all_gene_usage_cp2 df_all_gene_usage_cp3_w_cp2 all_gene_usage_cp4_w_microarray2 new_gene_usage_cp4_w_microarray2 new_gene_usage_cp4_w_microarray3 all_gene_usage_cp4_w_microarray3

In [23]:

df_all_gene_usage = df_all_gene_usage.copy()

gene_used_before = df_gene_usage['is_used']
all_gene_used_before = df_all_gene_usage['is_used']
num_genes = len(gene_used_before)

perc_gene_usage_prev = sum(gene_used_before)/num_genes
all_gene_usage_cp5_w_microarray3 = all_gene_usage_cp4_w_microarray3 | df_gene_usage_cp5_w_m3['is_used'] | all_gene_usage_cp4_w_microarray2
new_gene_usage_cp5_w_microarray3 = all_gene_usage_cp5_w_microarray3[new_genes]

perc_gene_usage_all_cp5= sum(all_gene_usage_cp5_w_microarray3)/len(all_gene_usage_cp5_w_microarray3)
perc_gene_usage_new_cp5  = sum(all_gene_usage_cp5_w_microarray3.loc[new_genes])/num_genes

print(f'% new genes usage before checkpoint 1 and 2 are: {perc_gene_usage_prev: 0.2%}')
print(f'% new genes usage at checkpoint 5 are: {perc_gene_usage_new_cp5: 0.2%}')
print(f'% all genes usage at checkpoint 5 with microarray 3 and 4 are: {perc_gene_usage_all_cp5: 0.2%}')

% new genes usage before checkpoint 1 and 2 are:  4.56%
% new genes usage at checkpoint 5 are:  39.09%
% all genes usage at checkpoint 5 with microarray 3 and 4 are:  45.63%


In [24]:
all_gene_usage_cp5_w_microarray3

Gene ID (EcoCyc)
EG10001     True
EG10002    False
EG10004     True
EG10006     True
EG10007    False
           ...  
M011        True
M012        True
M013       False
M014        True
M015        True
Name: is_used, Length: 1247, dtype: bool

In [25]:
ids_to_check = [
    "EG11557", "EG11559", "EG11560", "EG11561", "EG11787", "EG11788", "EG10144",
    "EG10140", "EG10141", "EG10142", "G7096", "G7097", "G7099", "G7100", "G7102",
    "G7103", "G7104", "EG10177", "EG10175", "EG10160", "EG11327", "EG11871",
    "EG11104", "EG11469", "EG10556", "EG12495", "G7855", "EG10522", "EG11869",
    "EG20051", "EG20053", "GB4478", "EG10592", "EG11700", "EG12281", "EG10401",
    "EG10406", "EG12282", "EG12283", "EG12494", "G7856", "EG12522", "EG10615",
    "G6135", "EG10685", "EG12908", "EG11646", "G6518", "G7067", "G6347",
    "EG10953", "EG10954", "G6217", "G6218", "G6566", "EG11574", "G6219",
    "EG11573", "EG11572", "EG11787","EG10126","EG10127", "EG10128","EG10130","EG10271","EG10272",
    "EG11012"
]
all_gene_usage_cp5_w_microarray3.loc[ids_to_check]

Gene ID (EcoCyc)
EG11557    False
EG11559    False
EG11560    False
EG11561    False
EG11787     True
           ...  
EG10128    False
EG10130    False
EG10271    False
EG10272    False
EG11012    False
Name: is_used, Length: 67, dtype: bool

In [26]:
#all of the genes that were shaded green expected to resolve
ids_to_check = [
    "EG11557", "EG11559", "EG11560", "EG11561", "EG11787", "EG11788", "EG10144",
    "EG10140", "EG10141", "EG10142", "G7096", "G7097", "G7099", "G7100", "G7102",
    "G7103", "G7104", "EG10177", "EG10175", "EG10160", "EG11327", "EG11871",
    "EG11104", "EG11469", "EG10556", "EG12495", "G7855", "EG10522", "EG11869",
    "EG20051", "EG20053", "GB4478", "EG10592", "EG11700", "EG12281", "EG10401",
    "EG10406", "EG12282", "EG12283", "EG12494", "G7856", "EG12522", "EG10615",
    "G6135", "EG10685", "EG12908", "EG11646", "G6518", "G7067", "G6347",
    "EG10953", "EG10954", "G6217", "G6218", "G6566", "EG11574", "G6219",
    "EG11573", "EG11572", "EG11787","EG10126","EG10127", "EG10128","EG10130","EG10271","EG10272",
    "EG11012"
]
column_name = "Gene ID (EcoCyc)"

matches_df = all_gene_usage_cp5_w_microarray3[all_gene_usage_cp5_w_microarray3.index.isin(ids_to_check)] #extract the one of interset

print(f"Found {len(matches_df)} matches:")
print(matches_df)

Found 66 matches:
Gene ID (EcoCyc)
EG10126    False
EG10127    False
EG10128    False
EG10130    False
EG10140     True
           ...  
G7103       True
G7104       True
G7855       True
G7856       True
GB4478      True
Name: is_used, Length: 66, dtype: bool


In [27]:
matches_df[matches_df] #ones that are true

Gene ID (EcoCyc)
EG10140    True
EG10141    True
EG10142    True
EG10144    True
EG10160    True
EG10175    True
EG10177    True
EG10401    True
EG10406    True
EG10522    True
EG10556    True
EG10615    True
EG10685    True
EG11327    True
EG11469    True
EG11572    True
EG11573    True
EG11574    True
EG11646    True
EG11700    True
EG11787    True
EG11788    True
EG11869    True
EG11871    True
EG12494    True
EG12495    True
EG12522    True
EG20051    True
EG20053    True
G6135      True
G6217      True
G6218      True
G6219      True
G6518      True
G6566      True
G7067      True
G7096      True
G7097      True
G7099      True
G7100      True
G7102      True
G7103      True
G7104      True
G7855      True
G7856      True
GB4478     True
Name: is_used, dtype: bool

In [28]:
df_diff = new_gene_usage_cp5_w_microarray3.compare(new_gene_usage_cp4_w_microarray2) #to see new genes used overall between checkpoints
df_diff

Unnamed: 0_level_0,self,other
Gene ID (EcoCyc),Unnamed: 1_level_1,Unnamed: 2_level_1
EG11327,True,False
EG10160,True,False
EG10175,True,False
EG10177,True,False
EG11788,True,False
EG11787,True,False
EG11871,True,False
G6135,True,False
EG10685,True,False
EG11646,True,False


# Plot 1: Create histogram to track gene usage

In [29]:
%store -r perc_gene_usage_cp4

In [34]:
# % new gene usage
df_gene_usage = df_all_gene_usage.loc[new_genes,:]
df_gene_usage_cp1 = df_all_gene_usage_cp1.loc[new_genes,:]
df_gene_usage_cp2 = df_all_gene_usage_cp2.loc[new_genes,:]
df_gene_usage_cp3_w_cp2 = df_all_gene_usage_cp3_w_cp2.loc[new_genes,:]
df_gene_usage_cp4_w_cp3 = df_all_gene_usage_cp4_w_cp3.loc[new_genes,:]

num_genes = len(df_gene_usage)
perc_gene_usage_basal = sum(df_gene_usage_cp2.basal > 0)/num_genes * 100
perc_gene_usage_basic_conditions = sum(gene_used_before)/num_genes * 100
perc_gene_usage_cp1  = sum(df_gene_usage_cp1['is_used'])/num_genes * 100
perc_gene_usage_cp2  = sum(df_gene_usage_cp2['is_used'])/num_genes * 100
perc_gene_usage_cp3  = sum(df_gene_usage_cp3_w_cp2['is_used'])/num_genes * 100
perc_gene_usage_cp4 = sum(new_gene_usage_cp4_w_microarray2)/num_genes * 100 #bc one columned df
perc_gene_usage_cp5 = sum(new_gene_usage_cp4_w_microarray3)/num_genes * 100
perc_gene_usage_cp6 = sum(new_gene_usage_cp5_w_microarray3)/num_genes * 100

perc = [perc_gene_usage_basal, perc_gene_usage_basic_conditions, perc_gene_usage_cp1, perc_gene_usage_cp2, perc_gene_usage_cp3, perc_gene_usage_cp4, perc_gene_usage_cp5, perc_gene_usage_cp6]
x_label = ['Basal', 'Basal, Acetate, Rich, Anaerobic', 'Checkpoint 1', 'Checkpoint 2', 'Cp2 + Microarray 1', 'Checkpoint 4 + Microarray2', 'Microarray3+4', 'Checkpoint 5 + Microarray3+4']

In [35]:
# % all gene usage
num_genes = len(df_all_gene_usage)
perc_all_gene_usage_basal = sum(df_all_gene_usage_cp2.basal > 0)/num_genes * 100
perc_all_gene_usage_basic_conditions = sum(df_all_gene_usage['is_used'])/num_genes * 100
perc_all_gene_usage_cp1  = sum(df_all_gene_usage_cp1['is_used'])/num_genes * 100
perc_all_gene_usage_cp2  = sum(df_all_gene_usage_cp2['is_used'])/num_genes * 100
perc_all_gene_usage_cp3_w_cp2  = sum(df_all_gene_usage_cp3_w_cp2['is_used'])/num_genes * 100
perc_all_gene_usage_cp4 = sum(all_gene_usage_cp4_w_microarray2)/num_genes * 100
perc_all_gene_usage_cp5 = sum(all_gene_usage_cp4_w_microarray3)/num_genes * 100
perc_all_gene_usage_cp6 = sum(all_gene_usage_cp5_w_microarray3)/num_genes * 100

perc_all = [perc_all_gene_usage_basal, perc_all_gene_usage_basic_conditions, perc_all_gene_usage_cp1, perc_all_gene_usage_cp2, perc_all_gene_usage_cp3_w_cp2, perc_all_gene_usage_cp4,perc_all_gene_usage_cp5,perc_all_gene_usage_cp6]

In [36]:
# Create a DataFrame
df = pd.DataFrame({
    "Condition": x_label,
    "New Metabolic Gene": perc,
    "All Metabolic Gene": perc_all,
})

# Reshape to long format
df_long = df.melt(id_vars="Condition",
                  value_vars=["New Metabolic Gene", "All Metabolic Gene"],
                  var_name="Gene Group",
                  value_name="Percent Usage")

# Create grouped bar plot
fig = px.bar(
    df_long,
    x="Condition",
    y="Percent Usage",
    color="Gene Group",              # creates a legend
    barmode="group",                 # side-by-side bars
    text="Percent Usage",
    title="New vs All Metabolic Gene Usage by Condition",
    labels={"Percent Usage": "Percent gene usage (%)"},
    color_discrete_map={
        "New Metabolic Gene": "#4C78A8",
        "All Metabolic Gene": "#c26426"
    }
)

# Customize appearance
fig.update_traces(texttemplate='%{text:.2f}%', textposition='outside')
fig.update_layout(
    paper_bgcolor='rgba(255, 255, 255, 0)',
    plot_bgcolor ='rgba(255, 255, 255, 0)',
    yaxis_title="Percent gene usage (%)",
    yaxis=dict(range=[0, 45]),
    xaxis_title=None,
    font_color = 'white',
    xaxis_tickangle=-35,
    uniformtext_minsize=8,
    uniformtext_mode='hide',
    margin=dict(t=50, b=50, l=50, r=50),
    width=1000,   # width in pixels
    height=650
)

fig.show(renderer='browser')
# save
# fig.write_image("notebooks/Heena notebooks/Metabolism_New Genes/out/gene_usage_histogram_cp3.png", scale=5, width=800, height=500)