In [None]:
import pandas as pd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind, mannwhitneyu
import seaborn as sns
from pytfa.io.json import load_json_model
from skimpy.analysis.oracle import *
import os
import sys
from multiprocessing import Pool
from tqdm.auto import tqdm
import seaborn as sns

sys.path.append("..") # Adds higher directory to python modules path.

from utils.remove_outliers import remove_outliers_parallel

import configparser

# Load config.ini (located in ../src relative to the notebook's working directory)
config_path = os.path.abspath(os.path.join('..', 'src', 'config.ini'))
config = configparser.ConfigParser()

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Fail here with the offending path instead of
# with an opaque "KeyError: 'paths'" a few lines further down.
if not config.read(config_path):
    raise FileNotFoundError(f"Could not read configuration file: {config_path}")

# Every entry of the [paths] section is interpreted relative to base_dir
paths_cfg = config['paths']
base_dir = paths_cfg['base_dir']

# Steady-state sample tables (CSV) for the wild-type and mutant models
path_to_samples_WT = os.path.join(base_dir, paths_cfg['path_to_samples_WT'])
path_to_samples_MUT = os.path.join(base_dir, paths_cfg['path_to_samples_MUT'])

# Thermodynamic models (JSON) for the wild-type and mutant
path_to_tmodel_WT = os.path.join(base_dir, paths_cfg['path_to_tmodel_WT'])
path_to_tmodel_MUT = os.path.join(base_dir, paths_cfg['path_to_tmodel_MUT'])

# Directories holding the flux control coefficient (FCC) result files
path_to_fcc_WT = os.path.join(base_dir, paths_cfg['path_to_fcc_WT'])
path_to_fcc_MUT = os.path.join(base_dir, paths_cfg['path_to_fcc_MUT'])


In [None]:
# Load both thermodynamic models and collect their reaction identifiers
model_wt = load_json_model(path_to_tmodel_WT)
model_mut = load_json_model(path_to_tmodel_MUT)
rxns_wt = [reaction.id for reaction in model_wt.reactions]
rxns_mut = [reaction.id for reaction in model_mut.reactions]

# Reactions that exist in the mutant model but not in the wild type
rxns_diff = list(set(rxns_mut) - set(rxns_wt))
print('Reactions in mut but not in wt:', rxns_diff)

# Find which reactions have opposite directionality in mut compared to wt.
# A reaction is flagged when the lower bounds of the two models have opposite
# signs (their product is strictly negative).
diff_bdrs = []
for rxn in model_wt.reactions:
    try:
        rxn_mut = model_mut.reactions.get_by_id(rxn.id)
    except KeyError:
        # The reaction is absent from the mutant model: nothing to compare
        continue
    if rxn_mut.lower_bound * rxn.lower_bound < 0:
        print(rxn.id)
        diff_bdrs.append(rxn.id)

print(len(diff_bdrs), 'reactions have opposite directionality in mut compared to wt')

In [None]:
def _steady_state_ids(fcc_dir, field=4):
    """Return the sorted, unique steady-state indices found in ``fcc_dir``.

    Each file name is split on ``'_'`` and the token at position ``field`` is
    parsed as an integer. Files whose name has too few tokens, or whose token
    is not an integer, are reported and skipped.

    Parameters
    ----------
    fcc_dir : str
        Directory whose file names are scanned.
    field : int, optional
        Zero-based position of the steady-state index among the
        underscore-separated tokens of the file name (default 4).

    Returns
    -------
    numpy.ndarray
        Sorted array of the distinct indices that could be parsed.
    """
    ids = []
    for file in os.listdir(fcc_dir):
        try:
            ids.append(int(file.split('_')[field]))
        except (IndexError, ValueError):
            # Only naming problems are tolerated; any other error (including
            # KeyboardInterrupt) must propagate instead of being swallowed.
            print('The file', file, 'does not follow the naming convention')
    return np.unique(ids)


# Load the steady state samples for each model (first CSV column is the sample index)
samples_WT = pd.read_csv(path_to_samples_WT, index_col=0, header=0)
samples_MUT = pd.read_csv(path_to_samples_MUT, index_col=0, header=0)

# Remove unwanted columns, i.e. every column whose name starts with one of these prefixes
prefix_list = ['DG_', 'DGo_', 'FU_', 'BU_', 'MinFluxVar_', 'LnGamma_', 'LC_']
samples_WT = samples_WT.loc[:, ~samples_WT.columns.str.startswith(tuple(prefix_list))]
samples_MUT = samples_MUT.loc[:, ~samples_MUT.columns.str.startswith(tuple(prefix_list))]

# Remove columns if all the rows are NaN
samples_WT = samples_WT.dropna(axis=1, how='all')
samples_MUT = samples_MUT.dropna(axis=1, how='all')

# Keep only the steady states that were used for the MCA analysis; their
# indices are read from the names of the FCC result files of each model
mut_ss = _steady_state_ids(path_to_fcc_MUT)
samples_MUT = samples_MUT.loc[mut_ss, :]

wt_ss = _steady_state_ids(path_to_fcc_WT)
samples_WT = samples_WT.loc[wt_ss, :]

# Keep only the common columns in the two sample dataframes. The WT column
# order is preserved (first occurrence wins) so the result is reproducible
# from run to run, which a plain set intersection does not guarantee.
mut_columns = set(samples_MUT.columns)
common_cols = list(dict.fromkeys(col for col in samples_WT.columns if col in mut_columns))
samples_WT = samples_WT[common_cols]
samples_MUT = samples_MUT[common_cols]

In [None]:
# Worker: L2 norm of the difference between one WT sample and one MUT sample
def compute_diff(args):
    """Compute the Euclidean distance between a WT row and a MUT row.

    Parameters
    ----------
    args : tuple
        ``(i, j)`` where ``i`` is a row label of the module-level
        ``samples_WT`` and ``j`` a row label of the module-level
        ``samples_MUT``.

    Returns
    -------
    tuple
        ``(pair, norm)`` where ``pair`` is the string ``"<i>_<j>"`` and
        ``norm`` is ``np.linalg.norm`` of ``samples_WT.loc[i] - samples_MUT.loc[j]``.
    """
    wt_label, mut_label = args
    delta = samples_WT.loc[wt_label] - samples_MUT.loc[mut_label]
    # Identify the pair by joining both labels with an underscore
    pair_key = '_'.join((str(wt_label), str(mut_label)))
    return pair_key, np.linalg.norm(delta)

# Main parallelized computation: L2 norm for every (WT sample, MUT sample) pair
if __name__ == "__main__":
    # Create all combinations of indices for WT and MUT
    wt_indices = samples_WT.index
    mut_indices = samples_MUT.index
    all_pairs = [(i, j) for i in wt_indices for j in mut_indices]

    # Use at most 15 workers, but never more than the machine has cores
    # (os.cpu_count() may return None, hence the fallback to 1).
    n_workers = max(1, min(15, os.cpu_count() or 1))

    # Hand the pairs to the workers in batches rather than one at a time:
    # each task is tiny, so per-task dispatch overhead would dominate.
    # imap still yields the results in the order of all_pairs.
    chunksize = max(1, min(1000, len(all_pairs) // (n_workers * 4)))

    # Set up multiprocessing pool
    with Pool(processes=n_workers) as pool:
        # Map the function to the list of pairs, with a progress bar
        results = list(
            tqdm(
                pool.imap(compute_diff, all_pairs, chunksize=chunksize),
                total=len(all_pairs),
            )
        )

    # Convert results to a dictionary keyed by the "<wt>_<mut>" pair label
    diffs = dict(results)

    # Make a dataframe from the dictionary (one row per pair)
    diff_df = pd.DataFrame.from_dict(diffs, orient='index', columns=['L2_norm'])

In [None]:
# Mean and standard deviation of the pairwise L2 norms, used as reference lines
l2_norms = diff_df['L2_norm']
mean_l2_norm = l2_norms.mean()
std_l2_norm = l2_norms.std()

# Histogram (with KDE overlay) of the pairwise L2 norms
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(l2_norms, kde=True, bins=30, color='skyblue', legend=False, ax=ax)

# Dashed vertical markers at the mean and one standard deviation either side
ax.axvline(mean_l2_norm, color='red', linestyle='dashed', linewidth=2, label='Mean')
for offset, line_label in ((std_l2_norm, 'Mean + 1 Std'), (-std_l2_norm, 'Mean - 1 Std')):
    ax.axvline(mean_l2_norm + offset, color='green', linestyle='dashed', linewidth=2, label=line_label)

# Titles, axis labels and tick formatting
ax.set_title('Distribution of L2 Norm Differences', fontsize=18)
ax.set_xlabel('L2 Norm', fontsize=15)
ax.set_ylabel('Number of pairs', fontsize=15)
ax.tick_params(axis='both', labelsize=12)
ax.grid(True, alpha=0.2)
ax.legend()
plt.show()

In [None]:
# Locate the pair whose L2 norm deviates least from the mean L2 norm.
# The absolute deviation is stored as a new 'diff_from_mean' column on diff_df.
mean = diff_df['L2_norm'].mean()
diff_df['diff_from_mean'] = (diff_df['L2_norm'] - mean).abs()
print(diff_df['diff_from_mean'].idxmin())

In [None]:
# Summary statistics of the pairwise L2 norms
l2_column = diff_df['L2_norm']
mean_l2_norm, std_l2_norm, median_l2_norm = (
    l2_column.mean(),
    l2_column.std(),
    l2_column.median(),
)

# Report the statistics
print(f"Mean L2 Norm: {mean_l2_norm}")
print(f"Standard Deviation of L2 Norm: {std_l2_norm}")
print(f"Median L2 Norm: {median_l2_norm}")

In [None]:
# Display all pairs ordered by absolute deviation from the mean L2 norm (smallest first)
diff_df.sort_values('diff_from_mean', ascending=True)