# Purpose

Show that when composition variance is regressed out of bulk co-expression and models are trained on the result, the most impacted GO terms are those with high brain MGC scores.

In [1]:
import pandas as pd
import os 
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import json

### Load GO learnability from Brain with no composition variance

In [2]:
def load_regressed_performance(path: str = "/space/grp/aadrian/Pseudobulk_Function_Pipeline_HighRes/bin/deconvolutingBulk/data/dev/july_boot_29")->pd.DataFrame:
    """Load and stack the per-run ``melted_EGAD.csv`` tables found under ``path``.

    Every immediate entry of ``path`` is treated as a run directory; entries
    that do not contain a ``melted_EGAD.csv`` file are skipped. Each table is
    read with its first column as the index and tagged with an integer
    ``boot`` column identifying the run it came from.

    NOTE(review): the original notes describe these as 62 simulations in which
    composition variance from brain markers was regressed out; that is not
    checked here.

    Args:
        path (str, optional): path to regression performance directory

    Returns:
        pd.DataFrame: row-wise concatenation of all runs, with an added
        ``boot`` column (0, 1, ...) assigned in sorted run-directory order.

    Raises:
        ValueError: if no ``melted_EGAD.csv`` file exists under ``path``.
    """
    # os.listdir returns entries in arbitrary order; sort them so the 'boot'
    # id assigned to each run is reproducible between sessions and machines.
    candidate_files = [
        os.path.join(path, run_dir, "melted_EGAD.csv")
        for run_dir in sorted(os.listdir(path))
    ]
    # Not every entry is guaranteed to hold a results file.
    run_files = [run_file for run_file in candidate_files if os.path.exists(run_file)]
    if not run_files:
        # pd.concat would raise an opaque "No objects to concatenate" ValueError;
        # raise the same exception type with a message that names the directory.
        raise ValueError(f"No melted_EGAD.csv files found under {path!r}")

    frames = [
        pd.read_csv(run_file, index_col=0).assign(boot=boot_id)
        for boot_id, run_file in enumerate(run_files)
    ]
    return pd.concat(frames, axis=0)

# Load every run, then aggregate over repeats: mean AUC per (GO term, type) pair.
# The row index becomes a column called 'index' after reset_index()
# (assumes the CSV index is unnamed -- TODO confirm).
GO_learnabilty_regressed = load_regressed_performance()
GO_learnabilty_regressed = (
    GO_learnabilty_regressed[['auc', 'type']]
    .reset_index()
    .groupby(['index', 'type'])
    .mean()
    .reset_index()
)
# Reshape to wide form: one row per 'index' value, one AUC column per 'type' value.
GO_learnabilty_regressed = (
    GO_learnabilty_regressed[['type', 'auc', 'index']]
    .pivot(index='index', columns='type', values='auc')
    .reset_index()
)
GO_learnabilty_regressed.head()

type,index,markers,random
0,GO:0000045,0.535564,0.547717
1,GO:0000070,0.666578,0.692279
2,GO:0000077,0.566519,0.535777
3,GO:0000079,0.595023,0.547955
4,GO:0000082,0.528514,0.505917


### Load Brain and PBMC MGES Scores

In [3]:
def load_mges_scores(mges_path:str, mgesname:str)->pd.DataFrame:
    """Load an MGES dictionary from JSON into a two-column DataFrame.

    The JSON document must be an object (it is read via ``.items()``); each
    key/value pair becomes one row.

    Args:
        mges_path (str): path to the JSON file holding the dict
        mgesname (str): name of MGES type (Brain or PBMC); used as the name
            of the value column

    Returns:
        pd.DataFrame: column ``GO`` holds the JSON keys (GO terms) and column
        ``mgesname`` holds the corresponding values (the term's MGES score)
    """
    # JSON is UTF-8 by specification; pass the encoding explicitly so the
    # result does not depend on the platform's locale default.
    with open(mges_path, 'r', encoding='utf-8') as json_file:
        scores_by_term = json.load(json_file)

    # One row per (key, value) pair of the loaded dictionary.
    return pd.DataFrame(list(scores_by_term.items()), columns=['GO', mgesname])

# Load one MGES table per tissue; the second argument names the score column.
brain_mges = load_mges_scores(mges_path="../MGES/data/brain_mges.json", mgesname='brainMGES')
pbmc_mges = load_mges_scores(mges_path="../MGES/data/pbmc_mges.json", mgesname='pbmcMGES')

In [4]:
def process(GO_learnabilty_regressed, brain_mges, pbmc_mges):
    """Attach both MGES score tables to the learnability table and add the AUC gap.

    Args:
        GO_learnabilty_regressed: DataFrame with an ``index`` column (joined
            against ``GO``) and ``random`` / ``markers`` columns.
        brain_mges: DataFrame with a ``GO`` column; merged first.
        pbmc_mges: DataFrame with a ``GO`` column; merged second.

    Returns:
        A new DataFrame restricted to rows present in all three inputs
        (pandas' default inner merge), without the ``index`` column and with
        an extra ``random-markers`` column equal to ``random - markers``.
    """
    # Join brain scores on the GO identifier, then PBMC scores on the same key.
    with_brain = GO_learnabilty_regressed.merge(brain_mges, left_on='index', right_on="GO")
    with_both = with_brain.merge(pbmc_mges, on="GO")

    # 'index' duplicates 'GO' after the first merge, so it is dropped.
    scored = with_both.drop(columns='index')

    auc_gap = scored.random - scored.markers
    scored['random-markers'] = auc_gap
    return scored

# Merge both MGES tables into the learnability table and add the 'random-markers' column.
GO_learnabilty_regressed = process(
    GO_learnabilty_regressed=GO_learnabilty_regressed,
    brain_mges=brain_mges,
    pbmc_mges=pbmc_mges,
)

In [5]:
# Pairwise correlation between the two MGES scores and the AUC gap.
GO_learnabilty_regressed[['brainMGES', 'pbmcMGES', 'random-markers']].corr()

Unnamed: 0,brainMGES,pbmcMGES,random-markers
brainMGES,1.0,0.083642,0.044381
pbmcMGES,0.083642,1.0,-0.194196
random-markers,0.044381,-0.194196,1.0


In [6]:
# sns.lmplot(pivoted, x = 'MGES_Value', y = 'random-markers', hue='Sample', legend=None, height=8, aspect=1.5, ci = None)
# # Add title and labels
# plt.title('GO Terms with Highest Brain MGCS Show \n Decreased Learnability Without Composition Effects', fontsize=20)
# plt.xlabel('Marker Gene Content Score (MGCS)', fontsize=16)
# plt.ylabel('Difference in Learnability (AUROC) \n (Random Genes - Marker Gene) ', fontsize=16)

# # Add legend with increased font size
# plt.legend(title='MGCS Type', title_fontsize='14', fontsize='14')

# # Show plot
# plt.show()