In [1]:
# This notebook checks that, at each variance level, the average proportion of cells sampled per cell type converges to the expected baseline proportion

In [21]:
import os
import pandas as pd
from functools import reduce
import re
import json
import plotly.express as px

In [22]:
# Location of one simulation run: the second-to-last component ("0") is read back later as the variance level.
# NOTE: this value is passed to os.listdir() below, so despite the ".csv" suffix it is used as a directory.
path = "/space/grp/aadrian/Pseudobulk_Function_Pipeline_HighRes/bin/bulkSimulationOneProfile/data/testrun/simulations/0/brain_sc_with_metadata_cpm_pc_cell_type_profiles.csv"
# JSON file holding the expected baseline cell type proportions, keyed by cell type profile file name.
proportions_json_path= "/space/grp/aadrian/Pseudobulk_Function_Pipeline_HighRes/bin/bulkSimulationOneProfile/cell_type_proportions.json"

In [23]:
def getBaselineProportion(CTProfile_name: str, proportions_json_path: str) -> dict:
    """Look up the baseline cell type proportions for one cell type profile.

    The JSON file maps a cell type profile file name (one entry for brain, one
    for pbmc) to a dict of ``{cell type: baseline proportion}``.

    Args:
        CTProfile_name (str): Top-level key to look up in the JSON file, e.g.
            "brain_sc_with_metadata_cpm_pc_cell_type_profiles.csv".
        proportions_json_path (str): Path to the JSON file of baseline
            proportions.

    Returns:
        dict: A dict where keys are cell types, and values are their expected
            baseline proportions.

    Raises:
        ValueError: If ``CTProfile_name`` is not a top-level key of the JSON file.
    """
    # Read the whole JSON file; the file handle is closed as soon as it is parsed
    with open(proportions_json_path, 'r') as json_file:
        dict_proportions_json = json.load(json_file)

    print(dict_proportions_json)

    # Direct membership test. The previous key-scanning loop left its result
    # variable unassigned when nothing matched, so a missing key surfaced as
    # UnboundLocalError instead of the intended ValueError.
    if CTProfile_name not in dict_proportions_json:
        raise ValueError(
            f"Key not found: {CTProfile_name!r} "
            f"(available keys: {list(dict_proportions_json)})"
        )

    # Return the proportions of cell types for the requested profile (a tissue type)
    return dict_proportions_json[CTProfile_name]
# Expected baseline proportions for the brain cell type profile
baseline_proportion = getBaselineProportion(
    CTProfile_name="brain_sc_with_metadata_cpm_pc_cell_type_profiles.csv",
    proportions_json_path=proportions_json_path,
)

    

{'pbmc_sc_with_metadata_cpm_pc_cell_type_profiles.csv': {'B-cells': 0.05, 'Dendritic cells': 0.05, 'Macrophages': 0.2, 'Monocytes': 0.05, 'Nk-cells': 0.1, 'Platelets': 0.05, 'T-cells': 0.5}, 'brain_sc_with_metadata_cpm_pc_cell_type_profiles.csv': {'Astrocytes': 0.05, 'Excitatory neurons': 0.5, 'Inhibitory neurons': 0.3, 'Microglial cells': 0.05, 'Oligodendrocyte precursor cells': 0.05, 'Oligodendrocytes': 0.05}}


In [24]:
#### Collect the simulation files that record how many cells were subsampled in each simulation

# Every entry found in the run directory
all_files = os.listdir(path)

# re.match anchors at the start of the name; the leading ".*" lets "n_sim" appear anywhere in it
pattern = ".*n_sim.*"

# Keep only the file names that match the pattern
matching_files = [file_name for file_name in all_files if re.match(pattern, file_name)]

matching_files




FileNotFoundError: [Errno 2] No such file or directory: '/space/grp/aadrian/Pseudobulk_Function_Pipeline_HighRes/bin/bulkSimulationOneProfile/data/testrun/simulations/0/brain_sc_with_metadata_cpm_pc_cell_type_profiles.csv'

In [25]:
def getMatchingFiles(path):
    """List the entries of a directory whose name matches ".*n_sim.*".

    Args:
        path: Directory to scan (passed to os.listdir).

    Returns:
        list: One "<path>/<file name>" string per matching entry, in the order
            os.listdir returned them.
    """
    # Compile once; .match() anchors at the start, the leading ".*" allows "n_sim" anywhere
    n_sim_regex = re.compile(".*n_sim.*")

    return [
        f"{path}/{file_name}"
        for file_name in os.listdir(path)
        if n_sim_regex.match(file_name)
    ]
        

In [26]:
# Turn the bare file names collected above into full paths under `path`
lopaths = list(map(lambda file: os.path.join(path, file), matching_files))
lopaths

NameError: name 'matching_files' is not defined

In [27]:
# Load every matching file, using its first column as the row index
lodfs = []
for file_path in lopaths:
    lodfs.append(pd.read_csv(file_path, index_col=0))
lodfs

NameError: name 'lopaths' is not defined

In [28]:
# Place all files side by side, then add across columns: one total per row label
merged_df = pd.concat(lodfs, axis=1).sum(axis=1)
merged_df

NameError: name 'lodfs' is not defined

In [29]:
def normalize_series_by_sum(series):
    """
    Divide every value of a pandas Series by the Series' total.

    Args:
        series (pandas.Series): Values to rescale; the input is not modified.

    Returns:
        pandas.Series: A new Series, same index, holding value / sum(values).
    """
    total = series.sum()
    return series.div(total)

def getCellAverage(loFiles):
    """Sum the tables stored in several CSV files and normalize the totals.

    Args:
        loFiles: Iterable of CSV paths; each file is read with its first
            column as the row index.

    Returns:
        The per-row totals across all columns of all files, divided by their
        grand total (see normalize_series_by_sum).
    """
    per_file_frames = list(
        map(lambda file_path: pd.read_csv(file_path, index_col=0), loFiles)
    )

    # Side-by-side concat aligns on the row index; the row-wise sum collapses all files
    summed_counts = pd.concat(per_file_frames, axis=1).sum(axis=1)

    return normalize_series_by_sum(summed_counts)


In [30]:
# Rescale the summed counts so that they are expressed as fractions of the total
retrieved_proportion = normalize_series_by_sum(series=merged_df)
retrieved_proportion

Unnamed: 0,0.005,0.75,0.02,0.08,0.085,10,0,0.05,0.075,0.01,...,0.03,0.1,0.045,0.35,0.5,0.07,1000,0.2,0.45,0.035
Astrocytes,0.05009,0.078115,0.04928,0.051005,0.05296,0.16397,0.05,0.04902,0.05092,0.050105,...,0.04933,0.052095,0.04907,0.07445,0.07426,0.05053,0.120805,0.05986,0.070975,0.049545
Excitatory neurons,0.49953,0.365305,0.50106,0.499065,0.501355,0.216805,0.5,0.502035,0.50479,0.500065,...,0.500335,0.499655,0.500405,0.443215,0.44359,0.502315,0.23416,0.48897,0.44727,0.50163
Inhibitory neurons,0.300205,0.30252,0.29964,0.296795,0.297135,0.26049,0.3,0.30033,0.29582,0.299545,...,0.3018,0.29586,0.29925,0.27314,0.254245,0.295135,0.20203,0.295195,0.26707,0.298585
Microglial cells,0.050155,0.081205,0.05006,0.048445,0.048975,0.12027,0.05,0.048715,0.04859,0.05006,...,0.05049,0.04875,0.049865,0.068165,0.060325,0.050565,0.14933,0.05197,0.076005,0.051105
Oligodendrocyte precursor cells,0.049945,0.095185,0.050695,0.051935,0.05031,0.122705,0.05,0.050575,0.048265,0.050175,...,0.04902,0.05463,0.052375,0.068495,0.09533,0.051225,0.138595,0.055165,0.076085,0.05013
Oligodendrocytes,0.050075,0.07767,0.049265,0.052755,0.049265,0.11576,0.05,0.049325,0.051615,0.05005,...,0.049025,0.04901,0.049035,0.072535,0.07225,0.05023,0.15508,0.04884,0.062595,0.049005


In [31]:
# Second-to-last "/"-separated component of `path`, i.e. the variance directory name.
# Despite the variable name, the value is a string.
variance_int = path.rsplit("/", 2)[-2]
variance_int

'0'

In [32]:
# Make sure the output directory exists before writing. exist_ok=True makes the
# cell safe to re-run and replaces the one-off os.mkdir calls that used to be
# commented in and out by hand.
os.makedirs("../../data/retrieved_proportions/brain", exist_ok=True)
retrieved_proportion.to_csv(f"../../data/retrieved_proportions/brain/var{variance_int}.csv")

In [40]:
def main(
    main_path="/space/grp/aadrian/Pseudobulk_Function_Pipeline_HighRes/bin/bulkSimulationOneProfile/data/1004_in_numerator_sqrtvariance/simulations",
    profile_name="brain_sc_with_metadata_cpm_pc_cell_type_profiles.csv",
):
    """Compute the normalized cell type composition for every variance level.

    Args:
        main_path (str): Directory containing one sub-directory per variance
            level. Defaults to the simulation run analysed in this notebook.
        profile_name (str): Name of the directory, inside each variance
            directory, that holds the "n_sim" files. Defaults to the brain
            cell type profile.

    Returns:
        dict: Maps each variance directory name (a string) to the result of
            getCellAverage over that directory's "n_sim" files.
    """
    # Only sub-directories are variance levels; stray files in main_path
    # would otherwise make os.listdir fail inside getMatchingFiles.
    lovariances = [
        variance
        for variance in os.listdir(main_path)
        if os.path.isdir(os.path.join(main_path, variance))
    ]

    lopaths = [f"{main_path}/{variance}/{profile_name}" for variance in lovariances]
    print(lopaths)

    lolofiles = [getMatchingFiles(profile_dir) for profile_dir in lopaths]
    print(lolofiles)

    # Pair each variance with its own file list directly instead of indexing by position
    return {
        variance: getCellAverage(loFiles)
        for variance, loFiles in zip(lovariances, lolofiles)
    }
    
# Run the aggregation: one entry per variance directory, holding what getCellAverage returned for it
dict_variance_average = main()

['/space/grp/aadrian/Pseudobulk_Function_Pipeline_HighRes/bin/bulkSimulationOneProfile/data/1004_in_numerator_sqrtvariance/simulations/0.005/brain_sc_with_metadata_cpm_pc_cell_type_profiles.csv', '/space/grp/aadrian/Pseudobulk_Function_Pipeline_HighRes/bin/bulkSimulationOneProfile/data/1004_in_numerator_sqrtvariance/simulations/0.75/brain_sc_with_metadata_cpm_pc_cell_type_profiles.csv', '/space/grp/aadrian/Pseudobulk_Function_Pipeline_HighRes/bin/bulkSimulationOneProfile/data/1004_in_numerator_sqrtvariance/simulations/0.02/brain_sc_with_metadata_cpm_pc_cell_type_profiles.csv', '/space/grp/aadrian/Pseudobulk_Function_Pipeline_HighRes/bin/bulkSimulationOneProfile/data/1004_in_numerator_sqrtvariance/simulations/0.08/brain_sc_with_metadata_cpm_pc_cell_type_profiles.csv', '/space/grp/aadrian/Pseudobulk_Function_Pipeline_HighRes/bin/bulkSimulationOneProfile/data/1004_in_numerator_sqrtvariance/simulations/0.085/brain_sc_with_metadata_cpm_pc_cell_type_profiles.csv', '/space/grp/aadrian/Pseudob

In [41]:
# One column per variance level (the dict keys); rows come from the index of each stored Series.
# Note: this rebinds `merged_df`, which an earlier cell used for the single-run totals.
merged_df = pd.DataFrame(data=dict_variance_average)
merged_df

Unnamed: 0,0.005,0.75,0.02,0.08,0.085,10,0,0.05,0.075,0.01,...,0.03,0.1,0.045,0.35,0.5,0.07,1000,0.2,0.45,0.035
Astrocytes,0.05009,0.078115,0.04928,0.051005,0.05296,0.16397,0.05,0.04902,0.05092,0.050105,...,0.04933,0.052095,0.04907,0.07445,0.07426,0.05053,0.120805,0.05986,0.070975,0.049545
Excitatory neurons,0.49953,0.365305,0.50106,0.499065,0.501355,0.216805,0.5,0.502035,0.50479,0.500065,...,0.500335,0.499655,0.500405,0.443215,0.44359,0.502315,0.23416,0.48897,0.44727,0.50163
Inhibitory neurons,0.300205,0.30252,0.29964,0.296795,0.297135,0.26049,0.3,0.30033,0.29582,0.299545,...,0.3018,0.29586,0.29925,0.27314,0.254245,0.295135,0.20203,0.295195,0.26707,0.298585
Microglial cells,0.050155,0.081205,0.05006,0.048445,0.048975,0.12027,0.05,0.048715,0.04859,0.05006,...,0.05049,0.04875,0.049865,0.068165,0.060325,0.050565,0.14933,0.05197,0.076005,0.051105
Oligodendrocyte precursor cells,0.049945,0.095185,0.050695,0.051935,0.05031,0.122705,0.05,0.050575,0.048265,0.050175,...,0.04902,0.05463,0.052375,0.068495,0.09533,0.051225,0.138595,0.055165,0.076085,0.05013
Oligodendrocytes,0.050075,0.07767,0.049265,0.052755,0.049265,0.11576,0.05,0.049325,0.051615,0.05005,...,0.049025,0.04901,0.049035,0.072535,0.07225,0.05023,0.15508,0.04884,0.062595,0.049005


In [42]:
# Long format: one row per (cell type, variance level) pair
melted = merged_df.reset_index().melt(id_vars=['index'])
# The variance labels are strings (directory names), so a plain sort is
# lexicographic ("10" would come before "2"). Sort by numeric value instead;
# labels that are not numbers are coerced to NaN and placed last.
melted = melted.sort_values(
    "variable",
    key=lambda labels: pd.to_numeric(labels, errors="coerce"),
)
melted


Unnamed: 0,index,variable,value
36,Astrocytes,0,0.050000
37,Excitatory neurons,0,0.500000
38,Inhibitory neurons,0,0.300000
41,Oligodendrocytes,0,0.050000
40,Oligodendrocyte precursor cells,0,0.050000
...,...,...,...
103,Excitatory neurons,10000,0.259390
105,Microglial cells,10000,0.153865
106,Oligodendrocyte precursor cells,10000,0.160465
107,Oligodendrocytes,10000,0.108785


In [43]:
# One line per cell type, following its value across the variance labels
fig = px.line(melted, x = 'variable', y = 'value', color = 'index')

# Axis titles first, then the figure title and legend title in a single layout update.
# Each update_* call returns the figure, so the chained expression still displays it.
(
    fig
    .update_xaxes(title_text='Variance')
    .update_yaxes(title_text='Average Percent Composition')
    .update_layout(
        title = "Average Percent Composition of CTs Across Variance Levels",
        legend_title_text='Cell Types (CTs)',
    )
)

In [18]:
# Display the per-variance table again.
# NOTE(review): this cell's execution count (18) is lower than the cells above it, so the recorded output presumably comes from an earlier run — re-run after Restart & Run All to confirm.
merged_df

Unnamed: 0,0.07,0.35,0.065,10,10000,1000,0.095,0.45,0.015,0.04,...,0.06,0.02,0.75,0.08,0.025,0.2,1,0.5,0.05,0.055
Astrocytes,0.0737,0.15278,0.09684,0.17676,0.166445,0.20603,0.093775,0.18937,0.056335,0.07169,...,0.09429,0.059885,0.153155,0.115065,0.06139,0.16568,0.1888,0.16029,0.08221,0.06858
Excitatory neurons,0.40693,0.188085,0.379615,0.089845,0.095245,0.11639,0.35296,0.179415,0.490765,0.439055,...,0.39265,0.471145,0.132975,0.36932,0.473685,0.27267,0.14941,0.18324,0.41874,0.410585
Inhibitory neurons,0.23746,0.16144,0.238065,0.152255,0.09967,0.12644,0.206215,0.151595,0.294075,0.249205,...,0.22733,0.28576,0.135025,0.206255,0.282665,0.14519,0.115665,0.124815,0.244,0.23851
Microglial cells,0.080105,0.16876,0.081265,0.263675,0.16192,0.174455,0.1173,0.182995,0.054855,0.07606,...,0.09575,0.057745,0.16707,0.110965,0.06516,0.135215,0.194635,0.20461,0.07255,0.09408
Oligodendrocyte precursor cells,0.09672,0.162445,0.09921,0.16416,0.2517,0.214305,0.10406,0.133015,0.051885,0.070635,...,0.09285,0.066915,0.217545,0.12466,0.06608,0.16673,0.133495,0.180255,0.08811,0.11417
Oligodendrocytes,0.105085,0.16649,0.105005,0.153305,0.22502,0.16238,0.12569,0.16361,0.052085,0.093355,...,0.09713,0.05855,0.19423,0.073735,0.05102,0.114515,0.217995,0.14679,0.09439,0.074075
