In [1]:
# This script is for making sure that the average number of cells sampled across a variance level converges to the expected baseline proportion

In [54]:
import os
import pandas as pd
from functools import reduce
import re
import json

In [28]:
path = "/space/grp/aadrian/Pseudobulk_Function_Pipeline_HighRes/bin/bulkSimulationOneProfile/data/simulations/0/pbmc_sc_with_metadata_cpm_pc_cell_type_profiles.csv"
proportions_json_path= "/space/grp/aadrian/Pseudobulk_Function_Pipeline_HighRes/bin/bulkSimulationOneProfile/cell_type_proportions.json"

In [29]:
def getBaselineProportion(CTProfile_name:str, proportions_json_path:json) -> dict:
    """Get the list of proper cell type baseline proportions depending on if we are using brain or pbmc

    Args:
        CTProfile_name (str): name of the dataframe of cell type profiles
        proportions_json (json): json that contains baseline cell type profiles for brian or pbmc

    Returns:
        dict: A dict where keys are cell types, and values are their expected baseline proportions
    """
    
    # open json file
    with open(proportions_json_path, 'r') as json_file:
        dict_propostions_json = json.load(json_file)
    
        print(dict_propostions_json)
    
    # Get the correct key
    for key in dict_propostions_json.keys():
        if key == CTProfile_name:
            correct_key = key
    # If no key was found,raise error
    if not correct_key:
        raise ValueError("Key not found")
    
    # Return the proportions of cell types based on our key (which is a tissue type)
    return dict_propostions_json[correct_key]
baseline_proportion = getBaselineProportion(CTProfile_name="brain_sc_with_metadata_cpm_pc_cell_type_profiles.csv", proportions_json_path=proportions_json_path)

    

{'pbmc_sc_with_metadata_cpm_pc_cell_type_profiles.csv': {'B-cells': 0.05, 'Dendritic cells': 0.05, 'Macrophages': 0.2, 'Monocytes': 0.05, 'Nk-cells': 0.1, 'Platelets': 0.05, 'T-cells': 0.5}, 'brain_sc_with_metadata_cpm_pc_cell_type_profiles.csv': {'Astrocytes': 0.05, 'Excitatory neurons': 0.5, 'Inhibitory neurons': 0.3, 'Microglial cells': 0.05, 'Oligodendrocyte precursor cells': 0.05, 'Oligodendrocytes': 0.05}}


In [88]:
#### Get all the simulation files that contain how many cells were subasampled for each simulation

# Get a list of all files in the directory
all_files = os.listdir(path)

pattern = ".*n_sim.*"

# Initialize a list to store matching file names
matching_files = []

# Loop through the files and check if they match the pattern
for file_name in all_files:
    if re.match(pattern, file_name):
        matching_files.append(file_name)
        
        
        
matching_files




['pbmc_sc_with_metadata_cpm_pc_cell_type_profiles_n_sim_81_profiles.csv',
 'pbmc_sc_with_metadata_cpm_pc_cell_type_profiles_n_sim_6_profiles.csv',
 'pbmc_sc_with_metadata_cpm_pc_cell_type_profiles_n_sim_74_profiles.csv',
 'pbmc_sc_with_metadata_cpm_pc_cell_type_profiles_n_sim_27_profiles.csv',
 'pbmc_sc_with_metadata_cpm_pc_cell_type_profiles_n_sim_75_profiles.csv',
 'pbmc_sc_with_metadata_cpm_pc_cell_type_profiles_n_sim_26_profiles.csv',
 'pbmc_sc_with_metadata_cpm_pc_cell_type_profiles_n_sim_7_profiles.csv',
 'pbmc_sc_with_metadata_cpm_pc_cell_type_profiles_n_sim_80_profiles.csv',
 'pbmc_sc_with_metadata_cpm_pc_cell_type_profiles_n_sim_24_profiles.csv',
 'pbmc_sc_with_metadata_cpm_pc_cell_type_profiles_n_sim_77_profiles.csv',
 'pbmc_sc_with_metadata_cpm_pc_cell_type_profiles_n_sim_82_profiles.csv',
 'pbmc_sc_with_metadata_cpm_pc_cell_type_profiles_n_sim_5_profiles.csv',
 'pbmc_sc_with_metadata_cpm_pc_cell_type_profiles_n_sim_4_profiles.csv',
 'pbmc_sc_with_metadata_cpm_pc_cell_type_p

In [11]:
def getMatchingFiles(path):
    
    lofiles = os.listdir(path)


    pattern = ".*n_sim.*"

    # Initialize a list to store matching file names
    matching_files = []

    # Loop through the files and check if they match the pattern
    for file_name in lofiles:
        if re.match(pattern, file_name):
            matching_files.append(f"{path}/{file_name}")
        
    return matching_files
        

In [23]:
lopaths = [os.path.join(path, file) for file in matching_files]
lopaths

['/space/grp/aadrian/Pseudobulk_Function_Pipeline_HighRes/bin/bulkSimulationOneProfile/data/simulations/0/brain_sc_with_metadata_cpm_pc_cell_type_profiles.csv/brain_sc_with_metadata_cpm_pc_cell_type_profiles_n_sim_42_profiles.csv',
 '/space/grp/aadrian/Pseudobulk_Function_Pipeline_HighRes/bin/bulkSimulationOneProfile/data/simulations/0/brain_sc_with_metadata_cpm_pc_cell_type_profiles.csv/brain_sc_with_metadata_cpm_pc_cell_type_profiles_n_sim_11_profiles.csv',
 '/space/grp/aadrian/Pseudobulk_Function_Pipeline_HighRes/bin/bulkSimulationOneProfile/data/simulations/0/brain_sc_with_metadata_cpm_pc_cell_type_profiles.csv/brain_sc_with_metadata_cpm_pc_cell_type_profiles_n_sim_68_profiles.csv',
 '/space/grp/aadrian/Pseudobulk_Function_Pipeline_HighRes/bin/bulkSimulationOneProfile/data/simulations/0/brain_sc_with_metadata_cpm_pc_cell_type_profiles.csv/brain_sc_with_metadata_cpm_pc_cell_type_profiles_n_sim_69_profiles.csv',
 '/space/grp/aadrian/Pseudobulk_Function_Pipeline_HighRes/bin/bulkSimula

In [26]:
lodfs = [pd.read_csv(file_path, index_col=0) for file_path in lopaths]
lodfs

[                                 numbers_to_sample
 Astrocytes                                      50
 Excitatory neurons                             500
 Inhibitory neurons                             300
 Microglial cells                                50
 Oligodendrocyte precursor cells                 50
 Oligodendrocytes                                50,
                                  numbers_to_sample
 Astrocytes                                      50
 Excitatory neurons                             500
 Inhibitory neurons                             300
 Microglial cells                                50
 Oligodendrocyte precursor cells                 50
 Oligodendrocytes                                50,
                                  numbers_to_sample
 Astrocytes                                      50
 Excitatory neurons                             500
 Inhibitory neurons                             300
 Microglial cells                                50
 Oligodend

In [43]:
merged_df = pd.concat(lodfs, axis = 1)
merged_df = merged_df.sum(axis = 1)
merged_df

Astrocytes                          4200
Excitatory neurons                 42000
Inhibitory neurons                 25200
Microglial cells                    4200
Oligodendrocyte precursor cells     4200
Oligodendrocytes                    4200
dtype: int64

In [5]:
def normalize_series_by_sum(series):
    """
    Normalize a pandas Series by dividing each value by the sum of the Series.

    Args:
        series (pandas.Series): The Series to normalize.

    Returns:
        pandas.Series: A new Series with values normalized by the sum.
    """
    # Calculate the sum of the Series
    series_sum = series.sum()

    # Normalize the Series by dividing each value by the sum
    normalized_series = series / series_sum

    return normalized_series


In [54]:
retrieved_proportion = normalize_series_by_sum(merged_df)
retrieved_proportion

Astrocytes                         0.05
Excitatory neurons                 0.50
Inhibitory neurons                 0.30
Microglial cells                   0.05
Oligodendrocyte precursor cells    0.05
Oligodendrocytes                   0.05
dtype: float64

In [60]:
variance_int = path.split("/")[-2]
variance_int

'0'

In [68]:
#os.mkdir("../../data/retrieved_proportions")
#os.mkdir("../../data/retrieved_proportions/brain")
retrieved_proportion.to_csv(f"../../data/retrieved_proportions/brain/var{variance_int}.csv")

In [78]:
def main():
    main_path = "/space/grp/aadrian/Pseudobulk_Function_Pipeline_HighRes/bin/bulkSimulationOneProfile/data/simulations"
    
    lovariances = os.listdir(main_path)
    
    lopaths = [f"{main_path}/{variance}/pbmc_sc_with_metadata_cpm_pc_cell_type_profiles.csv" for variance in lovariances]
    print(lopaths)
    
    
    lolofiles = [getMatchingFiles(path) for path in lopaths]
    print(lolofiles)
    
    dict_variance_average = {}
    
    
    for i, loFiles in enumerate(lolofiles):
        cell_averages = getCellAverage(loFiles)

        dict_variance_average[lovariances[i]] = cell_averages
        
    #print(dict_variance_average)
    return dict_variance_average
    
dict_variance_average = main()

['/space/grp/aadrian/Pseudobulk_Function_Pipeline_HighRes/bin/bulkSimulationOneProfile/data/simulations/0.055/pbmc_sc_with_metadata_cpm_pc_cell_type_profiles.csv', '/space/grp/aadrian/Pseudobulk_Function_Pipeline_HighRes/bin/bulkSimulationOneProfile/data/simulations/0.1/pbmc_sc_with_metadata_cpm_pc_cell_type_profiles.csv', '/space/grp/aadrian/Pseudobulk_Function_Pipeline_HighRes/bin/bulkSimulationOneProfile/data/simulations/0.025/pbmc_sc_with_metadata_cpm_pc_cell_type_profiles.csv', '/space/grp/aadrian/Pseudobulk_Function_Pipeline_HighRes/bin/bulkSimulationOneProfile/data/simulations/0.07/pbmc_sc_with_metadata_cpm_pc_cell_type_profiles.csv', '/space/grp/aadrian/Pseudobulk_Function_Pipeline_HighRes/bin/bulkSimulationOneProfile/data/simulations/0.045/pbmc_sc_with_metadata_cpm_pc_cell_type_profiles.csv', '/space/grp/aadrian/Pseudobulk_Function_Pipeline_HighRes/bin/bulkSimulationOneProfile/data/simulations/0.09/pbmc_sc_with_metadata_cpm_pc_cell_type_profiles.csv', '/space/grp/aadrian/Pseud

In [62]:
merged_df = pd.DataFrame(dict_variance_average)
merged_df

Unnamed: 0,0.055,0.1,0.025,0.07,0.045,0.09,0.03,1,0.5,0.04,...,0.075,0.05,0.095,0.015,0.3,0.9,0.065,0,0.02,0.08
B-cells,0.054177,0.065815,0.051863,0.056325,0.052101,0.061765,0.049043,0.13865,0.117254,0.053145,...,0.05828,0.053821,0.065994,0.05072,0.103943,0.132823,0.053571,0.05,0.04971,0.060704
Dendritic cells,0.05302,0.06893,0.050273,0.053659,0.051064,0.062135,0.05116,0.119747,0.116646,0.052597,...,0.052534,0.050688,0.062569,0.04824,0.11051,0.123145,0.051019,0.05,0.04944,0.054482
Macrophages,0.197262,0.177203,0.199782,0.196092,0.195973,0.183483,0.204331,0.144343,0.168202,0.195786,...,0.186928,0.195612,0.188875,0.20002,0.152189,0.151911,0.198321,0.2,0.19972,0.180698
Monocytes,0.049004,0.061408,0.047823,0.057599,0.047835,0.059436,0.047514,0.128626,0.118267,0.044668,...,0.058923,0.058803,0.060312,0.05031,0.108263,0.132566,0.056459,0.05,0.04962,0.05199
Nk-cells,0.092805,0.095959,0.101286,0.092575,0.103434,0.088852,0.097636,0.146558,0.13647,0.100095,...,0.095348,0.097657,0.094267,0.10276,0.122584,0.13373,0.091937,0.1,0.09962,0.096511
Platelets,0.056047,0.068129,0.049253,0.058399,0.04914,0.071518,0.049412,0.134818,0.114547,0.05722,...,0.058261,0.047068,0.06292,0.04815,0.104322,0.137481,0.054922,0.05,0.05134,0.065669
T-cells,0.497685,0.462555,0.49972,0.485351,0.500454,0.47281,0.500904,0.187257,0.228615,0.496489,...,0.489725,0.49635,0.465062,0.4998,0.298189,0.188343,0.493772,0.5,0.50055,0.489947


In [77]:
def getCellAverage(loFiles):
    lodfs = [pd.read_csv(file_path, index_col=0) for file_path in loFiles]
    
    merged_df = pd.concat(lodfs, axis = 1)
    merged_df = merged_df.sum(axis = 1)

    normalized_series = normalize_series_by_sum(merged_df)
    
    return normalized_series

In [37]:
cor_matrix = merged_df.corr(method = "pearson")
cor_matrix = cor_matrix.sort_index(axis = 0).sort_index(axis = 1)

In [38]:
import plotly.express as px

# Sample data (replace this with your own data)
data = [
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9]
]

# Create a heatmap using Plotly Express
fig = px.imshow(cor_matrix)

# Show the heatmap
fig.show()


In [46]:
melted = merged_df.reset_index().melt(id_vars=['index'])
melted = melted.sort_values("variable")
melted


Unnamed: 0,index,variable,value
166,Platelets,0,0.050000
162,Dendritic cells,0,0.050000
163,Macrophages,0,0.200000
164,Monocytes,0,0.050000
165,Nk-cells,0,0.100000
...,...,...,...
54,Platelets,1,0.134818
53,Nk-cells,1,0.146558
52,Monocytes,1,0.128626
49,B-cells,1,0.138650


In [51]:
fig = px.line(melted, x = 'variable', y = 'value', color = 'index')

fig.update_layout(title = "Average Percent Composition of CTs Across Variance Levels")

# Update x-axis and y-axis labels
fig.update_xaxes(title_text='Variance')
fig.update_yaxes(title_text='Average Percent Composition')
fig.update_layout(legend_title_text='Cell Types (CTs)')

In [41]:
merged_df

Unnamed: 0,0.055,0.1,0.025,0.07,0.045,0.09,0.03,1,0.5,0.04,...,0.075,0.05,0.095,0.015,0.3,0.9,0.065,0,0.02,0.08
B-cells,0.054177,0.065815,0.051863,0.056325,0.052101,0.061765,0.049043,0.13865,0.117254,0.053145,...,0.05828,0.053821,0.065994,0.05072,0.103943,0.132823,0.053571,0.05,0.04971,0.060704
Dendritic cells,0.05302,0.06893,0.050273,0.053659,0.051064,0.062135,0.05116,0.119747,0.116646,0.052597,...,0.052534,0.050688,0.062569,0.04824,0.11051,0.123145,0.051019,0.05,0.04944,0.054482
Macrophages,0.197262,0.177203,0.199782,0.196092,0.195973,0.183483,0.204331,0.144343,0.168202,0.195786,...,0.186928,0.195612,0.188875,0.20002,0.152189,0.151911,0.198321,0.2,0.19972,0.180698
Monocytes,0.049004,0.061408,0.047823,0.057599,0.047835,0.059436,0.047514,0.128626,0.118267,0.044668,...,0.058923,0.058803,0.060312,0.05031,0.108263,0.132566,0.056459,0.05,0.04962,0.05199
Nk-cells,0.092805,0.095959,0.101286,0.092575,0.103434,0.088852,0.097636,0.146558,0.13647,0.100095,...,0.095348,0.097657,0.094267,0.10276,0.122584,0.13373,0.091937,0.1,0.09962,0.096511
Platelets,0.056047,0.068129,0.049253,0.058399,0.04914,0.071518,0.049412,0.134818,0.114547,0.05722,...,0.058261,0.047068,0.06292,0.04815,0.104322,0.137481,0.054922,0.05,0.05134,0.065669
T-cells,0.497685,0.462555,0.49972,0.485351,0.500454,0.47281,0.500904,0.187257,0.228615,0.496489,...,0.489725,0.49635,0.465062,0.4998,0.298189,0.188343,0.493772,0.5,0.50055,0.489947
