In [1]:
# This notebook checks that, at each variance level, the average proportion of cells sampled per cell type converges to the expected baseline proportion

In [39]:
import os
import pandas as pd

import re
import json
import plotly.express as px

In [2]:
# Location of the simulation outputs for the pbmc profile at variance level 0
# (despite the .csv suffix, paths of this form are listed as directories in main()).
path = "/space/grp/aadrian/Pseudobulk_Function_Pipeline_HighRes/bin/bulkSimulationOneProfile/data/simulations/0/pbmc_sc_with_metadata_cpm_pc_cell_type_profiles.csv"
# JSON file that maps each cell type profile file name to its baseline cell type proportions
proportions_json_path= "/space/grp/aadrian/Pseudobulk_Function_Pipeline_HighRes/bin/bulkSimulationOneProfile/cell_type_proportions.json"

In [3]:
def getBaselineProportion(CTProfile_name:str, proportions_json_path:str) -> dict:
    """Look up the baseline cell type proportions for one cell type profile.

    Args:
        CTProfile_name (str): name of the cell type profile file; it must match a top-level key of the JSON exactly
        proportions_json_path (str): path to the JSON file that maps profile names (e.g. brain or pbmc) to baseline proportions

    Returns:
        dict: A dict where keys are cell types, and values are their expected baseline proportions

    Raises:
        ValueError: if CTProfile_name is not a key of the JSON file
    """

    # open json file
    with open(proportions_json_path, 'r') as json_file:
        dict_proportions_json = json.load(json_file)

    print(dict_proportions_json)

    # A direct membership test replaces the original scan over the keys, which left
    # its result variable unassigned when nothing matched and therefore failed with
    # UnboundLocalError instead of the intended ValueError.
    if CTProfile_name not in dict_proportions_json:
        raise ValueError("Key not found")

    # Return the proportions of cell types based on our key (which is a tissue type)
    return dict_proportions_json[CTProfile_name]
# NOTE(review): this looks up the brain profile, while the paths used elsewhere in this
# notebook point at pbmc simulations — confirm which baseline is intended.
baseline_proportion = getBaselineProportion(CTProfile_name="brain_sc_with_metadata_cpm_pc_cell_type_profiles.csv", proportions_json_path=proportions_json_path)

    

{'pbmc_sc_with_metadata_cpm_pc_cell_type_profiles.csv': {'B-cells': 0.05, 'Dendritic cells': 0.05, 'Macrophages': 0.2, 'Monocytes': 0.05, 'Nk-cells': 0.1, 'Platelets': 0.05, 'T-cells': 0.5}, 'brain_sc_with_metadata_cpm_pc_cell_type_profiles.csv': {'Astrocytes': 0.05, 'Excitatory neurons': 0.5, 'Inhibitory neurons': 0.3, 'Microglial cells': 0.05, 'Oligodendrocyte precursor cells': 0.05, 'Oligodendrocytes': 0.05}}


In [7]:
def getMatchingFiles(path):
    """List the simulation files found directly inside a directory.

    Args:
        path (str): directory to scan

    Returns:
        list: "<path>/<file name>" for every entry whose name contains "n_sim",
            sorted by file name so the order is the same on every run
    """
    pattern = ".*n_sim.*"

    # os.listdir returns entries in arbitrary order; sorting makes the result
    # reproducible for callers that depend on file order.
    return [
        f"{path}/{file_name}"
        for file_name in sorted(os.listdir(path))
        if re.match(pattern, file_name)
    ]
        

In [4]:
def normalize_df_by_sum(series):
    """
    Express the values of a pandas Series as fractions of the Series total.

    Args:
        series (pandas.Series): The values to rescale.

    Returns:
        pandas.Series: A new Series in which every value has been divided by the sum of the input.
    """
    total = series.sum()
    return series / total


In [6]:
def getCellAverage(loFiles):
    """Pool the values from several simulation files and express them as proportions.

    Args:
        loFiles (list): paths of CSV files; each is read with its first column as the index

    Returns:
        pandas.Series: per-row totals across all columns of all files, divided by the grand total
    """
    lodfs = [pd.read_csv(file_path, index_col=0) for file_path in loFiles]

    # Place the files side by side (aligned on the index), then total each row
    row_totals = pd.concat(lodfs, axis = 1).sum(axis = 1)

    # The original called `normalize_series_by_sum`, a name that is not defined in
    # the visible notebook; the helper that exists is `normalize_df_by_sum`.
    return normalize_df_by_sum(row_totals)

In [42]:
def normalize_dataframe(df):
    """
    Rescale a pandas DataFrame column by column.

    Parameters:
    df (pd.DataFrame): The DataFrame whose columns are rescaled.

    Returns:
    pd.DataFrame: A new DataFrame in which each value is divided by the total of its column.
    """
    # df.sum() is labelled by column, so the division lines up column by column
    return df / df.sum()

In [43]:
def main(main_path = "/space/grp/aadrian/Pseudobulk_Function_Pipeline_HighRes/bin/bulkSimulationOneProfile/data/simulations",
         variances_of_interest = ("0", "0.1", "1"),
         profile_name = "pbmc_sc_with_metadata_cpm_pc_cell_type_profiles.csv"):
    """Load the simulation files of each variance level and normalize them column by column.

    Args:
        main_path (str): directory that contains one sub-directory per variance level
        variances_of_interest (sequence of str): names of the variance-level sub-directories to load
        profile_name (str): name of the directory, inside each variance level, that holds the simulation files

    Returns:
        dict: maps each variance level to a DataFrame made of all of its simulation
            files placed side by side, with every column divided by its own total

    Raises:
        ValueError: if a variance level has no simulation files
    """
    lopaths = [f"{main_path}/{variance}/{profile_name}" for variance in variances_of_interest]
    print(lopaths)

    lolofiles = [getMatchingFiles(path) for path in lopaths]
    print(lolofiles)

    # Init a dict to hold the merged dataframes
    dict_variances = {}

    for variance_level, lofiles in zip(variances_of_interest, lolofiles):
        print(variance_level)

        if not lofiles:
            raise ValueError(f"No simulation files found for variance level {variance_level}")

        # The files are read here rather than through getDfs, which is only defined
        # in a later cell and is therefore unavailable when the notebook is run top to bottom.
        lodfs = [pd.read_csv(file, index_col=0) for file in lofiles]

        # merge the dfs side by side, then normalize each column
        merged_df = pd.concat(lodfs, axis = 1)
        dict_variances[variance_level] = normalize_dataframe(merged_df)

    print(dict_variances)
    return dict_variances
    
# Build the {variance level: normalized DataFrame} mapping used by the cells below
dict_variances= main()

['/space/grp/aadrian/Pseudobulk_Function_Pipeline_HighRes/bin/bulkSimulationOneProfile/data/simulations/0/pbmc_sc_with_metadata_cpm_pc_cell_type_profiles.csv', '/space/grp/aadrian/Pseudobulk_Function_Pipeline_HighRes/bin/bulkSimulationOneProfile/data/simulations/0.1/pbmc_sc_with_metadata_cpm_pc_cell_type_profiles.csv', '/space/grp/aadrian/Pseudobulk_Function_Pipeline_HighRes/bin/bulkSimulationOneProfile/data/simulations/1/pbmc_sc_with_metadata_cpm_pc_cell_type_profiles.csv']
[['/space/grp/aadrian/Pseudobulk_Function_Pipeline_HighRes/bin/bulkSimulationOneProfile/data/simulations/0/pbmc_sc_with_metadata_cpm_pc_cell_type_profiles.csv/pbmc_sc_with_metadata_cpm_pc_cell_type_profiles_n_sim_67_profiles.csv', '/space/grp/aadrian/Pseudobulk_Function_Pipeline_HighRes/bin/bulkSimulationOneProfile/data/simulations/0/pbmc_sc_with_metadata_cpm_pc_cell_type_profiles.csv/pbmc_sc_with_metadata_cpm_pc_cell_type_profiles_n_sim_34_profiles.csv', '/space/grp/aadrian/Pseudobulk_Function_Pipeline_HighRes/bin

In [12]:
def getDfs(lofiles):
    """Read each CSV file in a list into its own DataFrame.

    Args:
        lofiles (list): paths of the CSV files to read

    Returns:
        list: one DataFrame per path, in input order, with the first CSV column used as the index
    """
    frames = []
    for csv_path in lofiles:
        frames.append(pd.read_csv(csv_path, index_col=0))
    return frames

In [44]:
# Inspect the normalized DataFrame stored for variance level "0"
dict_variances['0']

Unnamed: 0,numbers_to_sample,numbers_to_sample.1,numbers_to_sample.2,numbers_to_sample.3,numbers_to_sample.4,numbers_to_sample.5,numbers_to_sample.6,numbers_to_sample.7,numbers_to_sample.8,numbers_to_sample.9,...,numbers_to_sample.10,numbers_to_sample.11,numbers_to_sample.12,numbers_to_sample.13,numbers_to_sample.14,numbers_to_sample.15,numbers_to_sample.16,numbers_to_sample.17,numbers_to_sample.18,numbers_to_sample.19
Monocytes,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
T-cells,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
Platelets,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
Dendritic cells,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
B-cells,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05
Nk-cells,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,...,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
Macrophages,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,...,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2


In [48]:
def graph(df_variance, var_level:str):
    """Draw one line per row of a DataFrame across its columns and show the figure.

    Args:
        df_variance (pd.DataFrame): values to plot; each row becomes one coloured line
            and each column one position on the x-axis. The DataFrame is not modified.
        var_level (str): variance level inserted into the figure title
    """
    # Work on a copy: assigning to df_variance.columns directly would relabel the
    # caller's DataFrame as a side effect of plotting.
    plot_df = df_variance.copy()

    # Rename columns to their positions
    plot_df.columns = [str(i) for i in range(plot_df.shape[1])]

    # rename_axis guarantees the id column is called 'index' even if the index has a name
    melted = plot_df.rename_axis("index").reset_index().melt(id_vars=['index'])

    # Sort by numeric position; a plain string sort would order "10" before "2"
    melted = melted.sort_values("variable", key=lambda labels: labels.astype(int), kind="stable")

    fig = px.line(melted, x = 'variable', y = 'value', color = 'index')

    fig.update_layout(title = f"Percent Composition Across Simulations with {var_level} Variance")

    # Update x-axis and y-axis labels
    fig.update_xaxes(title_text = "Simulation")
    fig.update_yaxes(title_text='Percent Composition')
    fig.update_layout(legend_title_text='Cell Types (CTs)')

    fig.show()
        
    
# Draw one figure per variance level
for key, df_variance in dict_variances.items():
    graph(df_variance, key)

In [37]:
# NOTE(review): `merged_df` is not assigned at notebook level in any visible cell (only as a
# local name inside functions), so this cell appears to rely on leftover kernel state — confirm
# where it comes from.
# Pairwise Pearson correlation between the columns of merged_df
cor_matrix = merged_df.corr(method = "pearson")
# Order both the rows and the columns of the matrix by label
cor_matrix = cor_matrix.sort_index(axis = 0).sort_index(axis = 1)

In [38]:
import plotly.express as px

# Create a heatmap of the correlation matrix using Plotly Express
fig = px.imshow(cor_matrix)

# Show the heatmap
fig.show()


In [46]:
# Long format: one row per (row label, column label) pair of merged_df, ordered by column label
melted = (
    merged_df
    .reset_index()
    .melt(id_vars=['index'])
    .sort_values("variable")
)
melted


Unnamed: 0,index,variable,value
166,Platelets,0,0.050000
162,Dendritic cells,0,0.050000
163,Macrophages,0,0.200000
164,Monocytes,0,0.050000
165,Nk-cells,0,0.100000
...,...,...,...
54,Platelets,1,0.134818
53,Nk-cells,1,0.146558
52,Monocytes,1,0.128626
49,B-cells,1,0.138650


In [51]:
fig = px.line(melted, x = 'variable', y = 'value', color = 'index')

# Figure title and legend heading
fig.update_layout(
    title = "Average Percent Composition of CTs Across Variance Levels",
    legend_title_text='Cell Types (CTs)',
)

# Axis labels; the last call returns the figure, which the notebook displays as the cell output
fig.update_xaxes(title_text='Variance')
fig.update_yaxes(title_text='Average Percent Composition')

In [41]:
# Display merged_df; in the saved output its rows are cell types and its column labels are variance levels
merged_df

Unnamed: 0,0.055,0.1,0.025,0.07,0.045,0.09,0.03,1,0.5,0.04,...,0.075,0.05,0.095,0.015,0.3,0.9,0.065,0,0.02,0.08
B-cells,0.054177,0.065815,0.051863,0.056325,0.052101,0.061765,0.049043,0.13865,0.117254,0.053145,...,0.05828,0.053821,0.065994,0.05072,0.103943,0.132823,0.053571,0.05,0.04971,0.060704
Dendritic cells,0.05302,0.06893,0.050273,0.053659,0.051064,0.062135,0.05116,0.119747,0.116646,0.052597,...,0.052534,0.050688,0.062569,0.04824,0.11051,0.123145,0.051019,0.05,0.04944,0.054482
Macrophages,0.197262,0.177203,0.199782,0.196092,0.195973,0.183483,0.204331,0.144343,0.168202,0.195786,...,0.186928,0.195612,0.188875,0.20002,0.152189,0.151911,0.198321,0.2,0.19972,0.180698
Monocytes,0.049004,0.061408,0.047823,0.057599,0.047835,0.059436,0.047514,0.128626,0.118267,0.044668,...,0.058923,0.058803,0.060312,0.05031,0.108263,0.132566,0.056459,0.05,0.04962,0.05199
Nk-cells,0.092805,0.095959,0.101286,0.092575,0.103434,0.088852,0.097636,0.146558,0.13647,0.100095,...,0.095348,0.097657,0.094267,0.10276,0.122584,0.13373,0.091937,0.1,0.09962,0.096511
Platelets,0.056047,0.068129,0.049253,0.058399,0.04914,0.071518,0.049412,0.134818,0.114547,0.05722,...,0.058261,0.047068,0.06292,0.04815,0.104322,0.137481,0.054922,0.05,0.05134,0.065669
T-cells,0.497685,0.462555,0.49972,0.485351,0.500454,0.47281,0.500904,0.187257,0.228615,0.496489,...,0.489725,0.49635,0.465062,0.4998,0.298189,0.188343,0.493772,0.5,0.50055,0.489947
