In [60]:
# Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl

# Data Import
PREFIX = '05c_l1e-6_'
C_LumA = pd.read_csv(f"../Outputs/{PREFIX}C_est_LumA.csv", index_col=0)
C_LumB = pd.read_csv(f"../Outputs/{PREFIX}C_est_LumB.csv", index_col=0)
C_Her2 = pd.read_csv(f"../Outputs/{PREFIX}C_est_Her2.csv", index_col=0)
C_Basal = pd.read_csv(f"../Outputs/{PREFIX}C_est_Basal.csv", index_col=0)
c_LumA = pd.read_csv(f"../Outputs/{PREFIX}c_est_LumA.csv", index_col=0)
c_LumB = pd.read_csv(f"../Outputs/{PREFIX}c_est_LumB.csv", index_col=0)
c_Her2 = pd.read_csv(f"../Outputs/{PREFIX}c_est_Her2.csv", index_col=0)
c_Basal = pd.read_csv(f"../Outputs/{PREFIX}c_est_Basal.csv", index_col=0)




In [61]:
C_full_LumA = pd.concat((C_LumA, c_LumA), axis=0)
C_full_LumB = pd.concat((C_LumB, c_LumB), axis=0)
C_full_Her2 = pd.concat((C_Her2, c_Her2), axis=0)
C_full_Basal = pd.concat((C_Basal, c_Basal), axis=0)

In [62]:
c_LumA

Unnamed: 0,TCGA-B6-A408,TCGA-AQ-A04L,TCGA-BH-A0GY,TCGA-BH-A18S,TCGA-BH-A0HL,TCGA-AO-A12C,TCGA-AN-A0FS,TCGA-BH-A1EX,TCGA-A2-A0ET,TCGA-BH-A0GZ,...,TCGA-A7-A0DB.2,TCGA-OL-A66N,TCGA-LL-A5YN,TCGA-AC-A3OD.1,TCGA-AC-A2QI,TCGA-A7-A0DB.3,TCGA-BH-A18M.1,TCGA-E9-A1RF.1,TCGA-E2-A156,TCGA-A1-A0SE
hidden,0.162201,0.083794,0.147353,0.033727,0.206042,0.114673,0.122993,0.147509,0.151753,0.150385,...,0.413042,0.215679,0.088258,0.216437,0.127423,0.064022,0.052253,0.098522,0.222196,0.122901


In [63]:
cell_types = list(C_full_LumA.index)
groups = {"LumA": C_full_LumA, "LumB": C_full_LumB, "Her2": C_full_Her2, "Basal": C_full_Basal}

In [64]:
# Mean and std across samples (columns) for each cell type (row)
means = pd.DataFrame({name: df.mean(axis=1, skipna=True) for name, df in groups.items()})
stds  = pd.DataFrame({name: df.std(axis=1, ddof=1, skipna=True) for name, df in groups.items()})

# Format as "mean ± std"
overview = means.round(3).map(lambda x: f"{x:.3f}").astype(str) + " ± " + stds.round(3).map(lambda x: f"{x:.3f}").astype(str)
overview.index.name = "cell_type"
overview


Unnamed: 0_level_0,LumA,LumB,Her2,Basal
cell_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
B cell,0.043 ± 0.085,0.040 ± 0.093,0.089 ± 0.162,0.084 ± 0.157
CD4 T cell,0.105 ± 0.085,0.064 ± 0.075,0.050 ± 0.069,0.047 ± 0.070
CD8 T cell,0.052 ± 0.068,0.083 ± 0.077,0.057 ± 0.068,0.059 ± 0.078
Endothelial cell,0.077 ± 0.039,0.085 ± 0.044,0.072 ± 0.045,0.051 ± 0.041
Fibroblast,0.097 ± 0.053,0.087 ± 0.050,0.072 ± 0.056,0.065 ± 0.087
Healthy Epithelial,0.258 ± 0.108,0.225 ± 0.097,0.242 ± 0.111,0.290 ± 0.154
Myeloid cell,0.097 ± 0.063,0.087 ± 0.061,0.085 ± 0.055,0.097 ± 0.084
NK cell,0.111 ± 0.077,0.145 ± 0.084,0.133 ± 0.088,0.122 ± 0.087
Perivascular cell,0.018 ± 0.026,0.019 ± 0.023,0.013 ± 0.020,0.018 ± 0.027
hidden,0.142 ± 0.063,0.165 ± 0.064,0.188 ± 0.159,0.166 ± 0.132


In [65]:
from statsmodels.stats.multitest import multipletests
from scipy.stats import kruskal

# List of cell types (assuming row index is the same across all DataFrames)

In [66]:
results = []
for ct in cell_types:
    arrays = []
    for group in groups:
        arrays.append(groups[group].loc[ct].values)
    H, p = kruskal(*arrays)
    results.append([ct, H, p])

results = pd.DataFrame(results, columns=["Cell Type", "H-statistic", "p-value"])




In [67]:
results.set_index("Cell Type", inplace=True)

In [68]:
# multiple testing correction
results["p-value-corrected"] = multipletests(results["p-value"], method="fdr_bh")[1]#.map(lambda x: f"{x:.3g}")
#results = results.sort_values(by="p-value-corrected")

results



Unnamed: 0_level_0,H-statistic,p-value,p-value-corrected
Cell Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
B cell,7.178682,0.06641546,0.07379495
CD4 T cell,110.548202,8.36287e-24,8.36287e-23
CD8 T cell,40.047979,1.040847e-08,2.081694e-08
Endothelial cell,96.618917,8.286424e-21,2.762141e-20
Fibroblast,102.48024,4.551308e-22,2.275654e-21
Healthy Epithelial,33.763671,2.222346e-07,3.70391e-07
Myeloid cell,5.482486,0.1396899,0.1396899
NK cell,31.795677,5.778919e-07,8.255599e-07
Perivascular cell,11.181037,0.01078616,0.0134827
hidden,40.703423,7.558249e-09,1.889562e-08


In [69]:
final_table = pd.concat((overview, results), axis=1)
final_table

Unnamed: 0,LumA,LumB,Her2,Basal,H-statistic,p-value,p-value-corrected
B cell,0.043 ± 0.085,0.040 ± 0.093,0.089 ± 0.162,0.084 ± 0.157,7.178682,0.06641546,0.07379495
CD4 T cell,0.105 ± 0.085,0.064 ± 0.075,0.050 ± 0.069,0.047 ± 0.070,110.548202,8.36287e-24,8.36287e-23
CD8 T cell,0.052 ± 0.068,0.083 ± 0.077,0.057 ± 0.068,0.059 ± 0.078,40.047979,1.040847e-08,2.081694e-08
Endothelial cell,0.077 ± 0.039,0.085 ± 0.044,0.072 ± 0.045,0.051 ± 0.041,96.618917,8.286424e-21,2.762141e-20
Fibroblast,0.097 ± 0.053,0.087 ± 0.050,0.072 ± 0.056,0.065 ± 0.087,102.48024,4.551308e-22,2.275654e-21
Healthy Epithelial,0.258 ± 0.108,0.225 ± 0.097,0.242 ± 0.111,0.290 ± 0.154,33.763671,2.222346e-07,3.70391e-07
Myeloid cell,0.097 ± 0.063,0.087 ± 0.061,0.085 ± 0.055,0.097 ± 0.084,5.482486,0.1396899,0.1396899
NK cell,0.111 ± 0.077,0.145 ± 0.084,0.133 ± 0.088,0.122 ± 0.087,31.795677,5.778919e-07,8.255599e-07
Perivascular cell,0.018 ± 0.026,0.019 ± 0.023,0.013 ± 0.020,0.018 ± 0.027,11.181037,0.01078616,0.0134827
hidden,0.142 ± 0.063,0.165 ± 0.064,0.188 ± 0.159,0.166 ± 0.132,40.703423,7.558249e-09,1.889562e-08


In [70]:
final_table.loc[:,"p-value"] = results["p-value"].astype(np.float64).map(lambda x: f"{x:.3g}")

 '5.78e-07' '0.0108' '7.56e-09']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  final_table.loc[:,"p-value"] = results["p-value"].astype(np.float64).map(lambda x: f"{x:.3g}")


In [71]:
final_table.loc[:,"p-value-corrected"] = results["p-value-corrected"].map(lambda x: f"{x:.3g}")

 '8.26e-07' '0.0135' '1.89e-08']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  final_table.loc[:,"p-value-corrected"] = results["p-value-corrected"].map(lambda x: f"{x:.3g}")


In [72]:
final_table.loc[:,"H-statistic"] = results["H-statistic"].astype(np.float64).map(lambda x: f"{x:.3f}")

 '11.181' '40.703']' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  final_table.loc[:,"H-statistic"] = results["H-statistic"].astype(np.float64).map(lambda x: f"{x:.3f}")


In [73]:
final_table

Unnamed: 0,LumA,LumB,Her2,Basal,H-statistic,p-value,p-value-corrected
B cell,0.043 ± 0.085,0.040 ± 0.093,0.089 ± 0.162,0.084 ± 0.157,7.179,0.0664,0.0738
CD4 T cell,0.105 ± 0.085,0.064 ± 0.075,0.050 ± 0.069,0.047 ± 0.070,110.548,8.36e-24,8.359999999999999e-23
CD8 T cell,0.052 ± 0.068,0.083 ± 0.077,0.057 ± 0.068,0.059 ± 0.078,40.048,1.04e-08,2.08e-08
Endothelial cell,0.077 ± 0.039,0.085 ± 0.044,0.072 ± 0.045,0.051 ± 0.041,96.619,8.290000000000001e-21,2.76e-20
Fibroblast,0.097 ± 0.053,0.087 ± 0.050,0.072 ± 0.056,0.065 ± 0.087,102.48,4.55e-22,2.2800000000000002e-21
Healthy Epithelial,0.258 ± 0.108,0.225 ± 0.097,0.242 ± 0.111,0.290 ± 0.154,33.764,2.22e-07,3.7e-07
Myeloid cell,0.097 ± 0.063,0.087 ± 0.061,0.085 ± 0.055,0.097 ± 0.084,5.482,0.14,0.14
NK cell,0.111 ± 0.077,0.145 ± 0.084,0.133 ± 0.088,0.122 ± 0.087,31.796,5.78e-07,8.26e-07
Perivascular cell,0.018 ± 0.026,0.019 ± 0.023,0.013 ± 0.020,0.018 ± 0.027,11.181,0.0108,0.0135
hidden,0.142 ± 0.063,0.165 ± 0.064,0.188 ± 0.159,0.166 ± 0.132,40.703,7.56e-09,1.89e-08


In [74]:
final_table = final_table.reset_index(names="Cell type")

In [76]:
final_table.to_latex("../Outputs/Table_Kruskal-Wallis.tex", index=False, na_rep='-', column_format='l|c|c|c|c|c|c|c')