In [1]:
import sys
import pandas as pd
import numpy as np

sys.path.append('/home/yuliya/repos/cosybio/FedProt/evaluation_utils/')
from evaluation import evaluation_func as fp_eval

from collections import OrderedDict

In [2]:
# Colour assigned to each method, grouped under the "Methods" key.
# Insertion order is kept, so the methods always appear in this sequence.
color_dict = OrderedDict(
    Methods=OrderedDict(
        [
            ("FedProt", "black"),
            ("Fisher", "#E69F00"),
            ("Stouffer", "#D44400"),
            ("REM", "#009E73"),
            ("RankProd", "#2E5EAA"),
        ]
    )
)

# Read results

In [3]:
# Load the result tables of both datasets into `dfs`, keyed by dataset label.
root_dir = "/home/yuliya/repos/cosybio/FedProt/evaluation/bacterial/"

dfs = {}

# (label used as key in `dfs`, results folder relative to root_dir)
for label, results_subdir in (
    ("Balanced", "balanced/results"),
    ("Imbalanced", "imbalanced/results"),
):
    workdir = root_dir + results_subdir
    df = fp_eval.read_results(
        workdir,
        deqms_name="/central_res.tsv",
        fedprot_name="/DPE.csv",
    )
    dfs[label] = df

# Quick sanity check on the table that was loaded last.
print(dfs["Imbalanced"].shape)
dfs["Imbalanced"].head(4)

Results loaded from /home/yuliya/repos/cosybio/FedProt/evaluation/bacterial/balanced/results with 2242 genes. Adj.p-values were not log-transformed.
Results loaded from /home/yuliya/repos/cosybio/FedProt/evaluation/bacterial/imbalanced/results with 2223 genes. Adj.p-values were not log-transformed.


(2223, 12)


Unnamed: 0,pv_DEqMS,lfc_DEqMS,pv_FedProt,lfc_FedProt,lfc_Fisher,pv_Fisher,lfc_REM,pv_REM,pv_Stouffer,lfc_Stouffer,pv_RankProd,lfc_RankProd
P00350,1.452389e-10,0.18982,1.209438e-10,0.18982,0.167429,3.008376e-11,0.196547,3.6263770000000003e-22,1.969299e-08,0.167429,0.231855,0.167429
P00363,0.05886478,-0.121199,0.05508612,-0.121199,-0.099338,0.483314,-0.067334,0.3365202,0.8768886,-0.099338,0.649956,-0.099338
P00370,2.272542e-14,0.482349,1.628821e-14,0.482349,0.468147,1.198634e-10,0.471281,2.8638e-12,8.617205e-11,0.468147,0.018845,0.468147
P00393,0.1292024,0.052034,0.1233822,0.052034,0.02022,0.5957304,0.027907,0.4032698,0.7548646,0.02022,0.531348,0.02022


# Deviations in the results of decentralized methods

First for the balanced dataset.  
Adjusted p-values:
- min diff (supplementary)
- mean diff
- max diff
- r
- rho (ρ)

log2FC: the same statistics, for the supplementary.
  




## stats tables

In [4]:
def _basic_stats(results, column_name):
    """Call fp_eval.calc_stats on `results` for the columns starting with `column_name`.

    The thresholds, the list of statistics and the list of methods are the fixed
    settings of this cell; the lists are rebuilt on every call.
    """
    return fp_eval.calc_stats(
        results,
        lfc_thr=0.58, adj_pval_thr=0.01,
        stats=["MinDiff", "MeanDiff", "MaxDiff", "r", "ρ"],
        methods=["FedProt", "Fisher", "Stouffer", "REM", "RankProd"],
        column_name=column_name,
        top_genes=-1,
    )


for label in dfs:
    # p-value statistics first, log-fold-change statistics second
    pval_basic_stats = _basic_stats(dfs[label], "pv_")
    logfc_basic_stats = _basic_stats(dfs[label], "lfc_")

    # the output folders are named after the lower-cased dataset label
    dataset = label.lower()
    pval_basic_stats.to_csv(f'{root_dir}/{dataset}/results_tables/pval_basic_stats.csv')
    logfc_basic_stats.to_csv(f'{root_dir}/{dataset}/results_tables/logfc_basic_stats.csv')

    # plain-text report of both tables
    print(f'Dataset: {dataset}')
    for heading, table in (
        ("\tLogFC basic stats:", logfc_basic_stats),
        ("\tP-value basic stats:", pval_basic_stats),
    ):
        print(heading)
        print(table)

Calculating corrs. Using p-vals - not log-transformed.
Correlations computed for all genes from pv_ columns.
Correlations computed for all genes from lfc_ columns.
Calculating corrs. Using p-vals - not log-transformed.
Correlations computed for all genes from pv_ columns.
Correlations computed for all genes from lfc_ columns.


Dataset: balanced
	LogFC basic stats:
               MinDiff      MeanDiff       MaxDiff         r         ρ
FedProt   0.000000e+00  1.266807e-14  6.039613e-14  1.000000  1.000000
Fisher    2.025490e-07  3.151856e-02  5.648453e-01  0.997119  0.995851
Stouffer  2.025490e-07  3.151856e-02  5.648453e-01  0.997119  0.995851
REM       1.485483e-05  3.477359e-02  5.514332e-01  0.996711  0.994698
RankProd  2.025490e-07  3.151856e-02  5.648453e-01  0.997119  0.995851
	P-value basic stats:
                MinDiff  MeanDiff   MaxDiff         r         ρ
FedProt   6.733165e-108  0.000779  0.011238  0.999973  0.999995
Fisher     7.520537e-96  0.060850  0.999992  0.367356  0.947161
Stouffer   3.759399e-97  0.067970  0.999990  0.394224  0.954391
REM       1.062029e-107  0.075422  0.950243  0.705698  0.879489
RankProd   1.413735e-12  0.210913  0.774153  0.490878  0.868676
Dataset: imbalanced
	LogFC basic stats:
               MinDiff      MeanDiff       MaxDiff         r         ρ
FedProt   0.000000e

In [5]:
# Same statistics as above, but computed on -log10 transformed p-values.
# evaluation_utils offers no helper for the transformation, so it is done by hand here.
for label in dfs:
    log_pvals = dfs[label].copy()

    for method in ["DEqMS", "FedProt", "Fisher", "Stouffer", "REM", "RankProd"]:
        column = f"pv_{method}"
        raw = log_pvals[column]
        # zeros cannot be log-transformed: substitute the smallest non-zero value of the column
        smallest_nonzero = raw.replace(0, np.nan).min()
        log_pvals[column] = -np.log10(raw.replace(0, smallest_nonzero))

    stats_options = dict(
        lfc_thr=0.58,
        adj_pval_thr=0.01,
        stats=["MinDiff", "MeanDiff", "MaxDiff", "r", "ρ"],
        methods=["FedProt", "Fisher", "Stouffer", "REM", "RankProd"],
        column_name="pv_",
        top_genes=-1,
    )
    pval_basic_stats = fp_eval.calc_stats(log_pvals, **stats_options)

    # the output folder is named after the lower-cased dataset label
    dataset = label.lower()
    pval_basic_stats.to_csv(f'{root_dir}/{dataset}/results_tables/log10pval_basic_stats.csv')

    print(f'Dataset: {dataset}')
    print("\tP-value basic stats:")
    print(pval_basic_stats)

Calculating corrs. Using p-vals - log-transformed.
Correlations computed for all genes from pv_ columns.
Calculating corrs. Using p-vals - log-transformed.
Correlations computed for all genes from pv_ columns.


Dataset: balanced
	P-value basic stats:
               MinDiff   MeanDiff     MaxDiff         r         ρ
FedProt   1.808478e-09   0.106685    0.667432  0.999996  0.999995
Fisher    2.521811e-04   3.677046   25.466432  0.974007  0.947636
Stouffer  3.178318e-03   3.281332   26.379490  0.980632  0.954797
REM       3.261655e-03  15.762635  258.189408  0.731352  0.879487
RankProd  1.021528e-03  16.668084  101.154049  0.773729  0.869206
Dataset: imbalanced
	P-value basic stats:
           MinDiff   MeanDiff     MaxDiff         r         ρ
FedProt   0.000002   0.097361    0.725899  0.999992  0.999993
Fisher    0.002489   2.308141   17.004740  0.970069  0.935209
Stouffer  0.001169   2.137610   17.942440  0.976551  0.937949
REM       0.000402  12.943510  259.289782  0.754849  0.880833
RankProd  0.001159   9.407645   59.997642  0.757345  0.826258


In [6]:
# RMSE per method on the pv_ columns of the balanced dataset
# (the table in `dfs` is passed as loaded, without any transformation in this cell).
rmse_options = dict(
    lfc_thr=0.58,
    adj_pval_thr=0.01,
    stats=["RMSE"],
    methods=["FedProt", "Fisher", "Stouffer", "REM", "RankProd"],
    column_name="pv_",
    top_genes=-1,
)
fp_eval.calc_stats(dfs["Balanced"], **rmse_options)

Calculating RMSE. Using p-vals - not log-transformed.
RMSE computed for all genes from pv_ columns.


Unnamed: 0,RMSE
FedProt,0.00198
Fisher,0.180861
Stouffer,0.190047
REM,0.177863
RankProd,0.279386


In [7]:
# RMSE per method on the pv_ columns of the imbalanced dataset
# (the table in `dfs` is passed as loaded, without any transformation in this cell).
imbalanced_results = dfs["Imbalanced"]
fp_eval.calc_stats(
    imbalanced_results,
    column_name="pv_",
    stats=["RMSE"],
    methods=["FedProt", "Fisher", "Stouffer", "REM", "RankProd"],
    lfc_thr=0.58,
    adj_pval_thr=0.01,
    top_genes=-1,
)

Calculating RMSE. Using p-vals - not log-transformed.
RMSE computed for all genes from pv_ columns.


Unnamed: 0,RMSE
FedProt,0.003517
Fisher,0.193432
Stouffer,0.21364
REM,0.186199
RankProd,0.266898


### correlation plots

In [None]:
# Build -log10 transformed copies of the result tables for the correlation plots.
# The frames in `dfs` are copied first, so the original tables stay untouched.
PV_FLOOR = 1e-300  # substitute for exact zeros so that -log10 stays finite

# Only these p-value columns are transformed: the DEqMS reference and the plotted
# methods. pv_FedProt is left as it is; FedProt is not among the methods plotted below.
plot_methods = ["Fisher", "Stouffer", "REM", "RankProd"]
log_pv_columns = ["pv_" + m for m in ["DEqMS"] + plot_methods]

log_dfs = {}
for k in dfs:
    log_df = dfs[k].copy()
    for column in log_pv_columns:
        # Replace zeros in this p-value column only, so that exact zeros in
        # other columns (e.g. lfc_*) are not rewritten.
        log_df[column] = -np.log10(log_df[column].replace(0, PV_FLOOR))
    log_dfs[k] = log_df


# NOTE(review): `plt_results` is neither defined nor imported in the cells shown here;
# confirm where it lives (possibly the fp_eval module) before running this cell.
# The dataset labels are the keys of log_dfs; presumably plt_results uses them to
# look up the frames in the dict it receives — TODO confirm.
corrs = plt_results(log_dfs, text = "", colors = color_dict,
                    datasets=list(log_dfs), methods=plot_methods)
# plt.savefig("/home/yuliya/repos/cosybio/FedDEqMS/data/04_evaluation/plots/" + "Fig3B_prototype.png", dpi=1200)
corrs