In [2]:
## packages
import os
import random
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from src.utils import get_data, get_colnames
from src.eval import eval_top_n_guides_genewise, eval_top_n_guides_modelwise, calculate_ranking, plot_ranking, create_challengeR_dataset

from datetime import datetime
import warnings
# NOTE(review): installs an "ignore" filter for every warning category, so
# deprecation and numerical warnings are hidden for the rest of the session.
warnings.filterwarnings('ignore')

In [3]:
## set seeds (numpy's global RNG and python's random module share one value)
seed = 5555
np.random.seed(seed)
random.seed(seed)

## get time stamp, formatted as year-month-day without zero padding
now = datetime.now()
date = f"{now.year}-{now.month}-{now.day}"

In [4]:
## files and directories
date_model_trained = "2021-4-13"

# predictions are read from the same directory that is named as model output
output_models = f"/storage/groups/haicu/workspace/crispri/models/{date_model_trained}/"
input_predictions = output_models

# report directories are keyed by the time stamp of this run
output_performance = f"../reports/performance_eval/{date}/"
output_plots = f"../reports/plots_eval/{date}/"

# create both report directories if they do not exist yet
for report_dir in (output_performance, output_plots):
    os.makedirs(os.path.dirname(report_dir), exist_ok=True)

file_data_wang = '../datasets/data_wang.pickle'
file_data_rousset_E18 = '../datasets/data_rousset_E18.pickle'
file_data_rousset_E75 = '../datasets/data_rousset_E75.pickle'

In [5]:
## setup parameters
top_n_guides_list = [1, 3]
p_value_thr = 0.05

# middle part of every dataset name; "orig" and "median-sub" are the
# alternatives that were used for earlier runs
dataset_variant = "rank"

# data set combinations for which prediction files are evaluated
training_sets = [
    "wang",
    "rousset_E18",
    "rousset_E75",
    "wang_rousset_E18",
    "wang_rousset_E75",
    "wang_rousset_E18_rousset_E75",
]
datasets_models_trained = [f"{training_set}_{dataset_variant}_guide" for training_set in training_sets]

models_trained = ["elnet", "GBM", "1DCNN"]

In [6]:
## load data
data_wang = get_data(file_data_wang)
data_rousset_E18 = get_data(file_data_rousset_E18)
data_rousset_E75 = get_data(file_data_rousset_E75)

# sorted "geneid" column of every data set; duplicates are kept
loaded_datasets = (
    ("wang", data_wang),
    ("rousset_E18", data_rousset_E18),
    ("rousset_E75", data_rousset_E75),
)
genes_of_datasets = {name: sorted(frame["geneid"].to_list()) for name, frame in loaded_datasets}

print("number of genes per dataset:")
# number of distinct gene ids per data set, shown as the cell output
{name: len(set(gene_ids)) for name, gene_ids in genes_of_datasets.items()}

number of genes per dataset:


{'wang': 244, 'rousset_E18': 147, 'rousset_E75': 137}

In [7]:
%matplotlib

colnames_predictions = ["log2FC_target","log2FC_predicted","log2FC_original"]
colnames_metrics = ["spearmanR","performance_increase","wilcoxon_p-value"]

# a gene is only evaluated when it has more than this many prediction rows
min_rows_per_gene = 10

# calculate metrics and ranking, once for every entry of top_n_guides_list
for n, top_n_guides in enumerate(top_n_guides_list):

    print("\n---")
    print("calculate metrics and ranking for top " + str(top_n_guides) + " guides")

    # result tables of this pass: one row per dataset, one column per model
    number_sig_p_values_per_model_per_dataset = pd.DataFrame(columns = models_trained, index = datasets_models_trained)
    ranking_per_model_per_dataset = pd.DataFrame(columns = models_trained, index = datasets_models_trained)
    ranking_per_gene_per_model_mean = pd.DataFrame(columns = models_trained, index = datasets_models_trained)

    # challengeR tables, one list entry per dataset; concatenated after the dataset loop
    mean_FC_top_n_guides_per_gene_per_model_per_dataset = []
    mean_mse_per_gene_per_model_per_dataset = []
    spearmanR_per_gene_per_model_per_dataset = []

    for dataset in datasets_models_trained:
        print("dataset: " + dataset)

        # The gene list is chosen by substring match and the order of the checks
        # matters: every dataset name that contains "wang" (the combined ones
        # included) uses the wang genes.
        if "wang" in dataset:
            genes = genes_of_datasets["wang"]
        elif "rousset_E18" in dataset:
            genes = genes_of_datasets["rousset_E18"]
        elif "rousset_E75" in dataset:
            genes = genes_of_datasets["rousset_E75"]
        else:
            # without this branch an unknown name would silently reuse the gene
            # list of the previous dataset
            raise ValueError("cannot determine gene list for dataset: " + dataset)
        genes_unique = sorted(set(genes))

        # assemble all predictions of all models and plot summary plots
        filename_plots = output_plots + dataset + "_summary.pdf"
        predictions_all_models = pd.DataFrame(columns = get_colnames(models_trained, colnames_predictions), index = genes)

        # the context manager closes the PDF even if reading or plotting fails
        with PdfPages(filename_plots) as pp:
            for model in models_trained:

                # the second CSV column becomes the index; it has to carry the same
                # labels as `genes` for the column assignment below to align
                file_predictions = input_predictions + model + "/predictions_" + dataset + ".csv"
                predictions = pd.read_csv(file_predictions, header=0, index_col=1)
                predictions.sort_index(inplace=True)

                # presumably adds the per-model summary pages to the open PDF (TODO confirm in src.eval)
                eval_top_n_guides_modelwise(model, dataset, predictions, top_n_guides_list, pp)

                for colname in colnames_predictions:
                    predictions_all_models[model + "_" + colname] = predictions[colname]

        # calculate metrics per gene; every table starts empty with one row per unique gene
        metrics_per_gene_per_model = pd.DataFrame(columns = get_colnames(models_trained, colnames_metrics), index = genes_unique)
        ranking_per_gene_per_model = pd.DataFrame(columns = models_trained, index = genes_unique)
        mean_FC_top_n_guides_per_gene_per_model = pd.DataFrame(columns = models_trained, index = genes_unique)
        mean_mse_per_gene_per_model = pd.DataFrame(columns = models_trained, index = genes_unique)
        spearmanR_per_gene_per_model = pd.DataFrame(columns = models_trained, index = genes_unique)

        for gene in genes_unique:
            # The list selector always yields a DataFrame. With a scalar label a
            # gene that has a single row comes back as a Series, and shape[0]
            # would then count columns instead of rows.
            predictions_of_gene = predictions_all_models.loc[[gene], :].copy()
            if predictions_of_gene.shape[0] > min_rows_per_gene:
                (
                    metrics_per_gene_per_model,
                    ranking_per_gene_per_model,
                    mean_FC_top_n_guides_per_gene_per_model,
                    mean_mse_per_gene_per_model,
                    spearmanR_per_gene_per_model,
                ) = eval_top_n_guides_genewise(
                    top_n_guides, models_trained, dataset, predictions_of_gene,
                    metrics_per_gene_per_model.copy(), ranking_per_gene_per_model.copy(),
                    mean_FC_top_n_guides_per_gene_per_model.copy(), mean_mse_per_gene_per_model.copy(), spearmanR_per_gene_per_model.copy(),
                    colnames_metrics, gene, output_plots + dataset + "/", plot=True)

        # drop lines of excluded genes (rows that still contain missing values)
        metrics_per_gene_per_model = metrics_per_gene_per_model.dropna()
        ranking_per_gene_per_model = ranking_per_gene_per_model.dropna()
        mean_FC_top_n_guides_per_gene_per_model = mean_FC_top_n_guides_per_gene_per_model.dropna()
        mean_mse_per_gene_per_model = mean_mse_per_gene_per_model.dropna()
        spearmanR_per_gene_per_model = spearmanR_per_gene_per_model.dropna()

        # get number of significant p-values per model and calculate ranking
        ranking_per_model_per_dataset, number_sig_p_values_per_model_per_dataset = calculate_ranking(
            models_trained, dataset, metrics_per_gene_per_model, p_value_thr,
            ranking_per_model_per_dataset.copy(), number_sig_p_values_per_model_per_dataset.copy())
        ranking_per_gene_per_model_mean.loc[dataset, :] = ranking_per_gene_per_model.mean().to_list()

        # create challengeR tables for mean FC of top n guides, mean mse and spearman correlation per gene
        mean_FC_top_n_guides_per_gene_per_model_per_dataset.append(create_challengeR_dataset(mean_FC_top_n_guides_per_gene_per_model, dataset))
        mean_mse_per_gene_per_model_per_dataset.append(create_challengeR_dataset(mean_mse_per_gene_per_model, dataset))
        spearmanR_per_gene_per_model_per_dataset.append(create_challengeR_dataset(spearmanR_per_gene_per_model, dataset))

        # save metrics per dataset
        metrics_per_gene_per_model.to_csv(output_performance + "metrics_" + dataset + "_top_" + str(top_n_guides) +"_guides.csv")

    print(ranking_per_gene_per_model_mean)

    # write challengeR files for mean FC of top n guides per gene
    mean_FC_top_n_guides_per_gene_per_model_per_dataset = pd.concat(mean_FC_top_n_guides_per_gene_per_model_per_dataset)
    mean_FC_top_n_guides_per_gene_per_model_per_dataset.to_csv(output_performance + "challengeR_ranking_top_" + str(top_n_guides) + ".csv", index=False)

    # write challengeR files for mean mse per gene
    # NOTE(review): this file name does not contain top_n_guides, so every pass of
    # the outer loop overwrites the file of the previous pass - confirm that is intended.
    mean_mse_per_gene_per_model_per_dataset = pd.concat(mean_mse_per_gene_per_model_per_dataset)
    mean_mse_per_gene_per_model_per_dataset.to_csv(output_performance + "challengeR_mean_mse.csv", index=False)

    # write challengeR files for spearman correlation per gene
    # NOTE(review): same overwrite behaviour as the mean mse file above.
    spearmanR_per_gene_per_model_per_dataset = pd.concat(spearmanR_per_gene_per_model_per_dataset)
    spearmanR_per_gene_per_model_per_dataset.to_csv(output_performance + "challengeR_spearman.csv", index=False)

    # plot ranking; the PDF is closed even if plotting fails
    filename_plots = output_plots + "ranking_top_" + str(top_n_guides) + ".pdf"
    with PdfPages(filename_plots) as pp:
        plot_ranking(ranking_per_model_per_dataset.copy(), number_sig_p_values_per_model_per_dataset.copy(), top_n_guides, pp)

Using matplotlib backend: agg

---
calculate metrics and ranking for top 1 guides
dataset: wang_rank_guide
dataset: rousset_E18_rank_guide
dataset: rousset_E75_rank_guide
dataset: wang_rousset_E18_rank_guide
dataset: wang_rousset_E75_rank_guide
dataset: wang_rousset_E18_rousset_E75_rank_guide
                                            elnet       GBM     1DCNN
wang_rank_guide                          2.030702  2.008772  1.960526
rousset_E18_rank_guide                   1.934783  2.184783  1.880435
rousset_E75_rank_guide                   1.955556  2.155556  1.888889
wang_rousset_E18_rank_guide              2.065789   1.97807   1.95614
wang_rousset_E75_rank_guide              2.002193  2.046053  1.951754
wang_rousset_E18_rousset_E75_rank_guide  1.991228  1.993421  2.015351

---
calculate metrics and ranking for top 3 guides
dataset: wang_rank_guide
dataset: rousset_E18_rank_guide
dataset: rousset_E75_rank_guide
dataset: wang_rousset_E18_rank_guide
dataset: wang_rousset_E75_rank_guide
d