### Evaluation of DSB repair models

In [1]:
import os
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
#############################################
def is_Exist_file(path):
    """Remove the file at ``path`` when it exists; do nothing otherwise.

    Despite its name, this helper does not report whether the file exists:
    it returns ``None`` in every case.
    """
    import os
    # Guard clause: nothing to clean up when the path is absent.
    if not os.path.exists(path):
        return
    os.remove(path)


def mkdir(path):
    """Create the directory ``path`` (including parents) unless it exists.

    Surrounding whitespace and trailing backslashes are stripped from
    ``path`` first. A status message is printed in both cases: one when the
    directory is created, another when it was already present.
    """
    import os
    # Normalise: drop leading/trailing whitespace, then trailing "\" characters.
    path = path.strip().rstrip("\\")
    if os.path.exists(path):
        # Already present: report it and leave the file system untouched.
        print(path + ' 目录已存在')
        return
    os.makedirs(path)
    print(path + ' 创建成功')


## Root path of the directory tree to traverse
## Each directory path is joined with the file name to form the full path
def walk(path):
    """Return the full paths of all files found under ``path``.

    The tree rooted at ``path`` is traversed with ``os.walk``; every file
    name is joined with the directory it was found in. Returns ``-1`` when
    ``path`` does not exist, otherwise a (possibly empty) list of paths.
    """
    import os
    if not os.path.exists(path):
        return -1
    return [
        os.path.join(folder, filename)
        for folder, _subdirs, filenames in os.walk(path)
        for filename in filenames
    ]
#############################################

In [3]:
## Get DSB Repair Modeling plot data 
def get_all_models_plot_data(data_dir):
    """Load and stack the evaluation CSVs for the K562- and Jurkat-based runs.

    Files are looked up under ``data_dir`` as
    ``Self-<cell_line>/eval_<data_label>_Test_by_<model_label>_based_on_<cell_line>_labels.csv``:

    * for ``cell_line='K562'``, every combination of the three data labels
      and the four model labels;
    * for ``cell_line='Jurkat'``, every data label with the ``Jurkat`` model.

    Files that do not exist are skipped. Each loaded table gets two extra
    columns, ``data_label`` and ``model_label``, identifying its origin.

    Parameters
    ----------
    data_dir : str
        Directory containing the ``Self-K562`` / ``Self-Jurkat`` folders.

    Returns
    -------
    pandas.DataFrame
        All loaded tables concatenated row-wise with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If none of the expected CSV files exists under ``data_dir``.
    """
    data_label_list = ['Self-K562', 'ForeCasT_Lindel', 'Self-Jurkat']
    model_list = ['K562', 'Lindel', 'ForeCasT', 'Jurkat']
    # (cell_line, data_label, model_label) in the order the rows are stacked:
    # first all K562-based evaluations, then the Jurkat DSB repair model.
    combos = [('K562', data_label, model_label)
              for data_label in data_label_list
              for model_label in model_list]
    combos += [('Jurkat', data_label, 'Jurkat') for data_label in data_label_list]

    data_list = []
    for cell_line, data_label, model_label in combos:
        data_path = os.path.join(
            data_dir,
            'Self-%s' % cell_line,
            'eval_%s_Test_by_%s_based_on_%s_labels.csv' % (data_label, model_label, cell_line),
        )
        if not os.path.exists(data_path):
            continue
        data = pd.read_csv(data_path)
        data['data_label'] = data_label
        data['model_label'] = model_label
        data_list.append(data)

    # pd.concat([]) fails with an opaque "No objects to concatenate";
    # raise the same exception type with an actionable message instead.
    if not data_list:
        raise ValueError('No evaluation CSV files found under %r' % (data_dir,))

    data = pd.concat(data_list, axis=0)
    data.reset_index(drop=True, inplace=True)
    return data


def get_one_cell_line_models_plot_data(cell_line, data_dir):
    """Load and stack the evaluation CSVs produced for one cell line.

    For every combination of data label and model label, the file
    ``Self-<cell_line>/eval_<data_label>_Test_by_<model_label>_based_on_<cell_line>_labels.csv``
    is looked up under ``data_dir``. Missing files are skipped. Each loaded
    table gets two extra columns, ``data_label`` and ``model_label``.

    Parameters
    ----------
    cell_line : str
        Cell line name used in the folder and file names (e.g. ``'K562'``).
    data_dir : str
        Directory containing the ``Self-<cell_line>`` folder.

    Returns
    -------
    pandas.DataFrame
        All loaded tables concatenated row-wise with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If none of the expected CSV files exists under ``data_dir``.
    """
    data_label_list = ['Self-K562', 'ForeCasT_Lindel', 'Self-Jurkat']
    model_list = ['K562', 'Jurkat', 'Lindel', 'ForeCasT']
    data_list = []
    for data_label in data_label_list:
        for model_label in model_list:
            data_path = os.path.join(
                data_dir,
                'Self-%s' % cell_line,
                'eval_%s_Test_by_%s_based_on_%s_labels.csv' % (data_label, model_label, cell_line),
            )
            if not os.path.exists(data_path):
                continue
            data = pd.read_csv(data_path)
            data['data_label'] = data_label
            data['model_label'] = model_label
            data_list.append(data)

    # pd.concat([]) fails with an opaque "No objects to concatenate";
    # raise the same exception type with an actionable message instead.
    if not data_list:
        raise ValueError(
            'No evaluation CSV files found for cell line %r under %r' % (cell_line, data_dir)
        )

    data = pd.concat(data_list, axis=0)
    data.reset_index(drop=True, inplace=True)
    return data

In [4]:
## compute statistics
def compute_one_way_ANOVA_Turkey_test(data, data_label, y='pearson'):
    """Run a one-way ANOVA and Tukey HSD post-hoc test across models.

    Rows of ``data`` whose ``data_label`` column equals ``data_label`` are
    grouped by their ``model_label`` column, and the values of column ``y``
    are compared between groups.

    ("Turkey" in the function name is a historical misspelling of "Tukey";
    the name is kept so existing callers continue to work.)

    Parameters
    ----------
    data : pandas.DataFrame
        Table with at least the columns ``data_label``, ``model_label``
        and ``y``.
    data_label : str
        Value of ``data_label`` selecting the rows to analyse.
    y : str, default 'pearson'
        Name of the metric column to compare.

    Returns
    -------
    pandas.DataFrame
        The pairwise Tukey HSD summary table (alpha = 0.05), one row per
        pair of groups. The one-way ANOVA F statistic and p-value are
        attached as ``df.attrs['anova_fvalue']`` and
        ``df.attrs['anova_pvalue']``.

    Raises
    ------
    ValueError
        If fewer than two distinct ``model_label`` groups are selected.
    """
    subset = data.loc[data['data_label'] == data_label, :]
    model_labels = list(subset['model_label'].unique())
    # Both tests compare groups, so at least two are needed. Fail early with
    # a message naming the offending selection.
    if len(model_labels) < 2:
        raise ValueError(
            'Need at least 2 model_label groups for data_label %r, found %d'
            % (data_label, len(model_labels))
        )

    from scipy import stats
    from statsmodels.stats.multicomp import pairwise_tukeyhsd

    # One-way ANOVA over all groups, whatever their number.
    groups = [subset.loc[subset['model_label'] == label, y] for label in model_labels]
    fvalue, pvalue = stats.f_oneway(*groups)

    # Tukey HSD post-hoc comparison of every pair of groups.
    m_comp = pairwise_tukeyhsd(endog=subset[y], groups=subset['model_label'], alpha=0.05)

    # The summary table's first row holds the column headers.
    table = m_comp.summary().data
    df_comp = pd.DataFrame(data=table[1:], columns=table[0])
    # Expose the ANOVA result as metadata so the returned table keeps its shape.
    df_comp.attrs['anova_fvalue'] = float(fvalue)
    df_comp.attrs['anova_pvalue'] = float(pvalue)
    return df_comp
##############################################################

In [5]:
# Work from the shared data directory; every path below is relative to it.
main_path = "../../data"
os.chdir(main_path)
figsuplix = 'pdf'
save_dir = "./%s/DSB/ANOVA-Turkey_tes"%figsuplix
mkdir(save_dir)


# Load all evaluation tables that use K562 labels.
cell_line = 'K562'
data_dir = './DSB/Prediction'
data = get_one_cell_line_models_plot_data(cell_line, data_dir)

## Run the post-hoc test for every (test set, metric) combination
data_label_list = ['Self-K562', 'ForeCasT_Lindel', 'Self-Jurkat']
y_list = ['pearson', 'spearman', 'symKL', 'MSE']
df_list = []
for data_label in data_label_list:
    for y in y_list:
        df_comp = compute_one_way_ANOVA_Turkey_test(data, data_label, y)
        # Tag each result table with the test set and metric it belongs to.
        df_comp['data_label'] = data_label
        df_comp['metrics'] = y
        df_list.append(df_comp)
## Stack all result tables into one
df_comp = pd.concat(df_list, axis=0)
df_comp.reset_index(drop=True, inplace=True)
## save path
# Join with a path separator so the workbook lands inside save_dir; plain
# string concatenation fused the directory name and the file name.
save_hoc_test_K562_path = os.path.join(save_dir, 'ANOVA-Turkey_test-based_on_K562_category.xlsx')
df_comp.to_excel(save_hoc_test_K562_path, index=False)
df_comp.head()

./pdf/DSB/ANOVA-Turkey_tes 目录已存在


Unnamed: 0,group1,group2,meandiff,p-adj,lower,upper,reject,data_label,metrics
0,ForeCasT,K562,0.2167,0.001,0.2118,0.2217,True,Self-K562,pearson
1,ForeCasT,Lindel,0.135,0.001,0.13,0.1399,True,Self-K562,pearson
2,K562,Lindel,-0.0818,0.001,-0.0867,-0.0769,True,Self-K562,pearson
3,ForeCasT,K562,0.1464,0.001,0.1431,0.1497,True,Self-K562,spearman
4,ForeCasT,Lindel,0.1171,0.001,0.1138,0.1204,True,Self-K562,spearman
