### The Pearson correlation of efficiency among biological repeats

In [1]:
import os
import pandas as pd
import numpy as np
import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# File extension used when saving figures (read by plot_heatmap_correlation).
figsuplix = 'pdf'

In [2]:
def is_Exist_file(path):
    """Remove the file at ``path`` when it is present; otherwise do nothing.

    Despite the name, this function does not report whether the file exists:
    it deletes it via ``os.remove`` and always returns ``None``.
    """
    import os
    if not os.path.exists(path):
        return
    os.remove(path)


def mkdir(path):
    """Create directory ``path`` (with any missing parents) unless it exists.

    Surrounding whitespace and trailing backslashes are stripped from
    ``path`` before use. A status message is printed in either case.
    """
    import os
    # Normalise: trim surrounding whitespace, then any trailing "\" characters.
    target = path.strip().rstrip("\\")
    if os.path.exists(target):
        # Path is already present: create nothing, just report it.
        print(target + ' 目录已存在')
        return
    os.makedirs(target)
    print(target + ' 创建成功')

In [3]:
## reads, barcode cutoff
def data_preprocess(data, reads_cutoff, barcode_cutoff):
    """Filter rows by read/barcode thresholds and keep the key columns.

    Rows are kept when ``reads_num >= reads_cutoff`` and
    ``barcode_num >= barcode_cutoff``. The result contains only the
    ``sgRNA_name``, ``new_mutation`` and ``off-target_eff`` columns and has
    a fresh 0-based index. The input frame is not modified.
    """
    enough_reads = data['reads_num'] >= reads_cutoff
    enough_barcodes = data['barcode_num'] >= barcode_cutoff
    wanted_columns = ['sgRNA_name', 'new_mutation', 'off-target_eff']
    kept = data.loc[enough_reads & enough_barcodes, wanted_columns]
    return kept.reset_index(drop=True)


## computing off-target efficiency correlation between data1 and data2
def copmuting_correlation(data1, data2, reads_cutoff, barcode_cutoff):
    """Correlate off-target efficiency between two datasets.

    Both inputs are filtered with ``data_preprocess`` using the same cutoffs,
    then inner-joined on ``sgRNA_name`` and ``new_mutation`` so that only
    entries present in both are compared.

    Returns:
        A ``(pearson, spearman)`` tuple of correlation coefficients between
        the two ``off-target_eff`` columns of the joined data.
    """
    join_keys = ['sgRNA_name', 'new_mutation']
    left = data_preprocess(data1, reads_cutoff, barcode_cutoff)
    right = data_preprocess(data2, reads_cutoff, barcode_cutoff)
    paired = pd.merge(left, right, how='inner', on=join_keys)
    # The merge suffixes the shared efficiency column with _x (left) / _y (right).
    eff_left = paired['off-target_eff_x']
    eff_right = paired['off-target_eff_y']
    pearson_r = eff_left.corr(eff_right, method='pearson')
    spearman_r = eff_left.corr(eff_right, method='spearman')
    return (pearson_r, spearman_r)


## compute pairwise off-target efficiency correlations between different cell lines
def plot_correlation(data_dict, reads_cutoff, barcode_cutoff):
    """Build pairwise Pearson and Spearman correlation matrices.

    Despite the name, nothing is plotted here: the function only computes the
    matrices. Every pair of datasets in ``data_dict`` (including each dataset
    with itself) is compared with ``copmuting_correlation``.

    Args:
        data_dict: Mapping of label -> DataFrame accepted by
            ``copmuting_correlation``.
        reads_cutoff: Threshold forwarded to ``copmuting_correlation``.
        barcode_cutoff: Threshold forwarded to ``copmuting_correlation``.

    Returns:
        ``(stat_pear, stat_spear)``: two DataFrames whose rows and columns
        are both the labels of ``data_dict`` in insertion order. Column
        ``label1`` / row ``label2`` holds the correlation between those two
        datasets. An empty ``data_dict`` yields two empty DataFrames.
    """
    # Row labels are the same for every column, so build them once instead of
    # re-collecting them on each pass of the outer loop.
    labels = list(data_dict)
    pearson_columns = {}
    spearman_columns = {}
    for label1, data1 in data_dict.items():
        pairs = [
            copmuting_correlation(data1, data2, reads_cutoff, barcode_cutoff)
            for data2 in data_dict.values()
        ]
        pearson_columns[label1] = [pear for pear, _ in pairs]
        spearman_columns[label1] = [spear for _, spear in pairs]
    stat_pear = pd.DataFrame(pearson_columns, index=labels)
    stat_spear = pd.DataFrame(spearman_columns, index=labels)
    return (stat_pear, stat_spear)

In [4]:
## plot correlation heatmap
def plot_heatmap_correlation(corr_data, corr_label, save_dir):
    """Draw, save and show a lower-triangle heatmap of a correlation matrix.

    Args:
        corr_data: 2-D correlation matrix (e.g. a DataFrame returned by
            ``plot_correlation``).
        corr_label: Name of the correlation type; used in the plot title and
            in the output file name.
        save_dir: Directory the figure is written to; created via ``mkdir``
            if needed.

    NOTE(review): the output file name reads ``reads_cutoff``,
    ``barcode_cutoff`` and ``figsuplix`` from module-level globals rather
    than from parameters, so they must be defined before this is called.
    """
    import matplotlib.pyplot as plt
    import seaborn as sns

    plt.figure(figsize=(6.0, 4.0))
    sns.set_style("white")

    # Hide the cells strictly above the diagonal; the diagonal itself and the
    # lower triangle stay visible. The builtin ``bool`` is used because the
    # ``np.bool`` alias no longer exists in current NumPy releases.
    mask = np.triu(np.ones(np.shape(corr_data), dtype=bool), k=1)

    # Diverging colour map for the heatmap.
    cmap = sns.diverging_palette(220, 10, as_cmap=True)

    # corr_data: matrix to draw
    # mask:      cells where the mask is True are not drawn
    # cmap:      colour map
    # square:    draw every cell as a square
    # annot:     write the value inside each cell
    # fmt:       number format of the annotations
    sns.heatmap(corr_data, mask=mask, cmap=cmap, square=True, annot=True,
                fmt='0.2f', vmin=0.0, vmax=1)

    plt.xticks(rotation=90, weight='bold')
    plt.yticks(rotation=0, weight='bold')
    plt.title("The %s correlation"%(corr_label),
              fontsize=12, weight='bold')

    mkdir(save_dir)
    save_path = save_dir + '/3_cell_lines-%s_off-target_correlation_between_cell_lines_r%sb%s.%s'%(
        corr_label, reads_cutoff, barcode_cutoff, figsuplix)
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.show()
####################################################

In [None]:
os