### Data from patch pipeline

In [1]:
import pandas as pd
import os
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Check whether the file exists and delete it if it does
def is_Exist_file(path):
    """Delete the file at ``path`` if it exists; do nothing when it is absent.

    Args:
        path: Path of the file to remove.

    Raises:
        OSError: If ``path`` exists but cannot be removed (for example when
            it is a directory or permissions are insufficient).
    """
    import os

    # Attempt the removal directly instead of testing os.path.exists() first:
    # a file that vanishes between the check and the removal would otherwise
    # raise FileNotFoundError.
    try:
        os.remove(path)
    except FileNotFoundError:
        pass


def mkdir(path):
    """Create directory ``path`` (with parents) unless the path already exists.

    Leading/trailing whitespace and trailing backslashes are stripped from
    ``path`` before it is used.  A message is printed stating whether the
    directory was created or the path was already present.

    Args:
        path: Directory path as a string.
    """
    import os

    cleaned = path.strip().rstrip("\\")
    if os.path.exists(cleaned):
        # Nothing to create; only report that the path is already there.
        print(cleaned + ' 目录已存在')
        return
    os.makedirs(cleaned)
    print(cleaned + ' 创建成功')

In [3]:
## Functional module 2 -- mutation (repair) length
######################################################################################
## Plot 2: distribution of repair lengths for each mutation type (insertion, deletion, mismatch)
## obtain plot bar data
def obtain_repair_length_data(df):
    """Return the mean per-row percentage of every repair-length category.

    For each row, the columns ``D-1..D-50``, ``I-1..I-50`` and ``M-1..M-50``
    are expressed as a percentage of that row's total over those 150 columns.
    ``other_num`` is divided by the same total, which does not include
    ``other_num`` itself.  The percentages are then averaged over all rows.

    Args:
        df: DataFrame containing the 150 length columns and ``other_num``.
            It is not modified.

    Returns:
        dict with keys ``'I'``, ``'D'``, ``'M'`` and ``'Others'``.  Each value
        is a dict mapping a column name (``'Others'`` for the last one) to its
        mean percentage.

    Raises:
        KeyError: If any of the required columns is missing from ``df``.
    """
    lengths = range(1, 51)
    deletion_cols = ['D-%d' % n for n in lengths]
    insertion_cols = ['I-%d' % n for n in lengths]
    mismatch_cols = ['M-%d' % n for n in lengths]
    length_cols = deletion_cols + insertion_cols + mismatch_cols

    # Per-row denominator: total over the length columns only.
    row_totals = df[length_cols].sum(axis=1)

    # Normalise every length column in one vectorised step.  Multiplying
    # before dividing keeps the arithmetic identical to value * 100 / total.
    percentages = df[length_cols].mul(100).div(row_totals, axis=0)
    others = (df['other_num'] * 100 / row_totals).to_frame('Others')

    return {
        'I': dict(percentages[insertion_cols].mean(axis=0)),
        'D': dict(percentages[deletion_cols].mean(axis=0)),
        'M': dict(percentages[mismatch_cols].mean(axis=0)),
        'Others': dict(others.mean(axis=0)),
    }


def plot_bar_repair_length(data_dict, title, savefig_path):
    """Plot the repair-length distribution as a grouped bar chart and save it.

    Three bar groups are drawn left to right, separated by a one-slot gap:
    deletions (a pooled ``30+`` bin, then 29 down to 1), insertions (1 to 14,
    then a pooled ``15+`` bin) and mismatches (1 to 4, then a pooled ``5+``
    bin).  Pooled bins sum the entries up to length 50.  The plotted values
    are also written to an ``.xlsx`` file that shares the figure's path minus
    its extension.

    Args:
        data_dict: Mapping with keys ``'D'``, ``'I'`` and ``'M'``; each value
            maps ``'<key>-<length>'`` for lengths 1..50 to a number.
        title: Not used; the ``plt.title`` call is disabled.  Kept so that the
            signature stays unchanged.
        savefig_path: Destination of the figure, passed to ``plt.savefig``.

    Returns:
        None.
    """
    import os

    import matplotlib.pyplot as plt
    plt.switch_backend('agg')
    import numpy as np

    plt.rcParams['savefig.dpi'] = 100  # default dpi for saved figures
    plt.rcParams['figure.dpi'] = 100  # figure resolution
    # "seaborn-white" was renamed to "seaborn-v0_8-white" in Matplotlib 3.6 and
    # the old name was removed later; use whichever name this version offers.
    for style_name in ("seaborn-v0_8-white", "seaborn-white"):
        if style_name in plt.style.available:
            plt.style.use(style_name)
            break
    plt.figure(figsize=(9.0, 4.0))

    ## deletions: longest first, so that D-1 sits next to the insertion group
    del_name_list = ['D-30+'] + ['D-%d' % n for n in range(29, 0, -1)]
    del_data_list = [sum(data_dict['D']['D-%d' % n] for n in range(30, 51))]
    del_data_list += [data_dict['D'][name] for name in del_name_list[1:]]

    ## insertions
    inser_name_list = ['I-%d' % n for n in range(1, 15)] + ['I-15+']
    inser_data_list = [data_dict['I'][name] for name in inser_name_list[:-1]]
    inser_data_list.append(sum(data_dict['I']['I-%d' % n] for n in range(15, 51)))

    ## mismatches
    mis_name_list = ['M-%d' % n for n in range(1, 5)] + ['M-5+']
    mis_data_list = [data_dict['M'][name] for name in mis_name_list[:-1]]
    mis_data_list.append(sum(data_dict['M']['M-%d' % n] for n in range(5, 51)))

    ################################################################
    ## save plot data
    groups = [
        (del_name_list, del_data_list),
        (inser_name_list, inser_data_list),
        (mis_name_list, mis_data_list),
    ]
    plot_data = pd.concat(
        [pd.DataFrame({'label': names, 'fracs': values}) for names, values in groups],
        axis=0)
    # splitext removes only a real file extension; splitting on '.' breaks
    # when the file name has no extension but the path contains a dot
    # (e.g. './out/fig').
    save_plot_data_path = os.path.splitext(savefig_path)[0] + '.xlsx'
    plot_data.to_excel(save_plot_data_path, index=False)
    ################################################################

    width = 0.8
    x1 = range(len(del_name_list))
    plt.bar(x1, del_data_list, width=width, fc='y')

    # One empty slot separates consecutive groups.
    x2 = np.arange(len(inser_name_list)) + len(del_name_list) + 1
    plt.bar(x2, inser_data_list, width=width, fc='y')

    x3 = np.arange(len(mis_name_list)) + len(del_name_list) + len(inser_name_list) + 2
    plt.bar(x3, mis_data_list, width=width, fc='y')

    ## axes
    ax = plt.gca()
    ax.xaxis.set_ticks_position("bottom")  # x ticks on the bottom side only
    ax.yaxis.set_ticks_position("left")  # y ticks on the left side only

    # hide the top and right frame lines
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    # Group underlines and captions.  The coordinates are hard-coded for the
    # 30 / 15 / 5 bar layout built above.  The annotation text is passed
    # positionally because its keyword was renamed from ``s`` to ``text``
    # (Matplotlib 3.3) and ``s`` is rejected by later versions.
    # deletions
    plt.annotate('', xy=(-0.5, -0.001), xytext=(29.6, -0.001), weight='bold',
                 arrowprops=dict(arrowstyle='-', color='black'))
    plt.text(15, -1.4, 'Deletions', ha='center', va='bottom', fontsize=10, weight='bold')
    # insertions
    plt.annotate('', xy=(30.4, -0.001), xytext=(45.7, -0.001), weight='bold',
                 arrowprops=dict(arrowstyle='-', color='black'))
    plt.text(37.5, -1.4, 'Insertions', ha='center', va='bottom', fontsize=10, weight='bold')
    # mismatches
    plt.annotate('', xy=(46.4, -0.001), xytext=(52, -0.001), weight='bold',
                 arrowprops=dict(arrowstyle='-', color='black'))
    plt.text(49.5, -1.4, 'Mismatches', ha='center', va='bottom', fontsize=10, weight='bold')

    # x tick labels: the part after the dash of every bar name ('30+', '29', ...)
    x = list(x1) + list(x2) + list(x3)
    xticks = [name.split('-')[1] for name in del_name_list + inser_name_list + mis_name_list]
    name_num_dict = dict(zip(x, xticks))
    # Only a subset of positions is labelled.
    x_new = [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 31, 32, 33, 34, 35, 36, 37, 38,
             39, 41, 43, 45, 47, 48, 49, 50, 51]
    plt.xticks(x_new, [name_num_dict[i] for i in x_new], fontsize=8)
    plt.xlim(-0.5, 52)
    plt.ylim(-1.5, 30)

    # axis captions
    plt.xlabel("Length of gRNA repair", weight='bold', fontsize=13)
    plt.ylabel("Average fraction of\n repair outcome per gRNA (%)", weight='bold', fontsize=13)

    plt.savefig(savefig_path, dpi=300, bbox_inches='tight')
    # NOTE(review): the figure is not closed here; callers that render many
    # figures may want to call plt.close() themselves.
    plt.show()
    
    
## Repair length
def main_repair_length(data, title, savefig_dir, figname):
    """Compute the repair-length statistics of ``data`` and draw the bar chart.

    Args:
        data: Table passed to ``obtain_repair_length_data``.
        title: Forwarded unchanged to ``plot_bar_repair_length``.
        savefig_dir: Output directory; created through ``mkdir`` if needed.
        figname: File name of the figure inside ``savefig_dir``.
    """
    length_stats = obtain_repair_length_data(data)

    # The output directory has to exist before the figure is written.
    mkdir(savefig_dir)
    plot_bar_repair_length(length_stats, title, '/'.join([savefig_dir, figname]))

In [4]:
## Plotting module 2
######################################################################################
## pie of repair type
def pie_repair_type(fracs, labels, explode, yanse, title, savefig_path, startangle=90):
    """Draw a pie chart of ``fracs`` and save it to ``savefig_path``.

    Args:
        fracs: Wedge sizes, passed to ``plt.pie`` as ``x``.
        labels: One label per wedge.
        explode: Per-wedge offset, passed to ``plt.pie``.
        yanse: Wedge colours, passed to ``plt.pie`` as ``colors``.
        title: Not used; the ``plt.title`` call is disabled.
        savefig_path: Destination of the figure, passed to ``plt.savefig``.
        startangle: Start angle of the first wedge, passed to ``plt.pie``.
    """
    import matplotlib.pyplot as plt
    plt.switch_backend('agg')

    plt.figure(figsize=(6, 5))
    plt.axes(aspect=1)  # equal aspect: a circle rather than an ellipse

    # plt.pie arguments:
    #   autopct       format of the percentage written inside each wedge
    #   pctdistance   distance of that percentage text from the centre
    #   labeldistance distance of the outer labels, in units of the radius
    #   startangle    angle at which the first wedge starts
    #   shadow        whether a shadow is drawn under the pie
    # It returns the wedge patches, the outer label texts and the percentage
    # texts.
    wedges, label_texts, pct_texts = plt.pie(
        x=fracs, labels=labels, explode=explode, colors=yanse,
        autopct='%3.2f %%', pctdistance=0.6, labeldistance=1.1,
        startangle=startangle, shadow=False)

    # Same font size for the outer labels and the percentages.
    for text_obj in list(label_texts) + list(pct_texts):
        text_obj.set_fontsize(20)

    plt.axis('equal')
    plt.savefig(savefig_path, dpi=300, bbox_inches='tight')
    plt.show()


## obtain plot pie_data
def obtain_pie_data(df, cols):
    """Return the mean per-row percentage of each column in ``cols``.

    Every row of ``df[cols]`` is scaled so that it sums to 100, then each
    column is averaged over all rows.  In the returned labels, the column
    ``other_num`` is shown as ``'Others'`` wherever it appears in ``cols``.

    Args:
        df: DataFrame containing the columns named in ``cols``.  It is not
            modified.
        cols: Sequence of column names to include, in output order.

    Returns:
        Tuple ``(fracs, labels)``: the mean percentages and the matching
        labels, both lists in the order of ``cols``.

    Raises:
        KeyError: If a name in ``cols`` is not a column of ``df``.
    """
    cols = list(cols)  # accept any sequence and leave the caller's object alone

    row_totals = df[cols].sum(axis=1)
    # Multiply before dividing to keep value * 100 / total arithmetic.
    percentages = df[cols].mul(100).div(row_totals, axis=0)
    means = percentages.mean(axis=0)

    # Relabel by name instead of assuming that 'other_num' is the last entry.
    labels = ['Others' if col == 'other_num' else col for col in cols]
    fracs = [means[col] for col in cols]
    return (fracs, labels)


## main entry point: pie chart of all edited (repair) types
def main_pie_all_edited_type(raw_data, cols, yanse, title, savefig_dir, fig_name, startangle):
    """Compute the pie-chart fractions, save them as xlsx and draw the pie.

    Args:
        raw_data: Table passed to ``obtain_pie_data``.
        cols: Column names passed to ``obtain_pie_data``.
        yanse: Wedge colours forwarded to ``pie_repair_type``.
        title: Forwarded unchanged to ``pie_repair_type``.
        savefig_dir: Output directory; created through ``mkdir`` if needed.
        fig_name: File name of the figure inside ``savefig_dir``.  The data
            file uses the same name with its extension replaced by ``.xlsx``.
        startangle: Start angle forwarded to ``pie_repair_type``.
    """
    import os

    fracs, labels = obtain_pie_data(raw_data, cols)

    ## pie data
    pie_data = pd.DataFrame({'label': labels, 'fracs': fracs})
    # Drop only the final extension.  Cutting at the first dot would map a
    # name such as 'pie.v2.pdf' to 'pie.xlsx', unlike the bar-chart data file,
    # which keeps everything before the last dot.
    save_pie_data_path = savefig_dir + '/' + os.path.splitext(fig_name)[0] + '.xlsx'
    mkdir(savefig_dir)
    pie_data.to_excel(save_pie_data_path, index=False)

    ## plot
    explode = [0.1] * len(fracs)  # every wedge is pulled out by the same offset
    savefig_path = savefig_dir + '/' + fig_name
    pie_repair_type(fracs, labels, explode, yanse, title, savefig_path, startangle)
######################################################################################

In [5]:

def plot_main(stat_data, savefig_dir):
    """Produce both figures for one statistics table.

    The figure file extension is taken from the module-level ``figsuplix``,
    which must be defined before this function is called.

    Args:
        stat_data: Table handed to both plotting entry points.
        savefig_dir: Directory that receives the figures and data files.
    """
    # The spelling of the strings below is kept as is: one of them is part of
    # an output file name.
    shared_title = 'distibution of repair outcome'

    ## plot 1: distribution of repair outcome
    main_repair_length(
        data=stat_data,
        title=shared_title,
        savefig_dir=savefig_dir,
        figname='distibution of repair outcome.%s' % figsuplix,
    )

    ## plot 2: pie repair outcomes
    main_pie_all_edited_type(
        raw_data=stat_data,
        cols=['insertion', 'deletion', 'mismatch', 'other_num'],
        yanse=['pink', 'cornflowerblue', 'tomato', 'dimgray'],
        title=shared_title,
        savefig_dir=savefig_dir,
        fig_name='pie_repair_outcomes.%s' % figsuplix,
        startangle=45,
    )
############################################################

In [6]:
# For DNTTKO-Jurkat
# ``figsuplix`` is the figure file extension; plot_main reads it as a global.
figsuplix = 'pdf'
main_path = "../../data"
# NOTE: the path is relative, so running this cell again changes directory
# again, starting from the new working directory.
os.chdir(main_path)

data_dir = "./DSB/pie and length/%s"
save_dir = "./%s/DSB/Fig34-pie and length" % (figsuplix)
for cell_line in ['Jurkat', 'DNTTKO']:
    data_label = "%s_SW" % (cell_line)
    read_stat_data_path = (data_dir % (data_label)
                           + "/for_DNTTKO/statistics_DSB_repair_profile_SW_data.log_comm")
    # Output folder suffix: _DNTTKO for DNTTKO / DNTTOE, _WT for everything else.
    if cell_line in ['DNTTKO', 'DNTTOE']:
        savefig_dir = save_dir + "/%s_DNTTKO" % (cell_line)
    else:
        savefig_dir = save_dir + "/%s_WT" % (cell_line)
    mkdir(savefig_dir)
    stat_data = pd.read_csv(read_stat_data_path, sep='\t')
    plot_main(stat_data, savefig_dir)

./pdf/DSB/Fig34-pie and length/Jurkat_WT 创建成功
./pdf/DSB/Fig34-pie and length/Jurkat_WT 目录已存在
./pdf/DSB/Fig34-pie and length/Jurkat_WT 目录已存在
./pdf/DSB/Fig34-pie and length/DNTTKO_DNTTKO 创建成功
./pdf/DSB/Fig34-pie and length/DNTTKO_DNTTKO 目录已存在
./pdf/DSB/Fig34-pie and length/DNTTKO_DNTTKO 目录已存在
