### For off-target insertion

In [1]:
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

figsuplix = 'pdf'

In [2]:
def is_Exist_file(path):
    """Delete ``path`` if it currently exists; otherwise do nothing.

    :param path: filesystem path to remove
    :return: None
    """
    import os
    if not os.path.exists(path):
        return
    os.remove(path)


def mkdir(path):
    """Create directory ``path`` (including parents) unless it already exists.

    Leading/trailing whitespace and trailing backslashes are removed from
    ``path`` before use. A status line is printed in both cases.

    :param path: directory path to create
    :return: None
    """
    import os
    target = path.strip().rstrip("\\")
    if os.path.exists(target):
        # Nothing to create; just report that the path is already there.
        print(target + ' 目录已存在')
        return
    os.makedirs(target)
    print(target + ' 创建成功')

In [3]:
## mutation 坐标转换
#####################################################################################
## 反向互补
def reverseComplement(seq):
    """
    Return the reverse complement of a nucleotide sequence.

    Case is preserved. ``N``/``n`` map to themselves (lowercase ``n`` was
    previously missing from the table and raised ``KeyError``).

    :param seq: iterable of nucleotide characters (A/T/C/G/N, either case)
    :return: revComSeq, the complemented characters in reverse order
    :raises KeyError: if ``seq`` contains any other character
    """
    complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C',
                  'a': 't', 't': 'a', 'c': 'g', 'g': 'c',
                  'N': 'N', 'n': 'n'}
    # Complement in forward order, then reverse once: linear instead of
    # repeatedly prepending to a growing string.
    complemented = [complement[base] for base in seq]
    return ''.join(reversed(complemented))


## 转换核心函数
def transformer_core(mut, index_dict):
    """Re-express one mutation token in the coordinates given by ``index_dict``.

    Token shapes handled (``L`` is the leading region label character):

    * substitution ``L<start>:<end>M_<before>:<after>``: both coordinates are
      mapped and both nucleotide strings are reverse-complemented;
    * insertion ``L<site>I_<bases>``: the site is mapped and the inserted
      bases are reverse-complemented;
    * deletion ``L<start>:<end>D``: both coordinates are mapped;
    * anything else (e.g. ``X0``) is returned unchanged.

    :param mut: a single mutation token
    :param index_dict: mapping from old integer coordinate to new coordinate
    :return: the converted token as a string
    :raises KeyError: if a coordinate is not a key of ``index_dict``
    """
    if 'M' in mut:
        inf = int(mut.split(':')[0][1:])
        # Read the end coordinate up to the 'M' marker rather than taking a
        # fixed 3-character slice, so coordinates of any width parse.
        sup = int(mut.split(':')[1].split('M')[0])
        before, after = mut.split('_')[1].split(':')[0], mut.split('_')[1].split(':')[1]
        new_inf, new_sup = index_dict[inf], index_dict[sup]
        new_before, new_after = reverseComplement(before), reverseComplement(after)
        return '%s%s:%sM_%s:%s' % (mut[0], new_inf, new_sup, new_before, new_after)
    if 'I' in mut:
        # "<label><site>I" precedes the underscore; drop label and 'I'.
        new_site = index_dict[int(mut.split('_')[0][1:-1])]
        new_bases = reverseComplement(mut.split('_')[-1])
        return '%s%sI_%s' % (mut[0], new_site, new_bases)
    if 'D' in mut:
        inf, sup = int(mut.split(':')[0][1:]), int(mut.split(':')[-1][:-1])
        return '%s%s:%sD' % (mut[0], index_dict[inf], index_dict[sup])
    return mut


## 解析 all_mutation & target_mutation 信息
def parse_mutation_information(mutation):
    """Group the tokens of a mutation string by their region label.

    The string is split on ``+`` and ``.``; each token is filed under its
    first character, which must be ``X``, ``Y`` or ``Z``.

    Example::

        parse_mutation_information('X0+Y179:180D.Y186:186M_C:A+Z196I_T')
        # {'X': ['X0'], 'Y': ['Y179:180D', 'Y186:186M_C:A'], 'Z': ['Z196I_T']}

    :param mutation: mutation string such as ``'X0+Y0+Z0'``
    :return: dict with keys ``'X'``, ``'Y'``, ``'Z'`` mapping to token lists
    :raises KeyError: if a token starts with any other character
    :raises IndexError: if a token is empty
    """
    import re
    # Raw string: '\+' and '\.' are invalid escapes in a normal literal.
    mut_list = re.split(r'\+|\.', mutation)
    mut_dict = {'X': [], 'Y': [], 'Z': []}
    for mut in mut_list:
        mut_dict[mut[0]].append(mut)
    return mut_dict


## 调整突变坐标的参考系：以 gRNASeq 为基
def adjust_off_target_mutaion_reference_system(mutation):
    """Rewrite every mutation in ``mutation`` using the fixed coordinate table below.

    Coordinates are translated as 125..144 -> -19..0, 145..207 -> 1..63 and
    208..227 -> 64..83. Tokens are grouped by region label, converted with
    ``transformer_core``, sorted as plain strings within each region, joined
    with ``.``, and the three regions are joined with ``+`` in X, Y, Z order.

    Example inputs::

        'X164I_T+Y178I_C+Z0'
        'X164:164D+Y187I_G+Z199:199M_C:T'
        'X0+Y0+Z0'

    :param mutation: mutation string with ``+``-separated X/Y/Z parts
    :return: the converted mutation string
    """
    # (first old coordinate, first new coordinate, number of positions)
    segments = ((125, -19, 20), (145, 1, 63), (208, 64, 20))
    index_dict = {}
    for old_start, new_start, span in segments:
        for step in range(span):
            index_dict[old_start + step] = new_start + step

    grouped = parse_mutation_information(mutation)
    converted = {'X': '', 'Y': '', 'Z': ''}
    for label, tokens in grouped.items():
        rewritten = sorted(transformer_core(token, index_dict) for token in tokens)
        converted[label] = '.'.join(rewritten)
    return '+'.join(converted[label] for label in ('X', 'Y', 'Z'))
#####################################################################################

In [4]:
## up, core, down 区域内突变核苷酸总长度
def region_mutation_length(mutation):
    """Sum the mutated-nucleotide length over all tokens of one region.

    ``mutation`` is split on ``.``. Per token: a substitution (``M``) adds the
    length of the text after its last ``:``; an insertion (``I``) adds the
    length of the text after its last ``_``; a deletion (``D``) adds
    ``end - start + 1``; any other token adds nothing.

    :param mutation: ``.``-joined mutation tokens of a single region
    :return: total length as an int
    """
    total = 0
    for token in mutation.split("."):
        if 'M' in token:
            total += len(token.split(":")[-1])
            continue
        if 'I' in token:
            total += len(token.split("_")[-1])
            continue
        if 'D' in token:
            pieces = token.split(':')
            start = int(pieces[0][1:])    # drop the region label
            end = int(pieces[-1][:-1])    # drop the trailing 'D'
            total += end - start + 1
    return total


def mutation_region_splitting(data):
    """Annotate ``data`` with per-region mutation columns and split it by pattern.

    Columns added to ``data`` in place, in this order: ``new_mutation``
    (``all_mutation`` after coordinate conversion), ``up_mutation``,
    ``core_mutation``, ``down_mutation`` (the three ``+``-separated parts),
    ``*_mut_num`` (number of ``.``-separated tokens, 0 for the ``X0``/``Y0``/
    ``Z0`` placeholder) and ``*_mut_length`` (via ``region_mutation_length``).

    Rows are then split into eight subsets by which regions have at least one
    mutation. Counts and count ratios are printed; for an empty table the
    ratios are reported as 0.0 instead of raising ``ZeroDivisionError``.

    :param data: DataFrame with an ``all_mutation`` column (modified in place)
    :return: ``(data_000, data_100, data_010, data_001, data_110, data_101,
        data_011, data_111, count_dict)``; the digits flag up/core/down as
        mutated (1) or not (0), and ``count_dict`` maps pattern name to row count
    """
    ## step 1: convert coordinates, then split into up / core / down parts
    data['new_mutation'] = data['all_mutation'].apply(adjust_off_target_mutaion_reference_system)
    regions = (('up', 0, 'X0'), ('core', 1, 'Y0'), ('down', 2, 'Z0'))
    ## mutation string of each region
    for region, part, _ in regions:
        data['%s_mutation' % region] = data['new_mutation'].apply(
            lambda x, part=part: x.split('+')[part])
    ## number of separate mutations in each region
    for region, _, placeholder in regions:
        data['%s_mut_num' % region] = data['%s_mutation' % region].apply(
            lambda x, placeholder=placeholder: len(x.split('.')) if x != placeholder else 0)
    ## total mutated-nucleotide length in each region
    for region, _, _ in regions:
        data['%s_mut_length' % region] = data['%s_mutation' % region].apply(region_mutation_length)
    ############################################################
    print("\nFirst: 统计 X0+Y0+Z0、XA+Y0+Z0、X0+YA+Z0、X0+Y0+ZA、XA+YA+Z0、XA+Y0+ZA、X0+YA+ZA、XA+YA+ZA 的类型占比")
    has_up = data['up_mut_num'] != 0
    has_core = data['core_mut_num'] != 0
    has_down = data['down_mut_num'] != 0
    # (pattern name, up mutated?, core mutated?, down mutated?) in return order
    patterns = (
        ('X0+Y0+Z0', False, False, False),
        ('XA+Y0+Z0', True, False, False),
        ('X0+YA+Z0', False, True, False),
        ('X0+Y0+ZA', False, False, True),
        ('XA+YA+Z0', True, True, False),
        ('XA+Y0+ZA', True, False, True),
        ('X0+YA+ZA', False, True, True),
        ('XA+YA+ZA', True, True, True),
    )
    subsets = []
    count_dict = {}
    for name, up, core, down in patterns:
        mask = (has_up == up) & (has_core == core) & (has_down == down)
        subset = data.loc[mask, :]
        subsets.append(subset)
        count_dict[name] = subset.shape[0]
    for name, _, _, _ in patterns:
        print(name + ":", count_dict[name])
    count_sum = data.shape[0]
    # Guard the empty-table case so the ratio report cannot divide by zero.
    ratios = [round(count_dict[name] / count_sum, 3) if count_sum else 0.0
              for name, _, _, _ in patterns]
    print("Distribution of count ratio:", *ratios)
    return tuple(subsets) + (count_dict,)

###### read data

In [5]:
## read 
def get_raw_inser_data(read_dir, mut_type, reads_num, barcode_num):
    """Load, filter and region-split the off-target table of one mutation type.

    Reads ``<read_dir>/off-target.<mut_type>.accurate.offseq`` as a
    tab-separated file, keeps rows whose ``reads_num`` and ``barcode_num``
    reach the given minimums, renumbers the index, prints the column names
    and shape, and hands the table to ``mutation_region_splitting``.

    :param read_dir: directory containing the ``.offseq`` file
    :param mut_type: mutation type used in the file name
    :param reads_num: minimum ``reads_num`` to keep a row
    :param barcode_num: minimum ``barcode_num`` to keep a row
    :return: the filtered table followed by the eight subsets from
        ``mutation_region_splitting`` (the count dict is discarded)
    """
    table = pd.read_csv(read_dir + '/off-target.%s.accurate.offseq'%(mut_type), sep='\t')
    passes_filter = (table['reads_num'] >= reads_num) & (table['barcode_num'] >= barcode_num)
    table = table.loc[passes_filter, :].reset_index(drop=True)
    print('columns:', table.columns.tolist())
    print(table.shape)
    ## split by whether the up / core / down regions carry mutations
    (sub_000, sub_100, sub_010, sub_001,
     sub_110, sub_101, sub_011, sub_111, _counts) = mutation_region_splitting(table)
    return (table, sub_000, sub_100, sub_010, sub_001, sub_110, sub_101, sub_011, sub_111)
    

## read on target 
def get_on_target_data(read_dir, reads_num=200, barcode_num=10):
    """Load, filter and region-split the ``perfect`` table.

    Reads ``<read_dir>/off-target.perfect.accurate.offseq`` as a tab-separated
    file, keeps rows meeting the ``reads_num`` / ``barcode_num`` minimums,
    renumbers the index, prints the column names and shape, and hands the
    table to ``mutation_region_splitting``.

    :param read_dir: directory containing the ``.offseq`` file
    :param reads_num: minimum ``reads_num`` to keep a row (default 200)
    :param barcode_num: minimum ``barcode_num`` to keep a row (default 10)
    :return: the filtered table followed by the eight subsets from
        ``mutation_region_splitting`` (the count dict is discarded)
    """
    source = read_dir + '/off-target.perfect.accurate.offseq'
    perfect = pd.read_csv(source, sep='\t')
    keep = (perfect['reads_num'] >= reads_num) & (perfect['barcode_num'] >= barcode_num)
    perfect = perfect.loc[keep, :]
    perfect = perfect.reset_index(drop=True)
    print('columns:', perfect.columns.tolist())
    print(perfect.shape)
    ## split by whether the up / core / down regions carry mutations
    p_000, p_100, p_010, p_001, p_110, p_101, p_011, p_111, _counts = mutation_region_splitting(perfect)
    return (perfect, p_000, p_100, p_010, p_001, p_110, p_101, p_011, p_111)


## For insertion & perfect common
## 确定 insertion id 
def obtain_inser_data(inser, data, inser_110, inser_111, data_100, data_101, save_dir):
    """Pair insertion rows with perfect rows sharing sgRNA and flanking mutations.

    Identifiers are ``(sgRNA_name, up_mutation, down_mutation)`` triples.
    Unique triples of ``inser_110`` are intersected with those of ``data_100``,
    and those of ``inser_111`` with ``data_101``. The two intersections are
    stacked, joined to ``data`` to pick up its read/barcode counts and
    efficiency (renamed ``p_reads_num``, ``p_barcode_num``, ``on-target_eff``),
    and finally joined to ``inser``. The result is written to
    ``<save_dir>/off-target insertion data.xlsx`` and returned.

    :param inser: full insertion table
    :param data: full perfect table
    :param inser_110: insertion rows with pattern XA+YA+Z0
    :param inser_111: insertion rows with pattern XA+YA+ZA
    :param data_100: perfect rows with pattern XA+Y0+Z0
    :param data_101: perfect rows with pattern XA+Y0+ZA
    :param save_dir: output directory (created if missing)
    :return: the merged insertion DataFrame
    """
    key_cols = ['sgRNA_name', 'up_mutation', 'down_mutation']

    def unique_ids(frame):
        # Distinct identifier triples of a table.
        return frame[key_cols].drop_duplicates()

    ## 1. insertion XA+YA+Z0
    print("\n1、insertion XA+YA+Z0")
    inser_ids_110 = unique_ids(inser_110)
    perfect_ids_100 = unique_ids(data_100)
    shared_110 = inser_ids_110.merge(perfect_ids_100, how='inner', on=key_cols)
    print("XA+YA+Z0 insertion id:", inser_ids_110.shape)
    print("XA+Y0+Z0 perfect id:", perfect_ids_100.shape)
    print("common XA+Y0+Z0 id perfeat id:", shared_110.shape)

    ## 2. insertion XA+YA+ZA
    print("\n2、insertion XA+YA+ZA")
    inser_ids_111 = unique_ids(inser_111)
    perfect_ids_101 = unique_ids(data_101)
    shared_111 = inser_ids_111.merge(perfect_ids_101, how='inner', on=key_cols)
    print("XA+YA+ZA insertion id:", inser_ids_111.shape)
    print("XA+Y0+ZA perfect id:", perfect_ids_101.shape)
    print("common XA+YA+ZA insertion id:", shared_111.shape)

    ## merging
    print('\nmerging')
    shared = pd.concat([shared_110, shared_111], axis=0)
    ## perfect-side measurements for the shared identifiers
    perfect_cols = key_cols + ['reads_num', 'barcode_num', 'off-target_eff']
    on_id = data[perfect_cols].merge(shared, how='inner', on=key_cols)
    on_id = on_id.rename(columns={'reads_num': 'p_reads_num',
                                  'barcode_num': 'p_barcode_num',
                                  'off-target_eff': 'on-target_eff'})
    print("perfect on_id:", on_id.shape)
    inser_id = inser.merge(on_id, how='inner', on=key_cols)
    print("inser_id.shape:", inser_id.shape)
    ## to save
    mkdir(save_dir)
    inser_id.to_excel(save_dir + '/off-target insertion data.xlsx', index=False)
    return inser_id

#### 1. The distribution of insertion position

In [6]:
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

figsuplix = 'pdf'

In [7]:
def is_Exist_file(path):
    """Remove the file at ``path`` when present; a missing path is ignored.

    :param path: filesystem path to remove
    :return: None
    """
    import os
    already_there = os.path.exists(path)
    if already_there:
        os.remove(path)


def mkdir(path):
    """Ensure directory ``path`` exists, creating parents as needed.

    ``path`` is first stripped of surrounding whitespace and of trailing
    backslashes. One status line is printed: created, or already existing.

    :param path: directory path to create
    :return: None
    """
    import os
    cleaned = path.strip()            # drop leading/trailing whitespace
    cleaned = cleaned.rstrip("\\")    # drop trailing backslashes
    if os.path.exists(cleaned):
        message = cleaned + ' 目录已存在'
    else:
        os.makedirs(cleaned)
        message = cleaned + ' 创建成功'
    print(message)

In [8]:
## for insertion XA+YA+Z0 analysis
def insertion_position(mutation):
    """Return the insertion sites found in ``mutation`` as a stringified tuple.

    The string is split on ``+`` and ``.``. For every token containing ``I``
    the integer between the leading label character and the ``I`` is
    collected. The result is ``str(tuple(...))``, e.g. ``'(24,)'`` or ``'()'``.

    :param mutation: mutation string (one region or several joined by ``+``)
    :return: string representation of the tuple of insertion positions
    """
    import re
    # Raw string: '\+' and '\.' are invalid escapes in a normal literal.
    tokens = re.split(r'\+|\.', mutation)
    positions = [int(token.split("_")[0][1:-1]) for token in tokens if 'I' in token]
    return str(tuple(positions))


def insertion_nucleotide(mutation):
    """Concatenate the inserted nucleotides of every insertion in ``mutation``.

    The string is split on ``+`` and ``.``. For every token containing ``I``
    the text between its first and second ``_`` (the inserted bases) is
    appended, in token order.

    :param mutation: mutation string (one region or several joined by ``+``)
    :return: the inserted bases as one string (``''`` if there is no insertion)
    """
    import re
    # Raw string: '\+' and '\.' are invalid escapes in a normal literal.
    tokens = re.split(r'\+|\.', mutation)
    return ''.join(token.split("_")[1] for token in tokens if 'I' in token)


def helper_inser_nucle_count(inser_nucle_dict, nucle):
    """Look up the count stored for ``nucle``, treating a missing key as 0.

    :param inser_nucle_dict: dict mapping nucleotide -> count
    :param nucle: nucleotide key to look up
    :return: the stored count, or 0 if ``nucle`` is not a key
    """
    return inser_nucle_dict.get(nucle, 0)
    


## 分析 XA+YA+Z0 中 YA insertion 的长度均是 1
## 具体内容：
## 1、insertion length distribution
## 2、insertion position-count distribution
## 3、insertion nucleotide-count distribution
## 4、insertion position-nucleotide-count distribution 
def insertion_count_nucleotide_distribution(inser_id):
    """Count XA+YA+Z0 insertions per position and per inserted nucleotide.

    Rows with mutations in the up and core regions and none in the down
    region are selected. For each row the first insertion position of
    ``core_mutation`` and the concatenated inserted bases are tallied.
    Several summaries are printed along the way.

    :param inser_id: DataFrame with ``up_mut_num``, ``core_mut_num``,
        ``down_mut_num``, ``core_mut_length`` and ``core_mutation`` columns
    :return: DataFrame with one row per position (ascending) and columns
        ``position``, ``inser_count``, ``inser_nucles`` (dict nucleotide ->
        count) plus one count column per distinct inserted-nucleotide string
    :raises IndexError: if a selected row's ``core_mutation`` has no insertion
    """
    import ast

    is_up_core_only = ((inser_id['up_mut_num'] != 0)
                       & (inser_id['core_mut_num'] != 0)
                       & (inser_id['down_mut_num'] == 0))
    # Explicit copy: new columns are added below, which must not be done on a
    # slice of the caller's frame.
    inser1 = inser_id.loc[is_up_core_only, :].copy()
    print("insertion XA+YA+Z0 shape:", inser1.shape)
    print("inser1['core_mut_length'].value_counts():", inser1['core_mut_length'].value_counts())
    inser1['position'] = inser1['core_mutation'].apply(insertion_position)
    inser1['inser_nucleotide'] = inser1['core_mutation'].apply(insertion_nucleotide)
    print(len(list(inser1['position'].unique())))
    print("inser_nucleotide.value_counts():\n", inser1['inser_nucleotide'].value_counts())

    ## insertion position-count distribution
    ## insertion position-nucleotide-count distribution
    inser_length_dict = {}
    inser_nucle_dict = {}
    seen_nucles = set()
    for pos_repr, nucle in zip(inser1['position'], inser1['inser_nucleotide']):
        # 'position' holds a stringified tuple; parse it without eval().
        pos = ast.literal_eval(pos_repr)[0]
        inser_length_dict[pos] = inser_length_dict.get(pos, 0) + 1
        per_position = inser_nucle_dict.setdefault(pos, {})
        per_position[nucle] = per_position.get(nucle, 0) + 1
        seen_nucles.add(nucle)
    inser_nucle_list = sorted(seen_nucles)
    ## one row per position, one count column per inserted nucleotide
    all_pos = sorted(inser_length_dict)
    pos_df = pd.DataFrame({"position": all_pos})
    pos_df['inser_count'] = pos_df['position'].apply(lambda x: inser_length_dict[x])
    pos_df['inser_nucles'] = pos_df['position'].apply(lambda x: inser_nucle_dict[x])
    for nucle in inser_nucle_list:
        pos_df[nucle] = pos_df['inser_nucles'].apply(lambda x: helper_inser_nucle_count(x, nucle))
    return pos_df

In [9]:
## labels
def get_xticks(pos_df):
    """Build x-axis tick labels from the ``position`` column of ``pos_df``.

    Positions 41, 42 and 43 are labelled ``'N'``, ``'G'`` and ``'G'``; every
    other position is labelled with ``position - 20``.

    :param pos_df: DataFrame with a ``position`` column
    :return: list of labels, one per row, in row order
    """
    pam_labels = {41: 'N', 42: 'G', 43: 'G'}
    return [pam_labels[pos] if pos in pam_labels else pos - 20
            for pos in pos_df['position'].tolist()]


## 柱状图
def plot_single_bar(data_list, labels, title, save_dir, 
                    xlabel='Target+PAM Insertion Position', ylabel='Count'):
    """Draw a single-series bar chart, save it and show it.

    The figure is written to ``<save_dir>/<title>.<figsuplix>`` (``figsuplix``
    is the module-level file extension); ``save_dir`` is created if missing.

    :param data_list: bar heights, one per bar
    :param labels: x tick labels, one per bar
    :param title: plot title, also used as the output file name
    :param save_dir: output directory
    :param xlabel: x-axis label
    :param ylabel: y-axis label
    :return: None
    """
    import matplotlib.pyplot as plt
    # Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*"
    # and later dropped the old names; use whichever this install provides.
    for style_name in ("seaborn-v0_8-white", "seaborn-white"):
        if style_name in plt.style.available:
            plt.style.use(style_name)
            break
    fig, ax = plt.subplots(1, 1, figsize=(12, 4))

    plt.bar(range(len(data_list)), data_list, color='darkslateblue')
    ## hide the top and right axes lines
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ## xlabel, ylabel
    plt.xlabel(xlabel, fontsize=12, weight='bold')
    plt.ylabel(ylabel, fontsize=12, weight='bold')
    ## xticks
    plt.xticks(range(len(labels)), labels, fontsize=12, weight='bold')
    ## title
    plt.title(title, fontsize=12, weight='bold')
    ## save
    mkdir(save_dir)
    savefig_path = save_dir + '/%s.%s'%(title, figsuplix)
    plt.savefig(savefig_path, dpi=300, bbox_inches='tight')
    plt.show()
    

## insertion position nucleotide count distribution
def plot_bar(data, xcol, ycol, hue_col, title, save_dir, xticks, xlabel, ylabel, palette):
    """Draw a hue-grouped seaborn bar plot, save it and show it.

    The figure is written to ``<save_dir>/<title>.<figsuplix>`` (``figsuplix``
    is the module-level file extension); ``save_dir`` is created if missing.

    :param data: long-form DataFrame to plot
    :param xcol: column used for the x axis
    :param ycol: column used for the bar heights
    :param hue_col: column used to split bars by colour
    :param title: plot title, also used as the output file name
    :param save_dir: output directory
    :param xticks: x tick labels
    :param xlabel: x-axis label
    :param ylabel: y-axis label
    :param palette: colour mapping passed to seaborn
    :return: None
    """
    import matplotlib.pyplot as plt
    import seaborn as sns

    fig, ax = plt.subplots(1, 1, figsize=(16, 4))
    fig.subplots_adjust(wspace=5)
    sns.barplot(data=data, x=xcol, y=ycol, hue=hue_col, palette=palette)
    ## hide the top and right axes lines
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)
    bold_12 = dict(fontsize=12, weight='bold')
    plt.ylabel(ylabel, **bold_12)
    plt.xlabel(xlabel, **bold_12)
    ## xticks
    tick_positions = range(len(xticks))
    plt.xticks(tick_positions, xticks, fontsize=10, weight='bold')
    ## title and legend
    plt.title(title, **bold_12)
    plt.legend(title='insertion nucleotide')
    ## save
    mkdir(save_dir)
    plt.savefig(save_dir + '/%s.%s'%(title, figsuplix), dpi=300, bbox_inches='tight')
    plt.show()

#### Analysis of insertion off-target editing efficiency

In [10]:
## for XA+YA+Z0 
## position-insertion nucleotide: relative efficiency
def get_insertion_XA_YA_Z0_reltive_efficiency(inser1, on_eff_cutoff=0.05):
    """Collect relative efficiencies per insertion position and inserted nucleotide.

    Relative efficiency is ``off-target_eff / on-target_eff``, computed for
    rows whose ``on-target_eff`` is at least ``on_eff_cutoff``. Only rows with
    ``core_mut_num == 1`` contribute. The output has one row per observed
    value; (position, nucleotide) combinations without observations add no rows.

    Side effect (kept from the original): the columns ``position`` and
    ``inser_nucleotide`` are added to the caller's ``inser1``.

    :param inser1: DataFrame with ``core_mutation``, ``core_mut_num``,
        ``off-target_eff`` and ``on-target_eff`` columns
    :param on_eff_cutoff: minimum ``on-target_eff`` for a row to be used
    :return: DataFrame with columns ``reltv_eff``, ``position``,
        ``inser_nucleotide``; an empty frame with those columns when nothing
        qualifies (previously ``pd.concat`` raised on an empty list)
    """
    import ast

    ## get relative off-target efficiency
    inser1['position'] = inser1['core_mutation'].apply(insertion_position)
    inser1['inser_nucleotide'] = inser1['core_mutation'].apply(insertion_nucleotide)
    # Copy the filtered rows so the new column is not written onto a slice.
    kept = inser1.loc[inser1['on-target_eff'] >= on_eff_cutoff, :].copy()
    kept['reltv_eff'] = kept['off-target_eff'] / kept['on-target_eff']

    ## gather relative efficiencies per (position, nucleotide)
    pos_nucle_eff_dict = {}
    single = kept.loc[kept['core_mut_num'] == 1, :]
    for pos_repr, inser_nucles, reltv_eff in zip(single['position'],
                                                 single['inser_nucleotide'],
                                                 single['reltv_eff']):
        # 'position' holds a stringified tuple; parse it without eval().
        positions = ast.literal_eval(pos_repr)
        # NOTE(review): the i-th inserted base is paired with the i-th
        # position, so an insertion of more than one base at a single site
        # raises IndexError, as before. Confirm that cannot occur in the data.
        for i, nucle in enumerate(inser_nucles):
            pos = positions[i]
            pos_nucle_eff_dict.setdefault(pos, {}).setdefault(nucle, []).append(reltv_eff)

    ## DataFrame
    pos_list = sorted(pos_nucle_eff_dict)
    nucle_list = sorted({nucle for per_pos in pos_nucle_eff_dict.values() for nucle in per_pos})
    stat_df_list = []
    for pos in pos_list:
        for nucle in nucle_list:
            stat_df = pd.DataFrame({'reltv_eff': pos_nucle_eff_dict[pos].get(nucle, [])})
            stat_df['position'] = pos
            stat_df['inser_nucleotide'] = nucle
            stat_df_list.append(stat_df)
    if stat_df_list:
        stat_df = pd.concat(stat_df_list, axis=0)
        stat_df.reset_index(drop=True, inplace=True)
    else:
        stat_df = pd.DataFrame(columns=['reltv_eff', 'position', 'inser_nucleotide'])
    print("stat_df.shape:", stat_df.shape)
    return stat_df

In [11]:
## plot the effect of insertion position on editing efficiency
def plot_inser_reltv_eff_on_position(stat_df, save_dir):
    """Box plot with jittered points of relative efficiency per insertion position.

    Positions 24..43 are drawn in order and labelled 4..20 followed by
    ``N``, ``G``, ``G``. The figure is written to
    ``<save_dir>/<title>.<figsuplix>`` (``figsuplix`` is the module-level file
    extension); ``save_dir`` is created if missing.

    :param stat_df: DataFrame with ``position`` and ``reltv_eff`` columns
    :param save_dir: output directory
    :return: None
    """
    import matplotlib.pyplot as plt
    import seaborn as sns

    # Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*"
    # and later dropped the old names; use whichever this install provides.
    for style_name in ("seaborn-v0_8-white", "seaborn-white"):
        if style_name in plt.style.available:
            plt.style.use(style_name)
            break
    fig, ax = plt.subplots(1, 1, figsize=(6, 3))

    order = list(range(24, 44))
    ax = sns.boxplot(x='position', y='reltv_eff', data=stat_df, width=0.4, color='white', 
                     fliersize=0.5, linewidth=0.5, order=order)
    ## hide the top and right axes lines
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ## overlay the individual observations (boxplot with jitter)
    ax = sns.stripplot(x='position', y='reltv_eff', data=stat_df, color="orange", jitter=0.1, 
                       size=0.8, order=order)
    plt.ylabel("")
    plt.xlabel("")
    ## xticks: 17 numeric labels plus the three PAM labels = 20, one per position
    xticks = list(range(4, 21, 1)) + ['N', 'G', 'G']
    print(len(xticks))
    plt.xticks(range(20), xticks, fontsize=6, weight='bold')
    plt.yticks(fontsize=6, weight='bold')
    ## ylim
    # plt.ylim(0, 2.3)
    plt.xlim(-1, 20)
    title = 'Effect of off-target insertion position on gRNA acitvity'
    mkdir(save_dir)
    savefig_path = save_dir + '/%s.%s'%(title, figsuplix)
    plt.savefig(savefig_path, dpi=300, bbox_inches = 'tight')
    plt.show()


## plot: the effect of off-target insertion XA+YA+Z0 nucleotide type on editing efficiency
def plot_inser_reltv_eff_on_pos_nucle(stat_df, save_dir):
    """Box plot of relative efficiency per position, split by inserted nucleotide.

    Positions 24..43 are drawn in order and labelled 4..20 followed by
    ``N``, ``G``, ``G``; boxes are coloured by ``inser_nucleotide``. The figure
    is written to ``<save_dir>/<title>.<figsuplix>`` (``figsuplix`` is the
    module-level file extension); ``save_dir`` is created if missing.

    :param stat_df: DataFrame with ``position``, ``reltv_eff`` and
        ``inser_nucleotide`` columns
    :param save_dir: output directory
    :return: None
    """
    import matplotlib.pyplot as plt
    import seaborn as sns

    # Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*"
    # and later dropped the old names; use whichever this install provides.
    for style_name in ("seaborn-v0_8-white", "seaborn-white"):
        if style_name in plt.style.available:
            plt.style.use(style_name)
            break
    fig, ax = plt.subplots(1, 1, figsize=(6, 3))

    order = list(range(24, 44))
    ax = sns.boxplot(x='position', y='reltv_eff', hue='inser_nucleotide', data=stat_df, width=0.6,  
                     fliersize=0.5, linewidth=0.5, order=order)
    ## hide the top and right axes lines
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    plt.ylabel("")
    plt.xlabel("")
    ## xticks: 17 numeric labels plus the three PAM labels = 20, one per position
    xticks = list(range(4, 21, 1)) + ['N', 'G', 'G']
    print(len(xticks))
    plt.xticks(range(20), xticks, fontsize=6, weight='bold')
    plt.yticks(fontsize=6, weight='bold')
    ## xlim
    plt.xlim(-1, 20)
    title = 'Effect of off-target insertion nucleotide type on gRNA activity'
    mkdir(save_dir)
    savefig_path = save_dir + '/%s.%s'%(title, figsuplix)
    plt.savefig(savefig_path, dpi=300, bbox_inches = 'tight')
    plt.show()

### Summary

In [12]:
## plot insertion data 分布
def plot_distribution_insertion_data(pos_df, data_dir):
    """Plot three summaries of the per-position insertion counts.

    1. Insertion count per position (single bar chart).
    2. Insertion count per nucleotide A/C/G/T, summed over positions.
    3. Insertion count per position, split by nucleotide (grouped bar chart).

    A nucleotide with no column in ``pos_df`` is counted as 0 everywhere
    instead of raising ``KeyError``.

    :param pos_df: DataFrame with ``position``, ``inser_count`` and
        per-nucleotide count columns
    :param data_dir: directory the figures are saved to
    :return: None
    """
    nucleotides = ['A', 'C', 'G', 'T']
    # Missing nucleotide columns are added as all-zero so the plots below work
    # even when a base was never inserted.
    nucle_counts = pos_df.reindex(columns=nucleotides, fill_value=0)

    ## 1. plot insertion position distribution
    xticks = get_xticks(pos_df)
    data_list = pos_df['inser_count'].tolist()
    title = 'position distriburion of off-target insertion'
    ylabel = 'off-target id count'
    xlabel = 'Target+PAM Insertion Position'
    plot_single_bar(data_list, xticks, title, data_dir, xlabel, ylabel)

    ## 2. plot insertion nucleotide distribution without considering insertion position
    totals = nucle_counts.sum(axis=0)
    data_list = [totals[nucle] for nucle in nucleotides]
    title = 'nucleotide distribution of off-target insertion'
    ylabel = 'off-target id count'
    xlabel = 'Insertion Nucleotide'
    plot_single_bar(data_list, nucleotides, title, data_dir, xlabel, ylabel)

    ## 3. insertion position nucleotide count distribution
    xticks = get_xticks(pos_df)
    ## long-form table: one row per (position, nucleotide)
    pos_nucle_df_list = []
    for nucle in nucleotides:
        long_part = pd.DataFrame({'position': pos_df['position'],
                                  'count': nucle_counts[nucle]})
        long_part['nucle'] = nucle
        pos_nucle_df_list.append(long_part)
    pos_nucle_df = pd.concat(pos_nucle_df_list, axis=0)
    ## one colour per nucleotide
    palette = {'A': 'slateblue',
               'C': 'lightseagreen',
               'G': 'olive',
               'T': 'limegreen'}
    xcol = 'position'
    ycol = 'count'
    hue_col = 'nucle'
    xlabel = 'Target+PAM Insertion Position'
    ylabel = 'off-target id count'
    title = 'position insertion nucleotide distribution of off-target insertion'
    plot_bar(pos_nucle_df, xcol, ycol, hue_col, title, data_dir, xticks, xlabel, ylabel, palette)