### Selecting gRNAs for off-target library

In [1]:
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
#################################################################################
def is_Exist_file(path):
    """Delete the file at ``path`` if it exists; otherwise do nothing.

    Returns ``None`` in both cases. Any error raised by ``os.remove``
    itself is not caught here and propagates to the caller.
    """
    import os
    if not os.path.exists(path):
        return
    os.remove(path)


def mkdir(path):
    """Create the directory ``path`` (including parents) unless it exists.

    Leading/trailing whitespace and trailing backslashes are stripped from
    ``path`` before it is used. A one-line status message is printed in
    either case; nothing is returned.
    """
    import os
    target = path.strip().rstrip("\\")
    if os.path.exists(target):
        # Already present: report it and leave the filesystem untouched.
        print(target + ' 目录已存在')
        return
    # Not there yet: create it (and any missing parents), then report success.
    os.makedirs(target)
    print(target + ' 创建成功')
#################################################################################

In [3]:
#################################################################################
def _sample_guides_per_suffix(frame, quota, rng, exclude=frozenset()):
    """Return up to ``quota`` sgRNA names for each ``Mseq`` group of ``frame``.

    Groups whose suffix is in ``exclude`` are skipped. Within a group, guides
    whose name does not contain ``'noncoding'`` are preferred; the whole group
    is used only when there are fewer than ``quota`` such guides.
    """
    picked = []
    # sort=False keeps groups in order of first appearance, so a seeded run
    # always samples in the same order.
    for suffix, group in frame.groupby('Mseq', sort=False):
        if suffix in exclude:
            continue
        names = group['sgRNA_name']
        coding = names[~names.str.contains('noncoding', regex=False, na=False)]
        pool = (coding if len(coding) >= quota else names).tolist()
        # Take the whole pool when it is no larger than the quota; sample otherwise.
        picked += pool if len(pool) <= quota else rng.sample(pool, quota)
    return picked


def pick_sgRNAs_for_offTarget(file_path, eff_cutoff=0.35, barn_cutoff=20, pick_method='based_seq', seq_n=5,
                              library_size=1024, random_n=1000, seed=None):
    """Pick the sgRNA set for the off-target library from editing-efficiency data.

    The tab-separated table at ``file_path`` is split into a "high" set (both
    ``Jeff_BarW`` and ``Keff_BarW`` >= ``eff_cutoff`` and all four barcode
    counts ``JBar_n1``/``JBar_n2``/``KBar_n1``/``KBar_n2`` >= ``barn_cutoff``)
    and a "low" set holding every other row.

    Args:
        file_path: Path of the tab-separated input table. It must provide the
            columns named above plus ``sgRNA_name`` and ``gRNATarget``.
        eff_cutoff: Minimum editing efficiency for the high set.
        barn_cutoff: Minimum barcode count for the high set.
        pick_method: ``'based_seq'`` picks per suffix (the last ``seq_n``
            characters of ``gRNATarget``): first from the high set, then from
            the low set for suffixes the high set does not cover. Any other
            value picks at random from the high set.
        seq_n: Suffix length used by ``'based_seq'``.
        library_size: Target library size; the per-suffix quota is
            ``int(library_size / 4 ** seq_n)``. NOTE: the quota is 0 when
            ``4 ** seq_n > library_size``, in which case nothing is picked.
        random_n: Number of guides drawn in random mode, capped at the number
            of high-set guides available.
        seed: Optional seed for a private random generator. ``None`` uses the
            global ``random`` module state.

    Returns:
        The rows of the input table whose ``sgRNA_name`` was picked, with a
        fresh 0..n-1 index.
    """
    import random

    data = pd.read_csv(file_path, sep='\t')
    rng = random if seed is None else random.Random(seed)

    barcode_cols = ['JBar_n1', 'JBar_n2', 'KBar_n1', 'KBar_n2']
    is_high = (
        (data['Jeff_BarW'] >= eff_cutoff)
        & (data['Keff_BarW'] >= eff_cutoff)
        & (data[barcode_cols] >= barn_cutoff).all(axis=1)
    )
    # Copies, so adding the helper column below never writes into `data`.
    high = data.loc[is_high].copy()
    low = data.loc[~is_high].copy()

    if pick_method == 'based_seq':
        high['Mseq'] = high['gRNATarget'].str[-seq_n:]
        low['Mseq'] = low['gRNATarget'].str[-seq_n:]
        quota = int(library_size / (4 ** seq_n))
        # High-efficiency guides first ...
        pick_sgRNA_list = _sample_guides_per_suffix(high, quota, rng)
        # ... then fill suffixes that only occur among the remaining guides.
        covered = set(high['Mseq'].dropna())
        pick_sgRNA_list += _sample_guides_per_suffix(low, quota, rng, exclude=covered)
    else:
        # Random mode: never ask for more guides than are available.
        candidates = high['sgRNA_name'].tolist()
        pick_sgRNA_list = rng.sample(candidates, min(random_n, len(candidates)))

    print('Pick sgRNA number:', len(pick_sgRNA_list))
    pick_data = data.loc[data['sgRNA_name'].isin(pick_sgRNA_list), :]
    return pick_data.reset_index(drop=True)
#################################################################################

In [4]:
## statistics 1 -- nucleotide balanced for each position
def plot_heatmap(pick_data, save_path1):
    """Plot a heatmap of per-position nucleotide frequencies and save it.

    For each of the first 20 positions of ``pick_data['gRNATarget']`` the
    fraction of rows carrying A/G/C/T is computed. Columns are labelled
    20 down to 1, so string index 0 is labelled 20. A nucleotide that never
    occurs at a position counts as frequency 0. ``pick_data`` itself is not
    modified.

    Args:
        pick_data: DataFrame with a ``gRNATarget`` string column.
        save_path1: Output path handed to ``plt.savefig`` for the heatmap.

    Raises:
        ValueError: If ``pick_data`` has no rows (frequencies are undefined).
    """
    print('\nstatistics 1 -- nucleotide balanced for each position')
    ## Get plot data
    nucle_sort = ['A', 'G', 'C', 'T']
    # All 16 dinucleotides, in the same nested order as nucle_sort.
    index_name = [first + second for first in nucle_sort for second in nucle_sort]
    guide_len = 20
    sum_n = len(pick_data)
    if sum_n == 0:
        raise ValueError('pick_data is empty; nothing to plot')

    # Work on the column directly instead of writing scratch columns into
    # the caller's DataFrame.
    targets = pick_data['gRNATarget']
    plot_data = {}
    plot_data_2 = {}
    for i in range(guide_len):
        counts = targets.str[i].value_counts()
        # .get(..., 0): a nucleotide absent at this position has frequency 0.
        plot_data[guide_len - i] = [counts.get(nucle, 0) / sum_n for nucle in nucle_sort]

        ## dinucleotide (there is no pair starting at the last position)
        if i < guide_len - 1:
            pair_counts = targets.str[i:i + 2].value_counts()
            plot_data_2[guide_len - 1 - i] = [pair_counts.get(pair, 0) / sum_n for pair in index_name]
    plot_df = pd.DataFrame(plot_data, index=nucle_sort)
    # NOTE(review): the dinucleotide table is built but not plotted in the
    # code below; kept in case later code relies on it.
    plot_df_2 = pd.DataFrame(plot_data_2, index=index_name)

    ## plot nucleotide
    import seaborn as sns
    import matplotlib.pyplot as plt
    sns.set()
    f, ax = plt.subplots(figsize=(12, 2))
    sns.heatmap(plot_df, fmt=".2g", cmap='YlGnBu', ax=ax, vmin=0, vmax=0.5)
    # Keep the tick labels horizontal on both axes.
    label_y = ax.get_yticklabels()
    plt.setp(label_y, rotation=0, horizontalalignment='right')
    label_x = ax.get_xticklabels()
    plt.setp(label_x, rotation=0, horizontalalignment='right')
    plt.savefig(save_path1, dpi=300, bbox_inches = 'tight')
    plt.show()
    print('---------------------------------------------------')