In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import json

import util_functions
from tqdm import tqdm

In [2]:
# Load the pipeline configuration (output folder / file names are read from it later).
json_fp = "./config.json"
# JSON is UTF-8 by specification; pass the encoding explicitly so the result
# does not depend on the platform's default locale encoding.
with open(json_fp, 'r', encoding="utf-8") as fp:
    config = json.load(fp)

In [3]:
# Directory that holds the sgRNA reference table and the control-gene lists.
# NOTE(review): this is an absolute cluster path; consider moving it into config.json.
REF_DIR = "/project/GCRB/Hon_lab/s223695/Data_project/Perturb_seq_edist_pipeline/ref"

# All three reference files are parsed as tab-separated (including the ".csv" one).
gRNA_ref_df = pd.read_csv(
    os.path.join(REF_DIR, "Hon_sgRNA_index_dacc_annot_reference.csv"), sep="\t")

# The control tables use their first column as the index; the index values
# are the names collected into the lists below.
neg_control_df = pd.read_csv(
    os.path.join(REF_DIR, "negative_controls.tsv"), sep="\t", index_col=0)
non_target_df = pd.read_csv(
    os.path.join(REF_DIR, "non_targeting.tsv"), sep="\t", index_col=0)

neg_control_name = neg_control_df.index.tolist()
# Positive-control genes are listed by hand rather than read from a file.
pos_control_name = ["CD81","CD151","CD55","CD29","B2M","AARS","POLR1D","DNAJC19","MALAT1","NGFRP1","TFRC"]
non_target_name = non_target_df.index.tolist()

def detect_source(target_gRNA_name):
    """Label a gRNA by the category of the gene its identifier refers to.

    The gene name is obtained via ``util_functions.extract_gene_name`` and
    checked, in this order, against the negative-control list, the
    positive-control list, and the literal ``"non-targeting"``.

    Parameters
    ----------
    target_gRNA_name
        gRNA identifier passed straight to ``util_functions.extract_gene_name``.

    Returns
    -------
    str
        One of ``"neg_control"``, ``"pos_control"``, ``"non-targeting"`` or
        ``"target"`` (the fallback when no other category matches).
    """
    gene = util_functions.extract_gene_name(target_gRNA_name)

    # Negative controls are tested first, so they win if a gene is in both lists.
    if gene in neg_control_name:
        return "neg_control"
    if gene in pos_control_name:
        return "pos_control"
    return "non-targeting" if gene == "non-targeting" else "target"

In [4]:
# Add the derived annotation columns: transcript name and source category are
# both computed from the protospacer ID; the gene name is copied from the
# intended-target column.
protospacer_ids = gRNA_ref_df["protospacer_ID"]
gRNA_ref_df = gRNA_ref_df.assign(
    target_transcript_name=protospacer_ids.apply(util_functions.extract_transcript_name),
    source=protospacer_ids.apply(detect_source),
    target_gene_name=gRNA_ref_df["intended_target_name"].copy(),
)

In [5]:
# Tally how many guides fall into each source category.
source_labels, source_counts = np.unique(gRNA_ref_df["source"], return_counts=True)
print((source_labels, source_counts))

(array(['neg_control', 'non-targeting', 'pos_control', 'target'],
      dtype=object), array([  598,   600,    18, 13142]))


In [6]:
# Preview the first five rows, including the newly derived columns.
gRNA_ref_df.head(n=5)

Unnamed: 0,protospacer_ID,protospacer,intended_target_name,type,genomic_element,reverse_compliment,target_transcript_name,source,target_gene_name
0,DNAJC19_ B,GGGAACTCCTGTAAGGTCAG,DNAJC19,targeting,promoter,CTGACCTTACAGGAGTTCCC,DNAJC19,pos_control,DNAJC19
1,POLR1D_ B,GGGAAGCAAGGACCGACCGA,POLR1D,targeting,promoter,TCGGTCGGTCCTTGCTTCCC,POLR1D,pos_control,POLR1D
2,OR5K2-2,GAAAAAATTGTAGAGGAATA,OR5K2,targeting,promoter,TATTCCTCTACAATTTTTTC,OR5K2,neg_control,OR5K2
3,SP1_+_53773993.23-P1P2-1,GAAAAACGCGGACGCTGACG,SP1,targeting,promoter,CGTCAGCGTCCGCGTTTTTC,SP1:P1P2,target,SP1
4,SP8_-_20826141.23-P1P2-2,GAAAAAGATCCTCTGAGAGG,SP8,targeting,promoter,CCTCTCAGAGGATCTTTTTC,SP8:P1P2,target,SP8


In [7]:
# Columns kept in the exported annotation table, in output order.
output_columns = [
    "protospacer_ID",
    "target_transcript_name",
    "target_gene_name",
    "source",
    "protospacer",
    "reverse_compliment",
]
gRNA_ref_df_output = gRNA_ref_df[output_columns]

In [8]:
# Display the selected annotation columns for a visual check before they are written to disk.
gRNA_ref_df_output

Unnamed: 0,protospacer_ID,target_transcript_name,target_gene_name,source,protospacer,reverse_compliment
0,DNAJC19_ B,DNAJC19,DNAJC19,pos_control,GGGAACTCCTGTAAGGTCAG,CTGACCTTACAGGAGTTCCC
1,POLR1D_ B,POLR1D,POLR1D,pos_control,GGGAAGCAAGGACCGACCGA,TCGGTCGGTCCTTGCTTCCC
2,OR5K2-2,OR5K2,OR5K2,neg_control,GAAAAAATTGTAGAGGAATA,TATTCCTCTACAATTTTTTC
3,SP1_+_53773993.23-P1P2-1,SP1:P1P2,SP1,target,GAAAAACGCGGACGCTGACG,CGTCAGCGTCCGCGTTTTTC
4,SP8_-_20826141.23-P1P2-2,SP8:P1P2,SP8,target,GAAAAAGATCCTCTGAGAGG,CCTCTCAGAGGATCTTTTTC
...,...,...,...,...,...,...
14353,ZNF532_-_56532303.23-P1-2,ZNF532:P1,ZNF532,target,GTTTTGGCTGCCATGAAGGG,CCCTTCATGGCAGCCAAAAC
14354,ZNF829_-_37406927.23-P1P2-2,ZNF829:P1P2,ZNF829,target,GTTTTGGTCCCCAGGAGAAC,GTTCTCCTGGGGACCAAAAC
14355,NANOG_+_7942459.23-P1P2-2,NANOG:P1P2,NANOG,target,GTTTTTCCATTATAACTTGG,CCAAGTTATAATGGAAAAAC
14356,OR8B3-5,OR8B3,OR8B3,neg_control,GTTTTTGTCTTCAAAAATCT,AGATTTTTGAAGACAAAAAC


In [9]:
# Resolve the destination from the config and write the annotation table.
output_names = config["output_file_name_list"]
output_folder = output_names["OUTPUT_FOLDER"]

# to_csv does not create missing directories; make sure the folder exists first.
# An empty folder string means "current directory", which needs no creation.
if output_folder:
    os.makedirs(output_folder, exist_ok=True)

# Default to_csv settings are kept (comma-separated, row index written as the first column).
gRNA_ref_df_output.to_csv(os.path.join(output_folder, output_names["annotation_file"]))