In [1]:
#### IMPORT LIBS AND DEFINE FUNCTIONS ####
import os, sys, subprocess
import pandas as pd
import numpy as np
from Bio import SeqIO
import warnings
warnings.filterwarnings('ignore')

## FUNCTIONS FOR CAS-OFFINDER
def generate_casoff_input_file(ref_genome_fa_path, protein, pam, 
                               n_mismatch, grna_list, len_spacer = 23, n_bulges = 1):
    """
    Write a Cas-OFFinder input file for one editor/PAM combination.

    The file is created in the current working directory under the name
    "{protein}_{pam}_Input_File_v3.txt" and holds, one entry per line:
        1. ref_genome_fa_path
        2. the search pattern: `len_spacer` N's followed by `pam`, then
           `n_bulges` written twice (presumably the DNA- and RNA-bulge
           sizes -- confirm against the Cas-OFFinder version in use)
        3. one line per guide: the guide, len(pam) N's, and `n_mismatch`
    No trailing newline is written.

    Args:
        ref_genome_fa_path: Text written verbatim as the first line.
        protein: Label used only to build the file name.
        pam: PAM pattern appended to the N-spacer on the pattern line.
        n_mismatch: Value written after every guide.
        grna_list: Iterable of guide sequences (without PAM).
        len_spacer: Number of N's placed before the PAM on the pattern line.
        n_bulges: Value written twice after the pattern.

    Returns:
        The name of the file that was written.
    """
    pam_alias = "N" * len(pam)
    pattern_line = f"{'N' * len_spacer + pam} {n_bulges} {n_bulges}"
    guide_lines = [f"{grna}{pam_alias} {n_mismatch}" for grna in grna_list]
    input_file_name = f"{protein}_{pam}_Input_File_v3.txt"

    with open(input_file_name, "w") as f:
        # write(), not writelines(): the content is already one joined string.
        f.write("\n".join([ref_genome_fa_path, pattern_line] + guide_lines))

    return input_file_name

def generate_validation_str(row, trim_len):
    """
    - Creates string-representations for simpler interpretation of genomic hits
    - The last `trim_len` characters of row["crRNA"] and row["DNA"] are
      dropped (trim_len = 0 keeps both strings whole); the remaining
      positions are compared one by one and encoded as:
            - 1 = Identical characters
            - 0 = Mismatch (any difference that is not a '-')
            - D = DNA_bulge ('-' in the DNA string)
            - R = RNA_bulge ('-' in the crRNA string)
    - The result always has one symbol per compared position, so fixed
      offsets into it stay aligned with the hit.
    - NOTE(review): Cas-OFFinder's bulge output is understood to place the
      '-' in the crRNA for a *DNA* bulge and in the DNA for an *RNA* bulge,
      which would make the R/D labels here swapped -- confirm. Totals from
      count_mismatches are unaffected because it counts R and D alike.

    Args:
        row: Mapping (e.g. a DataFrame row) with "crRNA" and "DNA" strings.
        trim_len: Number of trailing characters to ignore in both strings.

    Returns:
        The encoded string described above.

    Raises:
        IndexError: If the trimmed crRNA is shorter than the trimmed DNA.
    """
    crRNA_str = row["crRNA"]
    dna_str = row["DNA"]
    if trim_len:
        # Guarded because s[:-0] is the empty string, not the whole string.
        crRNA_str = crRNA_str[:-trim_len]
        dna_str = dna_str[:-trim_len]

    out_str = []
    # Indexed loop (not zip) so a length mismatch fails loudly.
    for i in range(len(dna_str)):
        a = crRNA_str[i]
        b = dna_str[i]

        if a == b:
            out_str.append("1")
        elif a == '-':
            out_str.append("R")
        elif b == '-':
            out_str.append("D")
        else:
            # Every remaining difference is a mismatch. Emitting a symbol
            # here (instead of only when the DNA base is lowercase) keeps
            # positions from being silently dropped.
            out_str.append("0")
    return "".join(out_str)

def count_mismatches(s):
    """Return how many characters of `s` are '0', 'R' or 'D'."""
    flagged_symbols = ('0', "R", 'D')
    return sum(s.count(symbol) for symbol in flagged_symbols)


In [2]:
#### SETUP CAS-OFFINDER ####
# Directory the generated SLURM script reads input files from and writes
# output files to.
# NOTE(review): generate_casoff_input_file() writes into the current working
# directory, so the commands below only find their input files when this
# notebook is run from `pwd` -- confirm.
pwd = "/home/ubuntu/software/cas_offinder_v3/"
conda_env_name = "be_off"  # not referenced anywhere in this cell

## PARSE/FORMAT gRNA LIBRARY AND DETAILS
ref_genome_fa_path = "/home/ubuntu/software/cas_offinder_v3/GRCh38.p14.genome.fa"
# Editor label -> PAM pattern; one input/output file pair is produced per entry.
protein_pam_dict = {"wt_geo_paper_pam" : "NNNNCWAA",
                    "igeo" : "NNNNCNNN",
                    "wt_nme2cas9" : "NNNNCC",
                    "inme_prelim1":"NNNNCH"
                   }

# MAKE INPUT FILES
# The position of a guide in this list is relied on later, when results are
# labelled by guide index.
grnas = ["AGCATGATACATGGAATGAGGGA", 
         "ACTGCATAAGGGAGGAAGAACGC", 
         "CTCCTTCCTAGTCTCCTGATATT", 
         "ATATCAGGAGACTAGGAAGGAGG"]
for protein,pam in protein_pam_dict.items():
    generate_casoff_input_file(ref_genome_fa_path, protein, pam = pam, n_mismatch = 5, grna_list = grnas)


## WRITE BASH SCRIPT
job_name = "cas_offinder_iGEO"
script_name = "CasOffinder_iGEO.sh"
cas_off_path = "/home/ubuntu/software/cas_offinder_v3/cas-offinder-3.0.0b3/cas-offinder"
bash_lines = ["#!/bin/bash","#SBATCH -p gpu", f"#SBATCH --job-name {job_name}", "#SBATCH -o %j.out","#SBATCH -e %j.err"]

# One Cas-OFFinder command per editor/PAM. The input-file name must match the
# one generate_casoff_input_file() produces.
cas_offs = []
output_files = []
for protein,pam in protein_pam_dict.items():
    input_file = f"{protein}_{pam}_Input_File_v3.txt"
    output_file = f"{pwd}{protein}_{pam}_Output_File_v3.txt"
    output_files.append(output_file)
    # "G" is passed as the device argument (presumably GPU -- confirm against the Cas-OFFinder CLI).
    cas_off = f"{cas_off_path} {pwd}{input_file} G {output_file}"
    cas_offs.append(cas_off)

with open(script_name, "w") as f:
    # write(), not writelines(): the content is already one joined string.
    f.write("\n".join(bash_lines+cas_offs))

# RUN CAS-OFFINDER    
#os.system(f"sbatch {script_name}") 


In [3]:
#### CONSOLIDATE CAS-OFFINDER OUTPUT FILES ####
casoff_dir = "/groups/doudna/projects/mtrinidad_projects/Geo_Cas9_Indel_HDR_KC/iGEO_Off_Target_Analysis/Off_Target_Prediction/CasOffinder_Results/"
casoff_cols = ["Id", "Bulge type", "crRNA", "DNA", "Chromosome", "Location", "Direction", "Mismatches", "Bulge Size"]
ot_df_dict = {}   # analysis name -> filtered off-target DataFrame
coord_dict = {}   # analysis name -> unique Coordinate_IDs that passed the seed filter
seq_dict = {}     # not populated in this cell

# Largest number of mismatches + bulges tolerated inside the seed window.
max_seed_diffs = 2

for output_file in [x.replace(pwd, casoff_dir) for x in output_files]: # Note: Output files reorganized
    base = output_file.split("/")[-1].split("_Output_File_v3.txt")[0]
    # base is "{protein}_{pam}"; split on the LAST underscore so protein
    # labels that themselves contain underscores are kept whole.
    protein, _, pam = base.rpartition("_")
    print(f"#### PROCESSING: {base}, {protein}, {pam}")

    # READ CASOFFINDER RESULTS
    temp_df = pd.read_csv(output_file, sep="\t", names=casoff_cols, skiprows=2)
    print(f"# N Unfiltered: {temp_df.shape[0]}")
    
    # FLAG OFF-TARGETS
    # Windows are fixed offsets into the PAM-trimmed validation string.
    # NOTE(review): position -3 is covered by none of the four windows
    # ([0:3], [3:-10], [-10:-3], [-2:]) -- confirm this gap is intended.
    temp_df["Validation_String"] = temp_df.apply(generate_validation_str, trim_len = len(pam), axis = 1)
    temp_df['Num_Seed_Diffs'] = temp_df["Validation_String"].str[-10:-3].apply(count_mismatches)
    temp_df['Pass_Seed_Filter'] = temp_df['Num_Seed_Diffs'] <= max_seed_diffs
    temp_df["Proximal_2bp_Diffs"] = temp_df["Validation_String"].str[-2:].apply(count_mismatches)
    temp_df["Distal_3bp_Diffs"] = temp_df["Validation_String"].str[0:3].apply(count_mismatches)
    temp_df["Middle_Diffs"] = temp_df["Validation_String"].str[3:-10].apply(count_mismatches)
    temp_df["CasOffinder_Output"] = base

    # FILTER AND SORT
    # sort_values() returns a new frame, so the column added below is written
    # to an independent object rather than to a slice of temp_df.
    filtered_df = temp_df.loc[temp_df.Pass_Seed_Filter].sort_values(["Middle_Diffs"], ascending=True) # "Mismatches", "Bulge Size"
    filtered_df["Coordinate_ID"] = filtered_df['Chromosome'].astype(str) + '_' + filtered_df['Location'].astype(str) + '_' + \
                                    filtered_df['Direction'].astype(str)
    coord_dict[base] = filtered_df.Coordinate_ID.unique()
    ot_df_dict[base] = filtered_df
    print(f"# TOTAL OFF-TARGETS (Passing <= {max_seed_diffs} Seed-Region Mismatches & Bulges): {filtered_df.shape[0]}")

## LABEL COMMON OFF_TARGETS AMONGST EDITORS AND GUIDES
# Maps the integer in the "Id" column to a guide label.
grna_id_dict = {0:"EMX1-AllT1", 
                1:"EMX1-AllT2", 
                2:"AAVS1-AllT1", 
                3:"AAVS1-AllT2"}
for ot_analysis_1, ot_df in ot_df_dict.items():
    print(f"#### PROCESSING: {ot_analysis_1}")
    for ot_analysis_2,coords in coord_dict.items():
        if ot_analysis_1 == ot_analysis_2:
            print(f"!!!!Skipping {ot_analysis_2}")
            continue
        print(f"# Added {ot_analysis_2}")
        # 1 when the same Coordinate_ID also passed the filter in the other analysis, else 0.
        new_col = f"OT_In_{ot_analysis_2}"
        ot_df[new_col] = ot_df.Coordinate_ID.isin(coords).astype("int64")
    # Keep only rows whose Chromosome contains "chr"; copy so the Id
    # relabelling below does not write into a slice.
    ot_df = ot_df.loc[ot_df.Chromosome.str.contains("chr")].copy()
    ot_df["Id"] = ot_df["Id"].apply(lambda x: grna_id_dict[x])
    ot_df_dict[ot_analysis_1] = ot_df
    ot_df.to_csv(f"{ot_analysis_1}_OffTargets.csv")

## FIND COMMON OTs
overlap_cols = [f"OT_In_{x}" for x in coord_dict.keys()]  # not used below
to_merge = [v for k,v in ot_df_dict.items()]
all_dfs = pd.concat(to_merge, ignore_index=True)
# Number of distinct analyses in which each Coordinate_ID occurs.
common_entries = all_dfs.groupby(['Coordinate_ID'])['CasOffinder_Output'].nunique()

# Filter groups that appear in all DataFrames
common_entries = common_entries[common_entries == len(to_merge)]

# Reset index
common_entries = common_entries.reset_index()[['Coordinate_ID']]
print(common_entries)

#### PROCESSING: wt_geo_paper_pam_NNNNCWAA, wt_geo_paper_pam, NNNNCWAA
# N Unfiltered: 5580
# TOTAL OFF-TARGETS (Passing < 2 Seed-Region Mismatches & Bulges): 4411
#### PROCESSING: igeo_NNNNCNNN, igeo, NNNNCNNN
# N Unfiltered: 158947
# TOTAL OFF-TARGETS (Passing < 2 Seed-Region Mismatches & Bulges): 134077
#### PROCESSING: wt_nme2cas9_NNNNCC, wt_nme2cas9, NNNNCC
# N Unfiltered: 33259
# TOTAL OFF-TARGETS (Passing < 2 Seed-Region Mismatches & Bulges): 27176
#### PROCESSING: inme_prelim1_NNNNCH, inme_prelim1, NNNNCH
# N Unfiltered: 152356
# TOTAL OFF-TARGETS (Passing < 2 Seed-Region Mismatches & Bulges): 128428
#### PROCESSING: wt_geo_paper_pam_NNNNCWAA
!!!!Skipping wt_geo_paper_pam_NNNNCWAA
# Added igeo_NNNNCNNN
# Added wt_nme2cas9_NNNNCC
# Added inme_prelim1_NNNNCH
#### PROCESSING: igeo_NNNNCNNN
# Added wt_geo_paper_pam_NNNNCWAA
!!!!Skipping igeo_NNNNCNNN
# Added wt_nme2cas9_NNNNCC
# Added inme_prelim1_NNNNCH
#### PROCESSING: wt_nme2cas9_NNNNCC
# Added wt_geo_paper_pam_NNNNCWAA
# Added 