In [35]:
import numpy as np
from sklearn.cluster import SpectralClustering,KMeans
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import matthews_corrcoef,accuracy_score,precision_score,recall_score,confusion_matrix
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import os

In [36]:
# HL: transfer from dataden "/umms-kinfai/duolin/ying/reditools2_candidates/"
hash_candidat={}
hash_candidat['AFG-H1_directRNA']='H1-AFG.candidate_sites.tab'
hash_candidat['AFG-H9_directRNA']='H9-AFG.candidate_sites.tab'
hash_candidat["PGC-H1_directRNA"]='H1-PGC.candidate_sites.tab'
hash_candidat["DE-H1_directRNA"]='H1-DE.candidate_sites.tab'
hash_candidat["DE-H9_directRNA"]='H9-DE.candidate_sites.tab'
hash_candidat["GM12878_directRNA"]='GM12878.candidate_sites.tab'
hash_candidat["H1-hESC_directRNA"]='H1-hESC.candidate_sites.tab'
hash_candidat["H9-hESC_directRNA"]='H9-hESC.candidate_sites.tab'
hash_candidat['HEK293T_DKO_directRNA']='HEK293T_WT.candidate_sites.tab'
hash_candidat["HEK293T_WT_directRNA"]='HEK293T_WT.candidate_sites.tab'
hash_candidat["HEK_WT_pass"]='HEK293T_WT.candidate_sites.tab'

In [37]:
long_reads_min_coverage = coverage_cutoff = 5

In [38]:
covfilename=5
datatype='HEK293T_WT_directRNA'
includesnp=False #False
outputfolder="figures"

#REDD
featuredim=5
windowsize=9
inputfolder="/nfs/turbo/umms-kinfai/haorli/20240314_ReDD_result_data/figure2c/ReDD_results/"
modelname = 'hg38_merge9alldata5_noearlystop_run1ep40_run2ep60_negsite_HEK293T_WT_directRNA_epochs5_KOnonw5_withLSTM'
candidatefile="/nfs/turbo/umms-kinfai/haorli/20240314_ReDD_result_data/figure2a/reditools2_candidates/"+hash_candidat[datatype]

filename1=modelname+"/"+datatype+"_onlycandidate_cov"+str(covfilename)+"_ratio0_modcov0.sitelev.bed" #background must use no filter except coverage
filename2=modelname+"/"+datatype+"_noncandidate_cov"+str(covfilename)+"_ratio0_modcov0.sitelev.bed"

In [39]:
AG_ratio_per_site={}
shortreadcoverage={}
input = open(candidatefile,'r')
for line in input:
    chr_ = line.split()[0]
    pos_ = line.split()[1]
    chrpos=chr_+"-"+pos_
    if float(line.split("\t")[4]) < 10:
        continue
    AG_ratio_per_site[chrpos]=float(line.split("\t")[3])
    shortreadcoverage[chrpos]=float(line.split("\t")[4])

In [40]:
#REDD
featuredim=5
windowsize=9
inputfolder="/nfs/turbo/umms-kinfai/haorli/20240314_ReDD_result_data/figure2c/ReDD_results/"
modelname = 'hg38_merge9alldata5_noearlystop_run1ep40_run2ep60_negsite_HEK293T_WT_directRNA_epochs5_KOnonw5_withLSTM'

# WT data
datatype="HEK293T_WT_directRNA"
filename=inputfolder+"/"+modelname+"/"+datatype+"_onlycandidate.txt"
input = open(filename)
pos_coverage={}
coverage = {}
cutoff = 0.5#0.5
for line in input:
    score=float(line.split("\t")[-1])
    if score>=cutoff:
       predict_label=1
    else:
       predict_label=0
    
    transid= line.split("\t")[2]
    transpos = line.split("\t")[3]
    chrpos = transid+"-"+transpos
    
    if chrpos not in pos_coverage.keys():
        pos_coverage[chrpos]=predict_label
        coverage[chrpos]=1
    else:
        pos_coverage[chrpos]+=predict_label
        coverage[chrpos]+=1
long_reads_min_coverage = 5
WT_predict_value_all={}
for site in AG_ratio_per_site:   
     if site in pos_coverage:
          if coverage[site] >= long_reads_min_coverage:
              WT_predict_value_all[site]=float(pos_coverage[site]/coverage[site])
                

# DKO data
datatype="HEK293T_DKO_directRNA"
filename=inputfolder+"/"+modelname+"/"+datatype+"_onlycandidate.txt"
input = open(filename)
pos_coverage={}
coverage = {}
cutoff = 0.5#0.5
for line in input:
    score=float(line.split("\t")[-1])
    if score>=cutoff:
       predict_label=1
    else:
       predict_label=0
    
    transid= line.split("\t")[2]
    transpos = line.split("\t")[3]
    chrpos = transid+"-"+transpos
    
    if chrpos not in pos_coverage.keys():
        pos_coverage[chrpos]=predict_label
        coverage[chrpos]=1
    else:
        pos_coverage[chrpos]+=predict_label
        coverage[chrpos]+=1
long_reads_min_coverage = 5
DKO_predict_value_all={}
for site in AG_ratio_per_site:   
     if site in pos_coverage:
          if coverage[site] >= long_reads_min_coverage:
              DKO_predict_value_all[site]=float(pos_coverage[site]/coverage[site])

In [41]:
true_list=[]
WT_predict_list = []
DKO_predict_list = []
coords_list = []
for key in WT_predict_value_all:#1259
    true_list.append(AG_ratio_per_site[key])
    WT_predict_list.append(WT_predict_value_all[key])
    if key in DKO_predict_value_all:
        DKO_predict_list.append(DKO_predict_value_all[key])
    else:
        DKO_predict_list.append(0)
    coords_list.append(key)
true_list,WT_predict_list,DKO_predict_list,coords_list = np.array(true_list),np.array(WT_predict_list),np.array(DKO_predict_list),np.array(coords_list)


In [42]:
def check_site_in_range(site,example_chr,example_start,example_end):
    rname,pos = site.split('-')
    if rname == example_chr:
        if int(pos) >= example_start:
            if int(pos) <= example_end:
                return True
    return False

In [43]:
example_chr = 'chr11'
example_start = 31430000
example_end = 31431500

In [44]:
import pandas as pd
df = pd.DataFrame([coords_list,true_list,WT_predict_list,DKO_predict_list]).T
df.columns = ['Site','truth_ratio','ReDD_WT_ratio','ReDD_DKO_ratio']

# example_df = df[df['truth_ratio'] >= pos_coverage_ratio]

example_df = example_df[example_df['Site'].apply(lambda x:check_site_in_range(x,example_chr,example_start,example_end))]

In [45]:
example_df

Unnamed: 0,Site,truth_ratio,ReDD_WT_ratio,ReDD_DKO_ratio,ReDD_WT_prediction,ReDD_DKO_prediction,ReDD_threshold
8915,chr11-31431024,0.29,0.107143,0.0,editing,non-editing,0.051
8916,chr11-31431038,0.12,0.033333,0.0,non-editing,non-editing,0.051
8918,chr11-31431051,0.34,0.393939,0.029412,editing,non-editing,0.051
8919,chr11-31431089,0.81,0.583333,0.0,editing,non-editing,0.051
8920,chr11-31431137,0.2,0.184211,0.0,editing,non-editing,0.051
8924,chr11-31431200,0.17,0.454545,0.068182,editing,editing,0.051
8925,chr11-31431203,0.48,0.651163,0.136364,editing,editing,0.051
8927,chr11-31431240,0.84,0.826087,0.042553,editing,non-editing,0.051
8928,chr11-31431241,0.73,0.818182,0.021277,editing,non-editing,0.051
8929,chr11-31431242,0.4,0.418605,0.021277,editing,non-editing,0.051


In [46]:
#REDD
redd_thres = 0.051

In [47]:
#export of IGV
example_df['ReDD_WT_prediction'] = example_df['ReDD_WT_ratio'].apply(lambda x:'editing' if x>=redd_thres else 'non-editing')
example_df['ReDD_DKO_prediction'] = example_df['ReDD_DKO_ratio'].apply(lambda x:'editing' if x>=redd_thres else 'non-editing')
example_df['ReDD_threshold'] = redd_thres

In [48]:
example_df

Unnamed: 0,Site,truth_ratio,ReDD_WT_ratio,ReDD_DKO_ratio,ReDD_WT_prediction,ReDD_DKO_prediction,ReDD_threshold
8915,chr11-31431024,0.29,0.107143,0.0,editing,non-editing,0.051
8916,chr11-31431038,0.12,0.033333,0.0,non-editing,non-editing,0.051
8918,chr11-31431051,0.34,0.393939,0.029412,editing,non-editing,0.051
8919,chr11-31431089,0.81,0.583333,0.0,editing,non-editing,0.051
8920,chr11-31431137,0.2,0.184211,0.0,editing,non-editing,0.051
8924,chr11-31431200,0.17,0.454545,0.068182,editing,editing,0.051
8925,chr11-31431203,0.48,0.651163,0.136364,editing,editing,0.051
8927,chr11-31431240,0.84,0.826087,0.042553,editing,non-editing,0.051
8928,chr11-31431241,0.73,0.818182,0.021277,editing,non-editing,0.051
8929,chr11-31431242,0.4,0.418605,0.021277,editing,non-editing,0.051


In [49]:
from pathlib import Path
Path('plot_data/').mkdir(exist_ok=True,parents=True)
example_df.to_csv('plot_data/IGV.tsv',sep='\t',index=False)

In [50]:
example_df

Unnamed: 0,Site,truth_ratio,ReDD_WT_ratio,ReDD_DKO_ratio,ReDD_WT_prediction,ReDD_DKO_prediction,ReDD_threshold
8915,chr11-31431024,0.29,0.107143,0.0,editing,non-editing,0.051
8916,chr11-31431038,0.12,0.033333,0.0,non-editing,non-editing,0.051
8918,chr11-31431051,0.34,0.393939,0.029412,editing,non-editing,0.051
8919,chr11-31431089,0.81,0.583333,0.0,editing,non-editing,0.051
8920,chr11-31431137,0.2,0.184211,0.0,editing,non-editing,0.051
8924,chr11-31431200,0.17,0.454545,0.068182,editing,editing,0.051
8925,chr11-31431203,0.48,0.651163,0.136364,editing,editing,0.051
8927,chr11-31431240,0.84,0.826087,0.042553,editing,non-editing,0.051
8928,chr11-31431241,0.73,0.818182,0.021277,editing,non-editing,0.051
8929,chr11-31431242,0.4,0.418605,0.021277,editing,non-editing,0.051
