In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import LocalOutlierFactor
import os
from bib import *

### Load JACUSA CALL2 features

In [None]:
# Inputs, parameters and output folder handed over by snakemake
inp1, inp2, inp3 = (
    snakemake.input[0],  # WT vs IVT JACUSA CALL2 Output
    snakemake.input[1],  # KO vs IVT JACUSA CALL2 Output
    snakemake.input[2],  # WT vs KO JACUSA CALL2 Output
)
(
    dtype,            # MinION or Flongle
    target,           # KO target position
    ref,              # NR_003286_RNA18SN5 or NR_003287_RNA28SN5
    lof_thre,         # LOF contamination value
    lof_neigh,        # LOF neighborhood size
    label,
    mod_status_file,  # rRNA modification status file
) = (snakemake.params[idx] for idx in range(7))

# Output folder: create it when missing, then keep it with a trailing slash
# so later cells can build file paths by plain string concatenation.
output = snakemake.output[0]
if not os.path.exists(output):
    os.makedirs(output, exist_ok=True)
output += "/"

# Comparison label -> JACUSA CALL2 output file
dic = {'wt_ivt': inp1, 'ko_ivt': inp2, 'ko_wt': inp3}

In [2]:
# Hard-coded stand-ins for the snakemake parameters (same variable names as the
# snakemake cell). The whole cell is commented out; presumably kept so the
# notebook can be run by hand without snakemake -- TODO confirm it is still needed.
# # Params from snakemake
# inp1 = '../data/Output/snakemake/jacusa/MinION/oligo/oligo_A_A_v222vsoligo_m6A_A_v222_call2.out'       # WT vs IVT JACUSA CALL2 Output
# inp2 = '../data/Output/snakemake/jacusa/MinION/oligo/oligo_m6A_A_v222vsoligo_m6A_A_v222_call2.out'       # KO vs IVT JACUSA CALL2 Output
# inp3 = '../data/Output/snakemake/jacusa/MinION/oligo/oligo_A_A_v222vsoligo_m6A_A_v222_call2.out'       # WT vs KO JACUSA CALL2 Output
# dtype = 'MinION'     # MinION or Flongle
# target = [50]    # KO target position
# ref = 'Ref_m1A_and_random_m6A'       # NR_003286_RNA18SN5 or NR_003287_RNA28SN5
# lof_thre = 0.001  # LOF contamination value
# lof_neigh = 20 # LOF neighborhood size
# label = 'oligo' 
# mod_status_file = '../data/mod_Ribozyme.tsv' # rRNA modification status file
# output =  '../data/Output/snakemake' # output folder
# if not os.path.exists(output):
#     os.makedirs(output, exist_ok=True)
# output = output + "/"

# dic = dict({'wt_ivt':inp1 , 'ko_ivt' : inp2, 'ko_wt':inp3})

In [3]:
# JACUSA CALL2 features
#
# For every comparison in `dic` (label -> JACUSA CALL2 output file): read the
# file, extract the per-position features, join them with the modification
# status table on "<Ref>_<Pos>", add the 5mer-context features and collect the
# result. All comparisons are stacked into `table` once, after the loop.

# load rRNA modifications status (same file for every comparison, so it is
# read once instead of once per loop iteration)
mods = pd.read_csv(mod_status_file, sep=',')
mods['Ref_Pos'] = mods[mods.columns[0]] + "_" + mods[mods.columns[1]].astype(str)

# One frame per comparison. Building the list from scratch and concatenating
# once makes the cell idempotent: re-running it rebuilds `table` instead of
# appending duplicate rows to a `table` left over in the kernel.
# (DataFrame.append, used previously, no longer exists in pandas >= 2.0.)
frames = []
for key in dic:
    df0 = pd.read_csv(dic[key], sep='\t', skiprows=1)
    df0 = ExtractFeatures(df0)
    df0['Ref_Pos'] = df0["Ref"] + "_" + df0["Pos"].astype(str)

    # merge features with modifications (inner join: only positions present
    # in the modification status file are kept)
    df1 = pd.merge(df0, mods[['Ref_Pos', 'ModStatus', 'Status']], on='Ref_Pos')
    df1 = df1.sort_values(by=['Ref', 'Pos']).reset_index(drop=True)

    # Add features in 5mer context.
    # Build the table of features in 5mer context:
    #   feat1: Mismatch
    #   feat2: Mismatch + Insertion + Deletion
    #   feat3: Mismatch in the 5mer context + Insertion + Deletion
    #   feat4: Mismatch + Insertion + Deletion, all in the 5mer context
    df2 = KmerFeatures(df1)
    feat1 = df2.Mis
    feat2 = df2.Mis + df2.Ins + df2.Del
    feat3 = df2.SumMis + df2.Ins + df2.Del
    feat4 = df2.SumMis + df2.SumIns + df2.SumDel

    dfsave = pd.DataFrame({
        'label': key,
        'dtype': dtype,
        'Ref_Pos': df2['Ref_Pos'],
        'Ref': df2['Ref'],
        'Pos': df2['Pos'],
        'Coverage1': df2['Cov1'],
        'Coverage2': df2['Cov2'],
        'Mis': feat1,
        'Mis+Del+Ins': feat2,
        'MisContext+Del+Ins': feat3,
        'Mis+Del+Ins_Context': feat4,
        'ModStatus': df2['ModStatus'],
        'Status': df2['Status'],
        'Kmer': df2['5mer'],
    })

    # add features to the table of features
    frames.append(dfsave)

# Stack all comparisons; each frame keeps its own row index, as before.
table = pd.concat(frames)

In [14]:
# Write the combined feature table into the output folder
# (`output` already ends with "/", so plain concatenation yields the file path).
features_csv_path = output + 'Features_JACUSA2CALL2.csv'
table.to_csv(features_csv_path, index=False)

Generate scatter plots for each feature combination. The features must already have been generated and added to the table of features.

In [None]:
features = ['Mis', 'Mis+Del+Ins', 'MisContext+Del+Ins', 'Mis+Del+Ins_Context']

# Everything below up to the loop is independent of the plotted feature, so it
# is computed once instead of once per feature.

# The three comparison labels, in order of first appearance in `table`
labels = table['label'].unique()
label1 = labels[0]
label2 = labels[1]
label3 = labels[2]

# Rows of each comparison, restricted to the reference sequence `ref`
on_ref = table["Ref"] == ref
x = table[(table["label"] == label1) & on_ref]
y = table[(table["label"] == label2) & on_ref]
z = table[(table["label"] == label3) & on_ref]

# Join the three comparisons position-wise. With pandas' default suffixes the
# columns of `x` end in "_x" and those of `y` in "_y"; the columns of `z` no
# longer clash with anything in the second merge and keep their plain names.
df_ = pd.merge(x, y, on='Ref_Pos')
df_ = pd.merge(df_, z, on='Ref_Pos')

for feature in features:
    # 'Pos' and the unsuffixed feature column come from the third comparison
    ScatterPlot_3Feat(df_[['Pos', feature + '_x', feature + '_y', feature]],
                      label, feature, ref, target, dtype, lof_thre, lof_neigh,
                      path=output)
    #  only outliers are labeled here ... ax.annotate() can be used to annotate other positions of interest. 