# IVT vs WT Analysis of the Genetic Model

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import LocalOutlierFactor
import os
import math
from bib import *

Generate bar plots for every combination of features. The features are assumed to have already been generated and added to the table of features.

In [2]:
# # Params from snakemake
# inp1 = '../../output/snakemake/jacusa/Flongle/tetRvstetR_met/Ribozyme_teR-met_blaRvstetR-I_blaR-met_call2.out'       # WT vs IVT JACUSA CALL2 Output
# dtype = 'min'     # MinION or Flongle
# method = 'LOF'    # method name passed to BarPlot (e.g. 'LOF')
# ref = 'T7-tetR'       # reference name to plot (matched against the 'Ref' column)
# lof_thre = 0.01  # LOF contamination value
# lof_neigh = 20 # LOF neighborhood size
# output =  ''   # output folder
# mod_status_file = "../../data/tetR_blaR/mod_tetR_blaR.tsv" # rRNA modification status file
# target = [44, 110, 173, 288, 449, 548] 
# label ='wt_ivt'

In [None]:
# --- Inputs, outputs and parameters injected by Snakemake -------------------
inp1 = snakemake.input[0]      # JACUSA CALL2 output (WT vs IVT)
output = snakemake.output[0]   # output folder

# Positional parameters, read in the order the workflow rule supplies them.
smk_params = snakemake.params
dtype = smk_params[0]            # MinION or Flongle
method = smk_params[1]           # method name, forwarded to BarPlot as `method`
ref = smk_params[2]              # reference name, matched against the 'Ref' column
lof_thre = smk_params[3]         # LOF contamination value
lof_neigh = smk_params[4]        # LOF neighborhood size
mod_status_file = smk_params[5]  # modification status file
target = smk_params[6]           # forwarded to BarPlot as `target`
label = smk_params[7]            # stored in the 'label' column and used as plot title

# Make sure the output folder exists, then keep it with a trailing slash so
# file names can be appended to it directly.
if not os.path.exists(output):
    os.makedirs(output, exist_ok=True)
output = output + "/"

In [3]:
    # Build the per-position feature table for one JACUSA CALL2 result and
    # add it to the notebook-wide `table` DataFrame.

    # JACUSA CALL2 features: read the tab-separated output (first line is
    # skipped) and let the project helper ExtractFeatures process it.
    df0 = pd.read_csv(inp1, sep='\t', skiprows=1)
    df0 = ExtractFeatures(df0)
    # Join key of the form "<Ref>_<Pos>".
    df0['Ref_Pos'] = df0["Ref"] + "_" + df0["Pos"].astype(str)

    # Load the modification status table (read as comma-separated). The key is
    # built from its first two columns, whatever they are named.
    mods = pd.read_csv(mod_status_file, sep=',')
    mods['Ref_Pos'] = (
        mods[mods.columns[0]] + "_" + mods[mods.columns[1]].astype(str)
    )

    # Merge features with the modification status. pd.merge defaults to an
    # inner join, so only positions present in both tables are kept.
    df1 = pd.merge(df0, mods[['Ref_Pos', 'ModStatus', 'Status']], on='Ref_Pos')
    df1 = df1.sort_values(by=['Ref', 'Pos']).reset_index(drop=True)

    # Add features in 5mer context. Four feature combinations are built:
    #   feat1: Mismatch
    #   feat2: Mismatch + Insertion + Deletion
    #   feat3: Mismatch in the 5mer context + Insertion + Deletion
    #   feat4: Mismatch + Insertion + Deletion, all in the 5mer context
    df2 = KmerFeatures(df1)
    feat1 = df2.Mis
    feat2 = df2.Mis + df2.Ins + df2.Del
    feat3 = df2.SumMis + df2.Ins + df2.Del
    feat4 = df2.SumMis + df2.SumIns + df2.SumDel

    # Key order defines the column order of the saved table.
    dfsave = pd.DataFrame({
        'label': label,
        'dtype': dtype,
        'Ref_Pos': df2['Ref_Pos'],
        'Ref': df2['Ref'],
        'Pos': df2['Pos'],
        'Coverage1': df2['Cov1'],
        'Coverage2': df2['Cov2'],
        'Mis': feat1,
        'Mis+Del+Ins': feat2,
        'MisContext+Del+Ins': feat3,
        'Mis+Del+Ins_Context': feat4,
        'ModStatus': df2['ModStatus'],
        'Status': df2['Status'],
        'Kmer': df2['5mer'],
    })

    # Add features to the table of features. DataFrame.append was removed in
    # pandas 2.0; pd.concat with default arguments gives the same result
    # (original indices kept, columns not sorted).
    if 'table' in globals():
        table = pd.concat([table, dfsave])
    else:
        table = dfsave

In [25]:
# Write the accumulated feature table to the output folder as CSV, without
# the index column. `output` already ends with a path separator.
features_csv = output + 'Features_JACUSA2CALL2.csv'
table.to_csv(features_csv, index=False)

In [None]:
    # Draw one bar plot per feature for the current label and reference.
    features = ['Mis', 'Mis+Del+Ins', 'MisContext+Del+Ins', 'Mis+Del+Ins_Context']
    title = label
    # The row selection depends only on `label` and `ref`, not on the feature,
    # so it is computed once instead of on every loop iteration.
    df_ = table[(table["label"] == label) & (table["Ref"] == ref)]
    for feature in features:
        BarPlot(
            df_[['Pos', feature, 'ModStatus', 'Status']],
            feature,
            title,
            target,
            outlier=lof_thre,
            neigh=lof_neigh,
            path=output,
            method=method,
        )
        #  only outliers are labeled here ... ax.annotate() can be used to annotate other positions of interest.

In [7]:
# inp1 = '../../output/snakemake/jacusa/Flongle/tetR_blaR/Ribozyme_teR-met_blaRvsRibozyme_tetR_blaR-met_call2.out'
# df0 = pd.read_csv(inp1, sep = '\t',skiprows=1)
# df0= df0[df0['strand']=='+']
# mod = pd.DataFrame([])
# mod['Ref'] = df0['#contig']
# mod['Position']= df0['end']
# mod['ModStatus']= 'Unm'
# mod['Status']= 'Unm'
# mod

In [8]:
# mod.to_csv(output+'mod_tetR_balR.tsv', index=False)