In [1]:
import pandas as pd
import seaborn as sns
from pycaret.regression import *
import ViennaRNA as RNA

# Set the NaCl concentration used by ViennaRNA to 150 mM.
# The value is assigned through RNA.cvar before any RNA.fold call further down.
# NOTE(review): 0.150 is presumably expressed in mol/L and presumably picked up
# by the later folding calls — confirm against the installed ViennaRNA version.
RNA.cvar.salt = 0.150

# Load the saved PyCaret pipelines, one per 8-nt window, keyed by the window label.
# The load order follows the window order (17-24 first, 45-52 last).
model_dict = {
    '17-24': load_model('17-24 log2 bc final xgbr, cbr, lgbm, br.pc1'),
    '21-28': load_model('21-28 log2 bc final cbr, lgbm, br.pc1'),
    '25-32': load_model('25-32 log2 bc final cbr&lgbm.pc1'),
    '29-36': load_model('29-36 log2 bc final cbr, lgbm, br.pc1'),
    '33-40': load_model('33-40 log2 bc final cbr, lgbm, br.pc1'),
    '37-44': load_model('37-44 log2 bc final cbr, lgbm, br.pc1'),
    '41-48': load_model('41-48 log2 bc final cbr, xgbr, lgbm.pc1'),
    '45-52': load_model('45-52 log2 bc final cbr, lgbm, br.pc1'),
}

# Per-window aliases pointing at the very same loaded objects.
model_17_24 = model_dict['17-24']
model_21_28 = model_dict['21-28']
model_25_32 = model_dict['25-32']
model_29_36 = model_dict['29-36']
model_33_40 = model_dict['33-40']
model_37_44 = model_dict['37-44']
model_41_48 = model_dict['41-48']
model_45_52 = model_dict['45-52']

# Per-window weights (one r2_* constant per 8-nt window model).
# NOTE(review): presumably the R² score of each window's model — confirm the source of these numbers.
r2_17_24 = 0.4558  # window 17-24
r2_21_28 = 0.5967  # window 21-28
r2_25_32 = 0.511   # window 25-32
r2_29_36 = 0.6641  # window 29-36
r2_33_40 = 0.5455  # window 33-40
r2_37_44 = 0.3797  # window 37-44
r2_41_48 = 0.2674  # window 41-48
r2_45_52 = 0.3145  # window 45-52

# Total weight, used as the normaliser of the weighted average of the window predictions.
r2_sum = (
    r2_17_24 + r2_21_28 + r2_25_32 + r2_29_36
    + r2_33_40 + r2_37_44 + r2_41_48 + r2_45_52
)

# Read the per-window tables of all possible 8 bp mutations and their features,
# keyed by the window label. The CSV column 'Unnamed: 0' becomes the row index.
all_mut_dict = {
    '17-24': pd.read_csv('all_mut_df_17-24.csv', index_col = 'Unnamed: 0'),
    '21-28': pd.read_csv('all_mut_df_21-28.csv', index_col = 'Unnamed: 0'),
    '25-32': pd.read_csv('all_mut_df_25-32.csv', index_col = 'Unnamed: 0'),
    '29-36': pd.read_csv('all_mut_df_29-36.csv', index_col = 'Unnamed: 0'),
    '33-40': pd.read_csv('all_mut_df_33-40.csv', index_col = 'Unnamed: 0'),
    '37-44': pd.read_csv('all_mut_df_37-44.csv', index_col = 'Unnamed: 0'),
    '41-48': pd.read_csv('all_mut_df_41-48.csv', index_col = 'Unnamed: 0'),
    '45-52': pd.read_csv('all_mut_df_45-52.csv', index_col = 'Unnamed: 0'),
}

# Per-window aliases pointing at the very same DataFrame objects.
all_mut_df_17_24 = all_mut_dict['17-24']
all_mut_df_21_28 = all_mut_dict['21-28']
all_mut_df_25_32 = all_mut_dict['25-32']
all_mut_df_29_36 = all_mut_dict['29-36']
all_mut_df_33_40 = all_mut_dict['33-40']
all_mut_df_37_44 = all_mut_dict['37-44']
all_mut_df_41_48 = all_mut_dict['41-48']
all_mut_df_45_52 = all_mut_dict['45-52']

Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded


In [2]:
# Terminators to score; the two lists below are parallel to this one
# (entry k of each list belongs to terminator k).
terminators = [
    'sNRP-1',
    'SV40 early',
]

# Downstream sequence elements: +3..+38 nts after the main cleavage site.
DSEs = [
    'catgcgtcaattttacgcatgattatctttaacgta',
    'ctagttgtggtttgtccaaactcatcaatgtatcttat',
]

# Sequences used for the folding-energy feature: -24..+38 nts around the main cleavage site.
MFE_seq = [
    'aataaaatacgaaatgactagtcatgcgtcaattttacgcatgattatctttaacgtacgtc',
    'ataaagcatttttttcactgcattctagttgtggtttgtccaaactcatcaatgtatcttat',
]

In [3]:
# Folding-energy feature for every sequence in MFE_seq: 1.09543 raised to the
# second element returned by RNA.fold (the minimum free energy of the fold).
# NOTE(review): the base 1.09543 is not explained in this notebook; presumably it
# has to match the transform used for the models' `exp_MFE` feature — confirm.
MFE = [1.09543 ** RNA.fold(seq)[1] for seq in MFE_seq]

# One row per terminator. Building the frame from a dict (instead of transposing
# a list of rows) keeps every column in its natural dtype, e.g. MFE stays float.
df = pd.DataFrame({
    'terminator': terminators,
    'DSE': DSEs,
    'MFE_seq': MFE_seq,
    'MFE': MFE,
})

# Window label -> 0-based start offset of that 8-nt window inside the DSE string.
# Consecutive windows are shifted by 4 nt, so neighbouring windows overlap by 4 nt.
DSE_WINDOW_STARTS = {
    '17-24': 0,
    '21-28': 4,
    '25-32': 8,
    '29-36': 12,
    '33-40': 16,
    '37-44': 20,
    '41-48': 24,
    '45-52': 28,
}

# Extract the upper-cased 8-mer of every window in one vectorised step per column.
# Creating the columns directly as strings avoids first filling them with integer 0
# and then overwriting single cells with text, which is an incompatible-dtype
# assignment that recent pandas versions warn about (and newer ones reject).
for window, start in DSE_WINDOW_STARTS.items():
    df[window] = df['DSE'].str[start:start + 8].str.upper()

In [4]:
# For every window: look up the feature row of each terminator's 8-mer in that
# window's mutation table, attach the folding-energy feature, and replace the
# 8-mer in df[lib] with the model prediction.
# NOTE: this overwrites the sequence columns of `df` with numbers, so the cell is
# not re-runnable on its own — re-run the cell that builds `df` first.
for lib in model_dict:
    mut_table = all_mut_dict[lib]
    index_list = []

    for mutation in df[lib]:
        # Row *positions* (not index labels) of the matching rows, so the .iloc
        # lookup below is correct whatever the table's index looks like.
        hits = (mut_table['Mutation'] == mutation).to_numpy().nonzero()[0]
        if len(hits) == 0:
            raise KeyError(f"Mutation {mutation!r} not found in the mutation table for window '{lib}'")
        # As before, only the first matching row is used.
        index_list.append(hits[0])

    # Explicit copy so adding the feature column never touches the cached table.
    pred_df = mut_table.iloc[index_list].copy()
    pred_df['exp_MFE'] = df['MFE'].to_list()
    DoSIA_df = predict_model(model_dict[lib], data = pred_df.reset_index())

    df[lib] = DoSIA_df.prediction_label.to_list()


In [5]:
# Weight applied to each window's prediction column (the r2_* constants).
window_weights = {
    '17-24': r2_17_24,
    '21-28': r2_21_28,
    '25-32': r2_25_32,
    '29-36': r2_29_36,
    '33-40': r2_33_40,
    '37-44': r2_37_44,
    '41-48': r2_41_48,
    '45-52': r2_45_52,
}

# Final score: weighted average of the eight window predictions, normalised by
# the total weight. Bracket assignment creates the column directly, so no
# placeholder column is needed (attribute-style `df.pred = ...` only works when
# the column already exists).
df['pred'] = sum(df[window] * weight for window, weight in window_weights.items()) / r2_sum

df

Unnamed: 0,terminator,DSE,MFE_seq,MFE,17-24,21-28,25-32,29-36,33-40,37-44,41-48,45-52,pred
0,sNRP-1,catgcgtcaattttacgcatgattatctttaacgta,aataaaatacgaaatgactagtcatgcgtcaattttacgcatgatt...,0.275345,0.489346,-0.00612,-0.02706,0.179254,0.287595,-0.013078,0.013429,0.068335,0.13431
1,SV40 early,ctagttgtggtttgtccaaactcatcaatgtatcttat,ataaagcatttttttcactgcattctagttgtggtttgtccaaact...,0.63456,0.995939,1.205513,1.67563,1.81722,1.29455,0.838331,0.37257,0.287452,1.191759
