In [1]:
import pandas as pd
import numpy as np
import sys
from xgboost import XGBRegressor
import csv
from sklearn.metrics import r2_score

In [2]:
def encode_input(df, window_size=1, pad=0, no_onehot=False, seq=True, struct=True):
    '''Create windowed one-hot inputs for a structure-probing regression model.

    Each nucleotide of each row in `df` produces one sample: the one-hot rows
    of the positions inside its window, flattened into a single vector.
    Window positions that fall past either end of the sequence are all-zero.
    Characters outside the alphabets below are encoded as all-zero as well.

    Inputs:

    df: dataframe (in EternaBench RDAT format). `row['sequence']` is always
        read (it sets the length); `row['bpRNA_string']` is read when `struct`
        is set.
    window_size: size of window (in one direction). so window_size=1 is a
        total window size of 3
    pad: number of nucleotides at start to not include. Implemented by starting
        every window `pad` rows earlier in the zero-padded array.
        NOTE(review): with pad > 0 the first `pad` positions get a negative
        slice start -- confirm the intended behaviour before using pad != 0.
    no_onehot: accepted for interface compatibility; never read here.
    seq (bool): include sequence encoding (A, U, G, C)
    struct (bool): include bpRNA structure encoding (H, E, I, M, B, S, X)

    Outputs:
    Input array (n_samples x n_features): array of windowed input features
    feature_names (list, length = kernel x window): feature names, i.e. `S_-12`

    '''
    seq_alphabet = ['A', 'U', 'G', 'C']
    struct_alphabet = ['H', 'E', 'I', 'M', 'B', 'S', 'X']

    # Channel layout: sequence symbols first (when enabled), then structure symbols.
    feature_kernel = (seq_alphabet if seq else []) + (struct_alphabet if struct else [])
    n_channels = len(feature_kernel)

    # Symbol -> column lookups replace the per-symbol comparison loops.
    seq_column = {symbol: col for col, symbol in enumerate(seq_alphabet)}
    struct_offset = len(seq_alphabet) if seq else 0
    struct_column = {symbol: struct_offset + col for col, symbol in enumerate(struct_alphabet)}

    # Names are ordered offset-major, matching the row-major flatten of each window.
    feature_names = [
        '%s_%d' % (symbol, offset)
        for offset in range(-1 * window_size, window_size + 1)
        for symbol in feature_kernel
    ]

    samples = []
    for _, record in df.iterrows():
        n_positions = len(record['sequence'])
        onehot = np.zeros([n_positions, n_channels])

        for pos in range(n_positions):
            if seq:
                hit = seq_column.get(record['sequence'][pos])
                if hit is not None:
                    onehot[pos, hit] += 1

            if struct:
                hit = struct_column.get(record['bpRNA_string'][pos])
                if hit is not None:
                    onehot[pos, hit] += 1

        # Zero rows on both sides so edge positions still get a full-width window.
        margin = np.zeros([window_size, n_channels])
        padded = np.vstack([margin, onehot, margin])

        for pos in range(n_positions):
            start = pos - pad
            stop = start + 2 * window_size + 1
            samples.append(padded[start:stop].flatten())

    return np.array(samples), feature_names

In [3]:
# Regressor shell with explicit hyperparameters; the next cell loads a saved model into it.
reg = XGBRegressor(
    n_estimators=8200,
    tree_method='hist',
    learning_rate=0.005,
    max_depth=7,
    subsample=0.8,
    colsample_bytree=0.9,
    reg_alpha=0.005,
)

In [4]:
# Load the saved model into `reg`. The path is relative, so this cell depends
# on the directory the notebook kernel was started from.
reg.load_model('../../model_files/bt_xgb/bt_xgb.model')

In [6]:
# Read the construct table that is scored below. The path is relative to the
# kernel's working directory and points into a sibling `DeepDeg3` directory.
OpenVaccineRound6_3 = pd.read_csv('../../../DeepDeg3/data/Round6_3/nov16_last57_CDS.csv')
OpenVaccineRound6_3.head()

Unnamed: 0,id,title,name,sequence,CAI,GC content,SUP vienna,SUP first 14 vienna,SUP first 30 vienna,punp_vec vienna,...,MFE Struct EternaFold,bprna_string,degscore_vec,degscore,MLD,n_hairpins,n_3WJs,n_4WJs,n_5WJs_up,hp_3WJ_ratio
0,10421216,Mod of wateronthemoon's OV:nanofl 6,Merida,AUGGCCGUCUACCCCUACGACGUGCCCGACUACGCCGGCUACCCCU...,0.890295,59.742351,195.361171,5.711359,16.950537,[9.98375821e-01 6.18258391e-01 2.89686821e-03 ...,...,..(((((((.........)))).))).......(((((((.(((((...,EESSSSSSSHHHHHHHHHSSSSBSSSEEEEEEESSSSSSSMSSSSS...,[ 0.747 0.393 0.36 0.242 -0.1 0.11 0....,177.064,78,20,5,1,3,3.999999
1,10421202,phoning it in 5,cynwulf28,AUGGCUGUAUAUCCGUAUGAUGUGCCCGAUUAUGCUGGGUACCCAU...,0.754168,51.529791,191.430001,3.47641,10.565406,[9.87866639e-01 9.84536094e-01 3.36349718e-03 ...,...,..(((((.((((((((.....(((((((.......))))))).(((...,EESSSSSBSSSSSSSSMMMMMSSSSSSSHHHHHHHSSSSSSSMSSS...,[ 5.96e-01 3.17e-01 3.28e-01 1.21e-01 1.50...,188.019,118,14,8,2,0,1.75
2,10421119,Mod of DigitalEmbrace's Curevac 4 Mod 2-Kaggl...,Merida,AUGGCUGUUUACCCUUACGAUGUUCCCGACUACGCCGGGUACCCGU...,0.786049,59.742351,179.628614,7.173642,15.45094,[9.74989208e-01 9.99191615e-01 5.90287475e-03 ...,...,..(((((((............(((((((.(...(.(((((((.(((...,EESSSSSSSMMMMMMMMMMMMSSSSSSSISIIISISSSSSSSISSS...,[ 0.6 0.324 0.326 0.066 0.106 0.075 0....,170.49,96,17,6,1,2,2.833333
3,10421010,phoning it in 4,cynwulf28,AUGGCCGUGUAUCCUUAUGAUGUUCCAGAUUAUGCCGGCUACCCCU...,0.778542,49.758454,218.053576,5.990581,8.63337,[9.73084676e-01 9.34549226e-01 9.55890625e-01 ...,...,...((((((((.(((..((((((((((((((((.(((((..........,EEESSSSSSSSBSSSIISSSSSSSSSSSSSSSSISSSSSIIIIIII...,[ 7.24000000e-01 3.49000000e-01 4.86000000e-...,206.416,61,16,4,1,2,3.999999
4,10420989,phoning it in 3,cynwulf28,AUGGCCGUUUACCCGUACGAUGUGCCUGACUACGCGGGCUAUCCGU...,0.797147,55.877617,213.95994,3.269031,6.95299,[7.92954489e-01 6.56017560e-01 1.23806781e-01 ...,...,(((((((((..((((((((....(((((......)))))....)))...,SSSSSSSSSIISSSSSSSSIIIISSSSSHHHHHHSSSSSIIIISSS...,[ 4.88000000e-01 1.36000000e-01 1.69000000e-...,191.271,90,18,6,1,2,3.0


In [7]:
# Inspect the available columns; note the structure column is spelled 'bprna_string'.
OpenVaccineRound6_3.columns

Index(['id', 'title', 'name', 'sequence', 'CAI', 'GC content', 'SUP vienna',
       'SUP first 14 vienna', 'SUP first 30 vienna', 'punp_vec vienna',
       'mean bp prox vienna', 'SUP eternafold', 'SUP first 14 eternafold',
       'SUP first 30 eternafold', 'punp_vec eternafold',
       'mean bp prox eternafold', 'MFE Struct EternaFold', 'bprna_string',
       'degscore_vec', 'degscore', 'MLD', 'n_hairpins', 'n_3WJs', 'n_4WJs',
       'n_5WJs_up', 'hp_3WJ_ratio'],
      dtype='object')

In [8]:
# (rows, columns) of the loaded table.
OpenVaccineRound6_3.shape

(57, 26)

In [9]:
# encode_input reads row['bpRNA_string'], but the CSV column is 'bprna_string';
# expose it under the expected name (re-running this cell is harmless).
OpenVaccineRound6_3['bpRNA_string'] = OpenVaccineRound6_3['bprna_string']

In [10]:
%%time
# Encode each construct on its own (single-row frame) so that predictions stay
# grouped per construct. The row count comes from the frame rather than a
# hard-coded 57, so the cell keeps working if the input file changes.
# NOTE(review): window_size=20 presumably matches the window the loaded model
# was trained with -- confirm against the training code.
encodings_6 = []
for jj in range(len(OpenVaccineRound6_3)):
    encoding, feature_names = encode_input(OpenVaccineRound6_3.iloc[[jj]], window_size=20)
    encodings_6.append(encoding)

CPU times: user 1.68 s, sys: 31.4 ms, total: 1.71 s
Wall time: 1.71 s


In [11]:
%%time
# One list of per-position predictions per construct. Iterating over the
# encodings themselves removes the hard-coded count of 57.
preds_57 = [list(reg.predict(construct_encoding)) for construct_encoding in encodings_6]

CPU times: user 34.8 s, sys: 259 ms, total: 35 s
Wall time: 3.17 s


In [12]:
# Dump the raw per-position predictions: one CSV row per construct, no header.
with open("deg_1day_pH10_57_preds_1.csv", "w", newline="") as out_file:
    csv.writer(out_file).writerows(preds_57)

In [13]:
# Total predicted value per construct (sum over all positions). Iterates over
# the prediction lists directly instead of assuming there are exactly 57.
sum_57 = [sum(construct_preds) for construct_preds in preds_57]

In [14]:
# Number of per-position predictions for the first construct.
len(preds_57[0])

621

In [15]:
# Wide table: one row per construct, one `col_<i>` column per position.
# The column count is taken from the data instead of a hard-coded 621.
# NOTE(review): assumes every construct has as many predictions as the first
# one -- confirm if constructs of differing length are ever scored.
n_positions = len(preds_57[0])
deg_1day_pH10_57_preds_2 = pd.DataFrame(
    data=preds_57,
    columns=[f'col_{i}' for i in range(n_positions)],
)
deg_1day_pH10_57_preds_2.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,...,col_611,col_612,col_613,col_614,col_615,col_616,col_617,col_618,col_619,col_620
0,1.366867,0.467511,0.293193,0.29211,0.238905,0.162171,0.700372,0.258511,0.849207,0.427987,...,0.278565,0.475116,0.02522,0.128702,0.245885,0.207553,0.508007,0.116429,0.692021,0.645087
1,0.9914,0.258527,0.20665,0.212319,0.220645,0.109767,1.002435,1.002825,1.19676,0.32593,...,1.053303,0.506435,0.050303,0.166326,0.241999,0.183829,0.570695,0.545276,0.505022,0.660086
2,0.853476,0.289471,0.308269,0.22871,0.325874,0.216183,1.355433,1.5349,1.096478,0.546163,...,0.982246,0.454354,0.20422,0.265746,0.388661,0.550612,1.244114,0.389868,0.696252,0.579973
3,1.372698,0.215068,0.444799,0.228239,0.089943,0.07474,0.304626,0.216049,0.832947,0.167974,...,0.85471,0.395143,0.039985,0.227161,0.185243,0.185975,0.537369,0.15329,0.48548,0.602552
4,0.653174,0.036279,0.086351,0.144197,0.012249,0.047814,0.931178,1.241221,0.83195,0.483595,...,0.632397,0.976254,0.07224,0.347068,0.147313,0.266927,1.20778,0.957075,0.614932,0.644738


In [16]:
# Attach identifying columns from the source table (aligned on the row index),
# then the per-construct prediction total.
for source_column in ('id', 'sequence', 'bprna_string'):
    deg_1day_pH10_57_preds_2[source_column] = OpenVaccineRound6_3[source_column]
deg_1day_pH10_57_preds_2['BT_sum'] = sum_57
deg_1day_pH10_57_preds_2.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,...,col_615,col_616,col_617,col_618,col_619,col_620,id,sequence,bprna_string,BT_sum
0,1.366867,0.467511,0.293193,0.29211,0.238905,0.162171,0.700372,0.258511,0.849207,0.427987,...,0.245885,0.207553,0.508007,0.116429,0.692021,0.645087,10421216,AUGGCCGUCUACCCCUACGACGUGCCCGACUACGCCGGCUACCCCU...,EESSSSSSSHHHHHHHHHSSSSBSSSEEEEEEESSSSSSSMSSSSS...,238.853288
1,0.9914,0.258527,0.20665,0.212319,0.220645,0.109767,1.002435,1.002825,1.19676,0.32593,...,0.241999,0.183829,0.570695,0.545276,0.505022,0.660086,10421202,AUGGCUGUAUAUCCGUAUGAUGUGCCCGAUUAUGCUGGGUACCCAU...,EESSSSSBSSSSSSSSMMMMMSSSSSSSHHHHHHHSSSSSSSMSSS...,276.105116
2,0.853476,0.289471,0.308269,0.22871,0.325874,0.216183,1.355433,1.5349,1.096478,0.546163,...,0.388661,0.550612,1.244114,0.389868,0.696252,0.579973,10421119,AUGGCUGUUUACCCUUACGAUGUUCCCGACUACGCCGGGUACCCGU...,EESSSSSSSMMMMMMMMMMMMSSSSSSSISIIISISSSSSSSISSS...,236.715743
3,1.372698,0.215068,0.444799,0.228239,0.089943,0.07474,0.304626,0.216049,0.832947,0.167974,...,0.185243,0.185975,0.537369,0.15329,0.48548,0.602552,10421010,AUGGCCGUGUAUCCUUAUGAUGUUCCAGAUUAUGCCGGCUACCCCU...,EEESSSSSSSSBSSSIISSSSSSSSSSSSSSSSISSSSSIIIIIII...,289.297513
4,0.653174,0.036279,0.086351,0.144197,0.012249,0.047814,0.931178,1.241221,0.83195,0.483595,...,0.147313,0.266927,1.20778,0.957075,0.614932,0.644738,10420989,AUGGCCGUUUACCCGUACGAUGUGCCUGACUACGCGGGCUAUCCGU...,SSSSSSSSSIISSSSSSSSIIIISSSSSHHHHHHSSSSSIIIISSS...,270.365182


In [17]:
# Persist the annotated prediction table (without the row index) to the working directory.
deg_1day_pH10_57_preds_2.to_csv('deg_1day_pH10_57_preds_2.csv', index=False)

In [18]:
# Print every per-construct total, one per line; iterating over the list
# itself avoids the hard-coded count of 57.
for construct_total in sum_57:
    print(construct_total)

238.85328751662746
276.1051158129703
236.7157426159829
289.29751271614805
270.3651821296662
275.4253836515127
201.94534282223321
235.18483065231703
215.9517898503691
216.38620715774596
196.5790274363244
206.33846317930147
292.8861091900617
301.24809522740543
297.3690806250088
205.87048443651292
202.05372267775238
205.26905496811378
204.41424323339015
205.90181652875617
209.78569882793818
197.74951936025172
220.40913736307994
212.74511714931577
212.34147571632639
249.53155837580562
203.54864060046384
268.71609279327095
329.65351447835565
212.7993324934505
322.85669470205903
223.1831052149646
224.57764851255342
213.63982614467386
218.87946580071002
209.40636825514957
209.5410630822298
197.10683260968653
322.77259090170264
220.73290779069066
233.67283127922565
227.25478633027524
208.31483600812498
209.78801424836274
207.06692019617185
267.4883631132543
266.5370724289678
207.05884172394872
299.3039131257683
193.87337963434402
251.8742900788784
250.2750458677765
213.07390192477033
203.03669