In [1]:
import pandas as pd
import numpy as np
import sys
from xgboost import XGBRegressor
import csv
from sklearn.metrics import r2_score

In [2]:
def encode_input(df, window_size=1, pad=0, no_onehot=False, seq=True, struct=True):
    '''Create windowed one-hot input features for a regression model predicting structure probing data.

    Every nucleotide of every row in `df` yields one sample: the one-hot
    encodings of the positions in a window around it, flattened
    position-by-position. Window positions that fall outside the sequence
    are encoded as all zeros.

    Inputs:

    dataframe (in EternaBench RDAT format): one construct per row. The
        `sequence` column is always read (it defines the row length); the
        `bpRNA_string` column is read when struct=True.
    window_size: size of window (in one direction). so window_size=1 is a total window size of 3
    pad: offset subtracted from each window's centre index before slicing
        (originally described as "number of nucleotides at start to not include").
        NOTE(review): with a non-zero pad some slices start before or run past
        the zero-padded array, so those windows wrap around or come out
        truncated -- confirm the intended semantics before using pad != 0.
    no_onehot: unused by this function; kept so existing callers keep working
    seq (bool): include sequence encoding (A, U, G, C)
    struct (bool): include bpRNA structure encoding (H, E, I, M, B, S, X)

    Outputs:
    Input array (n_samples x n_features): array of windowed input features;
        has shape (0, n_features) when `df` contributes no nucleotides
    feature_names (list, length = kernel x (2*window_size + 1)): feature names, i.e. `S_-12`

    '''
    seq_alphabet = ['A', 'U', 'G', 'C']
    struct_alphabet = ['H', 'E', 'I', 'M', 'B', 'S', 'X']

    feature_kernel = []
    if seq:
        feature_kernel.extend(seq_alphabet)
    if struct:
        feature_kernel.extend(struct_alphabet)
    n_kernel = len(feature_kernel)

    # Window offset is the outer loop so the names line up with the flattened
    # (window position, kernel) layout of each sample.
    feature_names = ['%s_%d' % (k, val) for val in range(-1*window_size, window_size+1) for k in feature_kernel]

    # Character -> column lookups; structure columns follow the sequence
    # columns when both encodings are enabled.
    seq_column = {char: col for col, char in enumerate(seq_alphabet)}
    struct_start = len(seq_alphabet) if seq else 0
    struct_column = {char: struct_start + col for col, char in enumerate(struct_alphabet)}

    inpts = []

    for _, row in df.iterrows():
        # Pull the strings out of the row once instead of once per character.
        sequence = row['sequence']
        length = len(sequence)
        arr = np.zeros([length, n_kernel])

        if seq:
            for index in range(length):
                col = seq_column.get(sequence[index])
                # Characters outside the alphabet leave the row all-zero.
                if col is not None:
                    arr[index, col] += 1

        if struct and length:
            structure = row['bpRNA_string']
            for index in range(length):
                col = struct_column.get(structure[index])
                if col is not None:
                    arr[index, col] += 1

        # add zero padding to the side
        zero_block = np.zeros([window_size, n_kernel])
        padded_arr = np.vstack([zero_block, arr, zero_block])

        for index in range(length):
            new_index = index + window_size - pad
            tmp = padded_arr[new_index-window_size:new_index+window_size+1]
            inpts.append(tmp.flatten())

    if not inpts:
        # np.array([]) would be 1-D; keep the documented 2-D shape for empty input.
        return np.zeros([0, len(feature_names)]), feature_names

    return np.array(inpts), feature_names

In [3]:
# Regressor shell that the saved model is loaded into in the next cell.
# NOTE(review): presumably these mirror the training hyperparameters -- confirm.
reg = XGBRegressor(
    n_estimators=8200,
    tree_method='hist',
    learning_rate=0.005,
    max_depth=7,
    subsample=0.8,
    colsample_bytree=0.9,
    reg_alpha=0.005,
)

In [4]:
# Load the saved model from disk into `reg` (path is relative to this notebook).
reg.load_model('../../model_files/bt_xgb/bt_xgb.model')

In [5]:
# Read the Round 6.3 design table (path is relative to this notebook) and preview it.
OpenVaccineRound6_3 = pd.read_csv('../../data/Round6_3/nov16_last57_CDS.csv')
OpenVaccineRound6_3.head()

Unnamed: 0,id,title,name,sequence,CAI,GC content,SUP vienna,SUP first 14 vienna,SUP first 30 vienna,punp_vec vienna,...,MFE Struct EternaFold,bprna_string,degscore_vec,degscore,MLD,n_hairpins,n_3WJs,n_4WJs,n_5WJs_up,hp_3WJ_ratio
0,10421216,Mod of wateronthemoon's OV:nanofl 6,Merida,AUGGCCGUCUACCCCUACGACGUGCCCGACUACGCCGGCUACCCCU...,0.890295,59.742351,195.361171,5.711359,16.950537,[9.98375821e-01 6.18258391e-01 2.89686821e-03 ...,...,..(((((((.........)))).))).......(((((((.(((((...,EESSSSSSSHHHHHHHHHSSSSBSSSEEEEEEESSSSSSSMSSSSS...,[ 0.747 0.393 0.36 0.242 -0.1 0.11 0....,177.064,78,20,5,1,3,3.999999
1,10421202,phoning it in 5,cynwulf28,AUGGCUGUAUAUCCGUAUGAUGUGCCCGAUUAUGCUGGGUACCCAU...,0.754168,51.529791,191.430001,3.47641,10.565406,[9.87866639e-01 9.84536094e-01 3.36349718e-03 ...,...,..(((((.((((((((.....(((((((.......))))))).(((...,EESSSSSBSSSSSSSSMMMMMSSSSSSSHHHHHHHSSSSSSSMSSS...,[ 5.96e-01 3.17e-01 3.28e-01 1.21e-01 1.50...,188.019,118,14,8,2,0,1.75
2,10421119,Mod of DigitalEmbrace's Curevac 4 Mod 2-Kaggl...,Merida,AUGGCUGUUUACCCUUACGAUGUUCCCGACUACGCCGGGUACCCGU...,0.786049,59.742351,179.628614,7.173642,15.45094,[9.74989208e-01 9.99191615e-01 5.90287475e-03 ...,...,..(((((((............(((((((.(...(.(((((((.(((...,EESSSSSSSMMMMMMMMMMMMSSSSSSSISIIISISSSSSSSISSS...,[ 0.6 0.324 0.326 0.066 0.106 0.075 0....,170.49,96,17,6,1,2,2.833333
3,10421010,phoning it in 4,cynwulf28,AUGGCCGUGUAUCCUUAUGAUGUUCCAGAUUAUGCCGGCUACCCCU...,0.778542,49.758454,218.053576,5.990581,8.63337,[9.73084676e-01 9.34549226e-01 9.55890625e-01 ...,...,...((((((((.(((..((((((((((((((((.(((((..........,EEESSSSSSSSBSSSIISSSSSSSSSSSSSSSSISSSSSIIIIIII...,[ 7.24000000e-01 3.49000000e-01 4.86000000e-...,206.416,61,16,4,1,2,3.999999
4,10420989,phoning it in 3,cynwulf28,AUGGCCGUUUACCCGUACGAUGUGCCUGACUACGCGGGCUAUCCGU...,0.797147,55.877617,213.95994,3.269031,6.95299,[7.92954489e-01 6.56017560e-01 1.23806781e-01 ...,...,(((((((((..((((((((....(((((......)))))....)))...,SSSSSSSSSIISSSSSSSSIIIISSSSSHHHHHHSSSSSIIIISSS...,[ 4.88000000e-01 1.36000000e-01 1.69000000e-...,191.271,90,18,6,1,2,3.0


In [6]:
# List the available columns; note the structure column is spelled 'bprna_string' here.
OpenVaccineRound6_3.columns

Index(['id', 'title', 'name', 'sequence', 'CAI', 'GC content', 'SUP vienna',
       'SUP first 14 vienna', 'SUP first 30 vienna', 'punp_vec vienna',
       'mean bp prox vienna', 'SUP eternafold', 'SUP first 14 eternafold',
       'SUP first 30 eternafold', 'punp_vec eternafold',
       'mean bp prox eternafold', 'MFE Struct EternaFold', 'bprna_string',
       'degscore_vec', 'degscore', 'MLD', 'n_hairpins', 'n_3WJs', 'n_4WJs',
       'n_5WJs_up', 'hp_3WJ_ratio'],
      dtype='object')

In [7]:
# (rows, columns) of the loaded table.
OpenVaccineRound6_3.shape

(57, 26)

In [8]:
# encode_input reads the structure from 'bpRNA_string', but this CSV names the
# column 'bprna_string'; copy it under the expected name.
OpenVaccineRound6_3['bpRNA_string'] = OpenVaccineRound6_3['bprna_string']

In [9]:
%%time
# Encode one row at a time: encode_input stacks the windows of every row it is
# given, so per-row calls keep one feature matrix per construct.
encodings_6 = []
for jj in range(len(OpenVaccineRound6_3)):
    encoding, feature_names = encode_input(OpenVaccineRound6_3.iloc[[jj]], window_size=20)
    encodings_6.append(encoding)

CPU times: user 5.1 s, sys: 27.5 ms, total: 5.13 s
Wall time: 5.13 s


In [10]:
%%time
# One list of per-nucleotide predictions for each encoded construct.
preds_57 = []
for encoding in encodings_6:
    preds_57.append(list(reg.predict(encoding)))

CPU times: user 1min 9s, sys: 2.63 s, total: 1min 11s
Wall time: 1.9 s


In [11]:
# Dump the raw predictions: one CSV line per construct, one value per nucleotide.
with open("deg_1day_pH10_57_preds_1.csv", "w", newline="") as out_file:
    csv.writer(out_file).writerows(preds_57)

In [12]:
# Total predicted value per construct (sum over all of its nucleotides).
sum_57 = [sum(construct_preds) for construct_preds in preds_57]

In [14]:
# Number of per-nucleotide predictions for the first construct.
len(preds_57[0])

621

In [15]:
# One column per nucleotide position; the width comes from the predictions
# themselves instead of a hard-coded sequence length.
n_positions = max((len(construct_preds) for construct_preds in preds_57), default=0)
deg_1day_pH10_57_preds_2 = pd.DataFrame(data=preds_57, columns=[f'col_{i}' for i in range(n_positions)])
deg_1day_pH10_57_preds_2.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,...,col_611,col_612,col_613,col_614,col_615,col_616,col_617,col_618,col_619,col_620
0,1.366874,0.467512,0.293191,0.29211,0.238905,0.162171,0.700368,0.258512,0.849204,0.427986,...,0.278565,0.475118,0.02522,0.128702,0.245885,0.207553,0.508007,0.116428,0.692019,0.645085
1,0.991398,0.258527,0.206651,0.212318,0.220644,0.109766,1.002434,1.002821,1.196758,0.32593,...,1.053305,0.506435,0.050303,0.166325,0.242,0.18383,0.570695,0.545275,0.505022,0.660084
2,0.853473,0.28947,0.308268,0.228709,0.325875,0.216182,1.355434,1.5349,1.096475,0.546166,...,0.982242,0.454353,0.20422,0.265745,0.388661,0.550611,1.244115,0.389869,0.69625,0.57997
3,1.372702,0.21507,0.444798,0.228239,0.089942,0.07474,0.304627,0.21605,0.832948,0.167975,...,0.854706,0.395143,0.039986,0.22716,0.185243,0.185975,0.537367,0.15329,0.48548,0.602548
4,0.653171,0.03628,0.086352,0.144198,0.012249,0.047815,0.931173,1.241223,0.831947,0.483596,...,0.632396,0.976251,0.072241,0.347068,0.147313,0.266926,1.207785,0.957072,0.614928,0.644736


In [16]:
# Attach the identifying columns from the source table, then the per-construct total.
for column in ('id', 'sequence', 'bprna_string'):
    deg_1day_pH10_57_preds_2[column] = OpenVaccineRound6_3[column]
deg_1day_pH10_57_preds_2['BT_sum'] = sum_57
deg_1day_pH10_57_preds_2.head()

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,...,col_615,col_616,col_617,col_618,col_619,col_620,id,sequence,bprna_string,BT_sum
0,1.366874,0.467512,0.293191,0.29211,0.238905,0.162171,0.700368,0.258512,0.849204,0.427986,...,0.245885,0.207553,0.508007,0.116428,0.692019,0.645085,10421216,AUGGCCGUCUACCCCUACGACGUGCCCGACUACGCCGGCUACCCCU...,EESSSSSSSHHHHHHHHHSSSSBSSSEEEEEEESSSSSSSMSSSSS...,238.853224
1,0.991398,0.258527,0.206651,0.212318,0.220644,0.109766,1.002434,1.002821,1.196758,0.32593,...,0.242,0.18383,0.570695,0.545275,0.505022,0.660084,10421202,AUGGCUGUAUAUCCGUAUGAUGUGCCCGAUUAUGCUGGGUACCCAU...,EESSSSSBSSSSSSSSMMMMMSSSSSSSHHHHHHHSSSSSSSMSSS...,276.105022
2,0.853473,0.28947,0.308268,0.228709,0.325875,0.216182,1.355434,1.5349,1.096475,0.546166,...,0.388661,0.550611,1.244115,0.389869,0.69625,0.57997,10421119,AUGGCUGUUUACCCUUACGAUGUUCCCGACUACGCCGGGUACCCGU...,EESSSSSSSMMMMMMMMMMMMSSSSSSSISIIISISSSSSSSISSS...,236.715657
3,1.372702,0.21507,0.444798,0.228239,0.089942,0.07474,0.304627,0.21605,0.832948,0.167975,...,0.185243,0.185975,0.537367,0.15329,0.48548,0.602548,10421010,AUGGCCGUGUAUCCUUAUGAUGUUCCAGAUUAUGCCGGCUACCCCU...,EEESSSSSSSSBSSSIISSSSSSSSSSSSSSSSISSSSSIIIIIII...,289.297384
4,0.653171,0.03628,0.086352,0.144198,0.012249,0.047815,0.931173,1.241223,0.831947,0.483596,...,0.147313,0.266926,1.207785,0.957072,0.614928,0.644736,10420989,AUGGCCGUUUACCCGUACGAUGUGCCUGACUACGCGGGCUAUCCGU...,SSSSSSSSSIISSSSSSSSIIIISSSSSHHHHHHSSSSSIIIISSS...,270.365078


In [17]:
# Save predictions plus metadata; index=False keeps the row index out of the file.
deg_1day_pH10_57_preds_2.to_csv('deg_1day_pH10_57_preds_2.csv', index=False)

In [None]:
# Print each construct's summed prediction, one per line.
for total in sum_57:
    print(total)