In [None]:
# Notebook bootstrap: enable module auto-reloading, extend the import path and
# set process-level environment variables. This cell runs before the cell that
# imports rnacomp / fastxtend, so the variables are already in place by then.
%load_ext autoreload
%autoreload 2
import sys
# Make the parent directory importable (presumably where the `rnacomp` package lives — confirm).
sys.path.append('..')
import os
# Expose only GPU index 1 to this process.
os.environ["CUDA_VISIBLE_DEVICES"]="1"
# Set the MKL / NumExpr / OpenMP thread-count variables to 8 each.
os.environ["MKL_NUM_THREADS"] = "8" 
os.environ["NUMEXPR_NUM_THREADS"] = "8" 
os.environ["OMP_NUM_THREADS"] = "8" 

In [None]:
import rnacomp
import rnacomp.models
from rnacomp.fastai_fit import *
from rnacomp.dataset import LenMatchBatchSampler, DeviceDataLoader
from rnacomp.utils import seed_everything, MAE, loss_laplace
import gc
import wandb 
from fastxtend.vision.all import EMACallback


In [3]:
# Shell escape: list the contents of ../data/ (the directory CFG.path points at).
!ls ../data/

Ribonanza_bpp_files	  split
all_test_data.npy	  supplementary_silico_predictions
eterna_openknot_metadata  test_sequences.csv
rcentral.parquet	  train_corrected.parquet
sample_submission.csv	  train_data.csv
sequence_libraries	  train_data.parquet


In [4]:
class CFG:
    """Experiment configuration, kept as plain class attributes.

    Nothing here is instantiated: the notebook reads the values directly
    (``CFG.bs``, ``CFG.model_kwargs``, ...).
    """

    # ---- input locations ----
    path = Path("../data/")
    pathbb = path / "Ribonanza_bpp_files"
    pathss = Path("../eda/train_ss_vienna_rna.parquet")
    split_id = Path("../eda/fold_split.csv")  # CSV merged onto the training frame by `sequence_id`

    # ---- data loading ----
    bs = 64
    num_workers = 4
    device = "cuda"
    seed = 2023
    out = "exp_32_v2"  # experiment directory, handed to the Learner as `path`
    dataset_name = "RNA_DatasetBaselineSplitssbppV6SAVED"  # looked up on rnacomp.dataset
    sn_train = False

    # ---- model ----
    model_name = "RNA_ModelV25"  # looked up on rnacomp.models
    # Keyword arguments forwarded verbatim to the model constructor; the key
    # spelling (including `bpp_transfomer_depth`) is part of that interface.
    model_kwargs = {
        "dim": 384,
        "depth": 4,
        "head_size": 32,
        "drop_pat_dropout": 0.2,
        "dropout": 0.2,
        "bpp_transfomer_depth": 6,
    }

    # ---- optimisation (arguments of fit_one_cycle) ----
    epoch = 64
    lr = 5e-4
    wd = 0.05
    pct_start = 0.02

    wandb = True
    
# Seed the run from CFG.seed via the project's seed_everything helper
# (called before any dataset, sampler or model is created below).
seed_everything(CFG.seed)
# Create the experiment directory up front; it is later passed to the Learner as `path`.
os.makedirs(CFG.out, exist_ok=True)

# wandb.init(
#     # set the wandb project where this run will be logged
#     project="my-awesome-project",
    
#     # track hyperparameters and run metadata
#     config={
#     "learning_rate": 0.02,
#     "architecture": "CNN",
#     "dataset": "CIFAR-100",
#     "epochs": 10,
#     }
# )

def class_to_dict(cls):
    """Collect the plain data attributes of a class into a dictionary.

    Only entries of ``cls.__dict__`` are considered, so attributes inherited
    from base classes are not included. Entries whose name starts with a
    double underscore, and entries whose value is callable, are left out.
    Insertion order follows the order of definition in the class body.
    """
    attributes = {}
    for name, value in cls.__dict__.items():
        # Skip dunder machinery (``__module__``, ``__doc__``, ...) and callables.
        if name.startswith("__") or callable(value):
            continue
        attributes[name] = value
    return attributes

In [5]:



# ---- Frames: attach the fold assignment, then split on `is_train` ----
split = pd.read_csv(CFG.split_id)
df = pd.read_parquet(CFG.path/'train_corrected.parquet')
df = df.merge(split, on='sequence_id')
# Optional SN_filter restriction, currently disabled:
#df = df.query("SN_filter==1").reset_index(drop=True)
df_train = df.query('is_train==True').reset_index(drop=True)
df_valid = df.query('is_train==False').reset_index(drop=True)

# Dataset class selected by name from rnacomp.dataset; used for every dataset below.
dataset_cls = getattr(rnacomp.dataset, CFG.dataset_name)

# ---- Training loader ----
# The `mask_only=True` twin of the dataset is what the batch sampler iterates;
# the full dataset is what the DataLoader actually reads batches from.
ds_train = dataset_cls(df_train, mode='train', sn_train=CFG.sn_train)
ds_train_len = dataset_cls(df_train, mode='train', mask_only=True, sn_train=CFG.sn_train)
sampler_train = torch.utils.data.RandomSampler(ds_train_len)
len_sampler_train = LenMatchBatchSampler(sampler_train, batch_size=CFG.bs, drop_last=True)
loader_train = torch.utils.data.DataLoader(
    ds_train,
    batch_sampler=len_sampler_train,
    num_workers=CFG.num_workers,
    persistent_workers=True,
)
dl_train = DeviceDataLoader(loader_train, CFG.device)

# ---- Validation loader: sequential order, last partial batch kept ----
ds_val = dataset_cls(df_valid, mode='eval')
ds_val_len = dataset_cls(df_valid, mode='eval', mask_only=True)
sampler_val = torch.utils.data.SequentialSampler(ds_val_len)
len_sampler_val = LenMatchBatchSampler(sampler_val, batch_size=CFG.bs, drop_last=False)
loader_val = torch.utils.data.DataLoader(
    ds_val,
    batch_sampler=len_sampler_val,
    num_workers=CFG.num_workers,
)
dl_val = DeviceDataLoader(loader_val, CFG.device)

data = DataLoaders(dl_train, dl_val)

# The merged frame and the split table are no longer needed; only
# df_train / df_valid are kept. Release them and collect.
del split, df
gc.collect()



62

In [None]:
# The previous attempt at this run was interrupted at epoch 3, so training is
# restarted from scratch here, but with the weights saved at epoch 3 loaded
# into the freshly built model first.
train_model = getattr(rnacomp.models, CFG.model_name)(**CFG.model_kwargs).cuda()
train_cbs = [
    GradientClip(3.0),
    #WandbCallback(log_preds=False),
    CSVLogger(),
    EMACallback(),
    # Monitors the `mae` metric (lower is better) and also saves when fitting ends.
    SaveModelCallback(monitor='mae', comp=np.less, at_end=True),
]
learn = Learner(
    data,
    train_model,
    path=CFG.out,
    loss_func=loss_laplace,
    cbs=train_cbs,
    metrics=[MAE()],
)
learn = learn.to_fp16()

# Warm start from the 'model' checkpoint stored under CFG.out.
learn.load('model')
learn.fit_one_cycle(CFG.epoch, lr_max=CFG.lr, wd=CFG.wd, pct_start=CFG.pct_start)
wandb.finish()

  elif with_opt: warn("Saved file doesn't contain an optimizer state.")


epoch,train_loss,valid_loss,mae,time
0,0.114404,0.122266,0.133318,55:00
1,0.113798,0.120959,0.131923,55:00
2,0.112666,0.119966,0.130842,55:04
3,0.113025,0.119408,0.130245,55:07
4,0.111541,0.118981,0.129791,55:04
5,0.112813,0.118522,0.129292,55:03
6,0.11284,0.118179,0.128923,55:06
7,0.111589,0.117925,0.128647,55:05
8,0.111128,0.117707,0.128419,54:57
9,0.109994,0.117503,0.128205,54:55


Better model found at epoch 0 with mae value: 0.1333177648684585.
Better model found at epoch 1 with mae value: 0.13192295587250602.
Better model found at epoch 2 with mae value: 0.13084185863430356.
Better model found at epoch 3 with mae value: 0.13024502104270502.
Better model found at epoch 4 with mae value: 0.12979116642492394.
Better model found at epoch 5 with mae value: 0.12929176185429211.
Better model found at epoch 6 with mae value: 0.12892279594354908.
Better model found at epoch 7 with mae value: 0.12864714859381451.
Better model found at epoch 8 with mae value: 0.12841936640800192.
Better model found at epoch 9 with mae value: 0.12820456293304744.
Better model found at epoch 10 with mae value: 0.12808339726036602.
Better model found at epoch 11 with mae value: 0.12790885784860706.
Better model found at epoch 12 with mae value: 0.12774237123595134.
Better model found at epoch 13 with mae value: 0.12748308165689326.
Better model found at epoch 14 with mae value: 0.1274048801

In [None]:
# Per-length validation: score the saved 'model' checkpoint separately on each
# distinct value of the `L` column of df_valid and tabulate the MAE per value.
res = dict()
for seq_len in df_valid["L"].unique():
    # Rows of this length only; evaluated once and copied for each dataset so
    # the two dataset objects never share a frame.
    df_len = df_valid.query("L==@seq_len")
    ds_val = getattr(rnacomp.dataset, CFG.dataset_name)(df_len.copy(), mode='eval')
    ds_val_len = getattr(rnacomp.dataset, CFG.dataset_name)(df_len.copy(), mode='eval', mask_only=True)
    sampler_val = torch.utils.data.SequentialSampler(ds_val_len)
    len_sampler_val = LenMatchBatchSampler(sampler_val, batch_size=CFG.bs,
                   drop_last=False)
    dl_val= DeviceDataLoader(torch.utils.data.DataLoader(ds_val,
                   batch_sampler=len_sampler_val, num_workers=CFG.num_workers), CFG.device)

    # Only the validation slot is used by validate(); the same loader fills both.
    data = DataLoaders(dl_val,dl_val)
    learn = Learner(data,
                    # Build the network from CFG.model_kwargs, exactly as the training
                    # cell does. CFG defines no `dim` / `graph_layers` attributes, so
                    # reading them raised AttributeError, and the checkpoint loaded
                    # below was produced by a model built from these kwargs.
                    getattr(rnacomp.models, CFG.model_name)(**CFG.model_kwargs).cuda(),
                    path = CFG.out,
                    loss_func=loss_laplace,
                    metrics=[MAE()]).to_fp16()
    learn.load('model')
    learn.eval()
    # validate() yields the loss followed by the metrics; MAE is the only metric.
    loss_, score_ = learn.validate()
    res[seq_len] = score_

# One row per length: columns `L` and `mae`.
res = pd.DataFrame(pd.Series(res)).reset_index()
res.columns = ['L', 'mae']
# Displayed sorted by length; `res` itself keeps its original row order.
res.sort_values(by="L")

In [None]:
# Display the per-length MAE table in its stored (unsorted) row order.
res

In [None]:
# NOTE(review): repeats the previous cell's display of `res`; likely a leftover cell.
res