In [None]:
# Reload imported modules automatically before each cell runs, so edits to
# project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
import sys
# Put the parent of the working directory on the import path
# (presumably where the `rnacomp` package lives — confirm repo layout).
sys.path.append('..')
import os
# Restrict this process to GPU 0. Set here, before the later cells import the
# project code, so the restriction is in place when CUDA is first used.
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [None]:
import rnacomp
import rnacomp.models
from rnacomp.fastai_fit import *
from rnacomp.dataset import LenMatchBatchSampler, DeviceDataLoader
from rnacomp.utils import seed_everything, MAE, loss_laplace
import gc
from tqdm import tqdm_notebook as tqdm

In [None]:
class CFG_test:
    """Inference-time settings used to build the test dataset and its loader."""
    # Torch device string handed to the loader wrapper and to each model.
    device = 'cuda'
    # Name of the dataset class looked up on `rnacomp.dataset`.
    dataset_name = 'RNA_Dataset_Test'
    # DataLoader batch size and number of worker processes.
    bs, num_workers = 512, 16
    
    

In [None]:
# Read the test sequences and wrap them in the dataset class named in CFG_test.
df_test = pd.read_csv(os.path.join('../data/', 'test_sequences.csv'))
dataset_cls = getattr(rnacomp.dataset, CFG_test.dataset_name)
ds = dataset_cls(df_test)

# Sequential, non-shuffled loader that keeps the last partial batch;
# it is wrapped in DeviceDataLoader together with the target device.
loader = torch.utils.data.DataLoader(
    ds,
    batch_size=CFG_test.bs,
    shuffle=False,
    drop_last=False,
    num_workers=CFG_test.num_workers,
)
dl = DeviceDataLoader(loader, CFG_test.device)

# Drop this cell's reference to the raw frame and trigger a collection pass.
del df_test
gc.collect()



In [None]:
def get_model_from_cfg(CFG):
    """Instantiate the model named by a config and load its saved weights.

    Args:
        CFG: config object exposing `model_name` (name of a class in
            `rnacomp.models`, constructed here with no arguments) and `out`
            (experiment directory name under `../exp/`).

    Returns:
        The model after `load_state_dict` with the checkpoint found at
        `../exp/{CFG.out}/models/model.pth`. Moving it to a device and
        calling `eval()` is left to the caller.
    """
    md = getattr(rnacomp.models, CFG.model_name)()
    mw = f'../exp/{CFG.out}/models/model.pth'
    print(f"loading : {mw}")
    # Deserialize onto the CPU no matter which device the checkpoint was saved
    # from. Without map_location, tensors saved from e.g. cuda:1 cannot be
    # restored when only one GPU is visible to this process. load_state_dict
    # copies the values into the model's own parameters afterwards.
    state_dict = torch.load(mw, map_location='cpu')
    md.load_state_dict(state_dict)
    return md

def generate_sub(config_list, out_name):
    """Run ensemble inference over the test loader and write a submission file.

    Each config is turned into a model with `get_model_from_cfg`, moved to
    `CFG_test.device` and put in eval mode. For every batch of the
    module-level loader `dl`, the models' NaN-cleaned outputs are averaged,
    clipped to [0, 1], and only the positions selected by the batch mask are
    kept.

    Args:
        config_list: iterable of config objects accepted by
            `get_model_from_cfg`. Must contain at least one config.
        out_name: output path without extension; `.parquet` is appended.

    Returns:
        The first rows (`DataFrame.head()`) of the submission that was
        written, as a quick preview. The full frame is not returned so that a
        notebook output cache does not keep it alive.

    Raises:
        ValueError: if `config_list` is empty.
    """
    # Fail before touching the loader: torch.stack on an empty list would
    # otherwise raise an opaque error from inside the inference loop.
    config_list = list(config_list)
    if not config_list:
        raise ValueError("config_list must contain at least one config")

    models = []
    for m in config_list:
        model = get_model_from_cfg(m)
        model = model.to(CFG_test.device)
        model.eval()
        models.append(model)

    ids, preds = [], []
    for x, y in tqdm(dl):
        with torch.no_grad(), torch.cuda.amp.autocast():
            # Stack per-model outputs on a new leading axis and average it.
            p = torch.stack([torch.nan_to_num(model(x)) for model in models],
                            0).mean(0).clip(0, 1)

        for idx, mask, pi in zip(y['ids'].cpu(), x['mask'].cpu(), p.cpu()):
            ids.append(idx[mask])
            # The mask is cut to the prediction's length before indexing;
            # presumably the model output can be shorter than the padded
            # mask — confirm against the model's forward().
            preds.append(pi[mask[:pi.shape[0]]])

    ids = torch.concat(ids)
    preds = torch.concat(preds)

    # Last-axis index 0 is written as 2A3 and index 1 as DMS.
    df = pd.DataFrame({'id': ids.numpy(),
                       'reactivity_DMS_MaP': preds[:, 1].numpy(),
                       'reactivity_2A3_MaP': preds[:, 0].numpy()})
    # Store both reactivity columns as float32 regardless of the dtype the
    # autocast region produced.
    df["reactivity_DMS_MaP"] = df["reactivity_DMS_MaP"].astype("float32")
    df["reactivity_2A3_MaP"] = df["reactivity_2A3_MaP"].astype("float32")
    df.to_parquet(f'{out_name}.parquet', index=False)  # 6.5GB
    # The original ended with a bare `df.head()`, which is discarded inside a
    # function; return it so the preview is actually available to the caller.
    return df.head()

In [None]:
class CFG:
    """Configuration of the `exp_04_ft` experiment used for this submission.

    The inference helpers in this notebook read only `model_name` and `out`
    from a config. The remaining fields look like the training-time settings
    of the run (assumption — confirm against the training script).
    """
    # Run identity: experiment directory name, model class and dataset class.
    out = 'exp_04_ft'
    model_name = 'RNA_ModelV2'
    dataset_name = 'RNA_DatasetBaselineSplit'
    seed = 2023

    # Filesystem locations.
    path = Path("../data/")
    pathbb = Path("../data/Ribonanza_bpp_files")
    split_id = Path('../eda/fold_split.csv')

    # Loader and device settings.
    bs = 64
    num_workers = 12
    device = 'cuda'

    # Model size settings.
    dim = 192
    depth = 12
    dim_head = 32

    # Schedule / optimiser settings.
    epoch = 64
    lr = 5e-4
    wd = 0.05
    pct_start = 0.02
    
# Name the submission file after the experiment and generate it from this
# single config, then print the Kaggle CLI command that would upload it.
out_name = CFG.out
generate_sub(config_list=[CFG], out_name=out_name)
print(
    "kaggle competitions submit stanford-ribonanza-rna-folding "
    f"-f {out_name}.parquet -m '{out_name}'"
)

In [None]:
# Average the exp_03_ft and exp_04_ft submissions row-by-row (aligned on `id`)
# and write the blend. Kept as one chained expression so no multi-GB
# intermediate frame stays bound to a notebook variable.
(
    pd.read_parquet("exp_03_ft.parquet").set_index('id')
    .add(pd.read_parquet("exp_04_ft.parquet").set_index('id'))
    .div(2)
    .to_parquet('cv3v4ft.parquet')
)