In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append('..')
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [2]:
import rnacomp
import rnacomp.models
from rnacomp.fastai_fit import *
from rnacomp.dataset import LenMatchBatchSampler, DeviceDataLoader
from rnacomp.utils import seed_everything, MAE, loss_laplace
import gc
from tqdm import tqdm_notebook as tqdm



In [3]:
class CFG_test:
    """Inference-time settings read by the test-data loading and submission cells."""
    # Name of the dataset class that is looked up on `rnacomp.dataset`.
    # Earlier variants noted here: RNA_Dataset_TestBppSSFullV1,
    # RNA_Dataset_TestBppSSFullV0, RNA_Dataset_TestBppSS.
    dataset_name = 'RNA_Dataset_TestBppSSFullV2'
    # Directory searched recursively for `*.txt` files; each file stem is used
    # as the `sequence_id` join key.
    pathbb = Path("../data/Ribonanza_bpp_files")
    # Parquet file from which `sequence_id`, `ss_roi` and `ss_full` are read.
    pathss = Path("../eda/test_ss_vienna_rna.parquet")
    # Batch size and worker count passed to the torch DataLoader.
    bs = 256
    num_workers = 16
    # Device handed to DeviceDataLoader and used for `model.to(...)`.
    device = 'cuda'
    
def plot_test_example(fn, id1=269545321, id2=269724007, shape=(391, 457),
                      out_path="plot.png", font_size=6):
    """Render a slice of a submission as two grayscale heatmaps and save them.

    Reads ``{fn}.parquet``, takes rows ``id1`` through ``id2`` (inclusive),
    reshapes each reactivity column to ``shape`` and writes a side-by-side
    figure to ``out_path``.

    Args:
        fn: Submission file path without the ``.parquet`` suffix.
        id1: First row of the slice.
        id2: Last row of the slice (inclusive).
        shape: ``(rows, cols)`` of the heatmap; ``rows * cols`` must equal
            ``id2 - id1 + 1``.
        out_path: Where the PNG is written. The default keeps the original
            behaviour of overwriting ``plot.png`` on every call.
        font_size: Font size of the subplot titles.

    Raises:
        ValueError: If ``shape`` does not match the number of selected rows.

    NOTE(review): rows are selected by position, which presumably relies on
    the submission being stored in ``id`` order starting at 0 - confirm.
    """
    n_rows, n_cols = shape
    n_selected = id2 - id1 + 1
    # Fail before touching the (multi-GB) parquet file if the request is inconsistent.
    if n_rows * n_cols != n_selected:
        raise ValueError(
            f"shape {shape} holds {n_rows * n_cols} values but rows "
            f"{id1}..{id2} select {n_selected}"
        )

    # Imported lazily so the plotting stack is only required when this is called.
    import polars as pl
    import matplotlib.pyplot as plt

    # Read the submission and keep only the rows that are plotted.
    df = pl.read_parquet(f"{fn}.parquet")
    window = df[id1:id2 + 1]

    fig, axes = plt.subplots(1, 2)
    try:
        for ax, column in zip(axes, ("reactivity_DMS_MaP", "reactivity_2A3_MaP")):
            image = window[column].to_numpy().reshape(n_rows, n_cols)
            ax.set_title(column, fontsize=font_size)
            ax.imshow(image, vmin=0, vmax=1, cmap="gray_r")
        fig.tight_layout()
        fig.savefig(out_path, dpi=500)
    finally:
        # Always release the figure, even if rendering or saving fails.
        plt.close(fig)


In [4]:
# Read the test sequences, then attach each sequence's bpp file path and its
# secondary-structure columns before building the inference loader.
df_test = pd.read_csv(os.path.join('../data/','test_sequences.csv'))
print(df_test.shape)

# One row per bpp text file; the file stem serves as the join key.
fns = list(CFG_test.pathbb.rglob("*.txt"))
bpp_df = pd.DataFrame({"bpp": fns})
bpp_df['sequence_id'] = bpp_df['bpp'].map(lambda path: path.stem)
print(df_test.shape)
bpp_df = bpp_df.drop_duplicates(subset=['sequence_id'])

ss = pd.read_parquet(CFG_test.pathss)[["sequence_id", "ss_roi", "ss_full"]]
print(df_test.shape)

# Default (inner) merges on sequence_id; the shape is printed after each one.
for extra in (bpp_df, ss):
    df_test = df_test.merge(extra, on='sequence_id')
    print(df_test.shape)

# The dataset class is selected by name from the config.
dataset_cls = getattr(rnacomp.dataset, CFG_test.dataset_name)
ds = dataset_cls(df_test)
loader = torch.utils.data.DataLoader(
    ds,
    batch_size=CFG_test.bs,
    shuffle=False,
    drop_last=False,
    num_workers=CFG_test.num_workers,
)
dl = DeviceDataLoader(loader, CFG_test.device)

# Drop this cell's reference to the merged frame and collect garbage.
del df_test
gc.collect()



(1343823, 5)
(1343823, 5)
(1343823, 5)
(1343823, 6)
(1343823, 8)


0

In [5]:
def get_model_from_cfg(CFG):
    """Build the model described by ``CFG`` and load its trained weights.

    The model class is looked up by ``CFG.model_name`` on ``rnacomp.models``.
    If ``CFG.dim`` is set and truthy the class is called with ``dim=CFG.dim``
    only; otherwise it is called with ``**CFG.model_kwargs``. Weights are read
    from ``../exp/{CFG.out}/models/model.pth`` as a state dict.

    Args:
        CFG: Config object exposing ``model_name``, ``out`` and either a
            truthy ``dim`` or a ``model_kwargs`` dict.

    Returns:
        The instantiated model with weights loaded. It is not moved to any
        device here; the caller does that.
    """
    model_cls = getattr(rnacomp.models, CFG.model_name)
    # A config without a `dim` attribute is treated like `dim = False`.
    if getattr(CFG, 'dim', None):
        print('dim')
        md = model_cls(dim=CFG.dim)
    else:
        print('kwargs')
        md = model_cls(**CFG.model_kwargs)
    mw = f'../exp/{CFG.out}/models/model.pth'
    print(f"loading : {mw}")
    # Load onto the CPU so the checkpoint does not depend on the device it was
    # saved from and no second copy of the weights is placed on the GPU.
    state_dict = torch.load(mw, map_location='cpu')
    md.load_state_dict(state_dict)
    return md

def generate_sub(config_list, out_name, clip=True):
    """Run the ensemble over the test loader and write a submission parquet.

    Every config in ``config_list`` is turned into a model; per batch the
    NaN-sanitised model outputs are averaged. Only positions selected by the
    batch mask are kept. Output channel 1 is written as ``reactivity_DMS_MaP``
    and channel 0 as ``reactivity_2A3_MaP``.

    Args:
        config_list: Configs accepted by ``get_model_from_cfg``; at least one.
        out_name: Output path without the ``.parquet`` suffix.
        clip: If True the averaged predictions are clipped to ``[0, 1]``.
            If False they are left as-is and ``_no_clip`` is appended to
            ``out_name``.

    Raises:
        ValueError: If ``config_list`` yields no models.

    Reads the module-level ``dl`` loader and ``CFG_test.device``.
    """
    models = []
    for cfg in config_list:
        model = get_model_from_cfg(cfg).to(CFG_test.device)
        model.eval()
        models.append(model)
    if not models:
        # Otherwise the failure would only surface inside torch.stack on the first batch.
        raise ValueError("config_list must contain at least one model config")

    if not clip:
        out_name = out_name + '_no_clip'
        print(f'generating sub with no cliping: {out_name}')

    ids, preds = [], []
    for x, y in tqdm(dl):
        with torch.no_grad(), torch.cuda.amp.autocast():
            # Ensemble by plain averaging; NaNs are zeroed per model first.
            p = torch.stack([torch.nan_to_num(model(x)) for model in models], 0).mean(0)
            if clip:
                p = p.clip(0, 1)

        # Keep only the masked positions of every sequence in the batch.
        for idx, mask, pi in zip(y['ids'].cpu(), x['mask'].cpu(), p.cpu()):
            ids.append(idx[mask])
            # The mask is truncated to the prediction length - presumably the
            # model can return fewer positions than the mask covers; confirm.
            preds.append(pi[mask[:pi.shape[0]]])

    ids = torch.concat(ids)
    # Cast once here so both output columns are float32.
    preds = torch.concat(preds).float()

    df = pd.DataFrame({
        'id': ids.numpy(),
        'reactivity_DMS_MaP': preds[:, 1].numpy(),
        'reactivity_2A3_MaP': preds[:, 0].numpy(),
    })

    # The original note puts this file at roughly 6.5GB.
    df.to_parquet(f'{out_name}.parquet', index=False)

In [6]:
# # use this to generate submission
# class CFG:
#     path = Path("../data/")
#     pathbb = Path("../data/Ribonanza_bpp_files")
#     pathss = Path("../eda/train_ss_vienna_rna.parquet")
#     split_id = Path('../eda/fold_split.csv')
#     bs = 64
#     num_workers = 12
#     device = 'cuda'
#     seed = 2023
#     out = 'exp_09_ft'
#     dataset_name = 'RNA_DatasetBaselineSplitbppV0'
#     sn_train = True
    
#     model_name = 'RNA_ModelV7'
#     dim = 192
#     depth = 12
#     dim_head = 32
   
#     epoch = 12
#     lr = 5e-5
#     wd = 0.05
#     pct_start = 0.01
#     md_wt = 'exp_09/models/model.pth'
    
    
# class CFGFT:
#     path = Path("../data/")
#     pathbb = Path("../data/Ribonanza_bpp_files")
#     pathss = Path("../eda/train_ss_vienna_rna.parquet")
#     split_id = Path('../eda/fold_split.csv')
#     bs = 64
#     num_workers = 12
#     device = 'cuda'
#     seed = 2023
#     out = 'exp_08_ft'
#     dataset_name = 'RNA_DatasetBaselineSplitssV0'
#     sn_train = True
    
#     model_name = 'RNA_ModelV3SS'
#     dim = 192
#     depth = 12
#     dim_head = 32
   
#     epoch = 12
#     lr = 5e-5
#     wd = 0.05
#     pct_start = 0.01
#     md_wt = 'exp_08/models/model.pth'

    
    
# out_name = CFG.out + '_' + CFGFT.out
# generate_sub([CFG, CFGFT], out_name, clip=False)
# print(f"kaggle competitions submit stanford-ribonanza-rna-folding -f {out_name}.parquet -m '{out_name}'")

In [7]:
# class CFG12:
#     path = Path("../data/")
#     pathbb = Path("../data/Ribonanza_bpp_files")
#     pathss = Path("../eda/train_ss_vienna_rna.parquet")
#     split_id = Path('../eda/fold_split.csv')
#     bs = 64
#     num_workers = 12
#     device = 'cuda'
#     seed = 2023
#     out = 'exp_12_ft'
#     dataset_name = 'RNA_DatasetBaselineSplitbppV1'
#     sn_train = True
    
#     model_name = 'RNA_ModelV7'
#     dim = 192 * 2
#     depth = 12
#     dim_head = 32
   
#     epoch = 10
#     lr = 5e-5
#     wd = 0.05
#     pct_start = 0.01
    
#     md_wt = 'exp_12/models/model.pth'
    
    
# class CFG14:
#     path = Path("../data/")
#     pathbb = Path("../data/Ribonanza_bpp_files")
#     pathss = Path("../eda/train_ss_vienna_rna.parquet")
#     split_id = Path('../eda/fold_split.csv')
#     bs = 64
#     num_workers = 12
#     device = 'cuda'
#     seed = 2023
#     out = 'exp_14_ft'
#     dataset_name = 'RNA_DatasetBaselineSplitssbppV0'
#     sn_train = True
    
#     model_name = 'RNA_ModelV10'
#     dim = 192 * 2
#     depth = 12
#     dim_head = 32
   
#     epoch = 8
#     lr = 5e-6
#     wd = 0.05
#     pct_start = 0.02
#     md_wt = 'exp_14/models/model.pth'
    
# out_name = CFG12.out + "_" + CFG14.out
# generate_sub([CFG12, CFG14], out_name, clip=False)
# print(f"kaggle competitions submit stanford-ribonanza-rna-folding -f {out_name}.parquet -m '{out_name}'")

In [8]:
# class CFG:
#     path = Path("../data/")
#     pathbb = Path("../data/Ribonanza_bpp_files")
#     pathss = Path("../eda/train_ss_vienna_rna.parquet")
#     split_id = Path('../eda/fold_split.csv')
#     bs = 64
#     num_workers = 12
#     device = 'cuda'
#     seed = 2023
#     out = 'exp_16_ft'
#     dataset_name = 'RNA_DatasetBaselineSplitssbppV1'
#     sn_train = True
    
#     model_name = 'RNA_ModelV11'
#     dim = 192 * 2
#     depth = 12
#     dim_head = 32
   
#     epoch = 8
#     lr = 5e-6
#     wd = 0.05
#     pct_start = 0.02
    
#     md_wt = 'exp_16/models/model.pth'
    
# class CFG19:
#     path = Path("../data/")
#     pathbb = Path("../data/Ribonanza_bpp_files")
#     pathss = Path("../eda/train_ss_vienna_rna.parquet")
#     split_id = Path('../eda/fold_split.csv')
#     bs = 64
#     num_workers = 12
#     device = 'cuda'
#     seed = 2023
#     out = 'exp_19_ft'
#     dataset_name = 'RNA_DatasetBaselineSplitssbppV1'
#     sn_train = True
    
#     model_name = 'RNA_ModelV14'
#     dim = 192 * 2
#     depth = 12
#     dim_head = 32
   
#     epoch = 8
#     lr = 5e-6
#     wd = 0.05
#     pct_start = 0.02
    
#     md_wt = 'exp_19/models/model.pth'
    
# out_name = CFG.out + "_" + CFG19.out
# generate_sub([CFG, CFG19], out_name, clip=False)
# print(f"kaggle competitions submit stanford-ribonanza-rna-folding -f {out_name}.parquet -m '{out_name}'")

In [9]:
# class CFG:
#     path = Path("../data/")
#     pathbb = Path("../data/Ribonanza_bpp_files")
#     pathss = Path("../eda/train_ss_vienna_rna.parquet")
#     split_id = Path('../eda/fold_split.csv')
#     bs = 64
#     num_workers = 12
#     device = 'cuda'
#     seed = 2023
#     out = 'exp_22_ft'
#     dataset_name = 'RNA_DatasetBaselineSplitssbppV1'
#     sn_train = True
    
#     model_name = 'RNA_ModelV17'
#     dim = 192 * 2
#     depth = 12
#     dim_head = 32
   
#     epoch = 24
#     lr = 5e-6
#     wd = 0.05
#     pct_start = 0.01
    
#     md_wt = 'exp_22/models/model.pth'
    
# out_name = CFG.out
# generate_sub([CFG], out_name, clip=False)
# print(f"kaggle competitions submit stanford-ribonanza-rna-folding -f {out_name}.parquet -m '{out_name}'")

In [11]:
# #RNA_Dataset_TestBppSSFullV1s
# class CFG:
#     path = Path("../data/")
#     pathbb = Path("../data/Ribonanza_bpp_files")
#     pathss = Path("../eda/train_ss_vienna_rna.parquet")
#     split_id = Path('../eda/fold_split.csv')
#     bs = 64
#     num_workers = 12
#     device = 'cuda'
#     seed = 2023
#     out = 'exp_23_ft'
#     dataset_name = 'RNA_DatasetBaselineSplitssbppV2'
#     sn_train = True
#     dim = False
#     model_name = 'RNA_ModelV17'
#     model_kwargs = {"dim" : 192 * 2, 
#                     "depth": 12,
#                     "layer_dropout": 0.25,
#                     "attn_dropout": 0.25,
#                     "drop_pat_dropout": 0.25}

   
#     epoch = 24
#     lr = 5e-6
#     wd = 0.05
#     pct_start = 0.01
    
#     md_wt = 'exp_23/models/model.pth'
    
# out_name = CFG.out
# generate_sub([CFG], out_name, clip=False)

# a = pd.read_parquet(f'{out_name}_no_clip.parquet').set_index('id')
# a = a.clip(0,1)
# a = a.reset_index()
# a.to_parquet(f'{out_name}.parquet', index=False)
# print(f"kaggle competitions submit stanford-ribonanza-rna-folding -f {out_name}.parquet -m '{out_name}'")


kwargs
Non-A100 GPU detected, using math or mem efficient attention if input tensor is on cuda
loading : ../exp/exp_23_ft/models/model.pth
generating sub with no cliping: exp_23_ft_no_clip


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for x,y in tqdm(dl):


  0%|          | 0/5250 [00:00<?, ?it/s]

kaggle competitions submit stanford-ribonanza-rna-folding -f exp_23_ft.parquet -m 'exp_23_ft'


In [10]:
class CFG:
    """Experiment config for `exp_26`, consumed by `get_model_from_cfg` via `generate_sub`."""
    # The data/training fields below are not read by any code visible in this
    # notebook; presumably they are carried over from the training config.
    path = Path("../data/")
    pathbb = Path("../data/Ribonanza_bpp_files")
    pathss = Path("../eda/train_ss_vienna_rna.parquet")
    split_id = Path('../eda/fold_split.csv')
    bs = 64
    num_workers = 12
    device = 'cuda'
    seed = 2023
    # Experiment directory name: weights are loaded from
    # `../exp/{out}/models/model.pth`, and it is also used as the output file stem.
    out = 'exp_26'
    dataset_name = 'RNA_DatasetBaselineSplitssbppV3'
    sn_train = True
    
    # Model class name looked up on `rnacomp.models`.
    model_name = 'RNA_ModelV20'
    # Keyword arguments passed to the model class when `dim` is falsy.
    model_kwargs = {"dim" : 192 * 2, 
                    "depth": 12,
                    "layer_dropout": 0.25,
                    "attn_dropout": 0.25,
                    "drop_pat_dropout": 0.25}
    # Falsy on purpose: makes `get_model_from_cfg` take the `model_kwargs` branch.
    dim =False
   
    # Optimiser/schedule values; not read by the inference code visible here.
    epoch = 24
    lr = 5e-6
    wd = 0.05
    pct_start = 0.01
    
    # Not read by `get_model_from_cfg`, which builds its path from `out` instead.
    md_wt = 'exp_26/models/model.pth'
    
# Write unclipped predictions for this experiment first, then derive the
# clipped submission file from them.
out_name = CFG.out
generate_sub([CFG], out_name, clip=False)

# Clip only the reactivity columns: `id` is kept in the index while clipping.
a = (
    pd.read_parquet(f'{out_name}_no_clip.parquet')
    .set_index('id')
    .clip(0,1)
    .reset_index()
)
a.to_parquet(f'{out_name}.parquet', index=False)
print(f"kaggle competitions submit stanford-ribonanza-rna-folding -f {out_name}.parquet -m '{out_name}'")


kwargs
loading : ../exp/exp_26/models/model.pth
generating sub with no cliping: exp_26_no_clip


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for x,y in tqdm(dl):


  0%|          | 0/5250 [00:00<?, ?it/s]

kaggle competitions submit stanford-ribonanza-rna-folding -f exp_26.parquet -m 'exp_26'


In [None]:

# a = pd.read_parquet('exp_22_ft_no_clip.parquet').set_index('id')
# a = a.clip(0,1)
# a = a.reset_index()
# a.to_parquet(f'exp_22_ft.parquet', index=False)

In [12]:
# Visual sanity check: reads `{out_name}.parquet` and saves the heatmap figure.
plot_test_example(out_name)

In [None]:
# name = 'comb_v1'
# a  = (pd.read_parquet('exp_12_ft_exp_14_ft_no_clip.parquet').set_index('id') + pd.read_parquet('exp_09_ft_exp_08_ft.parquet').set_index('id'))/2
# b = (a + pd.read_parquet('exp_16_ft_exp_19_ft_no_clip.parquet').set_index('id'))/2
# b = b.clip(0,1)
# b = b.reset_index()
# b.to_parquet(f'{name}.parquet', index=False)
# print(print(f"kaggle competitions submit stanford-ribonanza-rna-folding -f {name}.parquet -m '{name}'"))
# pd.read_parquet(f'{name}.parquet')


In [None]:
#  a  = (pd.read_parquet('comb_v1.parquet').set_index('id') + pd.read_parquet('submission.parquet').set_index('id'))/2
#  a = a.reset_index()
#  a.to_parquet(f'comb_v2.parquet', index=False)
 

In [19]:
# name = 'comb_v3'
# res = (pd.read_parquet('exp_16_ft_exp_19_ft_no_clip.parquet').set_index('id').values + pd.read_parquet('exp_22_ft_no_clip.parquet').set_index('id').values)/2
# res = np.clip(res, 0, 1)
# res = pd.DataFrame(res, columns=['reactivity_DMS_MaP', 'reactivity_2A3_MaP'], index=pd.read_parquet('exp_16_ft_exp_19_ft_no_clip.parquet').set_index('id').index)
# res.reset_index().to_parquet(f'{name}.parquet', index=False)
# print(print(f"kaggle competitions submit stanford-ribonanza-rna-folding -f {name}.parquet -m '{name}'"))

kaggle competitions submit stanford-ribonanza-rna-folding -f comb_v3.parquet -m 'comb_v3'
None


In [29]:
# name = "comb_v4"
# res = 0.6 * pd.read_parquet("comb_v3.parquet").set_index("id").values +  0.4 * pd.read_parquet("submission.parquet").set_index("id").values
# res = np.clip(res, 0, 1)
# res = pd.DataFrame(
#     res,
#     columns=["reactivity_DMS_MaP", "reactivity_2A3_MaP"],
#     index=pd.read_parquet("exp_16_ft_exp_19_ft_no_clip.parquet").set_index("id").index,
# )
# res.reset_index().to_parquet(f"{name}.parquet", index=False)
# print(
#     print(
#         f"kaggle competitions submit stanford-ribonanza-rna-folding -f {name}.parquet -m '{name}'"
#     )
# )
# plot_test_example(name)

kaggle competitions submit stanford-ribonanza-rna-folding -f comb_v4.parquet -m 'comb_v4'
None


In [5]:
# name = 'comb_v5'
# res = (0.4 * pd.read_parquet('exp_16_ft_exp_19_ft_no_clip.parquet').set_index('id').values + 0.4 * pd.read_parquet('exp_22_ft_no_clip.parquet').set_index('id').values + 0.2* pd.read_parquet('exp_12_ft_exp_14_ft_no_clip.parquet').set_index('id').values)
# res = np.clip(res, 0, 1)
# res = pd.DataFrame(res, columns=['reactivity_DMS_MaP', 'reactivity_2A3_MaP'], index=pd.read_parquet('exp_16_ft_exp_19_ft_no_clip.parquet').set_index('id').index)
# res.reset_index().to_parquet(f'{name}.parquet', index=False)
# print(print(f"kaggle competitions submit stanford-ribonanza-rna-folding -f {name}.parquet -m '{name}'"))
# plot_test_example(name)

kaggle competitions submit stanford-ribonanza-rna-folding -f comb_v5.parquet -m 'comb_v5'
None


In [20]:
# name = 'comb_v6'
# res = (pd.read_parquet('exp_22_ft_no_clip.parquet').set_index('id').values + pd.read_parquet('exp_23_ft_no_clip.parquet').set_index('id').values)/2
# res = np.clip(res, 0, 1)
# res = pd.DataFrame(res, columns=['reactivity_DMS_MaP', 'reactivity_2A3_MaP'], index=pd.read_parquet('exp_16_ft_exp_19_ft_no_clip.parquet').set_index('id').index)
# res.reset_index().to_parquet(f'{name}.parquet', index=False)
# print(print(f"kaggle competitions submit stanford-ribonanza-rna-folding -f {name}.parquet -m '{name}'"))
# plot_test_example(name)

kaggle competitions submit stanford-ribonanza-rna-folding -f comb_v6.parquet -m 'comb_v6'
None


In [6]:
# name = 'comb_v6_no_clip'
# res = (pd.read_parquet('exp_22_ft_no_clip.parquet').set_index('id').values + pd.read_parquet('exp_23_ft_no_clip.parquet').set_index('id').values)/2
# res = pd.DataFrame(res, columns=['reactivity_DMS_MaP', 'reactivity_2A3_MaP'], index=pd.read_parquet('exp_16_ft_exp_19_ft_no_clip.parquet').set_index('id').index)
# res.reset_index().to_parquet(f'{name}.parquet', index=False)
# print(print(f"kaggle competitions submit stanford-ribonanza-rna-folding -f {name}.parquet -m '{name}'"))
# plot_test_example(name)

kaggle competitions submit stanford-ribonanza-rna-folding -f comb_v6_no_clip.parquet -m 'comb_v6_no_clip'
None


In [7]:
# name = 'comb_v7'
# res = (0.4 * pd.read_parquet('exp_16_ft_exp_19_ft_no_clip.parquet').set_index('id').values + 0.4 * pd.read_parquet('comb_v6_no_clip.parquet').set_index('id').values + 0.2* pd.read_parquet('exp_12_ft_exp_14_ft_no_clip.parquet').set_index('id').values)
# res = np.clip(res, 0, 1)
# res = pd.DataFrame(res, columns=['reactivity_DMS_MaP', 'reactivity_2A3_MaP'], index=pd.read_parquet('exp_16_ft_exp_19_ft_no_clip.parquet').set_index('id').index)
# res.reset_index().to_parquet(f'{name}.parquet', index=False)
# print(print(f"kaggle competitions submit stanford-ribonanza-rna-folding -f {name}.parquet -m '{name}'"))
# plot_test_example(name)

kaggle competitions submit stanford-ribonanza-rna-folding -f comb_v7.parquet -m 'comb_v7'
None


In [7]:
# comb_v8: equal-weight blend of comb_v7 and an externally produced submission.
name = 'comb_v8'
target_cols = ['reactivity_DMS_MaP', 'reactivity_2A3_MaP']
base = pd.read_parquet('comb_v7.parquet').set_index('id')
# NOTE(review): absolute, machine-specific path - consider moving it to a config value.
other = pd.read_parquet('/opt/slh/rna/sub/submission_ft_14226.parquet').set_index('id')
# The blend is positional, so both files must list the same ids in the same order.
assert base.index.equals(other.index), "submissions are not aligned on id"
# Select columns by name so a different column order in either file cannot swap targets.
res = 0.5 * base[target_cols].values + 0.5 * other[target_cols].values
res = np.clip(res, 0, 1)
# Reuse the ids already loaded instead of re-reading a third parquet just for its index.
res = pd.DataFrame(res, columns=target_cols, index=base.index)
res.reset_index().to_parquet(f'{name}.parquet', index=False)
print(f"kaggle competitions submit stanford-ribonanza-rna-folding -f {name}.parquet -m '{name}'")
plot_test_example(name)

kaggle competitions submit stanford-ribonanza-rna-folding -f comb_v8.parquet -m 'comb_v8'
None


In [6]:
# comb_v9: equal-weight blend of comb_v4 and an externally produced submission.
name = 'comb_v9'
target_cols = ['reactivity_DMS_MaP', 'reactivity_2A3_MaP']
base = pd.read_parquet('comb_v4.parquet').set_index('id')
# NOTE(review): absolute, machine-specific path - consider moving it to a config value.
other = pd.read_parquet('/opt/slh/rna/sub/submission_ft_14226.parquet').set_index('id')
# The blend is positional, so both files must list the same ids in the same order.
assert base.index.equals(other.index), "submissions are not aligned on id"
# Select columns by name so a different column order in either file cannot swap targets.
res = 0.5 * base[target_cols].values + 0.5 * other[target_cols].values
res = np.clip(res, 0, 1)
# Reuse the ids already loaded instead of re-reading a third parquet just for its index.
res = pd.DataFrame(res, columns=target_cols, index=base.index)
res.reset_index().to_parquet(f'{name}.parquet', index=False)
print(f"kaggle competitions submit stanford-ribonanza-rna-folding -f {name}.parquet -m '{name}'")
plot_test_example(name)

kaggle competitions submit stanford-ribonanza-rna-folding -f comb_v9.parquet -m 'comb_v9'
None


In [7]:
# comb_v10: 0.65 / 0.35 blend of comb_v4 and an externally produced submission.
name = 'comb_v10'
target_cols = ['reactivity_DMS_MaP', 'reactivity_2A3_MaP']
base = pd.read_parquet('comb_v4.parquet').set_index('id')
# NOTE(review): absolute, machine-specific path - consider moving it to a config value.
other = pd.read_parquet('/opt/slh/rna/sub/submission_ft_14226.parquet').set_index('id')
# The blend is positional, so both files must list the same ids in the same order.
assert base.index.equals(other.index), "submissions are not aligned on id"
# Select columns by name so a different column order in either file cannot swap targets.
res = 0.65 * base[target_cols].values + 0.35 * other[target_cols].values
res = np.clip(res, 0, 1)
# Reuse the ids already loaded instead of re-reading a third parquet just for its index.
res = pd.DataFrame(res, columns=target_cols, index=base.index)
res.reset_index().to_parquet(f'{name}.parquet', index=False)
print(f"kaggle competitions submit stanford-ribonanza-rna-folding -f {name}.parquet -m '{name}'")
plot_test_example(name)

kaggle competitions submit stanford-ribonanza-rna-folding -f comb_v10.parquet -m 'comb_v10'
None
