### This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# In [None]:
# Importing libraries.
import pandas as pd
import os, gc
import numpy as np
from sklearn.model_selection import KFold

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# In [None]:
import torch
from fastai.vision.all import *
def flatten(o):
    """Lazily yield the leaves of a (possibly nested) collection.

    * If `o` is a dict, its values are yielded as-is (they are not descended into).
    * String elements are yielded whole rather than split into characters.
    * Any other element is flattened recursively; an element that cannot be
      iterated (iteration raises `TypeError`) is yielded itself.
    """
    is_mapping = isinstance(o, dict)
    for element in o:
        if is_mapping:
            # Iterating a dict gives keys; emit the matching value.
            yield o[element]
        elif isinstance(element, str):
            yield element
        else:
            try:
                yield from flatten(element)
            except TypeError:
                # Not iterable, so this is a leaf.
                yield element

# In [None]:
from torch.cuda.amp import GradScaler, autocast


@delegates(GradScaler)
class MixedPrecision(Callback):
    """Mixed precision training using PyTorch's `autocast` and `GradScaler`.

    The callback passes *itself* to `GradScaler.step` in place of the real
    optimizer (it exposes `param_groups` and `step` for that purpose).  Its
    `step` only clears the `skipped` flag, so the callback can tell whether the
    scaler decided to perform the step or to skip it.

    Keyword arguments given to the constructor are forwarded to `GradScaler`.
    """
    order = 10

    def __init__(self, **kwargs):
        self.kwargs = kwargs

    def before_fit(self):
        "Create the autocast context, the gradient scaler and the list of recorded scales."
        self.autocast, self.learn.scaler, self.scales = autocast(), GradScaler(**self.kwargs), L()

    def before_batch(self):
        "Enter the autocast context; it is left again in `after_loss`."
        self.autocast.__enter__()

    def after_pred(self):
        "Convert half-precision predictions with `to_float` before they are used further."
        # Only the first leaf of the (possibly nested) prediction is inspected.
        if next(flatten(self.pred)).dtype == torch.float16:
            self.learn.pred = to_float(self.pred)

    def after_loss(self):
        "Leave the autocast context entered in `before_batch`."
        self.autocast.__exit__(None, None, None)

    def before_backward(self):
        "Replace the loss used for the backward pass by its scaled version."
        self.learn.loss_grad = self.scaler.scale(self.loss_grad)

    def before_step(self):
        "Let the scaler decide whether to step; cancel the step if it did not call `step`."
        self.skipped = True
        # `self` stands in for the optimizer: if the scaler goes ahead it calls
        # `self.step`, which resets `skipped`.
        self.scaler.step(self)
        if self.skipped:
            raise CancelStepException()
        self.scales.append(self.scaler.get_scale())

    def after_step(self):
        "Update the scale factor after the step."
        self.learn.scaler.update()

    @property
    def param_groups(self):
        "Expose the real optimizer's parameter groups to the scaler."
        return self.opt.param_groups

    def step(self, *args, **kwargs):
        "Called by the scaler instead of the optimizer's `step`; records that the step was not skipped."
        self.skipped = False

    def after_fit(self):
        "Drop the autocast context, the scaler and the recorded scales."
        self.autocast, self.learn.scaler, self.scales = None, None, None
        

# Import the submodule explicitly: a bare `import fastai` does not guarantee
# that `fastai.callback.fp16` has been loaded, so the assignment below would
# otherwise depend on some earlier import having pulled it in.
import fastai.callback.fp16

# Replace fastai's callback with the MixedPrecision class defined in this file,
# so lookups of `MixedPrecision` through that module resolve to this version.
fastai.callback.fp16.MixedPrecision = MixedPrecision

        
    
        
        


# In [None]:
# Run name and input/output locations.
fname = 'example0'
PATH = '/kaggle/input/stanford-ribonanza-rna-folding-converted/'
OUT = './'

# Data-loading settings.
bs, num_workers = 256, 2

# Seed and number of cross-validation folds.
SEED, nfolds = 2023, 4

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'


# In [None]:
class RNA_Dataset(Dataset):
    """Dataset of RNA sequences with paired 2A3 / DMS reactivity targets.

    Rows of `df` are separated by `experiment_type` into a '2A3_MaP' and a
    'DMS_MaP' frame.  One K-fold split is computed on the 2A3 frame and the
    same positional indices are applied to the DMS frame, then only rows whose
    `SN_filter` is positive in *both* frames are kept.
    NOTE(review): this assumes both frames have the same number of rows in the
    same order - confirm against the input data.

    Args:
        df: frame with at least the columns `sequence`, `experiment_type`,
            `SN_filter`, `signal_to_noise`, and columns whose names contain
            'reactivity_0' / 'reactivity_error_0'.  An 'L' column holding the
            sequence length is added to this frame in place.
        mode: 'train' selects the training part of the fold, anything else the
            held-out part.
        seed: random state of the shuffled `KFold`.
        fold: index of the fold to use.
        nfolds: number of folds.
        mask_only: if true, items contain only the length mask.
        **kwargs: ignored, except for the legacy misspelling `mask_onlt`.
    """

    def __init__(self, df, mode='train', seed=2023, fold=0, nfolds=4,
                 mask_only=False, **kwargs):
        # The parameter used to be misspelled `mask_onlt` while the body read
        # `mask_only`; still honour the old keyword if a caller passes it.
        mask_only = kwargs.pop('mask_onlt', mask_only)

        self.seq_map = {'A': 0, 'C': 1, 'G': 2, 'U': 3}
        self.Lmax = 206  # every item is padded to this length

        # NOTE: writes a new column into the caller's frame.
        df['L'] = df.sequence.apply(len)
        df_2A3 = df.loc[df.experiment_type == '2A3_MaP']
        df_DMS = df.loc[df.experiment_type == 'DMS_MaP']

        folds = list(KFold(n_splits=nfolds, random_state=seed,
                           shuffle=True).split(df_2A3))
        # Each fold is a (train_indices, held_out_indices) pair.
        split = folds[fold][0 if mode == 'train' else 1]

        df_2A3 = df_2A3.iloc[split].reset_index(drop=True)
        df_DMS = df_DMS.iloc[split].reset_index(drop=True)

        # Keep a row only if it passes the SN filter in both experiments.
        m = (df_2A3['SN_filter'].values > 0) & (df_DMS['SN_filter'].values > 0)
        df_2A3 = df_2A3.loc[m].reset_index(drop=True)
        df_DMS = df_DMS.loc[m].reset_index(drop=True)

        self.seq = df_2A3['sequence'].values
        self.L = df_2A3['L'].values

        self.react_2A3 = df_2A3[[c for c in df_2A3.columns
                                 if 'reactivity_0' in c]].values
        self.react_DMS = df_DMS[[c for c in df_DMS.columns
                                 if 'reactivity_0' in c]].values
        self.react_err_2A3 = df_2A3[[c for c in df_2A3.columns
                                     if 'reactivity_error_0' in c]].values
        self.react_err_DMS = df_DMS[[c for c in df_DMS.columns
                                     if 'reactivity_error_0' in c]].values
        self.sn_2A3 = df_2A3['signal_to_noise'].values
        self.sn_DMS = df_DMS['signal_to_noise'].values
        self.mask_only = mask_only

    def __len__(self):
        """Return the number of sequences kept after splitting and filtering."""
        return len(self.seq)

    def __getitem__(self, idx):
        """Return an `(inputs, targets)` pair of dicts for item `idx`.

        `mask` is a boolean tensor of length `Lmax` that is True on the first
        `len(sequence)` positions.  With `mask_only` both dicts contain just
        that mask.  Otherwise `inputs` holds the encoded, zero-padded sequence
        under 'seq', and `targets` holds 'react' and 'react_err' (2A3 and DMS
        stacked along the last axis) and 'sn' (the two signal-to-noise values).
        """
        seq = self.seq[idx]
        mask = torch.zeros(self.Lmax, dtype=torch.bool)
        mask[:len(seq)] = True
        if self.mask_only:
            return {'mask': mask}, {'mask': mask}

        tokens = np.array([self.seq_map[s] for s in seq])
        # Right-pad with zeros up to Lmax.
        tokens = np.pad(tokens, (0, self.Lmax - len(tokens)))

        react = torch.from_numpy(
            np.stack([self.react_2A3[idx], self.react_DMS[idx]], -1))
        react_err = torch.from_numpy(
            np.stack([self.react_err_2A3[idx], self.react_err_DMS[idx]], -1))
        sn = torch.FloatTensor([self.sn_2A3[idx], self.sn_DMS[idx]])

        return ({'seq': torch.from_numpy(tokens), 'mask': mask},
                {'react': react, 'react_err': react_err, 'sn': sn, 'mask': mask})
    

class LenMatchBatchSampler(torch.utils.data.BatchSampler):
    """Batch sampler that groups samples of similar length into the same batch.

    Each sample's length is the number of True entries in its 'mask'.  Indices
    are collected in buckets of width 16 (lengths below 32 share bucket 1); a
    bucket is emitted as a batch as soon as it holds `batch_size` indices.
    Indices still sitting in buckets when the sampler is exhausted are batched
    together in bucket order, and a final short batch is emitted unless
    `drop_last` is set.

    The wrapped sampler must expose `data_source`, whose items are either a
    dict with a 'mask' entry or a tuple whose first element is such a dict.
    """

    def __iter__(self):
        # One independent list per bucket; 100 buckets cover lengths < 1600.
        buckets = [[] for _ in range(100)]

        for idx in self.sampler:
            sample = self.sampler.data_source[idx]
            mask = sample[0]['mask'] if isinstance(sample, tuple) else sample['mask']
            # Convert to a plain int so it can be used as a list index.
            bucket_id = max(1, int(mask.sum()) // 16)

            bucket = buckets[bucket_id]
            bucket.append(idx)
            if len(bucket) == self.batch_size:
                yield bucket
                # Start a fresh list; the yielded one now belongs to the consumer.
                buckets[bucket_id] = []

        # Whatever did not fill a bucket is batched regardless of length.
        leftover = [idx for bucket in buckets for idx in bucket]
        for start in range(0, len(leftover), self.batch_size):
            batch = leftover[start:start + self.batch_size]
            if len(batch) == self.batch_size or not self.drop_last:
                yield batch


def dict_to(x, device='cuda'):
    """Return a new dict with every value of `x` moved to `device` via `.to`."""
    return {k: x[k].to(device) for k in x}


def to_device(x, device='cuda'):
    """Apply `dict_to` to every dict in the iterable `x` and return a tuple."""
    return tuple(dict_to(e, device) for e in x)
        
class DeviceDataLoader:
    """Wrap a data loader so that every batch it yields is moved to `device`.

    Each batch from the wrapped loader is expected to be an iterable of dicts
    whose values support `.to(device)`; it is re-emitted as a tuple of new
    dicts with the moved values.
    """

    def __init__(self, dataloader, device='cuda'):
        self.dataloader = dataloader
        self.device = device

    def __len__(self):
        """Return the length of the wrapped loader."""
        return len(self.dataloader)

    def __iter__(self):
        # The per-dict transfer is done inline so this class does not depend on
        # a module-level helper being defined elsewhere in the file.
        for batch in self.dataloader:
            yield tuple(
                {key: part[key].to(self.device) for key in part}
                for part in batch
            )
        