# Overview
This kernel is almost the same as [Lightweight Roberta solution in PyTorch](https://www.kaggle.com/andretugan/lightweight-roberta-solution-in-pytorch), but instead of starting from "roberta-base", it starts from [Maunish's pre-trained model](https://www.kaggle.com/maunish/clrp-roberta-base).

Acknowledgments: some ideas were taken from kernels by [Torch](https://www.kaggle.com/rhtsingh) and [Maunish](https://www.kaggle.com/maunish).

In [1]:
# Initialise a git repository in the working directory.
!git init
# Register the git-lfs apt repository, install git-lfs and enable it for git.
# NOTE(review): this pipes a remotely fetched script straight into bash;
# acceptable in a throwaway kernel, but review before reusing elsewhere.
!curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash
!apt-get install git-lfs
!git lfs install
# Clone roberta-base into ./roberta-base, the directory TOKENIZER_PATH points at.
!git clone https://huggingface.co/roberta-base

# Other checkpoints that can be cloned the same way (currently disabled):
# !git clone https://huggingface.co/roberta-large
# !git clone https://huggingface.co/facebook/bart-base
# !git clone https://huggingface.co/bert-base-uncased
# !git clone https://huggingface.co/microsoft/deberta-base
# !git clone https://huggingface.co/distilroberta-base

Initialized empty Git repository in /kaggle/working/.git/
Detected operating system as Ubuntu/bionic.
Checking for curl...
Detected curl...
Checking for gpg...
Detected gpg...
Running apt-get update... done.
Installing apt-transport-https... done.
Installing /etc/apt/sources.list.d/github_git-lfs.list...done.
Importing packagecloud gpg key... done.
Running apt-get update... done.

The repository is setup! You can now install packages.



The following NEW packages will be installed:
  git-lfs
0 upgraded, 1 newly installed, 0 to remove and 52 not upgraded.
Need to get 2129 kB of archives.
After this operation, 7662 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 git-lfs amd64 2.3.4-1 [2129 kB]
Fetched 2129 kB in 0s (10.2 MB/s)
debconf: delaying package configuration, since apt-utils is not installed
Selecting previously unselected package git-lfs.
(Reading database ... 100757 files and directories currently 

In [2]:
import os
import math
import random
import time

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import AdamW
from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AutoConfig
from transformers import get_cosine_schedule_with_warmup

from sklearn.model_selection import KFold

import gc
gc.enable()

In [3]:
# Number of cross-validation folds; the training cell saves one model per fold.
NUM_FOLDS = 6  # other values noted by the author: 5, 7, 8, 9, 10
# Default number of passes over the training split in train().
NUM_EPOCHS = 3
BATCH_SIZE = 16  # other values noted by the author: 1, 24, 32
# Token length every excerpt is padded/truncated to by LitDataset.
# LitModel also uses it as the channel width of its head layers.
MAX_LEN = 300  # previously 248
# (val_rmse threshold, evaluation period in steps) pairs, scanned in order by
# train(): the first pair whose threshold is <= the latest val_rmse decides how
# many training steps pass before the next validation run.
EVAL_SCHEDULE = [(0.50, 16), (0.49, 8), (0.48, 4), (0.47, 2), (-1., 1)]
# Checkpoint handed to AutoModel.from_pretrained in LitModel.
# Alternative noted by the author: "../input/clrp-roberta-base/clrp_roberta_base"
ROBERTA_PATH = "../input/commonlit-readability-prize-roberta-torch-itpt/output/pytorch_model.bin"
# Local roberta-base checkout created by the first cell.
# Alternative noted by the author: "../input/clrp-roberta-base/clrp_roberta_base"
TOKENIZER_PATH = "./roberta-base"
# Model config handed to AutoConfig.from_pretrained in LitModel.
CONFIG_PATH = "../input/commonlit-readability-prize-roberta-torch-itpt/output/config.json"
# Run on the GPU when one is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
def set_random_seed(random_seed):
    """Seed every random number generator this notebook touches.

    Seeds Python's `random`, NumPy and PyTorch (CPU, the current CUDA device
    and all CUDA devices), exports PYTHONHASHSEED, and sets
    `torch.backends.cudnn.deterministic`.

    Args:
        random_seed: integer seed applied to all generators.
    """
    os.environ["PYTHONHASHSEED"] = str(random_seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(random_seed)

    torch.backends.cudnn.deterministic = True

In [5]:
# Load the training data and keep only complete entries: a row whose target
# and standard_error are both 0 is treated as incomplete and removed.
# The index is renumbered from 0 afterwards.
train_df = (
    pd.read_csv("/kaggle/input/commonlitreadabilityprize/train.csv")
    .loc[lambda frame: (frame.target != 0) | (frame.standard_error != 0)]
    .reset_index(drop=True)
)

test_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/test.csv")
submission_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/sample_submission.csv")

In [6]:
# Module-level tokenizer loaded from TOKENIZER_PATH; LitDataset reads this
# global when it encodes the excerpts.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)

# Dataset

In [7]:
class LitDataset(Dataset):
    """Excerpts tokenized once up front, optionally paired with their targets.

    The whole `excerpt` column is encoded at construction time with the
    module-level `tokenizer`, padded/truncated to MAX_LEN tokens.

    Args:
        df: DataFrame with an `excerpt` column (and a `target` column unless
            `inference_only` is set).
        inference_only: when True no targets are stored and items are
            `(input_ids, attention_mask)` pairs instead of triples.
    """

    def __init__(self, df, inference_only=False):
        super().__init__()

        self.df = df
        self.inference_only = inference_only
        self.text = df.excerpt.tolist()

        if not inference_only:
            self.target = torch.tensor(df.target.values, dtype=torch.float32)

        self.encoded = tokenizer.batch_encode_plus(
            self.text,
            padding='max_length',
            max_length=MAX_LEN,
            truncation=True,
            return_attention_mask=True,
        )

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return the encoded excerpt at `index`, plus its target when training."""
        sample = (
            torch.tensor(self.encoded['input_ids'][index]),
            torch.tensor(self.encoded['attention_mask'][index]),
        )
        if not self.inference_only:
            sample += (self.target[index],)
        return sample

# Model
The model is inspired by the one from [Maunish](https://www.kaggle.com/maunish/clrp-roberta-svm).

In [8]:
# class LitModel(nn.Module):
#     def __init__(self):
#         super().__init__()

#         config = AutoConfig.from_pretrained(CONFIG_PATH)
#         config.update({"output_hidden_states":True, 
#                        "hidden_dropout_prob": 0.0,
# #                        "attention_probs_dropout_prob":0.0,
#                        "layer_norm_eps": 1e-7})                       
        
#         self.roberta = AutoModel.from_pretrained(ROBERTA_PATH, config=config)  
            
#         self.attention = nn.Sequential(            
#             nn.Linear(768, 512),            
#             nn.Tanh(),  
#             nn.Linear(512, 1),
#             nn.Softmax(dim=1)
#         )        

#         self.regressor = nn.Sequential(      
# #             nn.LayerNorm(768),
#             nn.Linear(768, 1),          
#         )
        

#     def forward(self, input_ids, attention_mask):
#         roberta_output = self.roberta(input_ids=input_ids,
#                                       attention_mask=attention_mask)        

#         # There are a total of 13 layers of hidden states.
#         # 1 for the embedding layer, and 12 for the 12 Roberta layers.
#         # We take the hidden states from the last Roberta layer.
#         last_layer_hidden_states = roberta_output.hidden_states[-1]

#         # The number of cells is MAX_LEN.
#         # The size of the hidden state of each cell is 768 (for roberta-base).
#         # In order to condense hidden states of all cells to a context vector,
#         # we compute a weighted average of the hidden states of all cells.
#         # We compute the weight of each cell, using the attention neural network.
#         weights = self.attention(last_layer_hidden_states)
                
#         # weights.shape is BATCH_SIZE x MAX_LEN x 1
#         # last_layer_hidden_states.shape is BATCH_SIZE x MAX_LEN x 768        
#         # Now we compute context_vector as the weighted average.
#         # context_vector.shape is BATCH_SIZE x 768
#         context_vector = torch.sum(weights * last_layer_hidden_states, dim=1)        
        
#         # Now we reduce the context vector to the prediction score.
#         return self.regressor(context_vector)

In [9]:
# class LitModel(nn.Module):
#     def __init__(self):
#         super().__init__()

#         config = AutoConfig.from_pretrained(CONFIG_PATH)
#         config.update({"output_hidden_states":True, 
#                        "hidden_dropout_prob": 0.0,
# #                        "attention_probs_dropout_prob":0.0,
#                        "layer_norm_eps": 1e-7})                       
        
#         self.roberta = AutoModel.from_pretrained(ROBERTA_PATH, config=config)  
#         self.cnn1 = nn.Conv1d(768, MAX_LEN, kernel_size=2, padding=1)
#         self.cnn2 = nn.Conv1d(MAX_LEN, 1, kernel_size=2, padding=1)

        

#     def forward(self, input_ids, attention_mask):
#         roberta_output = self.roberta(input_ids=input_ids,
#                                       attention_mask=attention_mask)        
#         last_hidden_state = roberta_output[0]
#         last_hidden_state = last_hidden_state.permute(0, 2, 1)
#         cnn_embeddings = F.relu(self.cnn1(last_hidden_state))
#         cnn_embeddings = self.cnn2(cnn_embeddings)
#         logits, _ = torch.max(cnn_embeddings, 2)
#         return logits

In [10]:
# https://arxiv.org/pdf/2103.04083v1.pdf
class LitModel(nn.Module):
    """RoBERTa encoder followed by a kernel-size-1 Conv1d head with attention pooling.

    The last-layer hidden states are projected per token from 768 to 512 to
    MAX_LEN features by two kernel-size-1 convolutions, pooled over the token
    axis with learned softmax weights, and mapped to one regression value.
    """

    def __init__(self):
        """Load the encoder from CONFIG_PATH / ROBERTA_PATH and create the head layers."""
        super().__init__()

        config = AutoConfig.from_pretrained(CONFIG_PATH)
        # Return every layer's hidden states, turn off hidden-state dropout and
        # use a smaller LayerNorm epsilon. attention_probs_dropout_prob is not
        # overridden here.
        config.update({"output_hidden_states":True, 
                       "hidden_dropout_prob": 0.0,
                       "layer_norm_eps": 1e-7})                       
        
        self.roberta = AutoModel.from_pretrained(ROBERTA_PATH, config=config)  
        # Kernel-size-1 convolutions: each token position is transformed
        # independently, 768 -> 512 -> MAX_LEN channels.
        self.cnn1 = nn.Conv1d(768, 512, kernel_size=1)
        self.cnn2 = nn.Conv1d(512, MAX_LEN, kernel_size=1)
         
        # NOTE(review): registered but not applied in forward() (the call
        # there is commented out).
        self.layernorm = nn.LayerNorm(MAX_LEN)
            
        # Maps each token's MAX_LEN-dim feature vector to one score;
        # Softmax(dim=1) normalizes the scores across the token axis.
        self.attention = nn.Sequential(            
            nn.Linear(MAX_LEN, MAX_LEN),            
            nn.Tanh(),  
            nn.Linear(MAX_LEN, 1),
            nn.Softmax(dim=1)
        )        

        # Final linear map from the pooled MAX_LEN-dim vector to one value.
        self.regressor = nn.Sequential(      
            nn.Linear(MAX_LEN, 1),          
        )
        

    def forward(self, input_ids, attention_mask):
        """Return one prediction per sequence, shape (batch, 1).

        Args:
            input_ids: token ids, (batch, seq_len).
            attention_mask: mask passed to the encoder, (batch, seq_len).

        NOTE(review): attention_mask is only given to the encoder; the pooling
        softmax below runs over all seq_len positions, padding included.
        """
        roberta_output = self.roberta(input_ids=input_ids,
                                      attention_mask=attention_mask)   
        # Hidden states of the last encoder layer: (batch, seq_len, 768).
        last_hidden_state = roberta_output.hidden_states[-1]
        # Conv1d expects channels first: (batch, 768, seq_len).
        last_hidden_state = last_hidden_state.permute(0, 2, 1)
        # (batch, 512, seq_len)
        cnn_embeddings = F.relu(self.cnn1(last_hidden_state))
        # (batch, MAX_LEN features, seq_len)
        cnn_embeddings = self.cnn2(cnn_embeddings)
        # Back to tokens first: (batch, seq_len, MAX_LEN features).
        cnn_embeddings = cnn_embeddings.permute(0, 2, 1)
        # LayerNorm over the feature axis is currently disabled:
#         cnn_embeddings = self.layernorm(cnn_embeddings)

        # One weight per token position, summing to 1 over the token axis:
        # (batch, seq_len, 1).
        weights = self.attention(cnn_embeddings)
                
        # Weighted sum of the per-token feature vectors over the token axis:
        # (batch, MAX_LEN).
        context_vector = torch.sum(weights * cnn_embeddings, dim=1)
        
        # Reduce the pooled vector to the prediction score: (batch, 1).
        return self.regressor(context_vector)

In [11]:
def eval_mse(model, data_loader):
    """Return the mean squared error of |model| over |data_loader|.

    The model is switched to eval mode and run without gradient tracking.
    Squared errors are summed over all batches and divided by
    `len(data_loader.dataset)`.
    """
    model.eval()
    sum_squared_error = nn.MSELoss(reduction="sum")
    total_squared_error = 0

    with torch.no_grad():
        for input_ids, attention_mask, target in data_loader:
            target = target.to(DEVICE)
            pred = model(input_ids.to(DEVICE), attention_mask.to(DEVICE))
            total_squared_error += sum_squared_error(pred.flatten(), target).item()

    return total_squared_error / len(data_loader.dataset)

In [12]:
def predict(model, data_loader):
    """Return an np.array with the predictions of |model| on |data_loader|.

    The model is switched to eval mode and run without gradient tracking.
    Predictions are written, batch by batch and in loader order, into a
    float array with one slot per item of `data_loader.dataset`.
    """
    model.eval()

    result = np.zeros(len(data_loader.dataset))
    offset = 0

    with torch.no_grad():
        for input_ids, attention_mask in data_loader:
            pred = model(input_ids.to(DEVICE), attention_mask.to(DEVICE))

            batch_rows = pred.shape[0]
            result[offset:offset + batch_rows] = pred.flatten().cpu().numpy()
            offset += batch_rows

    return result

In [13]:
def train(model, model_path, train_loader, val_loader,
          optimizer, scheduler=None, num_epochs=NUM_EPOCHS):
    """Train |model| and save the weights with the best validation RMSE.

    Validation runs every `eval_period` optimizer steps. After each validation
    run the period is re-read from EVAL_SCHEDULE: the first
    (threshold, period) pair whose threshold is <= the current val_rmse wins.
    Whenever the validation RMSE improves, the model's state_dict is written
    to |model_path|.

    Args:
        model: module called as `model(input_ids, attention_mask)`.
        model_path: file the best state_dict is saved to.
        train_loader: yields `(input_ids, attention_mask, target)` batches.
        val_loader: loader passed to eval_mse().
        optimizer: optimizer stepped once per batch.
        scheduler: optional LR scheduler stepped once per batch.
        num_epochs: number of passes over |train_loader|.

    Returns:
        The best validation RMSE observed, or None if no validation run
        was triggered.
    """
    best_val_rmse = None
    best_epoch = 0
    step = 0
    last_eval_step = 0
    eval_period = EVAL_SCHEDULE[0][1]

    # Stateless loss module; built once instead of once per batch.
    criterion = nn.MSELoss(reduction="mean")

    start = time.time()

    for epoch in range(num_epochs):
        val_rmse = None

        for batch_num, (input_ids, attention_mask, target) in enumerate(train_loader):
            input_ids = input_ids.to(DEVICE)
            attention_mask = attention_mask.to(DEVICE)
            target = target.to(DEVICE)

            optimizer.zero_grad()

            # eval_mse() leaves the model in eval mode, so training mode is
            # re-enabled before every forward pass.
            model.train()

            pred = model(input_ids, attention_mask)

            mse = criterion(pred.flatten(), target)

            mse.backward()

            optimizer.step()
            if scheduler is not None:
                scheduler.step()

            if step >= last_eval_step + eval_period:
                # Evaluate the model on val_loader.
                elapsed_seconds = time.time() - start
                num_steps = step - last_eval_step
                print(f"\n{num_steps} steps took {elapsed_seconds:0.3} seconds")
                last_eval_step = step

                val_rmse = math.sqrt(eval_mse(model, val_loader))

                print(f"Epoch: {epoch} batch_num: {batch_num}", 
                      f"val_rmse: {val_rmse:0.4}")

                # Pick the evaluation period for the current quality level.
                for rmse, period in EVAL_SCHEDULE:
                    if val_rmse >= rmse:
                        eval_period = period
                        break

                # Compare against None explicitly: a truthiness test would
                # treat a best RMSE of 0.0 as "no best yet" and let a worse
                # model overwrite the checkpoint.
                if best_val_rmse is None or val_rmse < best_val_rmse:
                    best_val_rmse = val_rmse
                    best_epoch = epoch
                    torch.save(model.state_dict(), model_path)
                    print(f"New best_val_rmse: {best_val_rmse:0.4}")
                else:
                    print(f"Still best_val_rmse: {best_val_rmse:0.4}",
                          f"(from epoch {best_epoch})")

                # Restart the timer so validation time is not counted as
                # training time.
                start = time.time()

            step += 1

    return best_val_rmse

In [14]:
def create_optimizer(model):
    """Build an AdamW optimizer with per-parameter learning rates.

    Parameters are selected by name rather than by position in
    `model.named_parameters()`, so the grouping no longer depends on how many
    head layers the model has or in which order they were registered:

    * every parameter outside the `roberta.` sub-module (the head) is trained
      with lr 1e-3 and weight decay 0.001;
    * `roberta.pooler.*` is left out of the optimizer, as before;
    * every other `roberta.*` parameter gets its own group, with weight decay
      0.0 for biases and 0.01 otherwise, and a learning rate that grows with
      the parameter's position in the encoder (2e-5, then 5e-5 from position
      69, then 1e-4 from position 133).

    Args:
        model: module whose encoder is registered under the name `roberta`.

    Returns:
        An AdamW instance over the groups described above.
    """
    head_params = []
    encoder_named_params = []

    for name, params in model.named_parameters():
        if not name.startswith("roberta."):
            head_params.append(params)
        elif not name.startswith("roberta.pooler."):
            encoder_named_params.append((name, params))

    parameters = []
    if head_params:
        parameters.append({"params": head_params,
                           "weight_decay": 0.001,
                           "lr": 1e-3})

    # NOTE(review): positions 69 and 133 are presumably the first parameters
    # of encoder layers 4 and 8 in roberta-base — confirm before switching
    # to a different backbone.
    for layer_num, (name, params) in enumerate(encoder_named_params):
        weight_decay = 0.0 if "bias" in name else 0.01

        if layer_num >= 133:
            lr = 1e-4
        elif layer_num >= 69:
            lr = 5e-5
        else:
            lr = 2e-5

        parameters.append({"params": params,
                           "weight_decay": weight_decay,
                           "lr": lr})

    return AdamW(parameters)

In [15]:
# Train one model per cross-validation fold and collect the best validation
# RMSE of each fold in list_val_rmse.
gc.collect()

SEED = 1000
list_val_rmse = []

kfold = KFold(n_splits=NUM_FOLDS, random_state=SEED, shuffle=True)

for fold, (train_indices, val_indices) in enumerate(kfold.split(train_df)):
    print(f"\nFold {fold + 1}/{NUM_FOLDS}")
    model_path = f"model_{fold + 1}.pth"

    set_random_seed(SEED + fold)

    # KFold.split yields positional indices, so rows are selected by position.
    train_dataset = LitDataset(train_df.iloc[train_indices])
    val_dataset = LitDataset(train_df.iloc[val_indices])

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              drop_last=True, shuffle=True, num_workers=2)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE,
                            drop_last=False, shuffle=False, num_workers=2)

    # Re-seed so model initialisation does not depend on the steps above.
    set_random_seed(SEED + fold)

    model = LitModel().to(DEVICE)

    optimizer = create_optimizer(model)
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_training_steps=NUM_EPOCHS * len(train_loader),
        num_warmup_steps=50)

    list_val_rmse.append(train(model, model_path, train_loader,
                               val_loader, optimizer, scheduler=scheduler))

    # The optimizer (and the scheduler through it) holds references to the
    # model's parameters, so all three must go before the weights can be
    # released for the next fold.
    del model, optimizer, scheduler
    gc.collect()

    print("\nPerformance estimates:")
    print(list_val_rmse)
    print("Mean:", np.array(list_val_rmse).mean())
    


Fold 1/6


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/commonlit-readability-prize-roberta-torch-itpt/output/pytorch_model.bin and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



16 steps took 9.94 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9289
New best_val_rmse: 0.9289

16 steps took 8.23 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7253
New best_val_rmse: 0.7253

16 steps took 8.31 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6896
New best_val_rmse: 0.6896

16 steps took 8.19 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6827
New best_val_rmse: 0.6827

16 steps took 8.23 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6979
Still best_val_rmse: 0.6827 (from epoch 0)

16 steps took 8.18 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5887
New best_val_rmse: 0.5887

16 steps took 8.22 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5648
New best_val_rmse: 0.5648

16 steps took 8.25 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5512
New best_val_rmse: 0.5512

16 steps took 8.2 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5338
New best_val_rmse: 0.5338

16 steps took 8.36 seconds
Epoch: 1 batch_num: 13 val_rmse: 0.5995
Still best_val_rmse: 0.5338 (from epoch 0)

16 steps took 8.24 seco

Some weights of RobertaModel were not initialized from the model checkpoint at ../input/commonlit-readability-prize-roberta-torch-itpt/output/pytorch_model.bin and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



16 steps took 8.82 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9211
New best_val_rmse: 0.9211

16 steps took 8.2 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6967
New best_val_rmse: 0.6967

16 steps took 8.23 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6419
New best_val_rmse: 0.6419

16 steps took 8.27 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6455
Still best_val_rmse: 0.6419 (from epoch 0)

16 steps took 8.32 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6516
Still best_val_rmse: 0.6419 (from epoch 0)

16 steps took 8.16 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6302
New best_val_rmse: 0.6302

16 steps took 8.35 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5903
New best_val_rmse: 0.5903

16 steps took 8.26 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5599
New best_val_rmse: 0.5599

16 steps took 8.23 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.5026
New best_val_rmse: 0.5026

16 steps took 8.39 seconds
Epoch: 1 batch_num: 13 val_rmse: 0.5309
Still best_val_rmse: 0.5026 (from epoch 0)

16 ste

Some weights of RobertaModel were not initialized from the model checkpoint at ../input/commonlit-readability-prize-roberta-torch-itpt/output/pytorch_model.bin and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



16 steps took 8.82 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.944
New best_val_rmse: 0.944

16 steps took 8.25 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7514
New best_val_rmse: 0.7514

16 steps took 8.26 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.9206
Still best_val_rmse: 0.7514 (from epoch 0)

16 steps took 8.21 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.8612
Still best_val_rmse: 0.7514 (from epoch 0)

16 steps took 8.24 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6576
New best_val_rmse: 0.6576

16 steps took 8.22 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5836
New best_val_rmse: 0.5836

16 steps took 8.2 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5441
New best_val_rmse: 0.5441

16 steps took 8.2 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5801
Still best_val_rmse: 0.5441 (from epoch 0)

16 steps took 8.21 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.7879
Still best_val_rmse: 0.5441 (from epoch 0)

16 steps took 8.43 seconds
Epoch: 1 batch_num: 13 val_rmse: 0.7212
Still best_val_rmse

Some weights of RobertaModel were not initialized from the model checkpoint at ../input/commonlit-readability-prize-roberta-torch-itpt/output/pytorch_model.bin and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



16 steps took 8.88 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9605
New best_val_rmse: 0.9605

16 steps took 8.17 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.8249
New best_val_rmse: 0.8249

16 steps took 8.25 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6208
New best_val_rmse: 0.6208

16 steps took 8.24 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.613
New best_val_rmse: 0.613

16 steps took 8.29 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.647
Still best_val_rmse: 0.613 (from epoch 0)

16 steps took 8.26 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.626
Still best_val_rmse: 0.613 (from epoch 0)

16 steps took 8.22 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.6757
Still best_val_rmse: 0.613 (from epoch 0)

16 steps took 8.24 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.5936
New best_val_rmse: 0.5936

16 steps took 8.23 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6632
Still best_val_rmse: 0.5936 (from epoch 0)

16 steps took 8.45 seconds
Epoch: 1 batch_num: 13 val_rmse: 0.5341
New best_val_rmse: 0.5

Some weights of RobertaModel were not initialized from the model checkpoint at ../input/commonlit-readability-prize-roberta-torch-itpt/output/pytorch_model.bin and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



16 steps took 8.89 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9672
New best_val_rmse: 0.9672

16 steps took 8.23 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6773
New best_val_rmse: 0.6773

16 steps took 8.22 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6504
New best_val_rmse: 0.6504

16 steps took 8.2 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6742
Still best_val_rmse: 0.6504 (from epoch 0)

16 steps took 8.21 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.7225
Still best_val_rmse: 0.6504 (from epoch 0)

16 steps took 8.32 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.5714
New best_val_rmse: 0.5714

16 steps took 8.23 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.5638
New best_val_rmse: 0.5638

16 steps took 8.3 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6124
Still best_val_rmse: 0.5638 (from epoch 0)

16 steps took 8.19 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.7455
Still best_val_rmse: 0.5638 (from epoch 0)

16 steps took 8.43 seconds
Epoch: 1 batch_num: 13 val_rmse: 0.5442
New best_val_rmse

Some weights of RobertaModel were not initialized from the model checkpoint at ../input/commonlit-readability-prize-roberta-torch-itpt/output/pytorch_model.bin and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



16 steps took 8.85 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9248
New best_val_rmse: 0.9248

16 steps took 8.28 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7227
New best_val_rmse: 0.7227

16 steps took 8.2 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.8201
Still best_val_rmse: 0.7227 (from epoch 0)

16 steps took 8.22 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.8578
Still best_val_rmse: 0.7227 (from epoch 0)

16 steps took 8.2 seconds
Epoch: 0 batch_num: 80 val_rmse: 0.6491
New best_val_rmse: 0.6491

16 steps took 8.29 seconds
Epoch: 0 batch_num: 96 val_rmse: 0.6223
New best_val_rmse: 0.6223

16 steps took 8.23 seconds
Epoch: 0 batch_num: 112 val_rmse: 0.564
New best_val_rmse: 0.564

16 steps took 8.22 seconds
Epoch: 0 batch_num: 128 val_rmse: 0.6135
Still best_val_rmse: 0.564 (from epoch 0)

16 steps took 8.18 seconds
Epoch: 0 batch_num: 144 val_rmse: 0.6076
Still best_val_rmse: 0.564 (from epoch 0)

16 steps took 8.37 seconds
Epoch: 1 batch_num: 13 val_rmse: 0.6053
Still best_val_rmse: 

# Inference

In [16]:
# Tokenize the test excerpts (no targets).
# NOTE(review): the next cell builds the same dataset again, so this cell
# looks redundant.
test_dataset = LitDataset(test_df, inference_only=True)

In [17]:
# One row of test-set predictions per trained fold model.
all_predictions = np.zeros((len(list_val_rmse), len(test_df)))

test_dataset = LitDataset(test_df, inference_only=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                         drop_last=False, shuffle=False, num_workers=2)

for index in range(len(list_val_rmse)):
    model_path = f"model_{index + 1}.pth"
    print(f"\nUsing {model_path}")

    model = LitModel()
    # Deserialize onto the CPU: this works when the checkpoint was written
    # from a GPU but no GPU is available now, and it avoids a throwaway GPU
    # copy of the weights before the model itself is moved to DEVICE.
    model.load_state_dict(torch.load(model_path, map_location="cpu"))
    model.to(DEVICE)

    all_predictions[index] = predict(model, test_loader)

    # Drop the model before the next fold's weights are loaded.
    del model
    gc.collect()


Using model_1.pth


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/commonlit-readability-prize-roberta-torch-itpt/output/pytorch_model.bin and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Using model_2.pth


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/commonlit-readability-prize-roberta-torch-itpt/output/pytorch_model.bin and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Using model_3.pth


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/commonlit-readability-prize-roberta-torch-itpt/output/pytorch_model.bin and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Using model_4.pth


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/commonlit-readability-prize-roberta-torch-itpt/output/pytorch_model.bin and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Using model_5.pth


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/commonlit-readability-prize-roberta-torch-itpt/output/pytorch_model.bin and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Using model_6.pth


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/commonlit-readability-prize-roberta-torch-itpt/output/pytorch_model.bin and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Ensemble by averaging the per-fold predictions for every test excerpt,
# then write the result in the sample-submission layout.
predictions = np.mean(all_predictions, axis=0)
submission_df["target"] = predictions
print(submission_df)
submission_df.to_csv("submission.csv", index=False)

          id    target
0  c0f722661 -0.534043
1  f0953f0a5 -0.554323
2  0df072751 -0.468059
3  04caf4e0c -2.560002
4  0e63f8bea -1.818977
5  12537fe78 -1.439899
6  965e592c0  0.249321
