# **Notebook sequence**
* Train/validation split: notebook [here](https://www.kaggle.com/chamecall/train-val-split).<br>
* Pretraining roberta-base with masked language modeling (MLM) on the competition data: notebook [here](https://www.kaggle.com/chamecall/clrp-pretrain).<br>
* Fine-tuning the pretrained roberta-base on the readability task: notebook [here](https://www.kaggle.com/chamecall/clrp-finetune).<br>
* Model inference: this notebook [*CURRENT ONE*].<br>

In [1]:

import os
from pathlib import Path
# Root of the fine-tuning notebook's output dataset. Its `scripts` sub-folder
# holds the helper files executed in the next cell; its `models` sub-folder
# is read by the first inference cell.
in_folder_path = Path('../input/clrp-finetune')
scripts_dir = in_folder_path / 'scripts'

In [2]:

# Execute the helper scripts of the fine-tuning dataset directly in the
# notebook namespace, in this fixed order (imports first, then config,
# dataset and model definitions).
# NOTE(review): exec() runs whatever these files contain; only point
# `scripts_dir` at a dataset you trust.
os.chdir(scripts_dir)
try:
    for _script_name in ("imports.py", "config.py", "dataset.py", "model.py"):
        exec(Path(_script_name).read_text())
finally:
    # Return to the working directory even when a script raises, so the
    # relative paths used by later cells ('../input/...', 'submission.csv')
    # still resolve.
    os.chdir('/kaggle/working')

In [3]:

test_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/test.csv")

# NOTE(review): torch.load unpickles arbitrary Python objects; the tokenizer
# and model files below must come from a trusted dataset.
tokenizer = torch.load('../input/tokenizers/roberta-tokenizer.pt')
models_folder_path = in_folder_path / 'models'

# One entry (a sequence of per-row predictions) is appended per model; the
# second inference cell further down appends to the same list.
models_preds = []

# Indices of the fine-tuned checkpoints (best_model_<idx>.pt) to ensemble.
# The name is kept for compatibility although it holds a list, not a count.
n_models = [0, 2]

# The test set is identical for every checkpoint, so it is tokenized and
# wrapped in a loader once instead of once per model.
test_ds = CLRPDataset(data=test_df, tokenizer=tokenizer, max_len=Config.max_len, is_test=True)
test_sampler = SequentialSampler(test_ds)
test_dataloader = DataLoader(test_ds, sampler=test_sampler, batch_size=Config.batch_size)

for position, model_num in enumerate(n_models, start=1):
    # Report progress as "<position>/<total>" rather than printing the whole list.
    print(f'Inference#{position}/{len(n_models)} (best_model_{model_num}.pt)')

    # map_location lets a checkpoint saved on GPU load on a CPU-only kernel.
    model = torch.load(models_folder_path / f'best_model_{model_num}.pt',
                       map_location=Config.device).to(Config.device)
    model.eval()

    all_preds = []
    with torch.no_grad():
        for batch in test_dataloader:
            sent_id = batch['input_ids'].to(Config.device)
            mask = batch['attention_mask'].to(Config.device)
            preds = model(sent_id, mask)
            all_preds.extend(preds.flatten().cpu().tolist())

    models_preds.append(all_preds)

Inference#1/[0, 2]
Inference#3/[0, 2]


In [4]:
# models_preds = np.array(models_preds)
# print(models_preds.shape)
# # print(models_preds)
# model1_predictions = models_preds.mean(axis=0)

# model 2

In [5]:
# Settings for the second model family (LitModel / LitDataset below).
# NOTE(review): NUM_FOLDS, NUM_EPOCHS and EVAL_SCHEDULE are not referenced by
# any cell visible in this notebook — presumably left over from the training
# notebook; confirm before removing.
NUM_FOLDS = 5
NUM_EPOCHS = 3
BATCH_SIZE = 16  # batch size of the test DataLoader in the inference cell
MAX_LEN = 300  # pad/truncate length passed to the tokenizer in LitDataset (previously 248)
EVAL_SCHEDULE = [(0.50, 16), (0.49, 8), (0.48, 4), (0.47, 2), (-1., 1)]
# Weights handed to AutoModel.from_pretrained in LitModel.
# Earlier value: "../input/clrp-roberta-base/clrp_roberta_base"
ROBERTA_PATH = "../input/pre-trained-roberta-solution-in-pytorch-train/roberta-base/pytorch_model.bin"
# Folder handed to AutoTokenizer.from_pretrained.
TOKENIZER_PATH = "../input/pre-trained-roberta-solution-in-pytorch-train/roberta-base/"
# Config file handed to AutoConfig.from_pretrained in LitModel.
CONFIG_PATH = "../input/pre-trained-roberta-solution-in-pytorch-train/roberta-base/config.json"
# Device that eval_mse / predict move batches to and the models are moved to.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [6]:
def set_random_seed(random_seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's `random`, NumPy's legacy global generator and PyTorch
    (CPU and all CUDA devices), exports PYTHONHASHSEED, and configures
    cuDNN for deterministic behaviour.

    Args:
        random_seed: integer seed applied to all generators.
    """
    random.seed(random_seed)
    np.random.seed(random_seed)
    # Only affects processes started after this point; the hash seed of the
    # running interpreter is already fixed.
    os.environ["PYTHONHASHSEED"] = str(random_seed)

    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)

    torch.backends.cudnn.deterministic = True
    # Autotuning may select different convolution algorithms from run to run,
    # so it is switched off together with the deterministic flag.
    torch.backends.cudnn.benchmark = False

In [7]:
train_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/train.csv")

# Rows whose target and standard_error are both 0 are treated as incomplete
# entries and removed; the index is renumbered afterwards.
incomplete_rows = (train_df.target == 0) & (train_df.standard_error == 0)
train_df = train_df.loc[~incomplete_rows].reset_index(drop=True)

test_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/test.csv")
submission_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/sample_submission.csv")

In [8]:
# Rebinds the global `tokenizer` (previously loaded with torch.load for the
# first model family). LitDataset reads this global when it encodes text.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)


## Dataset

In [9]:
class LitDataset(Dataset):
    """Tokenized view of the `excerpt` column of a DataFrame.

    Every excerpt is encoded once, in the constructor, with the module-level
    `tokenizer`; sequences are padded and truncated to `MAX_LEN`.

    Args:
        df: frame with an `excerpt` column and, unless `inference_only` is
            set, a `target` column.
        inference_only: when True no targets are read and each item is
            `(input_ids, attention_mask)`; otherwise each item is
            `(input_ids, attention_mask, target)`.
    """

    def __init__(self, df, inference_only=False):
        super().__init__()

        self.df = df
        self.inference_only = inference_only
        self.text = df.excerpt.tolist()

        if not inference_only:
            self.target = torch.tensor(df.target.values, dtype=torch.float32)

        self.encoded = tokenizer.batch_encode_plus(
            self.text,
            padding='max_length',
            max_length=MAX_LEN,
            truncation=True,
            return_attention_mask=True,
        )

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        # Build the two model inputs for this row as tensors.
        features = tuple(
            torch.tensor(self.encoded[key][index])
            for key in ('input_ids', 'attention_mask')
        )
        if self.inference_only:
            return features
        return features + (self.target[index],)

## model

In [10]:
class LitModel(nn.Module):
    """Transformer encoder with attention pooling and a linear regression head.

    The encoder is built from `CONFIG_PATH` / `ROBERTA_PATH`. Sub-module names
    (`roberta`, `attention`, `regressor`) and their Sequential layout define
    the state_dict keys, so they must stay as they are for the saved
    checkpoints to load.
    """

    def __init__(self):
        super().__init__()

        encoder_config = AutoConfig.from_pretrained(CONFIG_PATH)
        encoder_config.update({
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.0,
            "layer_norm_eps": 1e-7,
        })
        self.roberta = AutoModel.from_pretrained(ROBERTA_PATH, config=encoder_config)

        # Scores each position with a small MLP; the softmax over dim 1
        # turns the scores into weights that sum to 1 along that dimension.
        self.attention = nn.Sequential(
            nn.Linear(768, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1),
        )

        # Maps the pooled 768-wide vector to a single score.
        self.regressor = nn.Sequential(
            nn.Linear(768, 1),
        )

    def forward(self, input_ids, attention_mask):
        """Return one regression score per input sequence.

        Args:
            input_ids: token ids, forwarded to the encoder.
            attention_mask: mask forwarded to the encoder (it is not applied
                to the pooling weights).
        """
        encoder_out = self.roberta(input_ids=input_ids,
                                   attention_mask=attention_mask)

        # Last entry of the hidden-state tuple, i.e. the output of the final
        # encoder layer; presumably (batch, seq_len, 768).
        token_states = encoder_out.hidden_states[-1]

        # One weight per position, presumably (batch, seq_len, 1).
        token_weights = self.attention(token_states)

        # Weighted sum over dim 1 collapses the sequence into one vector
        # per example.
        pooled = (token_weights * token_states).sum(dim=1)

        return self.regressor(pooled)

In [11]:
def eval_mse(model, data_loader):
    """Return the mean squared error of |model| over |data_loader|.

    The loader must yield (input_ids, attention_mask, target) batches. The
    squared errors are summed over all batches and divided by the number of
    items in the loader's dataset.
    """
    model.eval()
    criterion = nn.MSELoss(reduction="sum")
    squared_error_total = 0

    with torch.no_grad():
        for input_ids, attention_mask, target in data_loader:
            batch_pred = model(input_ids.to(DEVICE), attention_mask.to(DEVICE))
            batch_error = criterion(batch_pred.flatten(), target.to(DEVICE))
            squared_error_total += batch_error.item()

    return squared_error_total / len(data_loader.dataset)

In [12]:
def predict(model, data_loader):
    """Return an np.array with the predictions of |model| on |data_loader|.

    The loader must yield (input_ids, attention_mask) batches. Predictions
    are written into a pre-allocated array in the order the loader yields
    them, one value per dataset item.
    """
    model.eval()

    predictions = np.zeros(len(data_loader.dataset))
    offset = 0

    with torch.no_grad():
        for input_ids, attention_mask in data_loader:
            batch_pred = model(input_ids.to(DEVICE), attention_mask.to(DEVICE))
            batch_len = batch_pred.shape[0]

            # Copy this batch into its slot of the output array.
            predictions[offset:offset + batch_len] = batch_pred.flatten().cpu().numpy()
            offset += batch_len

    return predictions

## infer

In [13]:
# 0-based indices of the checkpoints to ensemble; the files on disk are
# numbered from 1, hence `val + 1` when the path is built.
model_num = [1, 3, 4]

# Tokenize the test set once and reuse the loader for every checkpoint.
test_dataset = LitDataset(test_df, inference_only=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                         drop_last=False, shuffle=False, num_workers=2)

# NOTE: predictions are appended to the `models_preds` list created by the
# first inference cell; re-running only this cell adds duplicate entries.
for val in model_num:
    model_path = f"../input/pre-trained-roberta-solution-in-pytorch-train/model_{val + 1}.pth"
    print(f"\nUsing {model_path}")

    model = LitModel()
    # map_location lets weights saved on GPU load on a CPU-only kernel.
    model.load_state_dict(torch.load(model_path, map_location=DEVICE))
    model.to(DEVICE)

    models_preds.append(predict(model, test_loader))

    # Drop the model before the next one is built to release its memory.
    del model
    gc.collect()


Using ../input/pre-trained-roberta-solution-in-pytorch-train/model_2.pth

Using ../input/pre-trained-roberta-solution-in-pytorch-train/model_4.pth

Using ../input/pre-trained-roberta-solution-in-pytorch-train/model_5.pth


In [14]:
# model2_predictions = all_predictions.mean(axis=0)

In [15]:
# predictions = model1_predictions * 0.5 + model2_predictions * 0.5
# Stack the per-model predictions into a 2-D array (one row per model) and
# take the unweighted mean over the model axis.
models_preds = np.asarray(models_preds)
print(models_preds.shape)
predictions = np.mean(models_preds, axis=0)

(5, 7)


In [16]:
# # results = pd.DataFrame(np.vstack((model1_predictions, model2_predictions, model3_predictions, predictions)).transpose(), 
# #                        columns=['model1','model2','model3','ensemble'])
# results = pd.DataFrame(np.vstack((model1_predictions, model2_predictions, predictions)).transpose(), 
#                        columns=['model1','model2','ensemble'])
# results.head()

In [17]:
# submission_df.target = predictions
# print(submission_df)
# submission_df.to_csv("submission.csv", index=False)

In [18]:
# Attach the ensemble predictions to the test frame, then show and save only
# the two columns that go into submission.csv.
test_df['target'] = predictions
cols_to_keep = ['id', 'target']
submission = test_df.loc[:, cols_to_keep]
print(submission)
submission.to_csv("submission.csv", index=False)

          id    target
0  c0f722661 -0.445603
1  f0953f0a5 -0.582775
2  0df072751 -0.430027
3  04caf4e0c -2.531679
4  0e63f8bea -1.707110
5  12537fe78 -1.540228
6  965e592c0  0.108431
