In [1]:
import os
import torch
import pandas as pd
import numpy as np

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AdamW,
    get_cosine_schedule_with_warmup,
)
from sklearn.metrics import mean_squared_error

from clrp_utils import (
    seed_everything,
    create_folds,
    create_dataloaders,
    train_fold,
    oof_predictions,
)

In [2]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"
BATCH_SIZE = 8
VAL_BATCH_SIZE = 16
LR = 2.0e-5
LR_CLF = 5.0e-4
WARMUP_RATIO = 0.1
EPOCHS = 3
SEED_VAL = 3
VAL_STEP = [100, 50, 20]
DEVICE = torch.device("cuda")
WEIGHT_DECAY = 0.01
WEIGHT_DECAY_CLF = 0.00
NUM_FOLDS = 5
FOLDS_RANDOM_STATE = 1325
GRADIENT_CLIPPING = True
TRAIN_CSV = "~/datasets/commonlitreadabilityprize/train.csv"

model_cfg = {
    "model": "roberta-base",
    "weights_dir": "",
    "tokenizer": "roberta-base",
    "max_len": 256,
    "hidden_dropout_prob": 0.0,
    "attention_probs_dropout_prob": 0.1,
}

In [3]:
seed_everything(SEED_VAL)

tokenizer = AutoTokenizer.from_pretrained(model_cfg["tokenizer"])

df = pd.read_csv(TRAIN_CSV)
df = create_folds(df, num_splits=NUM_FOLDS, random_state=FOLDS_RANDOM_STATE)

In [4]:
best_val_losses = list()

for fold in range(NUM_FOLDS):

    train_set, valid_set = df[df["kfold"] != fold], df[df["kfold"] == fold]

    train_dataloader, validation_dataloader = create_dataloaders(
        tokenizer,
        train_set,
        valid_set=valid_set,
        max_len=model_cfg["max_len"],
        train_batch_size=BATCH_SIZE,
        valid_batch_size=VAL_BATCH_SIZE,
    )

    if "bert" in model_cfg["model"]:
        model = AutoModelForSequenceClassification.from_pretrained(
            model_cfg["model"],
            num_labels=1,
            output_attentions=False,
            output_hidden_states=False,
            hidden_dropout_prob=model_cfg["hidden_dropout_prob"],
            attention_probs_dropout_prob=model_cfg["attention_probs_dropout_prob"],
        )
    else:
        model = AutoModelForSequenceClassification.from_pretrained(
            model_cfg["model"],
            num_labels=1,
            output_attentions=False,
            output_hidden_states=False,
            dropout=model_cfg["droput"],
            summary_last_dropout=model_cfg["summary_last_dropout"],
        )

    model = model.to(DEVICE)

    classifier = ["classifier"]
    optimizer_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if "classifier" not in n],
            "lr": LR,
            "weight_decay_rate": WEIGHT_DECAY,
        },
        {
            "params": [p for n, p in model.named_parameters() if "classifier" in n],
            "lr": LR_CLF,
            "weight_decay_rate": WEIGHT_DECAY_CLF,
        },
    ]

    optimizer = AdamW(
        optimizer_parameters,
        lr=LR,
        betas=(0.9, 0.98),
        weight_decay=WEIGHT_DECAY,
        eps=1e-6,
        correct_bias=False,
    )

    total_steps = len(train_dataloader) * EPOCHS

    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=WARMUP_RATIO * total_steps,
        num_training_steps=total_steps,
    )

    best_val_loss = train_fold(
        model,
        optimizer,
        scheduler,
        train_dataloader,
        validation_dataloader,
        DEVICE,
        fold,
        model_cfg["model"],
        epochs=EPOCHS,
        val_step=VAL_STEP,
        num_folds=NUM_FOLDS,
        gradient_clipping=GRADIENT_CLIPPING,
    )
    best_val_losses.append(best_val_loss)

    torch.cuda.empty_cache()
    del train_dataloader, validation_dataloader, model, optimizer, scheduler

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi


Fold 1 / 5   Epoch 1 / 3   Batch 100 / 284   Val_Loss: 0.6240   Best_Val_Loss: 0.6240
Fold 1 / 5   Epoch 1 / 3   Batch 200 / 284   Val_Loss: 0.6485   Best_Val_Loss: 0.6240
Fold 1 / 5   Epoch 1 / 3   Batch 284 / 284   Val_Loss: 0.5674   Best_Val_Loss: 0.5674
  Average training loss: 0.7142
  Best Val Loss: 0.5674
  Training epoch took: 0:01:26

Fold 1 / 5   Epoch 2 / 3   Batch  50 / 284   Val_Loss: 0.5253   Best_Val_Loss: 0.5253
Fold 1 / 5   Epoch 2 / 3   Batch 100 / 284   Val_Loss: 0.5286   Best_Val_Loss: 0.5253
Fold 1 / 5   Epoch 2 / 3   Batch 150 / 284   Val_Loss: 0.5619   Best_Val_Loss: 0.5253
Fold 1 / 5   Epoch 2 / 3   Batch 200 / 284   Val_Loss: 0.5011   Best_Val_Loss: 0.5011
Fold 1 / 5   Epoch 2 / 3   Batch 250 / 284   Val_Loss: 0.5417   Best_Val_Loss: 0.5011
Fold 1 / 5   Epoch 2 / 3   Batch 284 / 284   Val_Loss: 0.4974   Best_Val_Loss: 0.4974
  Average training loss: 0.4427
  Best Val Loss: 0.4974
  Training epoch took: 0:01:43

Fold 1 / 5   Epoch 3 / 3   Batch  20 / 284   Val_

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi


Fold 2 / 5   Epoch 1 / 3   Batch 100 / 284   Val_Loss: 1.1038   Best_Val_Loss: 1.1038
Fold 2 / 5   Epoch 1 / 3   Batch 200 / 284   Val_Loss: 0.6133   Best_Val_Loss: 0.6133
Fold 2 / 5   Epoch 1 / 3   Batch 284 / 284   Val_Loss: 0.5323   Best_Val_Loss: 0.5323
  Average training loss: 0.7332
  Best Val Loss: 0.5323
  Training epoch took: 0:01:28

Fold 2 / 5   Epoch 2 / 3   Batch  50 / 284   Val_Loss: 0.5477   Best_Val_Loss: 0.5323
Fold 2 / 5   Epoch 2 / 3   Batch 100 / 284   Val_Loss: 0.5357   Best_Val_Loss: 0.5323
Fold 2 / 5   Epoch 2 / 3   Batch 150 / 284   Val_Loss: 0.4878   Best_Val_Loss: 0.4878
Fold 2 / 5   Epoch 2 / 3   Batch 200 / 284   Val_Loss: 0.4835   Best_Val_Loss: 0.4835
Fold 2 / 5   Epoch 2 / 3   Batch 250 / 284   Val_Loss: 0.4977   Best_Val_Loss: 0.4835
Fold 2 / 5   Epoch 2 / 3   Batch 284 / 284   Val_Loss: 0.4979   Best_Val_Loss: 0.4835
  Average training loss: 0.4579
  Best Val Loss: 0.4835
  Training epoch took: 0:01:42

Fold 2 / 5   Epoch 3 / 3   Batch  20 / 284   Val_

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi


Fold 3 / 5   Epoch 1 / 3   Batch 100 / 284   Val_Loss: 0.7001   Best_Val_Loss: 0.7001
Fold 3 / 5   Epoch 1 / 3   Batch 200 / 284   Val_Loss: 0.5704   Best_Val_Loss: 0.5704
Fold 3 / 5   Epoch 1 / 3   Batch 284 / 284   Val_Loss: 0.5168   Best_Val_Loss: 0.5168
  Average training loss: 0.7043
  Best Val Loss: 0.5168
  Training epoch took: 0:01:28

Fold 3 / 5   Epoch 2 / 3   Batch  50 / 284   Val_Loss: 0.5142   Best_Val_Loss: 0.5142
Fold 3 / 5   Epoch 2 / 3   Batch 100 / 284   Val_Loss: 0.5862   Best_Val_Loss: 0.5142
Fold 3 / 5   Epoch 2 / 3   Batch 150 / 284   Val_Loss: 0.5226   Best_Val_Loss: 0.5142
Fold 3 / 5   Epoch 2 / 3   Batch 200 / 284   Val_Loss: 0.5667   Best_Val_Loss: 0.5142
Fold 3 / 5   Epoch 2 / 3   Batch 250 / 284   Val_Loss: 0.5384   Best_Val_Loss: 0.5142
Fold 3 / 5   Epoch 2 / 3   Batch 284 / 284   Val_Loss: 0.5097   Best_Val_Loss: 0.5097
  Average training loss: 0.4427
  Best Val Loss: 0.5097
  Training epoch took: 0:01:42

Fold 3 / 5   Epoch 3 / 3   Batch  20 / 284   Val_

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi


Fold 4 / 5   Epoch 1 / 3   Batch 100 / 284   Val_Loss: 0.6797   Best_Val_Loss: 0.6797
Fold 4 / 5   Epoch 1 / 3   Batch 200 / 284   Val_Loss: 0.6709   Best_Val_Loss: 0.6709
Fold 4 / 5   Epoch 1 / 3   Batch 284 / 284   Val_Loss: 0.6214   Best_Val_Loss: 0.6214
  Average training loss: 0.7064
  Best Val Loss: 0.6214
  Training epoch took: 0:01:28

Fold 4 / 5   Epoch 2 / 3   Batch  50 / 284   Val_Loss: 0.5524   Best_Val_Loss: 0.5524
Fold 4 / 5   Epoch 2 / 3   Batch 100 / 284   Val_Loss: 0.4915   Best_Val_Loss: 0.4915
Fold 4 / 5   Epoch 2 / 3   Batch 150 / 284   Val_Loss: 0.4961   Best_Val_Loss: 0.4915
Fold 4 / 5   Epoch 2 / 3   Batch 200 / 284   Val_Loss: 0.4834   Best_Val_Loss: 0.4834
Fold 4 / 5   Epoch 2 / 3   Batch 250 / 284   Val_Loss: 0.4954   Best_Val_Loss: 0.4834
Fold 4 / 5   Epoch 2 / 3   Batch 284 / 284   Val_Loss: 0.5231   Best_Val_Loss: 0.4834
  Average training loss: 0.4585
  Best Val Loss: 0.4834
  Training epoch took: 0:01:43

Fold 4 / 5   Epoch 3 / 3   Batch  20 / 284   Val_

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi


Fold 5 / 5   Epoch 1 / 3   Batch 100 / 284   Val_Loss: 0.7368   Best_Val_Loss: 0.7368
Fold 5 / 5   Epoch 1 / 3   Batch 200 / 284   Val_Loss: 0.6031   Best_Val_Loss: 0.6031
Fold 5 / 5   Epoch 1 / 3   Batch 284 / 284   Val_Loss: 0.5366   Best_Val_Loss: 0.5366
  Average training loss: 0.6957
  Best Val Loss: 0.5366
  Training epoch took: 0:01:28

Fold 5 / 5   Epoch 2 / 3   Batch  50 / 284   Val_Loss: 0.5801   Best_Val_Loss: 0.5366
Fold 5 / 5   Epoch 2 / 3   Batch 100 / 284   Val_Loss: 0.5208   Best_Val_Loss: 0.5208
Fold 5 / 5   Epoch 2 / 3   Batch 150 / 284   Val_Loss: 0.5146   Best_Val_Loss: 0.5146
Fold 5 / 5   Epoch 2 / 3   Batch 200 / 284   Val_Loss: 0.4928   Best_Val_Loss: 0.4928
Fold 5 / 5   Epoch 2 / 3   Batch 250 / 284   Val_Loss: 0.4898   Best_Val_Loss: 0.4898
Fold 5 / 5   Epoch 2 / 3   Batch 284 / 284   Val_Loss: 0.4980   Best_Val_Loss: 0.4898
  Average training loss: 0.4493
  Best Val Loss: 0.4898
  Training epoch took: 0:01:43

Fold 5 / 5   Epoch 3 / 3   Batch  20 / 284   Val_

In [5]:
print("\nBest Val Losses:")
for i, loss in enumerate(best_val_losses):
    print("Fold: {:}   Loss: {:.5f}".format(i, loss))


Best Val Losses:
Fold: 0   Loss: 0.48605
Fold: 1   Loss: 0.46094
Fold: 2   Loss: 0.48767
Fold: 3   Loss: 0.46103
Fold: 4   Loss: 0.46878


In [6]:
oof_preds = oof_predictions(df, model_cfg, DEVICE)

oof_combined = np.zeros(len(df))
for fold in oof_preds:
    oof_combined[oof_preds[fold]["val_index"]] += oof_preds[fold]["preds"]

cv_score = np.sqrt(mean_squared_error(df.target.values, oof_combined))
print("CV score = {:.5f}".format(cv_score))

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

Model 0:   0%|          | 0/71 [00:00<?, ?it/s]

Model 1:   0%|          | 0/71 [00:00<?, ?it/s]

Model 2:   0%|          | 0/71 [00:00<?, ?it/s]

Model 3:   0%|          | 0/71 [00:00<?, ?it/s]

Model 4:   0%|          | 0/71 [00:00<?, ?it/s]

CV score = 0.47956
