# 📖 Torch Roberta - ITPT - Intra-task pre-training

![](https://storage.googleapis.com/kaggle-competitions/kaggle/31779/logos/header.png)

## Intra-task pre-training of a `roberta-large` (but trivially adaptable to any MLM model) over the [Feedback Prize - Evaluating Student Writing](https://www.kaggle.com/c/feedback-prize-2021)


Based on this notebook by [torch](): [CommonLit Readability Prize - RoBERTa Torch|ITPT](https://www.kaggle.com/rhtsingh/commonlit-readability-prize-roberta-torch-itpt), which in turn is based on this script by huggingface: https://github.com/huggingface/transformers/blob/master/examples/pytorch/language-modeling/run_mlm_no_trainer.py

A good learning reference for this is also the Chapter 7 of the HuggingFace course, there is a detailed step-by-step explanation of the code atoms of this notebook: [Chapter 7 - Section 3 - Fine-tuning a masked language model](https://huggingface.co/course/chapter7/3?fw=pt).


# 🤗 Please _DO_ upvote if you find this helpful or interesting! 🤗


# Imports

In [1]:
import os
import math
import logging
import pandas as pd
from tqdm import tqdm

import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import AutoModelForMaskedLM, AutoTokenizer,\
                         AdamW, DataCollatorForLanguageModeling,\
                         get_scheduler, AutoConfig
import codecs
from typing import Dict, List, Tuple
from text_unidecode import unidecode

In [2]:
os.environ["WANDB_DISABLED"] = "true" # disable wandb

# Configuration

In [3]:
class Config:
#     model_name = 'roberta-large'
    model_name = "microsoft/deberta-v3-base"
    max_length = 1024
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    validation_size = 0.05
    mlm_probability = 0.15
    
    train_batch_size = 2
    eval_batch_size = 2
    
    learning_rate = 2.5e-5
    
    num_train_epochs = 4
        
    lr_scheduler_type = 'constant_with_warmup'
    num_warmup_steps = 0

args = Config()

## Read Data

In [4]:
input_path_2021 = '../input/feedback-prize-2021/'
input_path_2022 = '../input/feedback-prize-effectiveness/'

# Create one CSV with all the texts

In [5]:
def create_mlm_csv(path, year):
    """ Read all training texts to a csv file with one column 'text' """
    texts = []
    
    for f in tqdm(list(os.listdir(path))):
        with open(path + f, 'r') as fp:
            texts.append(fp.read())
    
    df = pd.DataFrame({'text': texts})
    
    display(df.head())
#     df.to_csv(year + "mlm_train.csv", index=False)
    return df

In [6]:
train_2021 = create_mlm_csv(input_path_2021+'train/', '2021')
train_2022 = create_mlm_csv(input_path_2022+'train/', '2022')

100%|██████████| 15594/15594 [01:23<00:00, 185.91it/s]


Unnamed: 0,text
0,I think we should be able to play in a sport i...
1,Some schools require summer projects for stude...
2,Driverless cars have been argued and talked ab...
3,"The author of ""The Challenge of Exploring Venu..."
4,"Wow, from the mar really look like humans face..."


100%|██████████| 4191/4191 [00:21<00:00, 196.47it/s]


Unnamed: 0,text
0,"The author of ""The Challenge of Exploring Venu..."
1,"Wow, from the mar really look like humans face..."
2,Scientists wants to keep on exploring venus ev...
3,I reconsider that are principal should let us ...
4,"In 2013, BMW announces the development of the ..."


## Encoding Error

In [7]:
# https://www.kaggle.com/code/brandonhu0215/feedback-deberta-large-lb0-619

def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    return error.object[error.start : error.end].encode("utf-8"), error.end


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    return error.object[error.start : error.end].decode("cp1252"), error.end

codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)


def resolve_encodings_and_normalize(text: str) -> str:
    text = (
        text.encode("raw_unicode_escape")
        .decode("utf-8", errors="replace_decoding_with_cp1252")
        .encode("cp1252", errors="replace_encoding_with_utf8")
        .decode("utf-8", errors="replace_decoding_with_cp1252")
    )
    
    text = unidecode(text)
    
    return text


def fetch_essay(essay_id: str, txt_dir: str):
    essay_path = os.path.join(input_path_2021 + txt_dir, essay_id + '.txt')
    # print(input_path_2022 + txt_dir, essay_id + '.txt')
    essay_text = open(essay_path, 'r').read()
    
    return essay_text

In [8]:
train_2021['text'] = train_2021['text'].apply(resolve_encodings_and_normalize)
train_2022['text'] = train_2022['text'].apply(resolve_encodings_and_normalize)

In [9]:
df  = pd.concat([train_2021, train_2022]).drop_duplicates()

In [10]:
df.to_csv("mlm_train.csv", index=False)

# Model and Tokenizer

In [11]:
model = AutoModelForMaskedLM.from_pretrained(args.model_name).to(args.device)

Downloading:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/354M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForMaskedLM: ['mask_predictions.LayerNorm.bias', 'mask_predictions.dense.weight', 'deberta.embeddings.position_embeddings.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.weight', 'mask_predictions.classifier.bias']
- This IS expected if you are initializing DebertaV2ForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model

In [12]:
tokenizer = AutoTokenizer.from_pretrained(args.model_name)

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Dataset and DataLoader

In [13]:
def tokenize_function(examples):
    return tokenizer(examples['text'], return_special_tokens_mask=True)

def group_texts(examples):
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    total_length = (total_length // args.max_length) * args.max_length
    result = {
        k: [t[i : i + args.max_length] for i in range(0, total_length, args.max_length)]
        for k, t in concatenated_examples.items()
    }
    return result

In [14]:
raw_datasets = load_dataset("csv", data_files={'train': 'mlm_train.csv'})

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True, remove_columns=['text'])\
                                 .map(group_texts, batched=True)

tokenized_datasets = tokenized_datasets["train"].train_test_split(test_size=args.validation_size)
tokenized_datasets['validation'] = tokenized_datasets.pop("test")


data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=args.mlm_probability)


dl_train = DataLoader(tokenized_datasets["train"], 
                      shuffle=True, 
                      collate_fn=data_collator, 
                      batch_size=args.train_batch_size)

dl_val = DataLoader(tokenized_datasets["validation"], collate_fn=data_collator, batch_size=args.eval_batch_size)

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-ffcbdf97cac509ca/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-ffcbdf97cac509ca/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?ba/s]

  0%|          | 0/16 [00:00<?, ?ba/s]

# Optimizer and Scheduler

In [15]:
optimizer = AdamW(model.parameters(), lr=args.learning_rate)

num_training_steps = args.num_train_epochs * len(dl_train)
lr_scheduler = get_scheduler(
    name=args.lr_scheduler_type,
    optimizer=optimizer,
    num_warmup_steps=args.num_warmup_steps,
    num_training_steps=num_training_steps,
)

# Training/validation loop

In [16]:
print("***** Running training *****")
print(f"  Num examples = {len(tokenized_datasets['train'])}")
print(f"  Num Epochs = {args.num_train_epochs}")
print(f"  Total training steps = {num_training_steps}")

***** Running training *****
  Num examples = 7161
  Num Epochs = 4
  Total training steps = 14324


In [17]:
# from transformers import TrainingArguments

# batch_size = 4
# # Show the training loss with every epoch
# logging_steps = len(dataset_split["train"]) // batch_size
# model_name = args.model_name

# training_args = TrainingArguments(
#     output_dir=f"/kaggle/working/{model_name}-finetuned-essay",
#     overwrite_output_dir=False,
#     evaluation_strategy="epoch",
#     learning_rate=2e-5,
#     weight_decay=0.01,
#     per_device_train_batch_size=batch_size,
#     per_device_eval_batch_size=batch_size,
#     push_to_hub=False,
#     fp16=True,
#     logging_steps=logging_steps,
# )

In [18]:
# from transformers import Trainer

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=dataset_split["train"],
#     eval_dataset=dataset_split["validation"],
#     data_collator=data_collator,
# )

In [19]:
# trainer.train()

In [20]:
progress_bar = tqdm(range(num_training_steps))
completed_steps = 0

for epoch in range(args.num_train_epochs):
    model.train()
    cum_loss = 0
    for batch_idx, batch in enumerate(dl_train, 1):
        
        outputs = model(**{k: v.to(args.device) for k, v in batch.items()})
        loss = outputs.loss
        cum_loss += loss.item()
        
        
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
        progress_bar.set_postfix({'loss': cum_loss / batch_idx})
        #if batch_idx > 100:
        #    break

    model.eval()
    losses = []
    for batch_idx, batch in enumerate(dl_val, 1):
        with torch.no_grad():
            outputs = model(**{k: v.to(args.device) for k, v in batch.items()})

        loss = outputs.loss
        losses.append(loss)
        #if batch_idx > 100:
        #    break

    losses = torch.tensor(losses)
    losses = losses[: len(tokenized_datasets['validation'])]
    perplexity = math.exp(torch.mean(losses))

#     print(f"Epoch {epoch}: perplexity: {perplexity}")
    model.save_pretrained(f'Deberta_large-v3-itpt-e{epoch}')

100%|██████████| 14324/14324 [2:51:12<00:00,  1.60it/s, loss=1.93]

In [21]:
# trainer.model.save_pretrained('mlm-model')

In [22]:
config = AutoConfig.from_pretrained(args.model_name)

tokenizer.save_pretrained('model_tokenizer')
config.save_pretrained('model_tokenizer')

# 🤗 Please _DO_ upvote if you find this helpful or interesting! 🤗