# Model pretraining
https://www.kaggle.com/maunish/clrp-pytorch-roberta-pretrain

In [1]:
import pandas as pd
from pathlib import Path
import warnings

warnings.filterwarnings("ignore")

from transformers import (
    AutoModel,
    AutoModelForMaskedLM,
    AutoTokenizer,
    LineByLineTextDataset,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

INPUT_PATH = Path("/mnt/storage_dimm2/kaggle_data/commonlitreadabilityprize")
OUTPUT_PATH = Path("/mnt/storage/kaggle_output/commonlitreadabilityprize")
MODEL_CACHE = Path("/mnt/storage/model_cache/torch")

In [2]:
train_data = pd.read_csv(INPUT_PATH / 'train.csv')
test_data = pd.read_csv(INPUT_PATH / 'test.csv')

# goodreads = pd.read_csv('../input/goodreads-books-100k/GoodReads_100k_books.csv')
# goodreads['desc'] = goodreads['desc'].apply(lambda x: str(x))
# extra = pd.read_table('../input/clrp-external/weebit_reextracted.tsv')
# one_stop_english = pd.read_csv('../input/onestopenglishcorpus/all_data.csv')
# jigsaw = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip')

text  = '.'.join(train_data.excerpt.tolist() + test_data.excerpt.tolist())

with open(INPUT_PATH / 'pretrain_text.txt','w') as f:
    f.write(text)

In [3]:
model_name = 'deepset/roberta-large-squad2'

model = AutoModelForMaskedLM.from_pretrained(model_name, cache_dir=MODEL_CACHE)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=MODEL_CACHE)

Some weights of the model checkpoint at deepset/roberta-large-squad2 were not used when initializing RobertaForMaskedLM: ['qa_outputs.weight', 'qa_outputs.bias']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at deepset/roberta-large-squad2 and are newly initialized: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it f

In [4]:
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=str(INPUT_PATH / "pretrain_text.txt"),
    block_size=128,
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

training_args = TrainingArguments(
    output_dir=str(OUTPUT_PATH / "pretraining" / model_name),
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=32,
    save_total_limit=2,
    prediction_loss_only=True,
    report_to="none",
)

trainer = Trainer(
    model=model, args=training_args, data_collator=data_collator, train_dataset=dataset
)

In [5]:
trainer.train()

100%|██████████| 134/134 [03:24<00:00,  1.53s/it]


{'train_runtime': 204.6879, 'train_samples_per_second': 0.655, 'epoch': 2.0}


TrainOutput(global_step=134, training_loss=5.318245958926073, metrics={'train_runtime': 204.6879, 'train_samples_per_second': 0.655, 'epoch': 2.0, 'init_mem_cpu_alloc_delta': 2099240960, 'init_mem_gpu_alloc_delta': 1421652992, 'init_mem_cpu_peaked_delta': 1417252864, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 1268400128, 'train_mem_gpu_alloc_delta': 4265105408, 'train_mem_cpu_peaked_delta': 32768, 'train_mem_gpu_peaked_delta': 12647294464})

In [6]:
trainer.save_model(str(OUTPUT_PATH / "pretraining" / model_name))