In [None]:
# `%pip` installs into the environment of the running kernel; versions are pinned
# to the ones this notebook was last executed with so that re-runs are reproducible.
%pip install accelerate==0.22.0 datasets==2.14.4 "transformers[sentencepiece]==4.32.1"

Collecting accelerate
  Downloading accelerate-0.22.0-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.2/251.2 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers[sentencepiece]
  Downloading transformers-4.32.1-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━

In [None]:
import logging
import math
import random
import sentencepiece

from accelerate import Accelerator
from datasets import Dataset, DatasetDict
import numpy as np
import pandas as pd
import torch
from torch.optim import AdamW
from torch.utils.data.dataloader import DataLoader
from tqdm.auto import tqdm
from transformers import AutoModelForMaskedLM, AutoTokenizer, DataCollatorForWholeWordMask, set_seed, TrainingArguments

In [None]:
# Hub checkpoint name; the same value is used to load both the model and its tokenizer.
MODEL_CHECKPOINT = "microsoft/mdeberta-v3-base"
# Header-less, single-column CSV with one preprocessed anamnesis text per row.
PATH_TO_PREPROCESSED_ANAMNESIS = "./all_anamnesis.csv"

# Directory the (unchanged) tokenizer is saved to.
PATH_TO_NEW_TOKENIZER = "./tokenizer"

# Number of top mask-fill candidates printed by `check_model_prediction`.
CHECK_MODEL_NUMBER_CANDIDATE = 20

# Fraction of the shuffled texts that goes to the train split; the rest is the test split.
TRAIN_PART_ANAMNESIS = 0.8

# Length (in tokens) of the fixed-size blocks the concatenated texts are cut into.
GROUPING_TEXT_CHUNK_SIZE = 512

# Passed as `mlm_probability` to the masking data collator.
MLM_PROBABILITY = 0.15

# Values forwarded to `TrainingArguments` in `build_training_arguments`.
# The checkpoint name contains a "/", so this is a nested relative path.
OUTPUT_DIR_TRAINING_ARG = f"{MODEL_CHECKPOINT}-finetuned"
OVERWRITE_OUTPUT_DIR_TRAINING_ARG = True
# Despite the name, this is forwarded as `gradient_accumulation_steps`.
BATCH_SIZE_TRAINING_ARG = 8
NUM_EPOCH_TRAINING_ARG = 6
LR_TRAINING_ARG = 1e-5
WEIGHT_DECAY_TRAINING_ARG = 0.01
PER_DEVICE_TRAIN_BATCH_SIZE_TRAINING_ARG = 1
GRADIENT_CHECKPOINTING_TRAINING_ARG = True
PER_DEVICE_EVAL_BATCH_SIZE_TRAINING_ARG = 1
# Also decides whether the `Accelerator` is created with fp16 mixed precision.
FP16_TRAINING_ARG = True

# Output locations for the per-epoch metrics table and the fine-tuned weights.
PATH_TO_SAVE_FINETUNED_MODEL_METRIC_HISTORY = "./metric_history.csv"
PATH_TO_SAVE_FINETUNED_MODEL = "./model"


def setup_random(random_state=100):
    """Seed every random number generator the notebook relies on.

    Args:
        random_state: Integer seed applied to Python's `random`, NumPy, PyTorch
            (CPU and all CUDA devices) and `transformers.set_seed`. Defaults to
            100, the value that used to be hard-coded.
    """
    random.seed(random_state)
    np.random.seed(random_state)
    torch.manual_seed(random_state)
    # `manual_seed_all` covers every visible GPU (the old `manual_seed` only seeded
    # the current device) and is silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(random_state)
    set_seed(random_state)


def check_tokenizer_behaviour(tokenizer):
    """Print a small round-trip sanity check for `tokenizer`.

    A sample sentence containing the tokenizer's mask token is encoded and
    decoded back, then the `input_ids` of a single word are printed.

    Args:
        tokenizer: Callable tokenizer exposing `mask_token` and `decode`; calling
            it on a string must return a mapping with an `"input_ids"` entry.
    """
    mask = tokenizer.mask_token
    sample_sentence = f"ультразвуковой исследование {mask} полость"
    print("\nTokenizer behaviour\nExample text:", sample_sentence, "Decoded text:", sep="\n")
    sentence_ids = tokenizer(sample_sentence)["input_ids"]
    print(tokenizer.decode(sentence_ids))

    # A single word is expected to encode as [1, index, 2],
    # but that is not what the tokenizer returns by default.
    sample_word = "хронический"
    print(f"Tokenizer `input_ids` for word '{sample_word}'")
    word_ids = tokenizer(sample_word)["input_ids"]
    print(word_ids)


def get_anamnesis(path=None):
    """Load the preprocessed anamnesis texts from a header-less, single-column CSV.

    Rows without text are printed and then dropped; previously a single null row
    made the `ё` replacement fail with `AttributeError`, because NaN is a float.

    Args:
        path: CSV file to read. Defaults to `PATH_TO_PREPROCESSED_ANAMNESIS`.

    Returns:
        A DataFrame with one string column, `anamnesis`, where every `ё` has
        been replaced with `е`.
    """
    if path is None:
        path = PATH_TO_PREPROCESSED_ANAMNESIS
    processed_anamnesis = pd.read_csv(path, header=None, names=["anamnesis"])

    # Show the rows that carry no text before discarding them.
    print(processed_anamnesis[processed_anamnesis["anamnesis"].isnull()])
    processed_anamnesis = processed_anamnesis.dropna(subset=["anamnesis"]).reset_index(drop=True)

    # ru-Roberta-large didn't work fine with `ё`, let's replace it with `е`
    processed_anamnesis["anamnesis"] = (
        processed_anamnesis["anamnesis"].astype(str).str.replace("ё", "е", regex=False)
    )
    print("\nLoaded anamnesis.")
    print(f"Anamnesis number: {len(processed_anamnesis)}")
    print("Anamnesis head:")
    print(processed_anamnesis.head())
    return processed_anamnesis


def check_model_prediction(model, tokenizer, text, on_gpu, top_k=None):
    """Print the most probable fillers for the first mask token in `text`.

    Args:
        model: Masked-LM model; called as `model(**inputs)` and expected to return
            an object with a `logits` attribute.
        tokenizer: Tokenizer providing `mask_token`, `mask_token_id` and `decode`.
        text: Input sentence; should contain `tokenizer.mask_token`.
        on_gpu: When true, the encoded inputs are moved to the current CUDA device.
        top_k: Number of candidates to print. Defaults to
            `CHECK_MODEL_NUMBER_CANDIDATE`; clamped to the vocabulary size.
    """
    if top_k is None:
        top_k = CHECK_MODEL_NUMBER_CANDIDATE

    print("\nCheck model prediction")
    print(f"Text: {text}")
    inputs = tokenizer(text, return_tensors="pt")
    if on_gpu:
        inputs = inputs.to(torch.cuda.current_device())
    print(f"Inputs ids: {inputs['input_ids']}")
    print(inputs)

    # Locate the mask token; without one there is nothing to rank
    # (indexing the empty result used to raise IndexError).
    mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
    if mask_token_index.numel() == 0:
        print(f"No {tokenizer.mask_token} token found in the text, nothing to predict.")
        return

    # Pure inference: do not build an autograd graph.
    with torch.no_grad():
        token_logits = model(**inputs).logits
    mask_token_probs = torch.softmax(token_logits[0, mask_token_index, :], dim=1)

    # Pick the candidates with the highest probability for the first mask position.
    top_k = min(top_k, mask_token_probs.shape[1])
    top_n = torch.topk(mask_token_probs, top_k, dim=1)
    print(f"Top {top_k} candidates")
    for token, score in zip(top_n.indices[0].tolist(), top_n.values[0].tolist()):
        print(f"{text.replace(tokenizer.mask_token, tokenizer.decode([token]))}, score: {score}")


def prepare_datasets(anamnesis):
    """Shuffle the texts and split them into train/test `Dataset`s.

    The shuffle is done on a private copy. The caller passes the `.values` array
    of a DataFrame column, and shuffling it in place also reordered the caller's
    data. The random draw is unchanged, so the resulting split is the same.

    Args:
        anamnesis: Sequence (NumPy array or list) of text strings.

    Returns:
        A `DatasetDict` with `train` and `test` splits, each holding a `text` column.
    """
    shuffled = anamnesis.copy() if isinstance(anamnesis, np.ndarray) else list(anamnesis)
    np.random.shuffle(shuffled)

    split_index = int((len(shuffled) + 1) * TRAIN_PART_ANAMNESIS)
    train = shuffled[:split_index]
    test = shuffled[split_index:]
    print(f"\nTrain dataset - {len(train)}")
    print(f"Test dataset - {len(test)}")

    anamnesis_dataset = DatasetDict({
        "train": Dataset.from_dict({"text": train}),
        "test": Dataset.from_dict({"text": test}),
    })
    print(f"Result dataset - {anamnesis_dataset}")
    return anamnesis_dataset


def tokenize_dataset(tokenizer, dataset):
    """Tokenize the `text` column of `dataset` in batches and drop the raw text.

    Args:
        tokenizer: Callable applied to each batch's list of texts.
        dataset: Object exposing `map(function, batched=..., remove_columns=...)`.

    Returns:
        Whatever `dataset.map` returns for the tokenizing function.
    """
    def _encode_batch(batch):
        # Only the plain tokenizer output is kept; no `word_ids` column is added.
        return tokenizer(batch["text"])

    return dataset.map(_encode_batch, batched=True, remove_columns=["text"])


def group_datasets_text(tokenized_dataset, chunk_size=None):
    """Concatenate the tokenized texts and cut them into fixed-size blocks.

    Args:
        tokenized_dataset: Object with a `"train"` entry and a
            `map(function, batched=..., batch_size=...)` method whose batches map
            column names to lists of token lists.
        chunk_size: Block length in tokens. Defaults to `GROUPING_TEXT_CHUNK_SIZE`.

    Returns:
        The mapped dataset, set to the `"pt"` format, with an added `labels`
        column that copies `input_ids`.

    Raises:
        ValueError: If `chunk_size` is not positive.
    """
    from itertools import chain

    if chunk_size is None:
        chunk_size = GROUPING_TEXT_CHUNK_SIZE
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    def group_texts(examples):
        # Concatenate all texts. `chain` is linear in the number of tokens,
        # whereas `sum(lists, [])` re-copies the growing list on every addition.
        concatenated_examples = {
            key: list(chain.from_iterable(examples[key])) for key in examples.keys()
        }
        # We drop the last chunk if it's smaller than chunk_size
        total_length = (len(concatenated_examples["input_ids"]) // chunk_size) * chunk_size
        # Split every column into chunks of chunk_size tokens
        result = {
            key: [tokens[start: start + chunk_size] for start in range(0, total_length, chunk_size)]
            for key, tokens in concatenated_examples.items()
        }
        # Create a new labels column
        result["labels"] = result["input_ids"].copy()
        return result

    # The batch size equals the train split's length, so that split is grouped as one batch.
    lm_datasets = tokenized_dataset.map(group_texts, batched=True, batch_size=len(tokenized_dataset["train"]))
    lm_datasets.set_format("pt")
    print(f"\nGroupped text dataset - {lm_datasets}")
    return lm_datasets


def build_training_arguments():
    """Build the `TrainingArguments` object that carries the hyper-parameters.

    Every value comes from the module-level `*_TRAINING_ARG` constants; note that
    `BATCH_SIZE_TRAINING_ARG` is passed as `gradient_accumulation_steps`.

    Returns:
        The constructed `TrainingArguments` instance.
    """
    hyper_parameters = {
        "output_dir": OUTPUT_DIR_TRAINING_ARG,
        "overwrite_output_dir": OVERWRITE_OUTPUT_DIR_TRAINING_ARG,
        "num_train_epochs": NUM_EPOCH_TRAINING_ARG,
        "learning_rate": LR_TRAINING_ARG,
        "weight_decay": WEIGHT_DECAY_TRAINING_ARG,
        "per_device_train_batch_size": PER_DEVICE_TRAIN_BATCH_SIZE_TRAINING_ARG,
        "gradient_accumulation_steps": BATCH_SIZE_TRAINING_ARG,
        "gradient_checkpointing": GRADIENT_CHECKPOINTING_TRAINING_ARG,
        "per_device_eval_batch_size": PER_DEVICE_EVAL_BATCH_SIZE_TRAINING_ARG,
        "fp16": FP16_TRAINING_ARG,
    }
    return TrainingArguments(**hyper_parameters)


def _count_batch_samples(batch):
    """Return the number of samples in a collated batch (first dimension of `input_ids`)."""
    return batch["input_ids"].shape[0]


def _mean_sample_loss(gathered_losses, dataloader):
    """Average the per-sample losses gathered over one full pass of `dataloader`."""
    if not gathered_losses:
        return float("nan")
    losses = torch.cat(gathered_losses)
    # When the dataset size is known, cut off entries beyond it (duplicates that
    # a multi-process gather may add for the last batch).
    dataset = getattr(dataloader, "dataset", None)
    if dataset is not None:
        losses = losses[: len(dataset)]
    return torch.mean(losses.float()).item()


def _safe_perplexity(mean_loss):
    """Return `exp(mean_loss)`, or infinity when the exponent overflows."""
    try:
        return math.exp(mean_loss)
    except OverflowError:
        return float("inf")


def train_model(model, optimizer, accelerator, train_dataloader, test_dataloader, training_args):
    """Run the manual train/eval loop and save per-epoch metrics to CSV.

    Fixes compared with the previous version:
      * Each batch loss is repeated by the real batch size, not by the number of
        gradient-accumulation steps. The old code repeated every loss 8 times and
        then truncated to `len(dataloader)`, so the reported means only covered
        the first ~1/8 of each pass.
      * Train losses are detached before being stored.
      * The optimizer also steps on the last batch of an epoch, so a trailing
        partial accumulation window is applied instead of leaking into the next
        epoch. Its gradient is still scaled by 1 / accumulation steps.
      * Train and test perplexity are computed the same way, from a Python float.

    Args:
        model: Model whose forward pass returns an object with a `loss` attribute.
        optimizer: Optimizer to step.
        accelerator: `Accelerator` used for `backward` and `gather`.
        train_dataloader: Iterable of training batches (mappings with `input_ids`).
        test_dataloader: Iterable of evaluation batches.
        training_args: Object providing `num_train_epochs` and
            `gradient_accumulation_steps`.

    Returns:
        The DataFrame of per-epoch metrics that is also written to
        `PATH_TO_SAVE_FINETUNED_MODEL_METRIC_HISTORY` (previously `None`).
    """
    accumulation_steps = max(1, int(training_args.gradient_accumulation_steps))
    num_epochs = int(training_args.num_train_epochs)
    num_train_batches = len(train_dataloader)

    train_perplexity_history = []
    train_mean_loss_history = []
    test_perplexity_history = []
    test_mean_loss_history = []
    for epoch in range(num_epochs):
        progress_bar = tqdm(range(num_train_batches))

        print(f"TRAIN EPOCH {epoch}")
        model.train()
        train_losses = []
        for step, batch in enumerate(train_dataloader, start=1):
            loss = model(**batch).loss
            # Detach: the stored value is only used for reporting.
            train_losses.append(accelerator.gather(loss.detach().repeat(_count_batch_samples(batch))))
            accelerator.backward(loss / accumulation_steps)
            if step % accumulation_steps == 0 or step == num_train_batches:
                optimizer.step()
                optimizer.zero_grad()
            progress_bar.update(1)
        progress_bar.close()

        train_mean_loss = _mean_sample_loss(train_losses, train_dataloader)
        train_mean_loss_history.append(train_mean_loss)

        print(f"EVAL EPOCH {epoch}")
        progress_bar = tqdm(range(len(test_dataloader)))
        model.eval()
        test_losses = []
        for batch in test_dataloader:
            with torch.no_grad():
                loss = model(**batch).loss
            test_losses.append(accelerator.gather(loss.repeat(_count_batch_samples(batch))))
            progress_bar.update(1)
        progress_bar.close()

        test_mean_loss = _mean_sample_loss(test_losses, test_dataloader)
        test_mean_loss_history.append(test_mean_loss)

        train_perplexity = _safe_perplexity(train_mean_loss)
        train_perplexity_history.append(train_perplexity)
        test_perplexity = _safe_perplexity(test_mean_loss)
        test_perplexity_history.append(test_perplexity)

        print(f">>> Epoch {epoch}:"
              f"\nTrain Mean Loss: {train_mean_loss}"
              f"\nTest Mean Loss: {test_mean_loss}"
              f"\nTrain Perplexity: {train_perplexity}"
              f"\nTest Perplexity: {test_perplexity}")

    df_metrics = pd.DataFrame({"train_perplexity": train_perplexity_history,
                               "train_mean_loss": train_mean_loss_history,
                               "test_perplexity": test_perplexity_history,
                               "test_mean_loss": test_mean_loss_history})
    df_metrics.to_csv(PATH_TO_SAVE_FINETUNED_MODEL_METRIC_HISTORY)
    return df_metrics


def fine_tune_model():
    """Fine-tune `MODEL_CHECKPOINT` on the anamnesis texts with a masked-LM objective.

    Steps: seed the RNGs, load the model and tokenizer, load and split the texts,
    tokenize and group them into fixed-size blocks, train with the manual
    Accelerate loop, save the model, and print mask-fill candidates before and
    after training.

    Fixes compared with the previous version:
      * The evaluation dataloader uses `per_device_eval_batch_size`; it used to
        read the train batch size.
      * `torch.cuda.set_device` is only called when CUDA is available.
      * The model is saved through `accelerator.unwrap_model`.
      * The final prediction check moves inputs to the GPU only when the
        accelerator actually runs on CUDA.
    """
    logging.basicConfig(level=logging.DEBUG)
    setup_random()

    model = AutoModelForMaskedLM.from_pretrained(MODEL_CHECKPOINT)
    print(f"Model {MODEL_CHECKPOINT} loaded.")

    tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
    print(f"Tokenizer {MODEL_CHECKPOINT} loaded.")

    check_tokenizer_behaviour(tokenizer)

    anamnesis = get_anamnesis()

    # No need to update tokenizer because it's messing up input ids and pretrained model works much worse than before
    # Just save new tokenizer to dir
    tokenizer.save_pretrained(PATH_TO_NEW_TOKENIZER)

    # Check model prediction candidates before training (model is still on the CPU here)
    check_model_test_text = f"ультразвуковой исследование {tokenizer.mask_token} полость"
    check_model_prediction(model, tokenizer, check_model_test_text, False)

    anamnesis_list = anamnesis["anamnesis"].values
    dataset = prepare_datasets(anamnesis_list)
    tokenized_dataset = tokenize_dataset(tokenizer, dataset)
    lm_datasets = group_datasets_text(tokenized_dataset)

    print("Example of decoded first text block")
    print(tokenizer.decode(lm_datasets["train"][0]["input_ids"]))

    data_collator = DataCollatorForWholeWordMask(tokenizer=tokenizer, mlm_probability=MLM_PROBABILITY)
    # `training_args` is only a container for hyper-parameters; no `Trainer` is used.
    # NOTE(review): `gradient_checkpointing` is set there but nothing in this function
    # enables it on the model - confirm whether `model.gradient_checkpointing_enable()` is wanted.
    training_args = build_training_arguments()

    train_dataloader = DataLoader(lm_datasets["train"],
                                  batch_size=training_args.per_device_train_batch_size,
                                  collate_fn=data_collator)

    test_dataloader = DataLoader(lm_datasets["test"],
                                 batch_size=training_args.per_device_eval_batch_size,
                                 collate_fn=data_collator)

    if torch.cuda.is_available():
        torch.cuda.set_device(torch.device(0))

    accelerator = Accelerator(mixed_precision="fp16" if training_args.fp16 else None)
    adam_w_optim = AdamW(model.parameters(), lr=training_args.learning_rate, weight_decay=training_args.weight_decay)
    model, optimizer, train_dataloader, test_dataloader = accelerator.prepare(model, adam_w_optim, train_dataloader,
                                                                              test_dataloader)

    train_model(model, optimizer, accelerator, train_dataloader, test_dataloader, training_args)

    # Save the underlying model rather than the wrapper `prepare` may have added.
    accelerator.unwrap_model(model).save_pretrained(PATH_TO_SAVE_FINETUNED_MODEL)

    check_model_prediction(model, tokenizer, check_model_test_text, accelerator.device.type == "cuda")


# Fine-tuning microsoft/mdeberta-v3-base for MLM task on the russian medical text
if __name__ == "__main__":
    fine_tune_model()


Downloading (…)lve/main/config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of DebertaV2ForMaskedLM were not initialized from the model checkpoint at microsoft/mdeberta-v3-base and are newly initialized: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model microsoft/mdeberta-v3-base loaded.


Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Tokenizer microsoft/mdeberta-v3-base loaded.

Tokenizer behaviour
Example text:
ультразвуковой исследование [MASK] полость
Decoded text:
[CLS] ультразвуковой исследование[MASK] полость[SEP]
Tokenizer `input_ids` for word 'хронический'
[1, 49385, 21000, 2]
Empty DataFrame
Columns: [anamnesis]
Index: []

Loaded anamnesis.
Anamnesis number: 30737
Anamnesis head:
                                           anamnesis
0  стенокардия напряжение хронический гастродуоде...
1  стенокардия аппендэктомия летний возраст насле...
2  отрицать респираторный заболевание хронический...
3  сахарный диабет тип язвенный болезнь двенадцат...
4  повод са толстой кишка выведение калостома г а...

Check model prediction
Text: ультразвуковой исследование [MASK] полость
Inputs ids: tensor([[     1, 121021, 136078,    818,  39138,    325, 250101,  94606,   1068,
              2]])
{'input_ids': tensor([[     1, 121021, 136078,    818,  39138,    325, 250101,  94606,   1068,
              2]]), 'token_type_ids': te

Map:   0%|          | 0/24590 [00:00<?, ? examples/s]

Map:   0%|          | 0/6147 [00:00<?, ? examples/s]

Map:   0%|          | 0/24590 [00:00<?, ? examples/s]

Map:   0%|          | 0/6147 [00:00<?, ? examples/s]


Groupped text dataset - DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2318
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 564
    })
})
Example of decoded first text block
[CLS] респираторный симптом отделение неотложный помощь[SEP][CLS] жалоба перепад эйфория возбуэжение ухудшение течение день снижение память близкий событие[SEP][CLS] пациент жаловаться одышка боль грудь[SEP][CLS] метеозависимость звон право уха весь голова носить постоянный хара ра день менее заметно ночной время усиливаться головокружение системный хара ра секунда купировать[SEP][CLS] затем обильный количество жидкий зеленый понос[SEP][CLS] апгар[SEP][CLS] повышение гликемия натощак ммоль л[SEP][CLS] пациентка провести искусственный оплодотворение[SEP][CLS] ранее болеть неконтролировать анамнез из перенести заболевание обморожение кисть рука острый кишечн

  0%|          | 0/2318 [00:00<?, ?it/s]

TRAIN EPOCH 0




EVAL EPOCH 0


  0%|          | 0/564 [00:00<?, ?it/s]

>>> Epoch 0:
Train Mean Loss: 16.645545959472656
Test Mean Loss: 6.981122970581055
Train Perplexity: 16946060.73003315
Test Perplexity: 1076.1261467225704


  0%|          | 0/2318 [00:00<?, ?it/s]

TRAIN EPOCH 1
EVAL EPOCH 1


  0%|          | 0/564 [00:00<?, ?it/s]

>>> Epoch 1:
Train Mean Loss: 7.262138366699219
Test Mean Loss: 5.614068984985352
Train Perplexity: 1425.3010972508564
Test Perplexity: 274.25792205329776


  0%|          | 0/2318 [00:00<?, ?it/s]

TRAIN EPOCH 2
EVAL EPOCH 2


  0%|          | 0/564 [00:00<?, ?it/s]

>>> Epoch 2:
Train Mean Loss: 5.9857869148254395
Test Mean Loss: 4.685321807861328
Train Perplexity: 397.7353820045251
Test Perplexity: 108.34513301419894


  0%|          | 0/2318 [00:00<?, ?it/s]

TRAIN EPOCH 3
EVAL EPOCH 3


  0%|          | 0/564 [00:00<?, ?it/s]

>>> Epoch 3:
Train Mean Loss: 5.0856547355651855
Test Mean Loss: 4.1587066650390625
Train Perplexity: 161.68576601093497
Test Perplexity: 63.988710223371996


  0%|          | 0/2318 [00:00<?, ?it/s]

TRAIN EPOCH 4
EVAL EPOCH 4


  0%|          | 0/564 [00:00<?, ?it/s]

>>> Epoch 4:
Train Mean Loss: 4.516120433807373
Test Mean Loss: 3.705047607421875
Train Perplexity: 91.48000591443767
Test Perplexity: 40.65198260689734


  0%|          | 0/2318 [00:00<?, ?it/s]

TRAIN EPOCH 5
EVAL EPOCH 5


  0%|          | 0/564 [00:00<?, ?it/s]

>>> Epoch 5:
Train Mean Loss: 4.1290059089660645
Test Mean Loss: 3.432429313659668
Train Perplexity: 62.11614313152451
Test Perplexity: 30.95174298375169

Check model prediction
Text: ультразвуковой исследование [MASK] полость
Inputs ids: tensor([[     1, 121021, 136078,    818,  39138,    325, 250101,  94606,   1068,
              2]], device='cuda:0')
{'input_ids': tensor([[     1, 121021, 136078,    818,  39138,    325, 250101,  94606,   1068,
              2]], device='cuda:0'), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}
Top 20 candidates
ультразвуковой исследование проток полость, score: 0.1766219586133957
ультразвуковой исследование ение полость, score: 0.03370888903737068
ультразвуковой исследование ьев полость, score: 0.029516490176320076
ультразвуковой исследование cuti полость, score: 0.024090657010674477
ультразвуковой исследование ضات полость, score: 0.014385187998

In [None]:
# `%pip` inspects the environment of the running kernel, which `!pip` may not.
%pip show sentencepiece

Name: sentencepiece
Version: 0.1.99
Summary: SentencePiece python wrapper
Home-page: https://github.com/google/sentencepiece
Author: Taku Kudo
Author-email: taku@google.com
License: Apache
Location: /usr/local/lib/python3.10/dist-packages
Requires: 
Required-by: 
