In [4]:
!pip install bert_score 
!pip install peft
!pip install accelerate
!pip install transformers==4.32.0



In [5]:
#!pip install -i https://test.pypi.org/simple/ bitsandbytes-cuda113 # version with GPU support.
#!pip install bitsandbytes-cuda110 bitsandbytes # version with GPU support.
!pip install -i https://test.pypi.org/simple/ bitsandbytes

Looking in indexes: https://test.pypi.org/simple/


In [7]:
import codecs
import copy
import gc
import json
import logging
import math
import os
import random
import sys
from collections import defaultdict
from typing import Dict, List, Set, Tuple, Union

import bert_score
import numpy as np
import torch
from accelerate import infer_auto_device_map
from bitsandbytes.optim import Adam8bit
from datasets import load_dataset
from peft import LoraConfig, PeftModel, get_peft_model, TaskType
from tqdm import trange, tqdm
from transformers import AutoTokenizer, BitsAndBytesConfig, AutoModelForSeq2SeqLM
from transformers import BertModel, BertTokenizer
from pathlib import Path

# os.environ['TRANSFORMERS_OFFLINE'] = '1'

llm_training_logger = logging.getLogger(__name__)
DEFAULT_RANDOM_SEED = 44
MAX_EPOCHS: int = 20
bert_minibatch_size = 2
minibatch_size = 2
learning_rate = 5e-5
penalty_weight = 10000.0
l2_regularizer_weight = 0.1
accumulate_gradients = 1
use_birm = False
max_seq_len = None
bert_num_layers = 9

input_model_name = "ai-forever/FRED-T5-1.7B"
data_name = "training_dataset.jsonl"
data_path = Path('/kaggle/input/trains-dataset/')

test = data_path / data_name
train = data_path / data_name

def mean_pooling(model_output: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    token_embeddings = model_output[0]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask

def generate_bert_embeddings(texts: List[str], tokenizer: BertTokenizer, model: BertModel,
                             minibatch: int) -> np.ndarray:
    n_batches = math.ceil(len(texts) / minibatch)
    res = []
    for batch_idx in trange(n_batches):
        batch_start = batch_idx * minibatch
        batch_end = min(batch_start + minibatch, len(texts))
        inputs = tokenizer(
            [cur.lower() for cur in texts[batch_start:batch_end]],
            max_length=512, padding='longest', truncation='longest_first', return_tensors='pt'
        ).to(model.device)
        with torch.no_grad():
            model_outputs = model(**inputs)
            sentence_embeddings = mean_pooling(model_outputs, inputs['attention_mask']).cpu().numpy()
        if len(sentence_embeddings.shape) != 2:
            err_msg = f'The sentence embedding shape is wrong! Expected 2, got {sentence_embeddings.shape}.'
            llm_training_logger.error(err_msg)
            raise ValueError(err_msg)
        if sentence_embeddings.shape[0] != (batch_end - batch_start):
            err_msg = (f'The first sentence embedding shape is wrong! Expected {batch_end - batch_start}, '
                       f'got {sentence_embeddings.shape[0]}.')
            llm_training_logger.error(err_msg)
            raise ValueError(err_msg)
        res.append(sentence_embeddings)
    return np.vstack(res)

def generate_minibatch(dataset: Dict[str, List[Dict[str, List[int]]]], categories: List[str], minibatch: int,
                       padding: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Set[int]]:
    if minibatch < 2:
        err_msg = f'The mini-batch is too small! Expected 2 or greater, got {minibatch}.'
        llm_training_logger.error(err_msg)
        raise ValueError(err_msg)
    if minibatch < len(categories):
        environments = random.sample(population=categories, k=minibatch)
    elif minibatch == len(categories):
        environments = categories
    else:
        environments = copy.copy(categories)
        while len(environments) < minibatch:
            environments.append(random.choice(categories))
        pass
    # environments.append(random.choice(categories))
    input_ids = []
    attention_mask = []
    labels = []
    environment_IDs = []
    for env in environments:
        sample = random.choice(dataset[env])
        input_ids.append(torch.tensor(data=sample['input_ids'], dtype=torch.long))
        attention_mask.append(torch.tensor(data=sample['attention_mask'], dtype=torch.long))
        labels.append(torch.tensor(data=sample['labels'], dtype=torch.long))
        environment_IDs.append(categories.index(env))
    set_of_env = set(environment_IDs)
    batched_input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=padding).cuda()
    batched_attention_mask = torch.nn.utils.rnn.pad_sequence(attention_mask, batch_first=True, padding_value=0).cuda()
    batched_labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True).cuda()
    batched_environment_IDs = torch.tensor(data=environment_IDs, dtype=torch.long).cuda()
    return batched_input_ids, batched_attention_mask, batched_labels, batched_environment_IDs, set_of_env

def predict(testset: List[Tuple[str, str, str]], tokenizer: AutoTokenizer, model: PeftModel,
            minibatch: int) -> Tuple[List[str], List[str], List[str]]:
    gen_config = model.generation_config
    gen_config.max_new_tokens = 300
    gen_config.do_sample = True
    gen_config.top_k = 10
    # gen_config.top_p = 0.9
    # gen_config.num_return_sequences = 1
    gen_config.pad_token_id = tokenizer.eos_token_id
    gen_config.eos_token_id = tokenizer.eos_token_id
    n_batches = math.ceil(len(testset) / minibatch)
    true_answers = []
    predicted_answers = []
    input_prompts = []
    for batch_idx in range(n_batches):
        batch_start = batch_idx * minibatch
        batch_end = min(len(testset), batch_start + minibatch)
        input_ids = []
        attention_mask = []
        for sample_idx in range(batch_start, batch_end):
            prompt = format_my_data(testset[sample_idx][0:2], True)
            tokenized_text = tokenize_prompt(
                prompt,
                tokenizer,
                add_labels=False
            )
            input_ids.append(torch.tensor(data=tokenized_text['input_ids'], dtype=torch.long))
            attention_mask.append(torch.tensor(data=tokenized_text['attention_mask'], dtype=torch.long))
            true_answers.append(testset[sample_idx][2])
        batched_input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True,
                                                            padding_value=tokenizer.pad_token_id).cuda()
        batched_attention_mask = torch.nn.utils.rnn.pad_sequence(attention_mask, batch_first=True,
                                                                 padding_value=0).cuda()
        with torch.no_grad():
            generated_ids = model.generate(input_ids=batched_input_ids, attention_mask=batched_attention_mask,
                                           generation_config=gen_config)
        for sample_idx in range(batch_end - batch_start):
            pass
        predicted_answers += tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        input_prompts += [cur.strip() for cur in tokenizer.batch_decode(input_ids, skip_special_tokens=True)]
        del input_ids, attention_mask
        del batched_input_ids, batched_attention_mask, generated_ids
    if len(predicted_answers) != len(testset):
        err_msg = f'The predicted answers do not correspond to the test set! {len(predicted_answers)} != {len(testset)}'
        llm_training_logger.error(err_msg)
        raise ValueError(err_msg)
    if len(true_answers) != len(testset):
        err_msg = f'The true answers do not correspond to the test set! {len(true_answers)} != {len(testset)}'
        llm_training_logger.error(err_msg)
        raise ValueError(err_msg)
    return input_prompts, predicted_answers, true_answers

def strip_texts_for_bert(texts: List[str], tokenizer: BertTokenizer, max_len: int = 500) -> List[str]:
    stripped_texts = []
    for cur in texts:
        tokenized = tokenizer.encode(cur.lower())
        if len(tokenized) > max_len:
            tokenized = tokenized[0:max_len]
            stripped_texts.append(tokenizer.decode(token_ids=tokenized, skip_special_tokens=True))
        else:
            stripped_texts.append(cur.lower())
    return stripped_texts

def prepare_model_for_bert_score(model: BertModel, num_layers: int) -> BertModel:
    model.encoder.layer = torch.nn.ModuleList(
        [layer for layer in model.encoder.layer[:num_layers]]
    )
    return model

def evaluate(questions: List[str], predicted_answers: List[str], true_answers: List[str],
             tokenizer: BertTokenizer, model: BertModel,
             minibatch: int) -> Tuple[float, List[Dict[str, Union[str, float]]]]:
    if len(true_answers) != len(predicted_answers):
        err_msg = f'The true answers do not correspond to the predicted answers! ' \
                  f'{len(true_answers)} != {len(predicted_answers)}.'
        llm_training_logger.error(err_msg)
        raise ValueError(err_msg)
    true_answers_ = strip_texts_for_bert(true_answers, tokenizer)
    predicted_answers_ = strip_texts_for_bert(predicted_answers, tokenizer)
    # idf_dict = bert_score.get_idf_dict(true_answers_, tokenizer, nthreads=max(1, os.cpu_count()))
    idf_dict = defaultdict(lambda: 1.0)
    idf_dict[tokenizer.sep_token_id] = 0
    idf_dict[tokenizer.cls_token_id] = 0
    all_preds = bert_score.bert_cos_score_idf(
        model,
        true_answers_,
        predicted_answers_,
        tokenizer,
        idf_dict,
        device=model.device,
        batch_size=minibatch,
        all_layers=False,
    ).cpu()
    f1_list = all_preds[..., 2].numpy().tolist()
    if len(true_answers) != len(f1_list):
        err_msg = f'The true answers do not correspond to the BERT scores! {len(true_answers)} != {len(f1_list)}.'
        llm_training_logger.error(err_msg)
        raise ValueError(err_msg)
    f1_mean = float(np.mean(f1_list))
    res = []
    for pred_, true_, f1_val, question in zip(predicted_answers, true_answers, f1_list, questions):
        res.append({
            'PROMPT': question,
            'TRUE': true_,
            'PRED': pred_,
            'F1': f1_val
        })
    return f1_mean, sorted(res, key=lambda it: it['F1'])

def add_epoch_to_report_fname(report_fname: str, epoch: int) -> str:
    if not report_fname.lower().endswith('.json'):
        err_msg = f'The report file name = {os.path.basename(report_fname)} is wrong, because it is not JSON!'
        llm_training_logger.error(err_msg)
        raise ValueError(err_msg)
    new_report_fname = report_fname[:-5] + '_epoch{0:>03}'.format(epoch) + '.json'
    return new_report_fname

def tokenize_prompt(prompt: str, tokenizer: AutoTokenizer, add_eos_token: bool = True,
                    add_labels: bool = True) -> Dict[str, List[int]]:
    result = tokenizer(prompt, padding=False, return_tensors=None)
    if (result['input_ids'][-1] != tokenizer.eos_token_id) and add_eos_token:
        result['input_ids'].append(tokenizer.eos_token_id)
        result['attention_mask'].append(1)
    if add_labels:
        result['labels'] = result['input_ids'].copy()
    return result

def format_my_data(sample: Union[Tuple[str, str, str], Tuple[str, str]], is_instruction_first: bool) -> str:
    instruction = f'{sample[0]}'.strip()
    context = f'{sample[1]}'.strip()

    # prompt = 'Прочитай текст: ' + context + ' Теперь на основе текста ' + instruction
    prompt = '<LM>Город, ' + instruction
    prompt = ' '.join(prompt.split()).strip()
    if len(prompt) == 0:
        err_msg = f'{sample}: The instruction and context are empty!'
        llm_training_logger.error(err_msg)
        raise ValueError(err_msg)
    if len(sample) > 2:
        response = f'{sample[2].strip()}'
        if len(response) == 0:
            err_msg = f'{sample}: The response is empty!'
            llm_training_logger.error(err_msg)
            raise ValueError(err_msg)
        prompt += (' ' + response + '</s>')
    return prompt

def generate_and_tokenize_prompt(data_point: Tuple[str, str, str], is_instruction_first: bool,
                                 tokenizer: AutoTokenizer) -> Dict[str, List[int]]:
    full_prompt = format_my_data(data_point, is_instruction_first)
    tokenized_full_prompt = tokenize_prompt(full_prompt, tokenizer)
    user_prompt = format_my_data(data_point[0:2], is_instruction_first)
    tokenized_user_prompt = tokenize_prompt(user_prompt, tokenizer)
    user_prompt_len = len(tokenized_user_prompt['input_ids'])
    user_prompt_len -= 1
    tokenized_full_prompt['labels'] = tokenized_full_prompt['labels'][user_prompt_len:]
    tokenized_full_prompt['attention_mask'] = tokenized_full_prompt['attention_mask'][:user_prompt_len]
    tokenized_full_prompt["input_ids"] = tokenized_full_prompt['input_ids'][:user_prompt_len]
    return tokenized_full_prompt

def print_trainable_parameters(model: PeftModel):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    info_msg = (f'Trainable parameters: {trainable_params}, all parameters: {all_param}, '
                f'trainable %: {100 * trainable_params / all_param}')
    llm_training_logger.info(info_msg)

def tokenize_dataset(dataset: Dict[str, List[Tuple[str, str, str]]], tokenizer: AutoTokenizer) -> \
        Dict[str, List[Dict[str, List[int]]]]:
    tokenized = dict()
    for category in sorted(list(dataset.keys())):
        tokenized_subset = []
        llm_training_logger.info(f'Texts of the category {category} are tokenized.')
        for sample in tqdm(dataset[category]):
            tokenized_subset.append(generate_and_tokenize_prompt(sample, False, tokenizer))
            # if sample[1] is not None:
            #    if len(sample[1]) > 0:
            #        tokenized_subset.append(generate_and_tokenize_prompt(sample, False, tokenizer))
        tokenized[category] = tokenized_subset
        del tokenized_subset
    return tokenized

def split_dolly_dataset(dataset: Dict[str, List[Tuple[str, str, str]]],
                        emb_tokenizer: BertTokenizer, emb_model: BertModel, minibatch: int,
                        random_state: int = None) -> (
        Tuple)[Dict[str, List[Tuple[str, str, str]]], Dict[str, List[Tuple[str, str, str]]]]:
    trainset = dict()
    testset = dict()
    for category in sorted(list(dataset.keys())):
        train_indices = []
        test_indices = []
        for i in range(0, 66):
            train_indices.append(i)
        for i in range(66, 84):
            test_indices.append(i)
        trainset[category] = []
        testset[category] = []
        for idx in train_indices:
            trainset[category].append(dataset[category][idx])
        for idx in test_indices:
            testset[category].append(dataset[category][idx])
        del train_indices, test_indices
        gc.collect()
    return trainset, testset

def load_bert(bert_name: str) -> Tuple[BertTokenizer, BertModel]:
    bert_name = "DeepPavlov/rubert-base-cased"
    tokenizer = BertTokenizer.from_pretrained(bert_name)
    model = BertModel.from_pretrained(bert_name).cuda()
    model.eval()
    llm_training_logger.info(f'The BERT model {bert_name} is loaded.')
    return tokenizer, model

def create_peft_model(llm: AutoModelForSeq2SeqLM, lora_r: int = 64, lora_alpha: int = 128,
                      lora_dropout: float = 0.05) -> PeftModel:
    config = LoraConfig(
        r=lora_r,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        target_modules=["q", "v"],
        bias='none',
        task_type=TaskType.SEQ_2_SEQ_LM,
    )
    return get_peft_model(llm, config)

def load_my_data_dataset(dataset_name: str) -> Dict[str, List[Tuple[str, str, str]]]:
    res = dict()
    dataset = load_dataset("json", data_files= str(train))
    dataset_splits = list(dataset.keys())
    print(len(dataset_splits))
    info_msg = f'The dataset "{dataset_name}" is loaded.'
    if len(dataset_splits) > 1:
        info_msg += f' There are {len(dataset_splits)} splits: {dataset_splits}.'
    else:
        if len(dataset_splits) < 1:
            err_msg = f'The dataset "{dataset_name}" contains no data splits!'
            llm_training_logger.error(err_msg)
            raise ValueError(err_msg)
        info_msg += f' There is 1 split: {dataset_splits}.'
    llm_training_logger.info(info_msg)
    true_sample_keys = {'instruction', 'context', 'response', 'category'}
    for cur_split in dataset_splits:
        print(enumerate(dataset[cur_split]))
        for sample_idx, cur_sample in enumerate(dataset[cur_split]):
            if not isinstance(cur_sample, dict):
                err_msg = (f'The sample {sample_idx} of the split {cur_split} has a wrong type! '
                           f'Expected {type({"a": 1})}, got {type(cur_sample)}.')
                llm_training_logger.error(err_msg)
                raise ValueError(err_msg)
            sample_keys = set(cur_sample.keys())
            if sample_keys != true_sample_keys:
                err_msg = (f'The sample {sample_idx} of the split {cur_split} has wrong fields! '
                           f'Expected {sorted(list(true_sample_keys))}, got {sorted(list(sample_keys))}.')
                llm_training_logger.error(err_msg)
                raise ValueError(err_msg)
            new_category = cur_sample['category'].strip()
            if len(new_category) == 0:
                err_msg = f'The sample {sample_idx} of the split {cur_split} has an empty category!'
                llm_training_logger.error(err_msg)
                raise ValueError(err_msg)
            if new_category not in res:
                res[new_category] = []
            instruction = cur_sample['instruction'].strip()
            if len(instruction) == 0:
                err_msg = f'The sample {sample_idx} of the split {cur_split} has an empty instruction!'
                llm_training_logger.error(err_msg)
                raise ValueError(err_msg)
            response = cur_sample['response'].strip()
            if len(response) == 0:
                err_msg = f'The sample {sample_idx} of the split {cur_split} has an empty response!'
                llm_training_logger.error(err_msg)
                raise ValueError(err_msg)
            context = cur_sample['context'].strip()

            res[new_category].append((
                ' '.join(instruction.split()).strip(),
                ' '.join(context.split()).strip(),
                ' '.join(response.split()).strip()
            ))

    all_categories = sorted(list(res.keys()))
    if len(all_categories) == 0:
        err_msg = f'The dataset "{dataset_name}" is empty!'
        llm_training_logger.error(err_msg)
        raise ValueError(err_msg)
    info_msg = f'There are {len(all_categories)} categories in the dataset "{dataset_name}". They are:'
    llm_training_logger.info(info_msg)
    max_width = len(all_categories[0])
    for cur in all_categories[1:]:
        if len(cur) > max_width:
            max_width = len(cur)
    for cur in all_categories[:-1]:
        llm_training_logger.info('  - {0:<{1}}: {2:>5} samples;'.format(cur + ':', max_width + 1, len(res[cur])))
    cur = all_categories[-1]
    llm_training_logger.info('  - {0:<{1}}: {2:>5} samples.'.format(cur + ':', max_width + 1, len(res[cur])))
    del dataset
    return res

def load_model_and_tokenizer(model_name: str) -> Tuple[AutoTokenizer, AutoModelForSeq2SeqLM]:
    tokenizer = AutoTokenizer.from_pretrained(model_name, eos_token='</s>', skip_special_tokens=True,
                                              trust_remote_code=True)  # E

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=False,
    )

    device_map = 'auto'

    model = AutoModelForSeq2SeqLM.from_pretrained(
        model_name,
        device_map=device_map,
        quantization_config=bnb_config,
        # offload_folder="offload",
        trust_remote_code=True,
        # offload_state_dict=True,
        torch_dtype=torch.float16
    )
    device_map = infer_auto_device_map(model)
    print(device_map)
    print(model.get_memory_footprint())
    print(model)
    return tokenizer, model

def main(l2_regularizer_weight=0.1):
    print(f"Is CUDA supported by this system?  {torch.cuda.is_available()}")
    print(f"CUDA version: {torch.version.cuda}")
    if not torch.cuda.is_available():
        err_msg = 'CUDA is not available!'
        llm_training_logger.error(err_msg)
        raise ValueError(err_msg)

    random.seed(DEFAULT_RANDOM_SEED)
    torch.manual_seed(DEFAULT_RANDOM_SEED)
    np.random.seed(DEFAULT_RANDOM_SEED)
    torch.cuda.manual_seed(DEFAULT_RANDOM_SEED)

    output_model_name = os.path.normpath("my_model")
    if os.path.basename(output_model_name).lower() == os.path.basename(input_model_name).lower():
        err_msg = 'The input model name and the model output name are same!'
        llm_training_logger.error(err_msg)
        raise ValueError(err_msg)
    if not os.path.isdir(output_model_name):
        base_dir = os.path.dirname(output_model_name)
        if len(base_dir) > 0:
            if not os.path.isdir(base_dir):
                err_msg = f'The directory "{base_dir}" does not exist!'
                llm_training_logger.error(err_msg)
                raise ValueError(err_msg)
        os.mkdir(output_model_name)

    report_fname = os.path.normpath("report.json")
    if not os.path.isfile(report_fname):
        base_dir = os.path.dirname(report_fname)
        if len(base_dir) > 0:
            if not os.path.isdir(base_dir):
                err_msg = f'The directory "{base_dir}" does not exist!'
                llm_training_logger.error(err_msg)
                raise ValueError(err_msg)

    bert_name = os.path.normpath("DeepPavlov/rubert-base-cased")

    my_model_tokenizer, my_model = load_model_and_tokenizer(input_model_name)

    my_model = create_peft_model(my_model)

    full_my_data = load_my_data_dataset(dataset_name=data_name)

    all_categories = sorted(list(full_my_data.keys()))

    bert_tokenizer, bert_model = load_bert(bert_name)

    my_data_for_training, my_data_for_testing = split_dolly_dataset(
        dataset=full_my_data,
        emb_tokenizer=bert_tokenizer,
        emb_model=bert_model,
        minibatch=bert_minibatch_size,
        random_state=random.randint(0, 2147483647)
    )

    if set(my_data_for_testing.keys()) != set(my_data_for_training.keys()):
        err_msg = 'The training categories do not correspond to the testing categories.'
        llm_training_logger.error(err_msg)
        raise ValueError(err_msg)
    if set(my_data_for_testing.keys()) != set(all_categories):
        err_msg = 'The training categories do not correspond to the testing categories.'
        llm_training_logger.error(err_msg)
        raise ValueError(err_msg)
    n_samples_for_training = len(my_data_for_training[all_categories[0]])
    for category in all_categories[1:]:
        n_samples_for_training += len(my_data_for_training[category])

    gc.collect()

    tokenized_my_data_for_training = tokenize_dataset(my_data_for_training, my_model_tokenizer)

    gc.collect()

    print_trainable_parameters(my_model)
    my_model.eval()

    validation_true = []
    validation_pred = []
    validation_questions = []
    for category in tqdm(all_categories):
        prompt_, pred_, true_ = predict(my_data_for_testing[category], my_model_tokenizer, my_model,
                                        minibatch_size)
        validation_pred += pred_
        validation_true += true_
        validation_questions += prompt_
        del pred_, true_, prompt_
    bert_model = prepare_model_for_bert_score(bert_model, num_layers=bert_num_layers)
    best_f1, detailed_validation_report = evaluate(validation_questions, validation_pred, validation_true,
                                                   bert_tokenizer, bert_model, bert_minibatch_size)
    llm_training_logger.info(f'Before training: validation BERT F1 = {best_f1}.')
    with codecs.open(add_epoch_to_report_fname(report_fname, 0), mode='w', encoding='utf-8') as fp:
        json.dump(
            obj={
                'total': {'f1': best_f1},
                'detailed': detailed_validation_report
            },
            fp=fp,
            ensure_ascii=False,
            indent=4
        )
    with codecs.open(report_fname, mode='w', encoding='utf-8') as fp:
        json.dump(
            obj={
                'total': {'f1': best_f1},
                'detailed': detailed_validation_report
            },
            fp=fp,
            ensure_ascii=False,
            indent=4
        )
    del detailed_validation_report, validation_pred, validation_true

    optimizer = Adam8bit(
        my_model.parameters(),
        lr=learning_rate,
    )

    n_training_batches = int(np.ceil(n_samples_for_training / minibatch_size))
    if accumulate_gradients > 1:
        while (n_training_batches % accumulate_gradients) != 0:
            n_training_batches += 1
    llm_training_logger.info(f'Iterations per epoch is {n_training_batches}.')

    if use_birm:
        loss_fct = torch.nn.CrossEntropyLoss().cuda()

        with torch.no_grad():
            weight_norm = torch.tensor(0.).cuda()
            for w in my_model.parameters():
                if w.requires_grad:
                    weight_norm += w.norm().pow(2)
            weight_norm_val = float(weight_norm.detach().cpu())
        n = 1.0
        weight_norm_val_ = weight_norm_val
        while weight_norm_val_ > 1.0:
            n *= 10.0
            weight_norm_val_ /= 10.0
        l2_regularizer_weight = l2_regularizer_weight / n
        info_msg = (f'BIRM is used. The weight norm is {weight_norm_val}, '
                    f'and L2 regularizer weight is {l2_regularizer_weight}.')
        llm_training_logger.info(info_msg)
    else:
        loss_fct = None
        llm_training_logger.info('ERM is used.')
        l2_regularizer_weight = 0.0

    torch.cuda.empty_cache()
    del full_my_data
    for epoch in range(1, MAX_EPOCHS + 1):
        llm_training_logger.info(f'Epoch {epoch} is started.')
        total_training_loss_val = 0.0
        training_fct_loss_val = 0.0
        weight_norm_val = 0.0
        training_penalty_val = 0.0
        my_model.train()
        for iter_idx in trange(1, n_training_batches + 1):
            input_ids, attention_mask, labels, environments, set_of_env = generate_minibatch(
                dataset=tokenized_my_data_for_training,
                categories=all_categories,
                minibatch=minibatch_size,
                padding=my_model_tokenizer.pad_token_id
            )
            res = my_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels, return_dict=True)
            if use_birm:
                train_logits = res.logits
                train_nll = res.loss
                training_fct_loss_val += train_nll.detach().cpu()
                loss_list = []
                for env_ID in set_of_env:
                    ei = (environments == env_ID).view(-1)
                    labels_for_env = labels[ei]
                    logits_for_env = train_logits[ei]
                    shift_logits = logits_for_env[..., :-1, :].contiguous()
                    shift_labels = labels_for_env[..., 1:].contiguous()
                    batch_size, seq_length, vocab_size = shift_logits.shape
                    train_nll_ = loss_fct(
                        shift_logits.view(batch_size * seq_length, vocab_size),
                        shift_labels.view(batch_size * seq_length)
                    )
                    loss_list.append(train_nll_)
                loss_t = torch.stack(loss_list)
                train_penalty = ((loss_t - loss_t.mean()) ** 2).mean()
                training_penalty_val += train_penalty.detach().cpu()
                weight_norm = torch.tensor(0.).cuda()
                for w in my_model.parameters():
                    if w.requires_grad:
                        weight_norm += w.norm().pow(2)
                weight_norm_val += float(weight_norm.detach().cpu())
                loss = train_nll.clone()
                loss += l2_regularizer_weight * weight_norm
                loss += penalty_weight * train_penalty
                total_training_loss_val += loss.detach().cpu()
            else:
                # print(res)
                loss = res.loss
                instant_loss = loss.detach().cpu().numpy()
                total_training_loss_val += instant_loss
            if accumulate_gradients > 1:
                loss = loss / accumulate_gradients
            loss.backward()
            if accumulate_gradients > 1:
                if iter_idx % accumulate_gradients == 0:
                    torch.nn.utils.clip_grad_norm_(my_model.parameters(), 1.0)
                    optimizer.step()
                    optimizer.zero_grad()
            else:
                torch.nn.utils.clip_grad_norm_(my_model.parameters(), 1.0)
                optimizer.step()
                optimizer.zero_grad()
            del input_ids, attention_mask, labels, res, environments, set_of_env
        print(total_training_loss_val)
        total_training_loss_val /= float(n_training_batches / accumulate_gradients)
        training_fct_loss_val /= float(n_training_batches / accumulate_gradients)
        weight_norm_val /= float(n_training_batches / accumulate_gradients)
        training_penalty_val /= float(n_training_batches / accumulate_gradients)
        if use_birm:
            info_msg = (f'Epoch {epoch}: total training loss is {total_training_loss_val}, '
                        f'training cross-entropy is {training_fct_loss_val}, '
                        f'training penalty is {training_penalty_val}, weight norm is {weight_norm_val}.')
        else:
            info_msg = f'Epoch {epoch}: training loss is {total_training_loss_val}.'
        llm_training_logger.info(info_msg)
        my_model.eval()
        validation_true = []
        validation_pred = []
        for category in tqdm(all_categories):
            _, pred_, true_ = predict(my_data_for_testing[category], my_model_tokenizer, my_model, minibatch_size)
            validation_pred += pred_
            validation_true += true_
            del pred_, true_
        new_f1, detailed_validation_report = evaluate(validation_questions, validation_pred, validation_true,
                                                      bert_tokenizer, bert_model, bert_minibatch_size)
        llm_training_logger.info(f'Epoch {epoch}: validation BERT F1 = {new_f1}.')
        if new_f1 > best_f1:
            best_f1 = new_f1
            with codecs.open(report_fname, mode='w', encoding='utf-8') as fp:
                json.dump(
                    obj={
                        'total': {'f1': best_f1},
                        'detailed': detailed_validation_report
                    },
                    fp=fp,
                    ensure_ascii=False,
                    indent=4
                )
            my_model.save_pretrained(output_model_name)
            llm_training_logger.info(f'The model is updated with F1 = {best_f1}.')
        with codecs.open(add_epoch_to_report_fname(report_fname, epoch), mode='w', encoding='utf-8') as fp:
            json.dump(
                obj={
                    'total': {'f1': new_f1},
                    'detailed': detailed_validation_report
                },
                fp=fp,
                ensure_ascii=False,
                indent=4
            )
        del detailed_validation_report, validation_pred, validation_true
        llm_training_logger.info(f'Epoch {epoch} is finished.')

if __name__ == '__main__':
    llm_training_logger.setLevel(logging.INFO)
    fmt_str = '%(filename)s[LINE:%(lineno)d]# %(levelname)-8s ' \
              '[%(asctime)s]  %(message)s'
    formatter = logging.Formatter(fmt_str)
    stdout_handler = logging.StreamHandler(sys.stdout)
    stdout_handler.setFormatter(formatter)
    llm_training_logger.addHandler(stdout_handler)
    file_handler = logging.FileHandler('llm_training.log')
    file_handler.setFormatter(formatter)
    llm_training_logger.addHandler(file_handler)
    main()

Is CUDA supported by this system?  True
CUDA version: 11.8
{'': 0}
2159502336
T5ForConditionalGeneration(
  (shared): Embedding(50364, 1536)
  (encoder): T5Stack(
    (embed_tokens): Embedding(50364, 1536)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear4bit(in_features=1536, out_features=1536, bias=False)
              (k): Linear4bit(in_features=1536, out_features=1536, bias=False)
              (v): Linear4bit(in_features=1536, out_features=1536, bias=False)
              (o): Linear4bit(in_features=1536, out_features=1536, bias=False)
              (relative_attention_bias): Embedding(32, 24)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear4bit(in_features=1536, out_features=4096,

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-c10fb2d1d2b86574/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

1
2495533887.py[LINE:356]# INFO     [2023-12-31 08:04:05,507]  The dataset "training_dataset.jsonl" is loaded. There is 1 split: ['train'].
2495533887.py[LINE:356]# INFO     [2023-12-31 08:04:05,507]  The dataset "training_dataset.jsonl" is loaded. There is 1 split: ['train'].
2495533887.py[LINE:356]# INFO     [2023-12-31 08:04:05,507]  The dataset "training_dataset.jsonl" is loaded. There is 1 split: ['train'].
<enumerate object at 0x78704d396cc0>
2495533887.py[LINE:403]# INFO     [2023-12-31 08:04:05,519]  There are 1 categories in the dataset "training_dataset.jsonl". They are:
2495533887.py[LINE:403]# INFO     [2023-12-31 08:04:05,519]  There are 1 categories in the dataset "training_dataset.jsonl". They are:
2495533887.py[LINE:403]# INFO     [2023-12-31 08:04:05,519]  There are 1 categories in the dataset "training_dataset.jsonl". They are:
2495533887.py[LINE:411]# INFO     [2023-12-31 08:04:05,521]    - creative_writing::    84 samples.
2495533887.py[LINE:411]# INFO     [2023-12-

vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


2495533887.py[LINE:327]# INFO     [2023-12-31 08:04:13,085]  The BERT model DeepPavlov/rubert-base-cased is loaded.
2495533887.py[LINE:327]# INFO     [2023-12-31 08:04:13,085]  The BERT model DeepPavlov/rubert-base-cased is loaded.
2495533887.py[LINE:327]# INFO     [2023-12-31 08:04:13,085]  The BERT model DeepPavlov/rubert-base-cased is loaded.
2495533887.py[LINE:289]# INFO     [2023-12-31 08:04:13,505]  Texts of the category creative_writing are tokenized.
2495533887.py[LINE:289]# INFO     [2023-12-31 08:04:13,505]  Texts of the category creative_writing are tokenized.
2495533887.py[LINE:289]# INFO     [2023-12-31 08:04:13,505]  Texts of the category creative_writing are tokenized.


100%|██████████| 66/66 [00:00<00:00, 1242.58it/s]

2495533887.py[LINE:282]# INFO     [2023-12-31 08:04:13,790]  Trainable parameters: 28311552, all parameters: 1126937088, trainable %: 2.5122566558036645





2495533887.py[LINE:282]# INFO     [2023-12-31 08:04:13,790]  Trainable parameters: 28311552, all parameters: 1126937088, trainable %: 2.5122566558036645
2495533887.py[LINE:282]# INFO     [2023-12-31 08:04:13,790]  Trainable parameters: 28311552, all parameters: 1126937088, trainable %: 2.5122566558036645


100%|██████████| 1/1 [00:32<00:00, 32.35s/it]


2495533887.py[LINE:533]# INFO     [2023-12-31 08:04:46,458]  Before training: validation BERT F1 = 0.4913090169429779.
2495533887.py[LINE:533]# INFO     [2023-12-31 08:04:46,458]  Before training: validation BERT F1 = 0.4913090169429779.
2495533887.py[LINE:533]# INFO     [2023-12-31 08:04:46,458]  Before training: validation BERT F1 = 0.4913090169429779.
2495533887.py[LINE:565]# INFO     [2023-12-31 08:04:46,477]  Iterations per epoch is 33.
2495533887.py[LINE:565]# INFO     [2023-12-31 08:04:46,477]  Iterations per epoch is 33.
2495533887.py[LINE:565]# INFO     [2023-12-31 08:04:46,477]  Iterations per epoch is 33.
2495533887.py[LINE:587]# INFO     [2023-12-31 08:04:46,480]  ERM is used.
2495533887.py[LINE:587]# INFO     [2023-12-31 08:04:46,480]  ERM is used.
2495533887.py[LINE:587]# INFO     [2023-12-31 08:04:46,480]  ERM is used.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:04:46,489]  Epoch 1 is started.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:04:46,489]  Epoch 1 is s

100%|██████████| 33/33 [00:15<00:00,  2.19it/s]

329.89453125
2495533887.py[LINE:665]# INFO     [2023-12-31 08:05:01,570]  Epoch 1: training loss is 9.996803977272727.
2495533887.py[LINE:665]# INFO     [2023-12-31 08:05:01,570]  Epoch 1: training loss is 9.996803977272727.
2495533887.py[LINE:665]# INFO     [2023-12-31 08:05:01,570]  Epoch 1: training loss is 9.996803977272727.



100%|██████████| 1/1 [00:19<00:00, 19.48s/it]


2495533887.py[LINE:676]# INFO     [2023-12-31 08:05:21,338]  Epoch 1: validation BERT F1 = 0.5036631955040826.
2495533887.py[LINE:676]# INFO     [2023-12-31 08:05:21,338]  Epoch 1: validation BERT F1 = 0.5036631955040826.
2495533887.py[LINE:676]# INFO     [2023-12-31 08:05:21,338]  Epoch 1: validation BERT F1 = 0.5036631955040826.
2495533887.py[LINE:690]# INFO     [2023-12-31 08:05:21,604]  The model is updated with F1 = 0.5036631955040826.
2495533887.py[LINE:690]# INFO     [2023-12-31 08:05:21,604]  The model is updated with F1 = 0.5036631955040826.
2495533887.py[LINE:690]# INFO     [2023-12-31 08:05:21,604]  The model is updated with F1 = 0.5036631955040826.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:05:21,608]  Epoch 1 is finished.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:05:21,608]  Epoch 1 is finished.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:05:21,608]  Epoch 1 is finished.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:05:21,611]  Epoch 2 is started.
2495

100%|██████████| 33/33 [00:15<00:00,  2.11it/s]

213.19140625
2495533887.py[LINE:665]# INFO     [2023-12-31 08:05:37,259]  Epoch 2: training loss is 6.460345643939394.
2495533887.py[LINE:665]# INFO     [2023-12-31 08:05:37,259]  Epoch 2: training loss is 6.460345643939394.
2495533887.py[LINE:665]# INFO     [2023-12-31 08:05:37,259]  Epoch 2: training loss is 6.460345643939394.



100%|██████████| 1/1 [00:29<00:00, 29.99s/it]


2495533887.py[LINE:676]# INFO     [2023-12-31 08:06:07,546]  Epoch 2: validation BERT F1 = 0.5294411778450012.
2495533887.py[LINE:676]# INFO     [2023-12-31 08:06:07,546]  Epoch 2: validation BERT F1 = 0.5294411778450012.
2495533887.py[LINE:676]# INFO     [2023-12-31 08:06:07,546]  Epoch 2: validation BERT F1 = 0.5294411778450012.
2495533887.py[LINE:690]# INFO     [2023-12-31 08:06:07,819]  The model is updated with F1 = 0.5294411778450012.
2495533887.py[LINE:690]# INFO     [2023-12-31 08:06:07,819]  The model is updated with F1 = 0.5294411778450012.
2495533887.py[LINE:690]# INFO     [2023-12-31 08:06:07,819]  The model is updated with F1 = 0.5294411778450012.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:06:07,824]  Epoch 2 is finished.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:06:07,824]  Epoch 2 is finished.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:06:07,824]  Epoch 2 is finished.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:06:07,827]  Epoch 3 is started.
2495

100%|██████████| 33/33 [00:14<00:00,  2.26it/s]

150.296875
2495533887.py[LINE:665]# INFO     [2023-12-31 08:06:22,444]  Epoch 3: training loss is 4.554450757575758.
2495533887.py[LINE:665]# INFO     [2023-12-31 08:06:22,444]  Epoch 3: training loss is 4.554450757575758.
2495533887.py[LINE:665]# INFO     [2023-12-31 08:06:22,444]  Epoch 3: training loss is 4.554450757575758.



100%|██████████| 1/1 [01:24<00:00, 84.88s/it]


2495533887.py[LINE:676]# INFO     [2023-12-31 08:07:47,618]  Epoch 3: validation BERT F1 = 0.5445684194564819.
2495533887.py[LINE:676]# INFO     [2023-12-31 08:07:47,618]  Epoch 3: validation BERT F1 = 0.5445684194564819.
2495533887.py[LINE:676]# INFO     [2023-12-31 08:07:47,618]  Epoch 3: validation BERT F1 = 0.5445684194564819.
2495533887.py[LINE:690]# INFO     [2023-12-31 08:07:47,884]  The model is updated with F1 = 0.5445684194564819.
2495533887.py[LINE:690]# INFO     [2023-12-31 08:07:47,884]  The model is updated with F1 = 0.5445684194564819.
2495533887.py[LINE:690]# INFO     [2023-12-31 08:07:47,884]  The model is updated with F1 = 0.5445684194564819.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:07:47,889]  Epoch 3 is finished.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:07:47,889]  Epoch 3 is finished.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:07:47,889]  Epoch 3 is finished.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:07:47,891]  Epoch 4 is started.
2495

100%|██████████| 33/33 [00:14<00:00,  2.28it/s]

127.845703125
2495533887.py[LINE:665]# INFO     [2023-12-31 08:08:02,391]  Epoch 4: training loss is 3.874112215909091.
2495533887.py[LINE:665]# INFO     [2023-12-31 08:08:02,391]  Epoch 4: training loss is 3.874112215909091.
2495533887.py[LINE:665]# INFO     [2023-12-31 08:08:02,391]  Epoch 4: training loss is 3.874112215909091.



100%|██████████| 1/1 [00:20<00:00, 20.07s/it]


2495533887.py[LINE:676]# INFO     [2023-12-31 08:08:22,752]  Epoch 4: validation BERT F1 = 0.5375705460707346.
2495533887.py[LINE:676]# INFO     [2023-12-31 08:08:22,752]  Epoch 4: validation BERT F1 = 0.5375705460707346.
2495533887.py[LINE:676]# INFO     [2023-12-31 08:08:22,752]  Epoch 4: validation BERT F1 = 0.5375705460707346.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:08:22,756]  Epoch 4 is finished.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:08:22,756]  Epoch 4 is finished.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:08:22,756]  Epoch 4 is finished.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:08:22,759]  Epoch 5 is started.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:08:22,759]  Epoch 5 is started.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:08:22,759]  Epoch 5 is started.


100%|██████████| 33/33 [00:14<00:00,  2.31it/s]

118.810546875
2495533887.py[LINE:665]# INFO     [2023-12-31 08:08:37,052]  Epoch 5: training loss is 3.600319602272727.
2495533887.py[LINE:665]# INFO     [2023-12-31 08:08:37,052]  Epoch 5: training loss is 3.600319602272727.
2495533887.py[LINE:665]# INFO     [2023-12-31 08:08:37,052]  Epoch 5: training loss is 3.600319602272727.



100%|██████████| 1/1 [00:19<00:00, 19.73s/it]


2495533887.py[LINE:676]# INFO     [2023-12-31 08:08:57,069]  Epoch 5: validation BERT F1 = 0.5412630389134089.
2495533887.py[LINE:676]# INFO     [2023-12-31 08:08:57,069]  Epoch 5: validation BERT F1 = 0.5412630389134089.
2495533887.py[LINE:676]# INFO     [2023-12-31 08:08:57,069]  Epoch 5: validation BERT F1 = 0.5412630389134089.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:08:57,073]  Epoch 5 is finished.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:08:57,073]  Epoch 5 is finished.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:08:57,073]  Epoch 5 is finished.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:08:57,075]  Epoch 6 is started.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:08:57,075]  Epoch 6 is started.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:08:57,075]  Epoch 6 is started.


100%|██████████| 33/33 [00:14<00:00,  2.25it/s]

116.796875
2495533887.py[LINE:665]# INFO     [2023-12-31 08:09:11,768]  Epoch 6: training loss is 3.539299242424242.
2495533887.py[LINE:665]# INFO     [2023-12-31 08:09:11,768]  Epoch 6: training loss is 3.539299242424242.
2495533887.py[LINE:665]# INFO     [2023-12-31 08:09:11,768]  Epoch 6: training loss is 3.539299242424242.



100%|██████████| 1/1 [00:20<00:00, 20.32s/it]


2495533887.py[LINE:676]# INFO     [2023-12-31 08:09:32,390]  Epoch 6: validation BERT F1 = 0.5342353714836968.
2495533887.py[LINE:676]# INFO     [2023-12-31 08:09:32,390]  Epoch 6: validation BERT F1 = 0.5342353714836968.
2495533887.py[LINE:676]# INFO     [2023-12-31 08:09:32,390]  Epoch 6: validation BERT F1 = 0.5342353714836968.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:09:32,394]  Epoch 6 is finished.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:09:32,394]  Epoch 6 is finished.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:09:32,394]  Epoch 6 is finished.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:09:32,397]  Epoch 7 is started.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:09:32,397]  Epoch 7 is started.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:09:32,397]  Epoch 7 is started.


100%|██████████| 33/33 [00:14<00:00,  2.29it/s]

110.8701171875
2495533887.py[LINE:665]# INFO     [2023-12-31 08:09:46,796]  Epoch 7: training loss is 3.3597005208333335.
2495533887.py[LINE:665]# INFO     [2023-12-31 08:09:46,796]  Epoch 7: training loss is 3.3597005208333335.
2495533887.py[LINE:665]# INFO     [2023-12-31 08:09:46,796]  Epoch 7: training loss is 3.3597005208333335.



100%|██████████| 1/1 [00:25<00:00, 25.25s/it]


2495533887.py[LINE:676]# INFO     [2023-12-31 08:10:12,349]  Epoch 7: validation BERT F1 = 0.5384660065174103.
2495533887.py[LINE:676]# INFO     [2023-12-31 08:10:12,349]  Epoch 7: validation BERT F1 = 0.5384660065174103.
2495533887.py[LINE:676]# INFO     [2023-12-31 08:10:12,349]  Epoch 7: validation BERT F1 = 0.5384660065174103.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:10:12,354]  Epoch 7 is finished.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:10:12,354]  Epoch 7 is finished.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:10:12,354]  Epoch 7 is finished.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:10:12,356]  Epoch 8 is started.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:10:12,356]  Epoch 8 is started.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:10:12,356]  Epoch 8 is started.


100%|██████████| 33/33 [00:14<00:00,  2.34it/s]

103.16796875
2495533887.py[LINE:665]# INFO     [2023-12-31 08:10:26,490]  Epoch 8: training loss is 3.1263020833333335.
2495533887.py[LINE:665]# INFO     [2023-12-31 08:10:26,490]  Epoch 8: training loss is 3.1263020833333335.
2495533887.py[LINE:665]# INFO     [2023-12-31 08:10:26,490]  Epoch 8: training loss is 3.1263020833333335.



100%|██████████| 1/1 [00:32<00:00, 32.26s/it]


2495533887.py[LINE:676]# INFO     [2023-12-31 08:10:59,059]  Epoch 8: validation BERT F1 = 0.5537660006019804.
2495533887.py[LINE:676]# INFO     [2023-12-31 08:10:59,059]  Epoch 8: validation BERT F1 = 0.5537660006019804.
2495533887.py[LINE:676]# INFO     [2023-12-31 08:10:59,059]  Epoch 8: validation BERT F1 = 0.5537660006019804.
2495533887.py[LINE:690]# INFO     [2023-12-31 08:10:59,330]  The model is updated with F1 = 0.5537660006019804.
2495533887.py[LINE:690]# INFO     [2023-12-31 08:10:59,330]  The model is updated with F1 = 0.5537660006019804.
2495533887.py[LINE:690]# INFO     [2023-12-31 08:10:59,330]  The model is updated with F1 = 0.5537660006019804.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:10:59,335]  Epoch 8 is finished.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:10:59,335]  Epoch 8 is finished.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:10:59,335]  Epoch 8 is finished.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:10:59,337]  Epoch 9 is started.
2495

100%|██████████| 33/33 [00:14<00:00,  2.28it/s]

106.7216796875
2495533887.py[LINE:665]# INFO     [2023-12-31 08:11:13,818]  Epoch 9: training loss is 3.233990293560606.
2495533887.py[LINE:665]# INFO     [2023-12-31 08:11:13,818]  Epoch 9: training loss is 3.233990293560606.
2495533887.py[LINE:665]# INFO     [2023-12-31 08:11:13,818]  Epoch 9: training loss is 3.233990293560606.



100%|██████████| 1/1 [01:02<00:00, 62.84s/it]


2495533887.py[LINE:676]# INFO     [2023-12-31 08:12:16,966]  Epoch 9: validation BERT F1 = 0.5426027145650651.
2495533887.py[LINE:676]# INFO     [2023-12-31 08:12:16,966]  Epoch 9: validation BERT F1 = 0.5426027145650651.
2495533887.py[LINE:676]# INFO     [2023-12-31 08:12:16,966]  Epoch 9: validation BERT F1 = 0.5426027145650651.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:12:16,970]  Epoch 9 is finished.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:12:16,970]  Epoch 9 is finished.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:12:16,970]  Epoch 9 is finished.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:12:16,972]  Epoch 10 is started.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:12:16,972]  Epoch 10 is started.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:12:16,972]  Epoch 10 is started.


100%|██████████| 33/33 [00:14<00:00,  2.25it/s]

104.2607421875
2495533887.py[LINE:665]# INFO     [2023-12-31 08:12:31,657]  Epoch 10: training loss is 3.159416429924242.
2495533887.py[LINE:665]# INFO     [2023-12-31 08:12:31,657]  Epoch 10: training loss is 3.159416429924242.
2495533887.py[LINE:665]# INFO     [2023-12-31 08:12:31,657]  Epoch 10: training loss is 3.159416429924242.



100%|██████████| 1/1 [00:23<00:00, 23.58s/it]


2495533887.py[LINE:676]# INFO     [2023-12-31 08:12:55,531]  Epoch 10: validation BERT F1 = 0.5253857788112428.
2495533887.py[LINE:676]# INFO     [2023-12-31 08:12:55,531]  Epoch 10: validation BERT F1 = 0.5253857788112428.
2495533887.py[LINE:676]# INFO     [2023-12-31 08:12:55,531]  Epoch 10: validation BERT F1 = 0.5253857788112428.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:12:55,535]  Epoch 10 is finished.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:12:55,535]  Epoch 10 is finished.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:12:55,535]  Epoch 10 is finished.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:12:55,538]  Epoch 11 is started.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:12:55,538]  Epoch 11 is started.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:12:55,538]  Epoch 11 is started.


100%|██████████| 33/33 [00:14<00:00,  2.30it/s]

95.9775390625
2495533887.py[LINE:665]# INFO     [2023-12-31 08:13:09,927]  Epoch 11: training loss is 2.908410274621212.
2495533887.py[LINE:665]# INFO     [2023-12-31 08:13:09,927]  Epoch 11: training loss is 2.908410274621212.
2495533887.py[LINE:665]# INFO     [2023-12-31 08:13:09,927]  Epoch 11: training loss is 2.908410274621212.



100%|██████████| 1/1 [00:25<00:00, 25.58s/it]


2495533887.py[LINE:676]# INFO     [2023-12-31 08:13:35,806]  Epoch 11: validation BERT F1 = 0.5327060719331106.
2495533887.py[LINE:676]# INFO     [2023-12-31 08:13:35,806]  Epoch 11: validation BERT F1 = 0.5327060719331106.
2495533887.py[LINE:676]# INFO     [2023-12-31 08:13:35,806]  Epoch 11: validation BERT F1 = 0.5327060719331106.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:13:35,810]  Epoch 11 is finished.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:13:35,810]  Epoch 11 is finished.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:13:35,810]  Epoch 11 is finished.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:13:35,813]  Epoch 12 is started.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:13:35,813]  Epoch 12 is started.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:13:35,813]  Epoch 12 is started.


100%|██████████| 33/33 [00:14<00:00,  2.30it/s]

94.1865234375
2495533887.py[LINE:665]# INFO     [2023-12-31 08:13:50,200]  Epoch 12: training loss is 2.8541370738636362.
2495533887.py[LINE:665]# INFO     [2023-12-31 08:13:50,200]  Epoch 12: training loss is 2.8541370738636362.
2495533887.py[LINE:665]# INFO     [2023-12-31 08:13:50,200]  Epoch 12: training loss is 2.8541370738636362.



100%|██████████| 1/1 [00:22<00:00, 22.92s/it]


2495533887.py[LINE:676]# INFO     [2023-12-31 08:14:13,402]  Epoch 12: validation BERT F1 = 0.5406846387518777.
2495533887.py[LINE:676]# INFO     [2023-12-31 08:14:13,402]  Epoch 12: validation BERT F1 = 0.5406846387518777.
2495533887.py[LINE:676]# INFO     [2023-12-31 08:14:13,402]  Epoch 12: validation BERT F1 = 0.5406846387518777.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:14:13,406]  Epoch 12 is finished.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:14:13,406]  Epoch 12 is finished.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:14:13,406]  Epoch 12 is finished.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:14:13,408]  Epoch 13 is started.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:14:13,408]  Epoch 13 is started.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:14:13,408]  Epoch 13 is started.


100%|██████████| 33/33 [00:14<00:00,  2.29it/s]

92.0888671875
2495533887.py[LINE:665]# INFO     [2023-12-31 08:14:27,868]  Epoch 13: training loss is 2.7905717329545454.
2495533887.py[LINE:665]# INFO     [2023-12-31 08:14:27,868]  Epoch 13: training loss is 2.7905717329545454.
2495533887.py[LINE:665]# INFO     [2023-12-31 08:14:27,868]  Epoch 13: training loss is 2.7905717329545454.



100%|██████████| 1/1 [00:30<00:00, 30.01s/it]


2495533887.py[LINE:676]# INFO     [2023-12-31 08:14:58,185]  Epoch 13: validation BERT F1 = 0.5407669610447354.
2495533887.py[LINE:676]# INFO     [2023-12-31 08:14:58,185]  Epoch 13: validation BERT F1 = 0.5407669610447354.
2495533887.py[LINE:676]# INFO     [2023-12-31 08:14:58,185]  Epoch 13: validation BERT F1 = 0.5407669610447354.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:14:58,189]  Epoch 13 is finished.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:14:58,189]  Epoch 13 is finished.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:14:58,189]  Epoch 13 is finished.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:14:58,192]  Epoch 14 is started.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:14:58,192]  Epoch 14 is started.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:14:58,192]  Epoch 14 is started.


100%|██████████| 33/33 [00:15<00:00,  2.19it/s]

93.255859375
2495533887.py[LINE:665]# INFO     [2023-12-31 08:15:13,255]  Epoch 14: training loss is 2.825935132575758.
2495533887.py[LINE:665]# INFO     [2023-12-31 08:15:13,255]  Epoch 14: training loss is 2.825935132575758.
2495533887.py[LINE:665]# INFO     [2023-12-31 08:15:13,255]  Epoch 14: training loss is 2.825935132575758.



100%|██████████| 1/1 [00:19<00:00, 19.42s/it]


2495533887.py[LINE:676]# INFO     [2023-12-31 08:15:32,976]  Epoch 14: validation BERT F1 = 0.5323542141252093.
2495533887.py[LINE:676]# INFO     [2023-12-31 08:15:32,976]  Epoch 14: validation BERT F1 = 0.5323542141252093.
2495533887.py[LINE:676]# INFO     [2023-12-31 08:15:32,976]  Epoch 14: validation BERT F1 = 0.5323542141252093.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:15:32,981]  Epoch 14 is finished.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:15:32,981]  Epoch 14 is finished.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:15:32,981]  Epoch 14 is finished.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:15:32,983]  Epoch 15 is started.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:15:32,983]  Epoch 15 is started.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:15:32,983]  Epoch 15 is started.


100%|██████████| 33/33 [00:14<00:00,  2.26it/s]

91.958984375
2495533887.py[LINE:665]# INFO     [2023-12-31 08:15:47,576]  Epoch 15: training loss is 2.786635890151515.
2495533887.py[LINE:665]# INFO     [2023-12-31 08:15:47,576]  Epoch 15: training loss is 2.786635890151515.
2495533887.py[LINE:665]# INFO     [2023-12-31 08:15:47,576]  Epoch 15: training loss is 2.786635890151515.



100%|██████████| 1/1 [00:29<00:00, 29.40s/it]


2495533887.py[LINE:676]# INFO     [2023-12-31 08:16:17,284]  Epoch 15: validation BERT F1 = 0.5374505834447013.
2495533887.py[LINE:676]# INFO     [2023-12-31 08:16:17,284]  Epoch 15: validation BERT F1 = 0.5374505834447013.
2495533887.py[LINE:676]# INFO     [2023-12-31 08:16:17,284]  Epoch 15: validation BERT F1 = 0.5374505834447013.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:16:17,288]  Epoch 15 is finished.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:16:17,288]  Epoch 15 is finished.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:16:17,288]  Epoch 15 is finished.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:16:17,290]  Epoch 16 is started.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:16:17,290]  Epoch 16 is started.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:16:17,290]  Epoch 16 is started.


100%|██████████| 33/33 [00:14<00:00,  2.26it/s]

80.2998046875
2495533887.py[LINE:665]# INFO     [2023-12-31 08:16:31,883]  Epoch 16: training loss is 2.433327414772727.
2495533887.py[LINE:665]# INFO     [2023-12-31 08:16:31,883]  Epoch 16: training loss is 2.433327414772727.
2495533887.py[LINE:665]# INFO     [2023-12-31 08:16:31,883]  Epoch 16: training loss is 2.433327414772727.



100%|██████████| 1/1 [00:25<00:00, 25.42s/it]


2495533887.py[LINE:676]# INFO     [2023-12-31 08:16:57,590]  Epoch 16: validation BERT F1 = 0.530038156443172.
2495533887.py[LINE:676]# INFO     [2023-12-31 08:16:57,590]  Epoch 16: validation BERT F1 = 0.530038156443172.
2495533887.py[LINE:676]# INFO     [2023-12-31 08:16:57,590]  Epoch 16: validation BERT F1 = 0.530038156443172.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:16:57,594]  Epoch 16 is finished.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:16:57,594]  Epoch 16 is finished.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:16:57,594]  Epoch 16 is finished.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:16:57,596]  Epoch 17 is started.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:16:57,596]  Epoch 17 is started.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:16:57,596]  Epoch 17 is started.


100%|██████████| 33/33 [00:14<00:00,  2.31it/s]

75.474609375
2495533887.py[LINE:665]# INFO     [2023-12-31 08:17:11,909]  Epoch 17: training loss is 2.287109375.
2495533887.py[LINE:665]# INFO     [2023-12-31 08:17:11,909]  Epoch 17: training loss is 2.287109375.
2495533887.py[LINE:665]# INFO     [2023-12-31 08:17:11,909]  Epoch 17: training loss is 2.287109375.



100%|██████████| 1/1 [00:34<00:00, 34.20s/it]


2495533887.py[LINE:676]# INFO     [2023-12-31 08:17:46,416]  Epoch 17: validation BERT F1 = 0.5336063769128587.
2495533887.py[LINE:676]# INFO     [2023-12-31 08:17:46,416]  Epoch 17: validation BERT F1 = 0.5336063769128587.
2495533887.py[LINE:676]# INFO     [2023-12-31 08:17:46,416]  Epoch 17: validation BERT F1 = 0.5336063769128587.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:17:46,420]  Epoch 17 is finished.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:17:46,420]  Epoch 17 is finished.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:17:46,420]  Epoch 17 is finished.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:17:46,422]  Epoch 18 is started.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:17:46,422]  Epoch 18 is started.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:17:46,422]  Epoch 18 is started.


100%|██████████| 33/33 [00:14<00:00,  2.29it/s]

74.1923828125
2495533887.py[LINE:665]# INFO     [2023-12-31 08:18:00,839]  Epoch 18: training loss is 2.248254024621212.
2495533887.py[LINE:665]# INFO     [2023-12-31 08:18:00,839]  Epoch 18: training loss is 2.248254024621212.
2495533887.py[LINE:665]# INFO     [2023-12-31 08:18:00,839]  Epoch 18: training loss is 2.248254024621212.



100%|██████████| 1/1 [00:46<00:00, 46.94s/it]


2495533887.py[LINE:676]# INFO     [2023-12-31 08:18:48,091]  Epoch 18: validation BERT F1 = 0.5326210475630231.
2495533887.py[LINE:676]# INFO     [2023-12-31 08:18:48,091]  Epoch 18: validation BERT F1 = 0.5326210475630231.
2495533887.py[LINE:676]# INFO     [2023-12-31 08:18:48,091]  Epoch 18: validation BERT F1 = 0.5326210475630231.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:18:48,095]  Epoch 18 is finished.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:18:48,095]  Epoch 18 is finished.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:18:48,095]  Epoch 18 is finished.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:18:48,098]  Epoch 19 is started.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:18:48,098]  Epoch 19 is started.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:18:48,098]  Epoch 19 is started.


100%|██████████| 33/33 [00:14<00:00,  2.28it/s]

72.8681640625
2495533887.py[LINE:665]# INFO     [2023-12-31 08:19:02,617]  Epoch 19: training loss is 2.208126183712121.
2495533887.py[LINE:665]# INFO     [2023-12-31 08:19:02,617]  Epoch 19: training loss is 2.208126183712121.
2495533887.py[LINE:665]# INFO     [2023-12-31 08:19:02,617]  Epoch 19: training loss is 2.208126183712121.



100%|██████████| 1/1 [00:14<00:00, 14.05s/it]


2495533887.py[LINE:676]# INFO     [2023-12-31 08:19:16,956]  Epoch 19: validation BERT F1 = 0.5121606720818414.
2495533887.py[LINE:676]# INFO     [2023-12-31 08:19:16,956]  Epoch 19: validation BERT F1 = 0.5121606720818414.
2495533887.py[LINE:676]# INFO     [2023-12-31 08:19:16,956]  Epoch 19: validation BERT F1 = 0.5121606720818414.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:19:16,960]  Epoch 19 is finished.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:19:16,960]  Epoch 19 is finished.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:19:16,960]  Epoch 19 is finished.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:19:16,962]  Epoch 20 is started.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:19:16,962]  Epoch 20 is started.
2495533887.py[LINE:593]# INFO     [2023-12-31 08:19:16,962]  Epoch 20 is started.


100%|██████████| 33/33 [00:14<00:00,  2.28it/s]

77.623046875
2495533887.py[LINE:665]# INFO     [2023-12-31 08:19:31,481]  Epoch 20: training loss is 2.3522135416666665.
2495533887.py[LINE:665]# INFO     [2023-12-31 08:19:31,481]  Epoch 20: training loss is 2.3522135416666665.
2495533887.py[LINE:665]# INFO     [2023-12-31 08:19:31,481]  Epoch 20: training loss is 2.3522135416666665.



100%|██████████| 1/1 [00:25<00:00, 25.68s/it]


2495533887.py[LINE:676]# INFO     [2023-12-31 08:19:57,484]  Epoch 20: validation BERT F1 = 0.5260635763406754.
2495533887.py[LINE:676]# INFO     [2023-12-31 08:19:57,484]  Epoch 20: validation BERT F1 = 0.5260635763406754.
2495533887.py[LINE:676]# INFO     [2023-12-31 08:19:57,484]  Epoch 20: validation BERT F1 = 0.5260635763406754.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:19:57,489]  Epoch 20 is finished.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:19:57,489]  Epoch 20 is finished.
2495533887.py[LINE:702]# INFO     [2023-12-31 08:19:57,489]  Epoch 20 is finished.
