# **BERT MODEL TRAINING**

## **Prepare and split the extracted data for Training**

In [None]:
# Run the data_split.py script stored on Google Drive.
# NOTE(review): assumes Drive is already mounted at /content/drive — no mount
# call is visible before this cell; confirm.
!python "/content/drive/MyDrive/data_split.py"

In [None]:
import pandas as pd

# read_csv already returns a DataFrame, so it is used directly instead of
# being wrapped in a second pd.DataFrame(...) call.
data = pd.read_csv("/content/Dta.csv")

In [None]:
# Inspect which columns the loaded CSV provides.
data.columns

Index(['Unnamed: 0', 'hadm_id', 'input_text', 'output_text'], dtype='object')

In [None]:
# Switch for the Colab-only steps below (package installation and the
# google.colab import).
is_colab = True

In [None]:
# Install the third-party dependencies when running on Colab.
if is_colab:
    !pip install datasets
    !pip install rouge_score
    # NOTE(review): transformers is pinned to 4.5.0 here, but later cells run
    # `pip install transformers` again without a pin — confirm which version
    # is actually intended.
    !pip install transformers==4.5.0

Collecting datasets
  Downloading datasets-2.14.1-py3-none-any.whl (492 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.4/492.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed datasets-2.14.

In [None]:
# NOTE(review): unpinned install; an earlier cell pins transformers==4.5.0 —
# confirm which version is intended and keep a single install.
!pip install transformers



In [None]:
import os

import datasets
from datasets import load_dataset, load_metric
import numpy as np
import pandas as pd
import torch
from transformers import (BertTokenizerFast, EncoderDecoderModel,
                          Seq2SeqTrainer, Seq2SeqTrainingArguments)

if is_colab:
    from google.colab import drive

In [None]:
# NOTE(review): `data_folder` points at a single CSV file and the train/val/test
# paths all resolve to it, so the three datasets below are loaded from the
# same file — confirm this is intended rather than separate per-split files.
data_folder = "/content/mimic-iii_discharge_summary.csv"
train_path = val_path = test_path = os.path.join(data_folder)

# Each dataset is read with the csv loader, requesting its 'train' split.
train_data, val_data, test_data = (
    load_dataset('csv', data_files=csv_path, split='train')
    for csv_path in (train_path, val_path, test_path)
)

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

## **Tokenize and preprocess the data (text --> token IDs) to feed into the model**

In [None]:
def map_to_length(x):
    """Annotate one example with token counts and length-threshold flags.

    Uses the module-level ``tokenizer``. Adds ``text_len`` and
    ``summary_len`` plus 0/1 flags telling whether the text is longer than
    128 / 256 tokens and the summary longer than 64 / 128 tokens. The example
    is modified in place and also returned.
    """
    n_text_tokens = len(tokenizer(x["text"]).input_ids)
    x["text_len"] = n_text_tokens
    x["text_longer_128"] = int(n_text_tokens > 128)
    x["text_longer_256"] = int(n_text_tokens > 256)

    n_summary_tokens = len(tokenizer(x["summary"]).input_ids)
    x["summary_len"] = n_summary_tokens
    x["summary_longer_64"] = int(n_summary_tokens > 64)
    x["summary_longer_128"] = int(n_summary_tokens > 128)
    return x


def compute_and_print_stats(x, sample_size=10000):
    """Print length statistics for a batch of exactly ``sample_size`` rows.

    ``x`` maps the column names written by ``map_to_length`` to lists of
    per-example values. Means and percentages are computed against
    ``sample_size``. Nothing is printed (and ``None`` is returned) when the
    batch holds a different number of rows.
    """
    if len(x["summary_len"]) != sample_size:
        return

    def mean(column):
        # Average of one column over the expected sample size.
        return sum(x[column]) / sample_size

    report = (
        "Text Mean: {:.3f}, %-Text > 128: {:.3f}, %-Text > 256: {:.3f}\n"
        "Summary Mean: {:.3f}, %-Summary > 64: {:.3f}, %-Summary > 128: {:.3f}"
    )
    print(report.format(
        mean("text_len"),
        mean("text_longer_128") * 100,
        mean("text_longer_256") * 100,
        mean("summary_len"),
        mean("summary_longer_64") * 100,
        mean("summary_longer_128") * 100,
    ))


def tokenize_batch(batch, enc_max_len=256, dec_max_len=128):
    """Tokenize a batch of text/summary pairs into model input columns.

    Uses the module-level ``tokenizer``. ``batch["text"]`` is padded/truncated
    to ``enc_max_len`` tokens and ``batch["summary"]`` to ``dec_max_len``
    tokens. The batch is extended in place with ``input_ids``,
    ``attention_mask``, ``decoder_input_ids``, ``decoder_attention_mask`` and
    ``labels``, and returned.
    """
    pad_id = tokenizer.pad_token_id
    shared_kwargs = dict(padding="max_length", truncation=True)
    source = tokenizer(batch["text"], max_length=enc_max_len, **shared_kwargs)
    target = tokenizer(batch["summary"], max_length=dec_max_len, **shared_kwargs)

    batch["input_ids"] = source.input_ids
    batch["attention_mask"] = source.attention_mask
    # NOTE(review): recent transformers releases build decoder_input_ids from
    # `labels` for EncoderDecoderModel; passing them explicitly may not be
    # intended there — confirm against the installed version.
    batch["decoder_input_ids"] = target.input_ids
    batch["decoder_attention_mask"] = target.attention_mask

    # Labels mirror the summary ids, with PAD positions replaced by -100 so
    # they are ignored.
    batch["labels"] = [
        [-100 if token == pad_id else token for token in sequence]
        for sequence in target.input_ids
    ]
    return batch


def tokenize_data(data):
    """Tokenize a dataset and expose only the model inputs as torch tensors.

    Applies ``tokenize_batch`` in batches of 16, drops the ``study_id``,
    ``subject_id``, ``text`` and ``summary`` columns, and sets the output
    format of the mapped dataset to ``torch`` for the five model columns.
    Returns the mapped dataset.
    """
    raw_columns = ["study_id", "subject_id", "text", "summary"]
    model_columns = [
        "input_ids", "attention_mask",
        "decoder_input_ids", "decoder_attention_mask",
        "labels",
    ]

    tokenized = data.map(
        tokenize_batch,
        batched=True,
        batch_size=16,
        remove_columns=raw_columns,
    )
    tokenized.set_format(type="torch", columns=model_columns)
    return tokenized


In [None]:
# Checkpoint whose tokenizer files are loaded.
tokenizer_name = "dmis-lab/biobert-base-cased-v1.1"

# Module-level tokenizer: map_to_length, tokenize_batch and compute_metrics
# all look it up by this global name.
tokenizer = BertTokenizerFast.from_pretrained(tokenizer_name)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

In [None]:
# Read the column names straight from the Dataset instead of materialising
# every row into a pandas DataFrame just to inspect its header.
train_data.column_names

Index(['Unnamed: 0', 'hadm_id', 'input_text', 'output_text'], dtype='object')

In [None]:
# Map the CSV headers onto the column names that tokenize_data expects
# (it tokenizes `text`/`summary` and removes `study_id`/`subject_id`).
train_data = train_data.rename_columns({
    'Unnamed: 0': 'subject_id',
    'hadm_id': 'study_id',
    'input_text': 'text',
    'output_text': 'summary',
})

In [None]:
# Tokenize the renamed training rows; tokenize_data removes the raw columns
# and formats the model input columns as torch tensors.
train_data_tokenized = tokenize_data(train_data)

Map:   0%|          | 0/33298 [00:00<?, ? examples/s]

In [None]:
# Inspect the tokenized dataset's features and row count.
train_data_tokenized

Dataset({
    features: ['input_ids', 'attention_mask', 'decoder_input_ids', 'decoder_attention_mask', 'labels'],
    num_rows: 33298
})

In [None]:

total_rows = train_data_tokenized.num_rows

# 10% subset
subset_size = int(total_rows * 0.1)

# first 10% of the data
train_data_subset = train_data_tokenized.select(range(subset_size))

# train_test_split returns a new DatasetDict and leaves train_data_subset
# untouched, so the result is kept in a name instead of being discarded.
# The fixed seed makes the 90/10 split reproducible across runs.
train_data_split = train_data_subset.train_test_split(test_size=0.1, seed=42)
train_data_split

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'decoder_input_ids', 'decoder_attention_mask', 'labels'],
        num_rows: 898
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'decoder_input_ids', 'decoder_attention_mask', 'labels'],
        num_rows: 100
    })
})

In [None]:
# train_test_split does not modify the dataset it is called on, so the subset
# still holds every selected row.
train_data_subset

Dataset({
    features: ['input_ids', 'attention_mask', 'decoder_input_ids', 'decoder_attention_mask', 'labels'],
    num_rows: 998
})

In [None]:
# Map the CSV headers onto the column names that tokenize_data expects
# (it tokenizes `text`/`summary` and removes `study_id`/`subject_id`).
val_data = val_data.rename_columns({
    'Unnamed: 0': 'subject_id',
    'hadm_id': 'study_id',
    'input_text': 'text',
    'output_text': 'summary',
})

In [None]:
# Tokenize the renamed validation rows with the same helper used for training.
val_data_tokenized = tokenize_data(val_data)

Map:   0%|          | 0/33298 [00:00<?, ? examples/s]

In [None]:
# Use the last 10% of the tokenized rows as the validation subset.
total_rows = val_data_tokenized.num_rows
val_subset_size = int(total_rows * 0.1)
val_subset_start = total_rows - val_subset_size
val_data_subset = val_data_tokenized.select(
    range(val_subset_start, total_rows)
)


## **Build the Model**

In [None]:
# Encoder and decoder are both initialised from the same BioBERT checkpoint.
enc_name = "dmis-lab/biobert-base-cased-v1.1"
dec_name = enc_name

# Passed through to from_encoder_decoder_pretrained when the model is built.
tie_encoder_decoder = False

model_name = "biobert2biobert"

In [None]:
# Assemble the encoder-decoder model from the two pretrained checkpoints.
model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    enc_name,
    dec_name,
    tie_encoder_decoder=tie_encoder_decoder,
)

# Special tokens: CLS is used as the decoder start token, SEP as
# end-of-sequence, and the tokenizer's PAD id as padding.
model.config.pad_token_id = tokenizer.pad_token_id
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.eos_token_id = tokenizer.sep_token_id
model.config.vocab_size = model.config.encoder.vocab_size

# Generation settings stored on the model config.
model.config.num_beams = 4
model.config.min_length = 56
model.config.max_length = 142
model.config.length_penalty = 2.0
model.config.no_repeat_ngram_size = 3
model.config.early_stopping = True

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertLMHeadModel were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['bert.encoder.layer.8.crossattention.self.query.bias', 'bert.encoder.layer.10.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.5.crossattention.self.key.bias', 'bert.encoder.layer.2.crossattention.self.value.bias', 'bert.encoder.layer.2.crossattention.self.query.bias', 'bert.encoder.layer.9.crossattention.output.dense.bias', 'bert.encoder.layer.5.crossattention.output.dense.bias', 'bert.encoder.layer.10.crossattention.output.dense.weight', 'bert.encoder.layer.4.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.4.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.3.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.7.crossattention.self.value.bias', 

In [None]:
# NOTE(review): transformers is already imported by an earlier cell, so an
# upgrade here presumably only takes effect after a runtime restart — confirm,
# and consider moving all installs into one cell at the top of the notebook.
! pip install -U accelerate
! pip install -U transformers
def compute_metrics(pred):
    """Compute ROUGE-2 precision, recall and F-measure for generated text.

    Relies on the module-level ``tokenizer`` and ``rouge`` objects.

    Args:
        pred: Evaluation output exposing ``predictions`` (generated token ids,
            or a tuple whose first item holds them) and ``label_ids``
            (reference token ids, with -100 at ignored positions).

    Returns:
        dict with ``rouge2_precision``, ``rouge2_recall`` and
        ``rouge2_fmeasure``, each rounded to 4 decimals.
    """
    pred_ids = pred.predictions
    if isinstance(pred_ids, tuple):
        # Some configurations return extra outputs next to the token ids.
        pred_ids = pred_ids[0]

    pad_id = tokenizer.pad_token_id
    # -100 marks ignored positions and is not a decodable token id. np.where
    # builds new arrays, so the arrays held by `pred` are left unmodified.
    pred_ids = np.asarray(pred_ids)
    pred_ids = np.where(pred_ids != -100, pred_ids, pad_id)
    labels_ids = np.asarray(pred.label_ids)
    labels_ids = np.where(labels_ids != -100, labels_ids, pad_id)

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(predictions=pred_str, references=label_str,
                                 rouge_types=["rouge2"])["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }


# Training configuration for the Seq2SeqTrainer built below.
# NOTE(review): the training output recorded in this notebook reports
# global_step=2994. If that run used this configuration, eval_steps=7500 is
# never reached (so no evaluation runs) and warmup_steps=2000 covers most of
# the training — confirm these values suit the subset being trained on.
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="steps",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    fp16=False,
    output_dir="./",
    logging_steps=1000,
    save_steps=500,
    eval_steps=7500,
    warmup_steps=2000,
    save_total_limit=3,
)

# ROUGE scorer; compute_metrics looks it up by this module-level name.
# NOTE(review): load_metric is reported as deprecated by recent `datasets`
# releases in favour of the `evaluate` package — confirm before upgrading.
rouge = load_metric("rouge")

# Trains on the training subset and evaluates on the validation subset.
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data_subset,
    eval_dataset=val_data_subset,
)



  rouge = load_metric("rouge")


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

## **Training**

In [None]:
# Run fine-tuning; the returned TrainOutput summary is shown as the cell output.
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss




TrainOutput(global_step=2994, training_loss=1.402514947440199, metrics={'train_runtime': 21394.4527, 'train_samples_per_second': 0.14, 'train_steps_per_second': 0.14, 'total_flos': 918333770674176.0, 'train_loss': 1.402514947440199, 'epoch': 3.0})

## **Save the model for later loading and use**

In [None]:
# Pickling the whole model object ties the file to the exact class definitions
# and library versions present at save time. It is kept for anything that
# already loads "model.pth"; in addition, a checkpoint directory (weights,
# config and tokenizer files) is written so the model can be reloaded with
# from_pretrained.
torch.save(model,"model.pth")
model.save_pretrained(model_name)
tokenizer.save_pretrained(model_name)