# Pre-Training/Continuous Training Model pada Dataset PubMed

## Load Data yang Sudah Diekstrak Sebelumnya

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub interactively (renders a login widget in the notebook).
# The PushToHubCallback created in the "Callback" section talks to the Hub, so the
# login presumably has to succeed before training starts.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from datasets import load_from_disk
# Load the previously extracted dataset from the local "data_pubmed" directory.
dataset = load_from_disk("data_pubmed")
# Print the schema and row count as a quick sanity check.
print(dataset)

Dataset({
    features: ['en'],
    num_rows: 1998515
})


In [None]:
# Hold out 10% of the rows as the evaluation split.
# A fixed seed makes the shuffle, and therefore the train/test membership, identical on
# every run, so cached artefacts and reported metrics stay comparable between runs.
dataset = dataset.train_test_split(test_size=0.1, seed=42)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['en'],
        num_rows: 1798663
    })
    test: Dataset({
        features: ['en'],
        num_rows: 199852
    })
})


In [None]:
# Peek at the first training example to see what the raw 'en' text looks like.
dataset['train'][0]

{'en': 'Hypophosphatasia (HPP) is a congenital skeletal disease. Impairment of bone mineralization and seizures are due to a deficiency of tissue-nonspecific alkaline phosphatase (TNAP). Enzyme replacement therapy (ERT) is available as a highly successful treatment for pediatric-onset HPP. However, the potential for prenatal ERT has not been fully investigated to date. In this study, we assessed outcomes and maternal safety using a combinational approach with prenatal and postnatal administration of recombinant TNAP in Akp2  mice as a model of infantile HPP. For the prenatal ERT, we administered subcutaneous injections of recombinant TNAP to pregnant mice from embryonic day 11.5-14.5 until delivery, and then sequentially to Akp2  pups from birth to day 18. For the postnatal ERT, we injected Akp2  pups from birth until day 18. Prenatal ERT did not cause any ectopic mineralization in heterozygous maternal mice. Both prenatal and postnatal ERT preserved growth, survival rate and improved 

# Pre-processing

In [None]:
# Identifier of the pretrained checkpoint; used below to load both the tokenizer and the model.
model_checkpoint = "google-bert/bert-base-uncased"

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)



In [None]:
# NOTE(review): disabled exploratory cell (token-length distribution). Before re-enabling:
#   - if run after the split cell, `dataset` is a DatasetDict, so `for example in dataset`
#     would iterate over the split names; iterate over dataset["train"] instead.
#   - `" ".join(example["en"])` would insert a space between every character if
#     `example["en"]` is a single string (as the printed sample suggests); pass the
#     string to the tokenizer directly.
# import matplotlib.pyplot as plt
# token_lengths = [
#     len(tokenizer(" ".join(example["en"]), truncation=True, max_length=512)["input_ids"])
#     for example in dataset
# ]


# # Summary statistics
# print(f"Min token: {min(token_lengths)}")
# print(f"Max token: {max(token_lengths)}")
# print(f"Mean token: {sum(token_lengths) / len(token_lengths):.2f}")

# # Plot histogram
# plt.figure(figsize=(8, 5))
# plt.hist(token_lengths, bins=20, color='green', edgecolor="black", alpha=0.7)
# plt.xlabel("Jumlah Token")
# plt.ylabel("Frekuensi")
# plt.title("Distribusi Panjang Teks dalam Dataset (Berdasarkan Token BERT)")
# plt.show()

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of texts.

    Reads the "en" column of ``examples`` and hands it to the module-level
    ``tokenizer`` with its default settings (no truncation or padding is
    requested here). Returns whatever the tokenizer call returns.
    """
    texts = examples["en"]
    return tokenizer(texts)

In [None]:
# Tokenize every split in batches. The original columns are dropped so that only the
# tokenizer outputs remain in the resulting dataset.
# (num_proc=4 was tried here and is currently disabled.)
raw_columns = dataset['train'].column_names
tokenized_data = dataset.map(preprocess_function, batched=True, remove_columns=raw_columns)

print(tokenized_data)

Map:   0%|          | 0/1798663 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (517 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/199852 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1798663
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 199852
    })
})


In [None]:
from itertools import chain

# Number of tokens per training row after grouping.
block_size = 128


def group_texts(examples):
    """Concatenate a batch of tokenized sequences and re-cut it into fixed-size blocks.

    Args:
        examples: Mapping from feature name to a list of per-example lists.
            The total length is taken from the first feature and applied to all.

    Returns:
        dict with the same keys, each mapped to a list of chunks of ``block_size``
        items. The trailing remainder of the batch is dropped. If the whole batch
        holds fewer than ``block_size`` items, nothing is dropped and a single
        shorter chunk is returned.
    """
    # chain.from_iterable flattens in linear time; sum(lists, []) re-copies the
    # accumulated list for every example and is quadratic in the batch size.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Round down to a whole number of blocks (only when at least one full block exists).
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    return result

In [None]:
# Re-chunk every split into block_size-token rows. With batched=True, group_texts receives
# a batch of examples at a time, so concatenation and the dropped remainder are per batch.
lm_dataset = tokenized_data.map(group_texts, batched=True)

Map:   0%|          | 0/1798663 [00:00<?, ? examples/s]

Map:   0%|          | 0/199852 [00:00<?, ? examples/s]

In [None]:
# Inspect the split sizes after grouping into fixed-size blocks.
print(lm_dataset)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3690754
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 409695
    })
})


In [None]:
# NOTE(review): persisting is disabled here, but the "Modelling" section reloads
# "data_pubmed_BERT" from disk. Re-enable this line whenever the grouped dataset is
# regenerated, otherwise the reload may pick up a stale copy.
# lm_dataset.save_to_disk("data_pubmed_BERT")

Saving the dataset (0/6 shards):   0%|          | 0/3690754 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/409695 [00:00<?, ? examples/s]

# Modelling

In [None]:
import os

from datasets import load_from_disk

# Directory holding the grouped (block_size-token) dataset.
lm_dataset_dir = "data_pubmed_BERT"

# The save step earlier in the notebook is commented out, so on a fresh run the directory
# may not exist yet. Write the in-memory dataset once, then always read it back from disk.
if not os.path.isdir(lm_dataset_dir):
    lm_dataset.save_to_disk(lm_dataset_dir)

lm_dataset = load_from_disk(lm_dataset_dir)

In [None]:
# Confirm the reloaded dataset has the expected splits and row counts.
print(lm_dataset)

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3690754
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 409695
    })
})


In [None]:
# Look at one grouped row. Index the row first and then pick the column: selecting the
# whole 'input_ids' column first can materialise every row of the split just to read one.
ids = lm_dataset['train'][104212]['input_ids']
print(ids)

[3775, 6779, 1010, 2348, 2023, 3921, 2003, 2025, 2109, 18228, 2012, 2556, 1025, 3572, 1997, 26721, 1011, 21183, 24226, 2331, 2064, 2022, 2641, 1037, 7070, 3120, 1997, 2592, 2008, 3791, 2582, 3086, 1012, 4117, 1010, 1996, 2682, 2093, 3210, 1997, 2470, 2089, 6011, 13318, 1010, 2021, 1996, 2640, 1997, 1037, 7721, 2470, 2622, 2052, 2342, 2000, 2022, 8971, 5362, 1012, 12020, 2005, 9740, 1997, 1996, 14404, 1997, 22935, 15451, 14192, 10708, 2024, 2988, 1012, 2083, 3768, 1997, 3716, 1997, 28102, 3012, 1010, 2012, 2556, 1010, 2069, 3905, 9740, 2089, 2022, 2825, 1998, 6516, 17210, 3086, 1012, 2174, 1010, 2045, 3544, 2000, 2022, 2053, 9347, 2005, 16030, 3653, 1011, 17489, 11326, 5852, 2005, 27480, 2540, 4295, 1999, 2019, 4895, 11246, 22471, 2098, 2313, 1012, 102, 101, 2057, 2556, 3350]


In [None]:
# Check the row length against block_size (128).
len(ids)

128

In [None]:
from transformers import DataCollatorForLanguageModeling

# Collator for masked-language-model batches: mlm_probability=0.15 sets the masking rate,
# and return_tensors="tf" requests TensorFlow tensors to match the TF model used below.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15, return_tensors="tf")

In [None]:
from transformers import AdamWeightDecay

# AdamWeightDecay optimizer with learning rate 2e-5 and weight-decay rate 0.01.
optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)

In [None]:
import tensorflow as tf

# Enable float16 mixed precision globally. This runs before the model is created;
# the dtype policy printed a few cells below shows the model picked it up.
tf.keras.mixed_precision.set_global_policy('mixed_float16')

INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: NVIDIA GeForce RTX 4070, compute capability 8.9


In [None]:
from transformers import TFAutoModelForMaskedLM

# Start from the pretrained masked-LM checkpoint; training below continues from these weights.
model = TFAutoModelForMaskedLM.from_pretrained(model_checkpoint)

All PyTorch model weights were used when initializing TFBertForMaskedLM.

All the weights of TFBertForMaskedLM were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


In [None]:
# Verify which dtype policy the model was built under.
print(model.dtype_policy)

<Policy "mixed_float16">


In [None]:
# Settings shared by the training and evaluation pipelines.
shared_pipeline_kwargs = dict(batch_size=64, collate_fn=data_collator)

# Training batches are shuffled; evaluation batches are not.
tf_train_set = model.prepare_tf_dataset(lm_dataset["train"], shuffle=True, **shared_pipeline_kwargs)
tf_test_set = model.prepare_tf_dataset(lm_dataset["test"], shuffle=False, **shared_pipeline_kwargs)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
# Only the optimizer is supplied; presumably the model falls back to its own
# built-in masked-LM loss when none is given (verify against the transformers docs).
model.compile(optimizer=optimizer)  # No loss argument!

In [None]:
# Print the layer and parameter overview.
model.summary()

Model: "tf_bert_for_masked_lm"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108891648 
                                                                 
 mlm___cls (TFBertMLMHead)   multiple                  24459834  
                                                                 
Total params: 109,514,298
Trainable params: 109,514,298
Non-trainable params: 0
_________________________________________________________________


## Callback

In [None]:
from transformers.keras_callbacks import PushToHubCallback

# Callback that pushes to the Hugging Face Hub during training. output_dir is the local
# working directory (the log below shows the Hub repository being cloned into it).
# The tokenizer is passed in, presumably so it is uploaded alongside the model.
hub_callback = PushToHubCallback(
    output_dir="PubMedAbstract2M-BERT",
    tokenizer=tokenizer,
)

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/Indahgalaputri/PubMedAbstract2M-BERT into local empty directory.


In [None]:
from tensorflow.keras.callbacks import Callback
import os
import pandas as pd

class HistoryCSVAppendCallback(Callback):
    """Keras callback that appends each epoch's logs as one row of a CSV file.

    The header row is written only when the file did not exist at construction
    time, so repeated training runs keep extending the same file.

    Args:
        filepath: Path of the CSV file to create or append to.
    """

    def __init__(self, filepath):
        super().__init__()
        self.filepath = filepath
        # True until the first row has been written to a file that did not exist beforehand.
        self.first_epoch = not os.path.exists(filepath)

    def on_epoch_end(self, epoch, logs=None):
        """Write the metrics in ``logs`` for ``epoch`` to the CSV file.

        Args:
            epoch: Epoch index; used as the row index in the CSV.
            logs: Mapping of metric name to value for this epoch. Nothing is
                written when it is ``None`` or empty.
        """
        # logs defaults to None. An empty frame would emit a column-less header/row
        # and misalign every row appended afterwards, so skip it.
        if not logs:
            return

        # One-row frame indexed by the epoch number.
        epoch_history = pd.DataFrame(logs, index=[epoch])

        # First write creates the file with a header; later writes append rows only.
        epoch_history.to_csv(
            self.filepath,
            mode='w' if self.first_epoch else 'a',
            header=self.first_epoch,
        )
        self.first_epoch = False

# File that accumulates the per-epoch training history.
history_filepath = 'history_preOrcontinous_training.csv'

history_callback = HistoryCSVAppendCallback(filepath=history_filepath)

In [None]:
# Stop training when val_loss has not improved (mode="min") for 2 epochs in a row,
# and restore the best weights seen when stopping.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=2,
    restore_best_weights=True,
    mode="min",
    verbose=1
)

In [None]:
# Callbacks handed to model.fit: early stopping, Hub upload, and CSV history logging.
callback = [early_stopping, hub_callback, history_callback]

In [None]:
# Train for up to 5 epochs, validating on the test split after each epoch.
history = model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=5, callbacks=callback)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
import numpy as np
# Evaluate on the test split. The model was compiled without extra metrics, and the
# result is formatted as a single float below, so this is the scalar evaluation loss.
eval_loss = model.evaluate(tf_test_set)  # Loss per token

# Perplexity is the exponential of the evaluation loss.
# NOTE(review): the collator masks tokens at random, so this value presumably varies
# slightly from one evaluation to the next; confirm before comparing runs.
perplexity = np.exp(eval_loss)
print(f"Perplexity: {perplexity:.4f}")


Perplexity: 2.9644
