In [None]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
import pickle
import math

In [None]:
# Load the training data from a pickle file. The pickle holds a 2-item sequence;
# only the first item is kept here (the second, presumably the test split, is discarded).
# NOTE(security): pickle.load can execute arbitrary code - only load trusted files.
with open('./data/data_for_model.pickle', 'rb') as data_file:
    # The context manager closes the file handle even if unpickling fails.
    train_data_final_exported, _ = pickle.load(data_file)

In [None]:
# Optional: enable memory growth on every visible GPU (TensorFlow stability setting)

import tensorflow as tf

gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    # With no GPU present the loop body never runs and nothing is printed.
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
except RuntimeError as e:
    # Report the configuration error instead of aborting the notebook.
    print(e)

# Domain adaptation of AlephBert

In [None]:
# This section is based on the Hugging Face instructions for domain adaptation of a pretrained model.

## Loading the pretraining data and preparing for domain adaptation

In this section I use a large unlabeled dataset of hospital notes regarding other patients to adapt the pretrained model weights to be closer to the "medical domain", and thus improve its performance when fine-tuning.  
*This, however, does not add additional words to the embedding, which is only possible when training from scratch. 

In [None]:
# Load the pretraining data that was prepared beforehand.
# NOTE(security): pickle.load can execute arbitrary code - only load trusted files.
with open('./data/pretraining_data.pickle', 'rb') as pretraining_file:
    # The context manager closes the file handle even if unpickling fails.
    pretraing_data = pickle.load(pretraining_file)

In [None]:
from datasets import load_dataset,Dataset,Features, Value, ClassLabel
from transformers import AutoTokenizer

In [None]:
# Convert the pickled pretraining data into a Dataset with a single string column, 'text'.
# NOTE: the name is rebound to the Dataset, so this cell is meant to run once per kernel session.
pretraing_data = Dataset.from_pandas(
    pretraing_data,
    features=Features({'text': Value('string')}),
)

In [None]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification,TFAutoModelForMaskedLM
# Hub identifier of the pretrained checkpoint; used below to load both the model and the tokenizer.
model_checkpoint = "onlplab/alephbert-base"

In [None]:
# Load the pretrained checkpoint as a TensorFlow masked-language model (used for domain adaptation).
model = TFAutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [None]:
# Tokenizer matching the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
def tokenize_function(examples):
    """Tokenize the "text" column of a batch for masked-language modelling.

    Args:
        examples: Batch mapping that provides a "text" entry.

    Returns:
        The tokenizer output. When the tokenizer is a fast tokenizer, a
        "word_ids" entry is added holding `word_ids(i)` for each encoded row.
    """
    encoded = tokenizer(examples["text"])
    if not tokenizer.is_fast:
        # word_ids is only requested for fast tokenizers.
        return encoded
    row_count = len(encoded["input_ids"])
    encoded["word_ids"] = list(map(encoded.word_ids, range(row_count)))
    return encoded

In [None]:
# Tokenize the pretraining corpus in batches; the raw "text" column is removed from the result.
tokenized_datasets = pretraing_data.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)

In [None]:
# Display the tokenizer's maximum sequence length (shown as the cell output).
tokenizer.model_max_length


In [None]:
# Number of tokens per language-modelling chunk.
# NOTE: group_texts below does not read this global; it uses its own chunk_size value (also 64).
chunk_size = 64

In [None]:
# Group texts into chunks for language modeling

def group_texts(examples, chunk_size=64):
    """Concatenate a batch of tokenized texts and re-split it into fixed-size chunks.

    Args:
        examples: Mapping from column name to a list of per-example sequences.
            Must contain an "input_ids" column. All columns are sliced using the
            total length of the first column (assumes the columns are aligned).
        chunk_size: Number of items per chunk. Defaults to 64, the value that
            was previously hard-coded in the function body.

    Returns:
        A dict with the same columns, each holding a list of chunks of exactly
        `chunk_size` items, plus a "labels" column that is a (shallow) copy of
        the chunked "input_ids". A trailing remainder shorter than `chunk_size`
        is dropped.

    Raises:
        ValueError: If `chunk_size` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
    # Flatten every column in a single pass. `sum(lists, [])` copies the growing
    # accumulator on every addition, which is quadratic in the batch size.
    concatenated_examples = {
        k: [item for row in examples[k] for item in row] for k in examples.keys()
    }
    # Compute length of concatenated texts (taken from the first column)
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of chunk_size
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column; the data collator later masks input_ids against it
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
# Re-chunk the tokenized corpus with group_texts, batch by batch, using 4 worker processes.
lm_datasets = tokenized_datasets.map(group_texts, batched=True, num_proc=4)
# Last expression: display the resulting dataset summary as the cell output.
lm_datasets

In [None]:
from transformers import DataCollatorForLanguageModeling
# Masked-language-modelling collator: mlm_probability=0.15, returning TensorFlow tensors.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15, return_tensors='tf')

## Domain adaptation

In [None]:
# Downsample the chunked corpus into a train split and a test split one tenth its size.
train_size = 400_000
test_size = int(0.1 * train_size)

# A fixed seed keeps the sampled splits reproducible between runs.
downsampled_dataset = lm_datasets.train_test_split(
    seed=42,
    train_size=train_size,
    test_size=test_size,
)
# Display the split summary as the cell output.
downsampled_dataset

In [None]:
# Training stream for domain adaptation: shuffled, 8 chunks per batch,
# with tokens masked by the MLM data collator.
tf_train_dataset = downsampled_dataset["train"].to_tf_dataset(
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
    columns=["input_ids", "attention_mask", "labels"],
)

# Evaluation stream: fixed order, 16 chunks per batch, same collator.
tf_eval_dataset = downsampled_dataset["test"].to_tf_dataset(
    batch_size=16,
    shuffle=False,
    collate_fn=data_collator,
    columns=["input_ids", "attention_mask", "labels"],
)

In [None]:
import tensorflow as tf
# Shorthand for the Keras callbacks module (not referenced by the cells visible in this notebook).
callbacks = tf.keras.callbacks

In [None]:
from transformers import create_optimizer
from transformers.keras_callbacks import PushToHubCallback
import tensorflow as tf

# One pass over the training batches: model.fit below is called without `epochs`.
num_train_steps = len(tf_train_dataset)
# create_optimizer returns the optimizer and its learning-rate schedule;
# only the optimizer is used in the cells visible here.
optimizer, schedule = create_optimizer(
    init_lr=1e-5,
    num_warmup_steps=10,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
# No loss is passed to compile here.
model.compile(optimizer=optimizer)

# Train in mixed-precision float16 - another optional stability/scalability setting.
# NOTE(review): the policy is set after `model` was loaded and compiled; Keras applies the
# global policy to layers created afterwards - confirm it has any effect on this model.
tf.keras.mixed_precision.set_global_policy("mixed_float16")

In [None]:
import math
# Use perplexity (exp of the evaluation loss) to evaluate the adaptation of the model to our data.
# This first measurement is taken before model.fit, i.e. it is the baseline for the pretrained weights.
eval_loss = model.evaluate(tf_eval_dataset)
print(f"Perplexity: {math.exp(eval_loss):.2f}")

In [None]:
# Domain-adaptation training on the masked-LM batches (no `epochs` argument is given),
# reporting the loss on the evaluation split as validation data.
model.fit(tf_train_dataset,validation_data=tf_eval_dataset)

In [None]:
import math

In [None]:
# Perplexity after domain adaptation; compare with the value printed before model.fit.
eval_loss = model.evaluate(tf_eval_dataset)
print(f"Perplexity: {math.exp(eval_loss):.2f}")

In [None]:
# Save the domain-adapted model; the fine-tuning section loads it from this same path.
# Only the model is written here - the tokenizer is reloaded from model_checkpoint later.
model.save_pretrained('./models/aleph_bert_med')

# Finetuning

In [None]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
# Same checkpoint identifier as in the domain-adaptation section; used below to load the tokenizer.
model_checkpoint = "onlplab/alephbert-base"

In [None]:
# Tokenizer of the original pretrained checkpoint (the adapted model directory holds the model only).
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
from sklearn.model_selection import train_test_split
# Keep only the text and label columns, then hold out 20% of the rows for validation.
# The fixed random_state makes the split reproducible.
train, val = train_test_split(
    train_data_final_exported[['row_text', 'clf']].copy(),
    test_size=0.2,
    random_state=42,
)

In [None]:
from datasets import Features, Value, ClassLabel,load_dataset,Dataset
# Schema for the fine-tuning data: a string text column and a two-class label column.
features_load = Features({'row_text': Value('string'), 'clf': ClassLabel(num_classes=2)})

In [None]:
# Convert both splits to Dataset objects (despite the `_df` suffix these are not DataFrames).
# The index is reset with drop=True before conversion, discarding the shuffled index left by train_test_split.
train_df=Dataset.from_pandas(train.reset_index(drop=True), features=features_load)
val_df=Dataset.from_pandas(val.reset_index(drop=True),features=features_load)

In [None]:
def tokenize_function(example):
    """Tokenize the `row_text` column of a batch for sequence classification.

    Note: this rebinds the `tokenize_function` name used earlier in the
    notebook for masked-language modelling.

    Args:
        example: Batch mapping that provides a 'row_text' entry.

    Returns:
        The tokenizer output for the batch.
    """
    # truncation=True caps each row at the tokenizer's model_max_length, so an
    # over-long note cannot exceed the model's maximum input length during
    # fine-tuning. Shorter rows are unaffected.
    return tokenizer(example['row_text'], truncation=True)

In [None]:
# Tokenize the training split in batches.
tokenized_train = train_df.map(tokenize_function, batched=True)

In [None]:
# Tokenize the validation split in batches.
tokenized_val = val_df.map(tokenize_function, batched=True)

In [None]:
from transformers import DataCollatorWithPadding
# Padding collator for classification batches, returning TensorFlow tensors.
# NOTE: this rebinds `data_collator`, which earlier held the masked-LM collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [None]:
# Training batches for fine-tuning: shuffled, 32 rows per batch, padded by the collator.
tf_train_dataset = tokenized_train.to_tf_dataset(
    columns=["input_ids",'token_type_ids',"attention_mask"],
    label_cols=["clf"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=32,
)

# Validation batches are NOT shuffled, matching the evaluation dataset built for
# domain adaptation. Per the `datasets` documentation, `drop_remainder` defaults to
# the `shuffle` setting, so shuffle=True would also drop the last partial batch and
# score each model on a slightly different subset of the validation rows.
tf_val_dataset = tokenized_val.to_tf_dataset(
    columns=["input_ids",'token_type_ids',"attention_mask"],
    label_cols=["clf"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=32,
)

In [None]:
# Path of the domain-adapted model saved at the end of the domain-adaptation section.
checkpoint = './models/aleph_bert_med'

In [None]:
from transformers import TFAutoModelForSequenceClassification
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import PolynomialDecay
import random
import tensorflow as tf

# Fine-tune the domain-adapted checkpoint from several differently seeded starts and keep
# the model with the lowest validation loss in './models/aleph_bert_finetuned'.
batch_size = 32   # batch size already baked into tf_train_dataset / tf_val_dataset
num_epochs = 5
num_restarts = 50

tf.keras.mixed_precision.set_global_policy('mixed_float16')

# Loop-invariant: total optimizer steps per restart, used as the decay horizon.
num_train_steps = len(tf_train_dataset) * num_epochs

min_loss = math.inf
for i in tqdm(range(num_restarts)):
    # Release graph/kernel state left by the previous restart so memory does not
    # accumulate over the restarts.
    tf.keras.backend.clear_session()

    # Seed every RNG that may be involved. random.seed alone does not touch
    # TensorFlow or NumPy, so the restarts were not reproducible per seed.
    random.seed(i)
    np.random.seed(i)
    tf.random.set_seed(i)

    model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
    # Linear decay (PolynomialDecay default power) from 1e-5 to 0 over the whole run.
    lr_scheduler = PolynomialDecay(
        initial_learning_rate=1e-5, end_learning_rate=0.0, decay_steps=num_train_steps
    )
    opt = Adam(learning_rate=lr_scheduler)
    # The model outputs logits, hence from_logits=True.
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    model.compile(optimizer=opt, loss=loss)
    # The inputs are already-batched tf.data datasets: Keras documents that `batch_size`
    # must not be given and `shuffle` is ignored for such inputs, so neither is passed.
    model.fit(tf_train_dataset, validation_data=tf_val_dataset,
              epochs=num_epochs, verbose=0)
    new_loss = model.evaluate(tf_val_dataset)
    if new_loss < min_loss:
        # Best restart so far: overwrite the saved fine-tuned model.
        model.save_pretrained('./models/aleph_bert_finetuned')
        min_loss = new_loss