# Task 5
# BERT Modules

## 5.1 Base BERT

Pre-training followed by fine-tuning is a kind of transfer learning: knowledge learned on one task is applied to these SDMH classification tasks.

### 5.1.1 Import Packages & Setup
First, I import PyTorch together with the Hugging Face `transformers` and `datasets` libraries, which are used to build and fine-tune a Hugging Face model.

In [2]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from transformers import BertTokenizerFast
from transformers import BertPreTrainedModel
from transformers import BertModel
from transformers import AutoTokenizer
from transformers import AutoModel

from transformers import TrainingArguments, Trainer
from datasets import Dataset
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers import AutoConfig


# Read the train / validation / test splits in again
data_train, data_val, data_test = (
    pd.read_csv(csv_path)
    for csv_path in ('data_train.csv', 'data_val.csv', 'data_test.csv')
)

# For quick build runs: uncomment to work on a small random subsample of each split
# data_train = data_train.sample(frac=0.003, random_state=42).reset_index(drop=True)
# data_val = data_val.sample(frac=0.05, random_state=42).reset_index(drop=True)
# data_test = data_test.sample(frac=0.01, random_state=42).reset_index(drop=True)

# Label columns, one per classification task; later cells index tasks in this order
full_cols = [
    'sdoh_community_present',
    'sdoh_community_absent',
    'sdoh_education',
    'sdoh_economics',
    'sdoh_environment',
    'behavior_alcohol',
    'behavior_tobacco',
    'behavior_drug',
]

#### Convert to Hugging Face Dataset
Converting the data to a Hugging Face `Dataset` makes it easier to handle with the Hugging Face training tools.

In [3]:
# Wrap each pandas DataFrame in a Hugging Face Dataset
train_ds, val_ds, test_ds = (
    Dataset.from_pandas(frame) for frame in (data_train, data_val, data_test)
)

#### Preprocess the Dataset

##### Tokenise

Start by loading the pretrained BERT Base uncased tokeniser from Hugging Face.

In [4]:
# Checkpoint used for the first model; the tokenizer is loaded from the same name
modelname = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(modelname)

##### Have a try on some sentences

In [5]:
# Encode one sample sentence into token ids and show the result
encoding = tokenizer.encode(
    'the patient saw a quick brown fox jumped over the lazy dog'
)
print(encoding)

[101, 1996, 5776, 2387, 1037, 4248, 2829, 4419, 5598, 2058, 1996, 13971, 3899, 102]


##### Define two simple functions that take a batch of text and combined labels as input

The tokenisation function takes a batch of records, picks out the `social_history` text, and returns its tokenised form. The second function combines the eight label columns into a single `labels` list for each example.

In [6]:
# Tokenisation function
def tokenize(batch):
    """Tokenise the 'social_history' text of a batch.

    Every sequence is truncated or padded to exactly 128 tokens.
    Returns whatever the module-level `tokenizer` returns for the batch.
    """
    texts = batch['social_history']
    return tokenizer(
        texts,
        truncation=True,
        padding='max_length',
        max_length=128,
    )
    
# Tokenise all three splits batch-wise
train_ds, val_ds, test_ds = (
    split.map(tokenize, batched=True) for split in (train_ds, val_ds, test_ds)
)

# Combine labels function
def combine(batch):
    """Add a 'labels' entry holding, per example, the values of all `full_cols`.

    The batch is modified in place and returned. Each inner list follows the
    order of `full_cols`.
    """
    n_rows = len(batch[full_cols[0]])
    batch['labels'] = [
        [batch[name][idx] for name in full_cols]
        for idx in range(n_rows)
    ]
    return batch

# Build the combined 'labels' column for every split
train_ds, val_ds, test_ds = (
    split.map(combine, batched=True) for split in (train_ds, val_ds, test_ds)
)

# Keep only what the model consumes, so the original 8 label columns (and any
# other extra columns) are not passed to it.
# Columns are removed rather than selected.
use_cols = ['input_ids', 'attention_mask', 'labels']
train_ds, val_ds, test_ds = (
    split.remove_columns(
        [name for name in split.column_names if name not in use_cols]
    )
    for split in (train_ds, val_ds, test_ds)
)

Map:   0%|          | 0/4917 [00:00<?, ? examples/s]

Map:   0%|          | 0/1055 [00:00<?, ? examples/s]

Map:   0%|          | 0/1053 [00:00<?, ? examples/s]

Map:   0%|          | 0/4917 [00:00<?, ? examples/s]

Map:   0%|          | 0/1055 [00:00<?, ? examples/s]

Map:   0%|          | 0/1053 [00:00<?, ? examples/s]

### 5.1.2 Create the Standard BERT Module

Similarly, I build a model that shares a single encoder and uses a separate classification head (nn.Linear) for each task
- Merges logits from all tasks into a unified tensor with padding for alignment
- Supports joint loss computation across tasks using CrossEntropyLoss

In [7]:
# Number of classes for each task, in the same order as full_cols
labelclass_list=[2,2,2,3,3,5,5,5]


# Subclass BertPreTrainedModel so that from_pretrained() can initialise the model
class MultiTaskBertModel(BertPreTrainedModel):
    """Shared BERT encoder with one linear classification head per task.

    The pooled encoder output is fed to every head. Because the tasks have
    different numbers of classes, the per-task logits are packed into a single
    zero-padded tensor of shape (batch_size, num_tasks, max_labels), so that the
    Trainer receives logits with a consistent shape across tasks.

    Create instances with ``MultiTaskBertModel.from_pretrained(...)``: the
    encoder weights then come from the checkpoint, while the classification
    heads are newly initialised.
    """

    def __init__(self, config):
        super().__init__(config)
        self.labelclass_list = list(labelclass_list)
        # Derive the sizes from the label spec instead of hard-coding 8 tasks / 5 classes
        self.num_tasks = len(self.labelclass_list)
        self.max_labels = max(self.labelclass_list)

        # Build the encoder from the config only. from_pretrained() fills in the
        # checkpoint weights afterwards, so the checkpoint is not loaded a second
        # time from inside __init__.
        self.bert = BertModel(config)

        # One classification head per task, registered through nn.ModuleList
        self.classifiers = nn.ModuleList(
            [nn.Linear(config.hidden_size, num_labels) for num_labels in self.labelclass_list]
        )
        self.post_init()

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Run the encoder and every head; add the joint loss when labels are given.

        Args:
            input_ids: token ids passed to the encoder.
            attention_mask: attention mask passed to the encoder.
            labels: optional tensor indexed as ``labels[:, task]`` that holds the
                class index of each task. When omitted, no loss is computed, so
                the model can also be used for prediction.

        Returns:
            ``(loss, logits)`` when ``labels`` is given, otherwise ``(logits,)``.
            ``loss`` is the sum of the per-task cross-entropy losses and
            ``logits`` is the zero-padded (batch_size, num_tasks, max_labels) tensor.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        logits_list = [classifier(pooled_output) for classifier in self.classifiers]

        # Pack the per-task logits into one tensor; unused positions stay 0
        logits = torch.zeros(
            pooled_output.size(0), self.num_tasks, self.max_labels,
            device=pooled_output.device,
        )
        for i, logit_task in enumerate(logits_list):
            logits[:, i, :logit_task.size(-1)] = logit_task

        if labels is None:
            return (logits,)

        # Joint training: sum the cross-entropy of every task, using only each
        # task's real (unpadded) logits. Targets are cast to int64 class indices.
        loss_fct = nn.CrossEntropyLoss()
        loss = sum(
            loss_fct(logit_task, labels[:, i].long())
            for i, logit_task in enumerate(logits_list)
        )
        return (loss, logits)

# Load the configuration and the multi-task model from the checkpoint named in `modelname`
config = AutoConfig.from_pretrained(modelname)
model = MultiTaskBertModel.from_pretrained(modelname, config=config)

Some weights of MultiTaskBertModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifiers.0.bias', 'classifiers.0.weight', 'classifiers.1.bias', 'classifiers.1.weight', 'classifiers.2.bias', 'classifiers.2.weight', 'classifiers.3.bias', 'classifiers.3.weight', 'classifiers.4.bias', 'classifiers.4.weight', 'classifiers.5.bias', 'classifiers.5.weight', 'classifiers.6.bias', 'classifiers.6.weight', 'classifiers.7.bias', 'classifiers.7.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### 5.1.3 Set Up Evaluation

As with the other basic models above, macro- and weighted-averaged scores are used to evaluate the model's performance.

In [8]:
def compute_metrics(eval_pred):
    """Compute per-task classification scores and average them over all tasks.

    Args:
        eval_pred: pair ``(logits, labels)``. ``logits`` is indexed as
            ``logits[:, task, :num_classes]`` (padded positions are ignored) and
            ``labels`` as ``labels[:, task]``.

    Returns:
        dict with the mean over tasks of accuracy ('macro_acc') and of macro /
        weighted precision, recall and F1, followed by one
        '<task>_macro_f1' entry per task.
    """
    logits, labels = eval_pred

    acclist = []
    precision_macro, recall_macro, f1_macro = [], [], []
    precision_weighted, recall_weighted, f1_weighted = [], [], []
    task_f1_scores = {}

    # Iterate over the task definitions instead of a hard-coded range(8)
    for i, (task_name, num_classes) in enumerate(zip(full_cols, labelclass_list)):
        # Only the first num_classes logits belong to this task; the rest is padding
        preds_i = np.argmax(logits[:, i, :num_classes], axis=1)
        y_true = labels[:, i]

        acclist.append(accuracy_score(y_true, preds_i))

        # zero_division=0 yields the same values as the default ('warn') but
        # without a warning whenever a class is never predicted.
        # macro
        precision_macro.append(
            precision_score(y_true, preds_i, average='macro', zero_division=0))
        recall_macro.append(
            recall_score(y_true, preds_i, average='macro', zero_division=0))
        f1 = f1_score(y_true, preds_i, average='macro', zero_division=0)
        f1_macro.append(f1)

        # It is not convenient to compute the per-task macro F1 on the test set
        # separately, so it is reported at every evaluation.
        task_f1_scores[task_name + "_macro_f1"] = f1

        # weighted
        precision_weighted.append(
            precision_score(y_true, preds_i, average='weighted', zero_division=0))
        recall_weighted.append(
            recall_score(y_true, preds_i, average='weighted', zero_division=0))
        f1_weighted.append(
            f1_score(y_true, preds_i, average='weighted', zero_division=0))

    metrics = {
        'macro_acc': np.mean(acclist),

        'macro_precision': np.mean(precision_macro),
        'macro_recall': np.mean(recall_macro),
        'macro_f1': np.mean(f1_macro),

        'weighted_precision': np.mean(precision_weighted),
        'weighted_recall': np.mean(recall_weighted),
        'weighted_f1': np.mean(f1_weighted),
    }

    metrics.update(task_f1_scores)

    return metrics

### 5.1.4 Set up Training

In [9]:
# Collect the training options in a separate TrainingArguments object
training_args = TrainingArguments(
    output_dir='./Final_Project',
    push_to_hub=False,
    # report_to="tensorboard",

    # Optimisation settings
    num_train_epochs=15,  # 15 epochs to reach a relatively stable result
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,

    # Log every 50 steps
    logging_strategy='steps',
    logging_steps=50,

    # Evaluate after each epoch and save checkpoints, which makes it easy to
    # resume training or add epochs
    eval_strategy='epoch',
    save_strategy='epoch',

    # Automatically reload the model with the best validation macro_f1 at the end
    load_best_model_at_end=True,
    metric_for_best_model='macro_f1',
    greater_is_better=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
    processing_class=tokenizer,
)

### 5.1.5 Train the Module & Test

In [45]:
# Fine-tune the model, then evaluate it on the test split
trainer.train()

test_metrics = trainer.evaluate(eval_dataset=test_ds)
print("Best model Performance on test dataset: ", test_metrics, sep="\n")

Epoch,Training Loss,Validation Loss,Macro Acc,Macro Precision,Macro Recall,Macro F1,Weighted Precision,Weighted Recall,Weighted F1,Sdoh Community Present Macro F1,Sdoh Community Absent Macro F1,Sdoh Education Macro F1,Sdoh Economics Macro F1,Sdoh Environment Macro F1,Behavior Alcohol Macro F1,Behavior Tobacco Macro F1,Behavior Drug Macro F1
1,3.9104,3.701662,0.848934,0.655736,0.578165,0.566819,0.828689,0.848934,0.823307,0.910095,0.557343,0.4923,0.550012,0.607533,0.475393,0.562691,0.379181
2,2.5146,2.322437,0.916825,0.806499,0.718903,0.74023,0.910222,0.916825,0.90967,0.973265,0.87627,0.647328,0.838419,0.632662,0.670736,0.778025,0.505133
3,1.6644,1.758617,0.936611,0.852275,0.786118,0.804176,0.933339,0.936611,0.932459,0.979435,0.94493,0.828956,0.892471,0.639514,0.770375,0.826875,0.550853
4,1.3284,1.532853,0.942417,0.887173,0.815248,0.832272,0.940539,0.942417,0.940114,0.977352,0.950455,0.864768,0.886598,0.704612,0.821946,0.852799,0.599649
5,0.9533,1.396813,0.947986,0.897378,0.838233,0.854001,0.946808,0.947986,0.946342,0.981575,0.946323,0.903317,0.890066,0.757534,0.842351,0.877696,0.633145
6,0.7826,1.30524,0.950474,0.89788,0.84277,0.856179,0.949265,0.950474,0.94902,0.97535,0.949719,0.89365,0.902314,0.758244,0.846654,0.887983,0.635515
7,0.561,1.296835,0.950948,0.893783,0.846942,0.860911,0.949855,0.950948,0.949735,0.977455,0.949719,0.877288,0.893195,0.801522,0.862729,0.886555,0.638828
8,0.4986,1.26639,0.954028,0.913661,0.858461,0.875488,0.953555,0.954028,0.953008,0.979482,0.956741,0.904098,0.912758,0.798439,0.875257,0.881556,0.695573
9,0.361,1.247788,0.955687,0.910708,0.874528,0.886149,0.955211,0.955687,0.954997,0.978419,0.952641,0.906155,0.903038,0.839612,0.870337,0.897282,0.741712
10,0.3202,1.236513,0.954621,0.910801,0.881029,0.892894,0.954464,0.954621,0.954207,0.975378,0.952641,0.89365,0.905183,0.898923,0.865713,0.897208,0.754459




Best model Performance on test dataset: 
{'eval_loss': 1.2566423416137695, 'eval_macro_acc': 0.9586894586894587, 'eval_macro_precision': 0.9098005480315166, 'eval_macro_recall': 0.8891243667621189, 'eval_macro_f1': 0.8976103070831851, 'eval_weighted_precision': 0.9584484113093603, 'eval_weighted_recall': 0.9586894586894587, 'eval_weighted_f1': 0.9583890595215988, 'eval_sdoh_community_present_macro_f1': 0.9824086949258307, 'eval_sdoh_community_absent_macro_f1': 0.941429498470381, 'eval_sdoh_education_macro_f1': 0.8968152866242038, 'eval_sdoh_economics_macro_f1': 0.9124918716761746, 'eval_sdoh_environment_macro_f1': 0.9162276768414875, 'eval_behavior_alcohol_macro_f1': 0.8556693389993264, 'eval_behavior_tobacco_macro_f1': 0.9245133722814941, 'eval_behavior_drug_macro_f1': 0.7513267168465827, 'eval_runtime': 54.9432, 'eval_samples_per_second': 19.165, 'eval_steps_per_second': 1.201, 'epoch': 15.0}


### 5.2 Bio-Clinical BERT Model

In [11]:
# Model 2: Bio-ClinicalBERT; its tokenizer replaces the one used for model 1
modelname2 = 'emilyalsentzer/Bio_ClinicalBERT'
tokenizer = AutoTokenizer.from_pretrained(modelname2)

##### Convert these three datasets to Hugging Face Dataset again

In [12]:
# Start again from the raw DataFrames so the text is re-tokenised with the new tokenizer
train_ds, val_ds, test_ds = [
    Dataset.from_pandas(frame) for frame in (data_train, data_val, data_test)
]

### 5.2.1 Preprocess

In [13]:
# Same as above: two functions that tokenise a batch of text and combine its labels
def tokenize(batch):
    """Tokenise the 'social_history' text of a batch with the current `tokenizer`.

    Sequences are truncated or padded to exactly 128 tokens.
    """
    tokenizer_options = dict(padding='max_length', truncation=True, max_length=128)
    return tokenizer(batch['social_history'], **tokenizer_options)
    
# Tokenise every split batch-wise
train_ds, val_ds, test_ds = [
    split.map(tokenize, batched=True) for split in (train_ds, val_ds, test_ds)
]

def combine(batch):
    """Store, for each example of the batch, its `full_cols` values under 'labels'.

    The batch is updated in place and returned. The inner lists follow the
    order of `full_cols`.
    """
    example_count = len(batch[full_cols[0]])
    batch['labels'] = [
        [batch[column][row] for column in full_cols]
        for row in range(example_count)
    ]
    return batch

# Add the combined 'labels' column to every split
train_ds, val_ds, test_ds = [
    split.map(combine, batched=True) for split in (train_ds, val_ds, test_ds)
]

# Drop everything except the model inputs, so the original label columns are
# not passed to the model
use_cols = ['input_ids', 'attention_mask', 'labels']
train_ds, val_ds, test_ds = [
    split.remove_columns(
        [name for name in split.column_names if name not in use_cols]
    )
    for split in (train_ds, val_ds, test_ds)
]

Map:   0%|          | 0/4917 [00:00<?, ? examples/s]

Map:   0%|          | 0/1055 [00:00<?, ? examples/s]

Map:   0%|          | 0/1053 [00:00<?, ? examples/s]

Map:   0%|          | 0/4917 [00:00<?, ? examples/s]

Map:   0%|          | 0/1055 [00:00<?, ? examples/s]

Map:   0%|          | 0/1053 [00:00<?, ? examples/s]

### 5.2.2 Create the BioClinicalBERT Module

In [14]:
class MultiTaskBertModel2(BertPreTrainedModel):
    """Multi-task classifier: a shared BERT encoder plus one linear head per task.

    Intended to be created with ``MultiTaskBertModel2.from_pretrained(...)``
    (e.g. with the Bio+Clinical BERT checkpoint), which loads the encoder
    weights from that checkpoint; the classification heads are newly
    initialised.

    The per-task logits are packed into one zero-padded tensor of shape
    (batch_size, num_tasks, max_labels).
    """

    def __init__(self, config):
        super().__init__(config)
        self.labelclass_list = list(labelclass_list)
        # Derive the sizes from the label spec instead of hard-coding 8 tasks / 5 classes
        self.num_tasks = len(self.labelclass_list)
        self.max_labels = max(self.labelclass_list)

        # Build the encoder from the config only. from_pretrained() loads the
        # checkpoint weights into it afterwards, so the checkpoint name does not
        # have to be hard-coded here and the weights are not loaded twice.
        self.bert = BertModel(config)

        # One classification head per task
        self.classifiers = nn.ModuleList(
            [nn.Linear(config.hidden_size, num_labels) for num_labels in self.labelclass_list]
        )
        self.post_init()

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Run the encoder and every head; add the joint loss when labels are given.

        Args:
            input_ids: token ids passed to the encoder.
            attention_mask: attention mask passed to the encoder.
            labels: optional tensor indexed as ``labels[:, task]`` that holds the
                class index of each task. When omitted, no loss is computed.

        Returns:
            ``(loss, logits)`` when ``labels`` is given, otherwise ``(logits,)``.
            ``loss`` is the sum of the per-task cross-entropy losses and
            ``logits`` is the zero-padded (batch_size, num_tasks, max_labels) tensor.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        logits_list = [classifier(pooled_output) for classifier in self.classifiers]

        # Initialise the logits with zeros, shape: (batch_size, num_tasks, max_labels)
        logits = torch.zeros(
            pooled_output.size(0), self.num_tasks, self.max_labels,
            device=pooled_output.device,
        )
        for i, logit_task in enumerate(logits_list):
            logits[:, i, :logit_task.size(-1)] = logit_task

        if labels is None:
            return (logits,)

        # Sum the cross-entropy of every task, using only each task's real
        # (unpadded) logits. Targets are cast to int64 class indices.
        loss_fct = nn.CrossEntropyLoss()
        loss = sum(
            loss_fct(logit_task, labels[:, i].long())
            for i, logit_task in enumerate(logits_list)
        )
        return (loss, logits)

# Load the configuration and the multi-task model from the checkpoint named in `modelname2`
config = AutoConfig.from_pretrained(modelname2)
model = MultiTaskBertModel2.from_pretrained(
    modelname2,
    config=config,
)

Some weights of MultiTaskBertModel2 were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifiers.0.bias', 'classifiers.0.weight', 'classifiers.1.bias', 'classifiers.1.weight', 'classifiers.2.bias', 'classifiers.2.weight', 'classifiers.3.bias', 'classifiers.3.weight', 'classifiers.4.bias', 'classifiers.4.weight', 'classifiers.5.bias', 'classifiers.5.weight', 'classifiers.6.bias', 'classifiers.6.weight', 'classifiers.7.bias', 'classifiers.7.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### 5.2.3 Set up training

No optimiser is configured explicitly, so the Trainer's default optimiser (AdamW, i.e. Adam with decoupled weight decay) is used to optimise the multi-task classification.

In [15]:
# Collect the training options in a separate TrainingArguments object
training_args = TrainingArguments(
    output_dir='./Final_Project2',
    push_to_hub=False,
    # report_to="tensorboard",

    # Optimisation settings
    num_train_epochs=15,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,

    # Log every 50 steps
    logging_strategy='steps',
    logging_steps=50,

    # Evaluate after each epoch and save checkpoints, which makes it easy to
    # resume training or add epochs
    eval_strategy='epoch',
    save_strategy='epoch',

    # Automatically reload the model with the best validation macro_f1 at the end
    load_best_model_at_end=True,
    metric_for_best_model='macro_f1',
    greater_is_better=True,
)

trainer2 = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

#### Train & Test

In [12]:
# Fine-tune model 2, then evaluate it on the test split
trainer2.train()

test_metrics = trainer2.evaluate(eval_dataset=test_ds)
print("Performance on test dataset: ", test_metrics, sep="\n")



Epoch,Training Loss,Validation Loss,Macro Acc,Macro Precision,Macro Recall,Macro F1,Weighted Precision,Weighted Recall,Weighted F1,Sdoh Community Present Macro F1,Sdoh Community Absent Macro F1,Sdoh Education Macro F1,Sdoh Economics Macro F1,Sdoh Environment Macro F1,Behavior Alcohol Macro F1,Behavior Tobacco Macro F1,Behavior Drug Macro F1
1,3.7338,3.480148,0.863863,0.583767,0.588099,0.57662,0.822185,0.863863,0.838668,0.885264,0.470647,0.4923,0.695792,0.613403,0.477653,0.603042,0.374857
2,2.5242,2.347675,0.913744,0.830754,0.682979,0.706172,0.909429,0.913744,0.90127,0.968105,0.716197,0.651621,0.846188,0.642113,0.599974,0.757656,0.467522
3,1.6775,1.763966,0.934953,0.853512,0.760982,0.787933,0.93051,0.934953,0.929175,0.970094,0.911619,0.777764,0.896979,0.643527,0.684037,0.846764,0.572679
4,1.3887,1.480179,0.945142,0.860823,0.812089,0.830907,0.942533,0.945142,0.942727,0.981533,0.936312,0.841318,0.891117,0.643623,0.806188,0.868439,0.678723
5,0.9425,1.373281,0.949526,0.88233,0.84558,0.858694,0.948452,0.949526,0.948355,0.982509,0.944517,0.883679,0.906433,0.742974,0.831188,0.869776,0.708479
6,0.8035,1.277832,0.952725,0.912232,0.838079,0.855107,0.951953,0.952725,0.951129,0.977299,0.947128,0.846737,0.909935,0.709488,0.860093,0.882454,0.707727
7,0.5821,1.227464,0.95391,0.907465,0.853427,0.867909,0.953077,0.95391,0.952794,0.98253,0.949342,0.89365,0.904754,0.759702,0.854005,0.879714,0.719575
8,0.5341,1.235442,0.953791,0.916702,0.846153,0.86254,0.953465,0.953791,0.952585,0.977352,0.95868,0.889936,0.900669,0.70668,0.865426,0.887577,0.714001
9,0.3886,1.22531,0.956398,0.924391,0.865781,0.885331,0.955921,0.956398,0.955541,0.976253,0.954166,0.911356,0.912837,0.802855,0.86352,0.896223,0.765433
10,0.3487,1.206875,0.956872,0.919542,0.865574,0.883303,0.956306,0.956872,0.956113,0.979458,0.956741,0.893628,0.908801,0.802212,0.861524,0.899059,0.765003




Performance on test dataset: 
{'eval_loss': 1.252211570739746, 'eval_macro_acc': 0.9560778727445394, 'eval_macro_precision': 0.9073933692731901, 'eval_macro_recall': 0.8607660058002333, 'eval_macro_f1': 0.8727981196506881, 'eval_weighted_precision': 0.9554850126387042, 'eval_weighted_recall': 0.9560778727445394, 'eval_weighted_f1': 0.955216154636358, 'eval_sdoh_community_present_macro_f1': 0.9698462243349274, 'eval_sdoh_community_absent_macro_f1': 0.9267868730879762, 'eval_sdoh_education_macro_f1': 0.9033117042115573, 'eval_sdoh_economics_macro_f1': 0.9144219159207383, 'eval_sdoh_environment_macro_f1': 0.7741762648353204, 'eval_behavior_alcohol_macro_f1': 0.870937793832842, 'eval_behavior_tobacco_macro_f1': 0.9096333577625522, 'eval_behavior_drug_macro_f1': 0.713270823219591, 'eval_runtime': 73.8317, 'eval_samples_per_second': 14.262, 'eval_steps_per_second': 0.894, 'epoch': 15.0}


# Project Codes END, Thanks Very Much!