In [3]:
import json
import pandas as pd

# NOTE(review): machine-specific absolute path; point this at your own copy of
# ori_pqal.json before running the notebook elsewhere.
PQAL_PATH = "/Users/casey/Documents/GitHub/LLM_Healthcare/ori_pqal.json"

# Read the file as UTF-8 explicitly so decoding does not depend on the
# platform's locale default encoding.
with open(PQAL_PATH, encoding="utf-8") as f:
    data = json.load(f)

# Source field -> column name used in the rest of the notebook.
column_names = {
    'QUESTION': 'question',
    'CONTEXTS': 'context',
    'LONG_ANSWER': 'long_answer',
    'final_decision': 'final_answer'  # Assuming 'final_decision' corresponds to 'final_answer'
}

# orient='index' turns every top-level key of the JSON object into one row;
# only the four renamed columns are kept.
df = (
    pd.DataFrame.from_dict(data, orient='index')
    .rename(columns=column_names)
    .loc[:, list(column_names.values())]
)
df.head()

Unnamed: 0,question,context,long_answer,final_answer
21645374,Do mitochondria play a role in remodelling lac...,[Programmed cell death (PCD) is the regulated ...,Results depicted mitochondrial dynamics in viv...,yes
16418930,Landolt C and snellen e acuity: differences in...,[Assessment of visual acuity depends on the op...,"Using the charts described, there was only a s...",no
9488747,"Syncope during bathing in infants, a pediatric...",[Apparent life-threatening events in infants a...,"""Aquagenic maladies"" could be a pediatric form...",yes
17208539,Are the long-term results of the transanal pul...,[The transanal endorectal pull-through (TERPT)...,Our long-term study showed significantly bette...,no
10808977,Can tailored interventions increase mammograph...,[Telephone counseling and tailored print commu...,The effects of the intervention were most pron...,yes


### Clean & Normalize Biomedical Text

We will handle things like:
- Lowercasing (if using uncased model)
- Removing excessive whitespace
- Optional: abbreviation expansion (with a domain dictionary)

In [6]:
import re

def clean_text(text):
    """Lowercase and whitespace-normalize text.

    Strings are lowercased, stripped of leading/trailing whitespace, and have
    every run of whitespace collapsed to a single space. Lists and tuples are
    cleaned element by element and returned as a list, so a cell holding
    several passages is normalized instead of being skipped. Any other value
    (e.g. None or a number) is returned unchanged.
    """
    if isinstance(text, str):
        return re.sub(r'\s+', ' ', text.lower().strip())
    if isinstance(text, (list, tuple)):
        # Recurse so each passage gets the same treatment as a plain string.
        return [clean_text(item) for item in text]
    return text  # Not text-like: hand the original value back untouched

# Run clean_text over each free-text column and write the results back into df.
text_columns = ('question', 'context', 'long_answer')
cleaned_columns = {name: df[name].apply(clean_text) for name in text_columns}
for name, cleaned in cleaned_columns.items():
    df[name] = cleaned

### Convert Labels to Integers

We’ll map yes / no / maybe -> 0 / 1 / 2 for classification.

In [7]:
# Integer class id for each decision string.
label_map = {'yes': 0, 'no': 1, 'maybe': 2}
df['label'] = df['final_answer'].map(label_map)

# Series.map yields NaN for any value missing from label_map, which would also
# silently turn the label column into floats. Fail loudly instead.
unmapped = df.loc[df['label'].isna(), 'final_answer'].unique()
if len(unmapped) > 0:
    raise ValueError(f"final_answer values missing from label_map: {unmapped.tolist()}")
df['label'] = df['label'].astype('int64')

### Tokenize with PubMedBERT / BioBERT

Using Hugging Face’s transformers to tokenize the input pairs.

In [20]:
from transformers import AutoTokenizer

# Checkpoint identifier; the tokenizer below is loaded from it.
MODEL_NAME = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=MODEL_NAME)

def tokenize_function(example):
    """Tokenize one (question, context) pair with the notebook's tokenizer.

    Args:
        example: mapping with 'question' and 'context' entries. The context
            may be a single string or a list/tuple of passages.

    Returns:
        Whatever the global ``tokenizer`` returns for the pair, requested with
        padding to max_length, truncation enabled, and max_length=512.
    """
    context = example['context']
    if isinstance(context, (list, tuple)):
        # str() on a list would tokenize its Python repr (brackets, quotes,
        # commas); join the passages into one plain-text string instead.
        context = ' '.join(str(passage) for passage in context)
    return tokenizer(
        str(example['question']),
        str(context),
        padding='max_length',
        truncation=True,
        max_length=512
    )

In [24]:
# Create the dataset and tokenize it one example at a time (no batching).
# Dataset is imported in this cell because it is first used here; without it
# a fresh-kernel run fails with NameError before the later import cell runs.
from datasets import Dataset

dataset = Dataset.from_pandas(df[['question', 'context', 'label']])
tokenized_dataset = dataset.map(tokenize_function, batched=False)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

### Prepare Hugging Face Dataset

Convert the cleaned DataFrame into the datasets format and tokenize it.

In [25]:
#Batched tokenization (faster)
def tokenize_batch_function(examples):
    """Tokenize a batch of (question, context) pairs with the notebook's tokenizer.

    Args:
        examples: mapping whose 'question' and 'context' entries are
            equal-length sequences, one item per example. Each context may be
            a single string or a list/tuple of passages.

    Returns:
        Whatever the global ``tokenizer`` returns for the two lists, requested
        with padding to max_length, truncation enabled, and max_length=512.
    """
    questions = [str(q) for q in examples['question']]
    # str() on a list of passages would tokenize its Python repr (brackets,
    # quotes, commas); join the passages into one plain-text string instead.
    contexts = [
        ' '.join(str(passage) for passage in c) if isinstance(c, (list, tuple)) else str(c)
        for c in examples['context']
    ]

    return tokenizer(
        questions,
        contexts,
        padding='max_length',
        truncation=True,
        max_length=512
    )

# Rebuild the Dataset from the model-input columns and tokenize it batch-wise.
input_columns = ['question', 'context', 'label']
dataset = Dataset.from_pandas(df[input_columns])
tokenized_dataset_batched = dataset.map(function=tokenize_batch_function, batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [28]:
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Bring the per-example tokenized Dataset back into pandas.
tokenized_df = tokenized_dataset.to_pandas()

# Hold out 20% of the rows; the remainder is the training portion.
split_parts = train_test_split(tokenized_df, random_state=42, test_size=0.2)
train_df, temp_df = split_parts

In [None]:
# 80/10/10 split: hold out 20% of df, then halve the hold-out into
# validation and test sets.
train_df, temp_df = train_test_split(df, random_state=42, test_size=0.2)
val_df, test_df = train_test_split(temp_df, random_state=42, test_size=0.5)

# Resetting indices so every split starts from a clean 0..n-1 index.
train_df, val_df, test_df = (
    part.reset_index(drop=True) for part in (train_df, val_df, test_df)
)

In [32]:
# One Dataset per split, built from the matching DataFrame.
train_dataset, val_dataset, test_dataset = map(
    Dataset.from_pandas, (train_df, val_df, test_df)
)

In [34]:
# Batched tokenization of the train, validation, and test splits, in that order.
train_tokenized, val_tokenized, test_tokenized = (
    split.map(tokenize_batch_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [36]:
from datasets import DatasetDict

# Bundle the tokenized splits into a single DatasetDict keyed by split name.
split_names = ('train', 'validation', 'test')
split_data = (train_tokenized, val_tokenized, test_tokenized)
dataset_dict = DatasetDict(dict(zip(split_names, split_data)))

In [37]:
# Defining metrics function for evaluation
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        pred: object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores; argmax over the last axis
            gives the predicted class).

    Returns:
        dict with keys 'accuracy', 'f1', 'precision', and 'recall'.
    """
    # Imported here so the function does not rely on an import cell having
    # run; the metric helpers are not imported in the cells above.
    from sklearn.metrics import accuracy_score, precision_recall_fscore_support

    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # average='weighted' weights each class's score by its support.
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }


In [39]:
from transformers import AutoModelForSequenceClassification

# Load the checkpoint with a classification head sized for the three labels
# (yes, no, maybe -> 0, 1, 2).
NUM_LABELS = 3
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [42]:
from transformers import TrainingArguments

# Setting up training arguments
# NOTE: using the Trainer with PyTorch requires `accelerate>=0.26.0` to be
# installed in the kernel's environment (see the ImportError raised here).
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # 800 training examples / batch size 16 * 3 epochs is about 150 optimizer
    # steps on a single device, so a 500-step warmup would never finish and
    # the learning rate would never reach its configured value. Warm up over
    # the first 10% of training instead.
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    # Evaluate and checkpoint on the same schedule so the best checkpoint
    # (by F1) can be restored at the end of training.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1"
)

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`