In [3]:
import json
import os
from pathlib import Path

import pandas as pd

# Location of the source JSON file. The default is the original local path;
# set the PQAL_JSON_PATH environment variable to run the notebook on another
# machine without editing this cell.
PQAL_JSON_PATH = Path(
    os.environ.get(
        "PQAL_JSON_PATH",
        "/Users/casey/Documents/GitHub/LLM_Healthcare/ori_pqal.json",
    )
)

# Decode explicitly as UTF-8 so the parsed text does not depend on the
# platform's default text encoding.
with PQAL_JSON_PATH.open(encoding="utf-8") as f:
    data = json.load(f)

# orient='index' turns every top-level key of the JSON object into one row,
# with the key as the row index.
df = pd.DataFrame.from_dict(data, orient='index')

# Rename the fields used downstream to lowercase column names.
df = df.rename(columns={
    'QUESTION': 'question',
    'CONTEXTS': 'context',
    'LONG_ANSWER': 'long_answer',
    'final_decision': 'final_answer'  # Assuming 'final_decision' corresponds to 'final_answer'
})

# Keep only the four columns the rest of the notebook works with
# (raises KeyError if any of them is missing from the file).
df = df[['question', 'context', 'long_answer', 'final_answer']]
df.head()

Unnamed: 0,question,context,long_answer,final_answer
21645374,Do mitochondria play a role in remodelling lac...,[Programmed cell death (PCD) is the regulated ...,Results depicted mitochondrial dynamics in viv...,yes
16418930,Landolt C and snellen e acuity: differences in...,[Assessment of visual acuity depends on the op...,"Using the charts described, there was only a s...",no
9488747,"Syncope during bathing in infants, a pediatric...",[Apparent life-threatening events in infants a...,"""Aquagenic maladies"" could be a pediatric form...",yes
17208539,Are the long-term results of the transanal pul...,[The transanal endorectal pull-through (TERPT)...,Our long-term study showed significantly bette...,no
10808977,Can tailored interventions increase mammograph...,[Telephone counseling and tailored print commu...,The effects of the intervention were most pron...,yes


### Clean & Normalize Biomedical Text

We will handle things like:
- Lowercasing (if using uncased model)
- Removing excessive whitespace
- Optional: abbreviation expansion (with a domain-specific dictionary)

In [6]:
import re

def clean_text(text):
    """Lowercase text and collapse its whitespace.

    Args:
        text: A string, a list/tuple of values (for example the passages of
            one context), or any other value.

    Returns:
        - For a string: the lowercased, stripped string with every run of
          whitespace replaced by a single space.
        - For a list or tuple: a container of the same kind whose elements
          have each been passed through ``clean_text``.
        - For anything else (``None``, numbers, NaN, ...): the value unchanged.
    """
    if isinstance(text, str):
        return re.sub(r'\s+', ' ', text.lower().strip())

    # Multi-passage fields arrive as lists of strings; without this branch
    # they would be returned untouched and never be normalised.
    if isinstance(text, list):
        return [clean_text(item) for item in text]
    if isinstance(text, tuple):
        return tuple(clean_text(item) for item in text)

    return text  # Non-text values are passed through as-is

# Normalise each free-text column in turn by running every cell through clean_text.
for col in ('question', 'context', 'long_answer'):
    df[col] = df[col].map(clean_text)

### Convert Labels to Integers

We’ll map yes / no / maybe -> 0 / 1 / 2 for classification.

In [7]:
# Integer class id for each of the three answer classes.
label_map = {'yes': 0, 'no': 1, 'maybe': 2}

# final_answer is not touched by the text-cleaning step, so normalise case and
# surrounding whitespace here before looking the values up.
normalized_answers = df['final_answer'].astype(str).str.strip().str.lower()
labels = normalized_answers.map(label_map)

# Series.map yields NaN for any value that is not a key of label_map, which
# would silently turn the label column into floats. Fail loudly instead.
unmapped_answers = list(df.loc[labels.isna(), 'final_answer'].unique())
if unmapped_answers:
    raise ValueError(f"Unexpected final_answer values: {unmapped_answers}")

df['label'] = labels.astype('int64')

### Tokenize with PubMedBERT / BioBERT

We use Hugging Face's `transformers` library to tokenize each (question, context) input pair.

In [20]:
from transformers import AutoTokenizer

# Hub id of the checkpoint whose tokenizer is used below (an uncased model, per its name).
MODEL_NAME = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract"

# Tokenizer matching MODEL_NAME; tokenize_function relies on this global.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=MODEL_NAME)

def _join_context(context):
    """Flatten one example's context into a single string for the tokenizer."""
    if context is None:
        return ''
    if isinstance(context, (list, tuple)):
        # Join the passages with spaces rather than tokenizing the Python
        # list repr (brackets, quotes and commas).
        return ' '.join(str(passage) for passage in context)
    return str(context)


def tokenize_function(example):
    """Tokenize (question, context) pairs with the module-level ``tokenizer``.

    Works for both calling conventions of ``Dataset.map``:

    - single example (``batched=False``): ``example['question']`` is one
      value and ``example['context']`` is that example's context;
    - batch (``batched=True``): ``example['question']`` is a list of
      questions and ``example['context']`` a parallel list of contexts.

    Args:
        example: Mapping with ``'question'`` and ``'context'`` entries.

    Returns:
        Whatever ``tokenizer`` returns for the pair(s), padded and truncated
        to 512 tokens.
    """
    questions = example['question']
    contexts = example['context']

    if isinstance(questions, (list, tuple)):
        # Batched call: tokenize element-wise. Calling str() on the whole
        # list would collapse the batch into one string.
        first = [str(question) for question in questions]
        second = [_join_context(context) for context in contexts]
    else:
        first = str(questions)
        second = _join_context(contexts)

    return tokenizer(
        first,
        second,
        padding='max_length',
        truncation=True,
        max_length=512
    )

### Prepare Hugging Face Dataset

Convert the cleaned DataFrame into the datasets format and tokenize it.

In [21]:
!pip install datasets



In [None]:
from datasets import Dataset

# Carry only the two model inputs and the target over into the Hugging Face dataset.
dataset = Dataset.from_pandas(df.loc[:, ['question', 'context', 'label']])

# Apply tokenize_function to the dataset, handing it batches of rows.
tokenized_dataset = dataset.map(function=tokenize_function, batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]