In [3]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.41.2-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.23.0 (from transformers)
  Downloading huggingface_hub-0.23.4-py3-none-any.whl.metadata (12 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.5.15-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Downloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Downloading transformers-4.41.2-py3-none-any.whl (9.1 MB)
[

In [4]:
import json
import re
from transformers import BertTokenizer, get_linear_schedule_with_warmup
import pandas as pd
import torch
from transformers import BertForQuestionAnswering, AdamW
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler, TensorDataset
import torch.nn.functional as F
from sklearn.metrics import f1_score

In [2]:
# Define device (GPU if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# Load dataset
df = pd.read_json('test.final.json')

# Display the dataset
print(df.head())

                                                data version
0  {'title': '177', 'paragraphs': [{'context': 'a...    v0.1
1  {'title': '185', 'paragraphs': [{'context': 'a...    v0.1
2  {'title': '195', 'paragraphs': [{'context': 'a...    v0.1
3  {'title': '2', 'paragraphs': [{'context': 'adm...    v0.1
4  {'title': '29', 'paragraphs': [{'context': 'ad...    v0.1


In [4]:
# Preprocess the text data
def preprocess(text):
    text = text.lower().strip()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return text

In [5]:
# Extract question-answer pairs
qa_pairs = []
for item in df['data']:
    for paragraph in item['paragraphs']:
        context = paragraph['context']
        for qa in paragraph['qas']:
            question = qa['question']
            answer = qa['answers'][0]['text']
            qa_pairs.append({'question': question, 'answer': answer, 'context': context})

qa_df = pd.DataFrame(qa_pairs)

# Display the extracted dataframe
print(qa_df.head())

                                            question  \
0  does the patient have a current copd exacerbation   
1  does the patient have a history of shortness o...   
2  has been the patient ever been considered for ...   
3  does the patient have a prior history of short...   
4    what is the patient 's copd exacerbation status   

                                              answer  \
0  chief complaint: copd exacerbation/shortness o...   
1  chief complaint: copd exacerbation/shortness o...   
2  chief complaint: copd exacerbation/shortness o...   
3  chief complaint: copd exacerbation/shortness o...   
4  chief complaint: copd exacerbation/shortness o...   

                                             context  
0  admission date: [**2124-7-21**] discharge date...  
1  admission date: [**2124-7-21**] discharge date...  
2  admission date: [**2124-7-21**] discharge date...  
3  admission date: [**2124-7-21**] discharge date...  
4  admission date: [**2124-7-21**] discharge date..

In [6]:
# Apply preprocessing to 'question', 'answer', and 'context' columns
qa_df['question'] = qa_df['question'].apply(preprocess)
qa_df['answer'] = qa_df['answer'].apply(preprocess)
qa_df['context'] = qa_df['context'].apply(preprocess)

In [7]:
# Tokenize the text data using the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize(text, context=None):
    if context:
        return tokenizer.encode_plus(
            text,
            context,
            add_special_tokens=True,
            max_length=512,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
    else:
        return tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=512,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

qa_df['question_tokens'] = qa_df.apply(lambda x: tokenize(x['question'], x['context']), axis=1)
qa_df['answer_tokens'] = qa_df['answer'].apply(lambda x: tokenize(x))

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

In [8]:
# Display the tokenized dataframe
print(qa_df.head())

                                            question  \
0  does the patient have a current copd exacerbation   
1  does the patient have a history of shortness o...   
2  has been the patient ever been considered for ...   
3  does the patient have a prior history of short...   
4     what is the patient s copd exacerbation status   

                                              answer  \
0  chief complaint copd exacerbationshortness of ...   
1  chief complaint copd exacerbationshortness of ...   
2  chief complaint copd exacerbationshortness of ...   
3  chief complaint copd exacerbationshortness of ...   
4  chief complaint copd exacerbationshortness of ...   

                                             context  \
0  admission date 2124721 discharge date 2124818 ...   
1  admission date 2124721 discharge date 2124818 ...   
2  admission date 2124721 discharge date 2124818 ...   
3  admission date 2124721 discharge date 2124818 ...   
4  admission date 2124721 discharge date 21248

In [11]:
# Prepare input tensors for BERT
input_ids = torch.cat([x['input_ids'] for x in qa_df['question_tokens'].tolist()], dim=0)
attention_masks = torch.cat([x['attention_mask'] for x in qa_df['question_tokens'].tolist()], dim=0)

# Function to find start and end positions of the answer in the context
def get_start_end_positions(context, answer):
    context_tokens = tokenizer.encode(context, add_special_tokens=True, max_length=512, truncation=True)
    answer_tokens = tokenizer.encode(answer, add_special_tokens=True, max_length=512, truncation=True)
    
    for i in range(len(context_tokens) - len(answer_tokens) + 1):
        if context_tokens[i:i+len(answer_tokens)-2] == answer_tokens[1:-1]:
            start_position = i
            end_position = i + len(answer_tokens) - 3 
            return start_position, end_position

    return None, None  

qa_df['start_position'] = qa_df.apply(lambda x: get_start_end_positions(x['context'], x['answer'])[0], axis=1)
qa_df['end_position'] = qa_df.apply(lambda x: get_start_end_positions(x['context'], x['answer'])[1], axis=1)

In [12]:
# Drop rows with NaN values in start and end positions
qa_df = qa_df.dropna(subset=['start_position', 'end_position'])

start_positions = torch.tensor(qa_df['start_position'].tolist(), dtype=torch.long)
end_positions = torch.tensor(qa_df['end_position'].tolist(), dtype=torch.long)

# Ensure the lengths of input_ids, attention_masks, start_positions, and end_positions match
print(f"input_ids size: {input_ids.size()}")
print(f"attention_masks size: {attention_masks.size()}")
print(f"start_positions size: {start_positions.size()}")
print(f"end_positions size: {end_positions.size()}")

# Verify lengths before creating TensorDataset
assert input_ids.size(0) == attention_masks.size(0) == start_positions.size(0) == end_positions.size(0), "Size mismatch between tensors"

dataset = TensorDataset(input_ids, attention_masks, start_positions, end_positions)

train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=16) 
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=16) 

print("Data preparation complete.")

input_ids size: torch.Size([381, 512])
attention_masks size: torch.Size([381, 512])
start_positions size: torch.Size([381])
end_positions size: torch.Size([381])
Data preparation complete.


In [14]:
from sklearn.metrics import f1_score

In [21]:
# Load pretrained BERT model
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
model.to(device) 

# Fine-tune the model
optimizer = AdamW(model.parameters(), lr=2e-5)

# Total number of training steps
total_steps = len(train_dataloader) * 5

# Create the learning rate scheduler
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

for epoch in range(5):  # Increased epochs
    model.train()
    for batch in train_dataloader:
        optimizer.zero_grad()
        input_ids, attention_masks, start_positions, end_positions = batch

        # Move tensors to the correct device
        input_ids = input_ids.to(device)
        attention_masks = attention_masks.to(device)
        start_positions = start_positions.to(device)
        end_positions = end_positions.to(device)

        outputs = model(input_ids=input_ids, 
                        attention_mask=attention_masks, 
                        start_positions=start_positions, 
                        end_positions=end_positions)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()  # Update learning rate
        
        # Calculate accuracy for this batch
        start_preds = torch.argmax(outputs.start_logits, dim=-1)
        end_preds = torch.argmax(outputs.end_logits, dim=-1)
        
        train_accuracy = ((start_preds == start_positions).float().mean().item() + 
                          (end_preds == end_positions).float().mean().item()) / 2
        total_train_accuracy += train_accuracy

    avg_train_loss = total_train_loss / len(train_dataloader)
    avg_train_accuracy = total_train_accuracy / len(train_dataloader)

    # Validation step
    model.eval()
    total_eval_accuracy = 0

    for batch in val_dataloader:
        input_ids, attention_masks, start_positions, end_positions = batch

        # Move tensors to the correct device
        input_ids = input_ids.to(device)
        attention_masks = attention_masks.to(device)
        start_positions = start_positions.to(device)
        end_positions = end_positions.to(device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_masks)
        start_logits = outputs.start_logits
        end_logits = outputs.end_logits
        
        start_preds = torch.argmax(start_logits, dim=-1)
        end_preds = torch.argmax(end_logits, dim=-1)
        
        total_eval_accuracy += ((start_preds == start_positions).float().mean().item() + 
                                (end_preds == end_positions).float().mean().item()) / 2

    print(f"Epoch {epoch+1}: Validation Accuracy: {total_eval_accuracy / len(val_dataloader):.2f}")

print("Training complete.")


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyboardInterrupt: 

In [None]:
# Load pretrained BERT model
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
model.to(device) 

# Fine-tune the model
optimizer = AdamW(model.parameters(), lr=2e-5)

# Total number of training steps
total_steps = len(train_dataloader) * 5

# Create the learning rate scheduler
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

for epoch in range(5):  # Increased epochs
    model.train()
    total_train_loss = 0
    total_train_accuracy = 0
    total_train_f1 = 0
    for batch in train_dataloader:
        optimizer.zero_grad()
        input_ids, attention_masks, start_positions, end_positions = batch

        # Move tensors to the correct device
        input_ids = input_ids.to(device)
        attention_masks = attention_masks.to(device)
        start_positions = start_positions.to(device)
        end_positions = end_positions.to(device)

        outputs = model(input_ids=input_ids, 
                        attention_mask=attention_masks, 
                        start_positions=start_positions, 
                        end_positions=end_positions)
        loss = outputs.loss
        total_train_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()  # Update learning rate

        # Calculate accuracy for this batch
        start_preds = torch.argmax(outputs.start_logits, dim=-1)
        end_preds = torch.argmax(outputs.end_logits, dim=-1)
        
        train_accuracy = ((start_preds == start_positions).float().mean().item() + 
                          (end_preds == end_positions).float().mean().item()) / 2
        total_train_accuracy += train_accuracy

        # Calculate F1 score for this batch
        start_f1 = f1_score(start_positions.cpu().numpy(), start_preds.cpu().numpy(), average='micro')
        end_f1 = f1_score(end_positions.cpu().numpy(), end_preds.cpu().numpy(), average='micro')
        train_f1 = (start_f1 + end_f1) / 2
        total_train_f1 += train_f1

    avg_train_loss = total_train_loss / len(train_dataloader)
    avg_train_accuracy = total_train_accuracy / len(train_dataloader)
    avg_train_f1 = total_train_f1 / len(train_dataloader)

    # Validation step
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    total_eval_f1 = 0

    for batch in val_dataloader:
        input_ids, attention_masks, start_positions, end_positions = batch

        # Move tensors to the correct device
        input_ids = input_ids.to(device)
        attention_masks = attention_masks.to(device)
        start_positions = start_positions.to(device)
        end_positions = end_positions.to(device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_masks,
                            start_positions=start_positions, end_positions=end_positions)
        loss = outputs.loss
        total_eval_loss += loss.item()
        start_logits = outputs.start_logits
        end_logits = outputs.end_logits
        
        start_preds = torch.argmax(start_logits, dim=-1)
        end_preds = torch.argmax(end_logits, dim=-1)
        
        total_eval_accuracy += ((start_preds == start_positions).float().mean().item() + 
                                (end_preds == end_positions).float().mean().item()) / 2

        # Calculate F1 score for this batch
        start_f1 = f1_score(start_positions.cpu().numpy(), start_preds.cpu().numpy(), average='micro')
        end_f1 = f1_score(end_positions.cpu().numpy(), end_preds.cpu().numpy(), average='micro')
        eval_f1 = (start_f1 + end_f1) / 2
        total_eval_f1 += eval_f1

    avg_val_accuracy = total_eval_accuracy / len(val_dataloader)
    avg_val_loss = total_eval_loss / len(val_dataloader)
    avg_val_f1 = total_eval_f1 / len(val_dataloader)

    print(f"Epoch {epoch+1}: Train Loss: {avg_train_loss:.3f}, Train Accuracy: {avg_train_accuracy:.3f}, Train F1: {avg_train_f1:.3f}, Validation Loss: {avg_val_loss:.3f}, Validation Accuracy: {avg_val_accuracy:.3f}, Validation F1: {avg_val_f1:.3f}")



Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1: Train Loss: 6.190, Train Accuracy: 0.002, Train F1: 0.002, Validation Loss: 6.052, Validation Accuracy: 0.025, Validation F1: 0.025
Epoch 2: Train Loss: 5.866, Train Accuracy: 0.020, Train F1: 0.020, Validation Loss: 5.855, Validation Accuracy: 0.037, Validation F1: 0.037
Epoch 3: Train Loss: 5.529, Train Accuracy: 0.066, Train F1: 0.066, Validation Loss: 5.665, Validation Accuracy: 0.050, Validation F1: 0.050
Epoch 5: Train Loss: 4.937, Train Accuracy: 0.102, Train F1: 0.102, Validation Loss: 5.356, Validation Accuracy: 0.044, Validation F1: 0.044
