In [2]:
!pip install transformers datasets torch

import torch

# Select the compute device once: CUDA when a GPU is visible, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("GPU is available" if device.type == "cuda" else "GPU not available, using CPU instead")

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m35.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.19.0-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m58.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading saf

In [13]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from google.colab import drive
import numpy as np

# Mount Google Drive into the Colab filesystem so files under
# /content/drive/MyDrive can be read and written.
drive.mount('/content/drive')
# NOTE(review): unpickling can execute arbitrary code; only load pickles that
# come from a trusted source.
# The saved preview below shows a Series named "text" with dtype object —
# presumably one string of concatenated notes per row; confirm against the
# code that produced test.pkl.
df = pd.read_pickle("/content/drive/MyDrive/test.pkl")

# Preview the first rows.
df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


0    Note 1: EXAMINATION:  Chest radiograph\n\nINDI...
1    Note 1: ADDENDUM\nAGATSTON SCORE: The total (a...
2    Note 1: EXAMINATION:  DX CHEST PORT LINE/TUBE ...
3    Note 1: EXAMINATION:  CHEST (PORTABLE AP)\n\nI...
4    Note 1: ADDENDUM  The right common femoral art...
Name: text, dtype: object

In [3]:
from transformers import BertForSequenceClassification, AdamW, BertConfig, AutoTokenizer

# Hub checkpoint that supplies the BERT architecture, config and vocabulary.
model_name = 'dmis-lab/biobert-v1.1'
# Build a 2-label sequence classifier and move it to the selected device. The
# classification head is freshly initialised at this point (see the warning in
# the cell output); it is presumably replaced by the fine-tuned weights below.
loaded_model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
).to(device)
# Map the checkpoint onto `device` rather than the literal 'cuda': with a
# hard-coded 'cuda', torch.load raises on a CPU-only runtime even though the
# notebook otherwise falls back to the CPU.
# NOTE(review): torch.load unpickles the file; only load checkpoints you trust.
loaded_model.load_state_dict(
    torch.load('/content/drive/MyDrive/finetuned_biobert_final.pt', map_location=device)
)
# Make inference mode explicit so dropout is disabled for the predictions.
loaded_model.eval()
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/462 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)okenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [22]:
import torch.nn.functional as F
import csv

def separate_notes(note):
  """Split one concatenated record into its individual notes.

  The record is cut only at note headers of the form "Note <number>:".
  Splitting on the bare substring "Note" would also cut a note in two
  wherever "Note", "Noted" or "Notes" occurs inside the body text.

  Args:
    note: A string holding one or more notes, each introduced by a
      "Note <number>:" header.

  Returns:
    A list of stripped, non-empty fragments. The leading "Note" keyword is
    removed from each fragment while the "<number>: ..." text is kept. Any
    text before the first header is returned as its own fragment. Empty or
    whitespace-only input yields an empty list.
  """
  import re  # local import so the function works regardless of cell order

  # The lookahead keeps the "<number>:" part inside the fragment that follows.
  pieces = re.split(r"Note(?=\s*\d+\s*:)", note)
  return [piece.strip() for piece in pieces if piece.strip()]


# Destination of the exported predictions; not used by the loop in this cell.
file_path = "/content/drive/MyDrive/test_result.csv"
# Empty frame kept for compatibility; the loop collects labels in `data` instead.
test_result = pd.DataFrame(columns=['Output'])
count = 1
# Minimum softmax probability the winning class must reach; below it the note
# is counted as class 0.
threshold = 0.7
# An item is labelled "True" only when more than this fraction of its notes
# are predicted as a non-zero class.
min_positive_fraction = 0.95
data = []


# Classify every note of every item, then label the item by the fraction of
# its notes that were predicted positive.
for item in df:
    notes = separate_notes(item)
    trues = 0
    falses = 0

    for note in notes:
        # One note per forward pass, truncated/padded to 512 tokens.
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
        encoded_dict = tokenizer(
            note,
            add_special_tokens=True,  # '[CLS]' and '[SEP]'
            max_length=512,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        note_input_ids = encoded_dict['input_ids'].to(device)
        note_attention_masks = encoded_dict['attention_mask'].to(device)

        with torch.no_grad():
            val_results = loaded_model(note_input_ids, attention_mask=note_attention_masks)

        # Softmax is taken on the CPU copy of the logits.
        logits = val_results['logits'].cpu()
        prob = F.softmax(logits, dim=-1).numpy()
        # Keep the argmax class only when its probability exceeds `threshold`;
        # otherwise fall back to class 0.
        val_y_pred = np.where(prob.max(axis=1) > threshold, prob.argmax(axis=1), 0)
        if val_y_pred[0] == 0:
            falses += 1
        else:
            trues += 1

    # An item with no notes has no positive evidence: label it "False" instead
    # of dividing by zero.
    total = trues + falses
    output = 1 if total and (trues / total) > min_positive_fraction else 0
    label = "True" if output == 1 else "False"
    data.append(label)

    print(label)
    print(str(count) + " finished")
    count += 1








True
1 finished
False
2 finished
False
3 finished
False
4 finished
False
5 finished
False
6 finished
True
7 finished
True
8 finished
True
9 finished
False
10 finished
False
11 finished
False
12 finished
False
13 finished
True
14 finished
True
15 finished
True
16 finished
False
17 finished
False
18 finished
False
19 finished
True
20 finished
False
21 finished
False
22 finished
True
23 finished
False
24 finished
False
25 finished
True
26 finished
False
27 finished
True
28 finished
False
29 finished
True
30 finished
False
31 finished
False
32 finished
False
33 finished
False
34 finished
False
35 finished
False
36 finished
True
37 finished
False
38 finished
True
39 finished
False
40 finished
False
41 finished
False
42 finished
False
43 finished
True
44 finished
True
45 finished
False
46 finished
False
47 finished
False
48 finished
False
49 finished
True
50 finished
False
51 finished
False
52 finished
False
53 finished
False
54 finished
False
55 finished
False
56 finished
False
57 finished


In [34]:
import numpy as np

# Convert the collected "True"/"False" labels into a NumPy string array
# (the saved output further down shows dtype '<U5').
q = np.array(data)

In [38]:
# Write one label per row, with no index column and no header line.
# DataFrame.to_csv returns None when it is given a path, so the result is not
# assigned to a variable.
pd.DataFrame(q).to_csv('/content/drive/MyDrive/test_result.csv', index=False, header=False)

array(['True', 'False', 'False', ..., 'False', 'False', 'False'],
      dtype='<U5')