<a href="https://colab.research.google.com/github/Dan-La/scientific-challenges-and-directions/blob/main/old_notebooks/Inference_Notebook_(OLD).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# A notebook to perform inference on sentences with the problem/direction labels

Note: this is an older version of the notebook that loads the best model from the paper; we have made several upgrades since then. Check out the new version on Hugging Face: https://huggingface.co/DanL/scientific-challenges-and-directions, or in our repo: https://github.com/Dan-La/scientific-challenges-and-directions

## Installs and setup

In [None]:
### install/import
import pip
import numpy as np
import pandas as pd

!pip install -v transformers==4.9.2 
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

Using pip 21.1.3 from /usr/local/lib/python3.7/dist-packages/pip (python 3.7)
Value for scheme.platlib does not match. Please report this to <https://github.com/pypa/pip/issues/9617>
distutils: /usr/local/lib/python3.7/dist-packages
sysconfig: /usr/lib/python3.7/site-packages
Value for scheme.purelib does not match. Please report this to <https://github.com/pypa/pip/issues/9617>
distutils: /usr/local/lib/python3.7/dist-packages
sysconfig: /usr/lib/python3.7/site-packages
Value for scheme.headers does not match. Please report this to <https://github.com/pypa/pip/issues/9617>
distutils: /usr/local/include/python3.7/UNKNOWN
sysconfig: /usr/include/python3.7m/UNKNOWN
Value for scheme.scripts does not match. Please report this to <https://github.com/pypa/pip/issues/9617>
distutils: /usr/local/bin
sysconfig: /usr/bin
Value for scheme.data does not match. Please report this to <https://github.com/pypa/pip/issues/9617>
distutils: /usr/local
sysconfig: /usr
Additional context:
user = False
home

In [None]:
### check I am using a GPU
# Device selection relies on PyTorch only: the model used in this notebook is a
# torch module, and the earlier TensorFlow device probes discarded their return
# values, so they are not needed to pick the device.
if torch.cuda.is_available():
    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    # If not, fall back to the CPU.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


In [None]:
# Record the runtime for reproducibility: the version reported by the `python`
# executable on the shell PATH, followed by the installed PyTorch version.
import torch
!python --version
print(torch.__version__)

Python 3.7.12
1.9.0+cu111


## Model path, parameters, and classes

In [None]:
### Model and inference configuration

# Fixed token length: every sentence is padded or truncated to this size.
MAX_LEN = 128
# Number of labels predicted per sentence: ['problem', 'direction'].
NUM_LABELS = 2

# How many sentences are scored per batch; tune to the available memory.
INFERENCE_BATCH_SIZE = 16
# Keyword arguments handed to the DataLoader: no shuffling, so results come
# back in input order, and no extra worker processes.
inference_params = dict(
    batch_size=INFERENCE_BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)

In [None]:
### transform the data into the required tokenized form and prepare it for the DataLoader
class ProblemDirectionDataset(torch.utils.data.Dataset):
    """Tokenizes sentences one at a time so they can be fed to a DataLoader.

    Args:
        texts: indexable, sized collection of sentences; each item is passed
            through ``str`` and whitespace-normalized before tokenization.
        labels: indexable, sized collection of per-sentence label vectors
            aligned with ``texts``, or ``None`` when no labels exist (pure
            inference). With ``None``, every item gets an all-zero placeholder
            target of length ``num_labels``.
        tokenizer: object exposing a Hugging Face style ``encode_plus`` method.
        max_len: length every sentence is padded/truncated to.
        num_labels: width of the placeholder target used when ``labels`` is
            ``None``; ignored otherwise.

    Raises:
        ValueError: if ``labels`` is given and its length differs from ``texts``.
    """

    def __init__(self, texts, labels, tokenizer, max_len, num_labels=2):
        # Fail early instead of silently truncating the dataset (or raising an
        # IndexError halfway through iteration) when the inputs are misaligned.
        if labels is not None and len(labels) != len(texts):
            raise ValueError(
                "texts and labels must have the same length, got %d and %d"
                % (len(texts), len(labels))
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.num_labels = num_labels

    def __getitem__(self, idx):
        """Return the tokenized tensors, target vector and raw text of item ``idx``."""
        # Collapse any run of whitespace into a single space before tokenizing.
        text = str(self.texts[idx])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus( #TODO: change to encoding per batch; to avoid a global max_len padding
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        if self.labels is None:
            # No ground truth available: emit a placeholder so batches keep
            # the same keys as in the labelled case.
            targets = torch.zeros(self.num_labels, dtype=torch.long)
        else:
            targets = torch.tensor(self.labels[idx], dtype=torch.long)

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': targets,
            # The original (un-normalized) text is returned for reporting.
            'text': self.texts[idx]
        }

    def __len__(self):
        # The dataset size is defined by the sentences, not by the labels,
        # which may be absent.
        return len(self.texts)

In [None]:
### define the NN
class PubmedBERTClass(torch.nn.Module):
    """PubMedBERT encoder topped with a two-layer classification head.

    NOTE(review): this notebook restores a fully pickled instance of this class
    via ``torch.load``, so the submodule names (``l1``, ``pre_classifier``,
    ``l2``, ``l3``) presumably have to stay as they are for that checkpoint to
    keep working; confirm before renaming anything.
    """

    def __init__(self):
        super().__init__()
        # Pretrained encoder.
        self.l1 = AutoModel.from_pretrained(
            "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext",
            num_labels=NUM_LABELS,
        )
        # Head: dense projection -> ReLU -> dropout -> one logit per label.
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the raw (pre-sigmoid) scores produced by the final linear layer."""
        encoded = self.l1(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # First element of the encoder output, sliced at sequence position 0.
        first_token = encoded[0][:, 0]
        projected = torch.nn.functional.relu(self.pre_classifier(first_token))
        return self.l3(self.l2(projected))

## Load the model

In [None]:
### import the tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext", do_lower_case=True)

### load the model
!wget https://challenges-directions.s3.us-west-2.amazonaws.com/Multilabel_ProblemDirection.pth
model = PubmedBERTClass()
OPTIMIZER = torch.optim.Adam(params =  model.parameters(), lr=1e-05)
LOSS_FUNCTION = torch.nn.BCELoss() # for the MultiLabel Case
MODEL = torch.load('Multilabel_ProblemDirection.pth')

MODEL.to(device)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

--2021-10-16 23:33:21--  https://challenges-directions.s3.us-west-2.amazonaws.com/Multilabel_ProblemDirection.pth
Resolving challenges-directions.s3.us-west-2.amazonaws.com (challenges-directions.s3.us-west-2.amazonaws.com)... 52.218.252.17
Connecting to challenges-directions.s3.us-west-2.amazonaws.com (challenges-directions.s3.us-west-2.amazonaws.com)|52.218.252.17|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 440407131 (420M) [application/x-www-form-urlencoded]
Saving to: ‘Multilabel_ProblemDirection.pth’


2021-10-16 23:33:34 (32.7 MB/s) - ‘Multilabel_ProblemDirection.pth’ saved [440407131/440407131]



Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


PubmedBERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True

## Inference Functions

In [None]:
### inference functions
def valid(model, testing_loader, loss_function, optimizer, threshold=0.5):
    """Run ``model`` over ``testing_loader`` and collect per-sentence predictions.

    Args:
        model: torch module called as ``model(ids, mask, token_type_ids)`` and
            returning one raw score per label for every sentence in the batch.
        testing_loader: iterable of batches; each batch is a dict with the keys
            'ids', 'mask', 'token_type_ids', 'targets' (tensors) and 'text'.
        loss_function: unused; accepted only so existing callers keep working.
        optimizer: unused; accepted only so existing callers keep working.
        threshold: a label is predicted (1) when its sigmoid score is strictly
            greater than this value.

    Returns:
        Tuple ``(final_targets, final_outputs, final_logits, final_texts)``:
        the target rows as numpy arrays, the thresholded 0/1 predictions as
        numpy arrays, the sigmoid scores as lists of floats (despite the
        ``logits`` name), and the texts, all in loader order.
    """
    model.eval()
    # Run on whatever device the model lives on rather than on a global, so a
    # model and its inputs can never end up on different devices.
    try:
        run_device = next(model.parameters()).device
    except StopIteration:  # parameter-free model
        run_device = torch.device("cpu")

    final_outputs = []; final_targets = []; final_logits = []; final_texts = []
    with torch.no_grad():
        for data in testing_loader:
            ids = data['ids'].to(run_device, dtype = torch.long)
            mask = data['mask'].to(run_device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(run_device, dtype = torch.long)
            targets = data['targets'].to(run_device, dtype = torch.long)

            # Keep an explicit (batch, labels) layout; this also covers a batch
            # holding a single sentence without any special-casing.
            outputs = model(ids, mask, token_type_ids).reshape(ids.size(0), -1)
            # Sigmoid is computed once and reused for both the scores and the
            # thresholded predictions.
            scores = torch.sigmoid(outputs).cpu().detach().numpy()

            final_targets.extend(targets.cpu().detach().numpy())
            # the default threshold is 0.5 for both labels
            final_outputs.extend(np.where(scores > threshold, 1, 0))
            final_logits.extend(scores.tolist())
            final_texts.extend(data['text'])

    return final_targets, final_outputs, final_logits, final_texts


def inference(sentences, model=MODEL, loss_function=LOSS_FUNCTION, optimizer=OPTIMIZER, threshold=0.5):
    """Score sentences for the 'problem' and 'direction' labels.

    Thin wrapper around ``valid`` for inference purposes.

    Args:
        sentences: an iterable of sentences, a single sentence given as a
            ``str`` (treated as a one-item list), or ``None``.
        model: model to run; defaults to the loaded ``MODEL``.
        loss_function: forwarded to ``valid``.
        optimizer: forwarded to ``valid``.
        threshold: forwarded to ``valid``.

    Returns:
        A list with one dict per sentence, in input order, of the form
        ``{'sequence': text, 'output': {'problem': score, 'direction': score}}``.
        The list is empty when ``sentences`` is ``None`` or empty.
    """
    if sentences is None:
        return []
    # A bare string would otherwise be iterated character by character.
    if isinstance(sentences, str):
        sentences = [sentences]
    else:
        sentences = list(sentences)
    if not sentences:
        return []

    # dummy labels since running in inference; one zero per label
    sentences_labels = [[0] * NUM_LABELS for _ in sentences]
    sentences_dataset = ProblemDirectionDataset(sentences, sentences_labels, tokenizer, MAX_LEN) # prep the dataset
    sentences_loader = DataLoader(sentences_dataset, **inference_params)
    _, _, final_logits, final_texts = valid(model=model, testing_loader=sentences_loader, loss_function=loss_function, optimizer=optimizer, threshold=threshold)
    return [
        {'sequence': text, 'output': {'problem': score[0], 'direction': score[1]}}
        for score, text in zip(final_logits, final_texts)
    ]

## Inference

In [None]:
### infer sentences

# Example sentences to score.
sentences = [
    "we speculate that studying IL-6 will be beneficial",
    "there is no solution to IRB limitation",
    "germs find replications difficult",
    "IbMADS1-transformed potatoes exhibited tuber morphogenesis in the fibrous roots.",
    "Severe atypical cases of pneumonia emerged and quickly spread worldwide.",
    "The use of aprotinin has been linked with a higher incidence of both perioperative myocardial infarction and early vein graft closure.2",
]

results = inference(sentences, model=MODEL)

# Show one result dict per line.
print('\n'.join(str(result) for result in results))

{'sequence': 'we speculate that studying IL-6 will be beneficial', 'output': {'problem': 0.18894514441490173, 'direction': 0.9751405715942383}}
{'sequence': 'there is no solution to IRB limitation', 'output': {'problem': 0.7472769021987915, 'direction': 0.006989801302552223}}
{'sequence': 'germs find replications difficult', 'output': {'problem': 0.47354599833488464, 'direction': 0.006683778017759323}}
{'sequence': 'IbMADS1-transformed potatoes exhibited tuber morphogenesis in the fibrous roots.', 'output': {'problem': 0.015217977575957775, 'direction': 0.011578606441617012}}
{'sequence': 'Severe atypical cases of pneumonia emerged and quickly spread worldwide.', 'output': {'problem': 0.9731208086013794, 'direction': 0.01800379902124405}}
{'sequence': 'The use of aprotinin has been linked with a higher incidence of both perioperative myocardial infarction and early vein graft closure.2', 'output': {'problem': 0.9820389151573181, 'direction': 0.02942519076168537}}
