<a href="https://colab.research.google.com/github/Dan-La/scientific-challenges-and-directions/blob/main/Inference_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# A notebook for running inference on sentences to predict challenge/direction labels


## Installs and setup

In [None]:
### install/import
import pip
import numpy as np
import pandas as pd

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

In [None]:
### check I am using a GPU
# TensorFlow is not needed by this PyTorch notebook. The import is kept on a
# best-effort basis only so that the names `tf` and `device_lib` stay defined;
# the GPU is not probed through TensorFlow, because the probe results were
# never used and initialising TensorFlow's GPU runtime can reserve device
# memory that the PyTorch model needs.
try:
    from tensorflow.python.client import device_lib
    import tensorflow as tf
except ImportError:
    device_lib = tf = None

if torch.cuda.is_available():
    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    # No CUDA device is visible to PyTorch: fall back to the CPU.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


## Load the model from huggingface



In [None]:
!pip install transformers 
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("DanL/scientific-challenges-and-directions")

model = AutoModelForSequenceClassification.from_pretrained("DanL/scientific-challenges-and-directions")

## Inference

In [None]:
### inference function
def inference(sentences, model=model, threshold=0.5):
    """Score sentences with the challenge/direction classifier.

    Args:
        sentences: A single string or an iterable of strings. ``None`` and
            empty inputs produce an empty result list.
        model: Sequence-classification model to call. Defaults to the
            module-level ``model`` that existed when this function was defined.
        threshold: Currently not applied to the scores; accepted so that
            existing calls passing it keep working.

    Returns:
        A list with one dict per input sentence, in input order, of the form
        ``{'sequence': text, 'output': {'challenge': p0, 'direction': p1}}``
        where ``p0`` and ``p1`` are the sigmoid of the first and second logit.
    """
    if sentences is None:
        return []
    # A bare string would otherwise be zipped character by character against
    # a single row of scores.
    if isinstance(sentences, str):
        sentences = [sentences]
    else:
        sentences = list(sentences)
    if not sentences:
        return []

    # truncation=True keeps over-long sentences within the maximum length the
    # tokenizer is configured with instead of failing inside the model.
    encoding = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True)
    # Inputs must live on the same device as the model weights.
    target_device = next(model.parameters()).device
    encoding = encoding.to(target_device)

    # No gradients are needed for inference; skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**encoding)
    probabilities = outputs['logits'].sigmoid().cpu().numpy()

    return [
        {'sequence': text, 'output': {'challenge': scores[0], 'direction': scores[1]}}
        for scores, text in zip(probabilities, sentences)
    ]

In [None]:
### to play with the model insert your own sentences
sentences = [
    "we speculate that studying IL-6 will be beneficial.",
    "severe atypical cases of pneumonia emerged and quickly spread worldwide.",
    "in future studies, both PRRs should be tested as the cause for multiple deaths.",
    "IbMADS1-transformed potatoes exhibited tuber morphogenesis in the fibrous roots.",
]

# Score the sentences and show one result dict per line.
results = inference(sentences, model=model)
print('\n'.join(map(str, results)))

{'sequence': 'we speculate that studying IL-6 will be beneficial.', 'output': {'challenge': 0.0003476635, 'direction': 0.99976546}}
{'sequence': 'Severe atypical cases of pneumonia emerged and quickly spread worldwide.', 'output': {'challenge': 0.99971944, 'direction': 0.00024237744}}
{'sequence': 'in future studies, both PRRs should be tested as the cause for multiple deaths.', 'output': {'challenge': 0.9997559, 'direction': 0.9998951}}
{'sequence': 'IbMADS1-transformed potatoes exhibited tuber morphogenesis in the fibrous roots.', 'output': {'challenge': 0.00014355127, 'direction': 0.00014841928}}


## Further examples

You can check out our dataset for further examples.

In [None]:
### get the dataset 
!pip install datasets
from datasets import load_dataset
dataset = load_dataset("DanL/scientific-challenges-and-directions-dataset")

In [None]:
### the first 10 sentences of the test split
# Reads the 'text' column of dataset['test'] and keeps its first 10 entries.
sentences = dataset['test']['text'][:10]

In [None]:
# Score the selected sentences and show one result dict per line.
results = inference(sentences, model=model)
print('\n'.join(str(entry) for entry in results))

{'sequence': 'Interestingly, when R40 was\nmutated to an aliphatic residue (Leu or Ala), the density of the D190\nnegative charge would be enhanced and the proteolytic activity might\nin theory be improved.', 'output': {'challenge': 0.00013575204, 'direction': 0.9991775}}
{'sequence': 'Extending the health sector preparedness to include other sectors/disciplines (eg,\nsocial services, critical infrastructure, regulatory authorities, public safety,\njustice) is critical to a seamless response.', 'output': {'challenge': 0.00021095444, 'direction': 0.000119132936}}
{'sequence': 'Both patients were treated with classical medical therapy including lactulose, but, despite increasing doses of lactulose for 3 days, ammonia levels remained unchanged.', 'output': {'challenge': 0.00012701421, 'direction': 0.000168087}}
{'sequence': 'Briefly, the bait construct pGBKT7-SΔC was first transformed into the yeast strain AH109 using lithium acetate method described in the Clontech manual, and 100 µg of 