In [29]:
import torch
from transformers import pipeline
import pandas as pd
from tqdm.notebook import tqdm
from torch.utils.data import Dataset


model_id = "NousResearch/Llama-3.2-1B"
pipe = pipeline("text-generation",
                model=model_id,
                torch_dtype=torch.bfloat16,
                device_map="auto")

eos_token_id = pipe.tokenizer.eos_token_id

In [30]:
def generate(prompt):
    result = pipe(prompt,
                  max_new_tokens=10,
                  return_full_text=False,
                  eos_token_id=eos_token_id,
                  pad_token_id=eos_token_id)
    print(result[0]['generated_text'])
    return result[0]['generated_text']

def generate_batch(prompts):

    results = pipe(prompts,
                   max_new_tokens=10,
                   return_full_text=False,
                   eos_token_id=eos_token_id,
                   pad_token_id=eos_token_id)

    generated_texts = [result[0]['generated_text'] for result in results]
    return generated_texts


def inference(prompt_template, new_data, batch_size=16):
    predictions = []
    batch_prompts = []

    for i in tqdm(range(new_data.shape[0])):
        title = new_data["Title"].tolist()[i]
        abstract = new_data["Abstract"].tolist()[i]
        prompt = prompt_template.format(title=title, abstract=abstract)

        batch_prompts.append(prompt)

        if len(batch_prompts) == batch_size:
            batch_predictions = generate_batch(batch_prompts)
            predictions.extend(batch_predictions)
            batch_prompts = []

    if batch_prompts:
        batch_predictions = generate_batch(batch_prompts)
        predictions.extend(batch_predictions)
    return predictions

In [31]:
data = pd.read_csv('collection_with_abstracts.csv')
data.head(3)

Unnamed: 0,PMID,Title,Authors,Citation,First Author,Journal/Book,Publication Year,Create Date,PMCID,NIHMS ID,DOI,Abstract
0,39435445,Editorial: The operationalization of cognitive...,"Winter M, Probst T, Tallon M, Schobel J, Pryss R.",Front Neurosci. 2024 Oct 7;18:1501636. doi: 10...,Winter M,Front Neurosci,2024,2024/10/22,PMC11491427,,10.3389/fnins.2024.1501636,
1,39398866,Characterization of arteriosclerosis based on ...,"Zhou J, Li X, Demeke D, Dinh TA, Yang Y, Janow...",J Med Imaging (Bellingham). 2024 Sep;11(5):057...,Zhou J,J Med Imaging (Bellingham),2024,2024/10/14,PMC11466048,,10.1117/1.JMI.11.5.057501,PURPOSE: Our purpose is to develop a computer ...
2,39390053,Multi-scale input layers and dense decoder agg...,"Lan X, Jin W.",Sci Rep. 2024 Oct 10;14(1):23729. doi: 10.1038...,Lan X,Sci Rep,2024,2024/10/10,PMC11467340,,10.1038/s41598-024-74701-0,Accurate segmentation of COVID-19 lesions from...


In [32]:
new_data = data[['PMID', 'Title', 'Abstract']].copy()

In [33]:
prompt1 = """Your task is to extract the name of the model or method used in the following research paper. Provide the model name in quotes.

Title: {title}
Abstract: {abstract}"""

prompt1_inference = inference(prompt1, new_data, batch_size=128)

  0%|          | 0/11450 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 4.80 GiB. GPU 0 has a total capacity of 14.75 GiB of which 483.06 MiB is free. Process 3657 has 14.27 GiB memory in use. Of the allocated memory 13.60 GiB is allocated by PyTorch, and 566.91 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
prompt2 = """Extract the name of the model or method from the provided title and abstract. If no model or method is mentioned, return "NA".

Title: {title}
Abstract: {abstract}
Response: ["model_name"]"""

prompt2_inference = inference(prompt2, new_data, batch_size=128)

In [None]:
prompt3 = """Analyze the following title and abstract of a research paper to identify and extract the specific model or method being discussed. If you find multiple names, list them all in quotes, separated by commas. If none is found, respond with "NA".

Title: {title}
Abstract: {abstract}"""

prompt3_inference = inference(prompt3, new_data, batch_size=128)

In [None]:
prompt4 = """Your task is to extract the name of the model or method from research papers. For example, if the abstract mentions "BERT" or "ResNet", those are valid responses.

Title: {title}
Abstract: {abstract}
Response: """


prompt4_inference = inference(prompt4, new_data, batch_size=128)

In [None]:
prompt5 = """Please read the following title and abstract carefully and extract any names of models or methods mentioned within them. If there are none, simply state "No model found".

Title: {title}
Abstract: {abstract}"""

prompt5_inference = inference(prompt5, new_data, batch_size=128)

In [None]:
prompt6= """You are given the title and abstract of a research article. Your task is to identify and list the names of all methods used in the study. Please provide only the names in a bullet-point format.

**Example 1:**
Title: "A Novel Approach to Data Extraction"
Abstract: "We propose the ChatExtract method that automates data extraction using advanced conversational LLMs."
Methods:
- ChatExtract

**Example 2:**
Title: "Structured Information Extraction from Scientific Text"
Abstract: "This study employs the LLM-NERRE method for extracting structured data from scientific documents."
Methods:
- LLM-NERRE

Now, for the following article:
Title: {title}
Abstract: {abstract}
Methods:"""


prompt6_inference = inference(prompt6, new_data, batch_size=128)


In [None]:
prompt7 = """You are an AI assistant tasked with extracting method names from academic articles. Your job is to read the provided title and abstract, then list all the methods used in the research. Please format your response as a bullet-point list containing only the names of the methods.

**Instructions:**
1. Read the title and abstract carefully.
2. Identify all distinct methods mentioned.
3. Present the methods in a clean bullet-point format, without any additional commentary or explanation.

**Example 1:**
Title: "Enhancing Image Recognition with Deep Learning Techniques"
Abstract: "This paper introduces a new convolutional neural network (CNN) architecture, named DeepVision, which outperforms existing models in image classification tasks."
Methods:
- DeepVision

**Example 2:**
Title: "A Comprehensive Survey on Natural Language Processing"
Abstract: "We utilize a hybrid approach combining rule-based techniques and machine learning algorithms to improve text analysis."
Methods:
- Rule-based techniques
- Machine learning algorithms

**Example 3:**
Title: "Optimizing Supply Chain Management with AI"
Abstract: "Our research applies reinforcement learning (RL) strategies to optimize inventory management and reduce costs."
Methods:
- Reinforcement learning (RL)

Now, please analyze the following article:
Title: {title}
Abstract: {abstract}
Methods:"""


prompt7_inference = inference(prompt7, new_data, batch_size=128)


In [None]:
prompt8 = """You are an AI assistant designed to extract method names from academic articles. Your task is to read the title and abstract provided below and list all the methods used in the research. Please format your response as a bullet-point list containing only the names of the methods.

**Example:**
Title: "Advancements in Machine Learning for Predictive Analytics"
Abstract: "This study introduces a novel algorithm called PredictiveNet, which integrates ensemble learning techniques and deep neural networks to enhance predictive accuracy."
Methods:
- PredictiveNet
- Ensemble learning techniques
- Deep neural networks

Now, please analyze the following article and return the name of the methods without any extera information. if you can not figure out the method return "NAN":
Title: {title}
Abstract: {abstract}
Methods:"""

prompt8_inference = inference(prompt8, new_data, batch_size=128)


In [None]:
new_data['prompt1_inference'] = prompt1_inference
new_data['prompt2_inference'] = prompt2_inference
new_data['prompt3_inference'] = prompt3_inference
new_data['prompt4_inference'] = prompt4_inference
new_data['prompt5_inference'] = prompt5_inference
new_data['prompt6_inference'] = prompt6_inference
new_data['prompt7_inference'] = prompt7_inference
new_data['prompt8_inference'] = prompt8_inference


In [None]:
def find_label(text):
    if "yes" in text.lower():
        return "yes"
    elif "no" in text.lower():
        return "no"
    else:
        return "not known"

new_data['prompt1_inference_label'] = [find_label(inf) for inf in prompt1_inference]
new_data['prompt2_inference_label'] = [find_label(inf) for inf in prompt2_inference]
new_data['prompt3_inference_label'] = [find_label(inf) for inf in prompt3_inference]
new_data['prompt4_inference_label'] = [find_label(inf) for inf in prompt4_inference]
new_data['prompt5_inference_label'] = [find_label(inf) for inf in prompt5_inference]
new_data['prompt6_inference_label'] = [find_label(inf) for inf in prompt6_inference]
new_data['prompt7_inference_label'] = [find_label(inf) for inf in prompt7_inference]
new_data['prompt8_inference_label'] = [find_label(inf) for inf in prompt8_inference]


In [None]:
new_data.to_csv("task_1_inference.csv", index=False)

In [None]:
new_data['prompt1_inference_label'].value_counts()

prompt1_inference_label
not known    9571
yes          1194
no            685
Name: count, dtype: int64

In [None]:
new_data['prompt2_inference_label'].value_counts()

prompt2_inference_label
yes    11403
no        47
Name: count, dtype: int64

In [None]:
new_data['prompt3_inference_label'].value_counts()

prompt3_inference_label
yes          11364
not known       44
no              42
Name: count, dtype: int64

In [None]:
new_data['prompt4_inference_label'].value_counts()

prompt4_inference_label
yes          11349
not known       98
no               3
Name: count, dtype: int64

In [None]:
new_data['prompt5_inference_label'].value_counts()

prompt5_inference_label
yes          11440
no               6
not known        4
Name: count, dtype: int64