In [None]:
import torch
from transformers import pipeline
import pandas as pd
from tqdm.notebook import tqdm
from torch.utils.data import Dataset


# Checkpoint name handed to the Hugging Face text-generation pipeline.
model_id = "NousResearch/Llama-3.2-1B"

# Weights are requested in bfloat16 with automatic device mapping.
pipe = pipeline(
    task="text-generation",
    model=model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# Read once from the tokenizer; the generation helpers pass it as both the
# end-of-sequence id and the padding id.
eos_token_id = pipe.tokenizer.eos_token_id

2024-11-09 13:47:44.477596: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-09 13:47:44.487263: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-09 13:47:44.498658: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-09 13:47:44.501996: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-09 13:47:44.510744: I tensorflow/core/platform/cpu_feature_guar

In [None]:
import ast
from tqdm import tqdm

# Labels a parsed model answer is allowed to resolve to.
defined_cats = [
    "text mining",
    "computer vision",
    "both",
    "other",
]

def generate(prompt):
    """Run the pipeline on a single prompt and return only the newly generated text.

    At most 10 new tokens are requested and the prompt is excluded from the
    returned string (``return_full_text=False``). The module-level
    ``eos_token_id`` is passed as both ``eos_token_id`` and ``pad_token_id``.

    Args:
        prompt: The prompt handed to the module-level ``pipe``.

    Returns:
        The ``'generated_text'`` field of the first result returned by ``pipe``.
    """
    generation_kwargs = {
        "max_new_tokens": 10,
        "return_full_text": False,
        "eos_token_id": eos_token_id,
        "pad_token_id": eos_token_id,
    }
    outputs = pipe(prompt, **generation_kwargs)
    first_candidate = outputs[0]
    return first_candidate['generated_text']

def _label_from_generated_text(text):
    """Map one generated string to a category label.

    The text is parsed as a Python literal. The original closer ``'"]'``
    (for a list cut off inside a string) is tried first so previously
    parseable outputs keep their label; then the text as-is and two other
    closers are tried so that complete answers such as ``["other"]`` are no
    longer rejected.

    Returns:
        ``"both"`` when both "text mining" and "computer vision" are present,
        otherwise the first element found in ``defined_cats``, ``"not known"``
        when the literal parsed but holds no known category, and
        ``"error processing"`` when no variant could be parsed.
    """
    # Leading whitespace sits outside any literal, so dropping it cannot
    # change the parsed value.
    text = text.lstrip()

    for closing in ('"]', '', ']', '"'):
        try:
            parsed = ast.literal_eval(text + closing)
        except (SyntaxError, ValueError, TypeError, MemoryError, RecursionError):
            continue
        break
    else:
        return "error processing"

    if isinstance(parsed, str):
        # A bare string is one answer, not a sequence of characters.
        candidates = [parsed]
    else:
        try:
            candidates = list(parsed)
        except TypeError:
            # Non-iterable literal (e.g. a number): parsed, but no category in it.
            return "not known"

    if "text mining" in candidates and "computer vision" in candidates:
        return "both"
    for candidate in candidates:
        if candidate in defined_cats:
            return candidate
    return "not known"


def generate_batch(prompts):
    """Generate a completion for each prompt and reduce it to a category label.

    Each prompt is sent through the module-level ``pipe`` with at most 10 new
    tokens, and the first generated text per prompt is parsed into a label.

    NOTE(review): no ``batch_size`` is forwarded to the pipeline, so the
    prompts may still be processed one at a time (the notebook output shows
    the "using the pipelines sequentially on GPU" warning) — confirm against
    the transformers pipeline documentation before relying on batching.

    Args:
        prompts: Prompts passed to ``pipe`` in a single call.

    Returns:
        A list with one label per pipeline result: a member of
        ``defined_cats``, ``"not known"``, or ``"error processing"``.
    """
    # Nothing to generate for an empty batch; avoid calling the pipeline at all.
    if isinstance(prompts, (list, tuple)) and not prompts:
        return []

    results = pipe(prompts,
                   max_new_tokens=10,
                   return_full_text=False,
                   eos_token_id=eos_token_id,
                   pad_token_id=eos_token_id)

    return [_label_from_generated_text(result[0]['generated_text'])
            for result in results]

def inference(prompt_template, new_data, batch_size=16):
    """Classify every row of ``new_data`` with one prompt template.

    For each row the template is filled with the row's ``Title`` and
    ``Abstract`` via ``str.format``; prompts are collected into groups of
    ``batch_size`` and each group is passed to ``generate_batch``.

    Args:
        prompt_template: Format string with ``{title}`` and ``{abstract}`` fields.
        new_data: DataFrame with ``Title`` and ``Abstract`` columns.
        batch_size: Number of prompts handed to ``generate_batch`` per call.

    Returns:
        A list of predicted labels, one per row, in row order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    from collections import Counter

    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Convert each column once instead of once per row, and blank out missing
    # values so they are not formatted into the prompt as the text "nan".
    titles = new_data["Title"].fillna("").tolist()
    abstracts = new_data["Abstract"].fillna("").tolist()

    predictions = []
    batch_prompts = []

    for title, abstract in tqdm(zip(titles, abstracts), total=len(titles)):
        batch_prompts.append(prompt_template.format(title=title, abstract=abstract))

        if len(batch_prompts) == batch_size:
            predictions.extend(generate_batch(batch_prompts))
            batch_prompts = []

    # Flush the final, possibly shorter, batch.
    if batch_prompts:
        predictions.extend(generate_batch(batch_prompts))

    # Show a compact label summary rather than dumping every prediction.
    print(dict(Counter(predictions)))
    return predictions

In [None]:
# The first CSV column loads as "Unnamed: 0" and looks like a previously saved
# row index, so read it as the index instead of keeping it as a data column.
data = pd.read_csv('task_1_inference.csv', index_col=0)
data.head(3)

Unnamed: 0,PMID,Title,Authors,Citation,First Author,Journal/Book,Publication Year,Create Date,PMCID,NIHMS ID,DOI,Abstract
0,39435445,Editorial: The operationalization of cognitive...,"Winter M, Probst T, Tallon M, Schobel J, Pryss R.",Front Neurosci. 2024 Oct 7;18:1501636. doi: 10...,Winter M,Front Neurosci,2024,2024/10/22,PMC11491427,,10.3389/fnins.2024.1501636,
1,39398866,Characterization of arteriosclerosis based on ...,"Zhou J, Li X, Demeke D, Dinh TA, Yang Y, Janow...",J Med Imaging (Bellingham). 2024 Sep;11(5):057...,Zhou J,J Med Imaging (Bellingham),2024,2024/10/14,PMC11466048,,10.1117/1.JMI.11.5.057501,PURPOSE: Our purpose is to develop a computer ...
2,39390053,Multi-scale input layers and dense decoder agg...,"Lan X, Jin W.",Sci Rep. 2024 Oct 10;14(1):23729. doi: 10.1038...,Lan X,Sci Rep,2024,2024/10/10,PMC11467340,,10.1038/s41598-024-74701-0,Accurate segmentation of COVID-19 lesions from...


In [None]:
# Work on an independent copy holding only the identifier and the two text
# fields used to build prompts.
new_data = data.loc[:, ['PMID', 'Title', 'Abstract']].copy()

In [None]:
# Prompt variant 1. `{title}` and `{abstract}` are filled per row by
# `inference` via `str.format`; the template ends with "Classification: "
# (note the trailing space).
prompt1 = """Classify the following research paper based on the type of method used. Choose from the categories: ["text mining", "computer vision", "both", "other"].

Title: {title}
Abstract: {abstract}
Classification: """

# One predicted label per row of `new_data`, collected in groups of 128 prompts.
prompt1_inference = inference(prompt1, new_data, batch_size=128)

  0%|          | 0/11450 [00:00<?, ?it/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [None]:
# Prompt variant 2: asks for the primary method and ends with
# "Your classification is:". `{title}` and `{abstract}` are filled per row by
# `inference` via `str.format`.
prompt2 = """Based on the title and abstract provided, determine the primary method used in this research paper. Classify it into one of the following categories: ["text mining", "computer vision", "both", "other"].

Title: {title}
Abstract: {abstract}
Your classification is:"""

# One predicted label per row of `new_data`, collected in groups of 128 prompts.
prompt2_inference = inference(prompt2, new_data, batch_size=128)

  0%|          | 0/11450 [00:00<?, ?it/s]

In [None]:
# Prompt variant 3: explicitly requests "a clear classification" and ends with
# "Classification: " (note the trailing space). `{title}` and `{abstract}` are
# filled per row by `inference` via `str.format`.
prompt3 = """Read the title and abstract of the following research paper and classify it according to the method used. The options are: ["text mining", "computer vision", "both", "other"]. Provide a clear classification.

Title: {title}
Abstract: {abstract}
Classification: """

# One predicted label per row of `new_data`, collected in groups of 128 prompts.
prompt3_inference = inference(prompt3, new_data, batch_size=128)

  0%|          | 0/11450 [00:00<?, ?it/s]

In [None]:
# Prompt variant 4: the only variant that includes a worked hint (NLP or
# sentiment analysis -> "text mining"). `{title}` and `{abstract}` are filled
# per row by `inference` via `str.format`.
prompt4 = """Analyze the title and abstract below and classify them based on the type of method employed in the research. Use one of these options: ["text mining", "computer vision", "both", "other"].
For example, if the paper discusses techniques like NLP or sentiment analysis, classify it as "text mining".

Title: {title}
Abstract: {abstract}
Classification:"""


# One predicted label per row of `new_data`, collected in groups of 128 prompts.
prompt4_inference = inference(prompt4, new_data, batch_size=128)

  0%|          | 0/11450 [00:00<?, ?it/s]

In [None]:
# Prompt variant 5: the shortest instruction, ending with "Result:".
# `{title}` and `{abstract}` are filled per row by `inference` via `str.format`.
prompt5 = """Classify the following title and abstract into one of these categories: ["text mining", "computer vision", "both", "other"].

Title: {title}
Abstract: {abstract}
Result:"""

# One predicted label per row of `new_data`, collected in groups of 128 prompts.
prompt5_inference = inference(prompt5, new_data, batch_size=128)

  0%|          | 0/11450 [00:00<?, ?it/s]

In [None]:
# Attach each prompt variant's predictions to the frame as its own column,
# in prompt order.
prediction_columns = {
    'prompt1_inference': prompt1_inference,
    'prompt2_inference': prompt2_inference,
    'prompt3_inference': prompt3_inference,
    'prompt4_inference': prompt4_inference,
    'prompt5_inference': prompt5_inference,
}
for column_name, column_values in prediction_columns.items():
    new_data[column_name] = column_values

In [None]:
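# NOTE: disabled step. It reduced each generated text to "yes" / "no" / "not known"
# and stored the result in `promptN_inference_label` columns. While it stays
# commented out, those `*_label` columns are not added to `new_data`.
# def find_label(text):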
#     if "yes" in text.lower():
#         return "yes"
#     elif "no" in text.lower():
#         return "no"
#     else:
#         return "not known"

# new_data['prompt1_inference_label'] = [find_label(inf) for inf in prompt1_inference]
# new_data['prompt2_inference_label'] = [find_label(inf) for inf in prompt2_inference]
# new_data['prompt3_inference_label'] = [find_label(inf) for inf in prompt3_inference]
# new_data['prompt4_inference_label'] = [find_label(inf) for inf in prompt4_inference]
# new_data['prompt5_inference_label'] = [find_label(inf) for inf in prompt5_inference]

In [None]:
# Persist the inputs together with all prediction columns; the row index is
# not written.
new_data.to_csv(path_or_buf="task_2_inference.csv", index=False)

In [None]:
# `prompt1_inference_label` is never created (the cell that built it is
# commented out), so count the labels stored in `prompt1_inference` instead.
new_data['prompt1_inference'].value_counts()

prompt1_inference_label
not known    9571
yes          1194
no            685
Name: count, dtype: int64

In [None]:
# `prompt2_inference_label` is never created (the cell that built it is
# commented out), so count the labels stored in `prompt2_inference` instead.
new_data['prompt2_inference'].value_counts()

prompt2_inference_label
yes    11403
no        47
Name: count, dtype: int64

In [None]:
# `prompt3_inference_label` is never created (the cell that built it is
# commented out), so count the labels stored in `prompt3_inference` instead.
new_data['prompt3_inference'].value_counts()

prompt3_inference_label
yes          11364
not known       44
no              42
Name: count, dtype: int64

In [None]:
# `prompt4_inference_label` is never created (the cell that built it is
# commented out), so count the labels stored in `prompt4_inference` instead.
new_data['prompt4_inference'].value_counts()

prompt4_inference_label
yes          11349
not known       98
no               3
Name: count, dtype: int64

In [None]:
# `prompt5_inference_label` is never created (the cell that built it is
# commented out), so count the labels stored in `prompt5_inference` instead.
new_data['prompt5_inference'].value_counts()

prompt5_inference_label
yes          11440
no               6
not known        4
Name: count, dtype: int64