In [1]:
from openai import OpenAI

client = OpenAI(
    base_url="https://api.groq.com/openai/v1",
    api_key='<INSERT API KEY>'
)

In [11]:
def llm(prompt):
    response = client.chat.completions.create(
        model = 'gemma2-9b-it',
        messages = [{'role': 'user', 'content': prompt}]
    )

    return response.choices[0].message.content

In [161]:
def prompt_builder(paper):
    prompt_template = """
        You are a researcher looking for archived papers supporting your current paper's claim. 
        Create 5 questions a researcher may ask a search engine to retrieve that paper from the database. 
        The archived paper must be the answer to the 5 questions that are generated.
        The questions must be complete and not too short.
        And use as few words from the archived paper as possible.

        This is the information from the archived paper:

        Title: {title}
        Abstract: {abstract}

        Format the questions in the following json-parsable text without using code blocks or ANY latex commands:

        ["question 1", "question 2", ... "question 5"]
    """.strip()

    prompt = prompt_template.format(title = paper['inputs'], abstract = paper['inputs']['abstract'])

    return prompt

In [5]:
from datasets import load_dataset

ds = load_dataset("rubrix/research_papers_multi-label", split='train', streaming=True)

ds_head = ds.take(1000)

papers=list(ds_head)

papers

[{'text': None,
  'inputs': {'abstract': "  Predictive models allow subject-specific inference when analyzing disease\nrelated alterations in neuroimaging data. Given a subject's data, inference can\nbe made at two levels: global, i.e. identifiying condition presence for the\nsubject, and local, i.e. detecting condition effect on each individual\nmeasurement extracted from the subject's data. While global inference is widely\nused, local inference, which can be used to form subject-specific effect maps,\nis rarely used because existing models often yield noisy detections composed of\ndispersed isolated islands. In this article, we propose a reconstruction\nmethod, named RSM, to improve subject-specific detections of predictive\nmodeling approaches and in particular, binary classifiers. RSM specifically\naims to reduce noise due to sampling error associated with using a finite\nsample of examples to train classifiers. The proposed method is a wrapper-type\nalgorithm that can be used wit

In [25]:
def gen_questions(paper):
    prompt = prompt_builder(paper)
    return llm(prompt)

In [7]:
from tqdm.auto import tqdm

In [118]:
gen_questions(papers[15])

'["What is the rank of the Waring decomposition of  sM_{<3>}?", "Describe the symmetry group of sM_{<3>}.", "What is the exponent of matrix multiplication related to?", "What tensor property determines the exponent of matrix multiplication?", "How many symmetries does the Waring decomposition of sM_{<3>} have?"]  \n'

In [120]:
ground_truth_data = []

In [121]:
for paper in tqdm(papers):
    prompt = prompt_builder(paper)
    results[paper['pid']] = llm(prompt)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [122]:
results

{'00ecc542': '["How can predictive models be used to analyze neuroimaging data at a subject-specific level?",\n"What are the limitations of local inference in subject-specific analysis of neuroimaging data?",\n"What is the name of the proposed method for improving subject-specific detections in predictive modeling?",\n"How does the proposed method aim to reduce noise in subject-specific detections?",\n"What datasets were used to evaluate the performance of the proposed method?"] \n',
 '692d6bcc': '["What convolutional neural network architecture  achieves rotation invariance in 2-D symbol recognition?", "How can a neural network using a cyclic convolutional layer be used for object detection?", "Are there any advantages to using a cyclic convolutional layer in a CNN for image recognition?", "Can using rotation and translation invariant features in a CNN lead to one-shot learning?", "What is the name of the new CNN architecture proposed in a paper focusing on rotation invariance?"] \n',

In [62]:
ground_truth_data

['["What methods exist for analyzing disease-related changes in neuroimaging data at a subject-specific level?", "Why is local inference, despite its potential benefits, infrequently utilized in neuroimaging analysis?", "How does the proposed method, RSM, aim to improve the accuracy of subject-specific detections in predictive modeling?", "What type of data was used to evaluate the performance of RSM, and what were the findings?", "Besides accuracy, what other benefits does RSM offer in the context of subject-specific neuroimaging analysis?"] \n',
 '["What are the benefits of rotation invariance and translation invariance in image recognition?", "How does the cyclic convolutional layer contribute to rotation invariance in 2-D symbol recognition?", "Can this new architecture be used for detecting the position and orientation of multiple non-overlapping targets?", "What type of learning is achieved in some cases using the proposed architecture?", "In what type of tasks does this architec

In [69]:
import hashlib

In [70]:
for paper in tqdm(papers):
    paper['pid'] = hashlib.md5((paper['inputs']['abstract'] + paper['inputs']['title']).encode('utf-8')).hexdigest()[:8]

  0%|          | 0/1000 [00:00<?, ?it/s]

In [518]:
papers[0]

{'text': None,
 'inputs': {'abstract': "  Predictive models allow subject-specific inference when analyzing disease\nrelated alterations in neuroimaging data. Given a subject's data, inference can\nbe made at two levels: global, i.e. identifiying condition presence for the\nsubject, and local, i.e. detecting condition effect on each individual\nmeasurement extracted from the subject's data. While global inference is widely\nused, local inference, which can be used to form subject-specific effect maps,\nis rarely used because existing models often yield noisy detections composed of\ndispersed isolated islands. In this article, we propose a reconstruction\nmethod, named RSM, to improve subject-specific detections of predictive\nmodeling approaches and in particular, binary classifiers. RSM specifically\naims to reduce noise due to sampling error associated with using a finite\nsample of examples to train classifiers. The proposed method is a wrapper-type\nalgorithm that can be used with 

In [519]:
df_papers= pd.DataFrame(papers)

In [522]:
df_papers.to_csv('papers-with-ids.csv', index=False)

In [59]:
import pandas as pd

In [60]:
df_ground_truth = pd.DataFrame(ground_truth_data)

In [42]:
import json

In [51]:
ground_truth_data[0] = '["What methods are currently used for analyzing disease-related alterations in neuroimaging data?","How can subject-specific inferences be made from neuroimaging data, and what are the limitations of one of these methods?","What problem does the proposed method, RSM, aim to address in the context of predictive modeling?","What type of algorithm is RSM, and how does it integrate with existing binary classifiers?","Besides accuracy, what other benefits does RSM provide in terms of analyzing neuroimaging data, according to the authors?"]'

In [52]:
json.loads(ground_truth_data[0])

['What methods are currently used for analyzing disease-related alterations in neuroimaging data?',
 'How can subject-specific inferences be made from neuroimaging data, and what are the limitations of one of these methods?',
 'What problem does the proposed method, RSM, aim to address in the context of predictive modeling?',
 'What type of algorithm is RSM, and how does it integrate with existing binary classifiers?',
 'Besides accuracy, what other benefits does RSM provide in terms of analyzing neuroimaging data, according to the authors?']

In [61]:
df_ground_truth.to_csv('ground_truth.csv')

In [72]:
results = {}

In [361]:
df_res = pd.DataFrame(results)

ValueError: If using all scalar values, you must pass an index

In [135]:
parsed_res = {}

In [503]:
for pid, queries in tqdm(results.items()):
    parsed_res[pid] = json.loads(queries)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [510]:
df_parsed_res = pd.DataFrame.from_dict(parsed_res, orient='index')

In [511]:
df_parsed_res

Unnamed: 0,0,1,2,3,4
00ecc542,How can predictive models be used to analyze n...,What are the limitations of local inference in...,What is the name of the proposed method for im...,How does the proposed method aim to reduce noi...,What datasets were used to evaluate the perfor...
692d6bcc,What convolutional neural network architecture...,How can a neural network using a cyclic convol...,Are there any advantages to using a cyclic con...,Can using rotation and translation invariant f...,What is the name of the new CNN architecture p...
e2508c43,What are spherical polyharmonics and how do th...,How can zonal polyharmonics be used to constru...,What type of functions do these Poisson kernel...,What mathematical functions are used to repres...,What is the relationship between Poisson kerne...
1c7d1c5a,What numerical methods are used to approximate...,How does stochasticity impact the Landau-Lif...,What are domain walls and vortices in the cont...,Can you explain the existence of weak marting...,What are the key challenges in solving the sto...
6ffba902,What is the effect of preprocessing and featur...,How do Wavelet Tensor Train (WTT) and Discrete...,What are the differences in performance betwee...,Which signal processing methods resulted in th...,What are the advantages of using WTT over DWT ...
...,...,...,...,...,...
0b9112c0,How does the composition of the gas phase abov...,What is the relationship between total pressur...,Can changes in the gas phase composition influ...,What type of analysis was used to investigate ...,Under what conditions does transport in the ga...
eca312a3,What graph-based method was created for verify...,Can you find a paper that uses a graph model t...,How can a sliding window protocol be verified ...,Is there research on verifying distributed alg...,What methods exist for formally verifying the ...
e6186000,What Bayesian model can be used to find the ra...,How can regularization be used to determine th...,Describe an algorithm for solving the sparse a...,What is the relationship between the sparse st...,What type of data was used to demonstrate the ...
cbb4e504,What are thermostat flows on the unit tangent ...,Which types of geodesic flows are included in ...,How can thermostat flows be parameterized?,What properties are investigated in relation t...,Under what conditions do Anosov thermostat flo...


In [513]:
final_results = []

for pid, questions in parsed_res.items():
    for q in questions:
        final_results.append((q, pid))

In [515]:
df_final_res = pd.DataFrame(final_results, columns=['query', 'pid'])

In [516]:
df_final_res.to_csv('ground-truth-data.csv', index=False)

In [92]:
queries = '["What additional short-term epidemic classification, besides \"no epidemic\" and \"normal epidemic\", is identified in this paper? ", "How does natural-boosting immunity influence the initial phase of an epidemic\'s curve? ", "What type of mathematical model was used to study short-term transmission dynamics? ", "What is the impact of time varying host susceptibility on the transmission of diseases in the short-term? ", "What specific factor related to immunity is considered in the model, and how does it play a role? "]'

In [501]:
queries

'["How can human input be used to improve a robot\'s understanding of a partially observed environment?, What are the challenges of representing a robot\'s knowledge when it receives probabilistic information from humans?, How can a belief state representation be made more efficient for robots operating in open domains?,  What methods exist for dynamically adapting the representation of a robot\'s belief state based on correlations between pieces of information?,  In what types of real-world tasks could a dynamic belief state representation that incorporates human input be beneficial?",] \n\n\n'

In [139]:
pid

'1c7d1c5a'

In [502]:
results[pid] = '["How can human input be used to improve a robot\'s understanding of a partially observed environment?, What are the challenges of representing a robot\'s knowledge when it receives probabilistic information from humans?, How can a belief state representation be made more efficient for robots operating in open domains?,  What methods exist for dynamically adapting the representation of a robot\'s belief state based on correlations between pieces of information?,  In what types of real-world tasks could a dynamic belief state representation that incorporates human input be beneficial?"]'

In [None]:
from tqdm.auto import tqdm
from concurrent.futures import ThreadPoolExecutor
import time

pool = ThreadPoolExecutor(max_workers=6)

def map_progress(pool, seq, f):
    results = []

    with tqdm(total=len(seq)) as progress:
        futures = []

        for i, el in enumerate(seq):
            future = pool.submit(f, el)
            future.add_done_callback(lambda p: progress.update())
            futures.append(future)

        for future in futures:
            result = future.result()
            results.append(result)

    return results