### LLM-based data extraction from scientific papers using GPT-4o-mini

In [None]:
from config import OPEN_AI_API_KEY
from openai import OpenAI
import pandas as pd
import json
import time

# Dated model identifier passed as `model=` to the chat completion requests below.
model = 'gpt-4o-mini-2024-07-18'
# OpenAI client built with the API key imported from the local `config` module.
client = OpenAI(api_key=OPEN_AI_API_KEY)

In [None]:
# Identifier of the reference systematic review. It must be set before running:
# it is interpolated into the input CSV path here and into other paths further down.
rs = ''
studies = pd.read_csv(f'../csvs/{rs}.csv')
# Keep only the rows whose `oa_status` column equals True.
oa_mask = studies['oa_status'].eq(True)
studies = studies.loc[oa_mask]
studies.head()

In [None]:
# Rebind `studies` from a DataFrame to a list of row dicts (one dict per study).
studies = studies.to_dict(orient='records')

In [None]:
# Both files are read as UTF-8 explicitly so the prompt text does not depend on
# the platform's default locale encoding.
# NOTE(review): assumes the files are stored as UTF-8 — confirm.

# Text interpolated into `system_prompt_doco`; this path does not depend on `rs`.
with open('utils/data_elements_doco.txt', encoding='utf-8') as f:
    data_elements_doco = f.read()

# Text interpolated into `system_prompt`; the file is selected by `rs`.
with open(f'utils/data_elements_{rs}.txt', encoding='utf-8') as f:
    specific_data_elements = f.read()

In [None]:
# Instruction prompt built around the review-specific data elements held in
# `specific_data_elements`. The f-string is evaluated right here, so that
# variable must already be defined when this cell runs.
# NOTE: each explicit "\n" escape adds a newline on top of the literal line
# break that follows it, and the indentation inside the triple quotes is part
# of the prompt text.
# NOTE(review): the extraction loop in this notebook sends `system_prompt_doco`,
# not this prompt — confirm whether `system_prompt` is used elsewhere.
system_prompt = f"""
    - You are an expert in data extraction for literature reviews in the social sciences.\n
    - You extract data from the paper provided by the user, based on the data elements specified below:\n
    {specific_data_elements}.

    - The data extracted should be a few keywords only, no full sentences.\n
    - Return the annotated paper into a valid JSON object, with one field for each data element.\n

    If the information from a specific data element is not available in the paper, simply return NA for that element.\n
    Return the data as closely as they appear in the original paper.\n
    Do not include information outside the given paper. Do not make up an answer if the information is not available.
"""

In [None]:
# Instruction prompt built around the Discourse Elements Ontology (DEO) classes
# held in `data_elements_doco`. The f-string is evaluated right here, so that
# variable must already be defined when this cell runs. This is the prompt the
# extraction loop sends with every request.
# NOTE: each explicit "\n" escape adds a newline on top of the literal line
# break that follows it, and the indentation inside the triple quotes is part
# of the prompt text.
# NOTE(review): "extracting semantic information extraction" reads like a
# typo; it is left untouched because it is part of the text sent to the model.
system_prompt_doco = f"""
    - You are an expert at extracting semantic information extraction from scientific papers.\n
    - You extract data from the paper provided by the user, based on the classes of the Discourse Elements Ontology (DEO) specified below:\n 
    {data_elements_doco}.

    - The data extracted should be a few keywords or a very short summarized sentence, but no full sentences.\n
    - Return the annotated paper into a valid JSON object with one field for each DEO element.\n

    If the information from a specific data class is not available in the paper, simply return NA for that element.\n
    Return the data as closely as they appear in the original paper.\n
    Do not include information outside the given paper. Do not make up an answer if the information is not available.
"""

In [None]:
# Annotate every study with the DEO prompt, collect one row per study, then
# write all rows to a CSV named after the review identifier `rs`.
data_extraction = []
for study in studies:
    study_id = study['study']
    user_prompt = f"Paper to annotate: {study['body']}"

    chat_response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "developer", "content": system_prompt_doco},
            {"role": "user", "content": user_prompt},
        ],
        # JSON mode: request a single JSON object as the message content.
        response_format={"type": "json_object"},
        temperature=0.3,  # same as mistral-small-2503's default temperature
    )

    raw_content = chat_response.choices[0].message.content
    try:
        extracted = json.loads(raw_content)
        if not isinstance(extracted, dict):
            raise TypeError(
                f"expected a JSON object, got {type(extracted).__name__}"
            )
    except (TypeError, ValueError) as err:
        # A missing (None) or truncated reply must not abort the run before
        # the CSV is written; keep a row for the study and flag the failure.
        print(f"Could not parse the extraction for study {study_id!r}: {err}")
        extracted = {'extraction_error': str(err)}

    study_data = {'study': study_id}
    study_data.update(extracted)
    # The real identifier always wins over a model-generated "study" field.
    study_data['study'] = study_id

    data_extraction.append(study_data)
    time.sleep(2)  # fixed 2-second pause between consecutive requests

data_extraction = pd.DataFrame(data_extraction)
data_extraction.to_csv(f'../llm-based_extraction/gpt-4o-mini/doco_extraction_{rs}.csv', index=False)

In [None]:
# Bare expression: shows the extraction DataFrame as the notebook cell output.
data_extraction