### LLM-based data extraction from scientific papers using GPT-4o mini

In [17]:
from config import OPEN_AI_API_KEY
from openai import OpenAI
import pandas as pd
import json
import time

# OpenAI client used for the chat-completion requests; the API key is imported
# from the local `config` module rather than written in the notebook.
client = OpenAI(
    api_key=OPEN_AI_API_KEY,
)

# Dated model identifier passed to each request (presumably pinned for reproducibility).
model = "gpt-4o-mini-2024-07-18"

In [18]:
# To be set before running: identifier of the reference systematic review.
rs = 'Shahzad_Khan_2023'

# Read every study of the review, then keep only the rows whose `oa_status` is True
# (presumably the open-access papers -- confirm against the CSV producer).
all_studies = pd.read_csv(f'../csvs/{rs}.csv')
studies = all_studies[all_studies['oa_status'].eq(True)]
studies.head()

Unnamed: 0,doi,url,study,oa_status,first_author,title,abstract,published,journal,publisher,body,nb_tokens_openai_tiktoken,nb_tokens_mistral_sentencepiece
0,10.1080/00049670.2006.10721856,,Cossham and Fields 2006,True,Amanda Cossham,Keeping the roses watered: the continuing prof...,\nContinuing professional development is a nec...,2006-08,The Australian Library Journal,Informa UK Limited,"\nT he mAiNteNANce Of prOfessiONAl educAtiON, ...",6095,6777
1,-,https://journals.unizik.edu.ng/lrj/article/vie...,Anyaegbu and Wali 2020,True,Anyaegbu,INFLUENCE OF STAFF TRAINING AND DEVELOPMENT ON...,,2019,,,Introduction\nUniversity libraries are academi...,6221,6916
4,10.1515/9783598440168.3.157,,Chan and Auster 2005,True,Chan,Understanding Librarians Motivation to Partici...,,2005,,,Introduction\nIn twenty first century librarie...,5768,6392
6,-,https://digitalcommons.unl.edu/libphilprac/2042/,Dina and Olowosoke 2018,True,Dina,The Effect of Motivation and Job Performance o...,,2018,,,Introduction\nMotivation is a process that eli...,4030,4556
8,,https://digitalcommons.unl.edu/libphilprac/5220,Anyim 2021,True,Dr Wisdom O Anyim,Cross-training and Development of Librarians f...,\nCross-training exercise is very important in...,2011,Scholarly Journal of Education,,Introduction\nUniversity library is a very imp...,5496,6204


In [19]:
# Turn the filtered DataFrame into a list of dicts (one dict per row) for the
# extraction loop. The name `studies` is rebound from a DataFrame to a list, so
# the conversion is guarded: re-running this cell on the already-converted list
# would otherwise fail with "'list' object has no attribute 'to_dict'".
if isinstance(studies, pd.DataFrame):
    studies = studies.to_dict('records')

In [20]:
def _read_text(path):
    """Return the whole content of the text file at `path` as a single string."""
    with open(path) as handle:
        return handle.read()


# Description of the DoCO/DEO data elements (interpolated into `system_prompt_doco`).
data_elements_doco = _read_text('utils/data_elements_doco.txt')

# Data elements specific to the systematic review selected by `rs`
# (interpolated into `system_prompt`).
specific_data_elements = _read_text(f'utils/data_elements_{rs}.txt')

In [21]:
# Instruction prompt for the review-specific extraction: the content of
# `specific_data_elements` is interpolated into the text, and the model is asked
# to answer with a JSON object holding one field per data element ("NA" when the
# paper does not provide the information).
# NOTE: this is a regular (non-raw) f-string, so every literal "\n" below becomes
# a newline character in addition to the physical line break that follows it.
system_prompt = f"""
    - You are an expert in data extraction for literature reviews in the social sciences.\n
    - You extract data from the paper provided by the user, based on the data elements specified below:\n
    {specific_data_elements}.

    - The data extracted should be a few keywords only, no full sentences.\n
    - Return the annotated paper into a valid JSON object, with one field for each data element.\n

    If the information from a specific data element is not available in the paper, simply return NA for that element.\n
    Return the data as closely as they appear in the original paper, do not modify the text\n
    Do not include information outside the given paper. Do not make up an answer if the information is not available.
"""

In [22]:
# Instruction prompt for the generic extraction based on the Discourse Elements
# Ontology (DEO): the content of `data_elements_doco` is interpolated into the
# text, and the model is asked for a JSON object with one field per DEO element.
# NOTE(review): this prompt is not used by the cells visible in this part of the
# notebook -- presumably a later cell sends it; confirm.
# NOTE: regular (non-raw) f-string, so every literal "\n" below becomes a newline
# character in addition to the physical line break that follows it.
system_prompt_doco = f"""
    - You are an expert at extracting semantic information extraction from scientific papers.\n
    - You extract data from the paper provided by the user, based on the classes of the Discourse Elements Ontology (DEO) specified below:\n 
    {data_elements_doco}.

    - The data extracted should be a few keywords only, no full sentences.\n
    - Return the annotated paper into a valid JSON object with one field for each DEO element.\n

    If the information from a specific data class is not available in the paper, simply return NA for that element.\n
    Return the data as closely as they appear in the original paper, do not modify the text\n
    Do not include information outside the given paper. Do not make up an answer if the information is not available.
"""

In [23]:
# Review-specific extraction: send each study's body to the model with
# `system_prompt`, parse the JSON answer into one record per study, then save
# all records as a CSV file.
extraction_records = []  # one dict per study; turned into a DataFrame after the loop
for study in studies:
    user_prompt = f"Paper to annotate: {study['body']}"

    chat_response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "developer", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        # JSON mode: the answer is requested as a single JSON object.
        response_format={"type": "json_object"},
        temperature=0.3,  # same as mistral-small-2503
    )

    # The message content can be None or not valid JSON (e.g. a truncated
    # completion). Report it and keep the study in the output with empty fields
    # instead of aborting the loop and losing the answers already collected.
    raw_answer = chat_response.choices[0].message.content
    try:
        extracted = json.loads(raw_answer)
        if not isinstance(extracted, dict):
            raise ValueError("top-level JSON value is not an object")
    except (TypeError, ValueError) as err:
        print(f"[warning] {study['study']}: unusable model response ({err}); fields left empty")
        extracted = {}

    # 'study' identifies the row and stays the first column; a field of the same
    # name returned by the model must not overwrite it.
    study_data = {'study': study['study']}
    study_data.update(
        {field: value for field, value in extracted.items() if field != 'study'}
    )

    extraction_records.append(study_data)
    time.sleep(2)  # pause between requests (presumably to stay under API rate limits)

data_extraction = pd.DataFrame(extraction_records)
data_extraction.to_csv(f'../llm-based_extraction/gpt-4o-mini/specific_extraction_{rs}.csv', index=False)

In [24]:
# Display the extraction results (one row per study, one column per extracted field).
data_extraction

Unnamed: 0,study,Country,Motivational factors toward PD,Relation between Motivation and Librarians' PD,Challenges in implementation of PD activities
0,Cossham and Fields 2006,New Zealand,"[personal satisfaction, preparation for a desi...",Employer support (82.5% encouraged by employer...,"[Employer reluctance, tight budgets, lack of i..."
1,Anyaegbu and Wali 2020,Nigeria,"self-pride, self-respect, achievement, satisfa...",correlation between organizational success and...,"lack of training and development, non-particip..."
2,Chan and Auster 2005,Canada,"[professional competence, patron service, coll...",Motivation is a significant predictor of parti...,"[heavy workload, lack of interest, lack of lib..."
3,Dina and Olowosoke 2018,Nigeria,"[achievement, recognition, advancement, work e...","strong relationship between motivation, job sa...","[systematic motivation inadequacy, unsatisfact..."
4,Anyim 2021,,"[cross-training, development, employee recogni...",Cross-training improves motivation and promote...,"[distraction, employee's lack of enthusiasm, d..."
5,Chidiadi 2019,Nigeria,"training, motivation, teamwork, mentoring, com...","positive attitudes of library staff, correlati...","under-funding, inadequate stock, inadequate ma..."
6,"Madukoma, Akpa, and Okafor 2014",Nigeria,"[training, good salary packages, conducive wor...",Training as a precursor to motivation and job ...,"[lack of training, lack of motivation]"
7,Hamid and Younus 2021,Pakistan,"job enrichment, merit pay, flexible working ho...","positive correlation, significant effects on p...",lack of professional development opportunities...
8,Issa 2021,Nigeria,"[job satisfaction, financial reward, career de...",job satisfaction translates into better job pe...,"[low salaries, irregular promotional structure..."
9,Venturella and Breland 2019,,"more funding, administrative support, incentiv...","PD helps librarians keep pace with technology,...","scheduling conflicts, time constraints, limite..."
