### LLM-based data extraction from scientific papers using mistral-small

In [None]:
from config import MISTRALAI_API_KEY
from mistralai import Mistral
import requests
import pandas as pd
import json
import time

# Identifier of the Mistral model passed to every chat-completion call in this notebook.
model = "mistral-small-2503"
# API client authenticated with the key imported from the local `config` module.
client = Mistral(api_key=MISTRALAI_API_KEY)

In [None]:
# Sanity check: query the Mistral "list models" endpoint and print the metadata
# of the model configured in `model`, to confirm the API key can access it.
headers = {
    "Authorization": f"Bearer {MISTRALAI_API_KEY}"
}

# A timeout keeps the cell from hanging indefinitely if the API is unreachable.
response = requests.get(
    "https://api.mistral.ai/v1/models",
    headers=headers,
    timeout=30,
)

if response.status_code == 200:
    models = response.json()
    # The loop variable must NOT be called `model`: that would overwrite the
    # global model name with a metadata dict, and the later
    # `client.chat.complete(model=model, ...)` call would receive that dict.
    for model_info in models["data"]:
        if model in model_info["id"]:
            print(model_info)
else:
    print(f"Error: {response.status_code}")
    print(response.text)

{'id': 'mistral-small-2503', 'object': 'model', 'created': 1746061539, 'owned_by': 'mistralai', 'capabilities': {'completion_chat': True, 'completion_fim': False, 'function_calling': True, 'fine_tuning': False, 'vision': True, 'classification': False}, 'name': 'mistral-small-2503', 'description': 'Official mistral-small-2503 Mistral AI model', 'max_context_length': 131072, 'aliases': ['mistral-small-latest'], 'deprecation': None, 'default_model_temperature': 0.3, 'type': 'base'}


In [20]:
# Key of the review being processed; it selects the input CSV here and is
# reused for the data-element file and the output CSV further down.
rs = 'Shahzad_Khan_2023'

# Read the study table and keep only the rows whose `oa_status` is True
# (presumably the open-access flag -- confirm against the CSV producer).
studies = (
    pd.read_csv(f'../csvs/{rs}.csv')
    .loc[lambda frame: frame['oa_status'] == True]
)
studies.head()

Unnamed: 0,doi,url,study,oa_status,first_author,title,abstract,published,journal,publisher,body,nb_tokens_openai_tiktoken,nb_tokens_mistral_sentencepiece
0,10.1080/00049670.2006.10721856,,Cossham and Fields 2006,True,Amanda Cossham,Keeping the roses watered: the continuing prof...,\r\nContinuing professional development is a n...,2006-08,The Australian Library Journal,Informa UK Limited,\r\nT he mAiNteNANce Of prOfessiONAl educAtiON...,6095,6777
1,-,https://journals.unizik.edu.ng/lrj/article/vie...,Anyaegbu and Wali 2020,True,Anyaegbu,INFLUENCE OF STAFF TRAINING AND DEVELOPMENT ON...,,2019,,,Introduction\nUniversity libraries are academi...,6221,6916
4,10.1515/9783598440168.3.157,,Chan and Auster 2005,True,Chan,Understanding Librarians Motivation to Partici...,,2005,,,Introduction\nIn twenty first century librarie...,5768,6392
6,-,https://digitalcommons.unl.edu/libphilprac/2042/,Dina and Olowosoke 2018,True,Dina,The Effect of Motivation and Job Performance o...,,2018,,,Introduction\nMotivation is a process that eli...,4030,4556
8,,https://digitalcommons.unl.edu/libphilprac/5220,Anyaegbu and Wali 2020,True,Dr Wisdom O Anyim,Cross-training and Development of Librarians f...,\r\nCross-training exercise is very important ...,2011,Scholarly Journal of Education,,Introduction\r\nUniversity library is a very i...,5496,6204


In [21]:
# Turn the DataFrame into a list of per-row dicts so the extraction loop can
# read fields such as study['body'] directly.
# NOTE: this rebinds `studies` from a DataFrame to a list, so the cell cannot
# be re-run without first re-running the loading cell.
studies = studies.to_dict(orient='records')

In [22]:
# Text interpolated into `system_prompt_doco` (description of the DEO classes).
with open('utils/data_elements_doco.txt') as doco_file:
    data_elements_doco = doco_file.read()

# Text interpolated into `system_prompt` (data elements specific to review `rs`).
with open(f'utils/data_elements_{rs}.txt') as elements_file:
    specific_data_elements = elements_file.read()

In [23]:
# System prompt for the review-specific extraction: it embeds the text held in
# `specific_data_elements` and asks for a JSON object with one field per data
# element, using NA for anything the paper does not report.
# The string is triple-quoted AND contains explicit "\n" escapes, so each line
# ending in "\n" is followed by an extra blank line in the text sent to the model.
system_prompt = f"""
    - You are an expert in data extraction for literature reviews in the social sciences.\n
    - You extract data from the paper provided by the user, based on the data elements specified below:\n
    {specific_data_elements}.

    - The data extracted should be a few keywords only, no full sentences.\n
    - Return the annotated paper into a valid JSON object, with one field for each data element.\n

    If the information from a specific data element is not available in the paper, simply return NA for that element.\n
    Return the data as closely as they appear in the original paper.\n
    Do not include information outside the given paper. Do not make up an answer if the information is not available.
"""

In [24]:
# System prompt for the DEO-based extraction: it embeds the text held in
# `data_elements_doco` and asks for a JSON object with one field per DEO
# element, using NA for anything the paper does not report.
# As with `system_prompt`, the explicit "\n" escapes inside a triple-quoted
# string produce an extra blank line after each such line.
# NOTE(review): the first bullet reads "extracting semantic information
# extraction" (duplicated word) and the second bullet ends with a trailing
# space; both are part of the text sent to the model and are left untouched here.
system_prompt_doco = f"""
    - You are an expert at extracting semantic information extraction from scientific papers.\n
    - You extract data from the paper provided by the user, based on the classes of the Discourse Elements Ontology (DEO) specified below:\n 
    {data_elements_doco}.

    - The data extracted should be a few keywords or a very short summarized sentence, but no full sentences.\n
    - Return the annotated paper into a valid JSON object with one field for each DEO element.\n

    If the information from a specific data class is not available in the paper, simply return NA for that element.\n
    Return the data as closely as they appear in the original paper.\n
    Do not include information outside the given paper. Do not make up an answer if the information is not available.
"""

In [25]:
# Review-specific extraction: send each study body to the model with
# `system_prompt`, parse the JSON reply into one record per study, and write
# the combined table to CSV.
data_extraction = []
for study in studies:
    user_prompt = f"Paper to annotate: {study['body']}"

    chat_response = client.chat.complete(
        model = model,
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        # JSON mode, so the reply can be parsed with json.loads below.
        response_format = {"type": "json_object"},
    )

    # Parse defensively: one malformed reply must not abort the loop and throw
    # away the results already collected for the previous studies.
    try:
        extracted = json.loads(chat_response.choices[0].message.content)
    except (json.JSONDecodeError, TypeError) as err:
        print(f"Could not parse the response for {study['study']}: {err}")
        extracted = {}
    if not isinstance(extracted, dict):
        print(f"Response for {study['study']} is not a JSON object; fields left empty")
        extracted = {}

    # The study identifier always wins: a field named 'study' returned by the
    # model must not overwrite it. A failed study still gets a row (with empty
    # fields) so it stays visible in the output table.
    study_data = {'study': study['study']}
    study_data.update(
        {field: value for field, value in extracted.items() if field != 'study'}
    )

    data_extraction.append(study_data)
    # Pause between requests (2 s) to stay under the API rate limit.
    time.sleep(2)

data_extraction = pd.DataFrame(data_extraction)
data_extraction.to_csv(f'../llm-based_extraction/specific_extraction_{rs}.csv', index=False)

In [27]:
# Display the extraction table built by the loop above (one row per study).
data_extraction

Unnamed: 0,study,Country,Motivational factors toward PD,Relation between Motivation and Librarians' PD,Challenges in implementation of PD activities
0,Cossham and Fields 2006,New Zealand,"personal satisfaction, preparation for a desir...","personal satisfaction, preparation for a desir...","employer reluctance, budgets, sharing of limit..."
1,Anyaegbu and Wali 2020,Nigeria,,,
2,Chan and Auster 2005,Canada,"professional competence, patron service, colle...",significant predictor of participation in both...,"heavy workload, lack of interest, lack of libr..."
3,Dina and Olowosoke 2018,Nigeria,"reward, internship training, transformational ...",Motivation enhances library personnel effectiv...,
4,Anyaegbu and Wali 2020,,"[professional growth, job satisfaction, non-mo...","[cross-training, development, training, job pe...","[distraction, lack of enthusiasm, dissatisfact..."
5,Chidiadi 2019,Nigeria,,,
6,"Madukoma, Akpa, and Okafor 2014",Nigeria,"training, motivation, salary packages, conduci...","training motivates library employees, motivati...",
7,Hamid and Younus 2021,Pakistan,,,
8,Issa 2021,Nigeria,"feeling of being involved at work, good workin...","job motivation, job satisfaction, job performa...","low salaries, irregular promotional structure,..."
9,Venturella and Breland 2019,,"[upgrading knowledge, abilities, competencies,...",[PD helps librarians keep pace with technology...,"[scheduling conflicts, devote time to the acti..."
