### LLM-based data extraction from scientific papers using mistral-small

In [18]:
from config import MISTRALAI_API_KEY
from mistralai import Mistral
import requests
import pandas as pd
import json
import time

# Identifier of the Mistral model this notebook targets.
model = 'mistral-small-2503'

# SDK client authenticated with the API key imported from the local config module.
client = Mistral(
    api_key=MISTRALAI_API_KEY,
)

In [19]:
# Query the Mistral REST API directly to confirm that the target model is
# listed and to print the metadata the API reports for it.
headers = {
    "Authorization": f"Bearer {MISTRALAI_API_KEY}"
}

# Explicit timeout: without one, requests waits indefinitely and an
# unresponsive endpoint would hang the cell.
response = requests.get(
    "https://api.mistral.ai/v1/models",
    headers=headers,
    timeout=30,
)

if response.status_code == 200:
    models = response.json()
    # The loop variable is deliberately NOT called `model`: that name holds the
    # model-id string assigned in the setup cell, and reusing it here would
    # leave it bound to a dict after this cell runs.
    for model_info in models["data"]:
        if "mistral-small-2503" in model_info["id"]:
            print(model_info)
else:
    # Surface the HTTP status and body so authentication or quota problems are visible.
    print(f"Error: {response.status_code}")
    print(response.text)

{'id': 'mistral-small-2503', 'object': 'model', 'created': 1746076564, 'owned_by': 'mistralai', 'capabilities': {'completion_chat': True, 'completion_fim': False, 'function_calling': True, 'fine_tuning': False, 'vision': True, 'classification': False}, 'name': 'mistral-small-2503', 'description': 'Official mistral-small-2503 Mistral AI model', 'max_context_length': 131072, 'aliases': ['mistral-small-latest'], 'deprecation': None, 'default_model_temperature': 0.3, 'type': 'base'}


In [None]:
# Review identifier; it names the input CSV here and is reused for file names
# in later cells.
rs = 'Safdar_Siddique_Khan_2024'

# Load the studies of this review and keep only the rows whose `oa_status`
# is True (presumably the open-access flag - confirm against the CSV producer).
studies = (
    pd.read_csv(f'../csvs/{rs}.csv')
    .loc[lambda frame: frame['oa_status'] == True]
)
studies.head()

Unnamed: 0,doi,url,study,oa_status,first_author,title,abstract,published,journal,publisher,body,nb_tokens_openai_tiktoken,nb_tokens_mistral_sentencepiece
2,10.1145/3234942,,Andalibi et al 2018,True,Nazanin Andalibi,"Social Support, Reciprocity, and Anonymity in ...",\nSeeking and providing support is challenging...,2018-10,ACM Transactions on Computer-Human Interaction,Association for Computing Machinery (ACM),INTRODUCTION\nSocial media platforms are often...,5956,6882
6,10.1177/2056305120974610,,Deal et al 2020,True,Bonnie-Elene Deal,â€œI Definitely Did Not Report It When I Was R...,"\nThe ""Me Too"" movement, founded by activist T...",2020-10,Social Media + Society,SAGE Publications,"\nThe ""Me Too"" movement was founded in 2006 by...",6989,8163
14,10.1177/2056305120984447,,Madden and Alt 2021,True,Stephanie Madden,Know Her Name: Open Dialogue on Social Media a...,\nInnovative justice focuses on crime victim n...,2021-01,Social Media + Society,SAGE Publications,"\nIn March 2016, Brock Turner was convicted of...",6754,7629
16,10.1177/1461444818820069,,Mendes et al 2019,True,Kaitlynn Mendes,Digitized narratives of sexual violence: Makin...,"\nIn this article, we argue that social media ...",2018-12-29,New Media & Society,SAGE Publications,"Introduction\nIn 2012, 16 students enrolled in...",6901,7986
17,10.2196/13837,,Modrek and Chakalov 2019,True,PhD Sepideh Modrek,The #MeToo Movement in the United States: Text...,\nBackground:The #MeToo movement sparked an in...,2019-09-03,Journal of Medical Internet Research,JMIR Publications Inc.,Introduction\nPublic discourse on sensitive to...,6364,7166


In [21]:
# Turn the filtered frame into a list of per-study dicts (one dict per row)
# for the extraction loop.
# Guarded so that re-running this cell is a no-op: after the first run
# `studies` is already a list, and calling .to_dict on it would raise
# AttributeError.
if isinstance(studies, pd.DataFrame):
    studies = studies.to_dict('records')

In [22]:
# Load the two data-element descriptions that are interpolated into the system
# prompts. The encoding is pinned to UTF-8 so the prompt text does not depend
# on the platform's locale default (assumes the files are stored as UTF-8).
with open('utils/data_elements_doco.txt', encoding='utf-8') as f:
    data_elements_doco = f.read()

# Review-specific data elements, selected by the review identifier `rs`.
with open(f'utils/data_elements_{rs}.txt', encoding='utf-8') as f:
    specific_data_elements = f.read()

In [23]:
# System prompt for the review-specific extraction. It interpolates
# `specific_data_elements` (read from utils/data_elements_{rs}.txt) and asks
# for a JSON object with one field per data element, keyword-style values, and
# NA for anything not found in the paper.
# Note: this is a regular (non-raw) f-string, so each literal `\n` adds a
# newline on top of the physical line break, and the four-space indentation of
# each line is part of the prompt text.
system_prompt = f"""
    - You are an expert in data extraction for literature reviews in the social sciences.\n
    - You extract data from the paper provided by the user, based on the data elements specified below:\n
    {specific_data_elements}.

    - The data extracted should be a few keywords only, no full sentences.\n
    - Return the annotated paper into a valid JSON object, with one field for each data element.\n

    If the information from a specific data element is not available in the paper, simply return NA for that element.\n
    Return the data as closely as they appear in the original paper. do not modify the text.\n
    Do not include information outside the given paper. Do not make up an answer if the information is not available.
"""

In [24]:
# System prompt for the ontology-based extraction. It interpolates
# `data_elements_doco` (read from utils/data_elements_doco.txt) and asks for a
# JSON object with one field per Discourse Elements Ontology (DEO) class.
# Same construction as `system_prompt`: a non-raw f-string whose `\n` escapes
# and leading indentation are part of the prompt text.
# NOTE(review): "extracting semantic information extraction" in the first
# bullet reads as a duplicated word; left untouched here because it is runtime
# prompt text - confirm before changing.
# NOTE(review): this prompt is not referenced by the extraction loop visible in
# this part of the notebook; presumably it is used further down - verify.
system_prompt_doco = f"""
    - You are an expert at extracting semantic information extraction from scientific papers.\n
    - You extract data from the paper provided by the user, based on the classes of the Discourse Elements Ontology (DEO) specified below:\n 
    {data_elements_doco}.

    - The data extracted should be a few keywords only, no full sentences.\n
    - Return the annotated paper into a valid JSON object with one field for each DEO element.\n

    If the information from a specific data class is not available in the paper, simply return NA for that element.\n
    Return the data as closely as they appear in the original paper, do not modify the text\n
    Do not include information outside the given paper. Do not make up an answer if the information is not available.
"""

In [None]:
# Review-specific extraction: one chat completion per study, parsed from JSON
# into a row, then collected into a DataFrame and written to CSV.
#
# The model id is kept as a literal local to this cell (as in the original
# code) so the cell does not depend on the state of the global `model`; it is
# defined once and reused for both the API call and the output directory.
model_name = "mistral-small-2503"

# Rows are accumulated in a plain list; `data_extraction` is only ever bound to
# the final DataFrame, so the name never changes type.
extraction_records = []
for study in studies:
    user_prompt = f"Paper to annotate: {study['body']}"

    chat_response = client.chat.complete(
        model=model_name,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        # JSON mode: ask the API for a single JSON object as the reply.
        response_format={"type": "json_object"},
    )
    raw_content = chat_response.choices[0].message.content

    # A malformed, empty, or non-object reply for one paper must not abort the
    # whole run (and waste the calls already made): record the problem in an
    # `extraction_error` column for that study and continue.
    try:
        extracted = json.loads(raw_content)
        if not isinstance(extracted, dict):
            raise ValueError(
                f"expected a JSON object, got {type(extracted).__name__}"
            )
    except (TypeError, ValueError) as err:  # JSONDecodeError is a ValueError
        print(f"Could not parse extraction for {study['study']}: {err}")
        extracted = {"extraction_error": str(err)}

    # `study` stays the first column and is never overwritten by a same-named
    # key coming back from the model.
    study_data = {'study': study['study']}
    study_data.update(
        {key: value for key, value in extracted.items() if key != 'study'}
    )
    extraction_records.append(study_data)

    # Pause between requests (presumably to stay under the API rate limit).
    time.sleep(1)

data_extraction = pd.DataFrame(extraction_records)
data_extraction.to_csv(
    f'../llm-based_extraction/{model_name}/specific_extraction_{rs}.csv',
    index=False,
)

In [None]:
# Display the extraction table built by the extraction cell.
# NOTE(review): the saved output of this cell starts with "Ahmad 2021", which
# does not match the `studies.head()` preview for the current `rs` (first row
# "Andalibi et al 2018"), and the cell has no execution count - presumably stale
# output from a run on a different review; re-run to refresh.
data_extraction

Unnamed: 0,study,Geographical breadth of research,Analysis method,Data sources,Participants,Participant size,Roles of Library Service Platforms (LSP),Challenges that libraries face,Roles of Library Service Platforms
0,Ahmad 2021,"Perth, Australia",Quantitative,Ebook Library (EBL) transaction log files,ECU e-book users,"8,482, 9,353, and 11,690",,,
1,Bernard et al 2015,"Bremerhaven, Germany",Mixed methods,"PANGAEA repository, Baseline Surface Radiation...","Experts from computer science, a scientific di...",,"Content-based overview visualization, visual q...","Varying similarity notions, different time int...",
2,Dadhich et al 2021,India,Mixed methods,"Surveys, secondary data",Librarians and users of Indian universities,510,,,
3,Fu 2014,,quantitative,"online job postings, vendor staffing proposals...",systems librarians,52,"systems administration, system implementer, op...",transition from traditional ILS to next-genera...,
4,Fu and Carmen 2015,"Oregon, Washington, and Idaho",Qualitative,"direct observation, participant observation, e...",forty-one librarians and staff,,,"Electronic Resources, Primo, Acquisitions, Cat...",
5,Grammenis and Mourikis 2020,"Greece, USA",Qualitative,"Literature review, semi-structured interviews,...","Academic librarians, IT professionals, vendor ...","5 academic libraries, 1 vendor representative,...","Systems Administration, Cataloging, Acquisitio...","Inadequate funding, increasing demands for ser...",
6,Guo and Xu 2023,"United States, Canada",Qualitative,Survey,"Deans, directors of the library, university li...",,"Resource management, acquisitions, user reques...","Cost sharing, workflow design, policy, collabo...",
7,Lindy et al 2015,,quantitative,"questionnaire, direct observation, one-on-one ...","library users, library staff, library supervisors",,,"declining budget, complex library system, stat...","circulation management, online book procuremen..."
8,Liu and Fu 2018,,,"Marshall Breeding's annual product reports, Li...",,,"Resource sharing, cooperative acquisitions, e-...","funding, evolving role of the library, changin...",
9,Singley and Natches 2017,U.S. academic libraries,Quantitative,Online survey,Library staff involved in ERM tasks,299,"ERM, integrated print and electronic resource ...","Manual workarounds, patchwork of systems, lack...",
