### Exploration API Mistral

In [1]:
from config import MISTRALAI_API_KEY
from mistralai import Mistral
import pandas as pd
import json
import time
import ast

model = "mistral-small-2503"
client = Mistral(api_key=MISTRALAI_API_KEY)

In [2]:
studies = pd.read_csv('../csvs/Lookingbill_Wagner_2025.csv')
studies = studies[studies['oa_status'] == True]
studies.head()

Unnamed: 0,doi,url,study,oa_status,first_author,title,abstract,published,journal,publisher,body,nb_tokens_openai_tiktoken,nb_tokens_mistral_sentencepiece
2,10.1145/3234942,,Andalibi et al 2018,True,Nazanin Andalibi,"Social Support, Reciprocity, and Anonymity in ...",\nSeeking and providing support is challenging...,2018-10,ACM Transactions on Computer-Human Interaction,Association for Computing Machinery (ACM),INTRODUCTION\nSocial media platforms are often...,5956,6882
6,10.1177/2056305120974610,,Deal et al 2020,True,Bonnie-Elene Deal,â€œI Definitely Did Not Report It When I Was R...,"\nThe ""Me Too"" movement, founded by activist T...",2020-10,Social Media + Society,SAGE Publications,"\nThe ""Me Too"" movement was founded in 2006 by...",6989,8163
14,10.1177/2056305120984447,,Madden and Alt 2021,True,Stephanie Madden,Know Her Name: Open Dialogue on Social Media a...,\nInnovative justice focuses on crime victim n...,2021-01,Social Media + Society,SAGE Publications,"\nIn March 2016, Brock Turner was convicted of...",6754,7629
16,10.1177/1461444818820069,,Mendes et al 2019,True,Kaitlynn Mendes,Digitized narratives of sexual violence: Makin...,"\nIn this article, we argue that social media ...",2018-12-29,New Media & Society,SAGE Publications,"Introduction\nIn 2012, 16 students enrolled in...",6901,7986
17,10.2196/13837,,Modrek and Chakalov 2019,True,PhD Sepideh Modrek,The #MeToo Movement in the United States: Text...,\nBackground:The #MeToo movement sparked an in...,2019-09-03,Journal of Medical Internet Research,JMIR Publications Inc.,Introduction\nPublic discourse on sensitive to...,6364,7166


In [3]:
studies = studies.to_dict('records')

In [4]:
specific_data_elements = """
    - Methodology: Whether the study followed a qualitative, quantitative or mixed methods research design\n
    - Sample : Size and nature of the sample used in the study\n
    - ICT : The Information and Communication Technologies (ICTs) used by the participants of the study. ICTs include search engines, social network sites, and smartphones\n
"""

system_prompt = f"""
    - You are an expert in data extraction for literature reviews in the social sciences.
    - You extract data from the paper provided by the user, based on the data elements specified below:\n
    {specific_data_elements}.

    - The data extracted should be a few keywords only, no full sentences
    - Return the annotated paper into a valid JSON object structured as follows : 
    {{
        'methodology', # Extracted methodology
        'sample', # Extracted study sample
        'ICT' # Extracted ICTs
    }}
    If the information from a specific data element is not available in the paper, simply return NA for that element.\n
    Return the data as closely as they appear in the original paper. Do not change or modify the text.\n
    Do not include information outside the given paper. Do not make up an answer if the information is not available.
"""

In [5]:
data_extraction = [] 
for study in studies:
    user_prompt = f"Paper to annotate: {study['body']}"

    chat_response = client.chat.complete(
        model = model,
        messages = [
            {
                "role": "system",
                "content": f"{system_prompt}",
            },
            {
                "role": "user",
                "content": f"{user_prompt}",
            }
        ],
        response_format = {
            "type": "json_object"
        }
    )
    study_data = {'study': study ['study']}
    study_data.update(json.loads(chat_response.choices[0].message.content))

    data_extraction.append(study_data)
    time.sleep(1)

data_extraction = pd.DataFrame(data_extraction)
data_extraction.to_csv('../llm-based_extraction/extraction_Lookingbill_Wagner_2025.csv', index=False)

In [6]:
pd.DataFrame(data_extraction)

Unnamed: 0,study,methodology,sample,ICT
0,Andalibi et al 2018,,,"[Reddit, Facebook, social media platforms, soc..."
1,Deal et al 2020,quantitative,"1,458 tweets from unique users in seven of the...","Twitter, Social Media Analytic and Research Te..."
2,Madden and Alt 2021,qualitative,,"Instagram, social media"
3,Mendes et al 2019,"qualitative content analysis, critical discour...","450 texts, 150 posts from the Who Needs Femini...","Tumblr, Twitter, smartphones"
4,Modrek and Chakalov 2019,quantitative,"11,935 novel English language US-based tweets",Twitter
5,O'Neill 2018,qualitative,176 posts by victim-survivors of sexual violence,"reddit, Tumblr, Facebook, Yahoo! Answers, smar..."
