### Exploration of the Mistral API

In [1]:
from config import MISTRALAI_API_KEY
from mistralai import Mistral
import pandas as pd
import json
import time
import ast

# Identifier of the Mistral model passed to the chat-completion call below.
model = "mistral-small-2503"
# API client; the key is imported from the local `config` module instead of
# being written in the notebook.
client = Mistral(api_key=MISTRALAI_API_KEY)

In [2]:
# Read the study corpus and keep only the rows whose `oa_status` is True
# (presumably the open-access flag — confirm against the CSV), then preview it.
studies = (
    pd.read_csv('../csvs/Lookingbill_Wagner_2025.csv')
    .loc[lambda corpus: corpus['oa_status'] == True]
)
studies.head()

Unnamed: 0,doi,url,study,oa_status,first_author,title,abstract,published,journal,publisher,body,nb_tokens_openai_tiktoken,nb_tokens_mistral_sentencepiece
2,10.1145/3234942,,Andalibi et al 2018,True,Nazanin Andalibi,"Social Support, Reciprocity, and Anonymity in ...",\r\nSeeking and providing support is challengi...,2018-10,ACM Transactions on Computer-Human Interaction,Association for Computing Machinery (ACM),INTRODUCTION\r\nSocial media platforms are oft...,5956,6882
6,10.1177/2056305120974610,,Deal et al 2020,True,Bonnie-Elene Deal,“I Definitely Did Not Report It When I Was R...,"\r\nThe ""Me Too"" movement, founded by activist...",2020-10,Social Media + Society,SAGE Publications,"\r\nThe ""Me Too"" movement was founded in 2006 ...",6989,8163
14,10.1177/2056305120984447,,Madden and Alt 2021,True,Stephanie Madden,Know Her Name: Open Dialogue on Social Media a...,\r\nInnovative justice focuses on crime victim...,2021-01,Social Media + Society,SAGE Publications,"\r\nIn March 2016, Brock Turner was convicted ...",6754,7629
16,10.1177/1461444818820069,,Mendes et al 2019,True,Kaitlynn Mendes,Digitized narratives of sexual violence: Makin...,"\r\nIn this article, we argue that social medi...",2018-12-29,New Media & Society,SAGE Publications,"Introduction\r\nIn 2012, 16 students enrolled ...",6901,7986
17,10.2196/13837,,Modrek and Chakalov 2019,True,PhD Sepideh Modrek,The #MeToo Movement in the United States: Text...,\r\nBackground:The #MeToo movement sparked an ...,2019-09-03,Journal of Medical Internet Research,JMIR Publications Inc.,Introduction\r\nPublic discourse on sensitive ...,6364,7166


In [3]:
# Convert the filtered DataFrame into a list of per-study dicts (one dict per row)
# so the extraction loop can read fields such as study['body'] and study['study'].
# The isinstance guard keeps this cell re-runnable: once `studies` is already a
# list, calling .to_dict on it again would raise AttributeError.
if isinstance(studies, pd.DataFrame):
    studies = studies.to_dict(orient='records')

In [4]:
# Bullet list of the data elements the model is asked to extract from each paper;
# it is interpolated into `system_prompt` below.
# The explicit `\n` escapes add a newline on top of each physical line break, so
# the rendered text contains a blank line after every bullet.
specific_data_elements = """
    - Methodology: Whether the study followed a qualitative, quantitative or mixed methods research design\n
    - Sample : Size and nature of the sample used in the study\n
    - ICT : The Information and Communication Technologies (ICTs) used by the participants of the study. ICTs include search engines, social network sites, and smartphones\n
"""

# System message used for every request in the extraction loop. This is an
# f-string: `specific_data_elements` is substituted in place, and the doubled
# braces ({{ and }}) render as literal { and } in the final prompt.
# NOTE(review): the structure shown to the model (single-quoted names followed by
# `#` annotations) is not itself valid JSON; the recorded responses nevertheless
# come back as JSON objects with double-quoted keys — confirm this template is
# specific enough for other models.
system_prompt = f"""
    - You are an expert in data extraction for literature reviews in the social sciences.
    - You extract data from the paper provided by the user, based on the data elements specified below:\n
    {specific_data_elements}.

    - The data extracted should be a few keywords only, no full sentences
    - Return the annotated paper into a JSON object structured as follows : 
    {{
        'Methodology', # Extracted methodology
        'Sample', # Extracted study sample
        'ICT' # Extracted ICTs
    }}
    If the information from a specific data element is not available in the paper, simply return NA for that element.\n
    Return the data as closely as they appear in the original paper. Do not change or modify the text.\n
    Do not include information outside the given paper. Do not make up an answer if the information is not available.
"""

In [5]:
# Send each study's body text to the model and collect the raw answers as
# (study label, response text) tuples, in the same order as `studies`.
data_extraction = []
for study in studies:
    user_prompt = f"Paper to annotate: {study['body']}"

    # Two-message conversation: fixed extraction instructions, then the paper.
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    # Request a JSON-object response so the answer can be parsed afterwards.
    chat_response = client.chat.complete(
        model=model,
        messages=conversation,
        response_format={"type": "json_object"},
    )

    raw_answer = chat_response.choices[0].message.content
    data_extraction.append((study['study'], raw_answer))

    # Pause for one second between consecutive API calls.
    time.sleep(1)

In [7]:
# Display the collected (study label, raw response text) tuples.
data_extraction

[('Andalibi et al 2018',
  '    {\n        "Methodology": "NA",\n        "Sample": "NA",\n        "ICT": "Reddit, Facebook, social media platforms, social network sites, throwaway accounts, pseudonymous accounts, primary Reddit identity, social media posts, social media comments, social network accounts, Google+, Tumblr, Ask.fm, Class Confessions pages, discussion boards, news articles"\n    }'),
 ('Deal et al 2020',
  '{\n    "Methodology": "quantitative",\n    "Sample": "1,458 tweets from unique users in seven of the eight cities",\n    "ICT": "Twitter, Social Media Analytic and Research Testbed (SMART) dashboard, Twitter search API, MongoDB database"\n}'),
 ('Madden and Alt 2021',
  '{\n    "Methodology": "qualitative",\n    "Sample": "NA",\n    "ICT": "Instagram"\n}'),
 ('Mendes et al 2019',
  '{\n    "Methodology": "qualitative content analysis, critical discourse analysis",\n    "Sample": "150 posts from the Who Needs Feminism? Tumblr between March 2014 and January 2015, 300 twee

In [None]:
# Your raw model output
df_data_extraction = pd.Da
ast.literal_eval(data_extraction)
parsed_json['Study'] = data_extraction[0][0]

# Now it's a Python dictionary
print(parsed_json)

NameError: name 'df_data_extraction' is not defined

In [None]:
# Parse every raw model response into a dict and tag it with its study label.
# Raises json.JSONDecodeError if a response is not valid JSON after clean-up.
data_extraction_parsed = []
for study_label, raw_response in data_extraction:
    # Drop optional Markdown code fences, then surrounding whitespace.
    # The outer braces must be kept: stripping "{}" would leave text that
    # json.loads cannot parse as an object.
    cleaned = raw_response.replace('```json\n', '').replace('```', '').strip()
    parsed_json = json.loads(cleaned)
    parsed_json['Study'] = study_label

    data_extraction_parsed.append(parsed_json)


In [None]:
# Build a DataFrame from the parsed records and keep its column labels.
# NOTE(review): `extraction` ends up bound to the column Index, not to the
# DataFrame itself — confirm this is intended; drop `.columns` if the table
# of extracted data is what later steps need.
extraction = pd.DataFrame(data_extraction_parsed).columns