In [6]:
import os
import json
import pandas as pd
import asyncio
from mistralai import Mistral
import nest_asyncio
import config
from tqdm.notebook import tqdm

# ── PATCH JUPYTER ASYNC LOOP ───────────────────────────────────────────────────
# Applied once, before any coroutine is awaited in this notebook.
nest_asyncio.apply()

# ── CONFIG ─────────────────────────────────────────────────────────────────────
INPUT_CSV = r"D:/Data Extraction FSP/Odete-br/odete_data/processed/Profiles_processed.csv"
OUTPUT_CSV = r"D:/Data Extraction FSP/Odete-br/odete_data/processed/Profiles_processed_AI.csv"
IMAGE_BASE = "https://images.odete.com.br/uploads/"
MODEL_NAME = "mistral-medium-latest" #"pixtral-12b-latest"   # change to desired model
RATE_LIMIT = 1  # requests per second scheduling

# ── SETUP CLIENT ────────────────────────────────────────────────────────────────
# Look the key up defensively: a plain `config.FSP2025` raises AttributeError
# before the guard below can produce its friendlier message.
api_key = getattr(config, "FSP2025", None)
if not api_key:
    raise RuntimeError("Please set FSP2025 (the Mistral API key) in your config module")
client = Mistral(api_key=api_key)

# ── LOAD PROFILES AND PREPARE COLUMNS ──────────────────────────────────────────
# Everything is read as text and missing cells become "" so the string
# operations applied to each row later never see NaN.
df = pd.read_csv(INPUT_CSV, dtype=str).fillna("")
# Smoke-test limit: only the first three profiles are processed in this cell.
df = df.head(3)
# Keys expected in the model's JSON reply; each is stored in an "ai_<key>" column.
fields = [
    "age", "gender", "demographic_infos", "skin_color",
    "hair", "eyes", "education_info", "personal_qualities",
    "other", "anonymized_bio", "notes"
]
# Subset of fields whose "<key>_confidence" score is also stored.
conf_fields = ["age", "gender", "skin_color", "hair", "eyes", "education_info"]
for f in fields:
    df[f"ai_{f}"] = None
for f in conf_fields:
    df[f"ai_{f}_confidence"] = None
# metadata column
df["ai_model"] = MODEL_NAME

# ── ASYNC PROCESSING FUNCTION ─────────────────────────────────────────────────
def _strip_code_fence(text):
    """Return *text* without a surrounding Markdown code fence, if it has one."""
    text = text.strip()
    if text.startswith("```"):
        # Drop the whole opening fence line ("```" or "```json").
        text = text.partition("\n")[2].rstrip()
        if text.endswith("```"):
            text = text[:-3]
    return text.strip()


async def process_row(idx, row, delay):
    """Ask the model to annotate one profile and store the answer in ``df``.

    Args:
        idx: Index label of the row in the module-level ``df`` to update.
        row: The profile row; must provide ``worker_id_from_filename``,
            ``first_name``, ``biography`` and ``profile_image_filename``.
        delay: Seconds to sleep before sending the request (used by the
            caller to stagger requests).

    Returns:
        None. Results are written to the ``ai_*`` columns of ``df``. On any
        request or parsing failure the error text is written to ``ai_notes``
        for that row and the other ``ai_*`` columns are left untouched.
    """
    await asyncio.sleep(delay)
    pid  = row["worker_id_from_filename"]
    name = row["first_name"]
    bio  = row["biography"].replace("\n", " ")
    img  = IMAGE_BASE + row["profile_image_filename"]

    # NOTE(review): the image is only referenced as a URL inside the text
    # prompt; confirm the model actually retrieves it, otherwise it has to be
    # sent as an image content chunk.
    prompt = f"""
id: {pid}
Name: {name}
Bio: {bio}
Image URL: {img}
Instruction: from the image, bio, name. Extract and estimate: age, gender (preferably from image), demographic infos, skin color, hair, eyes, education info, personal qualities as an array, other potential quality "other:" into JSON.
 - i want an "anonymized_bio" version (in the original language) that does not contain personal information "name, age, location.."
 - I will use this returned JSON to enhance the dataset. so i want you to reply with JSON only, nothing else. and to provide atomic values for each field. i dont want descriptions or explanations, just the values in JSON format. you can include the metadata and explanations (such as from where the infos were extracted, any potential issues..) in the "notes" field.
 - For each field (do it in english). i want an "anonymized_bio" version (in the original language) that does not contain personal information "name, age, location.." (i wanna publish the dataset later). 
 - i want you to add another field "<field_name>_confidence" with a value between 0 and 1, to indicate how confident you are about the extracted value. if you are not sure, set it to 0.5.
 the JSON should look like this + <field_name>_confidence:
 
 {{
  "id": "xxx",
  "age": ,
  "gender": "Female",
  "demographic_infos": "",
  "skin_color": "",
  "hair": "",
  "eyes": "",
  "education_info": "",
  "personal_qualities": [
    "xx",
    "yy",
    "zz",
  ],
  "other": "",
  "anonymized_bio": ,
  "notes": ""
}}
 
"""
    raw = ""
    try:
        print(f"Processing {pid} - {name}")
        # The client call is blocking, so run it in a worker thread to keep
        # the event loop free for the other rows.
        resp = await asyncio.to_thread(
            client.chat.complete,
            model=MODEL_NAME,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=1500,
            temperature=0.0
        )
        print(f"Processed {pid} - {name}")
        raw = resp.choices[0].message.content.strip()
        print(f"Processed {pid} - {name}: {raw}")
        # The model tends to wrap its answer in a ```json fence, which
        # json.loads rejects, so remove it first.
        data = json.loads(_strip_code_fence(raw))
        if not isinstance(data, dict):
            raise ValueError(f"expected a JSON object, got {type(data).__name__}")
    except json.JSONDecodeError as e:
        df.at[idx, "ai_notes"] = f"JSON decode error: {e} | Raw start: {raw[:100]}"
        return
    except Exception as e:
        # Best effort: record the failure on the row instead of aborting the batch.
        df.at[idx, "ai_notes"] = f"Error: {e}"
        return

    # populate results
    for f in fields:
        if f in data:
            df.at[idx, f"ai_{f}"] = data[f]
    for f in conf_fields:
        key = f + "_confidence"
        if key in data:
            df.at[idx, f"ai_{f}_confidence"] = data[key]
    df.at[idx, "ai_model"] = MODEL_NAME

# ── MAIN ASYNC EXECUTION WITH PROGRESS ─────────────────────────────────────────
async def main():
    """Annotate every row of ``df`` and write the result to ``OUTPUT_CSV``.

    All rows are scheduled immediately; row number *n* sleeps
    ``n / RATE_LIMIT`` seconds before sending its request. A failure in one
    row is recorded in that row's ``ai_notes`` and does not stop the others,
    so the CSV is written once every task has finished.
    """
    interval = 1.0 / RATE_LIMIT

    async def _run_row(idx, row, delay):
        # Keep an unexpected error in one row from aborting the whole batch
        # (and losing every result gathered so far).
        try:
            await process_row(idx, row, delay)
        except Exception as e:
            df.at[idx, "ai_notes"] = f"Error: {e}"

    # Stagger by row position rather than index label, so the schedule stays
    # correct even when the index is not 0..n-1.
    tasks = [
        asyncio.create_task(_run_row(idx, row, pos * interval))
        for pos, (idx, row) in enumerate(df.iterrows())
    ]

    # track completion
    pbar = tqdm(total=len(tasks), desc="Profiles processed")
    try:
        for coro in asyncio.as_completed(tasks):
            await coro
            pbar.update(1)
    finally:
        pbar.close()

    df.to_csv(OUTPUT_CSV, index=False)
    print(f"Saved augmented profiles to: {OUTPUT_CSV}")

# ── RUN IN JUPYTER ─────────────────────────────────────────────────────────────
# Top-level `await` only works inside an IPython/Jupyter cell; in a plain
# script this would have to be `asyncio.run(main())`.
await main()

Profiles processed:   0%|          | 0/3 [00:00<?, ?it/s]

Processing 573798b68c7e81f47727e655 - Maria
Processing 573798b68c7e81f47727e657 - Marilia
Processing 573798b68c7e81f47727e664 - Solange
Processed 573798b68c7e81f47727e655 - Maria
Processed 573798b68c7e81f47727e655 - Maria: ```json
{
  "id": "573798b68c7e81f47727e655",
  "age": 30,
  "age_confidence": 0.6,
  "gender": "Female",
  "gender_confidence": 0.9,
  "demographic_infos": "Brazilian",
  "demographic_infos_confidence": 0.7,
  "skin_color": "Light",
  "skin_color_confidence": 0.8,
  "hair": "Brown",
  "hair_confidence": 0.8,
  "eyes": "Brown",
  "eyes_confidence": 0.7,
  "education_info": "Not specified",
  "education_info_confidence": 0.5,
  "personal_qualities": [
    "Dedicated",
    "Responsible",
    "Passionate",
    "Detail-oriented",
    "Punctual",
    "Precise"
  ],
  "other": "Friendly and hardworking",
  "other_confidence": 0.7,
  "anonymized_bio": "Sou uma pessoa muito dedicada, responsável, gosto muito do que eu faço, faço tudo com muito amor, sou muito caprichosa e po

## New working version (strips Markdown code fences before parsing the JSON reply)

In [13]:
import os
import re
import json
import pandas as pd
import asyncio
from mistralai import Mistral
import nest_asyncio
import config
from tqdm.notebook import tqdm

# ── PATCH JUPYTER ASYNC LOOP ───────────────────────────────────────────────────
# Applied once, before any coroutine is awaited in this notebook.
nest_asyncio.apply()

# ── CONFIG ─────────────────────────────────────────────────────────────────────
INPUT_CSV = r"D:/Data Extraction FSP/Odete-br/odete_data/processed/Profiles_processed.csv"
OUTPUT_CSV = r"D:/Data Extraction FSP/Odete-br/odete_data/processed/Profiles_processed_AI.csv"
IMAGE_BASE = "https://images.odete.com.br/uploads/"
MODEL_NAME =  "mistral-medium-latest" #"pixtral-12b-latest"   # change to desired model
RATE_LIMIT = 1  # requests per second scheduling

# ── SETUP CLIENT ────────────────────────────────────────────────────────────────
# Look the key up defensively: a plain `config.FSP2025` raises AttributeError
# before the guard below can produce its friendlier message.
api_key = getattr(config, "FSP2025", None)
if not api_key:
    raise RuntimeError("Please set FSP2025 (the Mistral API key) in your config module")
client = Mistral(api_key=api_key)

# ── LOAD PROFILES AND PREPARE COLUMNS ──────────────────────────────────────────
# Everything is read as text and missing cells become "" so the string
# operations applied to each row later never see NaN.
df = pd.read_csv(INPUT_CSV, dtype=str).fillna("")
#df = df.head(3)  # for testing
# Keys expected in the model's JSON reply; each is stored in an "ai_<key>" column.
fields = [
    "age", "gender", "demographic_infos", "skin_color",
    "hair", "eyes", "education_info", "personal_qualities",
    "other", "anonymized_bio", "notes"
]
# Subset of fields whose "<key>_confidence" score is also stored.
conf_fields = ["age", "gender", "skin_color", "hair", "eyes", "education_info"]
for f in fields:
    df[f"ai_{f}"] = None
for f in conf_fields:
    df[f"ai_{f}_confidence"] = None
# metadata column
df["ai_model"] = MODEL_NAME


# ── ASYNC PROCESSING FUNCTION ─────────────────────────────────────────────────
def _strip_code_fence(text):
    """Return *text* without a surrounding Markdown code fence, if it has one."""
    text = text.strip()
    if text.startswith("```"):
        # Drop the whole opening fence line ("```" or "```json"), whatever its
        # letter case, trailing spaces or line ending.
        text = text.partition("\n")[2].rstrip()
        if text.endswith("```"):
            text = text[:-3]
    return text.strip()


async def process_row(idx, row, delay):
    """Ask the model to annotate one profile and store the answer in ``df``.

    Args:
        idx: Index label of the row in the module-level ``df`` to update.
        row: The profile row; must provide ``worker_id_from_filename``,
            ``first_name``, ``biography`` and ``profile_image_filename``.
        delay: Seconds to sleep before sending the request (used by the
            caller to stagger requests).

    Returns:
        None. Results are written to the ``ai_*`` columns of ``df``; keys
        missing from the reply are stored as ``None``. On any request or
        parsing failure the error text is written to ``ai_notes`` for that
        row and the other ``ai_*`` columns are left untouched.
    """
    await asyncio.sleep(delay)
    pid  = row["worker_id_from_filename"]
    name = row["first_name"]
    bio  = row["biography"].replace("\n", " ")
    img  = IMAGE_BASE + row["profile_image_filename"]
    # NOTE(review): the image is only referenced as a URL inside the text
    # prompt; confirm the model actually retrieves it, otherwise it has to be
    # sent as an image content chunk.
    prompt = f"""
id: {pid}
Name: {name}
Bio: {bio}
Image URL: {img}
Instruction: from the image, bio, name. Extract and estimate: age, gender (preferably from image), demographic infos, skin color, hair, eyes, education info, personal qualities as an array, other potential quality "other:" into JSON.
 - i want an "anonymized_bio" version (in the original language) that does not contain personal information "name, age, location.."
 - I will use this returned JSON to enhance the dataset. so i want you to reply with JSON only, nothing else. and to provide atomic values for each field. i dont want descriptions or explanations, just the values in JSON format. you can include the metadata and explanations (such as from where the infos were extracted, any potential issues..) in the "notes" field.
 - For each field (do it in english). i want an "anonymized_bio" version (in the original language) that does not contain personal information "name, age, location.." (i wanna publish the dataset later). 
 - i want you to add another field "<field_name>_confidence" with a value between 0 and 1, to indicate how confident you are about the extracted value. if you are not sure, set it to 0.5.
 the JSON should look like this + <field_name>_confidence:
 
 {{
  "id": "xxx",
  "age": ,
  "gender": "Female",
  "demographic_infos": "",
  "skin_color": "",
  "hair": "",
  "eyes": "",
  "education_info": "",
  "personal_qualities": [
    "xx",
    "yy",
    "zz",
  ],
  "other": "",
  "anonymized_bio": ,
  "notes": ""
}}
 
"""

    raw = ""
    try:
        # The client call is blocking, so run it in a worker thread to keep
        # the event loop free for the other rows.
        resp = await asyncio.to_thread(
            client.chat.complete,
            model=MODEL_NAME,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=1500,
            temperature=0.0
        )
        raw = resp.choices[0].message.content.strip()
        # strip code fences
        data = json.loads(_strip_code_fence(raw))
        # A JSON list or scalar would make the .get() calls below raise
        # outside this try block, so reject it here.
        if not isinstance(data, dict):
            raise ValueError(f"expected a JSON object, got {type(data).__name__}")
    except json.JSONDecodeError as e:
        df.at[idx, "ai_notes"] = f"JSON decode error: {e} | Raw start: {raw[:100]}"
        return
    except Exception as e:
        # Best effort: record the failure on the row instead of aborting the batch.
        df.at[idx, "ai_notes"] = f"Error: {e}"
        return

    # populate results
    for f in fields:
        df.at[idx, f"ai_{f}"] = data.get(f)
    for f in conf_fields:
        df.at[idx, f"ai_{f}_confidence"] = data.get(f + "_confidence")
    df.at[idx, "ai_model"] = MODEL_NAME

# ── MAIN ASYNC EXECUTION WITH PROGRESS ─────────────────────────────────────────
async def main():
    """Annotate every row of ``df`` and write the result to ``OUTPUT_CSV``.

    All rows are scheduled immediately; row number *n* sleeps
    ``n / RATE_LIMIT`` seconds before sending its request. A failure in one
    row is recorded in that row's ``ai_notes`` and does not stop the others,
    so the CSV is written once every task has finished.
    """
    interval = 1.0 / RATE_LIMIT

    async def _run_row(idx, row, delay):
        # Keep an unexpected error in one row from aborting the whole batch
        # (and losing every result gathered so far).
        try:
            await process_row(idx, row, delay)
        except Exception as e:
            df.at[idx, "ai_notes"] = f"Error: {e}"

    # Stagger by row position rather than index label, so the schedule stays
    # correct even when the index is not 0..n-1.
    tasks = [
        asyncio.create_task(_run_row(idx, row, pos * interval))
        for pos, (idx, row) in enumerate(df.iterrows())
    ]

    pbar = tqdm(total=len(tasks), desc="Profiles processed")
    try:
        for coro in asyncio.as_completed(tasks):
            await coro
            pbar.update(1)
    finally:
        # Close the bar even if the run is interrupted.
        pbar.close()

    df.to_csv(OUTPUT_CSV, index=False)
    print(f"Saved augmented profiles to: {OUTPUT_CSV}")

# ── RUN IN JUPYTER ─────────────────────────────────────────────────────────────
# Top-level `await` only works inside an IPython/Jupyter cell; in a plain
# script this would have to be `asyncio.run(main())`.
await main()


Profiles processed:   0%|          | 0/3017 [00:00<?, ?it/s]

Saved augmented profiles to: D:/Data Extraction FSP/Odete-br/odete_data/processed/Profiles_processed_AI.csv
