In [23]:
!pip install openai



In [None]:
import os, json
import pandas as pd
from openai import OpenAI
from concurrent.futures import ThreadPoolExecutor, as_completed

# Read the API key from the OPENAI_API_KEY environment variable so the secret
# never has to be written into this file (export it before running).
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Wellness label columns that every model response must provide;
# label_records_with_gpt raises if any of them is missing from the output.
# The prompt's extra "neutral" flag is intentionally not part of this list.
CATEGORIES = [
    "physical_wellness",
    "intellectual_wellness",
    "occupational_wellness",
    "financial_wellness",
    "social_interaction_wellness",
    "spiritual_wellness",
    "mental_wellness",
    "compassion_contribution_wellness",
    "family_and_caregiving",
    "leisure_and_travel",
    "life_events_transitions",
]

# System prompt sent with every labeling request. It pins the reply to a JSON
# object with a single "items" key holding one entry per input record; each
# entry carries the input text, one integer label per wellness category, and
# a "neutral" flag. Some categories accept -1|0|1, the others only 0|1.
# NOTE(review): the prompt never states what -1 versus 1 means — presumably
# negative versus positive impact; confirm with the prompt author.
# The text below is runtime input to the model: editing it changes the labels.
SYSTEM_MSG = """You are a precise labeling assistant.
Output STRICT JSON that matches this schema:

{
  "items": [
    {
      "text": "<exact input text>",
      "physical_wellness": -1|0|1,
      "intellectual_wellness": 0|1,
      "occupational_wellness": -1|0|1,
      "financial_wellness": -1|0|1,
      "social_interaction_wellness": 0|1,
      "spiritual_wellness": 0|1,
      "mental_wellness": -1|0|1,
      "compassion_contribution_wellness": 0|1,
      "family_and_caregiving": -1|0|1,
      "leisure_and_travel": 0|1,
      "life_events_transitions": 0|1,
      "neutral": 0|1
    }
  ]
}

Rules:
- Preserve input order.
- For each input record, return exactly one object.
- Copy the input 'text' field verbatim; do not invent or rewrite.
- Use only the allowed label values exactly as specified. 
- always consider the user point of view while answering the question not the text contex where he/she might be saying about their friend/relatives state
- If no strong evidence for any category, use 0 for all categories and 1 for neutral.
- Label each record. Keep order. Return a JSON OBJECT with key 'items' only (no explanations).
"""

def label_records_with_gpt(df_part: pd.DataFrame, text_column="text") -> pd.DataFrame:
    """Label one batch of texts with a single chat-completion request.

    Each value of ``df_part[text_column]`` is sent to the model as a record
    with a batch-local ``id`` and its ``text`` (missing values become an
    empty string). The reply is parsed as JSON and its ``items`` list is
    turned into a DataFrame.

    Args:
        df_part: Rows to label in this request.
        text_column: Name of the column in ``df_part`` holding the text.

    Returns:
        A DataFrame with one row per input row, built from the model's
        ``items``. It has a fresh 0..n-1 index, not the index of ``df_part``.
        For an empty ``df_part`` an empty frame with the prompt's columns is
        returned without calling the API.

    Raises:
        ValueError: If the reply is empty, is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError``), is not an object
            with an ``items`` list, has a different number of items than
            inputs, or lacks any column listed in ``CATEGORIES``.

    Errors raised by the API client itself are not caught here.
    """
    texts = df_part[text_column].tolist()
    if not texts:
        # Nothing to label: skip the round trip instead of asking the model
        # to label an empty list.
        return pd.DataFrame(columns=["text", *CATEGORIES, "neutral"])

    records = [
        {"id": i, "text": str(t) if pd.notna(t) else ""}
        for i, t in enumerate(texts)
    ]

    msgs = [
        {"role": "system", "content": SYSTEM_MSG},
        {"role": "user", "content": json.dumps({"records": records}, ensure_ascii=False)},
    ]

    resp = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=msgs,
        temperature=0,  # keep labeling as repeatable as the API allows
        response_format={"type": "json_object"},
    )

    content = resp.choices[0].message.content
    if not content:
        # The API may return no content (e.g. a refusal); fail with a clear
        # message rather than a TypeError from json.loads(None).
        raise ValueError("Model returned an empty response; nothing to parse.")

    obj = json.loads(content)
    if not isinstance(obj, dict):
        raise ValueError("Model response is not a JSON object.")

    items = obj.get("items", [])
    if not isinstance(items, list):
        raise ValueError("Model response field 'items' is not a list.")

    # One output object per input record is required; a count mismatch means
    # labels can no longer be matched to inputs by position.
    if len(items) != len(records):
        raise ValueError(f"Model returned {len(items)} items, expected {len(records)}.")

    out = pd.DataFrame(items)

    missing = [c for c in CATEGORIES if c not in out.columns]
    if missing:
        raise ValueError(f"Missing columns in output: {missing}")

    return out


if __name__ == "__main__":
    # Script entry point: label every row of one CSV split in parallel
    # batches and write the combined labels to a new CSV.
    df = pd.read_csv("split_file_8.csv")
    text_col = "text"

    batch_size = 20   # rows per request; much faster than one row per call
    max_workers = 5   # run 5 requests in parallel

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(
                label_records_with_gpt,
                df.iloc[start:start + batch_size],
                text_column=text_col,
            )
            for start in range(0, len(df), batch_size)
        ]
        # Read results in submission order, not completion order, so the
        # output rows line up with the input rows. The requests still run
        # concurrently; only the collection order is fixed.
        all_outputs = [future.result() for future in futures]

    if all_outputs:
        final_df = pd.concat(all_outputs, ignore_index=True)
    else:
        # An empty input produces no batches and pd.concat raises on an
        # empty list, so write a header-only file with the prompt's columns.
        final_df = pd.DataFrame(columns=["text", *CATEGORIES, "neutral"])

    final_df.to_csv("wellness_labels_split8.csv", index=False)
    print("Saved wellness_labels_split8.csv")


KeyboardInterrupt: 