## load modules

Remember to set the model name: OpenAI updates model names from time to time, and old ones may become invalid.

e.g. gpt-4o-mini-2024-07-18

In [None]:
import sys
!{sys.executable} -m pip install python-dotenv openai
import json
import os
from pathlib import Path

import openai
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m


In [None]:
# Configuration: OpenAI model name passed as `model` to the API calls in this notebook.
# This is a plain constant (not an environment variable); edit it here if the name stops working.

model_name = "gpt-4o-mini-2024-07-18"


In [13]:
import os
from dotenv import load_dotenv
from openai import OpenAI

# Load .env so the API key can be read from the environment instead of being written here
load_dotenv()

# Build the API client from the OPENAI_API_KEY environment variable
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Smoke test: send a one-message chat request using `model_name` (defined in the
# configuration cell) to check that the key works
response = client.chat.completions.create(
    model=model_name,
    messages=[{"role": "user", "content": "Hello!"}]
)

# Show the text of the first returned choice
print(response.choices[0].message.content)


Hello! How can I assist you today?


## Classification

In [None]:
class File:
    """One input CSV together with the topic tallies collected for it."""

    def __init__(self, file_path):
        # Tally of topic name -> number of rows tagged with that topic;
        # a fresh dict per instance so files never share counts.
        self.topics = dict()
        # Location of the CSV this object stands for.
        self.file_path = file_path

        

In [None]:
# Load .env and create the OpenAI client that build_structured_response calls;
# the key is read from the OPENAI_API_KEY environment variable
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Topic labels offered to the model in the tagging prompt (one per line for easy diffing)
candidate_topics = [
    "Gender roles",
    "Family structure",
    "Parenting",
    "Marriage",
    "Childhood",
    "Work-life balance",
    "Caregiving",
    "Inheritance",
    "Education",
    "Divorce",
    "Religion and family",
    "Adoption",
    "Domestic labor",
    "Migration",
    "Generational conflict",
]

def build_structured_response(question, answer):
    """Ask the model which candidate topics apply to one question/answer pair.

    Builds a prompt from the pair and the module-level ``candidate_topics``
    list, sends it through the module-level ``client`` using ``model_name``,
    and requests a reply that follows a strict JSON schema.

    Args:
        question: Question text inserted into the prompt.
        answer: Answer text inserted into the prompt.

    Returns:
        The value of the ``relevant_topics`` field of the parsed reply
        (a list of topic strings according to the requested schema).

    Raises:
        json.JSONDecodeError: If the reply text is not valid JSON.
        KeyError: If the parsed reply has no ``relevant_topics`` field.
        Any exception raised by the API client propagates unchanged.
    """
    system_message = "You are a helpful assistant tasked with tagging conversations based on their content using a structured JSON schema."
    user_message = f"""
Question: {question}
Answer: {answer}

From this list of candidate topics:
{', '.join(candidate_topics)}

Return a JSON object with a field 'relevant_topics' containing 1–3 matching topic strings.
"""

    # Restrict array items to the candidate list so the schema itself, not just
    # the prompt wording, rules out topics that were never offered.
    topic_item_schema = {"type": "string", "enum": list(candidate_topics)}

    response = client.responses.create(
        model=model_name,
        input=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message}
        ],
        text={
            "format": {
                "type": "json_schema",
                "name": "row_topic_extraction",
                "schema": {
                    "type": "object",
                    "properties": {
                        "relevant_topics": {
                            "type": "array",
                            "items": topic_item_schema
                        }
                    },
                    "required": ["relevant_topics"],
                    "additionalProperties": False
                },
                "strict": True
            }
        }
    )

    return json.loads(response.output_text)["relevant_topics"]


def single_csv_classification(file_obj, output_folder):
    """Label every row of one CSV with topics and write a ``*_labeled.csv`` copy.

    Reads ``file_obj.file_path``, calls ``build_structured_response`` with each
    row's ``question`` and ``answer`` values, stores the result in a new
    ``topics`` column, and tallies topic frequencies in ``file_obj.topics``.
    A row that raises is reported on stdout and labelled with an empty list,
    so one bad row does not abort the whole file.

    Args:
        file_obj: Object exposing ``file_path`` (the CSV to read) and a
            ``topics`` dict that is updated in place with per-topic counts.
        output_folder: Directory that receives ``<input stem>_labeled.csv``;
            created if it does not exist yet.
    """
    df = pd.read_csv(file_obj.file_path)

    # Collect one label list per row and attach them as a single object column
    # after the loop, instead of writing list objects cell-by-cell into a
    # column created from the string scalar "" (which relies on that column
    # accepting non-string values).
    labels = []
    for i, row in tqdm(df.iterrows(), total=len(df)):
        try:
            topics = build_structured_response(row["question"], row["answer"])

            # Count topic frequencies
            for topic in topics:
                file_obj.topics[topic] = file_obj.topics.get(topic, 0) + 1

        except Exception as e:
            # Best-effort: report the failure and keep going with an empty label
            print(f"Error at row {i}: {e}")
            topics = []

        labels.append(topics)

    df["topics"] = pd.Series(labels, index=df.index, dtype=object)

    # Save to file, making sure the destination directory is there first
    destination = Path(output_folder)
    destination.mkdir(parents=True, exist_ok=True)
    filename = Path(file_obj.file_path).stem
    df.to_csv(destination / f"{filename}_labeled.csv", index=False)

In [None]:
# Folder holding the input CSV files
folder_path = Path("Phase2_Folders/030_processed_html")

# Folder that receives the *_labeled.csv outputs
output_folder= "topic_labeling"
# Ensure output folder exists
os.makedirs(output_folder, exist_ok=True)

# Processed File objects keyed by CSV file name
files = {}

# Loop through all CSV files in the folder; sorted() makes the processing
# (and printing) order deterministic across runs
for file_path in sorted(folder_path.glob("*.csv")):
    file_obj = File(file_path)
    # output_folder is a required argument; omitting it raises TypeError
    single_csv_classification(file_obj, output_folder)
    files[file_path.name] = file_obj  # Keep the object so topic counts can be read later

# Print topic counts from all files
for fname, file_obj in files.items():
    print(f"\n📄 {fname}")
    for topic, count in file_obj.topics.items():
        print(f"  {topic}: {count}")