# Prepare Data for Active Learning Experiments with Silicon Interviews

- parse the interviews
- create a train/test split
- label the test dataset
- set up the data
- write the new train/test sets to disk for later use

Relevant questions (Relevante Fragen), with the interview turn in which each is asked:

- Wie würdest du deinen Gesundheitszustand im Allgemeinen beschreiben? Sehr gut, gut, mittelmäßig, schlecht oder sehr schlecht? (Turn 2 Interviewerin)
- Vielen Dank für deine Einschätzung. Nun würde mich interessieren, was du dir bei der Beantwortung der Frage gedacht hast. Was ist dir in den Kopf gekommen, als du die Frage beantwortet hast? (Turn 3)
- Du hast gesagt, dass du deinen Gesundheitszustand mit **sehr gut** bewertest. Kannst du mir genauer erzählen, warum du deine Gesundheit so beschreiben würdest? (Turn 4)
- Kannst du uns das noch ein bisschen näher erläutern? (Turn 5)
- Was hätte sein müssen, damit du deine Gesundheit besser bzw. schlechter eingeschätzt hättest? (Turn 6)
- Was heißt „Gesundheit“ für dich, wie würdest du das erklären? (Turn 8)
- Woran hast du gedacht, als in der Frage das Wort „Zustand“ vorkam? Was bedeutet „Zustand“ für dich? (Turn 9)

-> relevant turns: 2, 3, 4, 5, 6, 8, 9

In [None]:
import os
import json
import pandas as pd
import glob
import re
from helper.helper import parse_interview_with_turns

# load data
turns = pd.read_csv("data/full_analysis.csv")
# only relevant turns (Questions) see: list(enumerate(turns))
# Column 0 is kept alongside the seven question-turn columns.
turns = turns.iloc[:, [0, 71, 73, 74, 75, 76, 77, 79]]

# glob.glob() yields files in a filesystem-dependent order. Sort them so the
# row order of `df` -- and therefore any seeded split built on top of it --
# is identical on every machine and every run.
interview_files = sorted(glob.glob("data/interviews_new/" + "*.json"))

# One parsed DataFrame per interview file.
dfs = [parse_interview_with_turns(file, turns) for file in interview_files]

df = pd.concat(dfs)
# Nothing has been annotated yet at this point.
df['labeled'] = False
df.head()

In [2]:
# Label vocabulary: health dimensions plus the frequency flag.
codes_health = [
    "physical_health",
    "mental_health",
    "daily_functioning",
    "health_unspecific",
    "health_none",
]
codes_freq = ["freq_mentioned", "freq_not_mentioned"]
# "Negative" codes mark the absence of a concept and get no class of their own.
codes_neg = ["health_none", "freq_not_mentioned"]

# Full index space: every code, numbered in declaration order.
codes_combined = [*codes_health, *codes_freq]
code_full = dict(zip(codes_combined, range(len(codes_combined))))

# Split the full mapping into positive and negative codes (full indices kept).
code_pos = {
    name: code_full[name] for name in codes_combined if name not in codes_neg
}
code_neg = {
    name: code_full[name] for name in codes_combined if name in codes_neg
}

# Positive codes renumbered contiguously from 0: name -> class index.
code_pos_new = {name: rank for rank, name in enumerate(code_pos)}

# Translation from a full index to the contiguous positive class index.
full_to_pos = {code_full[name]: rank for name, rank in code_pos_new.items()}

num_classes = len(code_pos)
code_pos_new

{'physical_health': 0,
 'mental_health': 1,
 'daily_functioning': 2,
 'health_unspecific': 3,
 'freq_mentioned': 4}

### Initial Training/Test Split

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 2.8 % of the rows as the test set (fixed seed). Both parts get a
# fresh 0..n-1 index; for the training part this matches the indices used by
# small-text's random_initialization().
train_df, test_df = (
    part.reset_index(drop=True)
    for part in train_test_split(df, random_state=2025, test_size=0.028)
)
print(f"n train: {len(train_df)}")
print(f"n test: {len(test_df)}")

# Bookkeeping columns for the experiments: every training row starts with
# batch id 99 (presumably "not yet assigned to a labeling batch" -- confirm)
# and an empty experiment tag.
train_df = train_df.assign(batch_id=[99] * len(train_df), experiment="")

n train: 5655
n test: 163


### Prepare Argilla for Labeling

In [None]:
import os

import argilla as rg

# Read the connection settings from the environment so the notebook can run
# without credentials being typed into (and committed with) the cell.
# ARGILLA_API_URL / ARGILLA_API_KEY are the variable names the Argilla SDK
# itself documents. The original placeholder values stay as fallbacks, so
# behaviour is unchanged when the variables are not set.
client = rg.Argilla(
    api_url=os.environ.get("ARGILLA_API_URL", "domain"),
    api_key=os.environ.get("ARGILLA_API_KEY", ""),
)

# dataset = client.datasets(name="silicon-new-testdata")

In [None]:
import argilla as rg

# Define labeling schema: what annotators see (fields), what they answer
# (questions) and which bookkeeping values travel with each record (metadata).
settings = rg.Settings(
    guidelines="Active Learning Experiments",
    fields=[
      # Text to be labeled
        rg.TextField(
            name = "childPart",
            title = "Speech Part of the Child",
            use_markdown=False,
        ),
        # Link that opens a blank browser window and writes the record's
        # `interview.text` field into it, for reading on demand.
        rg.CustomField(
            name="interview",
            title="Interview Context",
            template="""
                       <a href="javascript:void(0)" 
                       onclick="window.open('', '_blank').document.write(record.fields.interview.text);">
                       Open Interview Context
                    </a>
                """,
            advanced_mode=True,
            )
    ],
    questions=[
      # Several health dimensions may apply to one answer. The dict pairs each
      # internal code from `codes_health` with a human-readable label.
      rg.MultiLabelQuestion(
          name="health",
          title="What health Dimension does the childs answer refer to?",
          labels=dict(zip(codes_health, ["Physical health", "Mental health", "Daily functioning", "Unspecific", "None"])),
      ),
      # Single-choice question; the codes from `codes_freq` are paired with "Yes" / "No".
      rg.LabelQuestion(
          name="frequency",
          title="The concept 'frequency' is mentioned.",
          labels=dict(zip(codes_freq, ["Yes", "No"])),
      ),
    ],
    metadata=[
        # later we only want to train on the latest batches
        rg.IntegerMetadataProperty(
            name="batch_id",
            min=0,
            max=1000,
            title="batch_id"
        ),
        # Integer id stored with each record (declared range 0..100000), so
        # records can be matched back to their source rows.
        rg.IntegerMetadataProperty(
            name="small-text-id",
            min=0,
            max=100000,
            title="Small Text ID"
        ),
    ],
)

# Create dataset with a label schema
dataset = rg.Dataset(
    name="silicon-new-testdata",
    settings=settings,
)
dataset.create()

## Labeling of Test Records

In [None]:
current_batch_id = 0


def _as_argilla_record(position, row):
    """Wrap one ``test_df`` row as an Argilla record.

    The record carries the child's utterance and the interview text as fields,
    and the current batch id plus the row's index label as metadata.
    """
    return rg.Record(
        fields={
            "childPart": row.get('childPart', ""),
            "interview": {"text": row.get('interview', "")},
        },
        metadata={
            "batch_id": current_batch_id,
            "small-text-id": position,
        },
    )


# One record per test row; the index label becomes "small-text-id".
records = [_as_argilla_record(i, row) for i, row in test_df.iterrows()]

# Upload the records to the Argilla dataset.
dataset.records.log(records)

In [71]:
def get_labels_with_annotators(records):
    """Collect each annotator's positive class indices for every record.

    Args:
        records: iterable of mappings providing the keys
            ``"health.responses.users"``, ``"health.responses"`` and
            ``"frequency.responses"``. The three sequences are read in
            lockstep, one position per annotator.

    Returns:
        A list with one entry per record. Each entry is a list of
        ``{"annotator": user, "labels": [...]}`` dicts, where ``labels`` holds
        the contiguous positive class indices: health labels first, then the
        frequency label. Negative codes (``code_neg``) are dropped.

    Raises:
        KeyError: if a record lacks one of the keys above, or a label is
            neither in ``code_neg`` nor in ``code_full``.
    """

    def _positive_indices(health_choice, freq_choice):
        # Merge both answers, drop negatives, then map name -> full index ->
        # contiguous positive index.
        merged = [*health_choice, freq_choice]
        full_indices = (
            code_full[name] for name in merged if name not in code_neg
        )
        return [full_to_pos[idx] for idx in full_indices if idx in full_to_pos]

    return [
        [
            {"annotator": user, "labels": _positive_indices(health, freq)}
            for user, health, freq in zip(
                record["health.responses.users"],
                record["health.responses"],
                record["frequency.responses"],
            )
        ]
        for record in records
    ]

In [6]:
# Pull only the records whose response has been submitted, as flat dicts.
submitted_only = rg.Filter([("response.status", "==", "submitted")])
status_filter = rg.Query(filter=submitted_only)
submitted_list = dataset.records(status_filter).to_list(flatten=True)

# Restore the order in which the records were logged: index them by their
# "small-text-id" and walk the ids 0..len(test_df)-1, skipping any id that has
# no submitted record.
record_lookup = dict((rec['small-text-id'], rec) for rec in submitted_list)
ordered_records = [
    record_lookup[uid]
    for uid in range(len(test_df))
    if uid in record_lookup
]

In [None]:
# Spot check: show the per-annotator labels parsed for the record at position 15.
get_labels_with_annotators(ordered_records)[15]

In [66]:
# One entry per ordered record; each entry lists {"annotator", "labels"} dicts.
y_expert_test = get_labels_with_annotators(ordered_records)

In [77]:
import pandas as pd

# Explode the per-record annotation lists into one row per (record, annotator).
# Each row is keyed by the record's "small-text-id" -- the test_df index label
# that was logged to Argilla -- and NOT by its position in y_expert_test.
# ordered_records skips records without a submitted response, so positions
# stop matching the test_df index as soon as a single record is missing, and
# labels would be joined onto the wrong rows.
flat = pd.DataFrame([
    {"row": source["small-text-id"], "annotator": d["annotator"], "labels": d["labels"]}
    for source, rec in zip(ordered_records, y_expert_test)
    for d in rec
])

# Pivot to one column per annotator, indexed by the test_df row label.
pivoted = flat.pivot(index="row", columns="annotator", values="labels")

# Left join onto test_df: rows without submitted annotations keep NaN.
test_df = test_df.join(pivoted, how="left")

In [85]:
# List the columns after the join; the annotator columns added by the pivot
# are named by annotator user id.
test_df.columns

Index(['id', 'model', 'interviewer_temperature', 'child_temperature',
       'child_sex', 'child_name', 'child_health_status', 'interviewer_prompt',
       'child_prompt', 'num_questions', 'stop_on_phrase', 'interview_id',
       'speech_index', 'childPart', 'interview', 'label', 'labeled',
       '22cb6f61-3095-4765-a459-55288f72aeaa',
       '55c6dea3-9a8f-4cbb-b2a8-6af210558098'],
      dtype='object')

In [117]:
# Annotator user id (as used for the joined column names) -> readable name.
rename_dict = dict([
    ('22cb6f61-3095-4765-a459-55288f72aeaa', 'peter'),
    ('55c6dea3-9a8f-4cbb-b2a8-6af210558098', 'leo'),
])

# Give the two annotator columns their readable names.
test_df = test_df.rename(rename_dict, axis="columns")

In [120]:
# Flag every test row as labeled (the column was initialised to False at load time).
test_df['labeled'] = True

In [124]:
# Side-by-side table of the utterance and both annotators' labels, written to
# disk (judging by the file name, for an inter-annotator agreement check).
inter_df = test_df[['childPart', 'leo', 'peter']]
inter_df.to_csv(path_or_buf="data/annotator_agree.csv")

## Write new datasets to disk 

In [123]:
# Persist the labeled test set as pickle and as CSV.
test_df.to_pickle(path='data/test_df_new_interviews.pkl')
test_df.to_csv(path_or_buf='data/test_df_new_interviews.csv')
# The training set is stored as pickle only.
train_df.to_pickle(path='data/train_df_new_interviews.pkl')