In [1]:
from dotenv import dotenv_values

import pandas as pd
import json
import jsonlines
from pathlib import Path
import ast
import numpy as np

from mrcad import Design, Drawing, Line, Arc, Circle

In [2]:
import sys
import os

# Put the absolute path of the parent directory on the module search path so
# that modules located there can be imported by the statements that follow
# (presumably the `agents` package lives there — confirm against the repo layout).
sys.path.append(os.path.abspath(".."))

from agents.editing_actions import (
    EditExecution,
    RemoveCurve,
    MoveCurve,
    DeletePoint,
    MovePoint,
    MakeCurve,
)

from data_conversion_utils import (
    get_design_from_record,
    get_strokes_from_record,
    get_edit_actions_from_record,
)

In [3]:
# Exclusive upper bound on a trial's last recorded `distance` value; trials whose
# final distance is below this (and whose round numbers are 1..n in order) are
# flagged as verified further down in this notebook.
DISTANCE_THRESHOLD = 0.2

In [4]:
# Read settings from the .env file one directory above the notebook. The keys
# "DATA_DIR" and "DATAFRAME_FILE" are looked up from this mapping later on.
ENV = dotenv_values("../.env")

In [5]:
# Load the raw records CSV named in the .env settings and expose the
# "trialId_sp" column under the shorter name "trialId" used by later cells.
raw_csv_path = Path(ENV["DATA_DIR"]) / ENV["DATAFRAME_FILE"]
df = pd.read_csv(raw_csv_path).rename(columns={"trialId_sp": "trialId"})

In [6]:
# How each column is reduced when the rows of one trial are collapsed into a
# single row: "first" keeps the first value of the group, `list` collects every
# value of the group. The order of entries fixes the output column order.
trial_aggregations = dict(
    [
        ("trialId", "first"),
        ("text", list),
        ("targetId", "first"),
        ("target", "first"),
        ("dyadId", "first"),
        ("trialNum", "first"),
        ("roundNum", list),
        ("prevJsGeometries_li", list),
        ("jsGeometries", list),
        ("strokes", list),
        ("distance", list),
        ("experiment_subset", "first"),
        ("actions", list),
        ("prevActions", list),
    ]
)

# Drop practice trials, then sort by round number before grouping so that the
# list-valued columns are assembled from rows in roundNum order.
non_practice_rows = df[~df["practice_sp"]]
rows_by_round = non_practice_rows.sort_values("roundNum")
consolidated_df = rows_by_round.groupby("trialId").agg(trial_aggregations)

In [7]:
def _is_verified(trial):
    """Decide whether one consolidated trial row counts as verified.

    A row qualifies when the last entry of its `distance` list is strictly
    below DISTANCE_THRESHOLD and its `roundNum` list equals 1, 2, ..., n for
    its own length n (no missing or repeated rounds).
    """
    return trial.distance[-1] < DISTANCE_THRESHOLD and trial.roundNum == list(
        range(1, len(trial.roundNum) + 1)
    )


consolidated_df["verified"] = consolidated_df.apply(_is_verified, axis=1)

In [8]:
def _fresh_context_design(context):
    """Build a new Design from one round's serialized context.

    The literal string "[]" yields an empty Design; any other value is parsed
    with ast.literal_eval and passed to get_design_from_record. A separate
    object is constructed on every call, so callers never share an instance.
    """
    if context == "[]":
        return Design(curves=[])
    return get_design_from_record(ast.literal_eval(context))


def _build_round(round_num, text, strokes, context, execution, actions, prevActions):
    """Assemble the JSON-ready dictionary describing a single round.

    The pieces are computed in the same order in which they appear in the
    resulting dictionary, and every design is re-parsed from its serialized
    form at each point of use rather than being reused.
    """
    context_json = _fresh_context_design(context).model_dump(mode="json")

    instruction = {
        # Non-string text values are replaced by an empty string.
        "text": text if isinstance(text, str) else "",
        "drawing": {"splines": get_strokes_from_record(ast.literal_eval(strokes))},
    }

    execution_json = get_design_from_record(ast.literal_eval(execution)).model_dump(
        mode="json"
    )

    run_edits = EditExecution.execute
    starting_design = _fresh_context_design(context)
    # Only the actions beyond the first len(prevActions) entries are used.
    round_actions = ast.literal_eval(actions)[len(ast.literal_eval(prevActions)) :]
    edit_actions = get_edit_actions_from_record(
        round_actions,
        _fresh_context_design(context),
        get_design_from_record(ast.literal_eval(execution)),
    )
    edit_execution_json = run_edits(starting_design, edit_actions).model_dump(
        mode="json"
    )

    return {
        "round_num": round_num,
        "context": context_json,
        "instruction": instruction,
        "execution": {"design": execution_json},
        "edit_execution": edit_execution_json,
    }


def _build_trajectory(trial):
    """Turn one consolidated trial row into its trajectory dictionary.

    Trial-level identifiers are copied from the row, the target is taken from
    the "design" entry of the parsed `target` value, and one round dictionary
    is produced per aligned entry of the row's list-valued columns.
    """
    trial_id = trial["trialId"]
    target_id = trial["targetId"]
    target_design = ast.literal_eval(trial["target"])["design"]
    dyad_id = trial["dyadId"]
    trial_num = trial["trialNum"]

    per_round_fields = zip(
        trial["roundNum"],
        trial["text"],
        trial["strokes"],
        trial["prevJsGeometries_li"],
        trial["jsGeometries"],
        trial["actions"],
        trial["prevActions"],
    )
    rounds = [_build_round(*fields) for fields in per_round_fields]

    return {
        "trial_id": trial_id,
        "target_id": target_id,
        "target": target_design,
        "dyad_id": dyad_id,
        "trial_num": trial_num,
        "rounds": rounds,
    }


consolidated_df["trajectory"] = consolidated_df.apply(_build_trajectory, axis=1)

In [9]:
# Number of distinct trials, targets and dyads within each experiment subset.
subset_groups = consolidated_df.groupby("experiment_subset")
subset_groups[["trialId", "targetId", "dyadId"]].nunique()

Unnamed: 0_level_0,trialId,targetId,dyadId
experiment_subset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
coverage,2684,2614,441
eval,3094,874,517


In [10]:
# Write the trajectories of verified "coverage" trials, one JSON object per line.
coverage_verified_mask = (
    consolidated_df["experiment_subset"] == "coverage"
) & consolidated_df["verified"]
coverage_verified_path = Path(ENV["DATA_DIR"]) / "coverage_verified.jsonl"
consolidated_df.loc[coverage_verified_mask, "trajectory"].to_json(
    coverage_verified_path, orient="records", lines=True
)

In [11]:
# Write the trajectories of "coverage" trials that are not verified, one JSON
# object per line.
coverage_unverified_mask = (
    consolidated_df["experiment_subset"] == "coverage"
) & ~consolidated_df["verified"]
coverage_unverified_path = Path(ENV["DATA_DIR"]) / "coverage_unverified.jsonl"
consolidated_df.loc[coverage_unverified_mask, "trajectory"].to_json(
    coverage_unverified_path, orient="records", lines=True
)

In [12]:
# Write the trajectories of "eval" trials that are not verified, one JSON
# object per line.
eval_unverified_mask = (
    consolidated_df["experiment_subset"] == "eval"
) & ~consolidated_df["verified"]
eval_unverified_path = Path(ENV["DATA_DIR"]) / "eval_unverified.jsonl"
consolidated_df.loc[eval_unverified_mask, "trajectory"].to_json(
    eval_unverified_path, orient="records", lines=True
)

In [13]:
# Keep only the verified trials of the "eval" subset.
is_eval_subset = consolidated_df["experiment_subset"] == "eval"
eval_verified_df = consolidated_df[is_eval_subset & consolidated_df["verified"]]

# A target counts as completed once at least three distinct verified trials
# exist for it.
trial_counts = eval_verified_df.groupby("targetId")["trialId"].nunique()
has_enough_trials = trial_counts >= 3
completed_targets = trial_counts.loc[has_enough_trials].index.tolist()

In [14]:
# Write the verified "eval" trajectories whose target is in completed_targets,
# one JSON object per line.
on_completed_target = eval_verified_df["targetId"].isin(completed_targets)
eval_complete_path = Path(ENV["DATA_DIR"]) / "eval_verified_complete.jsonl"
eval_verified_df.loc[on_completed_target, "trajectory"].to_json(
    eval_complete_path, orient="records", lines=True
)

In [15]:
# Write the verified "eval" trajectories whose target is NOT in
# completed_targets, one JSON object per line.
off_completed_target = ~eval_verified_df["targetId"].isin(completed_targets)
eval_incomplete_path = Path(ENV["DATA_DIR"]) / "eval_verified_incomplete.jsonl"
eval_verified_df.loc[off_completed_target, "trajectory"].to_json(
    eval_incomplete_path, orient="records", lines=True
)