In [1]:
import os
import sys
import json
from tqdm import tqdm
import warnings
from dotenv import load_dotenv

import pandas as pd
import numpy as np

import ast

from langchain.llms import OpenAI
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load settings from a local .env file (python-dotenv).
load_dotenv()

# Make the parent directory importable so the `utils` package imported in the
# next cell can be resolved.
parent_dir = os.path.abspath("..")
sys.path.append(parent_dir)

# Silence UserWarnings, then every other warning category as well.
warnings.simplefilter('ignore', UserWarning)
warnings.simplefilter('ignore')

In [3]:
from utils.filter_utils import(
    filter_for_participant_counts, 
    filter_for_regularity, 
    filter_for_goal_str_length,
    print_participant_general_states,
    align_goal_with_day_they_were_done
)

from utils.llm_utils import (
    embed_with_retries
)

In [125]:
# Labelled daily goals in long format: one row per participant, day and goal slot.
labelled_goals_path = "../data/proc/labelled_daily_goals_long.csv"
df_lab_goals = pd.read_csv(labelled_goals_path)
df_lab_goals.head()

Unnamed: 0,ParticipantIdentifier,trial_date,ResultIdentifier,Answers,label
0,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-01,DAILY_goal1_set,Keep working on psych paper,School
1,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-02,DAILY_goal1_set,Keep working on psych paper,School
2,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-03,DAILY_goal1_set,Finish and hand in psych rough draft,School
3,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-04,DAILY_goal1_set,Practice biology FSG questions,School
4,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-05,DAILY_goal1_set,Practice biology FSG questions,School


In [99]:
# Long table of goal texts with their embeddings (the `emb` column is read as text).
# Dates are parsed and rows ordered by participant, then day.
df = (
    pd.read_csv("../data/proc/embeddings/daily_openai_emb_long.csv")
    .assign(trial_date=lambda frame: pd.to_datetime(frame['trial_date']))
    .sort_values(by=['ParticipantIdentifier', 'trial_date'])
    .reset_index(drop=True)
)

In [100]:
# Preview the first three rows of the sorted embeddings table.
df.head(3)

Unnamed: 0,ParticipantIdentifier,trial_date,ResultIdentifier,Answers,emb
0,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-01,DAILY_goal1_set,Keep working on psych paper,"[-0.002160745905712247, 0.0015396077651530504,..."
1,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-01,DAILY_goal2_set,Finish chem prelab,"[-0.04656229913234711, 0.0007934867171570659, ..."
2,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-02,DAILY_goal1_set,Keep working on psych paper,"[-0.002160745905712247, 0.0015396077651530504,..."


___

In [101]:
# Reshape to one row per participant-day with one column per goal slot.
# aggfunc='first' keeps the first answer when a slot appears more than once.
goal_slot_names = {
    'DAILY_goal1_set': 'goal_1',
    'DAILY_goal2_set': 'goal_2',
}

df_wide_goal = (
    df.pivot_table(
        values='Answers',
        index=['ParticipantIdentifier', 'trial_date'],
        columns='ResultIdentifier',
        aggfunc='first',
    )
    .reset_index(drop=False)
    .rename(columns=goal_slot_names)
)

# "Tomorrow" columns: the same slot taken from the participant's next row.
# NOTE(review): shift(-1) picks the next *recorded* day for the participant; if a
# day is missing this is not the next calendar day -- confirm that is intended.
for goal_slot in ('goal_1', 'goal_2'):
    next_row_goal = df_wide_goal.groupby('ParticipantIdentifier')[goal_slot].shift(-1)
    df_wide_goal[f'tomorrow_{goal_slot}'] = next_row_goal

In [102]:
# Back to long format: one row per (participant, day, goal slot), each carrying
# both of the following row's goals so they can be compared against today's goal.
goal_key_cols = ['ParticipantIdentifier', 'trial_date']
goal_tomorrow_cols = ['tomorrow_goal_1', 'tomorrow_goal_2']

goal_long = (
    df_wide_goal
    .melt(
        id_vars=goal_key_cols + goal_tomorrow_cols,
        value_vars=['goal_1', 'goal_2'],
        var_name='Identifier',
        value_name='today_goal',
    )
    .sort_values(goal_key_cols + ['Identifier'])
    .reset_index(drop=True)
)

# Fixed column order: keys, slot identifier, today's goal, then tomorrow's goals.
df_goal_melted = goal_long[goal_key_cols + ["Identifier", "today_goal"] + goal_tomorrow_cols]

In [103]:
# Preview the first six rows of the melted goal table.
df_goal_melted.head(6)

Unnamed: 0,ParticipantIdentifier,trial_date,Identifier,today_goal,tomorrow_goal_1,tomorrow_goal_2
0,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-01,goal_1,Keep working on psych paper,Keep working on psych paper,Review bio questions
1,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-01,goal_2,Finish chem prelab,Keep working on psych paper,Review bio questions
2,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-02,goal_1,Keep working on psych paper,Finish and hand in psych rough draft,Create quick bio lecture notes
3,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-02,goal_2,Review bio questions,Finish and hand in psych rough draft,Create quick bio lecture notes
4,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-03,goal_1,Finish and hand in psych rough draft,Practice biology FSG questions,Catch up on anthropology readings
5,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-03,goal_2,Create quick bio lecture notes,Practice biology FSG questions,Catch up on anthropology readings


In [152]:
from typing import List
import time
from pydantic import BaseModel, Field
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

# ---- Pydantic models ----

# Structured verdict for a single row: whether each of the two "tomorrow" goals
# matches that row's today_goal.
# NOTE(review): documented with `#` comments rather than a class docstring because
# pydantic presumably copies a docstring into the JSON schema that the output
# parser embeds in the prompt -- confirm before converting to a docstring.
class RowSimilarity(BaseModel):
    # Echo of the row_id sent in the request payload; used to map the verdict
    # back onto the DataFrame row.
    row_id: int = Field(description="Index of the row in the original DataFrame")
    # Verdict for today_goal vs tomorrow_goal_1.
    matches_tomorrow_goal_1: bool = Field(
        description="True if tomorrow_goal_1 is essentially the same goal as today_goal"
    )
    # Verdict for today_goal vs tomorrow_goal_2.
    matches_tomorrow_goal_2: bool = Field(
        description="True if tomorrow_goal_2 is essentially the same goal as today_goal"
    )

# Top-level reply object for one request: a list of per-row verdicts.
class BatchSimilarity(BaseModel):
    # One RowSimilarity entry per row; the order is whatever the model returns.
    rows: List[RowSimilarity]

# Parser that converts the model's reply into a BatchSimilarity object.
parser = PydanticOutputParser(pydantic_object=BatchSimilarity)

# Prompt text. {format_instructions} is filled once from the parser below;
# {rows_json} is supplied on every call.
template = """
You will receive a list of rows. Each row has:
- row_id (an integer)
- today_goal
- tomorrow_goal_1
- tomorrow_goal_2

For each row, decide whether tomorrow_goal_1 and tomorrow_goal_2 are essentially the SAME TASK
as today_goal.

Definition of "almost the same":
- Count as TRUE if the goals describe continuing or completing the **same assignment, project, or concrete task**, even if phrasing is different.
- Examples of TRUE:
  - "Keep working on psych paper" vs "Finish and hand in psych rough draft"
  - "Review bio questions" vs "Practice biology FSG questions"
- Count as FALSE if the goals are clearly about **different tasks or topics**, even if they are from the same course or domain.
  - Example: "Keep working on psych paper" vs "Review bio questions" → different tasks.

Return ONLY valid JSON in the following format (no extra text):

{format_instructions}

Here is the list of rows (as JSON):

{rows_json}
"""

# Chat model used for the labelling.
llm = ChatOpenAI(temperature=0.2, model="gpt-4o")

# Bind the parser's format instructions into the prompt up front.
format_instructions = parser.get_format_instructions()
prompt = PromptTemplate(
    input_variables=["rows_json"],
    partial_variables={"format_instructions": format_instructions},
    template=template,
)

# Pipeline: fill the prompt -> call the model -> parse into BatchSimilarity.
chain = prompt | llm | parser

In [155]:
# -------------------------------------------------
# 3. Working frame for the LLM labelling pass
# -------------------------------------------------
# NOTE(review): this rebinds the notebook-level name `df`, which earlier held the
# long embeddings table read from daily_openai_emb_long.csv.
df = df_goal_melted.copy()

# Empty flag columns; the batch loop fills them in.
for flag_col in ("same_as_tomorrow_1", "same_as_tomorrow_2"):
    df[flag_col] = None

# -------------------------------------------------
# 4. Batching parameters
# -------------------------------------------------
output_path = "goal_similarity_batched_2.csv"  # CSV the batch loop appends to
batch_size = 50                                # rows per request
sleep_s = 3                                    # pause between batches, passed to time.sleep

In [157]:
# Label today-vs-tomorrow goal similarity with the LLM, one request per batch of
# `batch_size` rows, appending every finished batch to `output_path`.

# Positional row at which this run starts.
# NOTE(review): rows before this offset are presumably covered by an earlier run
# saved as goal_similarity_batched.csv (both files are concatenated later in the
# notebook) -- confirm before changing.
start_row = 9201

# (Optional) remove old file if you want a fresh start
if os.path.exists(output_path):
    os.remove(output_path)

goal_cols = ["today_goal", "tomorrow_goal_1", "tomorrow_goal_2"]
verdict_cols = ["row_id", "matches_tomorrow_goal_1", "matches_tomorrow_goal_2"]

for start in tqdm(range(start_row, len(df), batch_size)):
    end = min(start + batch_size, len(df))
    batch = df.iloc[start:end]

    # 4.1 Build the list of rows to send. Missing goals (NaN) become None so the
    # payload is strict JSON (`null`) instead of the non-standard `NaN` token.
    rows_payload = [
        {
            "row_id": int(idx),
            **{col: (None if pd.isna(row[col]) else row[col]) for col in goal_cols},
        }
        for idx, row in batch.iterrows()
    ]
    rows_json = json.dumps(rows_payload, ensure_ascii=False)

    # 4.2 Single GPT call for the whole batch
    result: BatchSimilarity = chain.invoke({"rows_json": rows_json})

    # 4.3 Convert result back into a small DataFrame keyed by row_id.
    # Passing `columns` keeps set_index working even if the model returns no rows.
    out = pd.DataFrame(
        [r.dict() for r in result.rows], columns=verdict_cols
    ).set_index("row_id")

    # The row_ids come from the model, so do not trust them blindly: keep the
    # first verdict per id and discard ids that were not part of this batch.
    out = out[~out.index.duplicated(keep="first")]
    out = out[out.index.isin(batch.index)]

    # Surface rows the model skipped instead of dropping them silently.
    missing = batch.index.difference(out.index)
    if len(missing) > 0:
        tqdm.write(f"rows {start}-{end - 1}: no verdict returned for row_id(s) {list(missing)}")

    if not out.empty:
        # 4.4 Write flags into the main df
        df.loc[out.index, "same_as_tomorrow_1"] = out["matches_tomorrow_goal_1"]
        df.loc[out.index, "same_as_tomorrow_2"] = out["matches_tomorrow_goal_2"]

        # 4.5 Save this processed batch to CSV (append mode).
        # The header is written exactly once, when the file does not exist yet.
        # The previous `start == 0` test was never true because the loop starts
        # at `start_row`, which left the file without a header row.
        batch_with_flags = df.loc[out.index].copy()
        batch_with_flags.to_csv(
            output_path,
            mode="a",
            header=not os.path.exists(output_path),
            index=False,
        )

    # 4.6 Rate limiting between batches
    time.sleep(sleep_s)

100%|██████████| 128/128 [37:53<00:00, 17.76s/it]


In [163]:
# Combine the LLM verdicts from both labelling runs into a single frame `c`.
# The files are addressed relative to the notebook's working directory (the same
# convention as `output_path` and the "../data/..." reads) rather than through a
# machine-specific absolute path.
# NOTE(review): assumes both CSVs start with a header row -- confirm for the
# second file, which is written in append mode by the batch loop.
a = pd.read_csv("goal_similarity_batched.csv")
b = pd.read_csv("goal_similarity_batched_2.csv")

# ignore_index gives the stacked frame a clean 0..n-1 index instead of repeating
# each file's own row numbers.
c = pd.concat([a, b], axis=0, ignore_index=True)
c.head()

Unnamed: 0,ParticipantIdentifier,trial_date,Identifier,today_goal,tomorrow_goal_1,tomorrow_goal_2,same_as_tomorrow_1,same_as_tomorrow_2
0,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-01,goal_1,Keep working on psych paper,Keep working on psych paper,Review bio questions,True,False
1,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-01,goal_2,Finish chem prelab,Keep working on psych paper,Review bio questions,False,False
2,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-02,goal_1,Keep working on psych paper,Finish and hand in psych rough draft,Create quick bio lecture notes,True,False
3,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-02,goal_2,Review bio questions,Finish and hand in psych rough draft,Create quick bio lecture notes,False,False
4,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-03,goal_1,Finish and hand in psych rough draft,Practice biology FSG questions,Catch up on anthropology readings,False,False


___

In [104]:
# Reshape the embeddings to one row per participant-day with one column per goal
# slot, then add the next row's embeddings as "tomorrow" columns.
#
# The long embeddings table is loaded under its own name here. By this point in
# the notebook the name `df` has been rebound to the goal-melted working frame,
# which has no `ResultIdentifier` / `emb` columns, so pivoting `df` only worked
# when the cells were run out of order.
df_emb_long = pd.read_csv("../data/proc/embeddings/daily_openai_emb_long.csv")
df_emb_long['trial_date'] = pd.to_datetime(df_emb_long['trial_date'])
df_emb_long = (
    df_emb_long
    .sort_values(['ParticipantIdentifier', 'trial_date'])
    .reset_index(drop=True)
)

df_wide_emb = df_emb_long.pivot_table(
    index=['ParticipantIdentifier', 'trial_date'],
    columns='ResultIdentifier',
    values='emb',
    aggfunc='first'  # keep the first embedding if a slot appears more than once
).reset_index()

df_wide_emb = df_wide_emb.rename(columns={
    'DAILY_goal1_set': 'goal_1_emb',
    'DAILY_goal2_set': 'goal_2_emb'
})

# create tomorrow columns by shifting up within each participant
df_wide_emb['tomorrow_emb_1'] = df_wide_emb.groupby('ParticipantIdentifier')['goal_1_emb'].shift(-1)
df_wide_emb['tomorrow_emb_2'] = df_wide_emb.groupby('ParticipantIdentifier')['goal_2_emb'].shift(-1)

In [105]:
# Back to long format: one row per (participant, day, goal slot), each carrying
# both of the following row's embeddings.
emb_key_cols = ['ParticipantIdentifier', 'trial_date']
emb_tomorrow_cols = ['tomorrow_emb_1', 'tomorrow_emb_2']

emb_long = (
    df_wide_emb
    .melt(
        id_vars=emb_key_cols + emb_tomorrow_cols,
        value_vars=['goal_1_emb', 'goal_2_emb'],
        var_name='Identifier',
        value_name='today_emb',
    )
    .sort_values(emb_key_cols + ['Identifier'])
    .reset_index(drop=True)
)

# Fixed column order: keys, slot identifier, today's embedding, then tomorrow's.
df_emb_melted = emb_long[emb_key_cols + ["Identifier", "today_emb"] + emb_tomorrow_cols]

In [106]:
# Preview the melted embeddings table.
df_emb_melted.head()

Unnamed: 0,ParticipantIdentifier,trial_date,Identifier,today_emb,tomorrow_emb_1,tomorrow_emb_2
0,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-01,goal_1_emb,"[-0.002160745905712247, 0.0015396077651530504,...","[-0.002160745905712247, 0.0015396077651530504,...","[-0.014762434177100658, 0.02009924314916134, -..."
1,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-01,goal_2_emb,"[-0.04656229913234711, 0.0007934867171570659, ...","[-0.002160745905712247, 0.0015396077651530504,...","[-0.014762434177100658, 0.02009924314916134, -..."
2,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-02,goal_1_emb,"[-0.002160745905712247, 0.0015396077651530504,...","[-0.03221265226602554, -0.010712618008255959, ...","[0.007794945500791073, 0.014498291537165642, -..."
3,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-02,goal_2_emb,"[-0.014762434177100658, 0.02009924314916134, -...","[-0.03221265226602554, -0.010712618008255959, ...","[0.007794945500791073, 0.014498291537165642, -..."
4,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-03,goal_1_emb,"[-0.03221265226602554, -0.010712618008255959, ...","[-0.0068207005970180035, 0.024796752259135246,...","[-0.012282999232411385, 0.016515225172042847, ..."


___

In [107]:
def _cell_to_array(cell):
    """Convert one embedding cell to a NumPy array.

    String cells are parsed with ast.literal_eval first; any other value
    (for example a missing value) is handed to np.array unchanged.
    """
    if isinstance(cell, str):
        return np.array(ast.literal_eval(cell))
    return np.array(cell)


# Apply the conversion to all three embedding columns.
for emb_col in ['today_emb', 'tomorrow_emb_1', 'tomorrow_emb_2']:
    df_emb_melted[emb_col] = df_emb_melted[emb_col].apply(_cell_to_array)

In [114]:
# Row count before rows with missing values are dropped.
len(df_emb_melted)

15576

In [116]:
# Keep only rows in which every column is present; a row missing any value
# (today's or either of tomorrow's embeddings) is removed entirely.
df_emb_melted = df_emb_melted.dropna(axis=0, how="any")

In [117]:
# Preview the table after dropping rows with missing values.
df_emb_melted.head()

Unnamed: 0,ParticipantIdentifier,trial_date,Identifier,today_emb,tomorrow_emb_1,tomorrow_emb_2
0,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-01,goal_1_emb,"[-0.002160745905712247, 0.0015396077651530504,...","[-0.002160745905712247, 0.0015396077651530504,...","[-0.014762434177100658, 0.02009924314916134, -..."
1,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-01,goal_2_emb,"[-0.04656229913234711, 0.0007934867171570659, ...","[-0.002160745905712247, 0.0015396077651530504,...","[-0.014762434177100658, 0.02009924314916134, -..."
2,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-02,goal_1_emb,"[-0.002160745905712247, 0.0015396077651530504,...","[-0.03221265226602554, -0.010712618008255959, ...","[0.007794945500791073, 0.014498291537165642, -..."
3,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-02,goal_2_emb,"[-0.014762434177100658, 0.02009924314916134, -...","[-0.03221265226602554, -0.010712618008255959, ...","[0.007794945500791073, 0.014498291537165642, -..."
4,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-03,goal_1_emb,"[-0.03221265226602554, -0.010712618008255959, ...","[-0.0068207005970180035, 0.024796752259135246,...","[-0.012282999232411385, 0.016515225172042847, ..."


In [118]:
# Summary (row count, non-null count, dtype) of the today_emb column.
df_emb_melted['today_emb'].info()

<class 'pandas.core.series.Series'>
Index: 15352 entries, 0 to 15573
Series name: today_emb
Non-Null Count  Dtype 
--------------  ----- 
15352 non-null  object
dtypes: object(1)
memory usage: 239.9+ KB


In [122]:
def _similarity_to_tomorrow_1(record):
    """Cosine similarity between a row's today_emb and its tomorrow_emb_1."""
    # cosine_similarity is called on 2-D inputs, so each vector becomes a 1 x d matrix.
    today_vec = np.array(record['today_emb']).reshape(1, -1)
    tomorrow_vec = np.array(record['tomorrow_emb_1']).reshape(1, -1)
    # The result is a 1 x 1 matrix; take its only entry.
    return cosine_similarity(today_vec, tomorrow_vec)[0][0]


df_emb_melted['sim_with_tom_1'] = df_emb_melted.apply(_similarity_to_tomorrow_1, axis=1)

In [123]:
def _similarity_to_tomorrow_2(record):
    """Cosine similarity between a row's today_emb and its tomorrow_emb_2."""
    # cosine_similarity is called on 2-D inputs, so each vector becomes a 1 x d matrix.
    today_vec = np.array(record['today_emb']).reshape(1, -1)
    tomorrow_vec = np.array(record['tomorrow_emb_2']).reshape(1, -1)
    # The result is a 1 x 1 matrix; take its only entry.
    return cosine_similarity(today_vec, tomorrow_vec)[0][0]


df_emb_melted['sim_with_tom_2'] = df_emb_melted.apply(_similarity_to_tomorrow_2, axis=1)

In [126]:
# Preview the table with both similarity columns added.
df_emb_melted.head()

Unnamed: 0,ParticipantIdentifier,trial_date,Identifier,today_emb,tomorrow_emb_1,tomorrow_emb_2,sim_with_tom_1,sim_with_tom_2
0,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-01,goal_1_emb,"[-0.002160745905712247, 0.0015396077651530504,...","[-0.002160745905712247, 0.0015396077651530504,...","[-0.014762434177100658, 0.02009924314916134, -...",1.0,0.221521
1,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-01,goal_2_emb,"[-0.04656229913234711, 0.0007934867171570659, ...","[-0.002160745905712247, 0.0015396077651530504,...","[-0.014762434177100658, 0.02009924314916134, -...",0.431675,0.31424
2,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-02,goal_1_emb,"[-0.002160745905712247, 0.0015396077651530504,...","[-0.03221265226602554, -0.010712618008255959, ...","[0.007794945500791073, 0.014498291537165642, -...",0.646013,0.271393
3,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-02,goal_2_emb,"[-0.014762434177100658, 0.02009924314916134, -...","[-0.03221265226602554, -0.010712618008255959, ...","[0.007794945500791073, 0.014498291537165642, -...",0.288576,0.416696
4,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-03,goal_1_emb,"[-0.03221265226602554, -0.010712618008255959, ...","[-0.0068207005970180035, 0.024796752259135246,...","[-0.012282999232411385, 0.016515225172042847, ...",0.238864,0.394535


In [164]:
# Drop the trailing "_emb" so the identifiers read goal_1 / goal_2, the same
# values the goal-text table uses in its Identifier column.
identifier_without_suffix = df_emb_melted["Identifier"].str.replace(r"_emb$", "", regex=True)
df_emb_melted["Identifier"] = identifier_without_suffix
df_emb_melted.head()

Unnamed: 0,ParticipantIdentifier,trial_date,Identifier,today_emb,tomorrow_emb_1,tomorrow_emb_2,sim_with_tom_1,sim_with_tom_2
0,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-01,goal_1,"[-0.002160745905712247, 0.0015396077651530504,...","[-0.002160745905712247, 0.0015396077651530504,...","[-0.014762434177100658, 0.02009924314916134, -...",1.0,0.221521
1,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-01,goal_2,"[-0.04656229913234711, 0.0007934867171570659, ...","[-0.002160745905712247, 0.0015396077651530504,...","[-0.014762434177100658, 0.02009924314916134, -...",0.431675,0.31424
2,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-02,goal_1,"[-0.002160745905712247, 0.0015396077651530504,...","[-0.03221265226602554, -0.010712618008255959, ...","[0.007794945500791073, 0.014498291537165642, -...",0.646013,0.271393
3,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-02,goal_2,"[-0.014762434177100658, 0.02009924314916134, -...","[-0.03221265226602554, -0.010712618008255959, ...","[0.007794945500791073, 0.014498291537165642, -...",0.288576,0.416696
4,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-03,goal_1,"[-0.03221265226602554, -0.010712618008255959, ...","[-0.0068207005970180035, 0.024796752259135246,...","[-0.012282999232411385, 0.016515225172042847, ...",0.238864,0.394535


In [166]:
# Bring trial_date to datetime on both sides so the join keys compare equal.
for keyed_frame in (c, df_emb_melted):
    keyed_frame['trial_date'] = pd.to_datetime(keyed_frame['trial_date'])

# Inner join of the LLM verdicts with the embedding similarities; rows present
# on only one side are dropped.
# NOTE(review): the recorded outputs show 15352 rows in df_emb_melted before this
# join and 15351 after it, so one row has no counterpart in `c` -- worth checking.
merge_keys = ["ParticipantIdentifier", "trial_date", "Identifier"]
df = pd.merge(c, df_emb_melted, how="inner", on=merge_keys)
df.head()

Unnamed: 0,ParticipantIdentifier,trial_date,Identifier,today_goal,tomorrow_goal_1,tomorrow_goal_2,same_as_tomorrow_1,same_as_tomorrow_2,today_emb,tomorrow_emb_1,tomorrow_emb_2,sim_with_tom_1,sim_with_tom_2
0,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-01,goal_1,Keep working on psych paper,Keep working on psych paper,Review bio questions,True,False,"[-0.002160745905712247, 0.0015396077651530504,...","[-0.002160745905712247, 0.0015396077651530504,...","[-0.014762434177100658, 0.02009924314916134, -...",1.0,0.221521
1,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-01,goal_2,Finish chem prelab,Keep working on psych paper,Review bio questions,False,False,"[-0.04656229913234711, 0.0007934867171570659, ...","[-0.002160745905712247, 0.0015396077651530504,...","[-0.014762434177100658, 0.02009924314916134, -...",0.431675,0.31424
2,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-02,goal_1,Keep working on psych paper,Finish and hand in psych rough draft,Create quick bio lecture notes,True,False,"[-0.002160745905712247, 0.0015396077651530504,...","[-0.03221265226602554, -0.010712618008255959, ...","[0.007794945500791073, 0.014498291537165642, -...",0.646013,0.271393
3,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-02,goal_2,Review bio questions,Finish and hand in psych rough draft,Create quick bio lecture notes,False,False,"[-0.014762434177100658, 0.02009924314916134, -...","[-0.03221265226602554, -0.010712618008255959, ...","[0.007794945500791073, 0.014498291537165642, -...",0.288576,0.416696
4,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-03,goal_1,Finish and hand in psych rough draft,Practice biology FSG questions,Catch up on anthropology readings,False,False,"[-0.03221265226602554, -0.010712618008255959, ...","[-0.0068207005970180035, 0.024796752259135246,...","[-0.012282999232411385, 0.016515225172042847, ...",0.238864,0.394535


In [167]:
# Row count of the merged table.
len(df)

15351

In [171]:
# Columns exported to disk: keys, goal texts, LLM verdicts and cosine
# similarities. The embedding vectors themselves are left out.
cols = [
    "ParticipantIdentifier",
    "trial_date",
    "Identifier",
    "today_goal",
    "tomorrow_goal_1",
    "tomorrow_goal_2",
    "same_as_tomorrow_1",
    "same_as_tomorrow_2",
    "sim_with_tom_1",
    "sim_with_tom_2"
]

# Written relative to the notebook's working directory, like the "../data/proc"
# inputs above, so the notebook does not depend on one user's home directory.
similarity_out_path = "../data/proc/similarity/daily_similarity.csv"

# Create the target folder on first use so the export does not fail on a fresh checkout.
os.makedirs(os.path.dirname(similarity_out_path), exist_ok=True)

df[cols].to_csv(similarity_out_path, index=False)