In [1]:
import sys
import os
import warnings
from dotenv import load_dotenv
from openai import OpenAI
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
# Load variables from a local .env file into the process environment.
load_dotenv()
# Silence UserWarning-category warnings for the rest of the session.
warnings.simplefilter("ignore", UserWarning)
# Put the parent directory on the import path; presumably this is what makes
# the `utils` package imported in the next cell resolvable.
sys.path.append(os.path.abspath(".."))

In [3]:
from utils.llm_utils import (
    get_batch_classification_by_llm,
    embed_batch,
    embed_with_retries
)

In [4]:
# Daily goals input (read from the processed-data folder).
daily_path = "../data/proc/goal_filtered_long.csv"

# Weekly goals: one raw CSV per run.
weekly_run1_path, weekly_run2_path = (
    "../data/raw/weekly-goals/run1_weekly_goals.csv",
    "../data/raw/weekly-goals/run2_weekly_goals.csv",
)

# Monthly goals: one raw CSV per run.
monthly_run1_path, monthly_run2_path = (
    "../data/raw/monthly-goals/run1_monthly_goals.csv",
    "../data/raw/monthly-goals/run2_monthly_goals.csv",
)

In [5]:
# Embedding model name passed to get_embeddings.
# NOTE(review): the previous inline note offered this very model as the
# "max quality" alternative, so it was presumably written while a smaller
# model was configured here — confirm which alternative was intended.
MODEL = "text-embedding-3-large"
BATCH_SIZE = 25                   # unique texts per embed_with_retries call; safe default, adjust to 256–512 if inputs are short
MAX_RETRIES = 5                   # retry budget forwarded to embed_with_retries

In [6]:
# Daily goals come from a single file.
daily = pd.read_csv(daily_path)

# Weekly and monthly goals are split across two runs; load each run into its
# own frame (they are combined in a later cell).
weekly_run1, weekly_run2 = (
    pd.read_csv(path) for path in (weekly_run1_path, weekly_run2_path)
)
monthly_run1, monthly_run2 = (
    pd.read_csv(path) for path in (monthly_run1_path, monthly_run2_path)
)

In [7]:
# Stack both runs and drop every row that has a missing value in any column.
# Each run was read with its own default 0..n-1 index, so the stacked frame
# would otherwise carry duplicate index labels (plus gaps left by dropna);
# renumbering keeps later label-based access and assignments unambiguous.
weekly = pd.concat([weekly_run1, weekly_run2]).dropna().reset_index(drop=True)
monthly = pd.concat([monthly_run1, monthly_run2]).dropna().reset_index(drop=True)

In [8]:
# Row counts of the daily, weekly and monthly frames, one per line.
print(len(daily), len(weekly), len(monthly), sep="\n")

15576
1747
466


In [9]:
# Show the first five rows of the daily frame.
daily.head(5)

Unnamed: 0,ParticipantIdentifier,trial_date,ResultIdentifier,Answers
0,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-01,DAILY_goal1_set,Keep working on psych paper
1,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-02,DAILY_goal1_set,Keep working on psych paper
2,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-03,DAILY_goal1_set,Finish and hand in psych rough draft
3,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-04,DAILY_goal1_set,Practice biology FSG questions
4,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-05,DAILY_goal1_set,Practice biology FSG questions


In [10]:
# Number of rows in the daily frame.
print(daily.shape[0])

15576


In [11]:
# Client object handed to get_embeddings (and from there to embed_with_retries).
# NOTE(review): constructed without arguments, so credentials presumably come
# from the environment populated by load_dotenv() earlier — confirm.
client = OpenAI()

In [12]:
def chunked(seq, size):
    """Lazily yield consecutive slices of ``seq`` holding at most ``size`` items.

    The final slice is shorter when ``len(seq)`` is not a multiple of ``size``.
    """
    offsets = range(0, len(seq), size)
    yield from (seq[lo:lo + size] for lo in offsets)

In [13]:
def get_embeddings(df, column, client, model, batch_size, max_retries):
    """Embed every value of ``df[column]`` and return one vector per row.

    Missing values and cells that are empty after stripping whitespace are
    replaced by the placeholder text ``"N/A"``. Each distinct text is passed
    to ``embed_with_retries`` only once; its vector is then copied to every
    row holding that text.

    Assumes ``embed_with_retries`` returns one embedding per input text, in
    input order — TODO confirm against ``utils.llm_utils``.

    Args:
        df: DataFrame that contains ``column``.
        column: Name of the text column to embed.
        client: API client forwarded to ``embed_with_retries``.
        model: Model name forwarded to ``embed_with_retries``.
        batch_size: Number of unique texts per ``embed_with_retries`` call.
        max_retries: Retry budget forwarded to ``embed_with_retries``.

    Returns:
        A list with ``len(df)`` entries in row order; each entry is a list of
        Python floats (values pass through ``float32``).

    Raises:
        RuntimeError: If a batch yields a different number of embeddings than
            texts sent, which would otherwise attach vectors to the wrong
            texts.
    """
    texts_all = (
        df[column]
        .fillna("N/A")             # avoid None
        .astype(str)
        .str.strip()
        .replace('', 'N/A')        # whitespace-only cells get the same placeholder
        .tolist()
    )

    # dict.fromkeys preserves first-occurrence order, so position i in
    # unique_texts corresponds to row i of the embedding matrix built below.
    unique_texts = list(dict.fromkeys(texts_all))
    index_map = {t: i for i, t in enumerate(unique_texts)}

    print(f"Total rows: {len(texts_all)} | Unique texts: {len(unique_texts)}")

    # --- Embed unique texts in batches ---
    unique_embeddings = []
    # The context manager closes the progress bar even if a batch raises.
    with tqdm(total=len(unique_texts), desc="Embedding (unique)") as pbar:
        for batch in chunked(unique_texts, batch_size):
            embs = list(embed_with_retries(batch, model, client, max_retries))
            # index_map relies on exactly one vector per unique text; fail
            # loudly instead of silently shifting vectors onto other texts.
            if len(embs) != len(batch):
                raise RuntimeError(
                    f"Expected {len(batch)} embeddings for the batch, got {len(embs)}"
                )
            unique_embeddings.extend(embs)
            pbar.update(len(batch))

    unique_embeddings = np.asarray(unique_embeddings, dtype=np.float32)
    # Fan the unique vectors back out to the original row order.
    embs_ordered = [unique_embeddings[index_map[t]].tolist() for t in texts_all]
    return embs_ordered

In [14]:
# Embed the daily "Answers" texts and store one vector per row in a new "emb" column.
daily["emb"] = get_embeddings(daily, "Answers", client, MODEL, BATCH_SIZE, MAX_RETRIES)

# (Optional) Persist to disk so you don't re-embed if the kernel restarts, e.g.:
# daily.to_parquet("df_with_emb_goal1.parquet")
# np.save("emb_goal1.npy", np.array(daily["emb"].tolist(), dtype=np.float32))

Embedding (unique): 100%|██████████| 8701/8701 [08:38<00:00, 16.79it/s]


In [15]:
# Sanity check: report the row count and the length of the first row's embedding.
dims = len(daily.iloc[0]["emb"])
print(f"Done. Shape: {len(daily)} x {dims} (embeddings are lists of floats).")

Done. Shape: 15576 x 3072 (embeddings are lists of floats).


In [16]:
# Create the output folder first so a missing directory cannot fail the write
# after the embeddings have already been computed.
os.makedirs("../data/proc/embeddings", exist_ok=True)
daily.to_csv("../data/proc/embeddings/daily_openai_emb_long.csv", index=False)

In [17]:
# Embed the first weekly goal column; one vector per row of `weekly`.
weekly["weekly_emb1"] = get_embeddings(
    weekly,
    "WEEKLY_goal_set1",
    client=client,
    model=MODEL,
    batch_size=BATCH_SIZE,
    max_retries=MAX_RETRIES,
)

Total rows: 1747 | Unique texts: 1301


Embedding (unique): 100%|██████████| 1301/1301 [01:29<00:00, 14.60it/s]


In [18]:
# Embed the second weekly goal column; one vector per row of `weekly`.
weekly["weekly_emb2"] = get_embeddings(
    weekly,
    "WEEKLY_goal_set2",
    client=client,
    model=MODEL,
    batch_size=BATCH_SIZE,
    max_retries=MAX_RETRIES,
)

Total rows: 1747 | Unique texts: 1408


Embedding (unique): 100%|██████████| 1408/1408 [01:32<00:00, 15.29it/s]


In [19]:
# Create the output folder first so a missing directory cannot fail the write
# after the embeddings have already been computed.
os.makedirs("../data/proc/embeddings", exist_ok=True)
weekly.to_csv("../data/proc/embeddings/weekly_openai_emb_wide.csv", index=False)

In [20]:
# Embed the first monthly goal column; one vector per row of `monthly`.
monthly["monthly_emb1"] = get_embeddings(
    monthly,
    "MONTHLY_goal_set1",
    client=client,
    model=MODEL,
    batch_size=BATCH_SIZE,
    max_retries=MAX_RETRIES,
)

Total rows: 466 | Unique texts: 404


Embedding (unique): 100%|██████████| 404/404 [00:24<00:00, 16.52it/s]


In [21]:
# Embed the second monthly goal column; one vector per row of `monthly`.
monthly["monthly_emb2"] = get_embeddings(
    monthly,
    "MONTHLY_goal_set2",
    client=client,
    model=MODEL,
    batch_size=BATCH_SIZE,
    max_retries=MAX_RETRIES,
)

Total rows: 466 | Unique texts: 434


Embedding (unique): 100%|██████████| 434/434 [00:24<00:00, 17.40it/s]


In [22]:
# Create the output folder first so a missing directory cannot fail the write
# after the embeddings have already been computed.
os.makedirs("../data/proc/embeddings", exist_ok=True)
monthly.to_csv("../data/proc/embeddings/monthly_openai_emb_wide.csv", index=False)