___

Text containing conditional triggers (“if/when/after [context], I will [behavior]”) predicts higher next-day completion, especially for high-effort goals. Each participant records two free-text goals per day. However, most people do not write explicit conditional triggers. Using an LLM, we can rate each goal for implicit or explicit conditional-trigger phrasing (e.g., “after work”, “before bed”, “tonight”, “on weekends”).

We use a categorical scale of 0, 1, and 2:
- 0 means no implicit or explicit cue or planning-specificity phrasing
- 1 means some implicit or explicit cue or planning-specificity phrasing
- 2 means strong implicit or explicit cue or planning-specificity phrasing
___

In [1]:
import json
import os
import sys
import warnings
from typing import List

import pandas as pd
from dotenv import load_dotenv
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field
from tqdm import tqdm

In [18]:
# Put the parent directory on the import path; the next cell imports the
# local `utils` package, which presumably lives there.
parent_dir = os.path.abspath("..")
sys.path.append(parent_dir)

# Silence UserWarning messages for the rest of the session.
warnings.simplefilter('ignore', category=UserWarning)

# Load variables from a .env file into the process environment.
load_dotenv()

In [19]:
from utils.handle_batches import (
    get_required_batches, write_log
)

from utils.llm_utils import get_batch_classification_by_llm

___
CONFIGURE THE PATHS
___

In [5]:
# Fail early with a readable message when the key is missing: os.getenv()
# returns None in that case, and assigning None into os.environ raises an
# unhelpful "str expected, not NoneType" TypeError.
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in the environment or in the "
        ".env file before running this notebook."
    )
os.environ['OPENAI_API_KEY'] = openai_api_key

In [6]:
# Input: processed self-report table containing the daily goal texts.
self_report_path = "../data/proc/self_report/self_report.csv"

# Log recording which batches are already scored and the batch size used.
classification_log_path = "../data/logs/framing_log.json"

# Output: root results folder and the sub-folder holding one CSV per batch.
results_path = '../results/'
classification_output_path = f"{results_path}batch_conditional_trigger_results/"

___
PREPARE THE DATA DICTIONARY
___

In [7]:
# Identifier columns that are repeated on every melted row.
id_cols = ["ParticipantIdentifier", "trial_date"]
# The two daily goal texts; each becomes its own row in the long table.
goal_cols = ["DAILY_goal1_set", "DAILY_goal2_set"]
cols = id_cols + goal_cols

# Read the self-report table and keep only the columns needed here.
df = pd.read_csv(self_report_path)[cols]

# Wide -> long: one row per (participant, day, goal slot). The original column
# name goes to "ResponseIdentifier" and the goal text to "Answers".
long_df = df.melt(
    id_vars=id_cols,
    value_vars=goal_cols,
    var_name="ResponseIdentifier",
    value_name="Answers",
)

In [8]:
# Alias of the long-format goals table used by the batching cells below.
goalDF = long_df

# One dict per goal row. The scoring function addresses these records by list
# position (batch start/end indices), so rows must not be dropped or reordered.
df_dict = goalDF.to_dict('records')
for record in df_dict:
    # str() leaves strings untouched and converts any non-string value (for
    # example a missing answer) to its text form, so no exception handling is
    # needed and nothing is silently swallowed.
    # NOTE(review): missing answers presumably end up as the literal text
    # "nan" and are sent to the LLM as such - confirm this is intended.
    record['Answers'] = str(record['Answers']).strip()

___
CONFIGURE THE PROMPT
___

In [9]:
# Schema the LLM reply is parsed into. Documented with comments rather than a
# class docstring because a docstring would presumably be copied into the
# generated schema and thereby change the format instructions in the prompt.
class Label(BaseModel):
    # One dict per goal; the prompt asks for key = goal text, value = score.
    goals: List[dict[str,int]] = Field(description="List of goals-[conditional-trigger-or-framing-score] pairs")
    
# Parser built from the Label schema; it also supplies the format instructions inserted into the prompt.
parser = PydanticOutputParser(pydantic_object=Label)

# Prompt text for the rating task. {format_instructions} and {goalList} are
# the two placeholders filled in by the PromptTemplate. The guideline and
# output-format lines state the same 0/1/2 scale as the rubric (they used to
# say 0-10, which contradicted it).
template = """
You are an expert in psychology and goal-setting research.

Rate each goal from **0, 1, and 2** for how much it shows **planning specificity or cue-based phrasing** — 
meaning that the goal contains *any* hint of timing, place, routine, or sequencing.

Do **not** require explicit “if/when/after” language; partial or implicit cues still count.
---

### Scoring Rubric

0 means no implicit or explicit cue  or planning specificity phrasing
1 means some implicit or explicit cue or planning specificity phrasing
2 means strong implicit or explicit cue or planning specificity phrasing
---

**Guidelines:**
- Use **whole numbers only (0, 1, or 2)**.

---

### Output format
{format_instructions}

Return a list of dictionaries:
- key = exact goal text
- value = numeric score (0, 1, or 2)

No other commentary.

---

GOALS:
{goalList}
"""

# Output-format instructions are fixed, so they are bound once here; only the
# list of goals is supplied per call.
format_instructions = parser.get_format_instructions()

prompt = PromptTemplate(
    input_variables=["goalList"],
    partial_variables={"format_instructions": format_instructions},
    template=template,
)


___
CONFIGURE THE MODEL TO BE USED
___

In [10]:
# Configure the model to be used
# NOTE(review): a non-zero temperature presumably lets repeated ratings of the
# same goal differ between runs - confirm this is intended for a rating task.
model_name = 'gpt-4o'
temperature = 0.5
# Chat client handed to the batch classification helper further below.
llm = ChatOpenAI(model=model_name, temperature=temperature)

___
CONFIGURE THE BATCHES
___

In [11]:
# Batch numbers derived from goalDF by the project helper; presumably one
# number per fixed-size slice of rows - confirm against utils.handle_batches.
required_batches = get_required_batches(goalDF)
print(required_batches)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 22

In [12]:
## GET COMPLETED BATCHES AND BATCH SIZE
# Defaults apply when no log exists yet (e.g. the very first run).
completed_batches = []
batch_size = 0

if os.path.exists(classification_log_path):
    with open(classification_log_path, "r") as file:
        loaded_json = json.load(file)

    # The log may hold a JSON-encoded string that itself contains JSON; decode
    # a second time only in that case so a plain JSON object also works.
    if isinstance(loaded_json, str):
        loaded_json = json.loads(loaded_json)

    completed_batches = loaded_json['completed_batches']
    batch_size = loaded_json['batch_size']

print(completed_batches, batch_size)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 22

In [13]:
# Batches still to be scored: every required batch that is not logged as
# completed. A set keeps the membership test cheap for long batch lists.
completed_lookup = set(completed_batches)
batch_list = [item for item in required_batches if item not in completed_lookup]
print(batch_list)

[]


___
DO THE LLM CALLS TO GET THE SCORES
___

In [15]:
# Preview the first three rows of the long-format goals table.
long_df.head(3)

Unnamed: 0,ParticipantIdentifier,trial_date,ResponseIdentifier,Answers
0,0501ba67-3406-4779-aff1-878a0e9f7885,2022-09-30,DAILY_goal1_set,"Study history and psychology, practice the vio..."
1,0501ba67-3406-4779-aff1-878a0e9f7885,2022-10-01,DAILY_goal1_set,study history lectures
2,0501ba67-3406-4779-aff1-878a0e9f7885,2022-10-02,DAILY_goal1_set,Watch a historical movie


In [None]:
def get_conditional_trigger_rating(batch_list, completed_batches, batch_size,
                   df_dict, model_name, temperature, prompt,
                   output_directory, file_suffix, classification_log_path):
    """Score each pending batch of goals with the LLM and save one CSV per batch.

    Batch ``item`` covers ``df_dict[(item - 1) * batch_size : item * batch_size]``.
    Labels returned by ``get_batch_classification_by_llm`` are matched to the
    rows of the batch by position, and the first value of each label dict is
    stored under ``'framing_score'``. Rows for which no usable label came back
    are marked with ``"!!!FIX_ME!!!"`` so they can be corrected later.

    Args:
        batch_list: Batch numbers (1-based) to process.
        completed_batches: Batch numbers already done; skipped here and
            extended in place as batches finish.
        batch_size: Number of records per batch.
        df_dict: List of record dicts; updated in place with the scores.
        model_name: Unused here; the module-level ``llm`` is what gets called.
        temperature: Unused here; see ``model_name``.
        prompt: Prompt template passed to the classification helper.
        output_directory: Folder receiving the per-batch CSV files.
        file_suffix: File name stem; files are ``<file_suffix>_<item>.csv``.
        classification_log_path: Path handed to ``write_log``.

    Returns:
        None. Results are written to disk and into ``df_dict``.
    """
    missing_label = "!!!FIX_ME!!!"

    # Writing the CSV fails if the target folder does not exist yet.
    if output_directory:
        os.makedirs(output_directory, exist_ok=True)

    try:
        for item in tqdm(batch_list):
            if item in completed_batches:
                continue

            end_index = item * batch_size
            start_index = end_index - batch_size
            print(item, start_index, end_index)

            labels = get_batch_classification_by_llm(
                Label,
                df_dict,
                llm, prompt,
                start_index, end_index
            )

            print(labels)
            print(len(labels))

            # Slicing copies the list but not the dicts, so the assignments
            # below still update df_dict. Iterating over the rows (not the
            # labels) keeps surplus labels from spilling into the next batch
            # and guarantees that every row of the batch gets a value.
            batch_rows = df_dict[start_index:end_index]
            for offset, row in enumerate(batch_rows):
                try:
                    row['framing_score'] = list(labels[offset].values())[0]
                except (IndexError, KeyError, AttributeError, TypeError):
                    # Too few labels, an empty dict, or a malformed entry.
                    row['framing_score'] = missing_label

            csv_name = file_suffix + "_" + str(item) + ".csv"
            pd.DataFrame(batch_rows).to_csv(os.path.join(output_directory, csv_name))

            # Only mark the batch as done once its CSV is on disk.
            completed_batches.append(item)
    finally:
        # Record progress even when a batch fails part-way through, so the
        # batches that already finished are not requested again.
        write_log(completed_batches, batch_size, classification_log_path)

In [None]:
# NOTE(review): the batch size is hard-coded to 25 here instead of using the
# batch_size value read from the log - confirm that the two agree.
get_conditional_trigger_rating(
    batch_list=batch_list,
    completed_batches=completed_batches,
    batch_size=25,
    df_dict=df_dict,
    model_name=model_name,
    temperature=temperature,
    prompt=prompt,
    output_directory=classification_output_path,
    file_suffix='batch',
    classification_log_path=classification_log_path,
)

___
CONCATENATE DICTIONARIES AND SAVE OUTPUT
___

In [25]:
# Map batch number -> path for every "batch_<n>.csv" file in the output
# folder. Reading the numbers from the file names (instead of counting the
# directory entries) means gaps or unrelated files cannot drop later batches.
batch_paths = {}
for name in os.listdir(classification_output_path):
    stem, extension = os.path.splitext(name)
    prefix, sep, number = stem.partition("_")
    if prefix != "batch" or extension != ".csv" or not number.isdecimal():
        continue
    # Accept only the canonical spelling written by the scoring step.
    if str(int(number)) != number or int(number) < 1:
        continue
    path = os.path.join(classification_output_path, name)
    if os.path.isfile(path):
        batch_paths[int(number)] = path

# Read the batches in numeric order. The records in df_dict are deliberately
# left untouched so the scoring cell can still be re-run after this one.
batch_frames = [pd.read_csv(batch_paths[number]) for number in sorted(batch_paths)]

if batch_frames:
    labelledDF = pd.concat(batch_frames, ignore_index=True)
else:
    # No batch files yet: fall back to an empty frame.
    labelledDF = pd.DataFrame()

In [28]:
# Columns kept in the exported table; the first three identify a goal entry.
cols = [
    'ParticipantIdentifier',
    'trial_date',
    'ResponseIdentifier',
    'Answers',
    'framing_score',
]
key_cols = cols[:3]

# Keep one row per (participant, day, goal slot), then restrict to `cols`.
unique_goals = labelledDF.drop_duplicates(subset=key_cols)
labelledDF = unique_goals[cols]
labelledDF.head()

Unnamed: 0,ParticipantIdentifier,trial_date,ResponseIdentifier,Answers,framing_score
0,0501ba67-3406-4779-aff1-878a0e9f7885,2022-09-30,DAILY_goal1_set,"Study history and psychology, practice the violin",0.0
1,0501ba67-3406-4779-aff1-878a0e9f7885,2022-10-01,DAILY_goal1_set,study history lectures,0.0
2,0501ba67-3406-4779-aff1-878a0e9f7885,2022-10-02,DAILY_goal1_set,Watch a historical movie,0.0
3,0501ba67-3406-4779-aff1-878a0e9f7885,2022-10-03,DAILY_goal1_set,review,0.0
4,0501ba67-3406-4779-aff1-878a0e9f7885,2022-10-04,DAILY_goal1_set,review,0.0


In [29]:
# Write labelledDF to CSV, omitting the DataFrame index column.
labelledDF.to_csv('../data/proc/daily_goal_framing_long.csv', index= False)