### **THIS NOTEBOOK LABELS EACH PERSON'S GOALS WITH ONE OF 34 IB-GAP CATEGORIES (OR "Other")**

___
INSTALL PACKAGES
___

In [8]:
# %pip install langchain
# %pip install typing
# %pip install openai
# !pip install openai==0.28
# !pip install langchain typing openai statsmodels seaborn matplotlib
# !pip install langchain_community
# !pip install langchain-openai

In [9]:
# !pip install config

___
IMPORT LIBRARIES
___

In [10]:
import pandas as pd
import warnings
from typing import List
import config
import os
import json

from tqdm import tqdm

from langchain.prompts import PromptTemplate
from langchain.llms import OpenAI
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from langchain_openai import ChatOpenAI

___
LOADING API KEYS
___

In [None]:
# Credentials are taken from the process environment. `setdefault` keeps any
# key that is already exported (shell profile, .env loader, ...) instead of
# overwriting it with the empty placeholder; only missing keys are created.
# Do not paste real secrets into the notebook.
os.environ.setdefault('HUGGINGFACEHUB_API_TOKEN', "")
os.environ.setdefault('OPENAI_API_KEY', "")
os.environ.setdefault("SERPAPI_API_KEY", "")

___
SETTING UP PATHS
___

In [12]:
# Input CSV with one goal per row (see the preview below); the path is relative
# to the notebook's working directory.
run_path = "../data/proc/goal_filtered_long.csv"

In [13]:
# Columns needed downstream. `usecols` stops pandas from parsing the other
# columns in the file; the trailing `[cols]` keeps the column order of this
# list (usecols alone would follow the order in the file).
cols = ["ParticipantIdentifier", "ResultIdentifier", "Answers", "trial_date"]
run_raw = pd.read_csv(run_path, usecols=cols)[cols]

In [14]:
# Preview the first rows of the loaded data.
run_raw.head()

Unnamed: 0,ParticipantIdentifier,ResultIdentifier,Answers,trial_date
0,0151d9f1-1644-4437-805e-02f5e244a690,DAILY_goal1_set,Keep working on psych paper,2023-02-01
1,0151d9f1-1644-4437-805e-02f5e244a690,DAILY_goal1_set,Keep working on psych paper,2023-02-02
2,0151d9f1-1644-4437-805e-02f5e244a690,DAILY_goal1_set,Finish and hand in psych rough draft,2023-02-03
3,0151d9f1-1644-4437-805e-02f5e244a690,DAILY_goal1_set,Practice biology FSG questions,2023-02-04
4,0151d9f1-1644-4437-805e-02f5e244a690,DAILY_goal1_set,Practice biology FSG questions,2023-02-05


In [19]:
# JSON log that records which batches are done and the batch size.
classification_log_path = "../classification_log.json"
# Root folder for outputs; the trailing slash matters because the path below is
# built by plain string concatenation.
results_path = '../results/'
# Folder that receives one CSV per classified batch.
classification_output_path = results_path + "batch_classification_results/"

In [20]:
# Create the batch output folder together with any missing parent folder.
# exist_ok=True lets this cell be re-run when resuming a partially completed
# classification instead of failing with FileExistsError.
os.makedirs(classification_output_path, exist_ok=True)

___
DEFINE DATA STRUCTURE FOR OUTPUT FORMATTING
___

In [21]:
# Output schema given to PydanticOutputParser: the model reply must parse into
# an object with a single `goals` field.
# NOTE(review): documented with comments rather than a class docstring on
# purpose; a model docstring may end up in the JSON schema that is embedded in
# the prompt's format instructions - confirm before adding one.
class Label(BaseModel):
    # One dict per goal; the prompt asks for the goal text as key and the
    # category name as value.
    goals: List[dict[str,str]] = Field(description="List of goals-label pairs")

___
DEFINE FUNCTIONS
___

In [None]:
def get_goal_classification_batch(
    data_dict,
    model_name,
    temperature,
    prompt,           # a LangChain prompt (string or ChatPromptTemplate)
    start_index,
    end_index,
):
    """Ask the chat model to label one slice of goals and return the parsed pairs.

    Parameters
    ----------
    data_dict : list[dict]
        Records that each hold the goal text under the key ``"Answers"``.
    model_name : str
        Model identifier passed to ``ChatOpenAI``.
    temperature : float
        Sampling temperature passed to ``ChatOpenAI``.
    prompt : object
        Prompt with a ``goalList`` variable. An object exposing
        ``format_messages`` is used as a chat prompt, one exposing
        ``format_prompt`` as a string template, and anything else is formatted
        with ``str(prompt).format(goalList=...)``.
    start_index, end_index : int
        Bounds of the slice ``data_dict[start_index:end_index]`` to classify.

    Returns
    -------
    list[dict[str, str]]
        The ``goals`` field of the parsed ``Label`` object. Its length is
        whatever the model returned and is not checked here.

    Notes
    -----
    Exactly one request is sent to the model per call. Errors raised by the
    request or by ``parser.parse`` propagate to the caller.
    """
    # Build the LLM (expects OPENAI_API_KEY in env)
    llm = ChatOpenAI(model=model_name, temperature=temperature)

    # Slice first so only the records of this batch are touched.
    goal_list = [record["Answers"] for record in data_dict[start_index:end_index]]

    parser = PydanticOutputParser(pydantic_object=Label)

    # Choose the formatting route up front. Only prompt formatting depends on
    # the prompt type, so an AttributeError raised during or after the request
    # can no longer be mistaken for "wrong prompt type" and cause a second call.
    is_chat_prompt = hasattr(prompt, "format_messages")
    if is_chat_prompt:
        llm_input = prompt.format_messages(goalList=goal_list)
    elif hasattr(prompt, "format_prompt"):
        llm_input = prompt.format_prompt(goalList=goal_list).to_string()
    else:
        llm_input = str(prompt).format(goalList=goal_list)

    resp = llm.invoke(llm_input)               # returns an AIMessage
    text = resp.content
    if is_chat_prompt:
        # The raw reply is echoed for chat prompts only, as before.
        print(text)

    return parser.parse(text).goals

In [23]:
def write_log(completed_batches, batch_size, classification_log_path):
    """Persist the classification progress to ``classification_log_path``.

    The progress dict is first rendered as indented JSON text, and that text is
    then stored as a single JSON string. The file therefore holds a JSON string
    whose content is itself JSON, and readers have to decode it twice. The file
    is overwritten on every call.
    """
    progress_text = json.dumps(
        {"completed_batches": completed_batches, "batch_size": batch_size},
        indent=4,
    )
    encoded = json.dumps(progress_text, ensure_ascii=False)
    with open(classification_log_path, 'w', encoding='utf-8') as log_file:
        log_file.write(encoded)

In [24]:
def get_required_batches(df, batch_size=25):
    """Return the 1-based batch numbers needed to cover every row of ``df``.

    Parameters
    ----------
    df : sized object
        Anything supporting ``len()`` (e.g. a DataFrame or a list of records).
    batch_size : int, default 25
        Number of rows per batch; must be positive.

    Returns
    -------
    list[int]
        ``[1, 2, ..., ceil(len(df) / batch_size)]``. A trailing partial batch
        gets its own number, an exact multiple produces no extra empty batch,
        and empty input yields ``[]``.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # Ceiling division without floats: -(-a // b) == ceil(a / b) for b > 0.
    n_batches = -(-len(df) // batch_size)
    return list(range(1, n_batches + 1))

In [25]:
def classify_goals(batch_list, completed_batches, batch_size, 
                   df_dict, model_name, temperature, prompt,
                   output_directory, file_suffix, classification_log_path):
    """Label the goals batch by batch, saving each batch and the progress log.

    Parameters
    ----------
    batch_list : iterable[int]
        1-based batch numbers to process; batch ``k`` covers rows
        ``(k - 1) * batch_size`` up to (not including) ``k * batch_size``.
    completed_batches : list[int]
        Batches already done. They are skipped, and the list is extended in
        place as batches finish.
    batch_size : int
        Number of rows per batch.
    df_dict : list[dict]
        Records with an ``"Answers"`` entry; a ``"label"`` entry is written into
        each processed record.
    model_name, temperature, prompt
        Forwarded to ``get_goal_classification_batch``.
    output_directory : str
        Folder receiving ``<file_suffix>_<batch>.csv`` for every batch.
    file_suffix : str
        File name stem used for the batch CSVs.
    classification_log_path : str
        Progress log, rewritten after every finished batch.

    Notes
    -----
    Labels are matched to goals by position. Rows for which the model returned
    no usable label are marked ``"!!!FIX_ME!!!"``.
    """
    for item in tqdm(batch_list):
        if item in completed_batches:
            continue

        end_index = item * batch_size
        start_index = end_index - batch_size
        # The slice is a new list, but it holds the same dict objects as
        # df_dict, so labels written below are visible through df_dict too.
        batch_rows = df_dict[start_index:end_index]
        if not batch_rows:
            # Batch number beyond the data: nothing to send to the model.
            continue

        print(item, start_index, end_index)
        Labels = get_goal_classification_batch(df_dict, 
                                    model_name, temperature, prompt,
                                    start_index, end_index)

        print(Labels)
        print(len(Labels))
        if len(Labels) != len(batch_rows):
            print("Batch", item, "expected", len(batch_rows),
                  "labels but received", len(Labels))

        # Walk the rows (not the labels): every row of the batch ends up with
        # a label, and surplus labels can never index outside the batch.
        for offset, row in enumerate(batch_rows):
            try:
                row['label'] = list(Labels[offset].values())[0]
            except (IndexError, KeyError):
                # Missing label for this position, or an empty dict.
                row['label'] = "!!!FIX_ME!!!"

        batch_file = os.path.join(output_directory, file_suffix + "_" + str(item) + ".csv")
        pd.DataFrame(batch_rows).to_csv(batch_file)

        # Mark the batch done only once its CSV exists, and persist progress
        # immediately so an interrupted run can resume from this point.
        completed_batches.append(item)
        write_log(completed_batches, batch_size, classification_log_path)

    # Kept so the log is (re)written even when no batch needed processing.
    write_log(completed_batches, batch_size, classification_log_path)

___
MAKE THE PROMPT TEMPLATE
___

In [26]:
# Parser whose format instructions (derived from the Label schema) are embedded
# in the prompt below.
parser = PydanticOutputParser(pydantic_object=Label)
# Prompt text sent to the model. {format_instructions} is pre-filled when the
# PromptTemplate is built; {goalList} is supplied per batch.
# NOTE(review): the text says "34 categories" but the list has 35 entries
# (including "Other"), and "INSTRUCTONS" is misspelled. Left untouched here
# because editing the prompt may change the labels the model returns - fix
# deliberately and re-run if desired.
template = """
    Please label each given goal as one of the following 34 categories:
    1. Exercise
    2. MentalPersonal_Health
    3. Medical_Health
    4. Sleep
    5. Alcohol_drug
    6. Online
    7. Phone
    8. Video games
    9. Reading_leisure
    10. SocialMedia
    11. Sports_playing
    12. TV_Streaming
    13. Family
    14. Friends
    15. Partner
    16. Social_life
    17. Hobby
    18. Housework
    19. School
    20. Work
    21. Environment
    22. Culture
    23. Learning
    24. Self-Improvement
    25. Volunteering
    26. Community involvement
    27. Admin
    28. Future_Planning
    29. Finances
    30. Time_Management
    31. Punctuality
    32. Personal_Values
    33. Cooking
    34. Diet
    35. Other

    <FORMAT INSTRUCTONS>
    \n{format_instructions}

    Label should be the category name exactly as given above and not the number
    Each goal should have one and only one label.
    For each goal, return a dictionary with the goal as key and the label as value.
    Do not include any explanations or other text.
    I will send in a list of goals, please return a list of dictionaries of the same length.

    <CLASSIFICATION TASK>
    goals: {goalList}
    """
# Only goalList remains to be filled at call time; the format instructions are
# bound once here.
prompt = PromptTemplate(
    template=template,
    input_variables=["goalList"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

In [259]:
# goalList = [x['Answers'] for x in df_dict][278:500]
# input = prompt.format_prompt(goalList=goalList).to_string()
# output = model(input)
# Label = parser.parse(output).goals

___
LOAD DATA AND CONSTRUCT DATA DICTIONARY
___

In [30]:
# Work on a copy so run_raw itself is not modified by later steps.
goalDF = run_raw.copy()

In [31]:
# Preview the first rows of the working copy.
goalDF.head()

Unnamed: 0,ParticipantIdentifier,ResultIdentifier,Answers,trial_date
0,0151d9f1-1644-4437-805e-02f5e244a690,DAILY_goal1_set,Keep working on psych paper,2023-02-01
1,0151d9f1-1644-4437-805e-02f5e244a690,DAILY_goal1_set,Keep working on psych paper,2023-02-02
2,0151d9f1-1644-4437-805e-02f5e244a690,DAILY_goal1_set,Finish and hand in psych rough draft,2023-02-03
3,0151d9f1-1644-4437-805e-02f5e244a690,DAILY_goal1_set,Practice biology FSG questions,2023-02-04
4,0151d9f1-1644-4437-805e-02f5e244a690,DAILY_goal1_set,Practice biology FSG questions,2023-02-05


In [32]:
# One dict per row; the classification functions read and write these records.
df_dict = goalDF.to_dict('records')
for record in df_dict:
    answer = record['Answers']
    # Non-string answers (e.g. NaN from an empty cell, or a number) are
    # converted to text before stripping. An explicit type check replaces the
    # former bare `except:`, which also swallowed unrelated errors and
    # keyboard interrupts.
    if not isinstance(answer, str):
        answer = str(answer)
    record['Answers'] = answer.strip()

___
GET NUMBER OF REQUIRED BATCHES
___

In [33]:
# Batch numbers (starting at 1) derived from the number of rows in goalDF.
required_batches = get_required_batches(goalDF)
print(required_batches)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 22

___
GET COMPLETED BATCHES AND SIZE
___ 

In [37]:
## GET COMPLETED BATCHES AND BATCH SIZE
completed_batches = []
batch_size = 0
import json
with open(classification_log_path, "r") as file:
    loaded_json = json.load(file)
    loaded_json = json.loads(loaded_json)
    completed_batches = loaded_json['completed_batches']
    batch_size = loaded_json['batch_size']
    print(completed_batches, batch_size)

[] 25


___
VERIFY BATCH NUMBER AND SIZE
___

In [38]:
# # REWRITE CLASSIFICATION LOG WHEN REQUIRED
# write_log([], 25, classification_log_path)

___
SET UP MODEL PARAMETERS
___

In [39]:
# Model settings forwarded to the classification call.
model_name = 'gpt-4o-mini'
temperature = 0.5

# To skip batches by hand, override completed_batches before this point,
# e.g. completed_batches = [x for x in range(1, 836)]

# Batches still to run: every required batch that is not yet marked complete,
# in the original order.
batch_list = list(
    filter(lambda batch_id: batch_id not in completed_batches, required_batches)
)
print(batch_list)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 22

___
RUN CLASSIFICATION TASK ON CURRENT BATCH
___

In [None]:
# Silence UserWarning messages for the remainder of the session.
warnings.simplefilter(action='ignore', category=UserWarning)
# Run every pending batch; CSVs are written as "batch_<n>.csv" in the output folder.
# NOTE(review): the batch size is passed as the literal 25 rather than the
# `batch_size` value read from the log - confirm they are meant to agree.
classify_goals(batch_list, completed_batches, 25, 
                df_dict, model_name, temperature, prompt,
                classification_output_path, 'batch', classification_log_path)

___
CONCATENATE ALL LABELLED BATCHES
___

In [59]:
# Find every "batch_<n>.csv" in the output folder and index it by its batch
# number. Matching on the file name (instead of assuming the files are numbered
# 1..N with N = number of directory entries) means gaps in the numbering or
# unrelated files cannot cause batches to be skipped.
batch_files = {}
for name in os.listdir(classification_output_path):
    stem, extension = os.path.splitext(name)
    prefix, sep, number = stem.partition("_")
    if prefix == "batch" and extension == ".csv" and number.isdecimal():
        batch_files[int(number)] = name

# Batch numbers in ascending order, so rows are concatenated in batch order.
batches = sorted(batch_files)

df_dict = []
for i in batches:
    file_path = os.path.join(classification_output_path, batch_files[i])
    if os.path.isfile(file_path): # checking if it is a file
        df_dict.extend(pd.read_csv(file_path).to_dict('records'))

___
DEDUPLICATE LABELLED GOALS AND SAVE (FILTERING OF '!!!FIX_ME!!!' LABELS IS CURRENTLY DISABLED)
___

In [60]:
# Columns kept in the long-format export, in output order.
cols = [
    'ParticipantIdentifier',
    'trial_date',
    'ResultIdentifier',
    'Answers',
    'label'
]

# Rebuild a frame from the concatenated batch records, keep the first row for
# each ParticipantIdentifier / trial_date / ResultIdentifier combination and
# restrict it to `cols`.
# Rows labelled '!!!FIX_ME!!!' are NOT filtered out at this point.
goalDF = (
    pd.DataFrame.from_dict(df_dict)
    .reset_index(drop=True)
    .drop_duplicates(subset=['ParticipantIdentifier', 'trial_date', 'ResultIdentifier'])
    [cols]
)
goalDF.to_csv('../results/labelled_daily_goals_long.csv', index=False)
goalDF.head()

Unnamed: 0,ParticipantIdentifier,trial_date,ResultIdentifier,Answers,label
0,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-01,DAILY_goal1_set,Keep working on psych paper,School
1,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-02,DAILY_goal1_set,Keep working on psych paper,School
2,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-03,DAILY_goal1_set,Finish and hand in psych rough draft,School
3,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-04,DAILY_goal1_set,Practice biology FSG questions,School
4,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-05,DAILY_goal1_set,Practice biology FSG questions,School


In [61]:
def basic_counts(df):
    """Print participant-level summary counts for ``df``.

    Reports the number of distinct ``ParticipantIdentifier`` values and the
    smallest and largest number of rows any single participant has.
    Nothing is returned.
    """
    participant_ids = df['ParticipantIdentifier']
    rows_per_participant = participant_ids.value_counts()

    report = (
        ("Number of participants:", participant_ids.nunique()),
        ("Min entries per participant:", rows_per_participant.min()),
        ("Max entries per participant:", rows_per_participant.max()),
    )
    for caption, value in report:
        print(caption, value)

# Summarise the deduplicated, labelled goals.
basic_counts(goalDF)

Number of participants: 112
Min entries per participant: 82
Max entries per participant: 166


In [62]:
# Both column lists share the three identifying columns and differ only in the
# value column they carry: the goal text ('Answers') or its category ('label').
goal_cols, label_cols = (
    ['ParticipantIdentifier', 'trial_date', 'ResultIdentifier', value_col]
    for value_col in ('Answers', 'label')
)

In [63]:
# Split the labelled data into two long frames that share the identifying
# columns: one carrying the goal text, one carrying the label.
goalDF_long = goalDF[goal_cols]
labelDF_long = goalDF[label_cols]

In [64]:
# Long -> wide: one row per participant and date, one column per
# ResultIdentifier value holding the goal text. aggfunc='first' takes the first
# value when a participant/date/ResultIdentifier combination has several rows.
goalDF_wide = goalDF_long.pivot_table(
    index=['ParticipantIdentifier', 'trial_date'],
    columns='ResultIdentifier',
    values='Answers',
    aggfunc='first'
).reset_index()

goalDF_wide.head()

ResultIdentifier,ParticipantIdentifier,trial_date,DAILY_goal1_set,DAILY_goal2_set
0,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-01,Keep working on psych paper,Finish chem prelab
1,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-02,Keep working on psych paper,Review bio questions
2,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-03,Finish and hand in psych rough draft,Create quick bio lecture notes
3,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-04,Practice biology FSG questions,Catch up on anthropology readings
4,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-05,Practice biology FSG questions,Continue reading anthropology


In [65]:
# Same reshaping as for the goal text, but the cells hold the labels.
labelDF_wide = labelDF_long.pivot_table(
    index=['ParticipantIdentifier', 'trial_date'],
    columns='ResultIdentifier',
    values='label',
    aggfunc='first'
).reset_index()


# Rename the pivoted columns so they do not clash with the goal-text columns
# of the same name when the two wide frames are merged.
labelDF_wide = labelDF_wide.rename(columns = {
    "DAILY_goal1_set": "DAILY_goal1_label",
    "DAILY_goal2_set": "DAILY_goal2_label"
})

labelDF_wide.head()

ResultIdentifier,ParticipantIdentifier,trial_date,DAILY_goal1_label,DAILY_goal2_label
0,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-01,School,School
1,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-02,School,School
2,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-03,School,School
3,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-04,School,School
4,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-05,School,Reading_leisure


In [66]:
# Number of rows in the deduplicated long-format frame.
len(goalDF)

15575

In [67]:
# Attach the label columns to the goal-text columns, matching on participant
# and date; how='left' keeps every row of goalDF_wide.
labelled_goalDF_wide = goalDF_wide.merge(labelDF_wide, on=['ParticipantIdentifier', 'trial_date'], how='left')
labelled_goalDF_wide.head()

ResultIdentifier,ParticipantIdentifier,trial_date,DAILY_goal1_set,DAILY_goal2_set,DAILY_goal1_label,DAILY_goal2_label
0,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-01,Keep working on psych paper,Finish chem prelab,School,School
1,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-02,Keep working on psych paper,Review bio questions,School,School
2,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-03,Finish and hand in psych rough draft,Create quick bio lecture notes,School,School
3,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-04,Practice biology FSG questions,Catch up on anthropology readings,School,School
4,0151d9f1-1644-4437-805e-02f5e244a690,2023-02-05,Practice biology FSG questions,Continue reading anthropology,School,Reading_leisure


In [70]:
# Save the wide table (goal text plus labels) without the index column.
labelled_goalDF_wide.to_csv('../results/labelled_daily_goals_wide.csv', index=False)

In [71]:
# NOTE(review): this import sits here rather than in the import cell at the top.
import shutil
# Zip the per-batch CSV folder into ../results/batch_classification_results.zip ...
shutil.make_archive('../results/batch_classification_results', 'zip', classification_output_path)
# ... then delete the folder and everything in it. This is destructive: after
# this line the batch CSVs exist only inside the archive.
shutil.rmtree(classification_output_path)

___
- RUN THE WHOLE THING AGAIN AND LOOK AT CONSISTENCY OF LABELLING
- Label in a hierarchical fashion
___