# A behavioural analog to BCI training : results

Statistics and simple plots of the results of the experiment (technical test). 

This notebook does not figure much computational modeling (yet).


Let's start by importing the necessary packages as well as the data : 

## 1. Loading the data from each task !
A few plots to show how the participants demographics compared.

In [43]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

import plotly as pltly
import plotly.express as px
import plotly.graph_objects as go

# + local functions : 
from database_handling.database_extract import get_all_subject_data_from_internal_task_id
from utils import remove_by_indices

# We import the data of two separate tasks : 
FULL_DATA = {}

# Import the data from the remote mongodb database & the imported prolific demographics :
internal_task_id = "002"

# Except subjects for predictors :
problematic_subjects_misc = ["5c9cb670b472d0001295f377"]
        # This subject has read the instructions with one submission and ran
        # the actual task with another, rendering statistics computed impossible to 
        # compare, this should be substracted from any statistical models based on
        # instructional data, but can be kept for raw performance plots.
# problematic_subjects_fraudulent =["6595ae358923ce48b037a0dc"]
        # This subject has very suspicious responses, including always putting both points in the same place
        # and acting as quickly as possible, to be removed from all analysis ?

tasks_results_002 = get_all_subject_data_from_internal_task_id(internal_task_id,override_save=False)
print("Loaded the task results for " + str(len(tasks_results_002)) + " subjects.")

# Each subject in task results has the following entries : 
# TASK_RESULT_FEATURES, TASK_RESULTS_EVENTS, TASK_RESULTS_DATA, TASK_RESULTS,RT_FB
remove_these_subjects = []
for index,entry in enumerate(tasks_results_002):
    subj_dict,_,_,_ = entry
    subj_name = subj_dict["subject_id"]
    if subj_name in problematic_subjects_misc:
        remove_these_subjects.append(index)

tasks_results_002_filtered = remove_by_indices(tasks_results_002,remove_these_subjects)
print(str(len(tasks_results_002_filtered)) + " subjects remaining after removing problematic subjects.")

FULL_DATA[internal_task_id] = tasks_results_002_filtered


# Import the data from the remote mongodb database & the imported prolific demographics :
internal_task_id = "003"
# Study 1 : the std of the gauge is chosen randomly between 0.0 and 0.25
PROLIFIC_STUDY_IDs = ["66f96c31e69227986334a027","66d086503c0a69291c361b67"]
# Except subjects for predictors :
problematic_subjects_misc = ["615c1741d4630b25e6bc1cb9"]
                # This subject had 7 entries in the incomplete database, and likely restarted the task once
                # probably due to a technical error.

tasks_results_0031 = []
for prolific_study_id in PROLIFIC_STUDY_IDs:
    task_results = get_all_subject_data_from_internal_task_id(internal_task_id,prolific_study_id,override_save=False)
    print(" - Loaded the task results for study {} \n    ({} subjects.)".format(prolific_study_id,len(task_results)))
    tasks_results_0031 += task_results
print("Total : {} subjects".format(len(tasks_results_0031)))

# Each subject in task results has the following entries : 
# TASK_RESULT_FEATURES, TASK_RESULTS_EVENTS, TASK_RESULTS_DATA, TASK_RESULTS,RT_FB
remove_these_subjects = []
for index,entry in enumerate(tasks_results_0031):
    subj_dict,_,_,_ = entry
    subj_name = subj_dict["subject_id"]
    if subj_name in problematic_subjects_misc:
        remove_these_subjects.append(index)

tasks_results_0031_filtered = remove_by_indices(tasks_results_0031,remove_these_subjects)
print(str(len(tasks_results_0031_filtered)) + " subjects remaining after removing problematic subjects.")

FULL_DATA["003_1"] = tasks_results_0031_filtered

print([len(s) for s in FULL_DATA.values()])


# Import the data from the remote mongodb database & the imported prolific demographics :
internal_task_id = "003"
# Study 2 : the std of the was 0.025,0.1 and 0.175
PROLIFIC_STUDY_IDs = ["6703ab18d345eaa4893587e0","66f9aee8210357265a5958fc","6703ab1a7ea30557549dc6da"]

# Except subjects for predictors :
problematic_subjects_misc = ["611d60c383f4f70ff4bc99fd", # S2 : Did the task twice 
                             "66a74bdfdcaccdc0703894d5", # Consent revoked 
                            "667d92f2ea5c1542f417285d",
                            "6548f570022275786186ffbd"]
# problematic_subjects_fraudulent =["66bb5c09526e6d80f1146800"]
                # This subject had "eclectic performances" so say the least
                # Left fullscreen twice and missed 4 actions

tasks_results_0032 = []
for prolific_study_id in PROLIFIC_STUDY_IDs:
    task_results = get_all_subject_data_from_internal_task_id(internal_task_id,prolific_study_id,override_save=False)
    print(" - Loaded the task results for study {} \n    ({} subjects.)".format(prolific_study_id,len(task_results)))
    tasks_results_0032 += task_results
print("Total : {} subjects".format(len(tasks_results_0032)))

# Each subject in task results has the following entries : 
# TASK_RESULT_FEATURES, TASK_RESULTS_EVENTS, TASK_RESULTS_DATA, TASK_RESULTS,RT_FB
remove_these_subjects = []
for index,entry in enumerate(tasks_results_0032):
    subj_dict,_,_,_ = entry
    subj_name = subj_dict["subject_id"]
    if subj_name in problematic_subjects_misc:
        remove_these_subjects.append(index)

tasks_results_0032_filtered = remove_by_indices(tasks_results_0032,remove_these_subjects)
print(str(len(tasks_results_0032_filtered)) + " subjects remaining after removing problematic subjects.")

FULL_DATA["003_2"] = tasks_results_0032_filtered

print([len(s) for s in FULL_DATA.values()])

Loaded the task results for 90 subjects.
89 subjects remaining after removing problematic subjects.
 - Loaded the task results for study 66f96c31e69227986334a027 
    (16 subjects.)
 - Loaded the task results for study 66d086503c0a69291c361b67 
    (40 subjects.)
Total : 56 subjects
55 subjects remaining after removing problematic subjects.
[89, 55]
 - Loaded the task results for study 6703ab18d345eaa4893587e0 
    (49 subjects.)
 - Loaded the task results for study 66f9aee8210357265a5958fc 
    (50 subjects.)
 - Loaded the task results for study 6703ab1a7ea30557549dc6da 
    (50 subjects.)
Total : 149 subjects
145 subjects remaining after removing problematic subjects.
[89, 55, 145]


In [44]:

for task_id,task_results in FULL_DATA.items():
    # Warning, the questions asked may vary based on the task ID !
    print("According to you, what was the best strategy in order to control the gauge level ? In retrospect, what other strategy should you have attempted ? (Try to answer as precisely as possible.)")
    print("__________________________________________________________________________")
    for subject_results in task_results:
        subject_dict,trial_data,events,fb_rtv = subject_results
        print("-> " + subject_dict["mandatory_q2"])
        # print("-> " + subject_dict["Nationality"])
    print()
    print("Did you notice any technical issues (buttons not working, graphical bugs, etc.) ? If yes, how did they appear ?")
    print("__________________________________________________________________________")
    for subject_results in task_results:
        subject_dict,trial_data,events,fb_rtv = subject_results
        print("-> " + subject_dict["optional_q1"])
    print() 
    print("The instructions were meant to help you understand the task before it started. How clear did you find them ? What would you change to make them clearer ?")
    print("__________________________________________________________________________")
    for subject_results in task_results:
        subject_dict,trial_data,events,fb_rtv = subject_results
        print("-> " + subject_dict["optional_q2"])
    print() 
    print("Do you think there was a logic behind how the gauge responded ? How hard / frustrating did you find the task ? Other remarks ?")
    print("__________________________________________________________________________")
    for subject_results in task_results:
        subject_dict,trial_data,events,fb_rtv = subject_results
        print("-> " + subject_dict["optional_q3"])
    print() 

According to you, what was the best strategy in order to control the gauge level ? In retrospect, what other strategy should you have attempted ? (Try to answer as precisely as possible.)
__________________________________________________________________________
-> 1st click on the bottom left corner, 2nd click on the top right corner
-> The strategy that i found was putting the points at the high level of the gray screen. I think the gauge level increases when the points are higher than the current level. I should have tried more what I mentioned last, because now that im thinking about it sometimes i didn't do it. In the beginning instead, i didnt understand at all how it worked. after a bunch of trials i figured out in order for it to be full my points should be higher. 
-> to start low and move higher each step
-> I tried to check all the spaces in the grey area. 
After that, I tried to notice when the gauge level went up accordingly to where I clicked.
If it went up, I repeat thos

## 2. Get the agents 
Load the LLM agents that will attempt to rate the text answers of our participants.


We used LM-studio and mounted an empirically chosen agent (Qwen2.5-Coder-32B-Instruct-GGUF/qwen2.5-coder-32b-instruct-q2_k.gguf) sourced from HuggingFace.

Let's check that it works !

In [45]:
from openai import OpenAI


# Connect to LM Studio
client = OpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio")

completion = client.chat.completions.create(
  model="model-identifier",
  messages=[
    {"role": "system", "content": "You are an impartial agent tasked with classifying text answers in a cognitive behavioural task. You live in a Jupyter notebook. You always answer in rhymes."},
    {"role": "user", "content": "Introduce yourself in 4 paragraphs."}
  ],
  temperature=0.7,
)

print(completion.choices[0].message.content)

In lines of code and cells I dwell,
A Jupyter notebook, a digital hell,
Where thoughts and data intertwine,
Like leaves that dance in autumn's line.

I classify texts with rhyme and grace,
Crafting responses in rhythmic space,
Through the lens of cognitive tasks,
Decoding minds where thoughts clash.

With every run and cell update,
My task is to provide feedback,
On paths of thought, behavior, and mood,
Guiding users through their mental good.

So here I stand, in code's embrace,
Ready to rhyme, to classify and embrace,
The journey of the mind, its highs and lows,
In a Jupyter notebook, where thoughts bestow.


## 3. Asking the real questions !

Let's indicate a set of questions asked to the subjects and how which categories we want our LLM to classify the subjects into :

In [46]:
def llm_request_message(question,subject_answer) :
    
    _general_role = "You are an impartial assistant tasked with classifying subject text responses from a behavioural experiment."
    _experiment_general = "The subjects of the experiment performed actions by placing two points (or arrows) on an unannotated grey screen. They had to learn how to control the level of a gauge using these points."
    _experiment_general_2 =  "The strategy to control the gauge using the points was ambiguous by design. Subjects had to find it by themselves through trial and error."
    _experiment_general_3 = "The gauge was actually controlled by the angle between the horizontal and the vector draw by the points. A 45° angle was optimal."
    _experiment_general_4 = "This means that the optimal strategy was to place the second point should be placed on the upper-right compared to the first point."
    _experiment_general_5 = "Placing the first point on the bottom-left corner and the second point on the top right corner also worked."
    _question = "At the end of the experiment, we asked subjects the following question : **{}**.".format(question["prompt"])
    _task = "Your job is to classify the answer of the participant into one of the following categories :"    
    _categories = "\n".join([cat["letter"] + ". " + cat["content"] for cat in question["categories"]])
    _constraint = "Please only answer using the letter corresponding the the classification of the participant."
    system_directive = "\n".join([_general_role,_experiment_general,_experiment_general_2,_experiment_general_3,_experiment_general_4,_experiment_general_5,_question,_task,_categories,_constraint])
    
    _answer = "Here is the answer of the participant : **{}**".format(subject_answer)
    
    messages=[
            {"role": "system", "content": system_directive},
            {"role": "user", "content": _answer}
        ]
    return messages


question_1 = {
        "prompt":"According to you, what was the best strategy in order to control the gauge level ? In retrospect, what other strategy should you have attempted ? (Try to answer as precisely as possible.)",
        "categories":[
            {"letter" : "A", "content" : "The subject did not provide any answer to the question / The answer was off-topic."},
            {"letter" : "B", "content" : "The subject found the optimal strategy."},
            {"letter" : "C", "content" : "The subject found only part of the optimal strategy (e.g. placing points on top of each other, drawing an horizontal line, etc.)."},
            {"letter" : "D", "content" : "The subject thought he/she found the correct strategy but was mistaken."},
            {"letter" : "E", "content" : "The subject did not try any strategy"},
        ],
        "dict_key" : "mandatory_q2"
    }


question_2 = {
        "prompt":"Did you notice any technical issues (buttons not working, graphical bugs, etc.) ? If yes, how did they appear ?",
        "categories":[
            {"letter" : "A", "content" : "The subject did not provide any answer to the question / The answer was off-topic."},
            {"letter" : "B", "content" : "The subject did not encounter any issue."},
            {"letter" : "C", "content" : "The subject encountered only minor technical issues : display delays, lag, stuttering."},
            {"letter" : "D", "content" : "The subject encountered serious issues but only affected a limited amount of data (e.g. only in a few actions)."},
            {"letter" : "E", "content" : "The subject encountered catastrophic issues that impacted the gathering of meaningful data."},
        ],
        "dict_key" : "optional_q1"
    }

question_3 = {
        "prompt":"The instructions were meant to help you understand the task before it started. How clear did you find them ? What would you change to make them clearer ?",
        "categories":[
            {"letter" : "A", "content" : "The subject did not provide any answer to the question / The answer was off-topic."},
            {"letter" : "B", "content" : "The subject fully understood the instructions."},
            {"letter" : "C", "content" : "The subject partially understood the instructions."},
            {"letter" : "D", "content" : "The subject struggled to understand the instructions."},
            {"letter" : "E", "content" : "The subject did not understand the instructions at all."},
        ],
        "dict_key" : "optional_q2"
    }

question_4 = {
        "prompt":"Do you think there was a logic behind how the gauge responded ? How hard / frustrating did you find the task ? Other remarks ?",
        "categories":[
            {"letter" : "A", "content" : "The subject did not provide any answer to the question / The answer was off-topic."},
            {"letter" : "B", "content" : "The subject believed there was a logic behind how the gauge behaved and found it."},
            {"letter" : "C", "content" : "The subject believed there was a logic behind how the gauge behaved but did not manage to find it."},
            {"letter" : "D", "content" : "The subject believed there was a slight logic behind how the gauge behaved but with significant erratic behaviour."},
            {"letter" : "E", "content" : "The subject did not find any logic in the gauge behaviour."},
        ],
        "dict_key" : "optional_q3"
    }

question_5 = {
        "prompt":"Do you think there was a logic behind how the gauge responded ? How hard / frustrating did you find the task ? Other remarks ?",
        "categories":[
            {"letter" : "A", "content" : "The subject did not provide any answer to the question."},
            {"letter" : "B", "content" : "The task was very easy to the subject."},
            {"letter" : "C", "content" : "The task was moderately easy to the subject."},
            {"letter" : "D", "content" : "The task was initially hard but then became easy when the subject found out the correct strategy."},
            {"letter" : "E", "content" : "The task was very hard to the subject."},
        ],
        "dict_key" : "optional_q3"
    }

question_6 = {
        "prompt":"Do you think there was a logic behind how the gauge responded ? How hard / frustrating did you find the task ? Other remarks ?",
        "categories":[
            {"letter" : "A", "content" : "The subject did not provide any answer to the question."},
            {"letter" : "B", "content" : "The subject found the task funny / entertaining / interesting."},
            {"letter" : "C", "content" : "The subject reported no strong opinion on the task."},
            {"letter" : "D", "content" : "The subject found the task boring."},
            {"letter" : "E", "content" : "The subject found the task a bit frustrating."},
            {"letter" : "F", "content" : "The subject found the task very frustrating."},
        ],
        "dict_key" : "optional_q3"
    }

classified_text_answers = {}
for i,question in enumerate([question_1,question_2,question_3,question_4,question_5,question_6]):
        
    question_results = {}
    for task_id,task_results in FULL_DATA.items():
        question_results[task_id] = []

        for k,subject_results in enumerate(task_results):
            print(k)
            subject_dict,trial_data,events,fb_rtv = subject_results
            
            subject_answer = subject_dict[question["dict_key"]]
            # print("-> " + subject_dict[question["dict_key"]])
            
            message = llm_request_message(question,subject_answer)

            completion = client.chat.completions.create(
                model="model-identifier",
                messages=message,
                temperature = 0.1,
            )

            detected_category = completion.choices[0].message.content
            
            question_results[task_id].append(detected_category)
        print(question_results[task_id])
        
        
    classified_text_answers["question_{}".format(i)] = {
            "prompt" : question,
            "results" : question_results
    }

    print(classified_text_answers.keys())
    print(classified_text_answers.values())

0


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
['B', 'D', 'D', 'E', 'B', 'A', 'B', 'E', 'B', 'A', 'D', 'D', 'B', 'C', 'C', 'B', 'D', 'C', 'B', 'C', 'D', 'D', 'B', 'B', 'B', 'D', 'D', 'A', 'D', 'D', 'B', 'B', 'D', 'D', 'B', 'D', 'B', 'C', 'D', 'D', 'D', 'C', 'D', 'D', 'D', 'A', 'B', 'D', 'D', 'A', 'D', 'D', 'B', 'D', 'E', 'A', 'B', 'A', 'B', 'D', 'A', 'E', 'A', 'C', 'A', 'B', 'A', 'A', 'D', 'B', 'D', 'D', 'D', 'C', 'D', 'D', 'D', 'B', 'A', 'A', 'C', 'D', 'D', 'C', 'B', 'A', 'C', 'D', 'D']
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
['D', 'B', 'D', 'D', 'E', 'D', 'C', 'B', 'D', 'A', 'D', 'D', 'D', 'A', 'D', 'D', 'D', 'D', 'C', 'D', 'B', 'A', 'B', 'B', 'A', 'B', 'B', 'B', 'D'

In [49]:
import pickle,sys,os
import pathlib

print(classified_text_answers['question_0']['results'].keys())

save_this = {}
for q_key, q_vals in classified_text_answers.items():
    save_this[q_key] =  {}
    save_this[q_key]["prompt"] =  q_vals["prompt"]
    
    results = q_vals["results"]
    results_dic =  {}
    for t_k,t_vals in results.items():
        print(t_k,t_vals)
        if t_k == "003":
            results_dic["003_2"] = t_vals
        else : 
            results_dic[t_k] = t_vals
    
    save_this[q_key]["results"] =  results_dic

print(save_this)

saveto = os.path.join("results","llm_classifications","29_01.data")
pathlib.Path(saveto).parent.mkdir(parents=True, exist_ok=True) 

with open(saveto, 'wb') as f:
    pickle.dump(save_this, f)
        
with open(saveto, 'rb') as f:
    loaded_dict = pickle.load(f)
    

dict_keys(['002', '003_1', '003_2'])
002 ['B', 'D', 'D', 'E', 'B', 'A', 'B', 'E', 'B', 'A', 'D', 'D', 'B', 'C', 'C', 'B', 'D', 'C', 'B', 'C', 'D', 'D', 'B', 'B', 'B', 'D', 'D', 'A', 'D', 'D', 'B', 'B', 'D', 'D', 'B', 'D', 'B', 'C', 'D', 'D', 'D', 'C', 'D', 'D', 'D', 'A', 'B', 'D', 'D', 'A', 'D', 'D', 'B', 'D', 'E', 'A', 'B', 'A', 'B', 'D', 'A', 'E', 'A', 'C', 'A', 'B', 'A', 'A', 'D', 'B', 'D', 'D', 'D', 'C', 'D', 'D', 'D', 'B', 'A', 'A', 'C', 'D', 'D', 'C', 'B', 'A', 'C', 'D', 'D']
003_1 ['D', 'B', 'D', 'D', 'E', 'D', 'C', 'B', 'D', 'A', 'D', 'D', 'D', 'A', 'D', 'D', 'D', 'D', 'C', 'D', 'B', 'A', 'B', 'B', 'A', 'B', 'B', 'B', 'D', 'B', 'C', 'C', 'B', 'B', 'B', 'A', 'D', 'B', 'D', 'C', 'D', 'E', 'D', 'C', 'D', 'A', 'D', 'E', 'D', 'D', 'A', 'D', 'A', 'E', 'A']
003_2 ['C', 'A', 'C', 'A', 'D', 'E', 'C', 'C', 'B', 'A', 'C', 'D', 'D', 'A', 'A', 'A', 'B', 'D', 'D', 'D', 'E', 'D', 'D', 'D', 'D', 'C', 'B', 'E', 'D', 'D', 'B', 'D', 'C', 'D', 'D', 'A', 'D', 'D', 'D', 'C', 'D', 'D', 'D', 'B', 'D',

In [52]:
print(loaded_dict["question_0"]["results"].keys())
print([len(v) for v in loaded_dict["question_0"]["results"].values()])

dict_keys(['002', '003_1', '003_2'])
[89, 55, 145]
