### File to preprocess the data of the online-study

In [None]:
import json
import pandas as pd
import numpy as np
import pickle

In [None]:
# dataset import

# Load the raw study export into a nested dict (one top-level entry per study access).
# The encoding is passed explicitly: JSON is UTF-8 by specification, whereas open() would otherwise use the
# locale encoding of the machine, which can garble or reject non-ASCII characters (e.g. on Windows)
with open("OnlineStudy_RawData.json", encoding="utf-8") as jsonData:
    dataset = json.load(jsonData)

#### --- get a dataset with the relevant Study MetaData Info ---

In [None]:
# Build a table with one row of meta information per study access

# keys of the "ExperimentMetaData" page that are not wanted in the overview table
# (defined once instead of being rebuilt in every loop iteration)
ignore = ["taskOrder", "condition", "initScreenProps"]

metaData = {}

for participant in dataset:

    participant_pages = dataset[participant]

    # copy the experiment meta data, skipping the ignored keys and every key that contains "failsMediaCheck"
    participant_info = {key: value for key, value in participant_pages["ExperimentMetaData"].items()
                        if key not in ignore and "failsMediaCheck" not in key}

    # not every entry contains the donation page
    if "DonationOption" in participant_pages:
        participant_info["isDonating"] = participant_pages["DonationOption"]["data"]["isDonating"]

    # add all sociodemographic answers if that page is present
    if "Soziodem" in participant_pages:
        participant_info.update(participant_pages["Soziodem"]["data"])

    metaData[participant] = participant_info

# transpose so that rows are study accesses and columns are the meta data fields
df = pd.DataFrame(metaData).T

# create a "complete study completed column"
# A study counts as completed if it is flagged as completed or if the last completed page is one of the two final
# pages. The comparison result is assigned directly: it is already a pure True/False column (missing values
# compare as False), so no fillna is needed. The previous `df["studyCompleted"].fillna(False, inplace=True)` was
# a chained in-place call on a column selection, which pandas deprecates and which has no effect once
# Copy-on-Write is active
df["studyCompleted"] = ((df["hasCompleted"] == True) | (df["lastCompletedPage"] == "DonationOption")
                        | (df["lastCompletedPage"] == "Con_Mdbf"))

Log some metadata information

In [None]:
# Count how often each situation of interest occurs in the meta data table (one row = one study access)

# every row is one study access
tot_study_open = df.shape[0]
# accesses whose panel ID is the placeholder "undefined", i.e. no valid panel ID
no_panel_id = df.loc[df["panelId"] == "undefined"].shape[0]
# the remaining accesses carry a valid panel ID
valid_panel_id = tot_study_open - no_panel_id
# accesses with Internet Explorer
used_ie = df.loc[df["isInternetExplorer"]].shape[0]
# accesses with Edge
used_edge = df.loc[df["isEdge"]].shape[0]
# accesses with Edge that also completed the study
edge_compl = df.loc[df["isEdge"] & df["studyCompleted"]].shape[0]
# accesses that failed the initial media check (touch or screen too small)
fails_init_media_check = df.loc[df["failsInitialMediaCheck"] == True].shape[0]
# accesses that answered that no mouse is used in the study
no_mouse = df.loc[df["hasNoMouse"] == True].shape[0]
# accesses that completed the study
completed = df.loc[df["studyCompleted"] == True].shape[0]

# print the overview, one line per counter
for label, count in (
        ("Total number of study opens: ", tot_study_open),
        ("Study opens without a panel Id: ", no_panel_id),
        ("Study opens with a panel Id: ", valid_panel_id),
        ("Used Internet Explorer to open the study: ", used_ie),
        ("Used Edge to open the study: ", used_edge),
        ("Finished the study with Edge: ", edge_compl),
        ("Fails initial Media check: ", fails_init_media_check),
        ("Reports using no mouse: ", no_mouse),
        ("Finished the study:", completed),
):
    print(label, count)

#### --- Create a dataset with participants who finished the study and exclude the data of participants who finished the study more than once ---

In [None]:
# Get information about the number of times persons with the same panel id participated
panel_ids = df["panelId"].value_counts()
repetitive_panel_ids = panel_ids[panel_ids > 1]
print("Number of Participants who opened the study more than once: ", len(repetitive_panel_ids))

# create a df that only has unique panel ids
# (df itself is never modified: DataFrame.drop returns a new object every time)
cleaned_df = df

# loop over the repetitive panel ids
for panel_id in repetitive_panel_ids.index:

    # all study accesses of the person that opened the study more than once
    pers = cleaned_df[cleaned_df["panelId"] == panel_id]

    # the accesses in which the study was completed
    have_completed = pers[pers["studyCompleted"] == True]
    index_to_keep = have_completed.index

    # if the study was completed more than once by the same person, only keep the data of the first completion
    if len(have_completed) > 1:
        print(f"Participant with panel id {panel_id} completed the study {len(have_completed)} times")
        # idxmin returns a single row label; wrap it in an Index so that index_to_keep is always a collection of
        # labels (previously len() was applied to the bare label, which only worked because the labels are strings)
        first_completion = pd.to_numeric(have_completed["startTime"]).idxmin()
        index_to_keep = pd.Index([first_completion])

    # if the study was completed, delete every other access of this person
    if len(index_to_keep) > 0:
        ind_to_del = pers.index.drop(index_to_keep)
    # if the study was never completed, keep only the last access
    else:
        ind_to_del = pers.index[:-1]

    cleaned_df = cleaned_df.drop(ind_to_del)

# check if the repetitive ids have been successfully removed
new_panel_ids = cleaned_df["panelId"].value_counts()
new_repetitive_panel_ids = new_panel_ids[new_panel_ids > 1]
print("Number of Participants who opened the study more than once after cleanup: ", len(new_repetitive_panel_ids))

# remove the "undefined" --> be careful if there are completed trials with an undefined panel ID
# indicates that someone messed around with the URL parameter (was not the case in the dataset)
cleaned_df = cleaned_df.drop(cleaned_df[cleaned_df["panelId"] == "undefined"].index)

# explicit comparison so that the selection is a clean boolean mask regardless of the dtype of the column
completed_mask = cleaned_df["studyCompleted"] == True

print("Unique study access: ", len(cleaned_df))
print("Number of unique study finishes: ", len(cleaned_df[completed_mask]))

# save the index values (firebase ids) of the completed studies
ids_completed = cleaned_df[completed_mask].index

### -- Outlier and bad case removal --

Goal: detect participants who did not work on the study properly (and therefore distort the study results)

 strategies to detect bad cases:
 - Timing based detection (clicking through or taking non-desired breaks)
 - Answer based detection (no variance in answers or answers are given randomly)
 - Technical difficulties in the count task
 - No compliance in the count task (no answers given, random answers given, no variance in answers)
 - No compliance in the typing task (task instructions are not followed) --> is checked in a later step

##### create a dataframe with the study duration and study page durations

In [None]:
# practice tasks for which the pure task time is stored in addition to the page time (the page time also
# contains the instruction and the demo trial)
pr_tasks = ["Pr_DragDrop", "Pr_FollowBox", "Pr_PatternTyping", "Pr_PointClick", "Pr_Slider"]

# participant id -> {duration name -> duration}
study_durations = {}

for par in dataset:
    # only the finished studies are of interest
    if par not in ids_completed:
        continue

    pages = dataset[par]
    durations = {}
    # running sums of the raw page durations (converted to minutes once the loop is done)
    total_ms = 0
    condition_ms = 0

    for study_page in pages:
        page = pages[study_page]

        # entries without timing information are skipped
        if "MetaData" not in page:
            continue

        page_ms = page["MetaData"]["pageDuration"]

        # page duration in seconds, and its contribution to the total study time
        durations[study_page + "_duration"] = page_ms / 1000
        total_ms += page_ms

        # practice task: additionally store the time between trial start and task end in seconds
        if study_page in pr_tasks:
            task_data = page["data"]
            durations[study_page + "_Task_duration"] = (task_data["taskEnded"] - task_data["trialStarted"]) / 1000

        # pages of the actual condition (all "Con_" pages except the intro page) add to the condition time
        if "Con_" in study_page and study_page != "Con_Instr":
            condition_ms += page_ms

    # total study duration and condition duration in minutes
    durations["study_duration"] = total_ms / 1000 / 60
    durations["condition_duration"] = condition_ms / 1000 / 60

    study_durations[par] = durations

# one row per participant, one column per duration
study_durations_df = pd.DataFrame(study_durations).T

####  --- 1. Timing based bad case detection ---

In [None]:
# Timing based flags: every check below selects the participants that violate one timing criterion

# --- total study duration ---
# Clicking through the whole study without reading anything (and with knowledge about the study) took about
# 12 minutes in a personal test, so anything faster cannot be an actual participation
MIN_STUDY_MINUTES = 12
too_fast_study = study_durations_df["study_duration"] < MIN_STUDY_MINUTES
short_study_duration = study_durations_df.loc[too_fast_study]
print("Short study duration outliers", len(short_study_duration))

# --- mdbf duration ---
# In personal tests, clicking through the mdbf without reading took about 13-15 seconds; filling it out as fast
# as possible with "correct" answers (and good knowledge about the questions) took about 20 seconds.
# The practice mdbf gets the higher limit because the questionnaire is still new at that point
MIN_MDBF_PRACTICE_SECONDS = 21
MIN_MDBF_CONDITION_SECONDS = 18
too_fast_mdbf_pr = study_durations_df["Pr_Mdbf_duration"] < MIN_MDBF_PRACTICE_SECONDS
too_fast_mdbf_co = study_durations_df["Con_Mdbf_duration"] < MIN_MDBF_CONDITION_SECONDS
short_mdbf_pr = study_durations_df.loc[too_fast_mdbf_pr]
short_mdbf_co = study_durations_df.loc[too_fast_mdbf_co]
print("Co_MDBF duration outliers condition", len(short_mdbf_co),
      "\nPr_MDBF duration outliers practice", len(short_mdbf_pr))

# --- count task duration ---
# The expected count task time was 45 seconds. An average above 65 seconds points to technical difficulties
# (in some cases the loading bar animation had problems "loading") or to an interrupted task (changing browser
# tabs etc.). Both likely change the outcome of the manipulation in a not desired direction, because technical
# difficulties cause frustration and stress independent of the condition
MAX_COUNT_TASK_SECONDS = 65
count_duration_columns = ["Con_Count_DragDrop_duration",
                          "Con_Count_FollowBox_duration",
                          "Con_Count_PatternTyping_duration",
                          "Con_Count_PointClick_duration",
                          "Con_Count_Slider_duration"]

# mean page duration over the five count tasks of each participant
study_durations_df["avg_count_task_duration"] = study_durations_df[count_duration_columns].mean(axis=1)

too_slow_count_avg = study_durations_df["avg_count_task_duration"] > MAX_COUNT_TASK_SECONDS
bad_count_duration = study_durations_df.loc[too_slow_count_avg]

print("Average Count task took too long", len(bad_count_duration))

# the count task of the typing task is additionally checked on its own
too_slow_count_typing = study_durations_df["Con_Count_PatternTyping_duration"] > MAX_COUNT_TASK_SECONDS
bad_count_duration_typing = study_durations_df.loc[too_slow_count_typing]

print("Typing task count task took too long", len(bad_count_duration_typing))


# Collect the ids of all time based outliers (an id may occur several times in this list) and print how many
# distinct participants are affected
timebased_outliers = [*short_study_duration.index, *short_mdbf_pr.index, *short_mdbf_co.index,
                      *bad_count_duration.index, *bad_count_duration_typing.index]

print("Total time based outliers", len(set(timebased_outliers)))

#### --- 2. Questionnaire answers based bad case detection ---

In [None]:
# get cases without variance in the mdbf questionnaires (participants always clicked on the same answer in the MDBF)

# create a df with the mdbf answers: participant id -> {prefixed item name -> answer}
mdbf_answers = {}

for par in dataset:
    # only include the data of the finished studies
    if par not in ids_completed:
        continue

    answers = {}

    # Condition: every item gets the prefix "Con_"
    for item, value in dataset[par]["Con_Mdbf"]["data"].items():
        answers["Con_" + item] = value

    # Practice: every item gets the prefix "Pr_"
    for item, value in dataset[par]["Pr_Mdbf"]["data"].items():
        answers["Pr_" + item] = value

    mdbf_answers[par] = answers

mdbf_answers_df = pd.DataFrame(mdbf_answers).T

# filter the practice and actual condition, calculate the standard deviation of the mdbf answers and get the cases
# that have no standard deviation (= answered all questions equally)
# The patterns are anchored to the prefixes added above, so an item whose own name happens to contain "Pr" or
# "Con" cannot end up in the wrong questionnaire
practice_mdfb = mdbf_answers_df.filter(regex="^Pr_")
practice_mdfb = practice_mdfb.assign(std=practice_mdfb.std(axis=1))
no_variance_pr = practice_mdfb.loc[practice_mdfb["std"] == 0]

print("No variance in the practice mdbf", len(no_variance_pr))

condition_mdfb = mdbf_answers_df.filter(regex="^Con_")
condition_mdfb = condition_mdfb.assign(std=condition_mdfb.std(axis=1))
no_variance_con = condition_mdfb.loc[condition_mdfb["std"] == 0]

# this count belongs to the condition questionnaire (the label previously said "practice")
print("No variance in the condition mdbf", len(no_variance_con))

# a participant without variance in both questionnaires must only be counted once
bad_mdbf_answers = list(set(list(no_variance_pr.index) + list(no_variance_con.index)))
print("Total number of bad mdbf answer cases", len(bad_mdbf_answers))

#### --- 3. Stress manipulation task (count task) answers based bad case detection ---

In [None]:
# Detect bad answer behavior in the count task (stress manipulation was not done properly)

# pages that hold the answer a participant gave for each count task
count_tasks = ['Con_CountAns_DragDrop',
               'Con_CountAns_FollowBox',
               'Con_CountAns_PatternTyping',
               'Con_CountAns_PointClick',
               'Con_CountAns_Slider']

# participant id -> {"condition", and per task: solution, answer, absolute difference}
count_answers_data = {}

for par in dataset:

    # only finished studies are analysed
    if par not in ids_completed:
        continue

    par_pages = dataset[par]
    row = {"condition": par_pages["ExperimentMetaData"]["condition"]}

    for answer_page in count_tasks:
        # the matching task page (e.g. "Con_Count_DragDrop") holds the correct target number
        task_page = answer_page.replace("CountAns", "Count")
        # bare task name, e.g. "DragDrop"
        task_name = answer_page[len("Con_CountAns_"):]

        given_answer = par_pages[answer_page]["data"]["Count_Task_Answer"]
        correct_answer = par_pages[task_page]["data"]["Total_num_targets"]

        # solution, answer and the absolute difference between both for this task
        row[task_name + "_CountSol"] = correct_answer
        row[task_name + "_CountAns"] = given_answer
        row[task_name + "Difference"] = abs(correct_answer - given_answer)

    count_answers_data[par] = row

count_df = pd.DataFrame(count_answers_data).T

# Totals over all count tasks: summed solutions, summed answers, summed per-task differences, and the difference
# between the summed answer and the summed solution
count_df["total_solution"] = count_df.filter(like="_CountSol").sum(axis=1)
count_df["total_answer"] = count_df.filter(like="_CountAns").sum(axis=1)
count_df["total_difference"] = count_df.filter(like="Difference").sum(axis=1)
count_df["result_difference"] = (count_df["total_solution"] - count_df["total_answer"]).abs()

# split by condition: 0 is treated as the high stress and 1 as the low stress condition
is_high_stress = count_df["condition"] == 0
is_low_stress = count_df["condition"] == 1
hs_count_df = count_df.loc[is_high_stress]
ls_count_df = count_df.loc[is_low_stress]

# Participants who gave the same answer in every count task (standard deviation of 0) likely did not do the task
# properly
hs_count_answers = hs_count_df.filter(like="CountAns")
hs_count_answers = hs_count_answers.assign(std=hs_count_answers.std(axis=1))
no_variance_count_hs = hs_count_answers.loc[hs_count_answers["std"] == 0]

ls_count_answers = ls_count_df.filter(like="CountAns")
ls_count_answers = ls_count_answers.assign(std=ls_count_answers.std(axis=1))
no_variance_count_ls = ls_count_answers.loc[ls_count_answers["std"] == 0]


# ids of both conditions together, without duplicates
bad_cases_count = list(set([*no_variance_count_hs.index, *no_variance_count_ls.index]))
print("Total bad cases in the count task", len(bad_cases_count))

#### --- Bring bad cases together and save which participants have valid data for further analysis ---

In [None]:
# Merge the ids flagged by the timing, mdbf and count task checks into one list
flagged_ids = [*timebased_outliers, *bad_mdbf_answers, *bad_cases_count]
# a participant flagged by several checks must only appear once
total_bad_cases = list(set(flagged_ids))

print("Total bad cases identified with the previous steps", len(total_bad_cases))

In [None]:
# Study durations of the participants that remain after removing all flagged cases
new_study_durations = study_durations_df.drop(index=total_bad_cases)
remaining_durations = new_study_durations["study_duration"]

print("Participants after bad case removal", len(new_study_durations))
print("Median Study duration:", np.median(remaining_durations))
print("Standard Deviation of Study duration:", np.std(remaining_durations))

In [None]:
# Persist the ids of the participants with valid data: everyone who completed the study minus the bad cases
valid_study_ids = set(ids_completed) - set(total_bad_cases)

# study ids after removal of bad cases, stored as a pickled set
with open("filtered_online_study_ids_studyLevel", "wb") as fp:
    pickle.dump(valid_study_ids, fp)