### File to process the keyboard data and create a keyboard feature dataset for data analysis

In [None]:
import json
import operator
from itertools import groupby
import numpy as np
import pandas as pd
import pickle
from statsmodels.robust.scale import mad

In [None]:
# import the raw study dataset
# The encoding is given explicitly: JSON text is UTF-8 by specification, while open() would otherwise fall back to
# the platform's default encoding, which can mis-decode non-ASCII characters on some systems.
with open("OnlineStudy_RawData.json", encoding="utf-8") as jsonData:
    firebase_data = json.load(jsonData)

In [None]:
# import the list of participants who finished the study and have valid data (were not filtered out in the data
# preprocessing)
# NOTE(review): pickle.load can execute arbitrary code embedded in the file -- only load this file if it was produced
# by a trusted step of this project (presumably the preprocessing notebook; confirm).
with open("filtered_online_study_ids_studyLevel", "rb") as fp:
    valid_ids = pickle.load(fp)

#### --- Helper Functions to process the keyboard data ---

In [None]:
# extract and clean the typing data from the task dataset
def get_typing_data(data):
    """Extract the cleaned, chronologically ordered key events of one task.

    Args:
        data: mapping of event id -> logged event dict. Every event needs the keys "time" and "isPractice";
            events that carry a "keyCode" additionally need "validKey".

    Returns:
        A tuple (keyboard_events, numpad_presses, artifacts, non_target_presses):
        keyboard_events is the time-sorted list of key events in which every KeyDown has a matching KeyUp,
        numpad_presses counts the events flagged with a falsy "validKey", artifacts counts unmatched KeyUp events
        and KeyDown events that were discarded as stale, and non_target_presses counts the events whose key code is
        not one of the task keys.
    """
    event_field, down_tag, up_tag = "eventType", "KeyDown", "KeyUp"

    # key codes 0 and 255 are unidentified typing chars and likely represent a recording error (the datapoint is
    # recorded twice, once with the original keyCode and once with an unidentified keyCode)
    invalid_keyChars = [0, 255]

    # order the logged events by timestamp; only the event dict (second tuple entry) is needed afterwards
    chronological = sorted(data.items(), key=lambda entry: entry[1]["time"])

    # drop practice trial data, events without a key code (e.g. mouse data) and unidentified key codes
    candidates = []
    for _, event in chronological:
        if event["isPractice"]:
            continue
        if "keyCode" not in event or event["keyCode"] in invalid_keyChars:
            continue
        candidates.append(event)

    # count and remove the invalid key presses (numpad key presses, which were not shown in the input field)
    numpad_presses = len([event for event in candidates if not event["validKey"]])
    typing_events = [event for event in candidates if event["validKey"]]

    # count the events of keys that are not "part of the task" (e.g. a pressed F4 key)
    valid_keys = {8, 16, 20, *range(48, 58), *range(65, 91)}
    non_target_presses = len([event for event in typing_events if int(event["keyCode"]) not in valid_keys])

    # KeyDown events per key code that still wait for their KeyUp event
    pending_downs = {}
    # matched KeyDown / KeyUp events
    paired_events = []
    # number of potential artifacts
    artifacts = 0

    # Only key presses with a KeyDown and a corresponding KeyUp event are kept. Repeated KeyDown events (fired
    # while a key is held down) are collapsed to the first one.
    for event in typing_events:
        kind = event.get(event_field)
        code = event["keyCode"]

        if kind == down_tag:
            earlier = pending_downs.get(code)
            if earlier is None:
                # first KeyDown of this key without a KeyUp so far
                pending_downs[code] = [event]
            elif event["time"] - earlier[-1]["time"] < 1750:
                # less than 1750 ms since the last KeyDown of this key: the key was likely held down and fired
                # more than one KeyDown event
                earlier.append(event)
            else:
                # a larger gap: the earlier KeyDown events are likely artifacts without a KeyUp event (e.g. if
                # participants switched tabs during typing); they are overwritten by the latest KeyDown event
                artifacts += 1
                pending_downs[code] = [event]
        elif kind == up_tag:
            if code in pending_downs:
                # keep the first pending KeyDown together with this KeyUp and clear the pending entry
                paired_events.append(pending_downs.pop(code)[0])
                paired_events.append(event)
            else:
                # a KeyUp without a preceding KeyDown is likely an artifact
                artifacts += 1

    # restore chronological order, because each KeyDown was appended only when its KeyUp showed up
    paired_events.sort(key=operator.itemgetter("time"))

    return paired_events, numpad_presses, artifacts, non_target_presses


# separate the typing data into the typing trials
def get_trial_data(keyboard_data):
    """Split the key events into trials and move misplaced KeyUp events back into their trial.

    Args:
        keyboard_data: list of key event dicts with the keys "taskNumber", "eventType" and "keyCode". Consecutive
            events with the same "taskNumber" form one trial.

    Returns:
        A list of trials, each trial being a list of event dicts.
    """
    # group consecutive events that share a task number
    trials = []
    for _, members in groupby(keyboard_data, key=operator.itemgetter("taskNumber")):
        trials.append(list(members))

    # The experiment ended a trial as soon as the typed string matched the target string, i.e. on a KeyDown event.
    # The KeyUp event of that key press was therefore tagged with the next trial number and has to be moved back.

    # collect, per preceding trial, the KeyUp events of the following trial that have no KeyDown in that trial
    misplaced = []
    for previous_idx in range(len(trials) - 1):
        open_keys = set()
        orphans = []
        for event in trials[previous_idx + 1]:
            if event["eventType"] == "KeyDown":
                open_keys.add(event["keyCode"])
            elif event["eventType"] == "KeyUp":
                if event["keyCode"] in open_keys:
                    open_keys.discard(event["keyCode"])
                else:
                    orphans.append(event)
        misplaced.append((previous_idx, orphans))

    # move every orphaned KeyUp event to the end of the trial it belongs to
    for previous_idx, orphans in misplaced:
        for event in orphans:
            trials[previous_idx].append(event)
            trials[previous_idx + 1].remove(event)

    return trials


# get the longest pause between consecutive keyboard data events to potentially filter bad cases
def get_longest_pause(keyboard_data):
    """Return the longest and the mean gap between consecutive key events.

    Args:
        keyboard_data: list of event dicts with a numeric "time" entry; the result is divided by 1000 to report
            seconds.

    Returns:
        A tuple (longest_pause, mean_pause), both rounded to 4 decimals.
    """
    timestamps = [event["time"] for event in keyboard_data]
    # differences between neighbouring timestamps
    gaps = np.diff(timestamps)

    longest_in_seconds = gaps.max() / 1000
    mean_in_seconds = np.mean(gaps) / 1000

    return np.round(longest_in_seconds, 4), np.round(mean_in_seconds, 4)

#### --- Keyboard Feature Calculation Functions ---

In [None]:
# a separate function for each feature (or group of related features): this was done to increase the readability of
# the code. From a performance point of view, it might be better not to loop over the keyboard data separately for
# the calculation of each feature (this might need refinement for large datasets)

# get the number of times, the backspace key was pressed during the task
def get_backspace_presses(keyboard_data):
    """Count the KeyDown events of the backspace key.

    Args:
        keyboard_data: list of key event dicts with the keys "eventType", "keyCode", "code" and "key".

    Returns:
        A dict {"Backspace_presses": count}.
    """
    backspace_presses = 0
    for event in keyboard_data:
        if event["eventType"] != "KeyDown":
            continue
        # a backspace is recognised by its key code, its code or its key value
        if event["keyCode"] == 8 or event["code"] == "Backspace" or event["key"] == "Backspace":
            backspace_presses += 1

    return {"Backspace_presses": backspace_presses}


# get the writing time (time difference between last and first keyboard event in the dataset) as well as the writing
# time in relation to the number of pushed keys
def get_writing_time(keyboard_data):
    """Compute the writing time and the writing time per key press.

    Args:
        keyboard_data: list of key event dicts with the keys "time" and "eventType".

    Returns:
        A dict with "Writing_Time" (span between the earliest and latest timestamp divided by 1000, i.e. seconds
        for millisecond timestamps) and "Time_per_Keystroke" (writing time divided by the number of KeyDown events).
    """
    # The data should already be in chronological order, so the last minus the first timestamp would be a faster
    # alternative; max/min does not depend on the ordering.
    timestamps = [event["time"] for event in keyboard_data]
    writing_time = (max(timestamps) - min(timestamps)) / 1000

    # number of keys pressed during the keyboard task
    key_down_events = [event for event in keyboard_data if event["eventType"] == "KeyDown"]
    time_per_keystroke = writing_time / len(key_down_events)

    return {"Writing_Time": writing_time, "Time_per_Keystroke": time_per_keystroke}


# calculate the time difference between the last timestamp of one trial and the first timestamp of the next trial
# calculate the time per trial
def get_trial_onset_and_trial_times(trial_data, task_start_time):
    """Compute mean and standard deviation of the trial onset times and the trial durations.

    Args:
        trial_data: list of trials, each a non-empty list of event dicts with a "time" entry.
        task_start_time: timestamp the onset of the first trial is measured against.

    Returns:
        A dict with the keys "Trial_Onset_Mean", "Trial_Onset_SD", "Trial_Time_Mean" and "Trial_Time_SD", each
        rounded to 3 decimals.
    """
    onsets = []
    durations = []

    # the onset of the first trial is measured from the task start, every later onset from the last event of the
    # preceding trial
    previous_end = task_start_time
    for trial in trial_data:
        first_time, last_time = trial[0]["time"], trial[-1]["time"]
        onsets.append(first_time - previous_end)
        durations.append(last_time - first_time)
        previous_end = last_time

    return {
        "Trial_Onset_Mean": np.round(np.mean(onsets), 3),
        "Trial_Onset_SD": np.round(np.std(onsets), 3),
        "Trial_Time_Mean": np.round(np.mean(durations), 3),
        "Trial_Time_SD": np.round(np.std(durations), 3),
    }


# Calculate the time between pressing a key and releasing a key --> Keypress Dwell Time
def get_keypress_dwell_time(keyboard_data):
    """Compute mean and standard deviation of the time between pressing and releasing a key.

    Args:
        keyboard_data: list of key event dicts with the keys "eventType", "code" and "time", in the order in which
            they should be matched.

    Returns:
        A dict with "Dwelltime_Mean" and "Dwelltime_SD", each rounded to 3 decimals.
    """
    # dwell time of every matched key press
    dwell_times = []
    # KeyDown events that have not been released yet
    held_keys = []

    for event in keyboard_data:
        kind = event["eventType"]
        if kind == "KeyDown":
            held_keys.append(event)
        elif kind == "KeyUp":
            # position of the oldest held KeyDown event with the same code, if there is one
            match_position = next(
                (position for position, down in enumerate(held_keys) if down["code"] == event["code"]),
                None,
            )
            if match_position is not None:
                # the key is released: take it off the held list and store how long it was pressed
                released = held_keys.pop(match_position)
                dwell_times.append(event["time"] - released["time"])

    return {
        "Dwelltime_Mean": np.round(np.mean(dwell_times), 3),
        "Dwelltime_SD": np.round(np.std(dwell_times), 3),
    }


# Calculate the time between releasing a key and pressing the next key (can contain negative values if a key is pressed
# before the previous key is released)
def get_keypress_latency(trial_data):
    """Compute mean and standard deviation of the latency between releasing a key and pressing the next key.

    The latency is calculated within each trial, so the time between the last key of one trial and the first key of
    the next trial (i.e. the trial onset time) is excluded. Values can be negative if the next key is pressed before
    the previous key is released.

    Args:
        trial_data: list of trials, each a list of key event dicts with the keys "eventType", "code" and "time".

    Returns:
        A dict with "Latency_Mean" and "Latency_SD", each rounded to 3 decimals.
    """

    def _calc_latency(data):
        """Return the latency times of one list of key events (here: one trial)."""
        latency_times = []

        # enumerate gives the real position of every event; looking the position up with data.index() would be a
        # linear scan per event and would return the first *equal* event, which is wrong for duplicated events
        for position, datapoint in enumerate(data):
            if datapoint["eventType"] != "KeyDown":
                continue

            following = data[position + 1:]

            # time of the first later KeyUp event of the same key
            key_up_time = next(
                (event["time"] for event in following
                 if event["eventType"] == "KeyUp" and event["code"] == datapoint["code"]),
                None,
            )
            # time of the next KeyDown event (of any key)
            next_key_down_time = next(
                (event["time"] for event in following if event["eventType"] == "KeyDown"),
                None,
            )

            # a latency only exists if both the release and a following key press were found
            if key_up_time is not None and next_key_down_time is not None:
                latency_times.append(next_key_down_time - key_up_time)

        return latency_times

    # calculate the latency times within each trial and merge them into one list
    all_latency_times = []
    for trial in trial_data:
        all_latency_times.extend(_calc_latency(trial))

    return {
        "Latency_Mean": np.round(np.mean(all_latency_times), 3),
        "Latency_SD": np.round(np.std(all_latency_times), 3),
    }

#### --- Main Function to loop all participants, calculate their keyboard features and save them in a dataframe ---

In [None]:
def create_keyboard_dataset(data):
    """Build the keyboard feature table for all valid participants.

    Args:
        data: mapping of participant id -> participant record. Per typing task the record is read at
            [task]["data"]["TrackerData"], ["trialStarted"] and ["taskEnded"]; the ratings are read at
            ["Pr_Sam_PatternTyping"]["data"] / ["Con_Sam_PatternTyping"]["data"] and the condition at
            ["ExperimentMetaData"]["condition"].

    Returns:
        A pandas DataFrame with one row per participant listed in the module-level valid_ids. Columns are the
        features and inspection values of both tasks (prefixed with "Pr_" / "Con_") plus "condition".
    """
    keyboard_tasks = ["Pr_PatternTyping", "Con_PatternTyping"]
    keyboard_params = {}

    for participant in data:

        # skip everybody who is not on the list of valid participants
        if participant not in valid_ids:
            continue

        participant_row = {}
        keyboard_params[participant] = participant_row

        print("Processing participant " + participant)

        for task in keyboard_tasks:

            print("Processing " + task)

            # tag of the phase: "Pr_PatternTyping" -> "Pr_", "Con_PatternTyping" -> "Con_"
            prefix = task[:-13]

            # cleaned typing data plus the counters of removed / suspicious events
            typing_data, numpad_presses, artifacts, non_target_presses = get_typing_data(
                data[participant][task]["data"]["TrackerData"])
            # typing data split into trials
            trial_data = get_trial_data(typing_data)

            # start time of the typing task
            task_start = data[participant][task]["data"]["trialStarted"]

            # calculate the keyboard features
            typing_task_features = {}
            typing_task_features.update(get_backspace_presses(typing_data))
            typing_task_features.update(get_writing_time(typing_data))
            typing_task_features.update(get_keypress_dwell_time(typing_data))
            typing_task_features.update(get_trial_onset_and_trial_times(trial_data, task_start))
            typing_task_features.update(get_keypress_latency(trial_data))

            # Arousal and Valence as continuous dependent variables (SAM rating) of the matching phase
            sam_key = "Pr_Sam_PatternTyping" if task == "Pr_PatternTyping" else "Con_Sam_PatternTyping"
            sam = data[participant][sam_key]["data"]

            # store features and ratings with the tag of their phase
            for feature_name, feature_value in typing_task_features.items():
                participant_row[prefix + feature_name] = feature_value

            for item in sam:
                participant_row[prefix + item] = sam[item]

            # additional values to filter potential bad cases and to inspect the data
            longest_pause, mean_pause = get_longest_pause(typing_data)

            participant_row[prefix + "Num_Datapoints"] = len(typing_data)
            # the counters include KeyDown and KeyUp events, hence the division by 2
            participant_row[prefix + "Numpad_Presses"] = numpad_presses / 2
            participant_row[prefix + "Artifacts"] = artifacts
            participant_row[prefix + "Non_target_presses"] = non_target_presses / 2
            participant_row[prefix + "Task_duration"] = (
                data[participant][task]["data"]["taskEnded"] - data[participant][task]["data"]["trialStarted"]
            )
            participant_row[prefix + "Longest_Pause"] = longest_pause
            participant_row[prefix + "Mean_Pause"] = mean_pause
            participant_row[prefix + "Task_Start_Time"] = (typing_data[0]["time"] - task_start) / 1000

        # condition of the participant
        participant_row["condition"] = data[participant]["ExperimentMetaData"]["condition"]

    # one row per participant, one column per feature
    return pd.DataFrame(keyboard_params).T

In [None]:
# create the dataframe: one row per valid participant with the keyboard features of both typing tasks
typing_task_data = create_keyboard_dataset(firebase_data)

# number of participants before the bad case removal below
print("Number of all participants", len(typing_task_data))

#### --- Filter out bad cases in the typing dataset ---

In [None]:
# outlier detection helper function

# get the upper limit of the median plus k times the median absolute deviation to detect potential outliers
def out_mad(data_column, k=3.0):
    """Return (and print) the upper outlier limit: median plus k times the median absolute deviation.

    Args:
        data_column: data column offering a median() method (here: a dataframe column) that is also passed to mad().
        k: factor the median absolute deviation is multiplied with.

    Returns:
        The upper limit above which values are treated as potential outliers.
    """
    center = data_column.median()
    # median absolute deviation of the data column
    spread = mad(data_column)

    upper_limit = center + spread * k
    print(upper_limit)

    return upper_limit

##### Remove participants who did too many numpad keypresses or pressed too many non target keys

In [None]:
# The participant did not do the task properly: 10 presses are chosen as the cut-off at which "bad key presses"
# do not represent honest mistakes anymore, but misbehavior. The check is done per phase and per kind of bad press.
pr_numpad_excess = typing_task_data["Pr_Numpad_Presses"] >= 10
pr_non_target_excess = typing_task_data["Pr_Non_target_presses"] >= 10
con_numpad_excess = typing_task_data["Con_Numpad_Presses"] >= 10
con_non_target_excess = typing_task_data["Con_Non_target_presses"] >= 10

# ids of all participants who reach the cut-off in at least one of the four counts
bad_keypresses = typing_task_data.loc[
    pr_numpad_excess | pr_non_target_excess | con_numpad_excess | con_non_target_excess
].index

print("Number of participants with too many bad keypresses", len(bad_keypresses))

#####  Remove Participants who took a break for too long during the task 

In [None]:
# --> The task was not done properly anymore (the stress manipulation likely lost its effect if participants took a
# break during the task, e.g. by switching tabs or doing something different in between): a gap of 10 seconds or more
# between consecutive keyboard events is treated as a too long pause.
# NOTE(review): this comment previously named 15 seconds while the code compares against 10.0 -- confirm which
# threshold is intended.
pr_pause_too_long = typing_task_data["Pr_Longest_Pause"] >= 10.0
con_pause_too_long = typing_task_data["Con_Longest_Pause"] >= 10.0

# ids of all participants with a too long pause in at least one phase
pause_too_long = typing_task_data.loc[pr_pause_too_long | con_pause_too_long].index

print("Number of participants with too long pauses", len(pause_too_long))

##### Remove participants with a too long overall task time

In [None]:
# ---> the participant needed way longer than the median task time (outlier removal based on the median absolute
# deviation); the upper limit is determined separately for each phase
pr_duration = typing_task_data["Pr_Task_duration"]
pr_duration_too_long = pr_duration > out_mad(pr_duration)

con_duration = typing_task_data["Con_Task_duration"]
con_duration_too_long = con_duration > out_mad(con_duration)

# ids of all participants above the limit in at least one phase
task_time_too_long = typing_task_data.loc[pr_duration_too_long | con_duration_too_long].index

print("Number of participants with too long task time", len(task_time_too_long))

##### Bring filtered out participants together and remove their data from the keyboard dataset

In [None]:
# join the ids of all three filters; the set removes participants that were caught by more than one filter
flagged_ids = np.concatenate((bad_keypresses, pause_too_long, task_time_too_long))
bad_participants = list(set(flagged_ids))

print("Total number of filtered out bad participants:", len(bad_participants))

In [None]:
# keep only the rows of participants that were not flagged by any of the filters
typing_task_data_cleaned = typing_task_data.drop(index=bad_participants)

print("Final number of participants after bad case removal:", len(typing_task_data_cleaned))

#### --- Save the filtered out participants and the typing feature data for further analysis ---

In [None]:
# save the ids of the remaining participants for the manipulation check and the sociodemographic analysis
remaining_ids = list(typing_task_data_cleaned.index)
with open("filtered_online_study_ids_taskLvl", "wb") as fp:
    pickle.dump(remaining_ids, fp)

In [None]:
# save the Typing Task Feature Dataframe for the statistical analysis

# the columns that were only needed for the bad case detection exist once per phase ("Pr_" / "Con_")
bad_case_suffixes = ["Num_Datapoints", "Numpad_Presses", "Artifacts", "Non_target_presses",
                     "Task_duration", "Longest_Pause", "Task_Start_Time", "Mean_Pause"]
bad_case_columns = [prefix + suffix for prefix in ("Pr_", "Con_") for suffix in bad_case_suffixes]

# drop the bad case detection columns
typing_task_data_cleaned = typing_task_data_cleaned.drop(columns=bad_case_columns)

# save the dataframe as a tab separated csv file for the data analysis
typing_task_data_cleaned.to_csv("Online_Study_Keyboard_Features.csv", sep="\t", encoding="utf-8")
