### File to perform the manipulation check in the lab-study

In [None]:
import json
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
import pandas as pd
import operator
from typing import List
import hrvanalysis
import pingouin as pg
import seaborn as sns

In [None]:
# load the raw Firebase export of the lab study into memory
with open("LabStudy_RawData.json") as raw_file:
    firebase_data = json.load(raw_file)

#### Helper function for physiological data processing

In [None]:
# Helper function to get the phase timestamps to be able to calculate the physiological parameters for the phase
def get_phase_timestamps(data):
    """Build a lookup of start/end timestamps for every experimental phase.

    ``data["phaseFinishedTimestamps"]`` maps a phase name to the timestamp at
    which that phase finished. A phase is taken to start when the previous
    phase (in chronological order) finished, so the earliest entry only serves
    as a start marker and gets no entry of its own.

    On top of the recorded phases, four derived phases are added:

    * ``<XX>_phase`` (twice): spans from sorted entry 12 to entry 26 and from
      entry 31 to entry 45; ``<XX>`` is the first two characters of the name
      of entry 12 / entry 31. The positions are hard-coded.
      NOTE(review): presumably the study protocol always yields the same
      phase order, so that these are the high-/low-stress blocks - confirm.
    * ``HS_typing`` / ``LS_typing``: from the start of
      ``<XX>_Mental_Arithmetic_PatternTyping`` to the end of
      ``<XX>_PatternTyping``.

    Returns a dict ``{phase_name: {"Start": timestamp, "End": timestamp}}``.
    Raises ``IndexError`` / ``KeyError`` if the expected entries are missing.
    """
    # chronological list of (phase name, finished timestamp) tuples
    ordered = sorted(data["phaseFinishedTimestamps"].items(), key=operator.itemgetter(1))

    # every phase runs from the end of its predecessor to its own end
    phases = {}
    for (_, began), (name, ended) in zip(ordered, ordered[1:]):
        phases[name] = {"Start": began, "End": ended}

    # whole stress-condition blocks, addressed by their fixed positions in the sorted list
    for first, last in ((12, 26), (31, 45)):
        block_start = ordered[first][1]
        block_end = ordered[last][1]
        condition = ordered[first][0][:2]
        phases[condition + "_phase"] = {"Start": block_start, "End": block_end}

    # mental arithmetic typing task plus the typing task that follows it
    for condition in ("HS", "LS"):
        arithmetic_typing = phases[condition + "_Mental_Arithmetic_PatternTyping"]
        pattern_typing = phases[condition + "_PatternTyping"]
        phases[condition + "_typing"] = {"Start": arithmetic_typing["Start"],
                                         "End": pattern_typing["End"]}

    return phases

#### --- Heart data processing functions ---

In [None]:
# Helper function to plot the rr timeseries (adopted the plot_timeseries function from the hrvanalysis package)
def plot_rr_timeseries(nn_intervals: List[float], normalize: bool = True,
                    autoscale: bool = True, y_min: float = None, y_max: float = None, plot_title: str = None):

    """
        Plot an RR/NN-interval time series (adapted from hrvanalysis' plot_timeseries).

        Arguments
        ---------
        nn_intervals : list
            Interval values to plot.
        normalize : bool
            True: the x axis is the cumulative sum of the intervals divided by 1000 and is
            labelled "Time (s)" (i.e. the intervals are assumed to be in milliseconds).
            False: the x axis is the plain sample index 0, 1, ..., N-1.
        autoscale : bool
            True (default): matplotlib chooses the y-axis limits.
            False: the y-axis limits are set to (y_min, y_max).
        y_min : float
            Lower y-axis limit, only used when autoscale is False.
        y_max : float
            Upper y-axis limit, only used when autoscale is False.
        plot_title : str
            Optional text appended to the figure title ("... for <plot_title>").
            When omitted, the generic title is used on its own.
        """

    # Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and later releases
    # dropped the old names, so use whichever variant this installation actually provides
    legacy_style = "seaborn-darkgrid"
    style.use(legacy_style if legacy_style in style.available else "seaborn-v0_8-darkgrid")

    plt.figure(figsize=(12, 8))

    # the title suffix is optional: concatenating the default None would raise a TypeError
    title = "Rr Interval time series"
    if plot_title is not None:
        title += " for " + plot_title
    plt.title(title)
    plt.ylabel("Rr Interval", fontsize=15)

    if normalize:
        plt.xlabel("Time (s)", fontsize=15)
        plt.plot(np.cumsum(nn_intervals) / 1000, nn_intervals)
    else:
        plt.xlabel("RR-interval index", fontsize=15)
        plt.plot(nn_intervals)

    if not autoscale:
        plt.ylim(y_min, y_max)

    plt.show()


def calc_bpm(rr_intervals):
    """Return the mean heart rate computed from a sequence of RR intervals.

    The intervals are cleaned in four steps before the heart rate is computed:
    zero values are dropped, values outside 350-1800 are handled by
    ``hrvanalysis.remove_outliers``, the resulting gaps are filled by linear
    interpolation, and any NaN that is still left afterwards is dropped
    (per the original author, interpolation cannot fill the first and last
    data point). The result is the ``"mean_hr"`` entry of hrvanalysis'
    time-domain features.
    """
    # zeros are discarded up front instead of being treated as outliers
    nonzero = [beat for beat in rr_intervals if beat != 0]

    # flag implausible intervals, then fill them by linear interpolation
    plausible = hrvanalysis.remove_outliers(nonzero, low_rri=350, high_rri=1800, verbose=False)
    filled = hrvanalysis.interpolate_nan_values(plausible, interpolation_method="linear")

    # whatever could not be interpolated is removed
    cleaned = [beat for beat in filled if not np.isnan(beat)]

    return hrvanalysis.get_time_domain_features(cleaned)["mean_hr"]


# get and process the heart data (input is the firebase dataset of a single participant as well as the participant id)
def process_hr_data(dataset, participant):
    """Compute the mean BPM per experimental phase for one participant.

    Parameters
    ----------
    dataset : dict
        Firebase data of a single participant. ``dataset["MetaData"]["physioDataId"]``
        names the heart data file ``LabStudy_Heart_RawData/<id>.csv`` and
        ``dataset["phaseFinishedTimestamps"]`` provides the phase boundaries.
    participant : str
        Participant id, only used in the printed warning.

    Returns
    -------
    dict
        ``{"<phase>_BPM": mean_bpm}`` for every relevant phase with usable data.
        Phases with at most 5 valid IBI values or with 5% or more artifacts are
        reported via ``print`` and left out.
    """
    data_path = "LabStudy_Heart_RawData/"

    # the physio data id links the Firebase record to the recorded heart data file
    physio_id = dataset["MetaData"]["physioDataId"]
    heart_data = pd.read_csv(data_path + physio_id + ".csv", sep=";")

    # turn the UniversalTimestamp into an epoch timestamp
    # NOTE(review): the constant looks like the .NET tick count of 1970-01-01 and the division
    # like a ticks -> milliseconds conversion - confirm against the recording software
    heart_data["convertedTimestamp"] = [int((ticks - 621355968000000000) / 10000)
                                        for ticks in heart_data["UniversalTimestamp"]]

    # the IBI values are stored as strings with a decimal comma
    heart_data["IBI"] = [float(raw_ibi.replace(",", ".")) for raw_ibi in heart_data["IBI"]]

    phase_start_end_stamps = get_phase_timestamps(dataset)

    # experimental phases for which the heart rate is evaluated
    relevant_phases = ["LS_Mental_Arithmetic_PatternTyping", "LS_PatternTyping", "LS_phase", "LS_typing",
                       "HS_Mental_Arithmetic_PatternTyping", "HS_PatternTyping", "HS_phase", "HS_typing"]

    hr_results = {}

    for phase in relevant_phases:
        window = phase_start_end_stamps[phase]

        # IBI samples recorded strictly inside the phase window
        in_phase = ((window["Start"] < heart_data["convertedTimestamp"]) &
                    (heart_data["convertedTimestamp"] < window["End"]))
        phase_ibi = heart_data.loc[in_phase, "IBI"]

        # optional visual inspection of the phase data
        # plot_rr_timeseries(phase_ibi, plot_title=participant + " in " + phase)

        # data quality: an IBI value counts as valid when it lies strictly between 350 and 1800
        n_valid = int(((phase_ibi > 350) & (phase_ibi < 1800)).sum())

        usable = False
        if n_valid > 5:
            # zero values do not count as artifacts: they are removed later on and might not be
            # measurement artifacts per se (the n_valid check also rules out a division by zero)
            n_zero = int((phase_ibi == 0).sum())
            artifact_percentage = 100 - ((n_valid + n_zero) / len(phase_ibi) * 100)
            usable = artifact_percentage < 5

        if not usable:
            print(participant + " has invalid heart data in phase " + phase)
            continue

        # outlier removal, preprocessing and averaging happen inside calc_bpm
        hr_results[phase + "_BPM"] = calc_bpm(phase_ibi)

    return hr_results

#### --- EDA data processing functions ---

In [None]:
# helper function to plot the eda data, similar to the hr data visualization
def plot_eda_data(eda_data, plot_title):
    """Plot an EDA series against its index.

    Parameters
    ----------
    eda_data : sequence or pandas.Series
        EDA values to plot; passed to ``plt.plot`` unchanged.
    plot_title : str
        Text appended to the figure title ("EDA time series for <plot_title>").
    """

    # Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and later releases
    # dropped the old names, so use whichever variant this installation actually provides
    legacy_style = "seaborn-darkgrid"
    style.use(legacy_style if legacy_style in style.available else "seaborn-v0_8-darkgrid")

    plt.figure(figsize=(12, 8))
    plt.title("EDA time series for " + plot_title)
    plt.ylabel("EDA Value", fontsize=15)

    plt.xlabel("EDA index", fontsize=15)
    plt.plot(eda_data)

    plt.show()


# get and process the eda data (input is the firebase dataset of a single participant as well as the participant id)
def process_eda_data(dataset, participant):
    """Compute the mean (median-smoothed) EDA value per experimental phase for one participant.

    Parameters
    ----------
    dataset : dict
        Firebase data of a single participant. ``dataset["MetaData"]["physioDataId"]``
        names the EDA file ``LabStudy_EDA_RawData/<id>.csv`` and
        ``dataset["phaseFinishedTimestamps"]`` provides the phase boundaries.
    participant : str
        Participant id, only used in the printed warning.

    Returns
    -------
    dict
        ``{"<phase>_EDA": mean_eda}`` for every relevant phase with usable data.
        Phases with at most 5 samples or with 5% or more non-positive samples are
        reported via ``print`` and left out.
    """
    data_path = "LabStudy_EDA_RawData/"

    # the physio data id links the Firebase record to the recorded EDA file
    physio_id = dataset["MetaData"]["physioDataId"]
    eda_data = pd.read_csv(data_path + physio_id + ".csv", sep=";")

    # turn the UniversalTimestamp into an epoch timestamp
    # NOTE(review): the constant looks like the .NET tick count of 1970-01-01 and the division
    # like a ticks -> milliseconds conversion - confirm against the recording software
    eda_data["convertedTimestamp"] = [int((ticks - 621355968000000000) / 10000)
                                      for ticks in eda_data["UniversalTimestamp"]]

    # make sure the EDA values are floats
    eda_data["EDA"] = [float(raw_eda) for raw_eda in eda_data["EDA"]]

    phase_start_end_stamps = get_phase_timestamps(dataset)

    # experimental phases for which the EDA data is evaluated
    relevant_phases = ["LS_Mental_Arithmetic_PatternTyping", "LS_PatternTyping", "LS_phase", "LS_typing",
                       "HS_Mental_Arithmetic_PatternTyping", "HS_PatternTyping", "HS_phase", "HS_typing"]

    eda_results = {}

    for phase in relevant_phases:
        window = phase_start_end_stamps[phase]

        # EDA samples recorded strictly inside the phase window
        in_phase = ((window["Start"] < eda_data["convertedTimestamp"]) &
                    (eda_data["convertedTimestamp"] < window["End"]))
        phase_eda = eda_data.loc[in_phase, "EDA"]

        # optional visual inspection of the phase data
        # plot_eda_data(phase_eda, plot_title=participant + " in " + phase)

        # data quality: every sample that is not strictly positive counts as an artifact;
        # phases with too few samples are treated as 100% artifacts
        n_samples = len(phase_eda)
        if n_samples > 5:
            n_positive = int((phase_eda > 0).sum())
            artifact_perc = 100 - (n_positive / n_samples * 100)
        else:
            artifact_perc = 100

        if artifact_perc >= 5:
            print(participant + " has invalid eda data in phase " + phase)
            continue

        # smooth with a rolling median over 20 samples (incomplete windows are dropped) and average
        smoothed_eda = phase_eda.rolling(20).median().dropna()
        eda_results[phase + "_EDA"] = np.mean(smoothed_eda)

    return eda_results

#### --- Self-Report data processing functions ---

In [None]:
# get the self-report data (input is the firebase dataset of a single participant)
def process_self_report_data(dataset):
    """Collect the MDBF and SAM ratings of one participant in a flat dict.

    MDBF items are read from ``dataset["HS_Mdbf"]`` and ``dataset["LS_Mdbf"]`` and
    stored as ``"<item>_HS"`` / ``"<item>_LS"``. Items listed as reverse-coded are
    stored as ``4 - rating``, all others unchanged. Every item of every page whose
    name contains ``"SAM"`` is stored unchanged as ``"<item>_<page>"``.

    Raises ``KeyError`` if one of the two MDBF pages is missing.
    """
    # MDBF items that are reverse-coded (stored as 4 - rating)
    reverse_coded = {"MDBF_angespannt", "MDBF_nervÃ¶s", "MDBF_schlÃ¤frig", "MDBF_unglÃ¼cklich",
                     "MDBF_unzufrieden", "MDBF_ermattet"}

    ratings = {}

    # MDBF questionnaire of the high-stress and the low-stress condition
    for condition in ("HS", "LS"):
        for item, rating in dataset[condition + "_Mdbf"].items():
            ratings[item + "_" + condition] = 4 - rating if item in reverse_coded else rating

    # SAM ratings are spread over several pages, all of which carry "SAM" in their name
    for page, answers in dataset.items():
        if "SAM" not in page:
            continue
        for item, rating in answers.items():
            ratings[item + "_" + page] = rating

    return ratings

#### --- Process the heart data, eda data and self-report data for each participant and save it in one dataframe ---

In [None]:
# create a dataset with the manipulation check variables (wrapper function to get the hr, eda and self-report data
# and save it in a dataframe
def create_man_check_dataset(dataset):
    """Gather heart rate, EDA and self-report values of all participants who finished the study.

    ``dataset`` maps a participant id to that participant's Firebase data. A
    participant counts as finished when the record has a ``"phaseFinishedTimestamps"``
    entry that contains ``"BfiNeuroticism"``; all other records are skipped.

    Returns ``{participant_id: {variable_name: value}}`` with the results of
    ``process_hr_data``, ``process_eda_data`` and ``process_self_report_data``
    merged into one dict per participant.
    """
    man_check_data = {}

    for participant, par_data in dataset.items():

        # skip records of participants who did not finish the study
        finished = ("phaseFinishedTimestamps" in par_data
                    and "BfiNeuroticism" in par_data["phaseFinishedTimestamps"])
        if not finished:
            continue

        print("Processing participant " + participant)

        # collect the three groups of manipulation check parameters
        merged = {}
        merged.update(process_hr_data(par_data, participant))
        merged.update(process_eda_data(par_data, participant))
        merged.update(process_self_report_data(par_data))

        man_check_data[participant] = merged

    return man_check_data

In [None]:
# collect the manipulation check values of every participant ({participant: {variable: value}})
man_check_data = create_man_check_dataset(firebase_data)

# one row per participant, one column per variable; rows without any data are dropped
man_check_by_participant = pd.DataFrame(man_check_data).T
man_check_df = man_check_by_participant.dropna(how="all")

#### --- Manipulation check procedure ----

##### Step 1: process mdbf scales

In [None]:
# items that belong to each MDBF scale (GS, RU, WM), per stress condition
mdbf_scales = {
    "GS_HS": ["MDBF_wohl_HS", "MDBF_gut_HS", "MDBF_unglÃ¼cklich_HS", "MDBF_unzufrieden_HS"],
    "RU_HS": ["MDBF_ausgeglichen_HS", "MDBF_ruhig_HS", "MDBF_angespannt_HS", "MDBF_nervÃ¶s_HS"],
    "WM_HS": ["MDBF_frisch_HS", "MDBF_wach_HS", "MDBF_schlÃ¤frig_HS", "MDBF_ermattet_HS"],
    "GS_LS": ["MDBF_wohl_LS", "MDBF_gut_LS", "MDBF_unglÃ¼cklich_LS", "MDBF_unzufrieden_LS"],
    "RU_LS": ["MDBF_ausgeglichen_LS", "MDBF_ruhig_LS", "MDBF_angespannt_LS", "MDBF_nervÃ¶s_LS"],
    "WM_LS": ["MDBF_frisch_LS", "MDBF_wach_LS", "MDBF_schlÃ¤frig_LS", "MDBF_ermattet_LS"]
}

# scale value = mean of the four items of the scale, stored as "MDBF_<scale>"
for scale_name, scale_items in mdbf_scales.items():
    item_sum = man_check_df[scale_items[0]]
    for scale_item in scale_items[1:]:
        item_sum = item_sum + man_check_df[scale_item]
    man_check_df["MDBF_" + scale_name] = item_sum / len(scale_items)

# Cronbach's alpha and its confidence interval for every scale
cronbachs_alphas = {}
for scale_name, scale_items in mdbf_scales.items():
    alpha_result = pg.cronbach_alpha(data=man_check_df.loc[:, scale_items])
    print("Cronbachs Alpha " + str(scale_name), alpha_result)
    cronbachs_alphas[scale_name] = {"alpha": alpha_result[0], "alpha_ci": alpha_result[1]}

# one row per scale with the columns "alpha" and "alpha_ci"
cronbachs_alphas_df = pd.DataFrame(cronbachs_alphas).T

##### Step 2: get some additional SAM values

In [None]:
# calc merged SAM valence and arousal scales: the mean of the ratings given after each of
# the five tasks, separately for the high-stress (HS) and the low-stress (LS) condition
sam_tasks = ["DragDrop", "Drawing", "FollowBox", "PatternTyping", "PointClick"]

for sam_condition in ("HS", "LS"):
    for sam_dimension in ("Valence", "Arousal"):
        task_columns = ["sam" + sam_dimension + "_" + sam_condition + "_SAM_" + sam_task
                        for sam_task in sam_tasks]

        rating_sum = man_check_df[task_columns[0]]
        for task_column in task_columns[1:]:
            rating_sum = rating_sum + man_check_df[task_column]

        man_check_df["merged_" + sam_dimension + "_" + sam_condition] = rating_sum / len(sam_tasks)

##### Step 3: get descriptive stats of the dataset for the selected set of variables

In [None]:
# summary statistics of every variable, with the variables sorted by name
summary_stats = man_check_df.describe()
descriptive_stats_df = summary_stats.sort_index(axis=1)

##### Step 4: perform a paired sample t-test on all relevant manipulation check items to compare them between the high-stress and low-stress condition

In [None]:
# result table of the paired sample t-tests (one row per variable), filled step by step
man_check_results = pd.DataFrame()

for hs_col in man_check_df.columns:
    # each comparison is started from the high-stress column; everything else is skipped
    if "HS" not in hs_col:
        continue

    # name of the matching low-stress column and a condition-free label for the variable
    ls_col = hs_col.replace("HS", "LS")
    var_label = hs_col.replace("HS", "")

    # paired sample t-test, with the pingouin result row re-labelled to the variable name
    ttest_row = pg.ttest(man_check_df[hs_col], man_check_df[ls_col], paired=True)
    ttest_row.index = [var_label]
    man_check_results = pd.concat([man_check_results, ttest_row])

    # distribution of the variable in both conditions in one figure
    for condition_label, condition_col in (("HS", hs_col), ("LS", ls_col)):
        sns.distplot(man_check_df[condition_col], hist=True, kde=True, kde_kws={"linewidth": 3},
                     label=condition_label)
    plt.legend(loc="upper right")
    plt.title(var_label)
    plt.show()

##### Save all manipulation check data in an excel file

In [None]:
# write the t-test results, the descriptive stats and the Cronbach's alphas to one excel
# workbook, each on its own sheet
with pd.ExcelWriter("Labstudy_Manipulation_Check_Results.xlsx") as writer:
    sheet_specs = (
        (man_check_results, "%.4f", "Paired sample t-test results"),
        (descriptive_stats_df, "%.3f", "Descriptive Stats"),
        (cronbachs_alphas_df, "%.4f", "Cronbachs_Alphas"),
    )
    for sheet_frame, number_format, sheet_title in sheet_specs:
        sheet_frame.to_excel(writer, float_format=number_format, sheet_name=sheet_title)

#### --- In addition to the manipulation check, create a dataset with the manipulation check items for the machine learning analysis. The data format needs to match the keyboard feature machine learning data format ---

In [None]:
# get the relevant manipulation check items for the machine learning dataset (to predict valence, arousal, bpm and eda
# from the keyboard typing data)

# The data needs to be structured in long format --> each condition is its own row (2 rows per participant, named
# "HS_<participant>" and "LS_<participant>")
# code is not the cleanest: The best approach would be to change the create_man_check_dataset to support the creation
# of both long and wide data (this code was added in a later step)
man_check_data_long = {}
for par, par_values in man_check_data.items():
    for condition in ("HS", "LS"):
        man_check_data_long[condition + "_" + par] = {
            # the physiological values are missing for participants with invalid data in the phase --> None
            "BPM": par_values.get(condition + "_PatternTyping_BPM"),
            "EDA": par_values.get(condition + "_PatternTyping_EDA"),
            # each label is filled from the SAM rating of the same name (the two were swapped before, so the
            # "Valence" column contained the arousal ratings and vice versa)
            "Valence": par_values["samValence_" + condition + "_SAM_PatternTyping"],
            "Arousal": par_values["samArousal_" + condition + "_SAM_PatternTyping"],
        }

# one row per participant and condition; rows without any data are dropped
man_check_data_long = pd.DataFrame(man_check_data_long).T.dropna(how="all")
man_check_data_long.to_csv("Lab_Study_Manipulation_Check_Variables_for_ML.csv", sep="\t", encoding="utf-8")

