# User-Studies 

In [697]:
import pandas as pd
import scipy.stats as stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd

## Data Processing

In [713]:
# Read all files
pre_study = pd.read_csv('Data/IN5060_fall24-pre-study.csv')
during_study = pd.read_csv("Data/IN5060_fall24-during-study.csv")
post_study = pd.read_csv('Data/IN5060_fall24-post-study.csv')

print(f"Number of rows in pre_study before processing data: {pre_study.shape[0]}")
print(f"Number of rows in during_study before processing data: {during_study.shape[0]}")
print(f"Number of rows in post_study before processing data: {post_study.shape[0]}\n")

session_id_col = "Session ID (the last session that the participant has completed)"
participant_id_col = "Participant ID (top-right corner of the screen)"
participant_id_during_col = "Participant ID (top-right corner)"
# -------------------------------------------------------------------------------------------------------------------
# ---------------------- REMOVE DIRTY DATA --------------------------------------------------------------------------
# -------------------------------------------------------------------------------------------------------------------
pre_study = pre_study[pre_study["Please indicate if you have any of the following health problems."] != "Test"]

# Remove participant id "grup16"
pre_study = pre_study[pre_study[participant_id_col] != "grup16"]
during_study = during_study[during_study[participant_id_during_col] != "grup16"]
post_study = post_study[post_study[participant_id_col] != "grup16"]

# Remove participant ids containing "group15" in any dataset
pre_study = pre_study[~pre_study[participant_id_col].str.contains("group15", case=False)]
during_study = during_study[~during_study[participant_id_during_col].str.contains("group15", case=False)]
post_study = post_study[~post_study[participant_id_col].str.contains("group15", case=False)]

# Remove duplicate session id CYSP for participant id G17P4
g17p4_cysp = during_study[(during_study[participant_id_during_col] == "G17P4") & (during_study[session_id_col] == "CYSP")]
g17p4_cysp_sorted = g17p4_cysp.sort_values(by="Timestamp")
if not g17p4_cysp_sorted.empty:
    oldest_index = g17p4_cysp_sorted.index[0]
    during_study.at[oldest_index, session_id_col] = "ALICE"

# Print if there are duplicates for CYSP session ID for participant G17P4
print(f"Duplicate 'CYSP' session IDs for participant 'G17P4': {len(g17p4_cysp) > 1}")

# Print if any participant ID in any dataset contains "grup16"
print(f"Participant with ID 'grup16' in pre_study: {'grup16' in pre_study[participant_id_col].values}")
print(f"Participant with ID 'grup16' in during_study: {'grup16' in during_study[participant_id_during_col].values}")
print(f"Participant with ID 'grup16' in post_study: {'grup16' in post_study[participant_id_col].values}")

# Print the count of rows that had "group15" in the Participant ID for each dataset
print(f"Participant with ID 'grup15' in pre_study: {pre_study[pre_study[participant_id_col].str.contains('group15', case=False)].shape[0]}")
print(f"Participant with ID 'grup15' in during_study: {during_study[during_study[participant_id_during_col].str.contains('group15', case=False)].shape[0]}")
print(f"Participant with ID 'grup15' in post_study: {post_study[post_study[participant_id_col].str.contains('group15', case=False)].shape[0]}")

# -------------------------------------------------------------------------------------------------------------------
# ---------------------- ENSURE SESSION ID EQUALS VOYAGER, ILLIAC, CYSP OR ALICE ------------------------------------
# -------------------------------------------------------------------------------------------------------------------
switch_ids = ['VOYAGER', 'ILLIAC', 'CYSP', 'ALICE']
switch_rows = during_study[during_study[participant_id_during_col]
                            .str.contains('|'.join(switch_ids), case=False, na=False)]

# Swap Participant ID and Session ID for these rows
for index, row in switch_rows.iterrows():
    during_study.at[index, participant_id_during_col] = row[session_id_col]
    during_study.at[index, session_id_col] = row[participant_id_during_col]

# Fix writing mistakes
replacements = {
    r'(?i)ALLICE': 'ALICE',
    r'(?i)iliac': 'ILLIAC',
    r'(?i)ILLIAD': 'ILLIAC',
    r'(?i)Crysp': 'CYSP'
}

# Replace session IDs using the dictionary
for old_value, new_value in replacements.items():
    during_study[session_id_col] = \
        during_study[session_id_col].replace(
            to_replace=old_value, value=new_value, regex=True)

# After fixing swap and correcting writing, how many invalid sessions are left?
def get_invalid_sessions(df):
    return df[~df[session_id_col]
              .str.contains("VOYAGER|ILLIAC|CYSP|ALICE", case=False, na=False)]

invalid_sessions = get_invalid_sessions(during_study)
print(f"\nRows in during_study with invalid session IDs before: {invalid_sessions.shape[0]}")

# Invalid sessions contain 35 rows, they have been investigated and we can drop them
during_study = during_study.drop(invalid_sessions.index)

invalid_sessions = get_invalid_sessions(during_study)
print(f"Rows in during_study with invalid session IDs after: {invalid_sessions.shape[0]}")

# -------------------------------------------------------------------------------------------------------------------
# ---------------------- REMOVING PARTICIPANTS WHERE SESSION COUNT IS NOT 4 -----------------------------------------
# -------------------------------------------------------------------------------------------------------------------
# Group by Participant ID and check the number of unique session IDs
session_count_per_participant = during_study.groupby(participant_id_during_col)[session_id_col].nunique()

# Identify valid participants who have 4 unique session IDs
valid_participants = session_count_per_participant[session_count_per_participant == 4].index

# Display participants with valid session counts
print(f"Participants in during_study with valid session counts (equal to 4): {valid_participants.shape[0]} = {valid_participants.shape[0]*4} rows")

# Only keep valid participants
during_study = during_study.query("`Participant ID (top-right corner)` in @valid_participants")

# -------------------------------------------------------------------------------------------------------------------
# ---------------------- REMOVE INVALID PARTICIPANTS FROM PRE AND POST STUDY ----------------------------------------
# -------------------------------------------------------------------------------------------------------------------
valid_participant_ids = during_study[participant_id_during_col].unique()

# Remove rows in pre_study and post_study where the participant ID is not present in during_study
pre_study = pre_study[pre_study[participant_id_col].isin(valid_participant_ids)]
post_study = post_study[post_study[participant_id_col].isin(valid_participant_ids)]

# Pre and post study should have the same amount of participants
common_participant_ids_pre = pre_study[participant_id_col].isin(post_study[participant_id_col])
common_participant_ids_post = post_study[participant_id_col].isin(pre_study[participant_id_col])

pre_study = pre_study[common_participant_ids_pre]
post_study = post_study[common_participant_ids_post]

# -------------------------------------------------------------------------------------------------------------------
print(f"\nNumber of rows in pre_study before processing data: {pre_study.shape[0]}")
print(f"Number of rows in during_study before processing data: {during_study.shape[0]}")
print(f"Number of rows in post_study before processing data: {post_study.shape[0]}")

# Display all columns and expand display width
pd.set_option('display.max_columns', None)     # Show all columns
pd.set_option('display.width', 1000)           # Set the display width to fit large tables
pd.set_option('display.max_colwidth', None)    # Show full column content without truncation
pd.set_option('display.max_rows', None)        # Show all rows (remove this if there are too many rows)

print("\n")
#print(during_study.head(10))

Number of rows in pre_study before processing data: 154
Number of rows in during_study before processing data: 469
Number of rows in post_study before processing data: 122

Duplicate 'CYSP' session IDs for participant 'G17P4': False
Participant with ID 'grup16' in pre_study: False
Participant with ID 'grup16' in during_study: False
Participant with ID 'grup16' in post_study: False
Participant with ID 'grup15' in pre_study: 0
Participant with ID 'grup15' in during_study: 0
Participant with ID 'grup15' in post_study: 0

Rows in during_study with invalid session IDs before: 35
Rows in during_study with invalid session IDs after: 0
Participants in during_study with valid session counts (equal to 4): 81 = 324 rows

Number of rows in pre_study before processing data: 83
Number of rows in during_study before processing data: 324
Number of rows in post_study before processing data: 78




# Pre Study

# During Study

Note to Caroline: As an initial test i wanted to checkout if i managed to get anything out of this ANOVA thing. So, i started out by looking at the one-way ANOVA test in the "during-study" and looked at the "I felt excited" data. I looked at the SessionIDs and tried to use the ANOVA test to determine if there is any sort of differences in "Excitement" between the different SessionId. It looks like there are statistical differences, revealed by the p-value, and a post-hoc analysis is required. 

We can probably look at other "feelings" and see how those look like across the SesisonIds. I guess we are using the SessionIds (the different rhythm profiles) as the base groups but idk if we should look at some other way of grouping? SessionIds seems to be alright for now, we can probably provide some analysis on that and say f.ex "people really liked the ILLIAC profile" 

In [None]:
# Perform ANOVA on "I felt excited" across different "Session ID" groups
anova_results = stats.f_oneway(
    *[during_study[during_study['Session ID (the last session that the participant has completed)'] == session]['I felt excited']
      for session in during_study['Session ID (the last session that the participant has completed)'].unique()]
)

print("F-statistic:", anova_results.statistic)
print("p-value:", anova_results.pvalue)

The ANOVA test results show an F-statistic of approximately 1.76 and a p-value of 0.0066. Since the p-value is below the typical significance level (e.g., 0.05), we can conclude that there are statistically significant differences in excitement levels across the different session types.

In [None]:
# Prepare data for Tukey's HSD test
session_data = during_study[['Session ID (the last session that the participant has completed)', 'I felt excited']].dropna()
session_data.columns = ['SessionID', 'Excitement']

tukey_results = pairwise_tukeyhsd(endog=session_data['Excitement'], groups=session_data['SessionID'], alpha=0.05)

tukey_summary = tukey_results.summary()

# Convert the summary to a pandas DataFrame for easier viewing
tukey_df = pd.DataFrame(data=tukey_summary.data[1:], columns=tukey_summary.data[0])

print(tukey_df)

# Display the results
#print(tukey_results.summary())

# Post Study