# User-Studies 

In [299]:
import pandas as pd
import scipy.stats as stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd

## Data

In [301]:
# Read all files
pre_study = pd.read_csv('Data/IN5060_fall24-pre-study.csv')
during_study = pd.read_csv("Data/IN5060_fall24-during-study.csv")
post_study = pd.read_csv('Data/IN5060_fall24-post-study.csv')

# Remove test data
pre_study = pre_study[pre_study["Please indicate if you have any of the following health problems."] != "Test"]
during_study = during_study[during_study["Session ID (the last session that the participant has completed)"] != "Test"]

# Remove participant id "grup16"
pre_study = pre_study[pre_study["Participant ID (top-right corner of the screen)"] != "grup16"]
during_study = during_study[during_study["Participant ID (top-right corner)"] != "grup16"]
post_study = post_study[post_study["Participant ID (top-right corner of the screen)"] != "grup16"]

# Remove participant ids containing "group15" in any dataset
pre_study = pre_study[~pre_study["Participant ID (top-right corner of the screen)"].str.contains("group15", case=False)]
during_study = during_study[~during_study["Participant ID (top-right corner)"].str.contains("group15", case=False)]
post_study = post_study[~post_study["Participant ID (top-right corner of the screen)"].str.contains("group15", case=False)]

# Remove duplicate session id CYSP for participant id G17P4
g17p4_cysp = during_study[(during_study["Participant ID (top-right corner)"] == "G17P4") & (during_study["Session ID (the last session that the participant has completed)"] == "CYSP")]
g17p4_cysp_sorted = g17p4_cysp.sort_values(by="Timestamp")
if not g17p4_cysp_sorted.empty:
    oldest_index = g17p4_cysp_sorted.index[0]
    during_study.at[oldest_index, "Session ID (the last session that the participant has completed)"] = "ALICE"

# Print if any session ID in during_study contains "Test"
print(f"Session ID 'Test' in during_study: {'Test' in during_study['Session ID (the last session that the participant has completed)'].values}")

# Print if any participant ID in any dataset contains "grup16"
print(f"Participant with ID 'grup16' in pre_study: {'grup16' in pre_study['Participant ID (top-right corner of the screen)'].values}")
print(f"Participant with ID 'grup16' in during_study: {'grup16' in during_study['Participant ID (top-right corner)'].values}")
print(f"Participant with ID 'grup16' in post_study: {'grup16' in post_study['Participant ID (top-right corner of the screen)'].values}")

# Print if there are duplicates for CYSP session ID for participant G17P4
print(f"Duplicate 'CYSP' session IDs for participant 'G17P4': {len(g17p4_cysp) > 1}")

# Print the count of rows that had "group15" in the Participant ID for each dataset
print(f"Pre-study dataset rows with 'group15': {pre_study[pre_study['Participant ID (top-right corner of the screen)'].str.contains('group15', case=False)].shape[0]}")
print(f"During-study dataset rows with 'group15': {during_study[during_study['Participant ID (top-right corner)'].str.contains('group15', case=False)].shape[0]}")
print(f"Post-study dataset rows with 'group15': {post_study[post_study['Participant ID (top-right corner of the screen)'].str.contains('group15', case=False)].shape[0]}")

# Display all columns and expand display width
pd.set_option('display.max_columns', None)     # Show all columns
pd.set_option('display.width', 1000)           # Set the display width to fit large tables
pd.set_option('display.max_colwidth', None)    # Show full column content without truncation
pd.set_option('display.max_rows', None)        # Show all rows (remove this if there are too many rows)

print("\n")
print(during_study.head(10))

Session ID 'Test' in during_study: False
Participant with ID 'grup16' in pre_study: False
Participant with ID 'grup16' in during_study: False
Participant with ID 'grup16' in post_study: False
Duplicate 'CYSP' session IDs for participant 'G17P4': False
Pre-study dataset rows with 'group15': 0
During-study dataset rows with 'group15': 0
Post-study dataset rows with 'group15': 0


                       Timestamp Session ID (the last session that the participant has completed) Participant ID (top-right corner)  I felt bored  I felt annoyed  I felt excited  I felt at ease  I felt satisfied  I felt curious  I felt stressed  I felt tired
0    2024/11/05 5:50:02 PM GMT+1                                                           ILLIAC                         andsti-01             1               2               3               1                 2               3                2             4
1    2024/11/05 5:54:41 PM GMT+1                                                             CYSP    

# Pre Study

# During Study

Note to Caroline: As an initial test i wanted to checkout if i managed to get anything out of this ANOVA thing. So, i started out by looking at the one-way ANOVA test in the "during-study" and looked at the "I felt excited" data. I looked at the SessionIDs and tried to use the ANOVA test to determine if there is any sort of differences in "Excitement" between the different SessionId. It looks like there are statistical differences, revealed by the p-value, and a post-hoc analysis is required. 

We can probably look at other "feelings" and see how those look like across the SesisonIds. I guess we are using the SessionIds (the different rhythm profiles) as the base groups but idk if we should look at some other way of grouping? SessionIds seems to be alright for now, we can probably provide some analysis on that and say f.ex "people really liked the ILLIAC profile" 

In [305]:
# Perform ANOVA on "I felt excited" across different "Session ID" groups
anova_results = stats.f_oneway(
    *[during_study[during_study['Session ID (the last session that the participant has completed)'] == session]['I felt excited']
      for session in during_study['Session ID (the last session that the participant has completed)'].unique()]
)

print("F-statistic:", anova_results.statistic)
print("p-value:", anova_results.pvalue)

F-statistic: 1.5894817690382161
p-value: 0.023896211380225837


The ANOVA test results show an F-statistic of approximately 1.76 and a p-value of 0.0066. Since the p-value is below the typical significance level (e.g., 0.05), we can conclude that there are statistically significant differences in excitement levels across the different session types.

In [307]:
# Prepare data for Tukey's HSD test
session_data = during_study[['Session ID (the last session that the participant has completed)', 'I felt excited']].dropna()
session_data.columns = ['SessionID', 'Excitement']

tukey_results = pairwise_tukeyhsd(endog=session_data['Excitement'], groups=session_data['SessionID'], alpha=0.05)

tukey_summary = tukey_results.summary()

# Convert the summary to a pandas DataFrame for easier viewing
tukey_df = pd.DataFrame(data=tukey_summary.data[1:], columns=tukey_summary.data[0])

print(tukey_df)

# Display the results
#print(tukey_results.summary())

           group1        group2  meandiff   p-adj   lower   upper  reject
0           ALICE       VOYAGER   -1.5000  1.0000 -6.7761  3.7761   False
1           ALICE             1   -0.8750  1.0000 -4.2807  2.5307   False
2           ALICE             2   -1.2143  1.0000 -4.6683  2.2397   False
3           ALICE             3   -1.5000  0.9989 -4.8676  1.8676   False
4           ALICE             4   -2.3333  0.7946 -5.8507  1.1840   False
5           ALICE         ALICE   -0.9578  1.0000 -4.0404  2.1248   False
6           ALICE        ALLICE    0.5000  1.0000 -3.8079  4.8079   False
7           ALICE         Alice    0.2500  1.0000 -3.4807  3.9807   False
8           ALICE          CYSP   -1.1630  1.0000 -4.2421  1.9160   False
9           ALICE         Crysp    0.5000  1.0000 -4.7761  5.7761   False
10          ALICE          Cysp    1.0000  1.0000 -3.3079  5.3079   False
11          ALICE         G17P1    0.5000  1.0000 -4.7761  5.7761   False
12          ALICE       Grugru2   -1.5

# Post Study