# User-Studies 

In [118]:
import pandas as pd
import scipy.stats as stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd

## Data Processing

In [120]:
# Read all files
pre_study = pd.read_csv('Data/IN5060_fall24-pre-study.csv')
during_study = pd.read_csv("Data/IN5060_fall24-during-study.csv")
post_study = pd.read_csv('Data/IN5060_fall24-post-study.csv')

print(f"Number of rows in pre_study before processing data: {pre_study.shape[0]}")
print(f"Number of rows in during_study before processing data: {during_study.shape[0]}")
print(f"Number of rows in post_study before processing data: {post_study.shape[0]}")
print(f"\n-------------------------------------------------------------------------------------------------------\n")

session_id_col = "Session ID (the last session that the participant has completed)"
participant_id_col = "Participant ID (top-right corner of the screen)"
participant_id_during_col = "Participant ID (top-right corner)"

# -------------------------------------------------------------------------------------------------------------------
# ---------------------- REMOVE DIRTY DATA --------------------------------------------------------------------------
# -------------------------------------------------------------------------------------------------------------------
pre_study = pre_study[pre_study["Please indicate if you have any of the following health problems."] != "Test"]
during_study = during_study[~during_study[participant_id_during_col].str.contains("test", case=False, na=False)]

# Remove participant id "grup16"
pre_study = pre_study[pre_study[participant_id_col] != "grup16"]
during_study = during_study[during_study[participant_id_during_col] != "grup16"]
post_study = post_study[post_study[participant_id_col] != "grup16"]

# Remove participant ids containing "group15" in any dataset
pre_study = pre_study[~pre_study[participant_id_col].str.contains("group15", case=False)]
during_study = during_study[~during_study[participant_id_during_col].str.contains("group15", case=False)]
post_study = post_study[~post_study[participant_id_col].str.contains("group15", case=False)]

# Remove duplicate session id CYSP for participant id G17P4
g17p4_cysp = during_study[(during_study[participant_id_during_col] == "G17P4") & (during_study[session_id_col] == "CYSP")]
g17p4_cysp_sorted = g17p4_cysp.sort_values(by="Timestamp")
if not g17p4_cysp_sorted.empty:
    oldest_index = g17p4_cysp_sorted.index[0]
    during_study.at[oldest_index, session_id_col] = "ALICE"

# Print if there are duplicates for CYSP session ID for participant G17P4
print(f"Duplicate 'CYSP' session IDs for participant 'G17P4': {len(g17p4_cysp) > 1}")

# Print if any participant ID in any dataset contains "grup16"
print(f"Participant with ID 'grup16' in pre_study: {'grup16' in pre_study[participant_id_col].values}")
print(f"Participant with ID 'grup16' in during_study: {'grup16' in during_study[participant_id_during_col].values}")
print(f"Participant with ID 'grup16' in post_study: {'grup16' in post_study[participant_id_col].values}")

# Print the count of rows that had "group15" in the Participant ID for each dataset
print(f"Participant with ID 'grup15' in pre_study: {pre_study[pre_study[participant_id_col].str.contains('group15', case=False)].shape[0]}")
print(f"Participant with ID 'grup15' in during_study: {during_study[during_study[participant_id_during_col].str.contains('group15', case=False)].shape[0]}")
print(f"Participant with ID 'grup15' in post_study: {post_study[post_study[participant_id_col].str.contains('group15', case=False)].shape[0]}")

# -------------------------------------------------------------------------------------------------------------------
# ---------------------- ENSURE SESSION ID EQUALS VOYAGER, ILLIAC, CYSP OR ALICE ------------------------------------
# -------------------------------------------------------------------------------------------------------------------
switch_ids = ['VOYAGER', 'ILLIAC', 'CYSP', 'ALICE']
switch_rows = during_study[during_study[participant_id_during_col]
                            .str.contains('|'.join(switch_ids), case=False, na=False)]

# Swap Participant ID and Session ID for these rows
for index, row in switch_rows.iterrows():
    during_study.at[index, participant_id_during_col] = row[session_id_col]
    during_study.at[index, session_id_col] = row[participant_id_during_col]

# Fix writing mistakes
replacements = {
    r'(?i)ALLICE': 'ALICE',
    r'(?i)iliac': 'ILLIAC',
    r'(?i)ILLIAD': 'ILLIAC',
    r'(?i)Crysp': 'CYSP'
}

# Replace session IDs using the dictionary
for old_value, new_value in replacements.items():
    during_study[session_id_col] = \
        during_study[session_id_col].replace(
            to_replace=old_value, value=new_value, regex=True)

# After fixing swap and correcting writing, how many invalid sessions are left?
def get_invalid_sessions(df):
    return df[~df[session_id_col]
              .str.contains("VOYAGER|ILLIAC|CYSP|ALICE", case=False, na=False)]

invalid_sessions = get_invalid_sessions(during_study)
print(f"\nRows in during_study with invalid session IDs before: {invalid_sessions.shape[0]}")

# Invalid sessions contain 35 rows, they have been investigated and we can drop them
during_study = during_study.drop(invalid_sessions.index)

invalid_sessions = get_invalid_sessions(during_study)
print(f"Rows in during_study with invalid session IDs after: {invalid_sessions.shape[0]}")

# -------------------------------------------------------------------------------------------------------------------
# ---------------------- REMOVING PARTICIPANTS WHERE SESSION COUNT IS NOT 4 -----------------------------------------
# -------------------------------------------------------------------------------------------------------------------
# Group by Participant ID and check the number of unique session IDs
session_count_per_participant = during_study.groupby(participant_id_during_col)[session_id_col].nunique()

# Identify valid participants who have 4 unique session IDs
valid_participants = session_count_per_participant[session_count_per_participant == 4].index

# Display participants with valid session counts
print(f"Participants in during_study with valid session counts (equal to 4): {valid_participants.shape[0]} = {valid_participants.shape[0]*4} rows")

# Only keep valid participants
during_study = during_study.query(f"`{participant_id_during_col}` in @valid_participants")

# -------------------------------------------------------------------------------------------------------------------
# ---------------------- REMOVE INVALID PARTICIPANTS FROM PRE AND POST STUDY ----------------------------------------
# -------------------------------------------------------------------------------------------------------------------
valid_participant_ids = during_study[participant_id_during_col].unique()

# Remove rows in pre_study and post_study where the participant ID is not present in during_study
pre_study = pre_study[pre_study[participant_id_col].isin(valid_participant_ids)]
post_study = post_study[post_study[participant_id_col].isin(valid_participant_ids)]

# Pre and post study should have the same amount of participants and no duplicates
pre_study = pre_study.sort_values(by="Timestamp", ascending=False)
pre_study = pre_study.drop_duplicates(subset=participant_id_col, keep='first')
post_study = post_study.sort_values(by="Timestamp", ascending=False)
post_study = post_study.drop_duplicates(subset=participant_id_col, keep='first')

common_participant_ids_pre = pre_study[participant_id_col].isin(post_study[participant_id_col])
common_participant_ids_post = post_study[participant_id_col].isin(pre_study[participant_id_col])

# This leaves us with 76 participants
pre_study = pre_study[common_participant_ids_pre]
post_study = post_study[common_participant_ids_post]

# During_study should have 304 entries, we have 316 -> find the duplicates
pre_post_participant_ids = set(pre_study[participant_id_col].unique()).intersection(post_study[participant_id_col].unique())
during_not_in_pre_post = set(valid_participant_ids) - pre_post_participant_ids
print(f"\nParticipant IDs present in during_study but not in pre_study or post_study: {during_not_in_pre_post}")

participants_to_remove = {'JOJOSIWA', 'g17p2', 'G17P6'}
during_study = during_study[~during_study[participant_id_during_col].isin(participants_to_remove)]
print("Number of rows in during_study after removing invalid participants:", len(during_study))

# -------------------------------------------------------------------------------------------------------------------
print(f"\n-------------------------------------------------------------------------------------------------------\n")
print(f"Number of rows in pre_study before processing data: {pre_study.shape[0]}")
print(f"Number of rows in during_study before processing data: {during_study.shape[0]}")
print(f"Number of rows in post_study before processing data: {post_study.shape[0]}")

# Display all columns and expand display width
pd.set_option('display.max_columns', None)     # Show all columns
pd.set_option('display.width', 1000)           # Set the display width to fit large tables
pd.set_option('display.max_colwidth', None)    # Show full column content without truncation
pd.set_option('display.max_rows', None)        # Show all rows (remove this if there are too many rows)

print("\n")
#print(during_study.head(10))

Number of rows in pre_study before processing data: 154
Number of rows in during_study before processing data: 469
Number of rows in post_study before processing data: 122

-------------------------------------------------------------------------------------------------------

Duplicate 'CYSP' session IDs for participant 'G17P4': False
Participant with ID 'grup16' in pre_study: False
Participant with ID 'grup16' in during_study: False
Participant with ID 'grup16' in post_study: False
Participant with ID 'grup15' in pre_study: 0
Participant with ID 'grup15' in during_study: 0
Participant with ID 'grup15' in post_study: 0

Rows in during_study with invalid session IDs before: 35
Rows in during_study with invalid session IDs after: 0
Participants in during_study with valid session counts (equal to 4): 79 = 316 rows

Participant IDs present in during_study but not in pre_study or post_study: {'JOJOSIWA', 'g17p2', 'G17P6'}
Number of rows in during_study after removing invalid participants: 

# Testing data

In [122]:
# Role
print(f"\n--------------------------------------What describes you the best?---------------------------------------------")
column_of_interest = "What describes you the best?"
answers_to_count = [
    "Student/semi-professional musician",
    "Avid music listener",
    "Professional musician",
    "Not particularly interested in music"
]

counts = {answer: pre_study[pre_study[column_of_interest] == answer].shape[0] for answer in answers_to_count}
for answer, count in counts.items():
    print(f"{answer}: {count} participants")

# Gender - this is skewed as there are more than double the amount of males - could run statistical significance test
print(f"\n------------------------------------------------Gender?--------------------------------------------------------")
column_of_interest = "Gender?"
answers_to_count = [
    "Prefer not to say",
    "Male",
    "Female"
]

counts = {answer: pre_study[pre_study[column_of_interest] == answer].shape[0] for answer in answers_to_count}
for answer, count in counts.items():
    print(f"{answer}: {count} participants")

# Comments
print(f"\n------------------------Please add if you have any comments, suggestions, or requests.-------------------------")
column_of_interest = "Please add if you have any comments, suggestions, or requests."
non_empty_comments = post_study[post_study[column_of_interest].str.strip().ne("") & post_study[column_of_interest].notna()]

print("Non-empty comments, suggestions, or requests:")
for comment in non_empty_comments[column_of_interest]:
    print(f"- {comment.strip()}")




--------------------------------------What describes you the best?---------------------------------------------
Student/semi-professional musician: 9 participants
Avid music listener: 32 participants
Professional musician: 1 participants
Not particularly interested in music: 7 participants

------------------------------------------------Gender?--------------------------------------------------------
Prefer not to say: 0 participants
Male: 52 participants
Female: 24 participants

------------------------Please add if you have any comments, suggestions, or requests.-------------------------
Non-empty comments, suggestions, or requests:
- fix the tracking landscape. It's invisible and gets in the way when doing almost anything that requires the mouse. The consistent downloading makes it very hard to keep tapping in rhythm due to download pop-ups.
- The db felt broken and unfinished, fun and frustrating at the same time.
- It would be more fun with more instruments and if the db actually

# Pre Study

# During Study

Note to Caroline: As an initial test i wanted to checkout if i managed to get anything out of this ANOVA thing. So, i started out by looking at the one-way ANOVA test in the "during-study" and looked at the "I felt excited" data. I looked at the SessionIDs and tried to use the ANOVA test to determine if there is any sort of differences in "Excitement" between the different SessionId. It looks like there are statistical differences, revealed by the p-value, and a post-hoc analysis is required. 

We can probably look at other "feelings" and see how those look like across the SesisonIds. I guess we are using the SessionIds (the different rhythm profiles) as the base groups but idk if we should look at some other way of grouping? SessionIds seems to be alright for now, we can probably provide some analysis on that and say f.ex "people really liked the ILLIAC profile" 

In [126]:
# Perform ANOVA on "I felt excited" across different "Session ID" groups
anova_results = stats.f_oneway(
    *[during_study[during_study['Session ID (the last session that the participant has completed)'] == session]['I felt excited']
      for session in during_study['Session ID (the last session that the participant has completed)'].unique()]
)

print("F-statistic:", anova_results.statistic)
print("p-value:", anova_results.pvalue)

F-statistic: 2.4253079621397267
p-value: 0.006674073237232363


The ANOVA test results show an F-statistic of approximately 1.76 and a p-value of 0.0066. Since the p-value is below the typical significance level (e.g., 0.05), we can conclude that there are statistically significant differences in excitement levels across the different session types.

In [128]:
# Prepare data for Tukey's HSD test
session_data = during_study[['Session ID (the last session that the participant has completed)', 'I felt excited']].dropna()
session_data.columns = ['SessionID', 'Excitement']

tukey_results = pairwise_tukeyhsd(endog=session_data['Excitement'], groups=session_data['SessionID'], alpha=0.05)

tukey_summary = tukey_results.summary()

# Convert the summary to a pandas DataFrame for easier viewing
tukey_df = pd.DataFrame(data=tukey_summary.data[1:], columns=tukey_summary.data[0])

print(tukey_df)

# Display the results
#print(tukey_results.summary())

     group1   group2  meandiff   p-adj   lower   upper  reject
0     ALICE    ALICE   -0.9242  0.9936 -3.6453  1.7968   False
1     ALICE    Alice    0.1667  1.0000 -3.2942  3.6275   False
2     ALICE     CYSP   -1.0735  0.9786 -3.7934  1.6464   False
3     ALICE     Cysp    1.5000  0.9959 -3.1432  6.1432   False
4     ALICE   ILLIAC   -0.8188  0.9978 -3.5382  1.9005   False
5     ALICE   Illiac   -1.5000  0.9959 -6.1432  3.1432   False
6     ALICE  VOYAGER   -1.3955  0.8715 -4.1160  1.3249   False
7     ALICE    alice    0.1000  1.0000 -3.0719  3.2719   False
8     ALICE     cysp   -0.5000  1.0000 -3.5397  2.5397   False
9     ALICE   illiac   -1.1667  0.9851 -4.2621  1.9288   False
10    ALICE  voyager   -0.6111  0.9999 -3.5748  2.3526   False
11    ALICE    Alice    1.0909  0.9058 -1.1471  3.3289   False
12    ALICE     CYSP   -0.1493  0.9998 -0.8044  0.5058   False
13    ALICE     Cysp    2.4242  0.6296 -1.3955  6.2440   False
14    ALICE   ILLIAC    0.1054  1.0000 -0.5473  0.7581 

# Post Study