# Developing a hypothesis for performance decline in a session and proving it using statistical tests
<b>Null Hypothesis (H₀)</b>: The average accuracy/WPM/consistency in the first half of a session is greater than or equal to the average accuracy/WPM/consistency in the second half of the session.

<b>Alternative Hypothesis (H₁)</b>: The average accuracy/WPM/consistency in the first half of a session is lower than the average accuracy/WPM/consistency in the second half of the session.

In [6]:
import gdown
# Google Drive folder that holds the per-subject result CSVs.
folder_id = "1G7YDTrD_UNCUFiOs5ed4JV1QtRXaAEU0"
output = "datasets"

# Download every file in that folder into the local `output` directory.
gdown.download_folder(
    url=f"https://drive.google.com/drive/folders/{folder_id}",
    output=output,
    quiet=False,
    use_cookies=False,
)

Retrieving folder contents


Processing file 1DAPRaVIVUmJME2LHfNfwwveB4mk5VOPx Subject_A_Results.csv
Processing file 1hf-mQ-rTuKlFlkMd8Nld77s8HbMs01NV Subject_B_Results.csv
Processing file 1zOQNeWfoew5LoT2AbSHqXlpRVYfiNKYg Subject_C_Results.csv
Processing file 18gvXYZ15sfvGkPIJTYb3FLwwTAdvesCr Subject_D_Results.csv
Processing file 19R_ckXXDHpanEX04UqNfwlA90jFxPaDR Subject_E_Results.csv
Processing file 1o2fkODEhX_4ObhHX1ghVAgk390BvCYoD Subject_F_Results.csv
Processing file 1tSKEy03Tcjiw9iJjpxH03plDjZRsgZZh Subject_G_Results.csv
Processing file 18NMwG0OHknGMDfFczZ5xntwFGEee_rcj Subject_H_Results.csv
Processing file 16uSR_2cqPTtCyUklqqttsNPZvrzads-k Subject_I_Results.csv
Processing file 1BnJ8m-YIVU_D7rXj46F4oLn1PMeXrOP1 Subject_J_Results.csv
Processing file 1RjLMAKuzuYk4E32wY6MvqP21WnmuKkVn Subject_K_Results.csv
Processing file 16PBRPMRWzpVjBJxs78zK5iaQ-Dc_vsjF Subject_L_Results.csv
Processing file 1JRc1mz1ODZeURfQGaMXdLkv9F73vKZ1z Subject_M_Results.csv
Processing file 1Aib17o9nXQAYNWiDwo-E0wd_NBvnOl9U Subject_N_Resu

Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=1DAPRaVIVUmJME2LHfNfwwveB4mk5VOPx
To: /content/datasets/Subject_A_Results.csv
100%|██████████| 67.6k/67.6k [00:00<00:00, 59.8MB/s]
Downloading...
From: https://drive.google.com/uc?id=1hf-mQ-rTuKlFlkMd8Nld77s8HbMs01NV
To: /content/datasets/Subject_B_Results.csv
100%|██████████| 152k/152k [00:00<00:00, 74.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=1zOQNeWfoew5LoT2AbSHqXlpRVYfiNKYg
To: /content/datasets/Subject_C_Results.csv
100%|██████████| 147k/147k [00:00<00:00, 40.3MB/s]
Downloading...
From: https://drive.google.com/uc?id=18gvXYZ15sfvGkPIJTYb3FLwwTAdvesCr
To: /content/datasets/Subject_D_Results.csv
100%|██████████| 155k/155k [00:00<00:00, 28.9MB/s]
Downloading...
From: https://drive.google.com/uc?id=19R_ckXXDHpanEX04UqNfwlA90jFxPaDR
To: /content/datasets/Subject_E_Results.csv
100%|██████████| 157k/157k [00:00<00:00

['datasets/Subject_A_Results.csv',
 'datasets/Subject_B_Results.csv',
 'datasets/Subject_C_Results.csv',
 'datasets/Subject_D_Results.csv',
 'datasets/Subject_E_Results.csv',
 'datasets/Subject_F_Results.csv',
 'datasets/Subject_G_Results.csv',
 'datasets/Subject_H_Results.csv',
 'datasets/Subject_I_Results.csv',
 'datasets/Subject_J_Results.csv',
 'datasets/Subject_K_Results.csv',
 'datasets/Subject_L_Results.csv',
 'datasets/Subject_M_Results.csv',
 'datasets/Subject_N_Results.csv',
 'datasets/Subject_O_Results.csv',
 'datasets/Subject_P_Results.csv',
 'datasets/Subject_Q_Results.csv',
 'datasets/Subject_R_Results.csv',
 'datasets/Subject_S_Results.csv',
 'datasets/Subject_T_Results.csv',
 'datasets/Subject_U_Results.csv',
 'datasets/Subject_V_Results.csv']

In [9]:
import pandas as pd
import glob
from scipy.stats import ttest_rel
import numpy as np

def calculate_cohens_d(group1, group2):
    """Return Cohen's d for two paired samples.

    The effect size is the mean of the element-wise differences
    ``group1 - group2`` divided by their sample standard deviation
    (``ddof=1``).
    """
    paired_diff = np.asarray(group1) - np.asarray(group2)
    return paired_diff.mean() / paired_diff.std(ddof=1)

def split_and_test(aggregated_sessions, *, alternative="two-sided"):
    """Compare first-half and second-half session means with paired t-tests.

    Each session with at least 8 rows is split at ``len(session) // 2`` (for
    an odd length the extra row goes to the second half). The mean of the
    ``wpm``, ``acc`` and ``consistency`` columns is taken over each half, and
    the per-session (first half, second half) pairs are compared with
    ``scipy.stats.ttest_rel``. The t-statistic, p-value and Cohen's d of each
    metric are printed; nothing is returned.

    Args:
        aggregated_sessions: Iterable of DataFrames, one per session, each
            with ``wpm``, ``acc`` and ``consistency`` columns.
        alternative: Passed to ``ttest_rel``. The default ``"two-sided"``
            keeps the original behaviour; ``"less"`` / ``"greater"`` test
            whether the first-half mean is lower / higher than the
            second-half mean.
    """
    # (column name, heading printed above the t-test, label in the effect-size line)
    metrics = (
        ("wpm", "Overall Typing Speed (WPM):", "WPM"),
        ("acc", "Overall Accuracy:", "Accuracy"),
        ("consistency", "Overall Consistency: ", "Consistency"),
    )
    first_half_means = {column: [] for column, _, _ in metrics}
    second_half_means = {column: [] for column, _, _ in metrics}

    for session in aggregated_sessions:
        if len(session) < 8:
            continue

        mid_point = len(session) // 2
        first_half = session.iloc[:mid_point]
        second_half = session.iloc[mid_point:]

        for column, _, _ in metrics:
            first_half_means[column].append(first_half[column].mean())
            second_half_means[column].append(second_half[column].mean())

    # A paired t-test and a ddof=1 standard deviation both need at least two
    # pairs; with fewer they only produce NaNs and warnings.
    n_sessions = len(first_half_means["wpm"])
    if n_sessions < 2:
        print(
            "Not enough sessions with at least 8 tests to run the paired "
            f"tests (found {n_sessions}, need at least 2)."
        )
        return

    # Run every test before printing so a failure leaves no partial report.
    test_results = [
        ttest_rel(
            first_half_means[column],
            second_half_means[column],
            alternative=alternative,
        )
        for column, _, _ in metrics
    ]
    for (_, heading, _), (t_stat, p_value) in zip(metrics, test_results):
        print(heading)
        print(f"t-statistic = {t_stat:.3f}, p-value = {p_value:.3f}")
    print("-" * 50)

    effect_sizes = [
        calculate_cohens_d(first_half_means[column], second_half_means[column])
        for column, _, _ in metrics
    ]
    for (_, _, label), cohen_d in zip(metrics, effect_sizes):
        print(f"Effect Size (Cohen's d) for {label}: {cohen_d:.3f}")


def subject_session_details(data, window_size=30 * 60 * 1000, min_tests=8):
    """Group one subject's rows into sessions and keep the long ones.

    Rows are scanned in the order they appear in ``data``. A session starts
    at a row and takes in every following row whose ``timestamp`` is less
    than ``window_size`` away from that first row; the first row outside the
    window starts the next session. The number of kept sessions is printed.

    Args:
        data: DataFrame with a numeric ``timestamp`` column.
        window_size: Session length, in the same unit as ``timestamp``. The
            default is 30 minutes expressed in milliseconds.
        min_tests: Sessions with fewer rows than this are discarded.

    Returns:
        List of DataFrames, one per kept session. Each is an ``iloc`` slice
        of ``data``, so the original index labels and dtypes are preserved.
    """
    sessions = []

    if len(data) > 0:
        timestamps = data['timestamp'].to_numpy()
        start = 0
        for pos in range(1, len(timestamps)):
            # Written as `not (... < ...)` rather than `>=` so that a NaN
            # timestamp still closes the current session.
            if not abs(timestamps[pos] - timestamps[start]) < window_size:
                sessions.append(data.iloc[start:pos])
                start = pos
        sessions.append(data.iloc[start:])  # Add the last session

    sessions = [session for session in sessions if len(session) >= min_tests]
    print(f"Number of sessions with at least {min_tests} tests: {len(sessions)}")

    return sessions

def process_csv_files(folder_path):
    """Pool the sessions of every CSV in `folder_path` and run the paired tests.

    Each file is read and split into sessions on its own; a file that is
    empty or fails to process is reported and skipped. The pooled sessions
    are then handed to `split_and_test`.
    """
    pooled_sessions = []

    for csv_path in glob.glob(f"{folder_path}/*.csv"):
        try:
            print(f"Processing file: {csv_path}")
            frame = pd.read_csv(csv_path)
            pooled_sessions += subject_session_details(frame)
        except pd.errors.EmptyDataError:
            print(f"Warning: File '{csv_path}' is empty. Skipping.")
        except Exception as err:
            print(f"An error occurred while processing '{csv_path}': {err}")

    split_and_test(pooled_sessions)

# Run the first-half vs second-half comparison over every CSV in "datasets".
process_csv_files("datasets")


Processing file: datasets/Subject_Q_Results.csv
Number of sessions with at least 8 tests: 0
Processing file: datasets/Subject_N_Results.csv
Number of sessions with at least 8 tests: 40
Processing file: datasets/Subject_T_Results.csv
Number of sessions with at least 8 tests: 2
Processing file: datasets/Subject_A_Results.csv
Number of sessions with at least 8 tests: 22
Processing file: datasets/Subject_S_Results.csv
Number of sessions with at least 8 tests: 9
Processing file: datasets/Subject_J_Results.csv
Number of sessions with at least 8 tests: 41
Processing file: datasets/Subject_V_Results.csv
Number of sessions with at least 8 tests: 50
Processing file: datasets/Subject_G_Results.csv
Number of sessions with at least 8 tests: 47
Processing file: datasets/Subject_D_Results.csv
Number of sessions with at least 8 tests: 26
Processing file: datasets/Subject_P_Results.csv
Number of sessions with at least 8 tests: 0
Processing file: datasets/Subject_U_Results.csv
Number of sessions with at

In [10]:
import pandas as pd
import glob
from scipy.stats import wilcoxon

def filter_and_session_data(data, window_size=30 * 60 * 1000, min_tests=8):
    """Split one subject's rows into sessions and drop the short ones.

    Rows are scanned in the order they appear in ``data``. Every row whose
    ``timestamp`` lies less than ``window_size`` away from the first row of
    the current session joins that session; the first row that does not
    starts a new one. The number of kept sessions is printed.

    Args:
        data: DataFrame with a numeric ``timestamp`` column.
        window_size: Session length, in the same unit as ``timestamp``. The
            default is 30 minutes expressed in milliseconds.
        min_tests: Minimum number of rows a session needs to be kept.

    Returns:
        List of DataFrames, one per kept session. Each is an ``iloc`` slice
        of ``data``, so the original index labels and dtypes are preserved.
    """
    sessions = []

    if len(data) > 0:
        timestamps = data['timestamp'].to_numpy()
        session_start = 0
        for position in range(1, len(timestamps)):
            gap = abs(timestamps[position] - timestamps[session_start])
            # `not (gap < window_size)` instead of `gap >= window_size` so a
            # NaN timestamp still closes the current session.
            if not gap < window_size:
                sessions.append(data.iloc[session_start:position])
                session_start = position
        sessions.append(data.iloc[session_start:])  # Add the last session

    # Filter out sessions with fewer than `min_tests` tests
    sessions = [session for session in sessions if len(session) >= min_tests]
    print(f"Number of sessions with at least {min_tests} tests: {len(sessions)}")

    return sessions

def test_restart_count_in_first_vs_second_half(sessions):
    """Run a Wilcoxon signed-rank test on per-session restart counts.

    Each session with at least 8 rows is split at ``len(session) // 2`` (for
    an odd length the extra row goes to the second half) and the
    ``restartCount`` column is summed over each half. The paired sums are
    passed to ``scipy.stats.wilcoxon`` as ``(first half, second half)`` with
    ``alternative='greater'``, so a small p-value supports the first-half
    sums being larger than the second-half sums. The statistic and p-value
    are printed; nothing is returned.

    Args:
        sessions: Iterable of DataFrames, one per session, each with a
            ``restartCount`` column.
    """
    first_half_restart_count = []
    second_half_restart_count = []

    for session in sessions:
        if len(session) < 8:
            continue

        mid_point = len(session) // 2
        restart_counts = session['restartCount']
        first_half_restart_count.append(restart_counts.iloc[:mid_point].sum())
        second_half_restart_count.append(restart_counts.iloc[mid_point:].sum())

    if not first_half_restart_count:
        print("No sessions with at least 8 tests; skipping the Wilcoxon signed-rank test.")
        return

    # With the default zero_method, wilcoxon cannot produce a result when
    # every paired difference is zero.
    if all(
        first == second
        for first, second in zip(first_half_restart_count, second_half_restart_count)
    ):
        print(
            "Restart counts are identical in both halves of every session; "
            "skipping the Wilcoxon signed-rank test."
        )
        return

    stat, p_value = wilcoxon(first_half_restart_count, second_half_restart_count, alternative='greater')

    print("Wilcoxon Signed-Rank Test for Sum of Restart Count in Second Half vs First Half:")
    print(f"Statistic = {stat:.3f}, p-value = {p_value:.3f}")
    print("-" * 50)


# The name starts with "test_", so pytest would try to collect this analysis
# helper as a test case and fail on its `sessions` argument; opt it out.
test_restart_count_in_first_vs_second_half.__test__ = False

def process_csv_files(folder_path):
    """Collect sessions from each CSV in `folder_path`, then test restart counts.

    Files are read one at a time and split into sessions. An empty file, or
    one that raises while being processed, is reported and skipped. All
    collected sessions are passed to
    `test_restart_count_in_first_vs_second_half`.
    """
    pattern = f"{folder_path}/*.csv"
    all_sessions = []

    for path in glob.glob(pattern):
        try:
            print(f"Processing file: {path}")
            subject_sessions = filter_and_session_data(pd.read_csv(path))
            all_sessions.extend(subject_sessions)
        except pd.errors.EmptyDataError:
            print(f"Warning: File '{path}' is empty. Skipping.")
        except Exception as exc:
            print(f"An error occurred while processing '{path}': {exc}")

    test_restart_count_in_first_vs_second_half(all_sessions)

# Run the restart-count comparison over every CSV in "datasets".
process_csv_files("datasets")

Processing file: datasets/Subject_Q_Results.csv
Number of sessions with at least 8 tests: 0
Processing file: datasets/Subject_N_Results.csv
Number of sessions with at least 8 tests: 40
Processing file: datasets/Subject_T_Results.csv
Number of sessions with at least 8 tests: 2
Processing file: datasets/Subject_A_Results.csv
Number of sessions with at least 8 tests: 22
Processing file: datasets/Subject_S_Results.csv
Number of sessions with at least 8 tests: 9
Processing file: datasets/Subject_J_Results.csv
Number of sessions with at least 8 tests: 41
Processing file: datasets/Subject_V_Results.csv
Number of sessions with at least 8 tests: 50
Processing file: datasets/Subject_G_Results.csv
Number of sessions with at least 8 tests: 47
Processing file: datasets/Subject_D_Results.csv
Number of sessions with at least 8 tests: 26
Processing file: datasets/Subject_P_Results.csv
Number of sessions with at least 8 tests: 0
Processing file: datasets/Subject_U_Results.csv
Number of sessions with at