In [None]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_csv('checked-data.csv')

# Drop rows where 'image' column contains the string 'pretest'.
# Rows with a missing 'image' value cannot be classified at all, so they are
# excluded explicitly: a NaN in the column would otherwise make
# `.str.contains` return a non-boolean mask on which `~` fails.
has_image = df['image'].notna()
# regex=False: 'pretest' is a literal substring, not a pattern.
is_pretest = df['image'].str.contains('pretest', regex=False, na=False)

# .copy() gives an independent frame so later cells never write into a view of `df`.
df_filtered = df[has_image & ~is_pretest].copy()

print(df_filtered)

In [None]:
# Group 1: rows of the filtered data whose 'image' value contains 'lab'
is_lab_image = df_filtered['image'].str.contains('lab')
df_lab = df_filtered.loc[is_lab_image]
print(df_lab.shape)
# print(df_lab)

In [None]:
# Group 2: the complement of group 1 -- rows whose 'image' value does NOT contain 'lab'
contains_lab = df_filtered['image'].str.contains('lab')
df_watrin = df_filtered.loc[~contains_lab]
print(df_watrin.shape)
# print(df_watrin)

In [None]:
# Compute total scores for each row (sum of q1 to q5).
# `.assign` returns a new frame instead of writing a column into a frame that
# was produced by boolean indexing, which avoids pandas' SettingWithCopyWarning
# and keeps this cell safe to re-run.
score_columns = ['q1', 'q2', 'q3', 'q4', 'q5']
df_lab = df_lab.assign(total_score=df_lab[score_columns].sum(axis=1))
df_watrin = df_watrin.assign(total_score=df_watrin[score_columns].sum(axis=1))

# print(df_lab[['image', 'total_score']])
# print(df_watrin[['image', 'total_score']])

In [None]:

# Pull each group's 'total_score' column out as an array for the tests below
group1_scores, group2_scores = (
    frame['total_score'].values for frame in (df_lab, df_watrin)
)

In [None]:
# ----- Descriptive Statistics -----
def descriptive_stats(data, label):
    """Print a labelled summary of `data`.

    Reports the number of observations, mean, median, sample standard
    deviation (ddof=1), minimum and maximum. Mean, median and standard
    deviation are shown with two decimals; minimum and maximum are printed
    unformatted.

    Args:
        data: Sequence or array of numeric scores.
        label: Name of the group, used in the printed header.

    Returns:
        None. The summary is written to stdout.
    """
    print(f"\n--- Descriptive Statistics: {label} ---")

    n_obs = len(data)
    print(f"Count: {n_obs}")

    mean_value = np.mean(data)
    print(f"Mean: {mean_value:.2f}")

    median_value = np.median(data)
    print(f"Median: {median_value:.2f}")

    # ddof=1 -> sample (n - 1) standard deviation
    sample_sd = np.std(data, ddof=1)
    print(f"Standard Deviation: {sample_sd:.2f}")

    lowest = np.min(data)
    print(f"Minimum: {lowest}")

    highest = np.max(data)
    print(f"Maximum: {highest}")

# Test functions
def shapiro_wilk_test(data, label):
    """Run the Shapiro-Wilk normality test on `data` and print the result.

    Args:
        data: Sequence or array of numeric observations.
        label: Name of the group, shown in the printed line.

    Returns:
        The p-value reported by `scipy.stats.shapiro`.
    """
    w_statistic, p_value = stats.shapiro(data)
    report = f"\nShapiro-Wilk Test ({label}): W = {w_statistic:.4f}, p = {p_value:.4f}"
    print(report)
    return p_value

def levene_test(g1, g2):
    """Run Levene's test for equality of variances on two samples and print it.

    Args:
        g1: First sample of numeric observations.
        g2: Second sample of numeric observations.

    Returns:
        The p-value reported by `scipy.stats.levene`.
    """
    w_statistic, p_value = stats.levene(g1, g2)
    report = f"\nLevene’s Test: W = {w_statistic:.4f}, p = {p_value:.4f}"
    print(report)
    return p_value

def t_test(g1, g2, equal_var=True):
    """Run an independent two-sample t-test and print the result.

    Args:
        g1: First sample of numeric observations.
        g2: Second sample of numeric observations.
        equal_var: Forwarded unchanged to `scipy.stats.ttest_ind`; also echoed
            in the printed line.

    Returns:
        The p-value reported by `scipy.stats.ttest_ind`.
    """
    t_statistic, p_value = stats.ttest_ind(g1, g2, equal_var=equal_var)
    report = f"\nT-Test (equal_var={equal_var}): t = {t_statistic:.4f}, p = {p_value:.4f}"
    print(report)
    return p_value

def mann_whitney_u(g1, g2):
    """Run a two-sided Mann–Whitney U test on two samples and print the result.

    Args:
        g1: First sample of numeric observations.
        g2: Second sample of numeric observations.

    Returns:
        The p-value reported by `scipy.stats.mannwhitneyu`.
    """
    u_statistic, p_value = stats.mannwhitneyu(g1, g2, alternative='two-sided')
    # U is printed unformatted; only the p-value is rounded to four decimals.
    report = f"\nMann–Whitney U Test: U = {u_statistic}, p = {p_value:.4f}"
    print(report)
    return p_value

def cohens_d(g1, g2):
    """Compute and print Cohen's d for two samples using the pooled sample SD.

    d = (mean(g1) - mean(g2)) / pooled_sd, where the pooled standard deviation
    weights each group's sample variance (ddof=1) by its degrees of freedom.

    Args:
        g1: First sample of numeric observations.
        g2: Second sample of numeric observations.

    Returns:
        The effect size d (positive when g1 has the larger mean).
    """
    n1, n2 = len(g1), len(g2)
    var1 = np.var(g1, ddof=1)
    var2 = np.var(g2, ddof=1)

    # Degrees-of-freedom-weighted average of the two sample variances
    pooled_variance = ((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2)
    effect = (np.mean(g1) - np.mean(g2)) / np.sqrt(pooled_variance)

    print(f"\nCohen's d: {effect:.4f}")
    return effect

In [None]:
# Visualization
def plot_distributions(g1, g2, bins=None):
    """Show a density histogram and a boxplot comparing two score samples.

    Args:
        g1: Scores of the first group (labelled "Lab Group").
        g2: Scores of the second group (labelled "Non-Lab Group").
        bins: Optional histogram bins, forwarded to `hist`. When omitted,
            unit-width bins centred on the integers spanning the pooled data
            are used, so every integer score gets its own bar and no
            observation falls outside the plotted range.

    Returns:
        None. The figure is displayed with `plt.show()`.
    """
    if bins is None:
        pooled = np.concatenate([
            np.ravel(np.asarray(g1, dtype=float)),
            np.ravel(np.asarray(g2, dtype=float)),
        ])
        pooled = pooled[np.isfinite(pooled)]
        if pooled.size:
            lowest = np.floor(pooled.min())
            highest = np.ceil(pooled.max())
        else:
            # Nothing to plot: fall back to the 0-5 range used previously.
            lowest, highest = 0.0, 5.0
        # Edges at k - 0.5 put each integer score k in the middle of its own bin.
        bins = np.arange(lowest - 0.5, highest + 1.5, 1.0)

    fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(10, 4))

    ax_hist.hist(g1, bins=bins, alpha=0.6, label='Lab Group', color='skyblue', edgecolor='black', density=True)
    ax_hist.hist(g2, bins=bins, alpha=0.6, label='Non-Lab Group', color='salmon', edgecolor='black', density=True)
    ax_hist.set_title("Histogram of Total Scores")
    ax_hist.set_xlabel("Scores")
    ax_hist.set_ylabel("Density")
    ax_hist.legend()

    # Tick labels are set on the axis rather than via boxplot's `labels=`
    # keyword, which newer Matplotlib releases deprecate. Boxes are drawn at
    # the default positions 1 and 2.
    ax_box.boxplot([g1, g2])
    ax_box.set_xticks([1, 2])
    ax_box.set_xticklabels(["Lab Group", "Non-Lab Group"])
    ax_box.set_title("Boxplot of Total Scores")
    ax_box.set_ylabel("Score")

    fig.tight_layout()
    plt.show()

In [None]:
# === Main Analysis ===
print("=== Statistical Analysis Between Lab and Watrin Groups ===")

# Step 1: Descriptive statistics
# Print the same summary for each group to show general performance trends.
for scores, group_label in (
    (group1_scores, "Lab Group"),
    (group2_scores, "Non-Lab Group"),
):
    descriptive_stats(scores, group_label)

# Step 2: Normality check using Shapiro-Wilk Test
# Null hypothesis: the scores in a group are normally distributed.
# p > 0.05 means we fail to reject it; anything else is treated as non-normal.
p1 = shapiro_wilk_test(group1_scores, "Lab Group")
print(
    "Interpretation (Lab Group): Data appears to be normally distributed (p > 0.05)."
    if p1 > 0.05
    else "Interpretation (Lab Group): Data does not follow a normal distribution (p < 0.05)."
)

p2 = shapiro_wilk_test(group2_scores, "Non-Lab Group")
print(
    "Interpretation (Non-Lab Group): Data appears to be normally distributed (p > 0.05)."
    if p2 > 0.05
    else "Interpretation (Non-Lab Group): Data does not follow a normal distribution (p < 0.05)."
)

# Step 3: Test selection based on distributional assumptions
both_normal = p1 > 0.05 and p2 > 0.05

if not both_normal:
    # Normality is violated for at least one group: use the Mann–Whitney U test.
    # This non-parametric test assesses whether one group tends to have higher
    # values than the other.
    p_mwu = mann_whitney_u(group1_scores, group2_scores)

    print(
        "Interpretation: Statistically significant difference in score distributions between groups."
        if p_mwu < 0.05
        else "Interpretation: No significant difference in score distributions between groups."
    )
else:
    # Both groups look normal: check equality of variances first.
    # Levene’s test has the null hypothesis that the variances are equal.
    p_var = levene_test(group1_scores, group2_scores)
    equal_var = p_var > 0.05

    print(
        "\nInterpretation: Variances are approximately equal (p > 0.05). Proceeding with standard t-test."
        if equal_var
        else "\nInterpretation: Variances are unequal (p < 0.05). Proceeding with Welch’s t-test."
    )

    # Independent t-test comparing the two group means.
    # Null hypothesis: the means are equal; p < 0.05 suggests they differ.
    p_ttest = t_test(group1_scores, group2_scores, equal_var=equal_var)

    print(
        "Interpretation: There is a statistically significant difference in mean scores between the groups."
        if p_ttest < 0.05
        else "Interpretation: No significant difference in mean scores between the groups."
    )

# Step 4: Effect size using Cohen’s d
# This quantifies the magnitude of the difference between the two groups.
# Interpretation thresholds:
# - small effect: d ≈ 0.2
# - medium effect: d ≈ 0.5
# - large effect: d ≥ 0.8
d = cohens_d(group1_scores, group2_scores)

# A NaN d (e.g. zero pooled variance, or too few observations to estimate a
# variance) fails every `<` comparison below and would otherwise be reported
# as a large effect, so it is handled first.
if np.isnan(d):
    print("Interpretation: Effect size is undefined (Cohen's d is NaN).")
elif abs(d) < 0.2:
    print("Interpretation: Negligible effect size.")
elif abs(d) < 0.5:
    print("Interpretation: Small effect size.")
elif abs(d) < 0.8:
    print("Interpretation: Medium effect size.")
else:
    print("Interpretation: Large effect size.")

# Step 5: Visualization
# Plot both groups' score distributions to accompany the test results above.
plot_distributions(
    group1_scores,
    group2_scores,
)
