# 03 — Hypothesis Testing
**Author:** Ebenezer Adjartey

Covers: one-sample & two-sample t-tests, paired t-test, ANOVA (one-way, two-way), chi-square tests, z-test for proportions, F-test, multiple comparisons (Bonferroni, Tukey).

In [None]:
import os
import numpy as np
import pandas as pd
import scipy.stats as stats
from statsmodels.stats.anova import anova_lm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.proportion import proportions_ztest
import matplotlib.pyplot as plt
import seaborn as sns
# Seed NumPy's global RNG so the np.random draws in the cells below are
# reproducible when the cells are run in order.
np.random.seed(42)
# Apply seaborn's 'whitegrid' theme to subsequent figures.
sns.set_theme(style='whitegrid')
print('Libraries loaded.')

## 1. One-Sample t-Test

In [None]:
# One-sample t-test of H0: mu = 70 against the two-sided H1: mu != 70.
scores = np.random.normal(loc=72, scale=12, size=30)

one_sample_res = stats.ttest_1samp(scores, popmean=70)
t_stat, p_val = one_sample_res.statistic, one_sample_res.pvalue

# 95% t-interval for the mean, centred on the sample mean and scaled by
# the standard error of the mean.
sample_mean = scores.mean()
std_err = stats.sem(scores)
ci = stats.t.interval(0.95, df=scores.size - 1, loc=sample_mean, scale=std_err)
ci_lower, ci_upper = ci[0], ci[1]

one_sample_verdict = 'Reject H0' if p_val < 0.05 else 'Fail to reject H0'

print(f'Sample mean = {sample_mean:.3f}')
print(f't-statistic = {t_stat:.4f}')
print(f'p-value     = {p_val:.4f}')
print(f'95% CI      = ({ci_lower:.3f}, {ci_upper:.3f})')
print('Verdict:', one_sample_verdict)

## 2. Two-Sample t-Test (Independent)

In [None]:
# Two independent samples; H0: mu1 = mu2.
group_a = np.random.normal(loc=75, scale=10, size=40)
group_b = np.random.normal(loc=70, scale=12, size=40)

# Student's t pools the two variances (equal_var=True); Welch's t does not
# assume they are equal (equal_var=False).
student_res = stats.ttest_ind(group_a, group_b, equal_var=True)
welch_res = stats.ttest_ind(group_a, group_b, equal_var=False)
t_eq, p_eq = student_res.statistic, student_res.pvalue
t_w, p_w = welch_res.statistic, welch_res.pvalue

# Levene's test on the same two samples checks the equal-variance assumption.
levene_res = stats.levene(group_a, group_b)
lev_stat, lev_p = levene_res.statistic, levene_res.pvalue

print(f'Equal variance t-test: t={t_eq:.4f}, p={p_eq:.4f}')
print(f"Welch's t-test:        t={t_w:.4f}, p={p_w:.4f}")
print(f"Levene's test (equal var): W={lev_stat:.4f}, p={lev_p:.4f}")

## 3. Paired t-Test

In [None]:
# Paired design (before vs after treatment); H0: mean difference = 0.
before = np.random.normal(loc=120, scale=15, size=25)
reduction = np.random.normal(loc=8, scale=5, size=25)   # treatment lowers by ~8
after = before - reduction
diff = before - after

paired_res = stats.ttest_rel(before, after)
t_p, p_p = paired_res.statistic, paired_res.pvalue

if p_p < 0.05:
    paired_verdict = 'Significant change'
else:
    paired_verdict = 'No significant change'

print(f'Mean difference (before - after) = {diff.mean():.3f}')
print(f'Paired t-statistic = {t_p:.4f}')
print(f'p-value            = {p_p:.4f}')
print('Verdict:', paired_verdict)

## 4. One-Way ANOVA

In [None]:
# One-way ANOVA; H0: the three group means are equal.
g1 = np.random.normal(loc=70, scale=10, size=30)
g2 = np.random.normal(loc=75, scale=10, size=30)
g3 = np.random.normal(loc=80, scale=10, size=30)
anova_groups = (g1, g2, g3)

anova_res = stats.f_oneway(*anova_groups)
f_stat, p_anova = anova_res.statistic, anova_res.pvalue
print(f'One-Way ANOVA: F={f_stat:.4f}, p={p_anova:.4f}')
if p_anova < 0.05:
    print('Verdict:', 'At least one mean differs')
else:
    print('Verdict:', 'No significant difference')

# Effect size: eta-squared = between-group sum of squares / total sum of squares.
all_data = np.concatenate(anova_groups)
grand_mean = all_data.mean()
ss_between = 0
for grp in anova_groups:
    # Each group contributes its size times its squared mean offset.
    ss_between += len(grp) * (grp.mean() - grand_mean) ** 2
deviations = all_data - grand_mean
ss_total = (deviations ** 2).sum()
eta_sq = ss_between / ss_total
print(f'Eta-squared (effect size) = {eta_sq:.4f}')

## 5. Two-Way ANOVA

In [None]:
# Two-way ANOVA with two factors: teaching method (A/B/C) and gender (M/F).
n = 120
base_scores = np.random.normal(70, 10, n)
method_labels = np.tile(['A', 'B', 'C'], n // 3)    # A, B, C, A, B, C, ...
gender_labels = np.repeat(['M', 'F'], n // 2)       # first half M, second half F
df_anova = pd.DataFrame({
    'score': base_scores,
    'method': method_labels,
    'gender': gender_labels,
})

# Inject a method effect: shift the scores of methods B and C upward.
for level, shift in (('B', 5), ('C', 10)):
    df_anova.loc[df_anova['method'] == level, 'score'] += shift

# Main effects of both factors plus their interaction.
formula = 'score ~ C(method) + C(gender) + C(method):C(gender)'
model = ols(formula, data=df_anova).fit()
anova_table = anova_lm(model, typ=2)
print('Two-Way ANOVA Table:')
print(anova_table.round(4))

## 6. Post-Hoc Multiple Comparisons

In [None]:
# Post-hoc multiple comparisons: Tukey HSD on the one-way ANOVA groups
# (g1, g2, g3 from section 4), then a Bonferroni adjustment of example p-values.
from statsmodels.stats.multitest import multipletests

# Tukey HSD after one-way ANOVA.
# Build one label per observation from each group's actual length, so the
# labels stay aligned with the scores even if the group sizes are changed.
tukey_samples = {'G1': g1, 'G2': g2, 'G3': g3}
all_scores = np.concatenate(list(tukey_samples.values()))
groups = np.repeat(list(tukey_samples),
                   [len(sample) for sample in tukey_samples.values()])
tukey = pairwise_tukeyhsd(all_scores, groups, alpha=0.05)
print('Tukey HSD Results:')
print(tukey)

# Bonferroni correction applied to a fixed list of illustrative raw p-values.
raw_p = [0.01, 0.04, 0.06, 0.12, 0.20]
bon_reject, bon_p, _, _ = multipletests(raw_p, alpha=0.05, method='bonferroni')
print('\nBonferroni Correction:')
for i, (rp, bp, rej) in enumerate(zip(raw_p, bon_p, bon_reject), start=1):
    print(f'  Test {i}: raw_p={rp:.2f} -> adj_p={bp:.3f} Reject={rej}')

## 7. Chi-Square Tests

In [None]:
# 7a. Chi-square goodness-of-fit: observed counts vs a uniform expectation.
observed = np.array([45, 60, 55, 40])   # observed frequencies
expected = np.array([50] * 4)           # expected under H0 (uniform)
gof_res = stats.chisquare(f_obs=observed, f_exp=expected)
chi2_gof, p_gof = gof_res.statistic, gof_res.pvalue
print(f'Goodness-of-fit: chi2={chi2_gof:.4f}, p={p_gof:.4f}')

# 7b. Chi-square test of independence on a 2x2 contingency table.
contingency = np.array([
    [30, 20],
    [15, 35],
])
independence_res = stats.chi2_contingency(contingency)
chi2_ind, p_ind, dof, expected_ind = independence_res
print(f'\nIndependence test: chi2={chi2_ind:.4f}, p={p_ind:.4f}, df={dof}')
print('Expected frequencies:')
print(expected_ind.round(2))

## 8. Z-Test for Proportions

In [None]:
# One-sample z-test for a proportion.
# H0: p = 0.50 (coin is fair)
n_trials, n_success = 100, 60
p_null = 0.5

# Pass prop_var=p_null so the standard error is computed under H0, as in the
# textbook one-sample z-test. Without it, statsmodels uses the sample
# proportion for the variance, which gives a different statistic.
z_stat, p_z = proportions_ztest(n_success, n_trials, value=p_null,
                                prop_var=p_null)
print(f'Z-test for proportion: z={z_stat:.4f}, p={p_z:.4f}')
print('Verdict:', 'Reject H0' if p_z < 0.05 else 'Fail to reject H0')

# 95% CI for proportion (Wald interval: here the standard error is based on
# the sample proportion p_hat, and 1.96 is the two-sided 95% normal quantile).
p_hat = n_success / n_trials
se = np.sqrt(p_hat*(1-p_hat)/n_trials)
ci_low  = p_hat - 1.96*se
ci_high = p_hat + 1.96*se
print(f'95% CI for proportion: ({ci_low:.4f}, {ci_high:.4f})')

## 9. F-Test for Equal Variances

In [None]:
# Compare the variances of two samples drawn with different spreads
# (standard deviations 8 and 12) using three tests.
sample1 = np.random.normal(50, 8, 30)
sample2 = np.random.normal(50, 12, 30)

# Levene's test (robust)
lev_stat, lev_p = stats.levene(sample1, sample2)
print(f"Levene's test: W={lev_stat:.4f}, p={lev_p:.4f}")

# Bartlett's test (assumes normality)
bart_stat, bart_p = stats.bartlett(sample1, sample2)
print(f"Bartlett's test: T={bart_stat:.4f}, p={bart_p:.4f}")

# Manual F-test: ratio of the unbiased sample variances, referred to an
# F distribution with (n1 - 1, n2 - 1) degrees of freedom.
F = sample1.var(ddof=1) / sample2.var(ddof=1)
df1, df2 = len(sample1) - 1, len(sample2) - 1
# Two-sided p-value: double the smaller tail probability, capped at 1.
p_f = min(1.0, 2 * min(stats.f.cdf(F, df1, df2), stats.f.sf(F, df1, df2)))
print(f'Manual F-ratio = {F:.4f}')
print(f'Manual F-test: df=({df1}, {df2}), p={p_f:.4f}')

## Visualization

In [None]:
# Three-panel summary figure built from results computed in earlier cells:
# the ANOVA groups, the paired before/after samples, and the chi-square
# goodness-of-fit statistic.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# Boxplot: three groups for ANOVA
# Tick labels are set after plotting rather than through boxplot's `labels`
# keyword, which is deprecated in recent Matplotlib releases.
axes[0].boxplot([g1, g2, g3], patch_artist=True)
axes[0].set_xticklabels(['G1', 'G2', 'G3'])
axes[0].set_title(f'One-Way ANOVA\nF={f_stat:.2f}, p={p_anova:.3f}')
axes[0].set_ylabel('Score')

# Paired data: only the first 10 subjects are drawn to keep the panel readable;
# the title reports the full paired sample size.
idx = np.arange(1, 11)
axes[1].plot(idx, before[:10], 'bo-', label='Before')
axes[1].plot(idx, after[:10],  'rs-', label='After')
axes[1].set_title(f'Paired t-Test (n={len(before)})\nt={t_p:.2f}, p={p_p:.3f}')
axes[1].legend()
axes[1].set_xlabel('Subject')

# Chi-square distribution with the goodness-of-fit degrees of freedom
# (number of categories minus one); the shaded right tail is the p-value.
gof_df = len(observed) - 1
xc = np.linspace(0, 20, 300)
axes[2].plot(xc, stats.chi2.pdf(xc, df=gof_df), 'b-', lw=2)
x_fill = np.linspace(chi2_gof, 20, 200)
axes[2].fill_between(x_fill, stats.chi2.pdf(x_fill, df=gof_df), alpha=.3,
                     color='red', label='p-value region')
axes[2].axvline(chi2_gof, color='red', linestyle='--', label=f'chi2={chi2_gof:.2f}')
axes[2].set_title('Chi-Square GOF Test')
axes[2].legend(fontsize=8)

plt.tight_layout()
# Create the output directory if needed before saving the figure.
os.makedirs('03_hypothesis_testing', exist_ok=True)
plt.savefig('03_hypothesis_testing/hypothesis_testing_plots.png', dpi=100, bbox_inches='tight')
plt.show()
print('Saved.')

## Key Takeaways

- Always check assumptions (normality, equal variance) before choosing a test
- Use Welch's t-test when variances are unequal
- ANOVA tests overall mean equality; post-hoc tests identify which pairs differ
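- Bonferroni correction controls the family-wise error rate but is conservative; Tukey HSD is usually more powerful when comparing all pairs of group means after ANOVA
- Chi-square tests apply to categorical (count) data: goodness-of-fit for one variable, independence for two variables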
