**Purpose**: To perform statistical tests to evaluate intervention effectiveness and generate a comprehensive report.

## Section 1 : Importing Libraries and Configuration Setup

In [2]:
import logging
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import pingouin as pg
import statsmodels.stats.power as smp
from docx import Document
from scipy import stats
from statsmodels.stats.multitest import multipletests

# Configure logging.
# logging.basicConfig(filename=...) opens the log file right away and raises
# FileNotFoundError when the parent directory does not exist (e.g. on a fresh
# checkout), so make sure the directory is there first.
LOG_FILE = Path('../logs/stat_analysis.log')
LOG_FILE.parent.mkdir(parents=True, exist_ok=True)
logging.basicConfig(filename=str(LOG_FILE), level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')

print('Libraries imported successfully.')
logging.info('Libraries imported successfully.')

Libraries imported successfully.


In [3]:
# Read the cleaned CSV, then both sheets of the summary workbook in a single pass.
df_final = pd.read_csv('../data/processed_data/cleaned_hpv_data.csv')
summary_sheets = pd.read_excel(
    '../data/processed_data/summary_data.xlsx',
    sheet_name=['pretest', 'post_test'],  # a list returns {sheet name: DataFrame}
)
df_pre_full = summary_sheets['pretest']
df_post_full = summary_sheets['post_test']

load_msg = 'Data loaded successfully.'
print(load_msg)
logging.info(load_msg)

Data loaded successfully.


## Section 2 : Reliability Analysis

### 1. Cronbach's Alpha for survey reliability

In [4]:
# Survey item columns are addressed by the string labels "1" .. "33".
question_cols = list(map(str, range(1, 34)))

# pg.cronbach_alpha returns a tuple; only its first element (alpha) is kept.
alpha_pre, alpha_post = (
    pg.cronbach_alpha(data=sheet[question_cols])[0]
    for sheet in (df_pre_full, df_post_full)
)

print(f"Cronbach's Alpha (Pre-Test): {alpha_pre:.3f}")
print(f"Cronbach's Alpha (Post-Test): {alpha_post:.3f}")
logging.info(f"Cronbach's Alpha - Pre: {alpha_pre:.3f}, Post: {alpha_post:.3f}")

Cronbach's Alpha (Pre-Test): 0.858
Cronbach's Alpha (Post-Test): 0.736


### 2. Normality Check and Intervention Effectiveness

In [5]:
# Shapiro-Wilk test on the pre- and post-test score columns.
# Each call yields (statistic, p-value); both pairs are reused when results are saved.
(stat_pre, p_pre), (stat_post, p_post) = (
    stats.shapiro(df_final[score_col])
    for score_col in ('pre_test_score', 'post_test_score')
)

print(f'Pre-Test Normality: p={p_pre:.3f} (normal if >0.05)')
print(f'Post-Test Normality: p={p_post:.3f}')
logging.info(f'Normality - Pre: p={p_pre:.3f}, Post: p={p_post:.3f}')

Pre-Test Normality: p=0.024 (normal if >0.05)
Post-Test Normality: p=0.819


In [6]:
# Paired t-test: post-test scores are passed first, so a positive t means the
# mean post-test score is higher than the mean pre-test score.
paired_result = stats.ttest_rel(df_final['post_test_score'], df_final['pre_test_score'])
t_stat, p_val_t = paired_result

ttest_msg = f'Paired T-Test: t={t_stat:.3f}, p={p_val_t:.3f}'
print(ttest_msg)
logging.info(ttest_msg)

Paired T-Test: t=6.995, p=0.000


In [7]:
# Wilcoxon signed-rank test on the same paired columns: the non-parametric
# counterpart to the paired t-test.
wilcoxon_result = stats.wilcoxon(df_final['post_test_score'], df_final['pre_test_score'])
w_stat, p_val_w = wilcoxon_result

print(f'Wilcoxon Signed-Rank Test: statistic={w_stat:.3f}, p={p_val_w:.3f}')
logging.info(f'Wilcoxon Test: statistic={w_stat:.3f}, p={p_val_w:.3f}')

Wilcoxon Signed-Rank Test: statistic=151.000, p=0.000


In [8]:
# Cohen's d for the post-vs-pre comparison, computed by pingouin in paired mode.
post_scores = df_final['post_test_score']
pre_scores = df_final['pre_test_score']
eff_size = pg.compute_effsize(post_scores, pre_scores, paired=True)

print(f"Cohen's d: {eff_size:.2f} (large effect if >0.8)")
logging.info(f"Cohen's d: {eff_size:.2f}")

Cohen's d: 1.28 (large effect if >0.8)


In [9]:
# Power analysis
# The pre/post scores are analysed as paired samples in this notebook
# (stats.ttest_rel, stats.wilcoxon), so power must be computed for a one-sample
# t-test on the per-row differences (TTestPower). TTestIndPower models two
# independent groups of size nobs1 each and does not match this design.
score_diff = (df_final['post_test_score'] - df_final['pre_test_score']).dropna()

# Standardized effect for a paired design: mean difference / SD of the differences (d_z).
d_z = score_diff.mean() / score_diff.std(ddof=1)

# NOTE: this is observed (post-hoc) power for the effect estimated from the same data.
power = smp.TTestPower().power(effect_size=d_z, nobs=len(score_diff), alpha=0.05,
                               alternative='two-sided')

power_msg = f'Statistical Power: {power:.3f}'
print(power_msg)
logging.info(power_msg)

Statistical Power: 1.000


## Section 3 : Demographic Group Comparisons

### Score improvement by education level (ANOVA with Bonferroni correction)

In [10]:
# One sample of score improvements per education level.
# groupby skips rows whose Education_Label is missing; comparing against a NaN
# label taken from .unique() would instead yield an empty sample and turn the
# whole ANOVA into NaN. sort=False keeps the levels in order of first appearance.
edu_groups = [
    improvements
    for _, improvements in df_final.groupby('Education_Label', sort=False)['score_improvement']
]
f_stat, p_val_anova = stats.f_oneway(*edu_groups)

# Bonferroni correction across the three p-values produced in this notebook:
# paired t-test, Wilcoxon, and this ANOVA (index 2 is the ANOVA).
p_vals = [p_val_t, p_val_w, p_val_anova]
reject, p_vals_corrected, _, _ = multipletests(p_vals, alpha=0.05, method='bonferroni')

anova_msg = f'ANOVA (Education): F={f_stat:.3f}, p={p_val_anova:.3f}, Corrected p={p_vals_corrected[2]:.3f}'
print(anova_msg)
logging.info(anova_msg)

ANOVA (Education): F=1.118, p=0.334, Corrected p=1.000


## Section 4: Saving Results

In [11]:
# Persist every statistic computed above as one pickle file.
# pickle, Path and pandas are imported in the notebook's import cell.
PROC_DIR = Path("../models")

# --- Organize results into dict ---
# Values are cast to plain Python floats so the saved dict does not carry NumPy scalars.
stats_results = {
    "cronbach": {
        "alpha_pre": float(alpha_pre),
        "alpha_post": float(alpha_post),
    },
    "normality": {
        "pre": {"stat": float(stat_pre), "p": float(p_pre)},
        "post": {"stat": float(stat_post), "p": float(p_post)},
    },
    "paired_tests": {
        "t_test": {"t": float(t_stat), "p": float(p_val_t)},
        "wilcoxon": {"W": float(w_stat), "p": float(p_val_w)},
        "cohens_d": float(eff_size),
        "power": float(power),
    },
    "anova": {
        "education": {
            "F": float(f_stat),
            "p": float(p_val_anova),
            "p_corr": float(p_vals_corrected[2]),
        }
    },
}

# --- Create summary DataFrame ---
# One (label, statistic, p-value) triple per test. The marker column is derived
# from the uncorrected p-value: "★" when p < 0.05, "•" otherwise.
summary_rows = [
    ("Shapiro (Pre)", stat_pre, p_pre),
    ("Shapiro (Post)", stat_post, p_post),
    ("Paired t-test", t_stat, p_val_t),
    ("Wilcoxon", w_stat, p_val_w),
    ("ANOVA (Education)", f_stat, p_val_anova),
]
results_table = pd.DataFrame(
    [[label, statistic, p_value, "★" if p_value < 0.05 else "•"]
     for label, statistic, p_value in summary_rows],
    columns=["Test", "Statistic", "p-value", "Significance"],
)

stats_results["summary_table"] = results_table

# --- Save to pickle ---
PROC_DIR.mkdir(parents=True, exist_ok=True)
with open(PROC_DIR / "stats_results.pkl", "wb") as f:
    pickle.dump(stats_results, f)

print("Saved stats_results.pkl in models/")
# Log the save like every other step in this notebook.
logging.info("Saved stats_results.pkl in models/")


Saved stats_results.pkl in models/
