## Section 1: Importing Libraries

In [3]:
import pickle
import pandas as pd
from docx import Document
from docx.shared import Inches
from pathlib import Path

In [4]:
# --- Loading artifacts ---
# Four pickled artifacts are read from ../models (presumably written by earlier
# notebooks in this project — confirm provenance).
# NOTE: unpickling can execute arbitrary code; only load files you trust.
df = pickle.loads(Path("../models/cleaned_hpv_data.pkl").read_bytes())
data_dict = pickle.loads(Path("../models/data_dictionary.pkl").read_bytes())
figures = pickle.loads(Path("../models/eda_figures.pkl").read_bytes())
stats_results = pickle.loads(Path("../models/stats_results.pkl").read_bytes())

In [5]:
# --- Setup paths ---
# FIG_DIR is where figure images are looked up; REPORT_DIR receives the report.
FIG_DIR, REPORT_DIR = Path("../reports/figures"), Path("../reports/final")
# Create the output folder (and any missing parents); no error if it already exists.
REPORT_DIR.mkdir(exist_ok=True, parents=True)

In [None]:
# --- Creating DOCX ---
# Start from an empty document; every section below is appended in reading order.
doc = Document()

# Title (heading level 0) followed by the problem statement paragraph.
doc.add_heading("HPV Awareness Impact Analysis", 0)
doc.add_paragraph(
    "Problem Statement: An interventional study to assess the level "
    "of awareness among youth regarding Human Papillomavirus (HPV) "
    "and its vaccination in District Sitapur."
)

# Objectives — one paragraph per objective; the numbering is part of the text.
doc.add_heading("Objectives", level=1)
for objective in (
    "1. Assess levels of awareness regarding HPV and its consequences.",
    "2. Assess association between pre-test and post-test scores after teaching.",
    "3. Determine knowledge regarding availability, schedule, and benefits.",
):
    doc.add_paragraph(objective)

# Data Cleaning — summary sentence that embeds the cleaned dataset's shape.
doc.add_heading("Data Cleaning", level=1)
n_rows, n_cols = df.shape
doc.add_paragraph(
    "Raw demographic, pre-test, and post-test tables were ingested, "
    "cleaned, merged, and mapped to human-readable labels. "
    f"Final dataset shape: {n_rows} rows × {n_cols} columns."
)

# Data Dictionary — two-column table: a header row, then one row per entry.
doc.add_heading("Data Dictionary", level=2)
table = doc.add_table(rows=1, cols=2)
header_cells = table.rows[0].cells
header_cells[0].text = "Variable"
header_cells[1].text = "Description"
for variable, description in data_dict.items():
    cells = table.add_row().cells
    # Cell text must be a string: coerce so a non-string key or description
    # (e.g. None, NaN, a numeric column name) cannot abort report generation.
    cells[0].text = str(variable)
    cells[1].text = str(description)

# EDA
doc.add_heading("Exploratory Data Analysis", level=1)
doc.add_paragraph("We examined participant demographics and score distributions.")

# Figures in report order. Each is expected at FIG_DIR / "<name>.png" and is
# captioned with its name, underscores replaced by spaces and title-cased.
eda_figure_names = [
    "education_distribution",
    "residency_distribution",
    "age_group_distribution",
    "gender_distribution",
    "knowledge_score_distribution",
    "score_improvement_education",
    "score_improvement_gender",
    "score_improvement_residency",
    "correlation_heatmap",
]

missing_figures = []
for fig_name in eda_figure_names:
    path = FIG_DIR / f"{fig_name}.png"
    if not path.exists():
        # Keep going (the report is still useful without one plot), but remember
        # the gap so it is reported instead of silently dropped.
        missing_figures.append(path.name)
        continue
    doc.add_picture(str(path), width=Inches(5))
    doc.add_paragraph(fig_name.replace("_", " ").title())

if missing_figures:
    print(
        f"Warning: {len(missing_figures)} EDA figure(s) not found in {FIG_DIR} "
        f"and left out of the report: {', '.join(missing_figures)}"
    )

# Statistical Analysis
doc.add_heading("Statistical Analysis", level=1)
summary_table = stats_results["summary_table"]


def _format_p_value(p_value, threshold=0.001):
    """Format a p-value for the report table.

    Values below ``threshold`` are shown as "<threshold" (e.g. "<0.001")
    rather than being rounded to a misleading "0.000"; all other values are
    shown with three decimals.
    """
    if p_value < threshold:
        return f"<{threshold}"
    return f"{p_value:.3f}"


# Four-column results table: a header row, then one row per test in summary_table.
stats_table = doc.add_table(rows=1, cols=4)
header_labels = ("Test", "Statistic", "p-value", "Significance")
for cell, label in zip(stats_table.rows[0].cells, header_labels):
    cell.text = label

for _, row in summary_table.iterrows():
    cells = stats_table.add_row().cells
    cells[0].text = str(row["Test"])
    cells[1].text = f"{row['Statistic']:.3f}"
    cells[2].text = _format_p_value(row["p-value"])
    # Coerced like the "Test" column so a non-string marker cannot raise.
    cells[3].text = str(row["Significance"])

# Reliability, effect size and power, each on its own line below the table.
cronbach = stats_results["cronbach"]
paired = stats_results["paired_tests"]
doc.add_paragraph(f"Cronbach's Alpha (Pre): {cronbach['alpha_pre']:.3f}")
doc.add_paragraph(f"Cronbach's Alpha (Post): {cronbach['alpha_post']:.3f}")
doc.add_paragraph(f"Cohen's d: {paired['cohens_d']:.2f} (large effect if >0.8)")
doc.add_paragraph(f"Statistical Power: {paired['power']:.3f}")

# Conclusions
doc.add_heading("Conclusions & Insights", level=1)
# NOTE(review): these bullets are fixed text, not computed from stats_results —
# re-check them against the statistics whenever the underlying data changes.
for insight in (
    "• Intervention significantly improved knowledge (p<0.001, large effect size).",
    "• Improvement strongest among undergraduates and rural groups need additional support.",
    "• Reliability of survey instruments acceptable (Cronbach’s α > 0.7).",
    "• Low baseline participants benefited most, demographics had minimal influence.",
):
    doc.add_paragraph(insight)

# Save DOCX into the report folder and echo the destination.
docx_path = REPORT_DIR / "HPV_Awareness_Impact_Report.docx"
doc.save(docx_path)
print(f"Saved DOCX report: {docx_path}")


Unnamed: 0,Test,Statistic,p-value,Significance
0,Shapiro (Pre),0.952597,0.02397356,★
1,Shapiro (Post),0.987631,0.8192138,•
2,Paired t-test,6.994845,3.245387e-09,★
3,Wilcoxon,151.0,7.869147e-08,★
4,ANOVA (Education),1.118131,0.3342061,•


Normality Checks:
Pre-Test: W=0.953, p=0.024
Post-Test: W=0.988, p=0.819
Paired t-test: t=6.995, p=0.000
Wilcoxon Signed-Rank: W=151.000, p=0.000
Cohen's d: 1.28
Statistical Power: 1.000
ANOVA (Education): F=1.118, p=0.334, Bonferroni corrected p=1.000
Cronbach's Alpha (Pre): 0.858
Cronbach's Alpha (Post): 0.736
Saved DOCX report: ../reports/final/HPV_Awareness_Impact_Report.docx
