**Purpose**: Perform the formal statistical tests (scale reliability, paired pre/post comparison, and associations with baseline variables) and generate the final Word document report.

# Step 1. Importing Libraries

In [1]:
import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import pingouin as pg
from docx import Document
from scipy import stats

print("Libraries imported successfully.")

Libraries imported successfully.


# Step 2. Loading Data

In [2]:
# Cleaned, merged data used for most analyses
df_final = pd.read_csv('../data/processed_data/cleaned_hpv_data.csv')

# Item-level sheets are needed ONLY for Cronbach's Alpha, which works on the
# individual question columns. Both sheets are requested in a single call so
# the workbook is opened and parsed once; passing a list of sheet names makes
# read_excel return a dict keyed by sheet name.
item_sheets = pd.read_excel(
    '../data/processed_data/summary_data.xlsx',
    sheet_name=['pretest', 'post_test'],
)
df_pre_full = item_sheets['pretest']
df_post_full = item_sheets['post_test']
print("Data loaded successfully.")

Data loaded successfully.


In [3]:
# Define mappings needed for the report generation

# Ordered list of the demographic column names in df_final. Only the values
# (column names) are iterated by the association analysis; the integer keys
# are not read anywhere in this notebook.
demographic_cols_map = {
    1: 'Age', 2: 'Gender', 3: 'Place_of_Residency', 4: 'Education',
    5: 'Vaccination_Status', 6: 'Health_Care_Access', 7: 'Occupation_of_Parents',
    8: 'Family_Income_per_Month'
}
# Per-column lookup from the numeric category code stored in df_final to the
# human-readable label printed in the report tables.
# NOTE(review): the Age bands share their boundary values (19 and 24 each appear
# in two labels) -- confirm the intended cut-offs against the questionnaire.
# NOTE(review): the income labels read "Below ₹14,997" followed by "₹14,977 - ...";
# the two figures differ, so one of them is probably a typo -- confirm which.
demographic_maps = {
    'Age': {0: '15-19 Years', 1: '19-24 Years', 2: '24 Years and above'},
    'Gender': {0: 'Female', 1: 'Male'},
    'Place_of_Residency': {0: 'Rural', 1: 'Semi-Urban', 2: 'Urban'},
    'Education': {0: 'High school', 1: 'Under graduation', 2: 'Post-graduation'},
    'Vaccination_Status': {0: 'Irregularly vaccinated', 1: 'Regularly vaccinated', 2: 'Not vaccinated'},
    'Health_Care_Access': {0: 'Easily accessible', 1: 'Not accessible', 2: 'Not interested'},
    'Occupation_of_Parents': {0: 'Professional', 1: 'Retired', 2: 'Skilled', 3: 'Unemployed'},
    'Family_Income_per_Month': {
        0: 'Below ₹14,997', 1: '₹14,977 - ₹22,494', 2: '₹22,495 - ₹37,492',
        3: '₹37,493 - ₹74,999', 4: '₹75,000 and above'
    }
}
print("Data and mappings loaded successfully.")

Data and mappings loaded successfully.


# Step 3. Performing Statistical Analyses

In [4]:
# 3.1. Reliability Analysis (Cronbach's Alpha)
# The item columns are addressed by the integer labels 1..33.
question_cols = list(range(1, 34))
pre_items = df_pre_full[question_cols]
post_items = df_post_full[question_cols]
# cronbach_alpha returns the alpha first; only that element is kept.
alpha_pre = pg.cronbach_alpha(data=pre_items)[0]
alpha_post = pg.cronbach_alpha(data=post_items)[0]

In [5]:
# 3.2. Paired T-test for Intervention Effectiveness
pre_scores = df_final['pre_test_score']
post_scores = df_final['post_test_score']

# Descriptive statistics shown next to the test result in the report
pre_mean = pre_scores.mean()
pre_std = pre_scores.std()
post_mean = post_scores.mean()
post_std = post_scores.std()

# Post-test is passed first, so a positive t statistic means scores went up.
t_stat_paired, p_val_paired = stats.ttest_rel(post_scores, pre_scores)

In [6]:
# 3.3. Association with Baseline Variables (t-test/ANOVA)
def _format_p_value(p_value):
    """Render a p-value for the association table.

    Returns "N/A" when no test could be run (NaN). Otherwise the value is
    shown with three decimals, except that values below 0.001 are shown as
    "<0.001" rather than a misleading "0.000". A trailing ' **' marks
    p < 0.05 and ' ns' marks p >= 0.05.
    """
    if np.isnan(p_value):
        return "N/A"
    marker = ' **' if p_value < 0.05 else ' ns'
    shown = "<0.001" if p_value < 0.001 else f"{p_value:.3f}"
    return f"{shown}{marker}"


assoc_table_data = []
for col in demographic_cols_map.values():
    # One pass per variable: groupby yields the categories in sorted code order
    # and leaves out rows whose category code is missing, so a NaN code can no
    # longer produce an empty, all-NaN row in the table.
    category_frames = list(df_final.groupby(col))

    # Categories with a single respondent are excluded from the test itself
    # but are still listed in the table below.
    testable = [subset for _code, subset in category_frames if len(subset) > 1]
    groups_pre = [subset['pre_test_score'] for subset in testable]
    groups_post = [subset['post_test_score'] for subset in testable]

    test_type, p_pre, p_post = "N/A", np.nan, np.nan
    if len(testable) == 2:
        test_type = "t-test"
        p_pre = stats.ttest_ind(*groups_pre)[1]
        p_post = stats.ttest_ind(*groups_post)[1]
    elif len(testable) > 2:
        test_type = "ANOVA"
        p_pre = stats.f_oneway(*groups_pre)[1]
        p_post = stats.f_oneway(*groups_post)[1]

    # The p-values belong to the whole variable, so format them once and
    # repeat them on every category row.
    p_pre_text = _format_p_value(p_pre)
    p_post_text = _format_p_value(p_post)

    for group_val, subset in category_frames:
        assoc_table_data.append([
            col.replace('_', ' '),
            # Fall back to the raw code when no label is defined for it
            demographic_maps[col].get(group_val, group_val),
            f"{subset['pre_test_score'].mean():.2f}",
            f"{subset['post_test_score'].mean():.2f}",
            test_type,
            p_pre_text,
            p_post_text
        ])

df_assoc_final = pd.DataFrame(assoc_table_data, columns=["Baseline Variable", "Category", "Pre-test Mean", "Post-test Mean", "Test", "p-value (Pre)", "p-value (Post)"])

print("All statistical analyses complete.")

All statistical analyses complete.


# Step 4. Generating Comprehensive Word Document Report

In [7]:
doc = Document()
# The sample size in the title is taken from the analysed data rather than
# hard-coded, so it cannot go stale when the dataset changes.
# NOTE(review): assumes df_final holds one row per participant -- confirm.
doc.add_heading(f'Statistical Analysis Report: HPV Awareness Study (N={len(df_final)})', level=1)
doc.add_paragraph(f"Report generated on: {datetime.date.today().strftime('%B %d, %Y')}")

# Helper that renders a pandas DataFrame as a table in the Word document
def add_df_to_doc(document, df, title=""):
    """Append `df` to `document` as a 'Table Grid' table.

    Parameters
    ----------
    document : object exposing add_heading, add_table and add_paragraph
        The Word document being built.
    df : pandas.DataFrame
        Data to render; the column labels become the header row and every
        value is written as its str() representation.
    title : str, optional
        When non-empty, a level-2 heading with this text is added before
        the table.

    An empty paragraph is appended after the table as spacing.
    """
    if title:
        document.add_heading(title, level=2)

    n_rows, n_cols = df.shape
    # One extra row on top for the column headers
    table = document.add_table(n_rows + 1, n_cols)
    table.style = 'Table Grid'

    for col_idx, header in enumerate(df.columns):
        table.cell(0, col_idx).text = str(header)

    # Data rows start at table row 1, directly below the header row
    for row_idx, record in enumerate(df.itertuples(index=False), start=1):
        for col_idx, value in enumerate(record):
            table.cell(row_idx, col_idx).text = str(value)

    document.add_paragraph()

## Section 1: Demographic Profile

In [8]:
demo_summary_data = []
for column, code_labels in demographic_maps.items():
    # Count by numeric code and sort by code so ordered categories keep their
    # natural order (e.g. High school, Under graduation, Post-graduation),
    # matching the row order of the association table. value_counts leaves
    # missing values out of both the counts and the percentage base.
    code_counts = df_final[column].value_counts().sort_index()
    n_total = code_counts.sum()
    for code, n in code_counts.items():
        # Show the raw code when no label is defined instead of silently
        # dropping those respondents from the table.
        category = code_labels.get(code, code)
        demo_summary_data.append([column.replace('_', ' '), category, n, f"{n / n_total * 100:.1f}%"])
df_demo_summary = pd.DataFrame(demo_summary_data, columns=["Variable", "Category", "Frequency (n)", "Percentage (%)"])
add_df_to_doc(doc, df_demo_summary, "Table 1: Frequency and Percentage of Demographic Variables")

## Section 2: Reliability and Intervention Effectiveness

In [9]:
doc.add_heading('Table 2: Reliability of the Test and Intervention Effectiveness', level=2)

# The reliability sentence is chosen from the computed alphas so the report
# cannot claim high consistency when the data do not support it. 0.70 is used
# here as the conventional minimum for an acceptable Cronbach's alpha.
if alpha_pre >= 0.70 and alpha_post >= 0.70:
    reliability_text = f"The HPV Knowledge Scale showed high internal consistency both before the intervention (Cronbach's α = {alpha_pre:.2f}) and after (Cronbach's α = {alpha_post:.2f}), confirming it is a reliable tool."
else:
    reliability_text = f"The internal consistency of the HPV Knowledge Scale before the intervention (Cronbach's α = {alpha_pre:.2f}) and after (Cronbach's α = {alpha_post:.2f}) did not reach the conventional 0.70 threshold on both occasions, so the scores should be interpreted with caution."
doc.add_paragraph(reliability_text)

paired_data = pd.DataFrame({
    "Test": ["Pre-test", "Post-test"],
    "Mean Score": [f"{pre_mean:.2f}", f"{post_mean:.2f}"],
    "Standard Deviation": [f"{pre_std:.2f}", f"{post_std:.2f}"]
})
add_df_to_doc(doc, paired_data)

# Direction, significance wording and the reported p-value all follow the
# computed results instead of being fixed text.
change_word = "increase" if post_mean >= pre_mean else "decrease"
if p_val_paired < 0.001:
    significance_text, p_text = "highly significant", "p-value < 0.001"
elif p_val_paired < 0.05:
    significance_text, p_text = "statistically significant", f"p-value = {p_val_paired:.3f}"
else:
    # Also reached when the p-value is NaN, i.e. the test could not be computed
    significance_text, p_text = "not statistically significant", f"p-value = {p_val_paired:.3f}"
doc.add_paragraph(f"A Paired Samples t-test revealed that the {change_word} in scores from pre-test to post-test was {significance_text} (t-statistic = {t_stat_paired:.2f}, {p_text}).")

<docx.text.paragraph.Paragraph at 0x7f327bcb0bd0>

## Section 3: Association with Baseline Variables

In [10]:
# Table 3, followed by the legend explaining its significance markers
assoc_table_title = "Table 3: Association Between Awareness Levels and Baseline Variables"
assoc_table_note = "*Note: The p-value is for the entire variable group. 'ns' denotes non-significant (p >= 0.05), '**' denotes significant (p < 0.05).*"
add_df_to_doc(doc, df_assoc_final, assoc_table_title)
doc.add_paragraph(assoc_table_note)

<docx.text.paragraph.Paragraph at 0x7f327bc730d0>

# Step 5. Saving the Document

In [11]:
file_name = '../reports/Final_Statistical_Report.docx'
# Create the output folder if it is missing, so a fresh checkout does not
# fail at the very last step after all the analysis has run.
Path(file_name).parent.mkdir(parents=True, exist_ok=True)
doc.save(file_name)

print(f"\n Report generation complete. Saved as '{file_name}' 📄")


 Report generation complete. Saved as '../reports/Final_Statistical_Report.docx' 📄
