In [None]:
import sqlite3
import pandas as pd
from pathlib import Path
from openpyxl.styles import Font, PatternFill, Alignment
from openpyxl.utils import get_column_letter
from datetime import datetime

# Resolve working folders: use the current directory when it is already named
# 'notebooks', otherwise its 'notebooks' child.
current_dir = Path.cwd()
if current_dir.name == 'notebooks':
    notebooks_dir = current_dir
else:
    notebooks_dir = current_dir / 'notebooks'

sql_file = notebooks_dir / 'database.sql'
output_dir = notebooks_dir / 'output'
output_dir.mkdir(parents=True, exist_ok=True)

# Start from a fresh scratch database file: drop any copy left by an earlier run
temp_db = output_dir / 'temp_database.db'
if temp_db.exists():
    temp_db.unlink()

# Open the scratch database, then replay the whole SQL script into it
conn = sqlite3.connect(str(temp_db))
with open(sql_file, 'r', encoding='utf-8') as script_handle:
    script_text = script_handle.read()
    conn.executescript(script_text)
conn.commit()
print("DB loaded")

In [None]:
# Find schools with students missing photo_url or disability_certificate_url.
# A document counts as missing when its URL column is NULL or an empty string.
# Only students who are participating in events (linked through
# registration_participants with participant_type = 'student') are included.
#
# The join to registration_participants can repeat a student once per matching
# row, so every count is COUNT(DISTINCT st.id).
# Rows are grouped on s.id, which already yields one row per school. No
# SELECT DISTINCT is applied on top: the projection omits s.id, so DISTINCT
# could silently merge two different schools that share code, name, location
# and counts.
schools_with_missing_docs = pd.read_sql_query("""
    SELECT
        s.school_code,
        s.name as school_name,
        s.location as school_location,
        COUNT(DISTINCT CASE WHEN st.photo_url IS NULL OR st.photo_url = '' THEN st.id END) as missing_photo_count,
        COUNT(DISTINCT CASE WHEN st.disability_certificate_url IS NULL OR st.disability_certificate_url = '' THEN st.id END) as missing_disability_cert_count,
        COUNT(DISTINCT st.id) as total_students_with_missing_docs
    FROM schools s
    INNER JOIN students st ON s.id = st.school_id
    INNER JOIN registration_participants rp ON st.id = rp.participant_id AND rp.participant_type = 'student'
    WHERE (st.photo_url IS NULL OR st.photo_url = '')
       OR (st.disability_certificate_url IS NULL OR st.disability_certificate_url = '')
    GROUP BY s.id, s.school_code, s.name, s.location
    ORDER BY total_students_with_missing_docs DESC, s.name
""", conn)

print(f"Found {len(schools_with_missing_docs)} schools with students missing documents")
print("\n" + "="*80)
display(schools_with_missing_docs)

In [None]:
# Detailed breakdown: one row per student with at least one missing document.
# A document counts as missing when its URL column is NULL or an empty string.
# Only students who are participating in events (linked through
# registration_participants with participant_type = 'student') are included.
#
# Participation is tested with EXISTS rather than INNER JOIN + SELECT DISTINCT.
# The join repeats a student once per matching registration row, and DISTINCT
# over the displayed columns (which do not include st.id) would also collapse
# two different students who share school, name, age category, gender and
# statuses, making the printed total undercount.
# NOTE(review): assumes students.id and schools.id are unique keys - confirm
# against the schema in database.sql.
students_missing_docs = pd.read_sql_query("""
    SELECT
        s.school_code,
        s.name as school_name,
        s.location as school_location,
        st.student_name,
        st.age_category,
        st.gender,
        CASE WHEN st.photo_url IS NULL OR st.photo_url = '' THEN 'Missing' ELSE 'Present' END as photo_status,
        CASE WHEN st.disability_certificate_url IS NULL OR st.disability_certificate_url = '' THEN 'Missing' ELSE 'Present' END as disability_cert_status
    FROM students st
    INNER JOIN schools s ON st.school_id = s.id
    WHERE EXISTS (
              SELECT 1
              FROM registration_participants rp
              WHERE rp.participant_id = st.id
                AND rp.participant_type = 'student'
          )
      AND (
              (st.photo_url IS NULL OR st.photo_url = '')
           OR (st.disability_certificate_url IS NULL OR st.disability_certificate_url = '')
          )
    ORDER BY s.name, st.student_name
""", conn)

print(f"Total students with missing documents: {len(students_missing_docs)}")
print("\n" + "="*80)
display(students_missing_docs)

In [None]:
# Overall totals across all affected schools and students.
# Restricted to students linked to a registration (participant_type = 'student')
# who are missing a photo URL, a disability certificate URL, or both.
summary_sql = """
    SELECT 
        COUNT(DISTINCT s.id) as total_schools_affected,
        COUNT(DISTINCT st.id) as total_students_affected,
        COUNT(DISTINCT CASE WHEN st.photo_url IS NULL OR st.photo_url = '' THEN st.id END) as students_missing_photo,
        COUNT(DISTINCT CASE WHEN st.disability_certificate_url IS NULL OR st.disability_certificate_url = '' THEN st.id END) as students_missing_disability_cert,
        COUNT(DISTINCT CASE WHEN (st.photo_url IS NULL OR st.photo_url = '')
                              AND (st.disability_certificate_url IS NULL OR st.disability_certificate_url = '') THEN st.id END) as students_missing_both
    FROM schools s
    INNER JOIN students st ON s.id = st.school_id
    INNER JOIN registration_participants rp ON st.id = rp.participant_id AND rp.participant_type = 'student'
    WHERE (st.photo_url IS NULL OR st.photo_url = '')
       OR (st.disability_certificate_url IS NULL OR st.disability_certificate_url = '')
"""
summary = pd.read_sql_query(summary_sql, conn)

# Heading, an 80-character rule, then the single-row result frame
print("Summary Statistics:", "="*80, sep="\n")
display(summary)

In [None]:
# Export to Excel
def format_worksheet(worksheet, df, *, padding=2, min_width=10, max_width=50):
    """Style the header row of a worksheet and size its columns to their content.

    The worksheet is expected to hold ``df`` with the header in row 1 and no
    index column, so that the i-th DataFrame column (1-based) sits in the i-th
    worksheet column.

    Args:
        worksheet: openpyxl worksheet to format; modified in place.
        df: DataFrame whose column labels and values determine the widths.
        padding: Extra characters added to the longest entry of each column.
        min_width: Smallest width a column may be given.
        max_width: Largest width a column may be given.

    Returns:
        None.
    """
    header_fill = PatternFill(start_color="366092", end_color="366092", fill_type="solid")
    header_font = Font(bold=True, color="FFFFFF", size=11)
    # Built once and shared by every header cell, like the fill and the font
    header_alignment = Alignment(horizontal="center", vertical="center", wrap_text=True)

    # Format header row
    for cell in worksheet[1]:
        cell.fill = header_fill
        cell.font = header_font
        cell.alignment = header_alignment

    # Auto-size columns from the longest of: the label, or any stringified value
    for position, label in enumerate(df.columns):
        # Positional access: df[label] returns a DataFrame when labels repeat,
        # which would break the length computation below
        values = df.iloc[:, position]
        longest_value = int(values.astype(str).map(len).max()) if len(values) > 0 else 0
        content_width = max(len(str(label)), longest_value)
        column_letter = get_column_letter(position + 1)
        worksheet.column_dimensions[column_letter].width = min(
            max(content_width + padding, min_width), max_width
        )

    # Freeze header row so it stays visible while scrolling, and make it taller
    worksheet.freeze_panes = "A2"
    worksheet.row_dimensions[1].height = 25

# Write every result frame to its own styled sheet of one timestamped workbook
excel_file = output_dir / f'missing_documents_{datetime.now().strftime("%Y%m%d_%H%M%S")}.xlsx'

# Sheet title -> frame, listed in the order the sheets are written
report_sheets = {
    'Schools Summary': schools_with_missing_docs,
    'Students Details': students_missing_docs,
    'Summary Statistics': summary,
}

with pd.ExcelWriter(excel_file, engine='openpyxl') as writer:
    for sheet_title, frame in report_sheets.items():
        frame.to_excel(writer, sheet_name=sheet_title, index=False)
        format_worksheet(writer.sheets[sheet_title], frame)

print(f"\nExported to: {excel_file}")
# The database connection is no longer needed once the workbook is written
conn.close()