# Section 1 : Data Ingestion and Sqlite setup

In [73]:
import os
import sqlite3
import pandas as pd
from sqlalchemy import create_engine


# Path to our DB
db_path = "/home/Davcote/Desktop/HPV-Awareness-Impact-Analysis/data/database/HPV.db"
engine = create_engine(f"sqlite:///{db_path}")

# Connect
conn = sqlite3.connect(db_path)

def run_sql(query):
    """Run SQL query and return results as a pandas DataFrame"""
    return pd.read_sql_query(query, conn)

def insert_processed(df: pd.DataFrame, name: str, engine=engine):
    """
    Inserts a processed DataFrame into the database.
    
    Parameters:
    - df (pd.DataFrame): The DataFrame to insert
    - name (str): The table name (will be prefixed with 'processed_')
    - engine: SQLAlchemy engine (default = HPV.db engine)

    Usage : insert_processed(df_name, "Name_of_table_you_want_to_create")
    """
    table_name = f"processed_{name}"
    try:
        df.to_sql(table_name, con=engine, if_exists="replace", index=False)
        print(f"✅ Inserted into table: {table_name} ({len(df)} rows)")
    except Exception as e:
        print(f"❌ Error inserting {table_name}: {e}")



def insert_file_to_db(file_path: str, engine=engine):
    """
    Inserts a CSV or Excel file into the SQLite DB.
    If the table exists, it will be replaced.

    Parameters:
    - file_path (str): Path to the file (.csv, .xls, .xlsx)
    - engine: SQLAlchemy engine

    Usage : insert_file_to_db("/fullpath/to/your/file.csv or xlsx or xls")
    """
    try:
        if not os.path.exists(file_path):
            print(f"❌ File not found: {file_path}")
            return

        # Derive table name from filename
        table_name = os.path.splitext(os.path.basename(file_path))[0]

        # Load file
        if file_path.lower().endswith(".csv"):
            df = pd.read_csv(file_path)
            df.to_sql(table_name, con=engine, if_exists="replace", index=False)
            print(f"✅ Inserted CSV into table: {table_name} ({len(df)} rows)")

        elif file_path.lower().endswith((".xls", ".xlsx")):
            xls = pd.read_excel(file_path, sheet_name=None)  # dict of DataFrames
            for sheet_name, df in xls.items():
                sheet_table = f"{table_name}_{sheet_name}"
                df.to_sql(sheet_table, con=engine, if_exists="replace", index=False)
                print(f"✅ Inserted Excel sheet '{sheet_name}' into table: {sheet_table} ({len(df)} rows)")
        else:
            print(f"❌ Unsupported file type: {file_path}")

    except Exception as e:
        print(f"❌ Error inserting {file_path}: {e}")



Replacing raw `summary_data.xlsx` dataset with processed `summary_data.xlsx` dataset, because
- raw summary_data sheets contained "comments" with column names, so we removed them using Excel
- there were `spaces` with names of sheets, so we manually removed them using Excel

In [74]:
insert_file_to_db("/home/Davcote/Desktop/HPV-Awareness-Impact-Analysis/data/processed_data/summary_data.xx")

❌ File not found: /home/Davcote/Desktop/HPV-Awareness-Impact-Analysis/data/processed_data/summary_data.xx


# Listing all Tables present inside our Database

In [75]:
# List all tables in the DB to verify
tables = run_sql("SELECT name FROM sqlite_master WHERE type='table';")
display(tables) #can use print(tables) as well for non-notebook environment

Unnamed: 0,name
0,raw_data_PRE_TEST
1,raw_data_HPV-KS
2,raw_data_demographic_variables
3,raw_data_1_DEMO_coded
4,raw_data_1_HPV_CODED
5,raw_data_POST_TEST
6,raw_data_DEMO_2
7,raw_data_coded_demo_2
8,raw_data_HPV_KS_2
9,raw_data_coded_hpv_2


In [76]:
# Assigning summary_data_demographic table to a dataframe and displaying first 5 rows
df_demo = run_sql("SELECT * FROM 'summary_data_demographic' LIMIT 5;")
display(df_demo)


Unnamed: 0,Sno,1,2,3,4,5,6,7,8
0,1,1,1,0,1,2,0,0,2
1,2,0,0,2,1,1,2,3,0
2,3,0,1,0,1,1,2,2,0
3,4,1,1,2,1,2,0,0,3
4,5,0,0,1,1,2,2,2,0


In [77]:
# Assigning summary_data_pretest table to a dataframe and displaying first 5 rows
df_pre = run_sql("SELECT * FROM 'summary_data_pretest' LIMIT 5;")
display(df_pre)

Unnamed: 0,Sno,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,Total points
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,22.0
1,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,17.0
3,4.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,13.0
4,5.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,15.0


In [78]:
# Assigning summary_data_posttest table to a dataframe and displaying first 5 rows
df_post = run_sql("SELECT * FROM 'summary_data_post_test' LIMIT 5;")
display(df_post)


Unnamed: 0,Sno,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,Total
0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,19.0
1,2.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,11.0
2,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,23.0
3,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,...,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,24.0
4,5.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,24.0


Conclusion : Datatype of each table is correctly assigned (float64)

# Section 2 : Data Cleaning and Preprocessing

In [79]:
# --- 1. Defining Mappings ---
# These mappings will be used to create human-readable columns for easier analysis later
demographic_cols_map = {
    '1': 'Age', 
    '2': 'Gender', 
    '3': 'Place_of_Residency', 
    '4': 'Education',
    '5': 'Vaccination_Status', 
    '6': 'Health_Care_Access', 
    '7': 'Occupation_of_Parents',
    '8': 'Family_Income_per_Month'
}


demographic_maps = {
    'Age': {0: '15-19 Years', 1: '19-24 Years', 2: '24 Years and above'},
    'Gender': {0: 'Female', 1: 'Male'},
    'Place_of_Residency': {0: 'Rural', 1: 'Semi-Urban', 2: 'Urban'},
    'Education': {0: 'High school', 1: 'Under graduation', 2: 'Post-graduation'},
    'Vaccination_Status': {0: 'Irregularly vaccinated', 1: 'Regularly vaccinated', 2: 'Not vaccinated'},
    'Health_Care_Access': {0: 'Easily accessible', 1: 'Not accessible', 2: 'Not interested'},
    'Occupation_of_Parents': {0: 'Professional', 1: 'Retired', 2: 'Skilled', 3: 'Unemployed'},
    'Family_Income_per_Month': {
        0: 'Below ₹14,997', 1: '₹14,977 - ₹22,494', 2: '₹22,495 - ₹37,492',
        3: '₹37,493 - ₹74,999', 4: '₹75,000 and above'
    }
}

In [80]:
# --- 2. Loading Full Data from DB ---
# Loading full tables into DataFrames
df_demo_full = pd.read_sql_query("SELECT * FROM summary_data_demographic", engine)
df_pre_full = pd.read_sql_query("SELECT * FROM summary_data_pretest", engine)
df_post_full = pd.read_sql_query("SELECT * FROM summary_data_post_test", engine)
print("Data fetched successfully from database tables.")

Data fetched successfully from database tables.


In [81]:
# --- 3. Cleaning and Merging Data ---
df_demo_full = df_demo_full.rename(columns=demographic_cols_map)
df_pre_full = df_pre_full.rename(columns={'Total points': 'pre_test_score'})
df_post_full = df_post_full.rename(columns={'Total': 'post_test_score'})

# Merging the dataframes into a single master dataframe
df_merged = pd.merge(df_demo_full, df_pre_full[['Sno', 'pre_test_score']], on='Sno')
df_final = pd.merge(df_merged, df_post_full[['Sno', 'post_test_score']], on='Sno')
print("Dataframes merged successfully.")

Dataframes merged successfully.


In [82]:
# --- 4. Creating Human-Readable Label Columns ---
# This makes visualizations much easier to interpret
for column, mapping in demographic_maps.items():
    df_final[f'{column}_Label'] = df_final[column].map(mapping)

# Add a score improvement column for analysis
df_final['score_improvement'] = df_final['post_test_score'] - df_final['pre_test_score']

print("Dataframes cleaned, merged, and new label columns created.")
print("\nFinal Data Head:")
display(df_final.head())

Dataframes cleaned, merged, and new label columns created.

Final Data Head:


Unnamed: 0,Sno,Age,Gender,Place_of_Residency,Education,Vaccination_Status,Health_Care_Access,Occupation_of_Parents,Family_Income_per_Month,pre_test_score,post_test_score,Age_Label,Gender_Label,Place_of_Residency_Label,Education_Label,Vaccination_Status_Label,Health_Care_Access_Label,Occupation_of_Parents_Label,Family_Income_per_Month_Label,score_improvement
0,1,1,1,0,1,2,0,0,2,22.0,19.0,19-24 Years,Male,Rural,Under graduation,Not vaccinated,Easily accessible,Professional,"₹22,495 - ₹37,492",-3.0
1,2,0,0,2,1,1,2,3,0,0.0,11.0,15-19 Years,Female,Urban,Under graduation,Regularly vaccinated,Not interested,Unemployed,"Below ₹14,997",11.0
2,3,0,1,0,1,1,2,2,0,17.0,23.0,15-19 Years,Male,Rural,Under graduation,Regularly vaccinated,Not interested,Skilled,"Below ₹14,997",6.0
3,4,1,1,2,1,2,0,0,3,13.0,24.0,19-24 Years,Male,Urban,Under graduation,Not vaccinated,Easily accessible,Professional,"₹37,493 - ₹74,999",11.0
4,5,0,0,1,1,2,2,2,0,15.0,24.0,15-19 Years,Female,Semi-Urban,Under graduation,Not vaccinated,Not interested,Skilled,"Below ₹14,997",9.0


In [83]:
# --- 5. Saving the Processed Data ---
output_path = '../data/processed_data/cleaned_hpv_data.csv'
df_final.to_csv(output_path, index=False)
print(f"\nCleaned data saved successfully to '{output_path}'")


Cleaned data saved successfully to '../data/processed_data/cleaned_hpv_data.csv'
