**Purpose**: To load the raw data, clean it, apply mappings, merge everything, and save a single, clean CSV file to be used by subsequent notebooks.

# Section 1 : Data Ingestion and SQLite Setup

In [1]:
import sys
from pathlib import Path

import pandas as pd

# NOTE(review): this is an absolute, machine-specific path, so the notebook will
# only find hpv_db_utils on the original author's machine. Consider deriving it
# from the repository layout instead — TODO confirm where `scripts/` sits
# relative to this notebook.
sys.path.append("/home/Davcote/Desktop/HPV-Awareness-Impact-Analysis/scripts")
import hpv_db_utils as hpv

## Listing all Tables present inside our Database

In [2]:
# Sanity check: ask sqlite_master for the name of every table in the database.
table_listing_query = "SELECT name FROM sqlite_master WHERE type='table';"
tables = hpv.run_sql(table_listing_query)

# Rich notebook rendering; in a plain Python session use print(tables) instead.
display(tables)

Unnamed: 0,name
0,raw_data_PRE_TEST
1,raw_data_HPV-KS
2,raw_data_demographic_variables
3,raw_data_1_DEMO_coded
4,raw_data_1_HPV_CODED
5,raw_data_POST_TEST
6,raw_data_DEMO_2
7,raw_data_coded_demo_2
8,raw_data_HPV_KS_2
9,raw_data_coded_hpv_2


In [4]:
# Preview only: df_demo holds just the first 5 rows of the `demographic` table.
# The table name is written as a plain identifier (as in the full-table load
# further down) rather than as a single-quoted string literal, which SQLite
# only accepts as an identifier for backwards compatibility.
df_demo = hpv.run_sql("SELECT * FROM demographic LIMIT 5;")
display(df_demo)

Unnamed: 0,Sno,1,2,3,4,5,6,7,8
0,1,1,1,0,1,2,0,0,2
1,2,0,0,2,1,1,2,3,0
2,3,0,1,0,1,1,2,2,0
3,4,1,1,2,1,2,0,0,3
4,5,0,0,1,1,2,2,2,0


In [3]:
# Preview only: df_pre holds just the first 5 rows of the `pretest` table.
# The table name is written as a plain identifier (as in the full-table load
# further down) rather than as a single-quoted string literal, which SQLite
# only accepts as an identifier for backwards compatibility.
df_pre = hpv.run_sql("SELECT * FROM pretest LIMIT 5;")
display(df_pre)

Unnamed: 0,Sno,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,Total_points
0,1,1,1,1,1,1,1,1,0,0,...,0,0,1,1,1,1,0,1,1,22
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,1,0,1,0,1,0,1,0,1,...,0,1,1,0,1,0,0,0,1,17
3,4,1,1,1,0,1,0,1,0,0,...,0,0,1,0,0,0,0,0,0,13
4,5,1,1,0,0,1,0,0,0,0,...,1,0,1,1,0,1,0,0,1,15


In [17]:
# Preview only: df_post holds just the first 5 rows of the `post_test` table.
# The table name is written as a plain identifier (as in the full-table load
# further down) rather than as a single-quoted string literal, which SQLite
# only accepts as an identifier for backwards compatibility.
df_post = hpv.run_sql("SELECT * FROM post_test LIMIT 5;")
display(df_post)


Unnamed: 0,Sno,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,Total
0,1,1,1,1,0,1,0,1,0,1,...,0,0,1,1,1,1,0,0,1,19
1,2,1,1,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,11
2,3,1,1,1,1,1,1,1,1,1,...,0,1,1,1,0,0,1,0,0,23
3,4,1,1,1,1,1,1,1,0,1,...,1,1,1,1,0,1,0,0,1,24
4,5,1,1,1,1,1,0,1,1,1,...,0,1,1,1,1,1,1,0,1,24


Conclusion : All datasets have been correctly ingested into the database.

# Section 2 : Data Cleaning and Preprocessing

## Step 1. Defining Mappings

In [5]:
# Codebook used to turn the coded survey columns into human-readable
# columns for easier analysis later.

# Column name '1'..'8' of the demographic table -> descriptive variable name.
demographic_cols_map = {
    str(question_no): variable
    for question_no, variable in enumerate(
        (
            'Age',
            'Gender',
            'Place_of_Residency',
            'Education',
            'Vaccination_Status',
            'Health_Care_Access',
            'Occupation_of_Parents',
            'Family_Income_per_Month',
        ),
        start=1,
    )
}

# Descriptive variable name -> {numeric answer code: display label}.
demographic_maps = {
    # NOTE(review): the age bands share their boundary years (19 and 24) —
    # confirm the intended ranges against the questionnaire.
    'Age': {
        0: '15-19 Years',
        1: '19-24 Years',
        2: '24 Years and above',
    },
    'Gender': {
        0: 'Female',
        1: 'Male',
    },
    'Place_of_Residency': {
        0: 'Rural',
        1: 'Semi-Urban',
        2: 'Urban',
    },
    'Education': {
        0: 'High school',
        1: 'Under graduation',
        2: 'Post-graduation',
    },
    'Vaccination_Status': {
        0: 'Irregularly vaccinated',
        1: 'Regularly vaccinated',
        2: 'Not vaccinated',
    },
    'Health_Care_Access': {
        0: 'Easily accessible',
        1: 'Not accessible',
        2: 'Not interested',
    },
    'Occupation_of_Parents': {
        0: 'Professional',
        1: 'Retired',
        2: 'Skilled',
        3: 'Unemployed',
    },
    # NOTE(review): band 0 ends at 14,997 but band 1 starts at 14,977; one of
    # the two is presumably a typo. Left unchanged because these labels are
    # written to the output CSV — TODO confirm against the questionnaire.
    'Family_Income_per_Month': {
        0: 'Below ₹14,997',
        1: '₹14,977 - ₹22,494',
        2: '₹22,495 - ₹37,492',
        3: '₹37,493 - ₹74,999',
        4: '₹75,000 and above',
    },
}

## Step 2. Loading Full Data from DB

In [6]:
# Read each table in full (no LIMIT) into its own DataFrame, in the order
# demographic -> pretest -> post_test, using the helper module's engine.
df_demo_full, df_pre_full, df_post_full = (
    pd.read_sql_query(f"SELECT * FROM {table_name}", hpv.engine)
    for table_name in ("demographic", "pretest", "post_test")
)
print("Data fetched successfully from database tables.")

Data fetched successfully from database tables.


## Step 3. Cleaning and Merging Data

In [7]:
# Renaming columns for clarity: coded demographic headers get descriptive names,
# and the two differently named total columns get parallel names.
df_demo_full = df_demo_full.rename(columns=demographic_cols_map)
df_pre_full = df_pre_full.rename(columns={'Total_points': 'pre_test_score'})
df_post_full = df_post_full.rename(columns={'Total': 'post_test_score'})

# Merging the dataframes into a single master dataframe, keyed on participant Sno.
# Only the total score is taken from each test table.
#  - how='inner' is spelled out: participants missing from either test are dropped.
#  - validate='one_to_one' raises pandas.errors.MergeError if an Sno is duplicated
#    on either side, instead of silently multiplying rows in the output.
df_merged = pd.merge(
    df_demo_full,
    df_pre_full[['Sno', 'pre_test_score']],
    on='Sno',
    how='inner',
    validate='one_to_one',
)
df_final = pd.merge(
    df_merged,
    df_post_full[['Sno', 'post_test_score']],
    on='Sno',
    how='inner',
    validate='one_to_one',
)

# With unique keys the inner joins can only shrink the demographic table, so the
# difference is the number of participants lacking a pre- or post-test row.
n_unmatched = len(df_demo_full) - len(df_final)
if n_unmatched:
    print(f"Warning: {n_unmatched} participant(s) dropped for lacking a pre- or post-test record.")
print("Dataframes merged successfully.")

Dataframes merged successfully.


## Step 4. Creating Human-Readable Label Columns

In [8]:
# Add a readable `<column>_Label` companion for every coded demographic column;
# this makes visualizations much easier to interpret. The numeric codes are kept.
for column, mapping in demographic_maps.items():
    labels = df_final[column].map(mapping)
    # .map() turns any code missing from the mapping into NaN without complaint,
    # so report such codes here rather than letting blank labels reach the CSV.
    unmapped_codes = df_final.loc[labels.isna() & df_final[column].notna(), column].unique()
    if len(unmapped_codes) > 0:
        print(f"Warning: '{column}' contains codes with no label: {unmapped_codes.tolist()}")
    df_final[f'{column}_Label'] = labels

# Add a score improvement column for analysis (positive = higher post-test score)
df_final['score_improvement'] = df_final['post_test_score'] - df_final['pre_test_score']

print("Dataframes cleaned, merged, and new label columns created.")
print("\nFinal Data Head:")
display(df_final.head())

Dataframes cleaned, merged, and new label columns created.

Final Data Head:


Unnamed: 0,Sno,Age,Gender,Place_of_Residency,Education,Vaccination_Status,Health_Care_Access,Occupation_of_Parents,Family_Income_per_Month,pre_test_score,post_test_score,Age_Label,Gender_Label,Place_of_Residency_Label,Education_Label,Vaccination_Status_Label,Health_Care_Access_Label,Occupation_of_Parents_Label,Family_Income_per_Month_Label,score_improvement
0,1,1,1,0,1,2,0,0,2,22,19,19-24 Years,Male,Rural,Under graduation,Not vaccinated,Easily accessible,Professional,"₹22,495 - ₹37,492",-3
1,2,0,0,2,1,1,2,3,0,0,11,15-19 Years,Female,Urban,Under graduation,Regularly vaccinated,Not interested,Unemployed,"Below ₹14,997",11
2,3,0,1,0,1,1,2,2,0,17,23,15-19 Years,Male,Rural,Under graduation,Regularly vaccinated,Not interested,Skilled,"Below ₹14,997",6
3,4,1,1,2,1,2,0,0,3,13,24,19-24 Years,Male,Urban,Under graduation,Not vaccinated,Easily accessible,Professional,"₹37,493 - ₹74,999",11
4,5,0,0,1,1,2,2,2,0,15,24,15-19 Years,Female,Semi-Urban,Under graduation,Not vaccinated,Not interested,Skilled,"Below ₹14,997",9


## Step 5. Saving the Processed Data

In [9]:
# Destination of the cleaned dataset, relative to the notebook's working directory.
output_path = '../data/processed_data/cleaned_hpv_data.csv'

# to_csv does not create missing folders, so make sure the target directory
# exists (e.g. on a fresh checkout) before writing.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

# index=False keeps the DataFrame's positional index out of the file.
df_final.to_csv(output_path, index=False)
print(f"\nCleaned data saved successfully to '{output_path}'")


Cleaned data saved successfully to '../data/processed_data/cleaned_hpv_data.csv'
