**Purpose**: To load raw data, clean it, apply mappings, merge datasets, and save a clean CSV file. This notebook includes missing data handling, outlier checks, and a data dictionary for reproducibility.

## Section 1: Importing Libraries and Configuration Setup

In [1]:
import sys
import pandas as pd
import numpy as np
import logging

# Configure logging
from pathlib import Path  # local import: only needed for the path handling in this cell

# logging's file handler does not create missing parent directories, so make
# sure the log folder exists before basicConfig tries to open the file.
Path('../logs').mkdir(parents=True, exist_ok=True)
logging.basicConfig(filename='../logs/data_cleaning.log', level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')

# Make the project's helper module importable.
# The original machine-specific absolute path is kept first so existing setups
# behave exactly as before; the relative entry is a fallback for other checkouts.
_scripts_dirs = [
    Path('/home/Davcote/Desktop/HPV-Awareness-Impact-Analysis/scripts'),
    Path('../scripts').resolve(),  # assumes scripts/ sits next to logs/ and data/ — TODO confirm
]
for _scripts_dir in _scripts_dirs:
    # Skip entries that are already present so re-running the cell does not
    # keep growing sys.path.
    if str(_scripts_dir) not in sys.path:
        sys.path.append(str(_scripts_dir))
import hpv_db_utils as hpv

# Display all columns
pd.set_option('display.max_columns', 200)
print('Libraries imported successfully.')
logging.info('Libraries imported successfully.')

Libraries imported successfully.


## Section 2: Data Ingestion and SQLite Setup

### Step 1: Listing all tables in the database

In [2]:
# List every table registered in the SQLite schema catalogue.
# The type filter uses a single-quoted string literal: in SQL, double quotes
# denote identifiers, and SQLite only treats "table" as a string through a
# legacy compatibility quirk that can be disabled at build time.
tables = hpv.run_sql("SELECT name FROM sqlite_master WHERE type='table';")
display('Tables in HPV database', tables)
logging.info(f'Tables found in database: {tables["name"].tolist()}')

'Tables in HPV database'

Unnamed: 0,name
0,raw_data_PRE_TEST
1,raw_data_HPV-KS
2,raw_data_demographic_variables
3,raw_data_1_DEMO_coded
4,raw_data_1_HPV_CODED
5,raw_data_POST_TEST
6,raw_data_DEMO_2
7,raw_data_coded_demo_2
8,raw_data_HPV_KS_2
9,raw_data_coded_hpv_2


### Step 2: Loading demographic, pretest, and post-test data from database

In [3]:
# Pull the three source datasets into DataFrames, in the order
# demographic -> pretest -> post-test.
# NOTE(review): none of these names appear in the table listing produced by the
# previous cell (which only shows raw_data_* tables); presumably they are views
# or are resolved inside hpv.run_sql — confirm.
df_demo, df_pre, df_post = (
    hpv.run_sql(query)
    for query in (
        'SELECT * FROM demographic',
        'SELECT * FROM pretest',
        'SELECT * FROM post_test',
    )
)
print('Data loaded successfully.')
logging.info('Demographic, pretest, and post-test data loaded successfully.')

Data loaded successfully.


### Step 3: Displaying first 5 rows of each table

In [4]:
# Show a heading followed by the first five rows of each loaded dataset.
for heading, frame in (
    ('\nDemographic Data (first 5 rows):', df_demo),
    ('\nPretest Data (first 5 rows):', df_pre),
    ('\nPost-test Data (first 5 rows):', df_post),
):
    print(heading)
    display(frame.head())


Demographic Data (first 5 rows):


Unnamed: 0,Sno,1,2,3,4,5,6,7,8
0,1,1,1,0,1,2,0,0,2
1,2,0,0,2,1,1,2,3,0
2,3,0,1,0,1,1,2,2,0
3,4,1,1,2,1,2,0,0,3
4,5,0,0,1,1,2,2,2,0



Pretest Data (first 5 rows):


Unnamed: 0,Sno,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,Total_points
0,1,1,1,1,1,1,1,1,0,0,1,1,0,1,1,0,1,0,1,1,0,0,0,1,1,0,0,1,1,1,1,0,1,1,22
1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,3,1,0,1,0,1,0,1,0,1,0,1,1,1,0,1,0,0,1,1,0,1,0,0,1,0,1,1,0,1,0,0,0,1,17
3,4,1,1,1,0,1,0,1,0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,13
4,5,1,1,0,0,1,0,0,0,0,0,1,0,1,0,1,0,0,1,0,1,0,1,0,1,1,0,1,1,0,1,0,0,1,15



Post-test Data (first 5 rows):


Unnamed: 0,Sno,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,Total
0,1,1,1,1,0,1,0,1,0,1,0,1,1,1,0,1,1,0,1,0,0,0,1,0,1,0,0,1,1,1,1,0,0,1,19
1,2,1,1,0,0,1,1,0,0,0,0,1,0,0,1,0,1,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,11
2,3,1,1,1,1,1,1,1,1,1,1,0,1,0,0,1,1,1,0,1,0,1,1,1,1,0,1,1,1,0,0,1,0,0,23
3,4,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,0,0,1,0,0,1,1,0,1,1,1,1,1,0,1,0,0,1,24
4,5,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,0,0,1,0,0,1,1,0,0,0,1,1,1,1,1,1,0,1,24


## Section 3: Data Cleaning and Preprocessing

### Step 1: Defining mappings for human-readable columns

In [5]:
# Maps the numbered demographic columns (as stored in the database) to
# descriptive column names.
demographic_cols_map = {
    '1': 'Age',
    '2': 'Gender',
    '3': 'Place_of_Residency',
    '4': 'Education',
    '5': 'Vaccination_Status',
    '6': 'Health_Care_Access',
    '7': 'Occupation_of_Parents',
    '8': 'Family_Income_per_Month'
}

# For each renamed demographic column: integer code -> human-readable label.
demographic_maps = {
    'Age': {0: '15-19 Years', 1: '19-24 Years', 2: '24 Years and above'},
    'Gender': {0: 'Female', 1: 'Male'},
    'Place_of_Residency': {0: 'Rural', 1: 'Semi-Urban', 2: 'Urban'},
    'Education': {0: 'High school', 1: 'Under graduation', 2: 'Post-graduation'},
    'Vaccination_Status': {0: 'Irregularly vaccinated', 1: 'Regularly vaccinated', 2: 'Not vaccinated'},
    'Health_Care_Access': {0: 'Easily accessible', 1: 'Not accessible', 2: 'Not interested'},
    'Occupation_of_Parents': {0: 'Professional', 1: 'Retired', 2: 'Skilled', 3: 'Unemployed'},
    # Income brackets are contiguous: each one starts where the previous ends.
    # The lower bound of bracket 1 previously read 14,977, which did not line up
    # with "Below 14,997" in bracket 0.
    # NOTE(review): 14,997 is assumed to be the intended boundary — confirm
    # against the original questionnaire.
    'Family_Income_per_Month': {
        0: 'Below ₹14,997', 1: '₹14,997 - ₹22,494', 2: '₹22,495 - ₹37,492',
        3: '₹37,493 - ₹74,999', 4: '₹75,000 and above'
    }
}

### Step 2: Renaming columns

In [6]:
# Give the numbered demographic columns descriptive names, and give the two
# total-score columns names that say which test they belong to.
df_demo = df_demo.rename(demographic_cols_map, axis='columns')
df_pre = df_pre.rename({'Total_points': 'pre_test_score'}, axis='columns')
df_post = df_post.rename({'Total': 'post_test_score'}, axis='columns')
logging.info('Columns renamed using demographic mappings.')

### Step 3: Checking for missing values

In [7]:
# Report the number of null cells per column for each dataset.
for dataset_name, frame in (
    ('demographic', df_demo),
    ('pretest', df_pre),
    ('post-test', df_post),
):
    null_counts = frame.isnull().sum().to_dict()
    print(f'Missing values in {dataset_name}:', null_counts)

Missing values in demographic: {'Sno': 0, 'Age': 0, 'Gender': 0, 'Place_of_Residency': 0, 'Education': 0, 'Vaccination_Status': 0, 'Health_Care_Access': 0, 'Occupation_of_Parents': 0, 'Family_Income_per_Month': 0}
Missing values in pretest: {'Sno': 0, '1': 0, '2': 0, '3': 0, '4': 0, '5': 0, '6': 0, '7': 0, '8': 0, '9': 0, '10': 0, '11': 0, '12': 0, '13': 0, '14': 0, '15': 0, '16': 0, '17': 0, '18': 0, '19': 0, '20': 0, '21': 0, '22': 0, '23': 0, '24': 0, '25': 0, '26': 0, '27': 0, '28': 0, '29': 0, '30': 0, '31': 0, '32': 0, '33': 0, 'pre_test_score': 0}
Missing values in post-test: {'Sno': 0, '1': 0, '2': 0, '3': 0, '4': 0, '5': 0, '6': 0, '7': 0, '8': 0, '9': 0, '10': 0, '11': 0, '12': 0, '13': 0, '14': 0, '15': 0, '16': 0, '17': 0, '18': 0, '19': 0, '20': 0, '21': 0, '22': 0, '23': 0, '24': 0, '25': 0, '26': 0, '27': 0, '28': 0, '29': 0, '30': 0, '31': 0, '32': 0, '33': 0, 'post_test_score': 0}


### Step 4: Dropping rows with missing values in critical columns (Sno, scores)

In [8]:
# Keep only rows whose key fields are present: the participant ID in every
# dataset, plus the total score in the two test datasets.
df_demo = df_demo[df_demo['Sno'].notna()]
df_pre = df_pre[df_pre[['Sno', 'pre_test_score']].notna().all(axis=1)]
df_post = df_post[df_post[['Sno', 'post_test_score']].notna().all(axis=1)]
logging.info('Dropped rows with missing critical columns.')

### Step 5: Checking for outliers in scores (using IQR)

In [9]:
def detect_outliers(df, column, multiplier=1.5):
    """Return the values of ``column`` that fall outside the IQR fences.

    The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``,
    where Q1/Q3 are the 25th/75th percentiles of the column and
    ``IQR = Q3 - Q1``. Values exactly on a fence are not flagged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to inspect.
    column : str
        Name of the numeric column to check.
    multiplier : float, default 1.5
        Width of the fences in units of IQR. The default reproduces the
        conventional 1.5 * IQR rule.

    Returns
    -------
    pandas.Series
        The outlying values, keeping their original index labels. Empty when
        nothing lies outside the fences.
    """
    values = df[column]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - multiplier * iqr
    upper_bound = q3 + multiplier * iqr
    # Strict comparisons: NaN never satisfies either, so missing values are
    # not reported as outliers.
    return values[(values < lower_bound) | (values > upper_bound)]

# Flag unusually low/high total scores in each test, then print and log them.
pre_outliers = detect_outliers(df_pre, 'pre_test_score')
post_outliers = detect_outliers(df_post, 'post_test_score')

pre_outlier_values = pre_outliers.tolist()
post_outlier_values = post_outliers.tolist()
print('Pre-test score outliers:', pre_outlier_values)
print('Post-test score outliers:', post_outlier_values)
logging.info(f'Pre-test outliers: {pre_outlier_values}, Post-test outliers: {post_outlier_values}')

Pre-test score outliers: [0, 1, 2, 1, 0, 31, 31, 29]
Post-test score outliers: []


### Step 6: Merging datasets

In [10]:
# Attach each participant's pre- and post-test totals to their demographic row.
# Inner joins on Sno: only participants present in all three datasets are kept.
pre_scores = df_pre[['Sno', 'pre_test_score']]
post_scores = df_post[['Sno', 'post_test_score']]
df_merged = (
    df_demo
    .merge(pre_scores, on='Sno', how='inner')
    .merge(post_scores, on='Sno', how='inner')
)
logging.info(f'Merged dataset shape: {df_merged.shape}')

### Step 7: Calculating score improvement

In [11]:
# Per-participant change in total score (post minus pre); positive means the
# post-test score was higher.
df_merged['score_improvement'] = df_merged['post_test_score'].sub(df_merged['pre_test_score'])

### Step 8: Applying demographic mappings

In [12]:
# Add a readable "<column>_Label" companion for every coded demographic column;
# the numeric code columns themselves are left unchanged.
for coded_column in demographic_maps:
    code_to_label = demographic_maps[coded_column]
    df_merged[f'{coded_column}_Label'] = df_merged[coded_column].map(code_to_label)
logging.info('Demographic mappings applied to create labeled columns.')

## Section 4: Saving Data

### Step 1: Exporting cleaned data

In [13]:
# Write the merged, labeled dataset to CSV without the DataFrame index column.
output_path = '../data/processed_data/cleaned_hpv_data.csv'
df_merged.to_csv(output_path, index=False)

saved_message = f'Cleaned data saved to {output_path}'
print(saved_message)
logging.info(saved_message)

Cleaned data saved to ../data/processed_data/cleaned_hpv_data.csv


### Step 2: Creating data dictionary

In [14]:
# Descriptions are keyed by column name so each one stays attached to the right
# column regardless of the order in which columns were added to df_merged.
# A positional list is fragile here: score_improvement is created before the
# *_Label columns, so listing its description last pairs every label column
# with its neighbour's description.
column_descriptions = {
    'Sno': 'Unique participant ID',
    'Age': 'Age group (0: 15-19, 1: 19-24, 2: 24+)',
    'Gender': 'Gender (0: Female, 1: Male)',
    'Place_of_Residency': 'Residency (0: Rural, 1: Semi-Urban, 2: Urban)',
    'Education': 'Education level (0: High school, 1: Undergrad, 2: Post-grad)',
    'Vaccination_Status': 'Vaccination status (0: Irregular, 1: Regular, 2: Not vaccinated)',
    'Health_Care_Access': 'Healthcare access (0: Easy, 1: Not accessible, 2: Not interested)',
    'Occupation_of_Parents': 'Parents’ occupation (0: Professional, 1: Retired, 2: Skilled, 3: Unemployed)',
    # Bracket 1 previously started at 14,977, which did not match the
    # "<14,997" upper limit of bracket 0.
    # NOTE(review): 14,997 is assumed to be the intended boundary — confirm.
    'Family_Income_per_Month': 'Family income (0: <14,997, 1: 14,997-22,494, 2: 22,495-37,492, 3: 37,493-74,999, 4: 75,000+)',
    'pre_test_score': 'Pre-intervention knowledge score',
    'post_test_score': 'Post-intervention knowledge score',
    'score_improvement': 'Score improvement (post - pre)',
    'Age_Label': 'Labeled age group',
    'Gender_Label': 'Labeled gender',
    'Place_of_Residency_Label': 'Labeled residency',
    'Education_Label': 'Labeled education',
    'Vaccination_Status_Label': 'Labeled vaccination status',
    'Health_Care_Access_Label': 'Labeled healthcare access',
    'Occupation_of_Parents_Label': 'Labeled parents’ occupation',
    'Family_Income_per_Month_Label': 'Labeled family income',
}

# Surface any column that has no description instead of silently mislabeling it.
undocumented_columns = [col for col in df_merged.columns if col not in column_descriptions]
if undocumented_columns:
    logging.warning(f'Columns without a data dictionary description: {undocumented_columns}')

data_dict = {
    'Column': list(df_merged.columns),
    'Description': [column_descriptions.get(col, '') for col in df_merged.columns]
}
pd.DataFrame(data_dict).to_csv('../data/processed_data/data_dictionary.csv', index=False)
print('Data dictionary saved to ../data/processed_data/data_dictionary.csv')
logging.info('Data dictionary exported.')

Data dictionary saved to ../data/processed_data/data_dictionary.csv


### Step 3: Saving cleaned data and data dictionary using pickle

In [15]:
import pickle

# Persist the merged DataFrame in pickle form.
with open("../models/cleaned_hpv_data.pkl", "wb") as frame_file:
    pickle.dump(df_merged, frame_file)

# Short per-variable descriptions. This assigns `data_dict` a plain
# column-name -> description mapping covering the coded demographic columns
# and the three score columns.
data_dict = dict(
    Age="Age group categories",
    Gender="Biological sex",
    Place_of_Residency="Type of area where participant resides",
    Education="Highest education level attained",
    Vaccination_Status="HPV vaccination status",
    Health_Care_Access="Accessibility to healthcare services",
    Occupation_of_Parents="Occupation category of parents",
    Family_Income_per_Month="Monthly family income brackets",
    pre_test_score="Score before the educational intervention",
    post_test_score="Score after the educational intervention",
    score_improvement="Improvement in score (post - pre)",
)
with open("../models/data_dictionary.pkl", "wb") as dictionary_file:
    pickle.dump(data_dict, dictionary_file)