# In [3]:
#!/usr/bin/env python
# coding: utf-8

"""
In this notebook, we are taking the raw CSV files and performing a complete data cleaning process.
We will:
- Verify that the raw files exist before we attempt to load them
- Load the raw data into Pandas DataFrames
- Standardize column names to a consistent, snake_case format so we can handle them uniformly
- Parse and unify date columns into a proper datetime format
- Handle missing values by filling numeric gaps with a default (0 in this notebook; mean or median are alternatives) and dropping rows that lack a key identifier such as 'entity'
- Remove duplicate rows that might skew our analysis
- Optimize numeric types to reduce memory usage and improve performance
- Provide optional steps to detect and remove extreme outliers from numeric columns
- Convert low-cardinality text columns to the pandas 'category' dtype and log each conversion
- Sort data by date where it makes sense, ensuring chronological order for time series
- Log these steps into a cleaning.log file for traceability and debugging if needed

This approach ensures our datasets are clean, consistent, and analysis-ready.
"""

import pandas as pd
import numpy as np
import os
import logging

# ----------------------------------------------------
# Logging: every cleaning step is recorded in cleaning.log
# ----------------------------------------------------
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',  # timestamp - severity - message
    filename='cleaning.log',
    filemode='a',  # append, so records from earlier runs are kept
)

logging.info("=== Starting the data cleaning process ===")

# ----------------------------------------------------
# Output directory for the cleaned files (created if it is missing)
# ----------------------------------------------------
os.makedirs('../data/processed', exist_ok=True)

# ----------------------------------------------------
# Locations of the raw input datasets
# Adjust these if your directory layout or filenames differ
# ----------------------------------------------------
path_vacc_death_rate = '../data/raw/covid-vaccinations-vs-covid-death-rate.csv'
path_vacc_manufacturer = '../data/raw/covid-vaccine-doses-by-manufacturer.csv'
path_oecd = '../data/raw/OECD_health_expenditure.csv'
path_us_death_rates = '../data/raw/united-states-rates-of-covid-19-deaths-by-vaccination-status.csv'

def check_file_exists(filepath):
    """
    Verify that `filepath` points to an existing regular file.

    A directory (or any other non-file entry) at that path is rejected too:
    the loading step needs a readable file, not merely an existing path.

    Parameters:
    - filepath: path of the input file to verify

    Returns nothing. On success an INFO record is logged.

    Raises:
    - FileNotFoundError if nothing exists at `filepath` or the entry is not
      a regular file; an ERROR record is logged before raising.
    """
    # os.path.isfile is False for directories, unlike os.path.exists.
    if not os.path.isfile(filepath):
        logging.error(f"File not found: {filepath}")
        raise FileNotFoundError(f"Required file not found: {filepath}")
    logging.info(f"File found: {filepath}")

# Fail fast: confirm every raw input is present before anything is loaded
for _raw_path in (
    path_vacc_death_rate,
    path_vacc_manufacturer,
    path_oecd,
    path_us_death_rates,
):
    check_file_exists(_raw_path)

# ----------------------------------------------------
# Helper functions
# ----------------------------------------------------

def standardize_column_names(df):
    """
    Rename the columns of `df` to snake_case, in place.

    Applied to every column name, in this order:
    1. strip surrounding whitespace and lower-case the name
    2. turn spaces, hyphens and slashes into underscores
    3. drop parentheses and spell '&' out as 'and'
    4. collapse every run of underscores into a single underscore

    Step 4 matters for names such as "Deaths - Total": steps 1-3 produce
    "deaths___total", which must end up as "deaths_total".

    Literal replacements use regex=False so that characters like '(' are
    not interpreted as regex syntax.

    Returns the same DataFrame object, with its `columns` replaced.
    """
    df.columns = (
        df.columns
        .str.strip()
        .str.lower()
        .str.replace(' ', '_', regex=False)
        .str.replace('(', '', regex=False)
        .str.replace(')', '', regex=False)
        .str.replace('-', '_', regex=False)
        .str.replace('/', '_', regex=False)
        .str.replace('&', 'and', regex=False)
        # One literal '__' -> '_' pass would leave '___' as '__', so use a
        # regex that collapses runs of any length.
        .str.replace(r'_+', '_', regex=True)
    )
    return df


def optimize_numeric_types(df):
    """
    Downcast numeric columns of `df` in place to reduce memory usage.

    - Integer columns are downcast to the smallest signed integer dtype
      that can hold their values (pandas goes down to int8).
    - All other numeric columns are downcast with downcast='float', which
      picks float32 when pandas accepts the conversion and otherwise
      leaves the column unchanged.

    Non-numeric columns are not touched.

    Returns the same DataFrame object.
    """
    for col in df.select_dtypes(include=[np.number]).columns:
        # downcast='float' only ever selects float dtypes, so integer
        # columns need downcast='integer' to actually shrink.
        if pd.api.types.is_integer_dtype(df[col]):
            df[col] = pd.to_numeric(df[col], downcast='integer')
        else:
            df[col] = pd.to_numeric(df[col], downcast='float')
    return df

def log_dataframe_info(df, df_name):
    """
    Write a summary of `df` to the log, one INFO record per item:
    - a header containing `df_name`
    - the shape (rows, columns)
    - the number of nulls in each column
    - the dtype of each column
    - the deep memory usage of each column
    - a closing separator line

    Calling this before and after cleaning makes the effect of each
    step visible in the log.
    """
    report = (
        f"--- {df_name} Info ---",
        f"Shape: {df.shape}",
        f"Null counts:\n{df.isnull().sum()}",
        f"Data types:\n{df.dtypes}",
        f"Memory Usage:\n{df.memory_usage(deep=True)}",
        "-------------------------",
    )
    for entry in report:
        logging.info(entry)

def parse_dates(df, date_columns):
    """
    Turn the listed columns of `df` into datetime columns, in place.

    Names in `date_columns` that are not columns of `df` are ignored.
    Values that cannot be parsed become NaT instead of raising.

    Returns the same DataFrame object.
    """
    present = (name for name in date_columns if name in df.columns)
    for name in present:
        df[name] = pd.to_datetime(df[name], errors='coerce')
    return df

def fill_numeric_nas(df, fill_value=0):
    """
    Replace missing values in every numeric column of `df` with
    `fill_value` (0 unless specified), in place.

    Non-numeric columns are left as they are. A constant fill is only one
    possible strategy; depending on the data, a mean/median fill or a
    forward/backward fill (for time series) may be more appropriate.

    Returns the same DataFrame object.
    """
    target_cols = df.select_dtypes(include=[np.number]).columns
    filled = df[target_cols].fillna(fill_value)
    df[target_cols] = filled
    return df

def remove_outliers(df, factor=3):
    """
    OPTIONAL STEP:
    Remove extreme outliers from numeric columns using the IQR method.

    For each numeric column, Q1 and Q3 are computed and a row is dropped
    when its value lies below Q1 - factor * IQR or above Q3 + factor * IQR.
    Rows whose value in the column is missing are kept: a missing value is
    not an outlier, and handling it is the job of the NA-filling step.

    Columns are processed one after another on the progressively filtered
    data, so the quartiles of a later column are computed on the rows that
    survived the earlier columns.

    Parameters:
    - df: DataFrame to filter (not modified; a filtered DataFrame is returned)
    - factor: multiple of the IQR that defines the accepted range (default 3)

    Returns the filtered DataFrame. Each column that loses rows gets one
    INFO record in the log.

    This is highly dependent on the business case. Use with caution, as
    removing outliers might remove valid but rare data points. The function
    is defined but not applied automatically; call remove_outliers(df) to
    use it, and exclude columns that should not be filtered beforehand.
    """
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - (factor * iqr)
        upper_bound = q3 + (factor * iqr)
        initial_count = len(df)
        in_range = (df[col] >= lower_bound) & (df[col] <= upper_bound)
        # NaN compares False against both bounds; without the isna() term
        # rows with a missing value would be dropped and counted as outliers.
        df = df[df[col].isna() | in_range]
        removed = initial_count - len(df)
        if removed > 0:
            logging.info(f"Removed {removed} outliers from '{col}' using factor={factor}")
    return df

def handle_categorical_data(df, max_unique=50):
    """
    OPTIONAL/EXAMPLE STEP:
    Convert low-cardinality object columns of `df` to the 'category'
    dtype, in place, and log each conversion.

    An object column is converted when it has fewer than `max_unique`
    distinct non-null values. The threshold is a heuristic; adjust it to
    the dataset. No encoding (e.g. one-hot) is done here; that can follow
    later if the analysis needs it.

    Parameters:
    - df: DataFrame to process
    - max_unique: exclusive upper limit on distinct values for a column to
      be treated as categorical (default 50)

    Returns the same DataFrame object.
    """
    object_cols = df.select_dtypes(include=['object']).columns
    for col in object_cols:
        unique_vals = df[col].nunique()
        if unique_vals < max_unique:
            # 'category' reduces memory and makes the intent explicit
            df[col] = df[col].astype('category')
            logging.info(f"Converted '{col}' to categorical dtype with {unique_vals} categories.")
    return df

def drop_empty_columns(df):
    """
    Return `df` without the columns in which every value is missing.

    The input DataFrame itself is not modified.
    """
    return df.dropna(axis='columns', how='all')

def sort_by_date_if_available(df, date_column='day'):
    """
    Return `df` sorted chronologically by `date_column`.

    Sorting happens only when the column exists and already has a
    datetime dtype; otherwise `df` is returned untouched. Chronological
    order is what time series analysis expects.
    """
    if date_column not in df.columns:
        return df
    if not pd.api.types.is_datetime64_any_dtype(df[date_column]):
        return df
    return df.sort_values(by=date_column)

# ----------------------------------------------------
# Read each raw CSV into its own DataFrame
# ----------------------------------------------------
logging.info("Loading raw datasets...")
df_vdr, df_vm, df_oecd, df_us = (
    pd.read_csv(raw_path)
    for raw_path in (
        path_vacc_death_rate,
        path_vacc_manufacturer,
        path_oecd,
        path_us_death_rates,
    )
)

# Record the untouched state so the post-cleaning log entries can be compared against it
log_dataframe_info(df_vdr, "Vaccinations vs Death Rate (Raw)")
log_dataframe_info(df_vm, "Vaccine Manufacturer (Raw)")
log_dataframe_info(df_oecd, "OECD (Raw)")
log_dataframe_info(df_us, "US Death Rates (Raw)")

# ----------------------------------------------------
# Cleaning the Vaccinations vs COVID Death Rate dataset (df_vdr)
# The steps below are order-dependent; see the notes on individual steps.
# ----------------------------------------------------
# Standardize columns first: the lookups below ('day', 'entity') rely on
# the lower-cased snake_case names this produces
df_vdr = standardize_column_names(df_vdr)

# Parse date columns (assuming 'day' is the main date column);
# unparseable values become NaT, and a missing 'day' column is skipped
df_vdr = parse_dates(df_vdr, ['day'])

# Remove rows that are exact duplicates across all columns (modifies df_vdr in place)
df_vdr.drop_duplicates(inplace=True)

# Fill numeric missing values with 0
# NOTE(review): this runs before drop_empty_columns, so a numeric column that
# is entirely NaN becomes all zeros and is no longer seen as empty below.
# Confirm that this is intended.
df_vdr = fill_numeric_nas(df_vdr, fill_value=0)

# If 'entity' is crucial, remove rows missing it
# (skipped entirely when the dataset has no 'entity' column)
if 'entity' in df_vdr.columns:
    df_vdr = df_vdr[df_vdr['entity'].notna()]

# Drop columns that are entirely empty
df_vdr = drop_empty_columns(df_vdr)

# Optimize numeric columns for memory
df_vdr = optimize_numeric_types(df_vdr)

# Convert low-cardinality object columns to the 'category' dtype, if any
df_vdr = handle_categorical_data(df_vdr)

# Sort by date if it's meaningful (it likely is, since it's a timeseries);
# this is a no-op unless 'day' was parsed to a datetime dtype above
df_vdr = sort_by_date_if_available(df_vdr, 'day')

# Log info after cleaning, for comparison with the "(Raw)" entry
log_dataframe_info(df_vdr, "Vaccinations vs Death Rate (Cleaned)")

# ----------------------------------------------------
# Cleaning the Vaccine Manufacturer dataset (df_vm)
# Same sequence of steps as for df_vdr; the order matters.
# ----------------------------------------------------
# Normalize column names so 'day' and 'entity' can be looked up below
df_vm = standardize_column_names(df_vm)
# Parse 'day' to datetime (unparseable values become NaT)
df_vm = parse_dates(df_vm, ['day'])
# Remove exact duplicate rows (modifies df_vm in place)
df_vm.drop_duplicates(inplace=True)
# Replace missing numeric values with 0
# NOTE(review): a numeric column that is entirely NaN becomes all zeros here
# and is therefore not removed by drop_empty_columns below. Confirm intended.
df_vm = fill_numeric_nas(df_vm, fill_value=0)

# Drop rows without an 'entity' value, if the column exists
if 'entity' in df_vm.columns:
    df_vm = df_vm[df_vm['entity'].notna()]

# Remove columns that contain no values at all
df_vm = drop_empty_columns(df_vm)
# Downcast numeric columns to save memory
df_vm = optimize_numeric_types(df_vm)
# Convert low-cardinality object columns to the 'category' dtype
df_vm = handle_categorical_data(df_vm)
# Chronological order (no-op unless 'day' has a datetime dtype)
df_vm = sort_by_date_if_available(df_vm, 'day')
# Record the cleaned state for comparison with the "(Raw)" log entry
log_dataframe_info(df_vm, "Vaccine Manufacturer (Cleaned)")

# ----------------------------------------------------
# Cleaning the OECD Health Expenditure dataset (df_oecd)
# This dataset has no 'day' column handling: it is keyed by 'time_period'
# and 'obs_value', which are converted explicitly below.
# ----------------------------------------------------
logging.info("Cleaning OECD_health_expenditure dataset...")

# Standardize column names
df_oecd = standardize_column_names(df_oecd)

# Remove duplicate columns if any: for each repeated column name only the
# first occurrence is kept. Done after standardization, which can map
# different raw names onto the same name.
df_oecd = df_oecd.loc[:, ~df_oecd.columns.duplicated()]

# Convert time_period to numeric (if it represents a year);
# values that cannot be converted become NaN and are NOT filled
if 'time_period' in df_oecd.columns:
    df_oecd['time_period'] = pd.to_numeric(df_oecd['time_period'], errors='coerce')

# Convert obs_value to numeric and fill missing with 0
# (both originally-missing and non-numeric entries end up as 0)
if 'obs_value' in df_oecd.columns:
    df_oecd['obs_value'] = pd.to_numeric(df_oecd['obs_value'], errors='coerce').fillna(0)

# Drop duplicate rows (modifies df_oecd in place)
df_oecd.drop_duplicates(inplace=True)

# Drop columns that are entirely empty
df_oecd = drop_empty_columns(df_oecd)

# Optimize numeric types to reduce memory usage
df_oecd = optimize_numeric_types(df_oecd)

# Convert low-cardinality object columns to the 'category' dtype, if any
df_oecd = handle_categorical_data(df_oecd)

# Log info after cleaning, for comparison with the "(Raw)" entry
log_dataframe_info(df_oecd, "OECD (Cleaned)")


# ----------------------------------------------------
# Cleaning the US Death Rates by Vaccination Status dataset (df_us)
# ----------------------------------------------------
# Normalize column names so 'day', 'code' and 'entity' can be looked up below
df_us = standardize_column_names(df_us)
# Parse 'day' to datetime (unparseable values become NaT)
df_us = parse_dates(df_us, ['day'])

# If 'code' column is always null, let's remove it.
# This has to happen before the numeric fill: an all-null column may be
# numeric, in which case the fill would turn it into zeros and it would no
# longer be recognized as empty.
if 'code' in df_us.columns and df_us['code'].isnull().all():
    df_us.drop(columns=['code'], inplace=True)

# Replace missing numeric values with 0, then remove exact duplicate rows
df_us = fill_numeric_nas(df_us, fill_value=0)
df_us.drop_duplicates(inplace=True)

# Ensure 'entity' is not missing if it's important (here it represents age groups).
# Guarded like the other datasets so that a file without an 'entity'
# column does not abort the whole run with a KeyError.
if 'entity' in df_us.columns:
    df_us = df_us[df_us['entity'].notna()]

# Remove empty columns, shrink numeric dtypes, mark categoricals, sort by date
df_us = drop_empty_columns(df_us)
df_us = optimize_numeric_types(df_us)
df_us = handle_categorical_data(df_us)
df_us = sort_by_date_if_available(df_us, 'day')

# Record the cleaned state for comparison with the "(Raw)" log entry
log_dataframe_info(df_us, "US Death Rates (Cleaned)")

# ----------------------------------------------------
# OPTIONAL: Removing Outliers (commented out)
# If you want to remove outliers, uncomment the lines below:
#
# df_vdr = remove_outliers(df_vdr)
# df_vm = remove_outliers(df_vm)
# df_oecd = remove_outliers(df_oecd)
# df_us = remove_outliers(df_us)
#
# Remember to log and re-check data after removing outliers.

# ----------------------------------------------------
# Final Save
# Write every cleaned DataFrame to the processed directory, without the row index.
# The Merge, Analysis, and Visualization notebooks read these files.
# ----------------------------------------------------
for _cleaned_df, _target_csv in (
    (df_vdr, '../data/processed/covid-vaccinations-vs-covid-death-rate_cleaned.csv'),
    (df_vm, '../data/processed/covid-vaccine-doses-by-manufacturer_cleaned.csv'),
    (df_oecd, '../data/processed/OECD_health_expenditure_cleaned.csv'),
    (df_us, '../data/processed/united-states-rates-of-covid-19-deaths-by-vaccination-status_cleaned.csv'),
):
    _cleaned_df.to_csv(_target_csv, index=False)

logging.info("=== Data cleaning complete! All cleaned files saved to ../data/processed/ ===")

print("Data cleaning complete! Please check 'cleaning.log' for a detailed record of the steps taken.")


# Output:
# Data cleaning complete! Please check 'cleaning.log' for a detailed record of the steps taken.
