# Purpose:
Load the raw data, clean it, and save the processed version for analysis.

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
from pandas.io.parquet import to_parquet

## 2. Configuration

In [None]:
# Set file paths for easy management
RAW_DATA_PATH = '../data/raw_data/OxCGRT_National_Subnational.csv'
PROCESSED_DATA_PATH = '../data/processed_data/cleaned_data.parquet'

## 3. Load Data 

In [4]:
print("Loading raw data...")

# Using low_memory=False can help with mixed data types on import
df = pd.read_csv(RAW_DATA_PATH, low_memory=False)
print("Data loaded successfully.")
print(f"Initial shape: {df.shape}")

Loading raw data...
Data loaded successfully.
Initial shape: (239661, 58)


## 4. Data Cleaning & Preprocessing 

In [5]:
# Convert 'Date' to datetime objects for time-series analysis
print("\nProcessing 'Date' column...")
df['Date'] = pd.to_datetime(df['Date'], format='%Y%m%d')


Processing 'Date' column...


In [6]:
# Handle columns with a high percentage of missing values
print("Handling missing values...")
missing_percentage = df.isnull().sum() / len(df) * 100
cols_to_drop = missing_percentage[missing_percentage > 75].index
df.drop(columns=cols_to_drop, inplace=True)
print(f"Dropped columns with >75% missing values: {list(cols_to_drop)}")

Handling missing values...
Dropped columns with >75% missing values: ['CityName', 'CityCode', 'E3_Fiscal.measures', 'E4_International.support', 'H4_Emergency.investment.in.healthcare']


In [7]:
# Impute missing values for key time-series data (ConfirmedCases, ConfirmedDeaths)
# Forward-fill is appropriate for cumulative data
df['ConfirmedCases'].fillna(method='ffill', inplace=True)
df['ConfirmedDeaths'].fillna(method='ffill', inplace=True)
print("Forward-filled 'ConfirmedCases' and 'ConfirmedDeaths'.")

Forward-filled 'ConfirmedCases' and 'ConfirmedDeaths'.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['ConfirmedCases'].fillna(method='ffill', inplace=True)
  df['ConfirmedCases'].fillna(method='ffill', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['ConfirmedDeaths'].fillna(method='ffill', inplace=True)
  df['ConfirmedDeaths'].fillna(method='ffill', inplace=

In [8]:
# Impute categorical '_Flag' columns with the mode
for col in df.select_dtypes(include='object').columns:
    if '_Flag' in col:
        mode_val = df[col].mode()[0]
        df[col].fillna(mode_val, inplace=True)
        print(f"Imputed '{col}' with mode.")

In [9]:
# Fill remaining numeric NaNs with 0, assuming they represent an absence of the measure
numeric_cols = df.select_dtypes(include=np.number).columns
df[numeric_cols] = df[numeric_cols].fillna(0)
print("Filled remaining numeric NaNs with 0.")

Filled remaining numeric NaNs with 0.


## 5. Final Data Review

In [10]:
print("\nFinal data overview:")
print(df.info())
print("\nChecking for any remaining nulls:")
print(df.isnull().sum().sum())


Final data overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 239661 entries, 0 to 239660
Data columns (total 53 columns):
 #   Column                                                                           Non-Null Count   Dtype         
---  ------                                                                           --------------   -----         
 0   CountryName                                                                      239661 non-null  object        
 1   CountryCode                                                                      239661 non-null  object        
 2   RegionName                                                                       231989 non-null  object        
 3   RegionCode                                                                       231989 non-null  object        
 4   Jurisdiction                                                                     239661 non-null  object        
 5   Date                                

## 6. Save Processed Data

In [11]:
print(f"\nSaving cleaned data to {PROCESSED_DATA_PATH}...")
df.to_parquet(PROCESSED_DATA_PATH)
print("Cleaned data saved successfully as a Parquet file.")



Saving cleaned data to ../data/processed_data/cleaned_data.parquet...
Cleaned data saved successfully as a Parquet file.
