# Purpose: 
Create new features from the cleaned data to improve model performance.


## 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np

## 2. Configuration

In [2]:
PROCESSED_DATA_PATH = '../data/processed_data/cleaned_data.parquet'
FEATURE_DATA_PATH = '../data/processed_data/engineered_features.parquet'

## 3. Load Cleaned Data

In [3]:
print("Loading cleaned data...")
df = pd.read_parquet(PROCESSED_DATA_PATH)
df = df[df['Jurisdiction'] == 'NAT_TOTAL'].copy() # Focus on national data for modeling
print("Data loaded successfully.")

Loading cleaned data...
Data loaded successfully.


## 4. Create Time-Based Features

In [4]:
print("\nEngineering time-based features...")


Engineering time-based features...


In [5]:
# Sort data by country and date to ensure correct calculations
df.sort_values(['CountryName', 'Date'], inplace=True)

In [6]:
# Create lag features (e.g., cases 7 days ago)
df['cases_lag_7'] = df.groupby('CountryName')['ConfirmedCases'].shift(7)
df['deaths_lag_7'] = df.groupby('CountryName')['ConfirmedDeaths'].shift(7)

In [7]:
# Create rolling averages to smooth out daily noise
df['cases_rolling_7'] = df.groupby('CountryName')['ConfirmedCases'].rolling(window=7).mean().reset_index(0,drop=True)
df['deaths_rolling_7'] = df.groupby('CountryName')['ConfirmedDeaths'].rolling(window=7).mean().reset_index(0,drop=True)

In [9]:
# Create growth rate feature
# Using a small number to avoid division by zero
daily_cases = df.groupby('CountryName')['ConfirmedCases'].diff().fillna(0)
df['case_growth_rate'] = (daily_cases / (df['cases_lag_7'] + 1e-5)) * 100

print("Time-based features created: lags, rolling averages, growth rate.")

Time-based features created: lags, rolling averages, growth rate.


## 5. Handle Skewed Data & Final Cleanup

In [10]:
print("\nApplying transformations and cleaning up...")


Applying transformations and cleaning up...


In [11]:
# Apply log transform to highly skewed variables to help linear models
# We add 1 to avoid log(0)
skewed_cols = ['ConfirmedCases', 'ConfirmedDeaths']
for col in skewed_cols:
    df[f'{col}_log'] = np.log1p(df[col])

In [12]:
# Drop rows with NaNs created by lags/rolling windows
df.dropna(inplace=True)

print("Applied log transformations and removed NaNs from feature creation.")

Applied log transformations and removed NaNs from feature creation.


## 6. Final Feature Selection

In [13]:
# You can select the final set of columns for your model
# For now, we will keep all engineered features
print("\nFinal feature set overview:")
print(df.info())


Final feature set overview:
<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 60 columns):
 #   Column                                                                           Non-Null Count  Dtype         
---  ------                                                                           --------------  -----         
 0   CountryName                                                                      0 non-null      object        
 1   CountryCode                                                                      0 non-null      object        
 2   RegionName                                                                       0 non-null      object        
 3   RegionCode                                                                       0 non-null      object        
 4   Jurisdiction                                                                     0 non-null      object        
 5   Date                                                       

## 7. Save Feature-Engineered Data 

In [14]:
print(f"\nSaving feature-engineered data to {FEATURE_DATA_PATH}...")
df.to_parquet(FEATURE_DATA_PATH)
print("Feature data saved successfully.")


Saving feature-engineered data to ../data/processed_data/engineered_features.parquet...
Feature data saved successfully.
