# Feature Engineering

This notebook creates rolling 7-day features for operational risk monitoring and merges all datasets into a single daily city-level feature table.

## Features Created
- Rolling 7-day averages and maxima for congestion, rainfall, and demand, computed per city
- Merged daily city-level feature table


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import sys

# Make modules under ../src importable from this notebook (e.g. shared config)
sys.path.append('../src')

# Directory holding the cleaned input CSVs; the feature table is written here too
PROCESSED_DATA_DIR = Path('..') / 'data' / 'processed'

print(f"Processed data directory: {PROCESSED_DATA_DIR}")


## Load Cleaned Data


In [None]:
# Read the three cleaned inputs, parsing the 'date' column as datetimes
weather_df, traffic_df, demand_df = (
    pd.read_csv(PROCESSED_DATA_DIR / csv_name, parse_dates=['date'])
    for csv_name in ('weather_cleaned.csv', 'traffic_cleaned.csv', 'demand_cleaned.csv')
)

# Report (rows, columns) of each table as loaded
print("Loaded cleaned datasets:")
print(f"Weather: {weather_df.shape}")
print(f"Traffic: {traffic_df.shape}")
print(f"Demand: {demand_df.shape}")

# Order every table by city, then chronologically, with a fresh 0..n-1 index
weather_df = weather_df.sort_values(by=['city', 'date']).reset_index(drop=True)
traffic_df = traffic_df.sort_values(by=['city', 'date']).reset_index(drop=True)
demand_df = demand_df.sort_values(by=['city', 'date']).reset_index(drop=True)


## Create Rolling 7-Day Features

For near-real-time operational risk monitoring, we compute rolling 7-day averages to smooth out daily fluctuations and capture trends, and rolling 7-day maxima to detect recent peaks.


In [None]:
def create_rolling_features(df, value_col, window=7, group_col='city'):
    """
    Add rolling mean and rolling max columns for one value column, per group.

    Rows are ordered by ``group_col`` and ``'date'``, and the statistics are
    computed within each group over the trailing ``window`` rows. The window
    counts rows, not calendar days, so it spans exactly ``window`` days only
    when each group has one row per day with no gaps. The first rows of each
    group use however many observations are available (``min_periods=1``).

    Args:
        df: DataFrame with 'date', group_col and value_col columns.
            The input frame is not modified.
        value_col: Column name to compute rolling statistics for
        window: Rolling window size in rows (default 7); must be a positive
            integer
        group_col: Column to group by (default 'city')

    Returns:
        A new DataFrame sorted by group_col and 'date' with a reset index and
        two added columns: '{value_col}_{window}d_avg' and
        '{value_col}_{window}d_max' (e.g. 'rainfall_mm_7d_avg' for window=7).

    Raises:
        KeyError: If 'date', group_col or value_col is missing from df.
        ValueError: If window is not a positive integer.
    """
    required_cols = (group_col, 'date', value_col)
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise KeyError(f"Missing required column(s): {missing_cols}")
    if not isinstance(window, (int, np.integer)) or window < 1:
        raise ValueError(f"window must be a positive integer, got {window!r}")

    # sort_values/reset_index already return a new frame, so the caller's
    # DataFrame is left untouched without an extra copy.
    df = df.sort_values([group_col, 'date']).reset_index(drop=True)

    # Build the grouping once and reuse it for both statistics.
    grouped_values = df.groupby(group_col)[value_col]

    # Rolling average (smooths daily fluctuations)
    rolling_avg = grouped_values.transform(
        lambda x: x.rolling(window=window, min_periods=1).mean()
    )
    # Rolling max (for peak detection)
    rolling_max = grouped_values.transform(
        lambda x: x.rolling(window=window, min_periods=1).max()
    )

    # Column names reflect the actual window size instead of a hard-coded "7d",
    # so a non-default window is not mislabelled.
    df[f'{value_col}_{window}d_avg'] = rolling_avg
    df[f'{value_col}_{window}d_max'] = rolling_max

    return df

# Add rolling average/max columns to each source table
print("Creating rolling features...")

# Congestion trend per city
traffic_df = create_rolling_features(df=traffic_df, value_col='congestion_level', window=7)
print("[OK] Traffic rolling features created")

# Rainfall trend per city
weather_df = create_rolling_features(df=weather_df, value_col='rainfall_mm', window=7)
print("[OK] Weather rolling features created")

# Demand-index trend per city
demand_df = create_rolling_features(df=demand_df, value_col='demand_index', window=7)
print("[OK] Demand rolling features created")

# Preview the raw congestion values next to their rolling average
preview_cols = ['date', 'city', 'congestion_level', 'congestion_level_7d_avg']
print("\nSample rolling features:")
print(traffic_df[preview_cols].head(10))


## Merge Datasets

Merge all datasets into a single daily city-level feature table.


In [None]:
# Build the daily city-level table by joining the three sources on (date, city).
# Outer joins keep every (date, city) pair that appears in any source; pairs
# missing from a source get NaN in that source's columns.
#
# validate='one_to_one' makes pandas raise pandas.errors.MergeError if either
# side has duplicate (date, city) keys. Without it, duplicates would silently
# multiply rows and the result would no longer be one row per city per day.

# Start with weather as base (has both rainfall and temperature)
daily_features = weather_df[['date', 'city', 'rainfall_mm', 'rainfall_mm_7d_avg',
                              'rainfall_mm_7d_max', 'temperature']].copy()

# Merge traffic data
daily_features = daily_features.merge(
    traffic_df[['date', 'city', 'congestion_level', 'congestion_level_7d_avg',
                'congestion_level_7d_max']],
    on=['date', 'city'],
    how='outer',
    validate='one_to_one'
)

# Merge demand data
daily_features = daily_features.merge(
    demand_df[['date', 'city', 'demand_index', 'demand_index_7d_avg',
               'demand_index_7d_max']],
    on=['date', 'city'],
    how='outer',
    validate='one_to_one'
)

# Sort by date and city
daily_features = daily_features.sort_values(['date', 'city']).reset_index(drop=True)

print(f"Merged feature table shape: {daily_features.shape}")
print(f"\nDate range: {daily_features['date'].min()} to {daily_features['date'].max()}")
print(f"Unique cities: {sorted(daily_features['city'].unique())}")
print(f"\nColumns: {daily_features.columns.tolist()}")


## Handle Missing Values in Merged Dataset


In [None]:
# Check for missing values after merge
print("Missing values in merged dataset:")
print(daily_features.isnull().sum())

# Fills must follow chronological order inside each city, so sort first
daily_features = daily_features.sort_values(['city', 'date']).reset_index(drop=True)

# Every column except the keys is a feature to be gap-filled
feature_cols = [col for col in daily_features.columns if col not in ('date', 'city')]

if feature_cols:
    # Forward fill within each city (carry the last known value forward),
    # then backward fill what is still missing at the start of a city's series.
    # All feature columns are filled in one grouped pass instead of two
    # groupbys per column.
    filled = daily_features.groupby('city')[feature_cols].ffill()
    filled = filled.groupby(daily_features['city']).bfill()
    daily_features[feature_cols] = filled

    # Anything still missing (a city with no values at all for a column) is
    # set to 0 for numeric columns. is_numeric_dtype covers every numeric dtype
    # (float32, int32, nullable Int64, ...), not only float64/int64.
    numeric_cols = [
        col for col in feature_cols
        if pd.api.types.is_numeric_dtype(daily_features[col])
    ]
    if numeric_cols:
        daily_features[numeric_cols] = daily_features[numeric_cols].fillna(0)

print("\nMissing values after handling:")
print(daily_features.isnull().sum())


## Validate Feature Table


In [None]:
# Summary statistics for the merged feature table
print("Feature table summary:")
print(daily_features.describe())

# First rows as a quick sanity check
print("\nSample rows:")
print(daily_features.head(10))

# Per-city non-null counts: 'date' gives the number of dated rows, the other
# columns show how many of those rows have a value for each raw feature
print("\nData completeness by city:")
completeness_cols = ['date', 'rainfall_mm', 'congestion_level', 'demand_index']
city_completeness = daily_features.groupby('city')[completeness_cols].count()
print(city_completeness)


## Save Feature Table


In [None]:
# Write the feature table next to the cleaned inputs, without the index column
output_path = PROCESSED_DATA_DIR / 'daily_city_features.csv'
daily_features.to_csv(output_path, index=False)

print(f"Daily city features saved to: {output_path}")
print(f"Total records: {len(daily_features)}")

# Summarise the coverage of what was just written
first_day = daily_features['date'].min().date()
last_day = daily_features['date'].max().date()
print(f"Date range: {first_day} to {last_day}")
print(f"Unique cities: {daily_features['city'].nunique()}")

print("\nFeature engineering complete!")
