# Hanoi Weather Data - Feature Engineering

This notebook creates advanced features for time series temperature forecasting, including lag features, rolling statistics, cyclical encodings, and weather-specific derived features.

## Objectives
1. Create temporal lag features for time series modeling
2. Generate rolling statistics and moving averages
3. Apply cyclical encoding for seasonal patterns
4. Engineer weather-specific features
5. Feature selection and importance analysis
6. Export engineered features for modeling

## 1. Setup and Imports

In [2]:
# Import libraries for feature engineering
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import mutual_info_regression, SelectKBest
from sklearn.preprocessing import MultiLabelBinarizer
import warnings

# Configure settings
# NOTE(review): this silences every warning category for the whole kernel session,
# including pandas/scikit-learn deprecation notices — consider narrowing the filter.
warnings.filterwarnings('ignore')
# Show all columns when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

## 2. Load Processed Data

In [3]:
# Load cleaned data (from data processing step).
# The path is relative, so it resolves against the kernel's working directory.
df = pd.read_csv('../data/processed/daily_data_clean.csv',
                parse_dates=['datetime', 'sunrise', 'sunset'])

# Chronological order is required by every shift()-based feature created later.
df = df.sort_values('datetime').reset_index(drop=True)
print(f"Shape: {df.shape}")
print(f"Date range: {df['datetime'].min().date()} → {df['datetime'].max().date()}")

# Drop unnecessary or constant columns
drop_cols = ['description', 'icon', 'stations']
drop_cols = [col for col in drop_cols if col in df.columns]

# nunique() ignores NaN, so all-missing columns (0 distinct values) are caught too.
constant_cols = df.columns[df.nunique() <= 1].tolist()

# A column can be both explicitly listed and constant; dict.fromkeys removes the
# repeat while keeping first-seen order, so the report below counts each name once.
to_drop = list(dict.fromkeys(drop_cols + constant_cols))

if to_drop:
    print(f"\n Dropping {len(to_drop)} unnecessary or constant columns:")
    for col in to_drop:
        print(f"  - {col}")
    df = df.drop(columns=to_drop, errors='ignore')
else:
    print("\nNo columns dropped.")


# Calendar components are read once from the timestamp accessor.
stamp = df['datetime'].dt
month_number = stamp.month      # feeds the encodings below; not kept as a column
ordinal_day = stamp.dayofyear   # likewise only an intermediate value

# Angles for the cyclical (sin/cos) encodings of month and day-of-year.
month_angle = 2 * np.pi * month_number / 12
year_angle = 2 * np.pi * ordinal_day / 365.25

df = df.assign(
    year=stamp.year,
    day=stamp.day,
    day_of_week=stamp.dayofweek,
    week_of_year=stamp.isocalendar().week.astype(int),
    month_sin=np.sin(month_angle),
    month_cos=np.cos(month_angle),
    dayofyear_sin=np.sin(year_angle),
    dayofyear_cos=np.cos(year_angle),
)

# Month number -> meteorological season label.
season_months = {
    'Winter': (12, 1, 2),
    'Spring': (3, 4, 5),
    'Summer': (6, 7, 8),
    'Autumn': (9, 10, 11),
}
season_map = {
    month: label
    for label, months in season_months.items()
    for month in months
}

# One-hot encode the season; drop_first removes the first category as the baseline.
season_dummies = pd.get_dummies(
    month_number.map(season_map),
    prefix='season',
    drop_first=True,
    dtype=float,
)
df = pd.concat([df, season_dummies], axis=1)

# The raw month / day-of-year / season columns must not survive this step,
# even if the input file already carried columns with those names.
df = df.drop(columns=['month', 'day_of_year', 'season'], errors='ignore')

# Weather-derived features.
# Every source column is treated as optional: a feature is only built when the
# columns it needs are present in the frame.
if {'sunrise', 'sunset'}.issubset(df.columns):
    # Length of daylight in hours.
    df['day_length_hours'] = (df['sunset'] - df['sunrise']).dt.total_seconds() / 3600

if {'tempmax', 'tempmin'}.issubset(df.columns):
    df['dtr'] = df['tempmax'] - df['tempmin']           # Daily Temperature Range
    df['dtr_lag1'] = df['dtr'].shift(1)                 # Lagged version

if 'winddir' in df.columns:
    # Direction in degrees is circular, so it is replaced by its sin/cos pair.
    wind_radians = np.deg2rad(df['winddir'])
    df['winddir_sin'] = np.sin(wind_radians)
    df['winddir_cos'] = np.cos(wind_radians)
    df = df.drop(columns=['winddir'])


# Binary flags
df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)  # dayofweek: 5 = Sat, 6 = Sun
# 'precip' is guarded like the other optional weather columns so that a file
# without it does not abort the whole cell with a KeyError.
if 'precip' in df.columns:
    df['is_rainy'] = (df['precip'] > 0).astype(int)

# Conditions multi-hot encoding
mlb = None
if 'conditions' in df.columns:
    def split_conditions(s):
        """Turn a comma-separated conditions string into a list of Title-Cased labels.

        Missing values and empty strings give an empty list; blank fragments
        between commas are discarded.
        """
        if pd.isna(s) or s == '':
            return []
        pieces = (chunk.strip() for chunk in re.split(r',\s*', str(s).strip()))
        return [piece.title() for piece in pieces if piece != '']

    cond_series = df['conditions'].fillna('').astype(str).map(split_conditions)
    mlb = MultiLabelBinarizer(sparse_output=False)

    try:
        indicator_matrix = mlb.fit_transform(cond_series)
        cond_encoded = pd.DataFrame(
            indicator_matrix,
            columns=mlb.classes_,
            index=df.index,
            dtype=float,
        )
    except ValueError:
        # Fall back to a frame with no indicator columns.
        cond_encoded = pd.DataFrame(index=df.index)

    # Drop first column to prevent collinearity. Slicing by position is a no-op
    # when there are no columns, and the class labels are distinct, so this
    # removes exactly the first indicator.
    cond_encoded = cond_encoded.iloc[:, 1:]

    features_without_text = df.drop(columns=['conditions'], errors='ignore')
    df = pd.concat([features_without_text, cond_encoded], axis=1)


# Final summary
# Columns this cell is expected to have left in the frame. The raw 'month',
# 'day_of_year' and 'season' columns are dropped earlier in the cell, so they are
# represented here by their encoded replacements instead; otherwise the count
# below would silently under-report.
temporal_features = [
    'year', 'day', 'day_of_week', 'week_of_year',
    'month_sin', 'month_cos', 'dayofyear_sin', 'dayofyear_cos',
    'is_weekend', 'is_rainy', 'dtr'
]
# Season dummies depend on which seasons occur in the data, so discover them by prefix.
temporal_features += [col for col in df.columns if str(col).startswith('season_')]

print("\nFEATURE SUMMARY")
print(f"Total columns: {len(df.columns)}")
print(f"Temporal features created: {len([f for f in temporal_features if f in df.columns])}")
print(f"Sample columns: {df.columns.tolist()[:10]}")

FileNotFoundError: [Errno 2] No such file or directory: '../data/processed/daily_data_clean.csv'

## 3. Time Series Lag Features

In [None]:
# Create lag features for time series forecasting.
# Rows must already be in chronological order: shift(k) means "k rows earlier".
print("CREATING LAG FEATURES")

# Temperature lag features (key for time series prediction):
# previous day, 2 and 3 days ago, and one week ago.
for days_back in (1, 2, 3, 7):
    df[f'temp_lag{days_back}'] = df['temp'].shift(days_back)

# Temperature extremes lag features
if all(col in df.columns for col in ['tempmax', 'tempmin']):
    df['tempmax_lag1'] = df['tempmax'].shift(1)
    df['tempmin_lag1'] = df['tempmin'].shift(1)
    # NOTE(review): temp_range / temp_range_lag1 use the same formula as the
    # 'dtr' / 'dtr_lag1' columns built in the previous cell, so they are exact
    # duplicates. Kept because downstream code may read them — confirm and remove one pair.
    df['temp_range'] = df['tempmax'] - df['tempmin']
    df['temp_range_lag1'] = df['temp_range'].shift(1)

# Weather variables lag features
weather_vars = ['humidity', 'precip', 'windspeed', 'sealevelpressure', 'cloudcover']
for var in weather_vars:
    if var in df.columns:
        df[f'{var}_lag1'] = df[var].shift(1)

print("Lag features created:")
# Match the '_lag<N>' suffix rather than any name that merely contains 'lag'.
lag_features = [col for col in df.columns if re.search(r'_lag\d+$', str(col))]
for i, feature in enumerate(lag_features, 1):
    print(f"  {i:2d}. {feature}")

# Remove only the leading rows that lack full lag history. Restricting dropna to
# the lag columns keeps rows whose sole gap is in an unrelated column, which would
# otherwise vanish silently and leave holes in the daily sequence.
rows_before = len(df)
df = df.dropna(subset=lag_features).reset_index(drop=True)
print(f"Rows dropped for incomplete lags: {rows_before - len(df)}")

print(f"\nTotal lag features: {len(lag_features)}")

⏰ CREATING LAG FEATURES
✅ Lag features created:
   1. dtr_lag1
   2. temp_lag1
   3. temp_lag2
   4. temp_lag3
   5. temp_lag7
   6. tempmax_lag1
   7. tempmin_lag1
   8. temp_range_lag1
   9. humidity_lag1
  10. precip_lag1
  11. windspeed_lag1
  12. sealevelpressure_lag1
  13. cloudcover_lag1

📊 Total lag features: 13


In [None]:
# Destination for the engineered feature table (relative to the kernel's working directory).
processed_path = '../data/processed/daily_data_engineered.csv'

# Save as csv, without writing the positional index as an extra column.
df.to_csv(processed_path, index=False)

# This cell writes the engineered features, not the cleaned input, so say so.
print(f"Engineered data saved successfully to: {processed_path}")
print(f"Shape: {df.shape}")

Cleaned data saved successfully to: ../data/processed/daily_data_with_feature_engineering.csv
Shape: (3647, 59)
