# Hanoi Weather Data - Feature Engineering

This notebook creates advanced features for time series temperature forecasting, including lag features, rolling statistics, cyclical encodings, and weather-specific derived features.

## Objectives
1. Create temporal lag features for time series modeling
2. Generate rolling statistics and moving averages
3. Apply cyclical encoding for seasonal patterns
4. Engineer weather-specific features
5. Feature selection and importance analysis
6. Export engineered features for modeling

## 1. Setup and Imports

In [None]:
# Import libraries for feature engineering
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import mutual_info_regression, SelectKBest
import warnings

# Notebook-wide configuration:
#   * show every column when a DataFrame is displayed (no truncation)
#   * suppress all warnings for the rest of the session
pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

# Emit the setup banner in a single call; each argument becomes one output line.
print(
    "🔧 FEATURE ENGINEERING SETUP",
    "=" * 35,
    "✅ Libraries imported successfully!",
    f"📊 Pandas version: {pd.__version__}",
    "🤖 Scikit-learn available for feature selection",
    sep="\n",
)

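## 2. Load Processed Data

> Note: the cell below currently reads the raw daily file (`../data/raw/daily_data.csv`); it is intended to be switched to the processed dataset.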

In [None]:
# Read the daily dataset, parse the timestamp column, and put the rows in
# chronological order with a fresh 0..n-1 index so that row-based time series
# operations see the observations in date order.
df = (
    pd.read_csv('../data/raw/daily_data.csv')  # Will be updated to use processed data
    .assign(datetime=lambda frame: pd.to_datetime(frame['datetime']))
    .sort_values('datetime')
    .reset_index(drop=True)
)

# Basic calendar features: output column name -> attribute of the `.dt` accessor.
# The dict order fixes the order in which the columns are appended.
calendar_parts = {
    'year': 'year',
    'month': 'month',
    'day_of_year': 'dayofyear',
    'day_of_week': 'dayofweek',
}
for column_name, dt_attribute in calendar_parts.items():
    df[column_name] = getattr(df['datetime'].dt, dt_attribute)

# Summary of what was loaded
print("📊 BASE DATA LOADED:")
print(f"Shape: {df.shape}")
first_day = df['datetime'].min().date()
last_day = df['datetime'].max().date()
print(f"Date range: {first_day} to {last_day}")
print("Temporal features created: year, month, day_of_year, day_of_week")

## 3. Time Series Lag Features

In [None]:
# Build lagged copies of the target and of selected weather variables so that
# each row carries values observed in earlier rows.
# NOTE(review): `shift(n)` moves values by n *rows*; these are true n-day lags
# only if the frame holds exactly one row per consecutive day — confirm.
print("⏰ CREATING LAG FEATURES")
print("=" * 30)

# Mean temperature lagged by 1, 2, 3 and 7 rows (previous days / one week back)
for n_back in (1, 2, 3, 7):
    df[f'temp_lag{n_back}'] = df['temp'].shift(n_back)

# Daily extremes: only when both the max and the min column are present
has_extremes = 'tempmax' in df.columns and 'tempmin' in df.columns
if has_extremes:
    for extreme in ('tempmax', 'tempmin'):
        df[f'{extreme}_lag1'] = df[extreme].shift(1)
    # Same-row spread between max and min, plus its one-row lag
    df['temp_range'] = df['tempmax'] - df['tempmin']
    df['temp_range_lag1'] = df['temp_range'].shift(1)

# One-row lag for every listed weather variable that exists in the frame
weather_vars = ['humidity', 'precip', 'windspeed', 'sealevelpressure', 'cloudcover']
for name in weather_vars:
    if name not in df.columns:
        continue
    df[f'{name}_lag1'] = df[name].shift(1)

# Report every column whose name contains the substring 'lag'
print("✅ Lag features created:")
lag_features = [name for name in df.columns if 'lag' in name]
for position, lag_name in enumerate(lag_features, start=1):
    print(f"  {position:2d}. {lag_name}")

print(f"\n📊 Total lag features: {len(lag_features)}")