# Hanoi Weather Data - Feature Engineering

This notebook creates advanced features for time series temperature forecasting, including lag features, rolling statistics, cyclical encodings, and weather-specific derived features.

## Objectives
1. Create temporal lag features for time series modeling
2. Generate rolling statistics and moving averages
3. Apply cyclical encoding for seasonal patterns
4. Engineer weather-specific features
5. Feature selection and importance analysis
6. Export engineered features for modeling

## 1. Setup and Imports

In [1]:
# Import libraries for feature engineering
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import mutual_info_regression, SelectKBest
import warnings

# Notebook-wide configuration: suppress all warnings and let pandas display
# every column of wide frames instead of truncating them.
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None

# Start-up banner, emitted one line at a time.
banner_lines = (
    "ðŸ”§ FEATURE ENGINEERING SETUP",
    "=" * 35,
    "âœ… Libraries imported successfully!",
    f"ðŸ“Š Pandas version: {pd.__version__}",
    "ðŸ¤– Scikit-learn available for feature selection",
)
for banner_line in banner_lines:
    print(banner_line)

ðŸ”§ FEATURE ENGINEERING SETUP
âœ… Libraries imported successfully!
ðŸ“Š Pandas version: 2.2.3
ðŸ¤– Scikit-learn available for feature selection


## 2. Load Processed Data

In [2]:
# Primary daily table produced by the data-processing step.
df = pd.read_csv('../data/processed/prepocessed_daily_data.csv')

# Second city (Guiyang): tag every column with a "_Guiyang" suffix so that
# none of its names collide with the primary table after the merge.
df2 = pd.read_csv('../data/processed/prepocessed_Guiyang_2015-2025_full_data.csv')
df2 = df2.add_suffix('_Guiyang')

# Left join keeps every row of the primary table; the Guiyang date column is
# only needed as the join key and is dropped afterwards.
df = (
    df.merge(df2, how='left', left_on='datetime', right_on='datetime_Guiyang')
      .set_index('datetime')
      .drop(columns=['datetime_Guiyang'])
)
df.index = pd.to_datetime(df.index)

## 3. Time Series Features

In [3]:
# Day length in hours for each city: parse the sunrise/sunset timestamps,
# subtract them, and convert the resulting timedelta to fractional hours.
for city_suffix in ('', '_Guiyang'):
    sunrise_col = f'sunrise{city_suffix}'
    sunset_col = f'sunset{city_suffix}'
    daylight = pd.to_datetime(df[sunset_col]) - pd.to_datetime(df[sunrise_col])
    df[f'day_length_hours{city_suffix}'] = daylight.dt.total_seconds() / 3600.0

# The raw timestamps are no longer needed once the durations exist.
df = df.drop(columns=['sunrise', 'sunset', 'sunrise_Guiyang', 'sunset_Guiyang'])

# Prediction target: the 'temp' value five rows ahead of the current row.
# NOTE(review): this equals "five days ahead" only if the index is sorted and
# has no missing days — confirm against the preprocessing step.
df['target'] = df['temp'].shift(-5)

# Rows without a target (including the last five) cannot be used for training.
df = df[df['target'].notna()].copy()

In [4]:
# Lag features
def create_lag_features(df, cols, lags):
    """Add shifted copies of the given columns to ``df``.

    For every column in ``cols`` and every offset in ``lags`` a new column
    named ``"<col>_lag_<lag>"`` is written, holding ``df[col].shift(lag)``.
    The frame is modified in place and also returned.
    """
    column_lag_pairs = [(name, step) for name in cols for step in lags]
    for name, step in column_lag_pairs:
        df[f"{name}_lag_{step}"] = df[name].shift(step)
    return df

# Columns that receive lag / rolling features: everything except the calendar
# fields and rain flag (for both cities) and the prediction target.
# NOTE(review): nothing here filters by dtype, so this assumes every remaining
# column is numeric — confirm.
calendar_and_flag_columns = ['year', 'month', 'day', 'day_of_year', 'season', 'is_rainy']
excluded_columns = (
    calendar_and_flag_columns
    + [f'{name}_Guiyang' for name in calendar_and_flag_columns]
    + ['target']
)
computing_columns = df.drop(columns=excluded_columns).columns

lag_steps = [1, 2, 3, 5, 7, 10, 14]  # look-back offsets, in rows

# Lag columns are added first; the rolling-window features come afterwards.
df = create_lag_features(df, computing_columns, lag_steps)

# Rolling mean and deviation from it
def compute_rolling(df, horizon, col):
    """Add a trailing-window mean of ``col`` and the deviation from that mean.

    Two columns are written to ``df`` (in place; the frame is also returned):

    * ``rolling_<horizon>_<col>``: mean over the last ``horizon`` rows,
      NaN until a full window is available.
    * ``rolling_<horizon>_<col>_change``: current value minus that mean
      (an absolute difference, not a percentage).
    """
    mean_name = f"rolling_{horizon}_{col}"
    values = df[col]
    window_mean = values.rolling(horizon, min_periods=horizon).mean()
    df[mean_name] = window_mean
    df[f"{mean_name}_change"] = values - window_mean
    return df

# Rolling-window features: for every horizon and every base column, add the
# trailing mean and the current value's deviation from it.
rolling_horizons = [3, 7, 14]  # Rolling windows of 3, 7, 14 days
for horizon in rolling_horizons:
    for col in computing_columns:
        df = compute_rolling(df, horizon, col)

# Drop the warm-up rows at the top of the frame where lag / rolling values are
# undefined: shift(k) leaves the first k rows NaN, and a full window of h rows
# leaves the first h - 1 rows NaN. Deriving the count from the configured lists
# (14 for the current settings) keeps it correct if either list is edited.
warmup_rows = max(max(lag_steps, default=0), max(rolling_horizons, default=1) - 1)
# .copy() gives an independent frame, so later column assignments do not write
# into a slice of the pre-trim frame.
df = df.iloc[warmup_rows:].copy()

# Verify no NaN values exist
nan_summary = df.isna().sum()
print("Summary of NaN values in each column after handling rolling horizons and lagging:")
print(nan_summary[nan_summary > 0])

if (nan_summary > 0).any():
    print("\nThe dataframe contains NaN values.")
else:
    print("\nThe dataframe does not contain any NaN values.")

Summary of NaN values in each column after handling rolling horizons and lagging:
Series([], dtype: int64)

The dataframe does not contain any NaN values.


In [5]:
# Expanding (cumulative) averages per month / day-of-year / year / season
def expand_mean(df):
    """Return the running mean of ``df`` from its first row up to each row.

    Called below on one group of a single column at a time.
    """
    running = df.expanding(min_periods=1)
    return running.mean()

# Grouping keys do not depend on the column being processed, so build them once.
# Dict order fixes the order in which the four averages are added per column.
expanding_group_keys = {
    "month_avg": df.index.month,
    "day_avg": df.index.day_of_year,
    "year_avg": df.index.year,
    "season_avg": df['season'],
}

# For each base column, add the expanding mean within each group: the average
# of all rows so far (current row included) that share the same month,
# day-of-year, year or season.
for col in computing_columns:
    for prefix, group_key in expanding_group_keys.items():
        df[f"{prefix}_{col}"] = df[col].groupby(group_key, group_keys=False).apply(expand_mean)

# Running max / min of 'temp' within each calendar month. These do not depend
# on the loop variable, so they are computed once here rather than on every
# iteration; as a result the two columns now sit at the end of the frame.
df["month_max_temp"] = df['temp'].groupby(expanding_group_keys["month_avg"], group_keys=False).cummax()
df["month_min_temp"] = df['temp'].groupby(expanding_group_keys["month_avg"], group_keys=False).cummin()

In [6]:
# Sanity check: (rows, columns) of the engineered feature table.
df.shape

(3635, 843)

In [7]:
# Persist the engineered feature table for the modeling step. The datetime
# index is written as well (to_csv writes the index by default).
df.to_csv('../data/processed/feature_engineering_daily_data.csv')