# Hanoi Temperature Forecasting - Machine Learning Pipeline

A clean, modular implementation of sliding-window time-series forecasting with multiple ML algorithms.

## Pipeline Overview
1. **Data Loading & Preprocessing**
2. **Train-Test Split & Sliding Window Creation**
3. **Model Training & Evaluation** 
4. **Results Visualization**
5. **Model Persistence**

In [None]:
# Import required libraries
import os
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Silence library warnings to keep the notebook output readable.
# NOTE(review): this suppresses *every* warning, including deprecation
# notices from pandas/numpy -- consider narrowing the filter.
warnings.filterwarnings('ignore')

# Project root. Defaults to the original developer path, but can be
# overridden with the HANOI_PROJECT_ROOT environment variable so the
# notebook runs on other machines without editing this cell.
project_root = os.environ.get(
    'HANOI_PROJECT_ROOT',
    '/home/vungocduong/Hanoi-Temperature-Forecasting',
)

# Make the project's `src` package importable. The membership check keeps
# sys.path free of duplicate entries when the cell is executed repeatedly.
if project_root not in sys.path:
    sys.path.append(project_root)

# Import custom modules (resolved through the path entry added above)
from src.data.sliding_window import TimeSeriesWindowProcessor
from src.models.ml_trainer import MLModelTrainer
from src.visualization.model_plots import ModelVisualizer

print("‚úÖ All modules imported successfully!")
print(f"üìÅ Project root: {project_root}")

## 1. Data Loading & Preprocessing

In [None]:
# Read the raw daily CSV from the project's data folder
data_path = f'{project_root}/data/raw/daily/Daily_Data.csv'
df = pd.read_csv(data_path)

print(f"Raw data shape: {df.shape}")
print(f"Columns: {list(df.columns)}")

# Keep only the numeric columns; non-numeric ones are dropped from the
# feature frame used by the rest of the pipeline
df_features = df.select_dtypes(include=[np.number])
numeric_columns = df_features.columns
print(f"\nNumeric features shape: {df_features.shape}")
print(f"Numeric columns ({len(numeric_columns)}):")
for position, column_name in enumerate(numeric_columns, start=1):
    print(f"  {position:2d}. {column_name}")

In [None]:
# Handle missing values: report every column that contains NaN, replace the
# NaNs with that column's mean, then verify that nothing is left.
print("="*60)
print("CHECKING & HANDLING NaN VALUES")
print("="*60)

nan_counts = df_features.isnull().sum()
nan_columns = nan_counts[nan_counts > 0]

if not nan_columns.empty:
    print(f"\nFound {len(nan_columns)} columns with NaN values:")
    for col, count in nan_columns.items():
        percentage = (count / len(df_features)) * 100
        print(f"  ‚Ä¢ {col:20s}: {count:5d} NaN ({percentage:6.2f}%)")

    # Fill NaN with column mean.
    # NOTE(review): the means are computed over the whole dataset, i.e. before
    # the train/test split performed later in the notebook, so test-period
    # values influence the imputed training values -- confirm this is acceptable.
    print("\nFilling NaN values with column means...")
    col_means = df_features[nan_columns.index].mean()
    # Rebind the frame instead of calling `df_features[col].fillna(...,
    # inplace=True)`: the chained in-place form acts on a temporary Series,
    # is deprecated, and leaves the frame untouched under Copy-on-Write.
    df_features = df_features.fillna(col_means)
    for col, col_mean in col_means.items():
        print(f"  ‚úÖ {col}: Filled with mean = {col_mean:.4f}")

    # Verify no NaN remains
    remaining_nan = df_features.isnull().sum().sum()
    print(f"\n‚úÖ Remaining NaN after processing: {remaining_nan}")
else:
    print("\n‚úÖ No NaN values found!")

# Final verification (a column that is entirely NaN has a NaN mean and
# would still trip this check)
assert df_features.isnull().sum().sum() == 0, "ERROR: NaN values still exist!"
print(f"\nüìä Final dataset: {df_features.shape} (100% valid data)")

## 2. Train-Test Split & Sliding Windows

In [None]:
# Positional hold-out split: the first `train_ratio` share of rows is used
# for training and the remaining rows for testing
train_ratio = 0.8
n_rows = len(df_features)
train_index = int(n_rows * train_ratio)
n_test_rows = n_rows - train_index

print(f"Dataset split ({train_ratio:.0%}-{1-train_ratio:.0%}):")
print(f"  ‚Ä¢ Train: 0 to {train_index-1} ({train_index:,} samples)")
print(f"  ‚Ä¢ Test:  {train_index} to {n_rows-1} ({n_test_rows:,} samples)")

# Position of the 'temp' column among the numeric feature columns
temp_col_idx = df_features.columns.tolist().index('temp')
print(f"  ‚Ä¢ Temperature column index: {temp_col_idx}")

In [None]:
# Build the sliding window processor from a single configuration mapping
window_config = {
    'window_length': 25,
    'forecast_horizon': [1, 2, 3, 4, 5],
    'step_length': 5,
}
window_processor = TimeSeriesWindowProcessor(**window_config)

# Echo the configuration as stored on the processor instance
print("Sliding Window Configuration:")
print(f"  ‚Ä¢ Window length: {window_processor.window_length} timesteps")
print(f"  ‚Ä¢ Forecast horizon: {window_processor.forecast_horizon} timesteps")
print(f"  ‚Ä¢ Step length: {window_processor.step_length}")

In [None]:
# Generate windows over the training row positions 0 .. train_index-1
train_indices = np.arange(train_index)
train_windows = window_processor.create_windows(train_indices)

print(f"\nTraining windows: {len(train_windows)}")
print("First 3 windows:")
# Each window is a pair of index arrays: (input positions, forecast positions)
for window_no, window in enumerate(train_windows[:3]):
    input_idx, target_idx = window
    print(f"  Window {window_no}: Train=[{input_idx[0]}:{input_idx[-1]}], Test={target_idx.tolist()}")

# Turn the index windows into data windows taken from the feature frame
print("\nProcessing training windows...")
processed_train_windows = window_processor.process_windows(
    df_features,
    train_windows,
    validate_no_nan=True,
)

print(f"‚úÖ Processed {len(processed_train_windows)} training windows")

In [None]:
# Generate windows over the held-out row positions train_index .. end
n_total_rows = len(df_features)
test_indices = np.arange(train_index, n_total_rows)
test_windows = window_processor.create_windows(test_indices)

print(f"Test windows: {len(test_windows)}")

# Turn the index windows into data windows taken from the feature frame
print("Processing test windows...")
processed_test_windows = window_processor.process_windows(
    df_features,
    test_windows,
    validate_no_nan=True,
)

print(f"‚úÖ Processed {len(processed_test_windows)} test windows")

In [None]:
# Stack the processed windows into input / target arrays
X_train, y_train = window_processor.windows_to_arrays(processed_train_windows)
X_test, y_test = window_processor.windows_to_arrays(processed_test_windows)

print(f"Training arrays:")
print(f"  ‚Ä¢ X_train: {X_train.shape} (windows, timesteps, features)")
print(f"  ‚Ä¢ y_train: {y_train.shape} (windows, forecast_steps, features)")

print(f"\nTest arrays:")
print(f"  ‚Ä¢ X_test: {X_test.shape}")
print(f"  ‚Ä¢ y_test: {y_test.shape}")

# Report and enforce that none of the four arrays contains NaN
named_arrays = {
    'X_train': X_train,
    'y_train': y_train,
    'X_test': X_test,
    'y_test': y_test,
}
for name, values in named_arrays.items():
    has_nan = np.isnan(values).any()
    print(f"  ‚Ä¢ {name} has NaN: {has_nan}")
    assert not has_nan, f"{name} contains NaN values!"

print("\n‚úÖ All arrays verified - NO NaN values!")

## 3. Machine Learning Model Training

In [None]:
# Create the trainer that owns the ML models used below
ml_trainer = MLModelTrainer()

# Let the trainer reshape the windowed arrays into model-ready inputs and
# targets; the temperature column index identifies the target feature
prepared_arrays = ml_trainer.prepare_data(
    X_train, y_train, X_test, y_test, temp_col_idx
)
X_train_flat, y_train_target, X_test_flat, y_test_target = prepared_arrays

print(f"Prepared data for ML models:")
print(f"  ‚Ä¢ X_train_flat: {X_train_flat.shape} (windows, flattened_features)")
print(f"  ‚Ä¢ y_train_target: {y_train_target.shape} (windows, avg_forecast)")
print(f"  ‚Ä¢ X_test_flat: {X_test_flat.shape}")
print(f"  ‚Ä¢ y_test_target: {y_test_target.shape}")

In [None]:
# Fit every model managed by the trainer and collect the per-model results
banner = "="*60
print(banner)
print("TRAINING MACHINE LEARNING MODELS")
print(banner)

results = ml_trainer.train_all_models(
    X_train_flat,
    y_train_target,
    X_test_flat,
    y_test_target,
)

print(f"\n‚úÖ Trained {len(results)} models successfully!")

In [None]:
# Fetch the metrics table and the trainer's pick for the best model
comparison_df = ml_trainer.get_comparison_dataframe()
best_model_name, best_model = ml_trainer.get_best_model()

rule = "="*80
print("\n" + rule)
print("MODEL COMPARISON (Sorted by Test RMSE)")
print(rule)
print(comparison_df.to_string(index=False))
print(rule)

print(f"\nüèÜ BEST MODEL: {best_model_name}")
# NOTE(review): the first row is reported as the best model's metrics; this
# presumes the table is sorted best-first and agrees with get_best_model()
# -- confirm against MLModelTrainer.
best_row = comparison_df.iloc[0]
print(f"  ‚Ä¢ Test RMSE: {best_row['Test RMSE']:.6f}")
print(f"  ‚Ä¢ Test MAE:  {best_row['Test MAE']:.6f}")
print(f"  ‚Ä¢ Test R¬≤:   {best_row['Test R¬≤']:.4f}")
print(f"  ‚Ä¢ Test MAPE: {best_row['Test MAPE (%)']:.2f}%")

## 4. Results Visualization

In [None]:
# Initialize the visualizer used by all plotting cells below
visualizer = ModelVisualizer()

# Draw the model comparison plots from the metrics table, then render them.
# `plt` is matplotlib.pyplot; it must be imported with the other libraries
# in the notebook's import cell, otherwise this call raises NameError.
fig, axes = visualizer.plot_model_comparison(comparison_df)
plt.show()

In [None]:
# Plot the best model's test predictions against the true test targets.
# The predictions are read from the per-model results under the 'y_pred' key.
best_predictions = results[best_model_name]['y_pred']
fig, axes = visualizer.plot_predictions_comparison(
    y_test_target, best_predictions, best_model_name
)
# `plt` is matplotlib.pyplot; it must be imported in the notebook's import
# cell, otherwise this call raises NameError.
plt.show()

In [None]:
# Compare the predictions of every trained model against the test targets;
# `num_samples=100` is passed through to the visualizer.
fig, axes = visualizer.plot_all_models_comparison(
    results, y_test_target, num_samples=100
)
# `plt` is matplotlib.pyplot; it must be imported in the notebook's import
# cell, otherwise this call raises NameError.
plt.show()

## 5. Model Persistence & Results Saving

In [None]:
# Output locations under the project root
models_dir = f'{project_root}/models/daily'
processed_dir = f'{project_root}/data/processed'

# Ensure both output directories exist (no error if they already do)
for output_dir in (models_dir, processed_dir):
    os.makedirs(output_dir, exist_ok=True)

# Persist the trained models
ml_trainer.save_models(models_dir)

# Persist the raw results (pickle) and the comparison table (CSV)
results_path = f'{processed_dir}/ml_results.pkl'
comparison_csv_path = f'{processed_dir}/ml_models_comparison.csv'
ml_trainer.save_results(results_path, comparison_csv_path)

separator = "="*60
print("\n" + separator)
print("‚úÖ ALL MODELS AND RESULTS SAVED!")
print(separator)
print(f"üìÅ Models directory: {models_dir}")
print(f"üìÅ Results file: {results_path}")
print(f"üìÅ Comparison CSV: {comparison_csv_path}")
print(f"üèÜ Best model: {best_model_name}")

In [None]:
# Final step: ask the visualizer to close its figures, then report completion
completion_message = "‚úÖ Pipeline completed successfully!"
visualizer.close_all()
print(completion_message)