# Advanced Time Series Models

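This notebook covers machine learning approaches to time series forecasting: feature engineering, tree-based models (Random Forest and XGBoost), and walk-forward validation.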

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

## 1. Feature Engineering

In [None]:
def create_features(df, target_col='value', lags=(1, 2, 3, 24, 48), windows=(3, 7, 24)):
    """
    Create lag features, rolling statistics, and time-based features.

    Rolling statistics are computed over *previous* observations only: the
    target is shifted by one row before the window is applied, so no feature
    of row ``t`` contains the value that is to be predicted at ``t``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. Its index must expose ``hour``, ``dayofweek``, ``day``
        and ``month`` (e.g. a ``DatetimeIndex``). The frame is not modified.
    target_col : str
        Column from which lag and rolling features are derived.
    lags : iterable of int
        Shift distances in rows; each one produces a ``lag_<n>`` column.
    windows : iterable of int
        Rolling window sizes in rows; each one produces a
        ``rolling_mean_<n>`` and a ``rolling_std_<n>`` column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the additional feature columns. Leading rows
        hold NaN wherever there is not enough history for a lag or window.
    """
    df_features = df.copy()
    target = df_features[target_col]

    # Lag features
    for lag in lags:
        df_features[f'lag_{lag}'] = target.shift(lag)

    # Rolling statistics over strictly past values; rolling on the unshifted
    # target would include the current row and leak it into the features.
    history = target.shift(1)
    for window in windows:
        rolling = history.rolling(window)
        df_features[f'rolling_mean_{window}'] = rolling.mean()
        df_features[f'rolling_std_{window}'] = rolling.std()

    # Time-based features
    index = df_features.index
    df_features['hour'] = index.hour
    df_features['day_of_week'] = index.dayofweek
    df_features['day_of_month'] = index.day
    df_features['month'] = index.month

    # Cyclical encoding so that hour 23 / hour 0 and Sunday / Monday end up
    # close together in feature space.
    hour_angle = 2 * np.pi * df_features['hour'] / 24
    dow_angle = 2 * np.pi * df_features['day_of_week'] / 7
    df_features['hour_sin'] = np.sin(hour_angle)
    df_features['hour_cos'] = np.cos(hour_angle)
    df_features['dow_sin'] = np.sin(dow_angle)
    df_features['dow_cos'] = np.cos(dow_angle)

    return df_features

## 2. Data Preparation

In [None]:
def prepare_ml_data(df, target_col='value', test_size=0.2):
    """
    Prepare data for machine learning models.

    Rows containing NaN are dropped, then the remaining rows are split
    chronologically (no shuffling): the first ``1 - test_size`` fraction of
    rows becomes the training set and the rest the test set. Rows are taken
    in the order they appear in ``df``; the frame is not re-sorted.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the target column and all feature columns.
    target_col : str
        Name of the target column; every other column is used as a feature.
    test_size : float
        Fraction of the cleaned rows reserved for testing, in ``[0, 1]``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.

    Raises
    ------
    ValueError
        If ``test_size`` is outside ``[0, 1]``.
    """
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size!r}")

    # Drop rows with NaN values
    df_clean = df.dropna()

    # Separate features and target
    feature_cols = [col for col in df_clean.columns if col != target_col]

    # Split data (time series split). .iloc keeps the split strictly
    # positional regardless of the index type, whereas plain [] slicing can
    # be interpreted as label-based on numeric indexes.
    split_point = int(len(df_clean) * (1 - test_size))
    train = df_clean.iloc[:split_point]
    test = df_clean.iloc[split_point:]

    X_train = train[feature_cols]
    X_test = test[feature_cols]
    y_train = train[target_col]
    y_test = test[target_col]

    return X_train, X_test, y_train, y_test

## 3. Random Forest Model

In [None]:
def train_random_forest(X_train, y_train, n_estimators=100, random_state=42):
    """
    Fit a ``RandomForestRegressor`` on the training data and return it.

    ``n_estimators`` and ``random_state`` are forwarded to the estimator's
    constructor; every other hyperparameter keeps its library default.
    """
    # fit() returns the fitted estimator itself, so the call can be chained.
    return RandomForestRegressor(
        n_estimators=n_estimators,
        random_state=random_state,
    ).fit(X_train, y_train)

def get_feature_importance(model, feature_names, top_n=10):
    """
    Rank the features of a fitted model by importance.

    Reads ``model.feature_importances_``, pairs each value with the matching
    entry of ``feature_names``, and returns the ``top_n`` highest-ranked rows
    as a DataFrame with columns ``feature`` and ``importance``.
    """
    ranking = pd.DataFrame({
        'feature': feature_names,
        'importance': model.feature_importances_,
    })
    # Most important first
    ranking = ranking.sort_values(by='importance', ascending=False)
    return ranking.head(top_n)

## 4. XGBoost Model

In [None]:
def train_xgboost(X_train, y_train, X_val=None, y_val=None, *,
                  n_estimators=100, max_depth=6, learning_rate=0.1,
                  random_state=42, early_stopping_rounds=10):
    """
    Train XGBoost model.

    When both ``X_val`` and ``y_val`` are supplied, they are used as the
    evaluation set and training stops early once the validation metric has
    not improved for ``early_stopping_rounds`` rounds. If either one is
    missing, the model is trained for the full ``n_estimators`` rounds
    without early stopping.

    ``early_stopping_rounds`` is passed to the estimator constructor rather
    than to ``fit()``: the ``fit()`` keyword was deprecated in XGBoost 1.6
    and removed in 2.0. NOTE(review): this requires XGBoost >= 1.6 - confirm
    against the project's pinned version.

    Parameters
    ----------
    X_train, y_train
        Training features and target.
    X_val, y_val : optional
        Validation features and target used for early stopping.
    n_estimators, max_depth, learning_rate, random_state
        Forwarded to ``xgb.XGBRegressor``.
    early_stopping_rounds : int
        Patience for early stopping; only used when validation data is given.

    Returns
    -------
    xgb.XGBRegressor
        The fitted model.
    """
    use_validation = X_val is not None and y_val is not None

    params = {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'learning_rate': learning_rate,
        'random_state': random_state,
    }
    if use_validation:
        # Only set when an eval_set will be supplied; XGBoost raises if early
        # stopping is configured without validation data.
        params['early_stopping_rounds'] = early_stopping_rounds

    model = xgb.XGBRegressor(**params)

    if use_validation:
        model.fit(X_train, y_train,
                  eval_set=[(X_val, y_val)],
                  verbose=False)
    else:
        model.fit(X_train, y_train)

    return model

## 5. Walk-Forward Validation

In [None]:
def walk_forward_validation(df, model_func, n_splits=5, target_col='value', metric=None):
    """
    Perform walk-forward (expanding window) validation.

    After dropping rows with NaN, the first ``len // (n_splits + 1)`` rows
    form the initial training set. The remaining rows are divided into
    ``n_splits`` consecutive test folds that together cover every remaining
    row exactly once. For each fold a fresh model is trained on all rows
    preceding the fold and evaluated on the fold.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the target column and all feature columns, in
        chronological order.
    model_func : callable
        ``model_func(X_train, y_train)`` must return a fitted object with a
        ``predict(X)`` method.
    n_splits : int
        Number of test folds; must be at least 1.
    target_col : str
        Name of the target column; every other column is used as a feature.
    metric : callable, optional
        ``metric(y_true, y_pred)`` returning a score for one fold. Defaults
        to ``mean_absolute_error``.

    Returns
    -------
    tuple
        ``(mean_score, scores, predictions)``: the unweighted mean of the
        per-fold scores, the list of per-fold scores, and the predictions of
        all folds concatenated in chronological order.

    Raises
    ------
    ValueError
        If ``n_splits`` is less than 1, or if fewer than ``n_splits + 1``
        rows remain after dropping NaN (some fold would be empty).
    """
    if n_splits < 1:
        raise ValueError(f"n_splits must be at least 1, got {n_splits!r}")
    if metric is None:
        metric = mean_absolute_error

    df_clean = df.dropna()
    feature_cols = [col for col in df_clean.columns if col != target_col]

    total_len = len(df_clean)
    if total_len < n_splits + 1:
        raise ValueError(
            f"need at least {n_splits + 1} rows without NaN for "
            f"{n_splits} splits, got {total_len}"
        )

    initial_train_size = total_len // (n_splits + 1)
    remaining = total_len - initial_train_size

    # Fold i is tested on [boundaries[i], boundaries[i + 1]). Deriving both
    # ends from the same formula keeps folds contiguous, so no row between
    # folds or at the tail is skipped when `remaining` is not a multiple of
    # n_splits.
    boundaries = [
        initial_train_size + (i * remaining) // n_splits
        for i in range(n_splits + 1)
    ]

    X_all = df_clean[feature_cols]
    y_all = df_clean[target_col]

    scores = []
    predictions = []

    for train_end, test_end in zip(boundaries[:-1], boundaries[1:]):
        # Train on everything before the fold
        model = model_func(X_all.iloc[:train_end], y_all.iloc[:train_end])

        # Predict the fold
        y_pred = model.predict(X_all.iloc[train_end:test_end])

        # Evaluate
        scores.append(metric(y_all.iloc[train_end:test_end], y_pred))
        predictions.extend(y_pred)

    return np.mean(scores), scores, predictions

## 6. Example Usage

In [None]:
# End-to-end template. Every step except the final print is commented out so
# that this cell runs without any data; uncomment the steps in order once the
# CSV path and column names match your dataset.

# Load and prepare data
# NOTE(review): the path and the 'timestamp' column look like placeholders;
# create_features reads hour/day/month from the index, so the index must be
# datetime-like - confirm against your data.
# df = pd.read_csv('../data/processed/clean_data.csv', parse_dates=['timestamp'], index_col='timestamp')
# df_features = create_features(df)
# X_train, X_test, y_train, y_test = prepare_ml_data(df_features)

# Train models
# rf_model = train_random_forest(X_train, y_train)
# xgb_model = train_xgboost(X_train, y_train)

# Make predictions
# rf_pred = rf_model.predict(X_test)
# xgb_pred = xgb_model.predict(X_test)

# Evaluate (mean absolute error on the held-out test split)
# rf_mae = mean_absolute_error(y_test, rf_pred)
# xgb_mae = mean_absolute_error(y_test, xgb_pred)

print("Example template - uncomment and modify for your data")