# Content for 03_Modeling Pipeline Setup.ipynb
This notebook loads enriched data from 01_data_collection.ipynb, adds lagged features, prepares the data, and trains classical models (LR, RF, LogReg, KMeans) and LSTM models.
It saves the model results to MODEL_RESULTS_DIR (data/model_results).

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, accuracy_score, mean_absolute_error, r2_score
from sklearn.cluster import KMeans
import tensorflow as tf
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers import LSTM, Dense, Input, Dropout, Bidirectional

# Directory layout, rooted at the current working directory (same as previous)
ROOT = os.getcwd()
DATA_DIR = os.path.join(ROOT, "data")
ENRICHED_DIR = os.path.join(DATA_DIR, "enriched")
MODEL_RESULTS_DIR = os.path.join(DATA_DIR, 'model_results')
# The results directory is created up front so later cells can write into it.
os.makedirs(MODEL_RESULTS_DIR, exist_ok=True)

# Commodities to model; each one maps to "<lowercased name>_enriched.csv"
commodities = ["Gold", "WTI", "Wheat", "NaturalGas", "Copper", "Lithium"]

# Read every enriched CSV that exists into a dict keyed by commodity name.
# A missing file is reported and skipped rather than treated as an error.
merged_data = {}
for name in commodities:
    fname = f"{name.lower()}_enriched.csv"
    path = os.path.join(ENRICHED_DIR, fname)
    if not os.path.exists(path):
        print(f"Missing enriched file for {name}")
        continue

    df = pd.read_csv(path)
    # Parse 'Date' so it can be compared against timestamps later on.
    df['Date'] = pd.to_datetime(df['Date'])
    merged_data[name] = df

In [None]:
# Functions for modeling
def prepare_features_targets(df, features, target):
    """Select the model inputs and the target from *df*.

    Args:
        df: DataFrame holding both the feature columns and the target column.
        features: Column label(s) to use as model inputs.
        target: Column label of the value to predict.

    Returns:
        A ``(X, y)`` tuple where ``X`` is ``df[features]`` and ``y`` is
        ``df[target]``.
    """
    return df[features], df[target]

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    When both inputs hold the same number of elements they are flattened to
    1-D before being compared.  Without this, a flat ``(n,)`` target and a
    column-vector ``(n, 1)`` prediction would be broadcast by NumPy into an
    ``(n, n)`` matrix and the mean would be taken over every target/prediction
    pair instead of the matching ones.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values.

    Returns:
        ``mean(|y_true - y_pred| / |y_true|) * 100`` as a float.  Zeros in
        ``y_true`` are not special-cased, so they yield ``inf``/``nan`` as in
        the plain formula.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # Only flatten when the element counts match; differently sized inputs
    # (e.g. a scalar prediction) keep NumPy's normal broadcasting.
    if y_true.size == y_pred.size:
        y_true = y_true.ravel()
        y_pred = y_pred.ravel()
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def create_sequences(data, feature_cols, target_col, seq_length):
    """Build sliding-window samples from a DataFrame.

    Each sample is a window of ``seq_length`` consecutive rows of
    ``feature_cols``; its label is the ``target_col`` value of the row that
    immediately follows the window.

    Args:
        data: DataFrame containing the feature and target columns, in the
            row order the windows should follow.
        feature_cols: Column labels used as window features.
        target_col: Column label of the value to predict.
        seq_length: Number of rows per window.

    Returns:
        A ``(xs, ys)`` tuple of NumPy arrays with one entry per window.  Both
        arrays are empty when ``data`` has no more than ``seq_length`` rows.
    """
    # Convert once up front: selecting the columns inside the loop rebuilt an
    # intermediate DataFrame for every window.
    feature_values = data[feature_cols].values
    target_values = data[target_col].values

    window_ends = range(seq_length, len(data))
    xs = [feature_values[end - seq_length:end] for end in window_ends]
    ys = [target_values[end] for end in window_ends]
    return np.array(xs), np.array(ys)

def add_lagged_features(df, feature_cols, max_lag=5):
    """Append shifted copies of the given columns to *df*.

    For every column in ``feature_cols`` and every lag from 1 to ``max_lag``
    a new column named ``"<col>_lag<lag>"`` is written, holding the column
    shifted down by ``lag`` rows (so the first ``lag`` entries are NaN).

    Args:
        df: DataFrame to extend; it is modified in place.
        feature_cols: Labels of the columns to lag.
        max_lag: Largest lag to generate (inclusive).

    Returns:
        The same DataFrame object that was passed in.
    """
    lags = range(1, max_lag + 1)
    for col in feature_cols:
        source = df[col]
        for lag in lags:
            lagged_name = f'{col}_lag{lag}'
            df[lagged_name] = source.shift(lag)
    return df

In [None]:
# Modeling
target_return = 'Return'
target_ma = 'MA_5'
lag_features = ['GPRD', 'geo_keyword_hits']

results = {}
results_ma = {}

for commodity, df in merged_data.items():
    print(f"Processing commodity: {commodity}")

    df['geo_keyword_hits'].fillna(0, inplace=True)
    df['sentiment'].fillna(0, inplace=True)
    if 'EVENT' in df.columns:
        df['EVENT'].fillna('None', inplace=True)

    # Add lagged features
    df = add_lagged_features(df, lag_features, max_lag=5)
    df.dropna(inplace=True)
    print(f"{commodity} - {len(df)} rows after lagged features added and NaN drops.")

    features_baseline = ['Return_lag1']
    features_enhanced = ['Return_lag1', 'GPRD', 'geo_keyword_hits', 'sentiment'] + \
                        [f'{feat}_lag{lag}' for feat in lag_features for lag in range(1, 6)]

    # Convert to numeric
    df[features_enhanced] = df[features_enhanced].apply(pd.to_numeric, errors='coerce')

    # Train/test split
    split_date = pd.to_datetime('2000-01-01')  # Note: This date is early; adjust if needed for more train data
    train_df = df[df['Date'] < split_date].copy()
    test_df = df[df['Date'] >= split_date].copy()

    print(f"{commodity} - Train size: {len(train_df)}, Test size: {len(test_df)}")
    if len(train_df) < 20 or len(test_df) < 20:
        print(f"Insufficient samples for {commodity}, skipping.")
        continue

    # Classical models for Return
    X_train_base, y_train = prepare_features_targets(train_df, features_baseline, target_return)
    X_test_base, y_test = prepare_features_targets(test_df, features_baseline, target_return)
    X_train_enh = train_df[features_enhanced]
    X_test_enh = test_df[features_enhanced]

    scaler = StandardScaler()
    X_train_enh_scaled = scaler.fit_transform(X_train_enh)
    X_test_enh_scaled = scaler.transform(X_test_enh)

    # Baseline LR
    lr_base = LinearRegression()
    lr_base.fit(X_train_base, y_train)
    y_pred_base = lr_base.predict(X_test_base)
    rmse_base = np.sqrt(mean_squared_error(y_test, y_pred_base))

    # Enhanced RF
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X_train_enh_scaled, y_train)
    y_pred_rf = rf.predict(X_test_enh_scaled)
    rmse_enh = np.sqrt(mean_squared_error(y_test, y_pred_rf))

    results[commodity] = {'RMSE_baseline': rmse_base, 'RMSE_enhanced': rmse_enh}

In [None]:
# Classification
    train_df['Return_binary'] = (train_df['Return'] > 0).astype(int)
    test_df['Return_binary'] = (test_df['Return'] > 0).astype(int)

    train_df = train_df.dropna(subset=features_enhanced)
    test_df = test_df.dropna(subset=features_enhanced)

    X_train_class = scaler.fit_transform(train_df[features_enhanced])
    X_test_class = scaler.transform(test_df[features_enhanced])
    y_train_class = train_df['Return_binary']
    y_test_class = test_df['Return_binary']

    logreg = LogisticRegression(max_iter=200)
    logreg.fit(X_train_class, y_train_class)
    y_pred_class = logreg.predict(X_test_class)
    print(f"{commodity} - Classification Accuracy: {accuracy_score(y_test_class, y_pred_class):.4f}")

    # Clustering
    kmeans = KMeans(n_clusters=2, random_state=42)
    regime_features = df[['Vol_5', 'GPRD', 'geo_keyword_hits']].fillna(0)
    df['Regime'] = kmeans.fit_predict(regime_features)

In [None]:
# LSTM for MA_5
    y_train_ma = train_df[target_ma]
    y_test_ma = test_df[target_ma]

    minmax_scaler = MinMaxScaler()
    train_features_scaled = minmax_scaler.fit_transform(X_train_enh)
    test_features_scaled = minmax_scaler.transform(X_test_enh)

    train_scaled_df = pd.DataFrame(train_features_scaled, columns=features_enhanced, index=train_df.index)
    test_scaled_df = pd.DataFrame(test_features_scaled, columns=features_enhanced, index=test_df.index)

    train_scaled_df[target_ma] = y_train_ma.values  # Align lengths
    test_scaled_df[target_ma] = y_test_ma.values

    train_scaled_df.dropna(subset=features_enhanced + [target_ma], inplace=True)
    test_scaled_df.dropna(subset=features_enhanced + [target_ma], inplace=True)

    seq_length = 10
    X_train_seq, y_train_seq = create_sequences(train_scaled_df, features_enhanced, target_ma, seq_length)
    X_test_seq, y_test_seq = create_sequences(test_scaled_df, features_enhanced, target_ma, seq_length)

    if len(X_train_seq) == 0 or len(X_test_seq) == 0:
        print(f"Skipping LSTM for {commodity} due to insufficient sequences.")
        continue

    early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    model = Sequential([
        Input(shape=(seq_length, len(features_enhanced))),
        Bidirectional(LSTM(128, return_sequences=True)),
        Dropout(0.2),
        LSTM(64),
        Dropout(0.2),
        Dense(1)
    ])
    model.compile(optimizer='adam', loss='mean_squared_error')

    val_split = 0.2 if len(X_train_seq) > 10 else 0.0

    history = model.fit(
        X_train_seq, y_train_seq,
        epochs=100,
        batch_size=64,
        validation_split=val_split,
        callbacks=[early_stop] if val_split > 0 else [],
        verbose=2
    )

    y_pred_lstm = model.predict(X_test_seq)
    rmse_lstm = np.sqrt(mean_squared_error(y_test_seq, y_pred_lstm))
    mae_lstm = mean_absolute_error(y_test_seq, y_pred_lstm)
    mape_lstm = mean_absolute_percentage_error(y_test_seq, y_pred_lstm)
    r2_lstm = r2_score(y_test_seq, y_pred_lstm)

    results_ma[commodity] = {'RMSE_LSTM': rmse_lstm, 'MAE_LSTM': mae_lstm, 'MAPE_LSTM': mape_lstm, 'R2_LSTM': r2_lstm}
    print(f"Finished processing {commodity}\n")

In [None]:
# Persist the per-commodity metric tables: one row per commodity, one column
# per metric (the dicts are keyed by commodity, hence the transpose).
results_df = pd.DataFrame(results).transpose()
results_ma_df = pd.DataFrame(results_ma).transpose()

classical_csv = os.path.join(MODEL_RESULTS_DIR, 'classical_model_results.csv')
lstm_csv = os.path.join(MODEL_RESULTS_DIR, 'lstm_ma_results.csv')
results_df.to_csv(classical_csv)
results_ma_df.to_csv(lstm_csv)

print(f"Saved results to {MODEL_RESULTS_DIR}")