# Objective: 
- To design, train, evaluate, and fine-tune various machine learning models (LightGBM, TensorFlow, PyTorch) for generating alpha signals based on the engineered features.

# Load Features & Target

In [None]:
import os
import pandas as pd
import yaml
from sklearn.preprocessing import StandardScaler, MinMaxScaler

def load_datasets(processed_dir="data/processed", base_filename="final_features"):
    """
    Read the three dataset splits from their Parquet files.

    Each split is expected at ``<processed_dir>/<base_filename>_<split>.parquet``
    where ``<split>`` is ``train``, ``val`` or ``test``.

    :param processed_dir: str, directory holding the Parquet files
    :param base_filename: str, common filename prefix of the three splits
    :return: tuple ``(train, val, test)`` of DataFrames
    """
    frames = []
    for split_name in ("train", "val", "test"):
        split_path = os.path.join(processed_dir, f"{base_filename}_{split_name}.parquet")
        frames.append(pd.read_parquet(split_path))
    train, val, test = frames
    return train, val, test

def apply_final_scaling(train, val, test, config_path="ml_config.yaml"):
    """
    Scale the feature columns of the train/val/test frames as configured in YAML.

    The scaler is fitted on ``train`` only and then applied to all three
    splits. The input frames are not modified; scaled copies are returned.

    Recognised keys under the ``scaling`` section of the config:
      - ``method``: "standard" (default) or "minmax"
      - ``feature_columns``: columns to scale; defaults to every column of
        ``train`` that is not listed in ``target_columns``
      - ``target_columns``: columns excluded from the default feature list
        (default ``["target"]``)

    :param train: pd.DataFrame, split used to fit the scaler
    :param val: pd.DataFrame, validation split
    :param test: pd.DataFrame, test split
    :param config_path: str, path to the YAML config file
    :return: tuple ``(train, val, test)`` of scaled DataFrame copies
    :raises ValueError: if ``method`` is neither "standard" nor "minmax"
    """
    with open(config_path, "r") as f:
        # safe_load returns None for an empty file; treat that as "no settings".
        config = yaml.safe_load(f) or {}
    # A bare "scaling:" key parses to None, so fall back to an empty section.
    scaling_cfg = config.get("scaling") or {}
    method = scaling_cfg.get("method", "standard")

    feature_cols = scaling_cfg.get("feature_columns")
    if feature_cols is None:
        target_cols = scaling_cfg.get("target_columns", ["target"])
        feature_cols = [col for col in train.columns if col not in target_cols]

    if method == "standard":
        scaler = StandardScaler()
    elif method == "minmax":
        scaler = MinMaxScaler()
    else:
        raise ValueError(f"Unknown scaling method: {method}")

    # Work on copies so the caller's DataFrames are never mutated behind its back.
    train, val, test = train.copy(), val.copy(), test.copy()

    # Nothing to scale: return the (copied) frames as they are.
    if len(feature_cols) == 0:
        return train, val, test

    # Fit on the training split only, then reuse those statistics everywhere.
    scaler.fit(train[feature_cols])
    train[feature_cols] = scaler.transform(train[feature_cols])
    val[feature_cols] = scaler.transform(val[feature_cols])
    test[feature_cols] = scaler.transform(test[feature_cols])

    return train, val, test

if __name__ == "__main__":
    # Load the three splits, then scale them with statistics fitted on train.
    train, val, test = load_datasets()
    train, val, test = apply_final_scaling(train, val, test, config_path="ml_config.yaml")
    print("Final scaling and transformations applied.")

# Configuration & Hyperparameter Loading

In [None]:
import yaml

def load_model_config(config_path="config/ml_config.yaml"):
    """
    Parse the YAML file holding model configurations and hyperparameters.

    :param config_path: str, path to the YAML config file
    :return: the parsed YAML content (a dict for a mapping-style file)
    """
    with open(config_path, "r") as config_file:
        return yaml.safe_load(config_file)

# Example usage:
# model_config = load_model_config()
# print(model_config)

# Model Training - LightGBM/XGBoost

In [None]:
import yaml
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import TimeSeriesSplit
from src.models.traditionals_ml_models import GradientBoostingAlphaModel

def load_ml_config(config_path="config/ml_config.yaml"):
    """
    Parse the ML YAML config file and return its content.

    :param config_path: str, path to the YAML config file
    :return: the parsed YAML content
    """
    with open(config_path, "r") as config_file:
        parsed = yaml.safe_load(config_file)
    return parsed

def evaluate_regression(y_true, y_pred):
    """
    Compute standard regression metrics for a set of predictions.

    :param y_true: ground-truth target values
    :param y_pred: predicted target values
    :return: dict with keys "RMSE", "MAE" and "R2"
    """
    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return {"RMSE": rmse, "MAE": mae, "R2": r2}

def run_time_series_cv(X, y, model_params, n_splits=5):
    """
    Cross-validate a GradientBoostingAlphaModel with expanding time-series folds.

    A fresh model is trained per fold; per-fold metrics, their average, and
    (when the fitted estimator exposes ``feature_importances_``) the average
    feature importances are printed.

    :param X: feature array, indexable by integer index arrays
    :param y: target array, indexable by integer index arrays
    :param model_params: parameters passed to GradientBoostingAlphaModel
    :param n_splits: int, number of TimeSeriesSplit folds
    :return: tuple ``(avg_metrics, avg_importances)``; ``avg_importances`` is
             None when no fold produced feature importances
    """
    splitter = TimeSeriesSplit(n_splits=n_splits)
    fold_metrics = []
    fold_importances = []

    for fold_idx, (fit_rows, holdout_rows) in enumerate(splitter.split(X)):
        X_fit, X_holdout = X[fit_rows], X[holdout_rows]
        y_fit, y_holdout = y[fit_rows], y[holdout_rows]

        fold_model = GradientBoostingAlphaModel(model_params)
        fold_model.train(X_fit, y_fit)
        holdout_preds = fold_model.predict(X_holdout)

        scores = evaluate_regression(y_holdout, holdout_preds)
        fold_metrics.append(scores)

        # Not every underlying estimator exposes importances.
        if hasattr(fold_model.model, "feature_importances_"):
            fold_importances.append(fold_model.model.feature_importances_)
        print(f"Fold {fold_idx+1} metrics: {scores}")

    # Average each metric across folds.
    avg_metrics = {}
    for metric_name in fold_metrics[0]:
        avg_metrics[metric_name] = np.mean([scores[metric_name] for scores in fold_metrics])
    print(f"\nAverage CV Metrics: {avg_metrics}")

    # Average importances across folds, if any were collected.
    avg_importances = None
    if fold_importances:
        avg_importances = np.mean(fold_importances, axis=0)
        print("\nAverage Feature Importances:")
        for feature_idx, importance in enumerate(avg_importances):
            print(f"Feature {feature_idx}: {importance:.4f}")

    return avg_metrics, avg_importances

if __name__ == "__main__":
    # Data location, column selection and hyperparameters all come from the
    # YAML config.
    config = load_ml_config()
    data = pd.read_parquet(config["data_path"])
    feature_cols, target_col = config["feature_columns"], config["target_column"]

    # Plain NumPy arrays so the CV routine can index rows by fold positions.
    X, y = data[feature_cols].values, data[target_col].values

    model_params = config.get("model_params", {})
    n_splits = config.get("time_series_cv_splits", 5)
    run_time_series_cv(X, y, model_params, n_splits)

# Model Training - TensorFlow/Keras

In [None]:
import os
import yaml
import numpy as np
import pandas as pd
import tensorflow as tf
from src.models import tensorflow_models

def load_config(config_path="config/ml_config.yaml"):
    """
    Parse the YAML config file and return its content.

    :param config_path: str, path to the YAML config file
    :return: the parsed YAML content
    """
    with open(config_path, "r") as config_file:
        settings = yaml.safe_load(config_file)
    return settings

def get_model(model_type, input_shape, config):
    """
    Build the TensorFlow model registered under ``model_type``.

    :param model_type: one of "dnn", "lstm", "cnn", "gru", "bidirectional_lstm"
    :param input_shape: input shape forwarded to the builder
    :param config: model configuration forwarded to the builder
    :return: whatever the matching ``tensorflow_models.build_*`` function returns
    :raises ValueError: if ``model_type`` is not one of the known types
    """
    # (model type, builder name) pairs, checked in order; the builder is looked
    # up by name only for the matching type.
    builders = (
        ("dnn", "build_dnn_model"),
        ("lstm", "build_lstm_model"),
        ("cnn", "build_cnn_model"),
        ("gru", "build_gru_model"),
        ("bidirectional_lstm", "build_bidirectional_lstm_model"),
    )
    for known_type, builder_name in builders:
        if model_type == known_type:
            build = getattr(tensorflow_models, builder_name)
            return build(input_shape, config)
    raise ValueError(f"Unknown model type: {model_type}")

def train_tf_model(model, X_train, y_train, X_val, y_val, config):
    """
    Compile and fit a Keras model using settings from ``config``.

    Config keys read (with defaults): ``early_stopping`` (True), ``patience``
    (10), ``checkpoint_path`` (no checkpointing when missing/empty),
    ``optimizer`` ("adam"), ``loss`` ("mse"), ``metrics`` (["mae"]),
    ``epochs`` (100), ``batch_size`` (32).

    Note that the model is always (re)compiled here.

    :param model: model exposing ``compile`` and ``fit``
    :param X_train: training features
    :param y_train: training targets
    :param X_val: validation features
    :param y_val: validation targets
    :param config: dict of training settings
    :return: the object returned by ``model.fit``
    """
    use_early_stopping = config.get("early_stopping", True)
    checkpoint_path = config.get("checkpoint_path")

    callbacks = []
    if use_early_stopping:
        early_stop = tf.keras.callbacks.EarlyStopping(
            monitor="val_loss",
            patience=config.get("patience", 10),
            restore_best_weights=True
        )
        callbacks.append(early_stop)
    if checkpoint_path:
        checkpoint = tf.keras.callbacks.ModelCheckpoint(
            filepath=checkpoint_path,
            monitor="val_loss",
            save_best_only=True
        )
        callbacks.append(checkpoint)

    compile_kwargs = {
        "optimizer": config.get("optimizer", "adam"),
        "loss": config.get("loss", "mse"),
        "metrics": config.get("metrics", ["mae"]),
    }
    model.compile(**compile_kwargs)

    fit_kwargs = {
        "validation_data": (X_val, y_val),
        "epochs": config.get("epochs", 100),
        "batch_size": config.get("batch_size", 32),
        "callbacks": callbacks,
        "verbose": 2,
    }
    return model.fit(X_train, y_train, **fit_kwargs)

def evaluate_tf_model(model, X_val, y_val):
    """
    Evaluate a compiled Keras model on the validation set and print the scores.

    :param model: model exposing ``evaluate`` and ``metrics_names``
    :param X_val: validation features
    :param y_val: validation targets
    :return: the raw value returned by ``model.evaluate`` (a scalar loss or a
             list of loss/metric values)
    """
    results = model.evaluate(X_val, y_val, verbose=0)
    # evaluate() yields a bare scalar when only the loss is tracked; wrap it so
    # it can still be paired with its name.
    try:
        values = list(results)
    except TypeError:
        values = [results]
    print("Validation results:", dict(zip(model.metrics_names, values)))
    return results

# Example hook for a tf-quant-finance custom loss (if applicable)
def custom_financial_loss(y_true, y_pred):
    """
    Placeholder loss: mean of the squared difference between targets and predictions.

    Replace the body with a tf-quant-finance loss if needed.
    """
    squared_error = tf.square(y_true - y_pred)
    return tf.reduce_mean(squared_error)

if __name__ == "__main__":
    # Load config and data
    config = load_config()
    data = pd.read_parquet(config["data_path"])
    feature_cols = config["feature_columns"]
    target_col = config["target_column"]
    X = data[feature_cols].values
    y = data[target_col].values

    # A single column selected by name yields a 1-D array; promote it to
    # (samples, 1) so the feature axis always exists.
    if X.ndim == 1:
        X = X.reshape(-1, 1)

    # Split data: first 80% of rows for training, last 20% for validation
    # (assuming already split upstream, else do so here)
    split = int(len(X) * 0.8)
    X_train, X_val = X[:split], X[split:]
    y_train, y_val = y[:split], y[split:]

    # Build model; everything after the sample axis is the per-sample input shape
    input_shape = X_train.shape[1:]
    model = get_model(config["nn_model_type"], input_shape, config.get("nn_model_config", {}))

    # Optionally use tf-quant-finance custom loss.
    # train_tf_model always compiles the model from the config it receives, so
    # the custom loss must be passed through that config; compiling here would
    # simply be overwritten.
    train_config = dict(config)
    if config.get("use_tfqf_loss", False):
        train_config["loss"] = custom_financial_loss

    # Train
    train_tf_model(model, X_train, y_train, X_val, y_val, train_config)

    # Evaluate
    evaluate_tf_model(model, X_val, y_val)

# Model Training - PyTorch

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
import pandas as pd
from src.models.pytorch_models import CustomLSTM, CustomDNN

def get_model(model_type, input_size, config):
    """
    Instantiate the PyTorch model registered under ``model_type``.

    :param model_type: "lstm" (CustomLSTM) or "dnn" (CustomDNN)
    :param input_size: number of input features
    :param config: dict of hyperparameters; missing keys fall back to defaults
    :return: the constructed model
    :raises ValueError: if ``model_type`` is neither "lstm" nor "dnn"
    """
    if model_type == "lstm":
        lstm_kwargs = {
            "input_size": input_size,
            "hidden_size": config.get("hidden_size", 64),
            "num_layers": config.get("num_layers", 1),
            "dropout": config.get("dropout", 0.0),
            "output_size": config.get("output_size", 1),
            "bidirectional": config.get("bidirectional", False),
        }
        return CustomLSTM(**lstm_kwargs)

    if model_type == "dnn":
        dnn_kwargs = {
            "input_size": input_size,
            "hidden_layers": config.get("hidden_layers", [64, 32]),
            "dropout": config.get("dropout", 0.0),
            "output_size": config.get("output_size", 1),
        }
        return CustomDNN(**dnn_kwargs)

    raise ValueError(f"Unknown model type: {model_type}")

def train_pytorch_model(model, train_loader, val_loader, config):
    """
    Train a PyTorch regression model and restore its best-validation weights.

    Uses MSE loss and Adam. After every epoch the mean validation loss is
    computed; the weights of the epoch with the lowest validation loss are
    snapshotted and loaded back into the model before returning.

    :param model: torch.nn.Module to train
    :param train_loader: iterable of ``(X_batch, y_batch)`` training batches
    :param val_loader: iterable of ``(X_batch, y_batch)`` validation batches
    :param config: dict; reads ``lr`` (default 1e-3) and ``epochs`` (default 50)
    :return: the trained model (moved to CUDA when available, else CPU)
    """
    import copy  # local import: only needed for the weight snapshot

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=config.get("lr", 1e-3))
    epochs = config.get("epochs", 50)
    best_val_loss = float("inf")
    best_model_state = None

    for epoch in range(epochs):
        model.train()
        train_losses = []
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            output = model(X_batch)
            loss = criterion(output.squeeze(), y_batch)
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())

        model.eval()
        val_losses = []
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                output = model(X_batch)
                loss = criterion(output.squeeze(), y_batch)
                val_losses.append(loss.item())

        # An empty loader has no mean; report NaN instead of emitting a warning.
        avg_train_loss = np.mean(train_losses) if train_losses else float("nan")
        avg_val_loss = np.mean(val_losses) if val_losses else float("nan")
        print(f"Epoch {epoch+1}/{epochs} - Train Loss: {avg_train_loss:.4f} - Val Loss: {avg_val_loss:.4f}")

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            # state_dict() hands back references to the live parameters, which
            # later optimizer steps overwrite; deep-copy to freeze this epoch.
            best_model_state = copy.deepcopy(model.state_dict())

    # No snapshot exists when epochs == 0 or no validation loss was finite.
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
    return model

def evaluate_pytorch_model(model, data_loader):
    """
    Run the model over a data loader and report MSE and MAE.

    :param model: torch.nn.Module to evaluate
    :param data_loader: iterable of ``(X_batch, y_batch)`` batches
    :return: tuple ``(mse, mae)``
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()

    batch_preds, batch_targets = [], []
    with torch.no_grad():
        for features, targets in data_loader:
            batch_output = model(features.to(device))
            batch_preds.append(batch_output.cpu().numpy())
            batch_targets.append(targets.numpy())

    # Stitch the batches together and drop singleton axes before comparing.
    predicted = np.concatenate(batch_preds).squeeze()
    actual = np.concatenate(batch_targets).squeeze()

    residuals = predicted - actual
    mse = np.mean(residuals ** 2)
    mae = np.mean(np.abs(residuals))
    print(f"Test MSE: {mse:.4f}, MAE: {mae:.4f}")
    return mse, mae

if __name__ == "__main__":
    # Example: Load data
    df = pd.read_parquet("data/processed/final_features_train.parquet")
    feature_cols = [col for col in df.columns if "feature" in col or "momentum" in col or "volatility" in col]
    target_col = "forward_return_5d"
    X = df[feature_cols].values.astype(np.float32)
    y = df[target_col].values.astype(np.float32)
    # For LSTM, reshape X to (samples, timesteps, features) if needed
    # Here, assume DNN: (N, F)
    X_tensor = torch.tensor(X)
    y_tensor = torch.tensor(y)
    dataset = TensorDataset(X_tensor, y_tensor)
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size

    # Chronological split: the first 80% of rows train, the last 20% validate.
    # A random split would put rows from the validation period into training
    # and inflate validation scores through look-ahead leakage.
    # NOTE(review): assumes the rows are stored in time order — confirm.
    train_dataset = TensorDataset(X_tensor[:train_size], y_tensor[:train_size])
    val_dataset = TensorDataset(X_tensor[train_size:], y_tensor[train_size:])
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

    # Model config
    config = {
        "hidden_size": 64,
        "num_layers": 1,
        "dropout": 0.1,
        "output_size": 1,
        "epochs": 30,
        "lr": 1e-3
    }

    # Build, train, and evaluate model
    model = get_model("dnn", input_size=X.shape[1], config=config)
    model = train_pytorch_model(model, train_loader, val_loader, config)
    evaluate_pytorch_model(model, val_loader)

# Model Selection & Ensembling 

In [None]:
import numpy as np
import pandas as pd

def compare_model_performance(val_preds_dict, y_val, metrics=None):
    """
    Compare validation performance of different models.

    :param val_preds_dict: dict, model_name -> np.array of predictions
    :param y_val: np.array, true validation targets
    :param metrics: list of metrics; each entry is either a callable taking
                    ``(y_true, y_pred)`` (labelled by its ``__name__``) or a
                    ``(name, callable)`` pair. Defaults to RMSE, MAE and R2.
    :return: pd.DataFrame with one row per metric and one column per model
    """
    if metrics is None:
        from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
        metrics = [
            ("RMSE", lambda y, p: np.sqrt(mean_squared_error(y, p))),
            ("MAE", mean_absolute_error),
            ("R2", r2_score)
        ]

    # Normalise every entry to a (name, function) pair so bare callables work
    # alongside explicitly named ones.
    named_metrics = []
    for entry in metrics:
        if callable(entry):
            named_metrics.append((getattr(entry, "__name__", repr(entry)), entry))
        else:
            name, func = entry
            named_metrics.append((name, func))

    results = {
        model_name: {name: func(y_val, preds) for name, func in named_metrics}
        for model_name, preds in val_preds_dict.items()
    }
    return pd.DataFrame(results)

def ensemble_predictions(val_preds_dict, method="mean"):
    """
    Ensemble predictions from multiple models.

    :param val_preds_dict: dict, model_name -> array of predictions; all
                           arrays must have the same number of rows
    :param method: 'mean' (average), 'median', or a custom function that takes
                   the ``(n_samples, n_models)`` prediction matrix and returns
                   one value per sample
    :return: np.array of ensembled predictions (for a custom function,
             whatever that function returns)
    :raises ValueError: if no predictions are given or ``method`` is unknown
    """
    if not val_preds_dict:
        raise ValueError("No predictions to ensemble")

    # One column per model, one row per sample.
    preds_matrix = np.column_stack(list(val_preds_dict.values()))

    if callable(method):
        return method(preds_matrix)
    if method == "mean":
        return np.mean(preds_matrix, axis=1)
    if method == "median":
        return np.median(preds_matrix, axis=1)
    raise ValueError("Unknown ensembling method")

# Example usage:
# val_preds_dict = {
#     "lgbm": lgbm_val_preds,
#     "tensorflow": tf_val_preds,
#     "pytorch": torch_val_preds
# }
# y_val = ... # your validation targets
# results_df = compare_model_performance(val_preds_dict, y_val)
# print(results_df)
# # Simple ensembling
# ensemble_val_preds = ensemble_predictions(val_preds_dict, method="mean")
# # Evaluate ensemble
# from sklearn.metrics import mean_squared_error
# print("Ensemble RMSE:", np.sqrt(mean_squared_error(y_val, ensemble_val_preds)))

# Generate Out-of-Sample Predictions

In [None]:
import os
import numpy as np
import pandas as pd

def save_test_predictions(model, X_test, test_index, output_path):
    """
    Generate and save predictions on the test set.

    The parent directory of ``output_path`` is created when it does not exist.

    :param model: trained model or ensemble (must have .predict method)
    :param X_test: np.ndarray or pd.DataFrame, test features
    :param test_index: index or identifier for each test row (e.g., date, ticker)
    :param output_path: str, where to save the predictions CSV
    """
    preds = model.predict(X_test)
    results = pd.DataFrame({
        "index": test_index,
        # Flatten (N, 1)-shaped outputs into a single column of values.
        "prediction": np.ravel(preds)
    })
    # A bare filename has no directory part, and makedirs("") would raise.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    results.to_csv(output_path, index=False)
    print(f"Test predictions saved to {output_path}")

def save_ensemble_test_predictions(models, X_test, test_index, output_path, method="mean"):
    """
    Generate and save ensemble predictions on the test set.

    The parent directory of ``output_path`` is created when it does not exist.

    :param models: list of trained models (each must have .predict)
    :param X_test: np.ndarray or pd.DataFrame
    :param test_index: index or identifier for each test row
    :param output_path: str, where to save the predictions CSV
    :param method: 'mean' or 'median'
    :raises ValueError: if ``method`` is unknown or ``models`` is empty
    """
    # Validate cheap arguments first so a typo does not cost a full inference pass.
    if method == "mean":
        combine = np.mean
    elif method == "median":
        combine = np.median
    else:
        raise ValueError("Unknown ensemble method")
    if len(models) == 0:
        raise ValueError("At least one model is required for an ensemble")

    # One column per model, one row per test sample.
    preds_matrix = np.column_stack([m.predict(X_test) for m in models])
    ensemble_preds = combine(preds_matrix, axis=1)

    results = pd.DataFrame({
        "index": test_index,
        "prediction": np.ravel(ensemble_preds)
    })
    # A bare filename has no directory part, and makedirs("") would raise.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    results.to_csv(output_path, index=False)
    print(f"Ensemble test predictions saved to {output_path}")

if __name__ == "__main__":
    # Example usage: assemble the test feature matrix and its row identifiers.
    test = pd.read_parquet("data/processed/final_features_test.parquet")

    # Feature columns are picked by name: any column mentioning one of these tags.
    feature_cols = [
        col for col in test.columns
        if any(tag in col for tag in ("feature", "momentum", "volatility"))
    ]
    X_test = test[feature_cols].values

    # Keep the frame's index only when it is named; otherwise number the rows.
    if test.index.name:
        test_index = test.index
    else:
        test_index = np.arange(len(test))

    # Single best model:
    # from joblib import load
    # model = load("models/best_model.pkl")
    # save_test_predictions(model, X_test, test_index, "results/ml_metrics/test_predictions.csv")

    # Ensemble of several models:
    # models = [load("models/model1.pkl"), load("models/model2.pkl"), ...]
    # save_ensemble_test_predictions(models, X_test, test_index, "results/ml_metrics/ensemble_test_predictions.csv", method="mean")

# Save Trained Models

In [None]:
import os

def save_lgbm_model(model, filename="models/trained_models/lgbm_model.pkl"):
    """
    Serialize a trained model to disk with joblib.

    The parent directory of ``filename`` is created when it does not exist.

    :param model: trained model object to pickle
    :param filename: str, destination path of the pickle file
    """
    import joblib
    # A bare filename has no directory part, and makedirs("") would raise.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    joblib.dump(model, filename)
    print(f"LightGBM model saved to {filename}")

def save_tensorflow_model(model, filename="models/trained_models/tf_model.h5"):
    """
    Save a TensorFlow/Keras model via its own ``save`` method.

    The parent directory of ``filename`` is created when it does not exist.

    :param model: model exposing ``save(path)``
    :param filename: str, destination path of the saved model
    """
    # A bare filename has no directory part, and makedirs("") would raise.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    model.save(filename)
    print(f"TensorFlow model saved to {filename}")

def save_pytorch_model(model, filename="models/trained_models/torch_model.pt"):
    """
    Save a PyTorch model's ``state_dict`` to disk.

    Only the parameters are stored, not the model class. The parent directory
    of ``filename`` is created when it does not exist.

    :param model: torch.nn.Module whose weights are saved
    :param filename: str, destination path of the checkpoint
    """
    import torch
    # A bare filename has no directory part, and makedirs("") would raise.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    torch.save(model.state_dict(), filename)
    print(f"PyTorch model saved to {filename}")

# Example usage:
# save_lgbm_model(lgbm_model)
# save_tensorflow_model(tf_model)
# save_pytorch_model(torch_model)