# Data Gathering

In [1]:
from pathlib import Path
from datetime import datetime
import sys

# Define the base directory
base_dir = Path.cwd().resolve().parents[1]

# Define subdirectories
data_dir = base_dir / "data"
model_dir = base_dir / "models"
notebooks_dir = base_dir / "notebooks"

# Append base_dir to sys.path
sys.path.append(str(base_dir))

In [4]:
import pandas as pd
import glob
from pathlib import Path

def get_meta_data(meta_data_dir):
    meta_data_dir = Path(meta_data_dir)
    # Define file patterns for OOF and test predictions
    oof_files = sorted(glob.glob(str(meta_data_dir/"*_oof.csv")))  # Sorted for consistency
    test_files = sorted(glob.glob(str(meta_data_dir/"*_test.csv")))  # Sorted for consistency

    # Ensure the number of files matches
    assert len(oof_files) == len(test_files), "Mismatch in the number of OOF and test files."

    # Function to load and rename files with prefixes
    def load_and_prefix(files, prefix):
        dfs = []
        for file in files:
            model_name = file.split("/")[-1].replace("_oof.csv", "").replace("_test.csv", "")
            df = pd.read_csv(file)
            df.columns = [f"{prefix}{model_name}_F{i+1}" for i in range(df.shape[1])]
            dfs.append(df)
        return pd.concat(dfs, axis=1)

    # Load OOF and test predictions with identical features
    X_meta_train = load_and_prefix(oof_files, prefix="OOF_")
    X_meta_test = load_and_prefix(test_files, prefix="Test_")

    # Ensure the columns are identical between train and test
    X_meta_test.columns = X_meta_train.columns

    # Check the shapes
    print(f"Meta-train shape: {X_meta_train.shape}")
    print(f"Meta-test shape: {X_meta_test.shape}")
    
    return X_meta_train, X_meta_test

X_meta_train, X_meta_test =get_meta_data("notebooks/Ensemble Catboost XGBoost LGBM")

ValueError: No objects to concatenate

In [None]:
# Mock

import pandas as pd
import numpy as np


# Set random seed for reproducibility
np.random.seed(42)

# Create a mock meta_features DataFrame (3 models, 100 samples)
X_meta_train = pd.DataFrame({
    "model_1": np.random.rand(100) * 0.8 + 0.1,  # Predictions from model 1
    "model_2": np.random.rand(100) * 0.9 + 0.05, # Predictions from model 2
    "model_3": np.random.rand(100) * 0.85 + 0.1  # Predictions from model 3
})

# Create a mock meta_predictions DataFrame for the test set (3 models, 50 samples)
X_meta_test = pd.DataFrame({
    "model_1": np.random.rand(50) * 0.8 + 0.1,
    "model_2": np.random.rand(50) * 0.9 + 0.05,
    "model_3": np.random.rand(50) * 0.85 + 0.1
})

# Create a mock y_train Series (true target values)
y_train = pd.Series(np.random.rand(100) * 0.9 + 0.05)

# Print the shapes and samples
print("meta_features (train):")
print(X_meta_train.head())
print("\nmeta_predictions (test):")
print(X_meta_test.head())
print("\ny_train (target):")
print(y_train.head())


# Meta Training

In [None]:
import optuna
import numpy as np
from sklearn.metrics import mean_squared_error
from optuna.visualization import plot_optimization_history
import matplotlib.pyplot as plt

def optimize_ensemble_weights(X_meta_train, X_meta_test, y_train, n_trials=100):
    """
    Optimize ensemble weights for a given set of meta-features using Optuna.
    
    Parameters:
        X_meta_train (pd.DataFrame): OOF predictions (meta-features) for training.
        X_meta_test (pd.DataFrame): Predictions for the test set (meta-features).
        y_train (pd.Series): True target values for training.
        n_trials (int): Number of trials for Optuna optimization.
    
    Returns:
        dict: Optimized weights for each meta-model.
        np.ndarray: Final predictions on the test set.
    """
    # Define the objective function
    def objective(trial):
        num_models = X_meta_train.shape[1]
        weights = [trial.suggest_float(f"weight_{i}", 0, 1) for i in range(num_models)]
        weights = np.array(weights) / sum(weights)  # Normalize weights to sum to 1
        ensemble_preds = (X_meta_train.values * weights).sum(axis=1)
        rmse = np.sqrt(mean_squared_error(y_train, ensemble_preds))
        return rmse

    # Initialize and optimize the study
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=n_trials)

    # df = study.trials_dataframe()
    # print(df)  

    # Retrieve the best weights
    best_weights = study.best_params
    normalized_weights = np.array([best_weights[f"weight_{i}"] for i in range(len(best_weights))])
    normalized_weights /= normalized_weights.sum()

    # Generate predictions for the test set
    test_preds = (X_meta_test.values * normalized_weights).sum(axis=1)

    # Print optimization results
    print("Optimized Weights:", normalized_weights)
    print("Best RMSE:", study.best_value)

    # Plot optimization history
    plot = plot_optimization_history(study)
    plt.title("Optuna Optimization History")
    plt.show()

    return best_weights, test_preds

# Assuming meta_features (training meta-features), meta_predictions (test meta-features),
# and y_train (true target values) are already defined.
optimized_weights, test_predictions = optimize_ensemble_weights(
    X_meta_train=X_meta_train,
    X_meta_test=X_meta_test,
    y_train=y_train,
    n_trials=10
)

In [None]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from axyom_utilities.training import train_model_cv

# Neural Network class
class NeuralNetwork:
    def __init__(self, input_dim, learning_rate=0.001):
        self.model = Sequential([
            Dense(128, activation='relu', input_dim=input_dim),
            Dense(64, activation='relu'),
            Dense(1)
        ])
        self.model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mse')
    
    def fit(self, X_train, y_train, epochs=50, batch_size=32, verbose=0):
        self.model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=verbose)
    
    def predict(self, X):
        return self.model.predict(X).flatten()

# Initialize models
meta_models = {
    "Ridge": Ridge(alpha=1.0, random_state=42),
    "LightGBM": lgb.LGBMRegressor(random_state=42, verbose=-1),
    "NeuralNetwork": NeuralNetwork(input_dim=X_meta_train.shape[1])
}

# Train and evaluate models
results = {}
for name, model in meta_models.items():
    results = train_model_cv(model, X_meta_train, y_train, X_meta_test, cv_splits=10)
    
    oof_preds = results['oof_preds']
    test_preds = results['test_preds']
    score = np.mean(results['cv_scores'])
    
    results[name] = {"oof_preds": oof_preds, "test_preds": test_preds, "score":score}
    print(f"{name} OOF RMSE: {score:.6f}")

# Compare stacking methods
# for method, result in results.items():
#     print(f"{method}: OOF RMSE = {result['oof_rmse']:.4f}")
# Find the best model
best_model = max(results.items(), key=lambda x: x[1]["score"])
best_name, best_data = best_model

# Extract relevant details
best_oof_preds = best_data["oof_preds"]
best_test_preds = best_data["test_preds"]

print(f"Best Model Name: {best_name}")
print(f"OOF Predictions: {best_oof_preds}")
print(f"Test Predictions: {best_test_preds}")


# Submission

In [None]:
y_pred = np.maximum(0, np.expm1(test_preds))

submission = pd.DataFrame({
    'id': X_meta_test.index,  
    'Premium Amount': y_pred
})

FILE_PATH = f"Stacking_v3_{best_model_score:.4f}.csv"

submission.to_csv(FILE_PATH, index=False)