# Data Gathering

In [1]:
from pathlib import Path
from datetime import datetime
import sys


# Project root: two levels above the kernel's current working directory.
# NOTE(review): this depends on where the notebook is launched from — confirm
# the notebook is always started from its own folder.
base_dir = Path.cwd().resolve().parents[1]

# Project sub-directories used by the rest of the notebook.
data_dir = base_dir / "data"
model_dir = base_dir / "models"
notebooks_dir = base_dir / "notebooks"

# Make the project root importable. The membership test keeps this cell
# idempotent: re-running it no longer appends a duplicate sys.path entry.
if str(base_dir) not in sys.path:
    sys.path.append(str(base_dir))

# Must come after the sys.path tweak above, which is what makes the package
# under base_dir resolvable.
from axyom_utilities.data_extraction import extract_data

# log_transform=True: presumably the targets come back log1p-transformed (the
# submission cell applies np.expm1 to the predictions) — TODO confirm in
# axyom_utilities.data_extraction.
X_train, y_train, X_test, X_orig, y_orig = extract_data(data_dir, log_transform=True)


In [None]:
import pandas as pd
import glob
from pathlib import Path

def get_meta_data(meta_data_dir, oof_joker="*_oof.csv", test_joker="*_test.csv"):
    """Load per-model OOF and test prediction files into two aligned DataFrames.

    Every file matching ``oof_joker`` / ``test_joker`` inside ``meta_data_dir``
    is read with ``pd.read_csv`` and its columns are renamed to
    ``<model>_F<k>`` (k starting at 1), where ``<model>`` is the file stem with
    the ``_oof`` / ``_test`` markers removed. The per-file frames are then
    concatenated column-wise.

    OOF and test files are paired by model (the part of the file name matched
    by the joker's ``*``), not by their position in the sorted listings, so the
    same column always refers to the same model in both frames.

    Parameters:
        meta_data_dir (str | Path): Directory containing the prediction files.
        oof_joker (str): Glob pattern selecting the OOF prediction files.
        test_joker (str): Glob pattern selecting the test prediction files.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: ``(X_meta_train, X_meta_test)`` with
        identical column names, ordered by the sorted OOF file names.

    Raises:
        AssertionError: If the number of OOF and test files differs.
        ValueError: If no prediction file is found, or if the OOF and test
            files do not describe the same set of models.
    """
    meta_data_dir = Path(meta_data_dir)

    def model_name_of(file):
        # Name used in the column labels: file stem without the split markers.
        return Path(file).stem.replace("_oof", "").replace("_test", "")

    def pairing_key(file, joker):
        # Part of the file name matched by the joker's single "*", e.g. "CAT2"
        # for "CAT2_oof.csv" with "*_oof.csv". Falls back to the column-label
        # name when the joker is not of the simple "<head>*<tail>" form.
        head, star, tail = joker.partition("*")
        file_name = Path(file).name
        is_simple = bool(star) and not any(ch in head + tail for ch in "*?[")
        if is_simple and file_name.startswith(head) and file_name.endswith(tail):
            return file_name[len(head):len(file_name) - len(tail)]
        return model_name_of(file)

    # Sorted so that the column order is reproducible between runs.
    oof_files = sorted(glob.glob(str(meta_data_dir / oof_joker)))
    test_files = sorted(glob.glob(str(meta_data_dir / test_joker)))

    # Ensure the number of files matches
    assert len(oof_files) == len(test_files), "Mismatch in the number of OOF and test files."

    if not oof_files:
        raise ValueError(
            f"No prediction files matching '{oof_joker}' / '{test_joker}' found in '{meta_data_dir}'."
        )

    # Pair each OOF file with the test file of the same model. Sorting the two
    # listings independently is not enough: "A_oof" < "A_p_oof" but
    # "A_p_test" < "A_test", which would silently swap the test columns.
    oof_keys = [pairing_key(f, oof_joker) for f in oof_files]
    test_by_key = {pairing_key(f, test_joker): f for f in test_files}
    if len(test_by_key) != len(test_files) or set(oof_keys) != set(test_by_key):
        unmatched = sorted(set(oof_keys) ^ set(test_by_key))
        raise ValueError(
            f"OOF and test files do not describe the same models (unmatched: {unmatched})."
        )
    test_files = [test_by_key[key] for key in oof_keys]

    def load_and_prefix(files, prefix):
        # Read every file and label its columns "<prefix><model>_F<k>".
        dfs = []
        for file in files:
            model_name = model_name_of(file)
            df = pd.read_csv(file)

            if len(df) == 1600000:
                # TODO fix in the source, not here
                # Files with exactly 1,600,000 rows are cut down to their last
                # half (presumably the rows were written twice — verify).
                df = df.iloc[-len(df)//2:].reset_index(drop=True)

            df.columns = [f"{prefix}{model_name}_F{i+1}" for i in range(df.shape[1])]
            dfs.append(df)
        return pd.concat(dfs, axis=1)

    # Load OOF and test predictions with identical features
    X_meta_train = load_and_prefix(oof_files, prefix="")
    X_meta_test = load_and_prefix(test_files, prefix="")

    # Ensure the columns are identical between train and test
    X_meta_test.columns = X_meta_train.columns

    # Check the shapes
    print(f"Meta-train shape: {X_meta_train.shape}")
    print(f"Meta-test shape: {X_meta_test.shape}")

    return X_meta_train, X_meta_test

# Example usage (path relative to the project root):
#X_meta_train, X_meta_test = get_meta_data("notebooks/Ensemble Catboost XGBoost LGBM")

# mikail: every *_oof.csv / *_test.csv pair stored in the shared ensemble folder.
X_meta_train_mikail, X_meta_test_mikail = get_meta_data(notebooks_dir/"Ensemble Catboost XGBoost LGBM")

# Debug helper: preview the first rows of the loaded OOF frame.
#print(X_meta_train_mikail.head())

def file_exists(filepath):
    """Check that `filepath` points to an existing regular file.

    Returns None when the file exists.

    Raises:
        FileNotFoundError: If the path is missing or is not a file (for
            example a directory).
    """
    if Path(filepath).is_file():
        return
    raise FileNotFoundError(f"The file '{filepath}' does not exist.")

def get_oof_test(filename, column_name):
    """Read a prediction CSV and return it as a one-column DataFrame.

    Parameters:
        filename (str | Path): CSV file holding the predictions.
        column_name (str): Name given to the single returned column.

    Returns:
        pd.DataFrame: One column named `column_name`. When the file has
        several columns (e.g. an id column followed by the predictions) only
        the last column is kept.
    """
    df = pd.read_csv(filename)

    if df.shape[1] > 1:
        # If multiple columns, get last one. Indexing with a list keeps a
        # DataFrame; building pd.DataFrame(series, columns=[column_name])
        # instead looks the new name up in the Series and yields no data when
        # the Series is named differently.
        df = df.iloc[:, [-1]].copy()

    # Assign the provided column name
    df.columns = [column_name]
    return df

def _load_oof_test_pair(run_dir, column_name, oof_file="oof_preds.csv", test_file="test_preds.csv"):
    """Return (oof_frame, test_frame) for one model, both with a single column `column_name`."""
    return (
        get_oof_test(run_dir / oof_file, column_name),
        get_oof_test(run_dir / test_file, column_name),
    )

# Paths are built with pathlib joins instead of hard-coded "\\" separators so
# the same cell runs on Windows, Linux and macOS.

# Axyom
lgbm2_train, lgbm2_test = _load_oof_test_pair(
    notebooks_dir / "LGBM_v2" / "1.0313+-0.0008", "LGBM_v2_axyom"
)

autogluon_v2_train, autogluon_v2_test = _load_oof_test_pair(
    notebooks_dir / "Autogluon_v2" / "AutogluonModels" / "ag-20241225_165244", "autogluon_v2_axyom"
)

catboost_v2_train, catboost_v2_test = _load_oof_test_pair(
    notebooks_dir / "Catboost_nonlog_feature" / "1.0309+-0.0008", "catboost_v2_axyom"
)

xgboost_v2_train, xgboost_v2_test = _load_oof_test_pair(
    notebooks_dir / "XGBoost v2" / "1.0312+-0.0007", "xgboost_v2_axyom"
)

# arno
kaggle_dir = notebooks_dir / "Kaggle computed ensemble"

catboost_kaggle_train, catboost_kaggle_test = _load_oof_test_pair(
    kaggle_dir, "catboost_kaggle",
    oof_file="cleaned_catboost_oof_preds.csv", test_file="cleaned_catboost_test_preds.csv",
)

lgbm_kaggle_train, lgbm_kaggle_test = _load_oof_test_pair(
    kaggle_dir, "lgbm_kaggle",
    oof_file="cleaned_lgbm_oof_preds.csv", test_file="cleaned_lgbm_test_preds.csv",
)

xgb_kaggle_train, xgb_kaggle_test = _load_oof_test_pair(
    kaggle_dir, "xgb_kaggle",
    oof_file="cleaned_xgb_oof_preds.csv", test_file="cleaned_xgb_test_preds.csv",
)

# CONCATENATION
# Train and test frames are assembled in the same order so that column k
# refers to the same model in both.
X_meta_train = pd.concat([
    X_meta_train_mikail,
    lgbm2_train,
    autogluon_v2_train,
    catboost_v2_train,
    xgboost_v2_train,
    catboost_kaggle_train,
    lgbm_kaggle_train,
    xgb_kaggle_train
], axis=1)
X_meta_test = pd.concat([
    X_meta_test_mikail,
    lgbm2_test,
    autogluon_v2_test,
    catboost_v2_test,
    xgboost_v2_test,
    catboost_kaggle_test,
    lgbm_kaggle_test,
    xgb_kaggle_test
], axis=1)

# Downstream code applies weights by column position, so the two frames must
# expose exactly the same columns in the same order.
assert list(X_meta_train.columns) == list(X_meta_test.columns), "Meta train/test columns differ."


Meta-train shape: (1200000, 27)
Meta-test shape: (800000, 27)
    CAT2_F1   CAT3_F1   CAT4_F1   CAT5_F1   CAT6_F1   CAT8_F1   CAT9_F1  \
0  6.879656  6.878609  6.888915  6.880599  6.844798  6.880209  6.857543   
1  6.603727  6.604122  6.612402  6.590276  6.711615  6.600636  6.605473   
2  6.731330  6.727038  6.734683  6.747237  6.770649  6.727395  6.735930   
3  6.804395  6.809789  6.767255  6.807785  6.702834  6.790446  6.796255   
4  6.820363  6.830094  6.827049  6.822431  6.766598  6.821919  6.819712   

     CAT_F1   HGB2_F1   HGB3_F1  ...  LGBM8_F1  LGBM9_F1   LGBM_F1   XGB2_F1  \
0  6.879306  6.881209  6.880418  ...  6.897672  6.890580  6.856897  6.870620   
1  6.617835  6.598084  6.591714  ...  6.590937  6.592692  6.592067  6.607316   
2  6.713660  6.726840  6.729418  ...  6.728329  6.721480  6.737296  6.735967   
3  6.801755  6.807567  6.793407  ...  6.865108  6.831611  6.792099  6.835166   
4  6.824475  6.807433  6.807429  ...  6.822183  6.817229  6.803202  6.809719   

    XG

In [21]:
X_meta_train

Unnamed: 0,CAT2_F1,CAT3_F1,CAT4_F1,CAT5_F1,CAT6_F1,CAT8_F1,CAT9_F1,CAT_F1,HGB2_F1,HGB3_F1,...,XGB6_F1,XGB8_F1,XGB_F1,LGBM_v2_axyom,autogluon_v2_axyom,catboost_v2_axyom,xgboost_v2_axyom,catboost_kaggle,lgbm_kaggle,xgb_kaggle
0,6.879656,6.878609,6.888915,6.880599,6.844798,6.880209,6.857543,6.879306,6.881209,6.880418,...,6.878317,6.869698,6.879441,6.882895,6.865344,6.893105,6.867620,6.881234,6.842678,6.839812
1,6.603727,6.604122,6.612402,6.590276,6.711615,6.600636,6.605473,6.617835,6.598084,6.591714,...,6.592738,6.596825,6.595139,6.576740,6.599874,6.637723,6.597183,6.594889,6.592382,6.594367
2,6.731330,6.727038,6.734683,6.747237,6.770649,6.727395,6.735930,6.713660,6.726840,6.729418,...,6.738655,6.739345,6.754562,6.730502,6.735925,6.768459,6.721693,6.704542,6.714395,6.723536
3,6.804395,6.809789,6.767255,6.807785,6.702834,6.790446,6.796255,6.801755,6.807567,6.793407,...,6.836875,6.838369,6.800961,6.821496,6.794538,6.830385,6.843273,6.844987,6.818417,6.840056
4,6.820363,6.830094,6.827049,6.822431,6.766598,6.821919,6.819712,6.824475,6.807433,6.807429,...,6.809700,6.814489,6.823993,6.813567,6.800051,6.642096,6.812170,6.810781,6.817370,6.813779
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1199995,6.915602,6.925285,6.907171,6.918143,6.887570,6.914079,6.924232,6.944999,6.910259,6.908276,...,6.895965,6.895347,6.937392,6.915423,6.886085,6.879275,6.903190,6.910383,6.891430,6.894538
1199996,6.769553,6.772158,6.776487,6.777134,6.720663,6.775982,6.778171,6.742666,6.769451,6.770576,...,6.775882,6.769505,6.735462,6.771563,6.771649,6.783833,6.777412,6.776988,6.776798,6.783721
1199997,5.232226,5.242050,5.312352,5.254823,5.607344,5.234463,5.230846,5.174806,5.244530,5.228245,...,5.282940,5.249376,5.301820,5.238006,5.282425,5.111880,5.252008,5.224594,5.303931,5.331536
1199998,6.830614,6.817938,6.812025,6.809251,6.767027,6.824788,6.830950,6.850570,6.820464,6.816830,...,6.820324,6.807199,6.817419,6.784793,6.791085,6.872786,6.796462,6.828211,6.807533,6.812346


# Meta Training

In [None]:
import optuna
import numpy as np
from sklearn.metrics import mean_squared_error
from optuna.visualization.matplotlib import (
    plot_optimization_history, 
    plot_param_importances, 
    plot_slice
)
import matplotlib.pyplot as plt

def optimize_ensemble_weights(X_meta_train, X_meta_test, y_train, n_trials=100, seed=None):
    """
    Optimize ensemble weights for a given set of meta-features using Optuna.

    Each trial samples one weight in [0, 1] per column of `X_meta_train`,
    normalizes the weights to sum to 1 and scores the weighted average of the
    columns against `y_train` with RMSE. The best weights are then applied to
    `X_meta_test`.

    Side effects: prints the normalized weights and the best RMSE, and shows
    two matplotlib figures (optimization history, bar chart of the weights).

    Parameters:
        X_meta_train (pd.DataFrame): OOF predictions (meta-features) for training.
        X_meta_test (pd.DataFrame): Predictions for the test set (meta-features),
            with the same columns in the same order as `X_meta_train`.
        y_train (pd.Series): True target values for training.
        n_trials (int): Number of trials for Optuna optimization.
        seed (int | None): Seed for the TPE sampler. None (the default) leaves
            the sampler unseeded, as before.

    Returns:
        dict: `study.best_params`, i.e. the raw (NOT normalized) best weights
            keyed "weight_<column position>".
        np.ndarray: Final predictions on the test set (normalized weights applied).
        float: Best RMSE reached on the training meta-features.

    Raises:
        ValueError: If the two frames do not have the same number of columns.
    """
    # Fail before the (long) optimization instead of after it.
    if X_meta_train.shape[1] != X_meta_test.shape[1]:
        raise ValueError(
            f"X_meta_train has {X_meta_train.shape[1]} columns but X_meta_test has "
            f"{X_meta_test.shape[1]}; they must match."
        )

    # Extracted once: the objective below runs n_trials times.
    train_matrix = X_meta_train.values
    num_models = train_matrix.shape[1]

    # Define the objective function
    def objective(trial):
        weights = [trial.suggest_float(f"weight_{i}", 0, 1) for i in range(num_models)]
        weights = np.array(weights) / sum(weights)  # Normalize weights to sum to 1
        ensemble_preds = (train_matrix * weights).sum(axis=1)
        rmse = np.sqrt(mean_squared_error(y_train, ensemble_preds))
        return rmse

    # Initialize and optimize the study. TPESampler is Optuna's default
    # single-objective sampler; it is spelled out only to make it seedable.
    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(seed=seed),
    )
    study.optimize(objective, n_trials=n_trials)

    # Retrieve the best weights and normalize them the same way the objective did
    best_weights = study.best_params
    normalized_weights = np.array([best_weights[f"weight_{i}"] for i in range(len(best_weights))])
    normalized_weights /= normalized_weights.sum()

    # Generate predictions for the test set
    test_preds = (X_meta_test.values * normalized_weights).sum(axis=1)

    # Print optimization results
    print("Optimized Weights:", normalized_weights)
    print("Best RMSE:", study.best_value)

    # Plot optimization history
    plot_optimization_history(study)
    plt.title("Optuna Optimization History")
    plt.show()

    weight_mapping = dict(zip(X_meta_train.columns, normalized_weights))

    # Plot bar chart of weights
    plt.figure(figsize=(10, 6))
    plt.bar(list(weight_mapping.keys()), list(weight_mapping.values()), color="skyblue")
    plt.title("Optimized Weights for Meta-Models")
    plt.xlabel("Meta-Models")
    plt.ylabel("Weight")
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.show()

    return best_weights, test_preds, study.best_value

# X_meta_train / X_meta_test (stacked model predictions) and y_train (true
# target values) must already be defined by the cells above.
optimized_weights, test_preds, weight_ensembling_score = optimize_ensemble_weights(
    X_meta_train=X_meta_train,
    X_meta_test=X_meta_test,
    y_train=y_train,
    n_trials=750
)

[I 2024-12-27 23:56:04,177] A new study created in memory with name: no-name-7b259e9c-9391-4826-88d7-d0f3695f07ac
[I 2024-12-27 23:56:04,376] Trial 0 finished with value: 1.0309959705767697 and parameters: {'weight_0': 0.3262964355547947, 'weight_1': 0.4259467912548768, 'weight_2': 0.2808361779985227, 'weight_3': 0.43847556634827256, 'weight_4': 0.5425978611524185, 'weight_5': 0.5381287521734771, 'weight_6': 0.9134444070836266, 'weight_7': 0.11877048068395213, 'weight_8': 0.050354803190158615, 'weight_9': 0.005630069434837459, 'weight_10': 0.06707440091797112, 'weight_11': 0.5988261091469476, 'weight_12': 0.5652804711069039, 'weight_13': 0.30776501075913143, 'weight_14': 0.5616409942634241, 'weight_15': 0.5889534907603763, 'weight_16': 0.47413270183993006, 'weight_17': 0.7273744861127713, 'weight_18': 0.8386610006726508, 'weight_19': 0.06914000152139166, 'weight_20': 0.8073150150615858, 'weight_21': 0.6915834803554126, 'weight_22': 0.6481416593293355, 'weight_23': 0.22903881361805722, 

In [4]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from axyom_utilities.training import train_model_cv
from axyom_utilities.wrappers import RidgeRegressorWrapper, LGBMRegressorWrapper

# Neural Network class
class NeuralNetwork:
    """Small fully connected Keras regressor exposing `fit` and `predict`.

    Architecture: Dense(128, relu) -> Dense(64, relu) -> Dense(1), compiled
    with the Adam optimizer and mean-squared-error loss.
    """

    def __init__(self, input_dim, learning_rate=0.001):
        """Build and compile the network for `input_dim` input features."""
        layers = [
            Dense(128, activation='relu', input_dim=input_dim),
            Dense(64, activation='relu'),
            Dense(1),
        ]
        optimizer = Adam(learning_rate=learning_rate)
        self.model = Sequential(layers)
        self.model.compile(optimizer=optimizer, loss='mse')

    def fit(self, X_train, y_train, epochs=50, batch_size=32, verbose=0):
        """Train the underlying Keras model; returns None."""
        self.model.fit(
            X_train,
            y_train,
            epochs=epochs,
            batch_size=batch_size,
            verbose=verbose,
        )

    def predict(self, X):
        """Return the model's predictions for `X` as a flat 1-D array."""
        raw_output = self.model.predict(X)
        # flatten() already yields a 1-D array, so no further ravel is needed.
        return raw_output.flatten()

# Initialize models
meta_models = {
    "Ridge": RidgeRegressorWrapper(alpha=1.0, random_state=42),
    "LightGBM": LGBMRegressorWrapper(random_state=42, verbose=-1)
    #"NeuralNetwork": NeuralNetwork(input_dim=X_meta_train.shape[1])
}

# Train and evaluate models.
# The per-model values are stored only inside `results`. The loop deliberately
# does not assign the generic names `test_preds`, `oof_preds` or `score`:
# doing so overwrote the `test_preds` produced by the weight-optimization cell,
# so the submission could pair one model's predictions with another's score.
results = {}
for name, model in meta_models.items():
    cv_result = train_model_cv(model, X_meta_train, y_train, X_meta_test, cv_splits=10)

    # Mean of the per-fold scores returned under 'cv_scores'.
    mean_cv_score = np.mean(cv_result['cv_scores'])

    results[name] = {
        "oof_preds": cv_result['oof_preds'],
        "test_preds": cv_result['test_preds'],
        "score": mean_cv_score,
    }
    print(f"{name} OOF RMSE: {mean_cv_score:.6f}")

# Compare stacking methods
# for method, result in results.items():
#     print(f"{method}: OOF RMSE = {result['score']:.4f}")
# Find the best model (lowest RMSE, so use min rather than max)
# best_model = min(results.items(), key=lambda x: x[1]["score"])
# best_name, best_data = best_model

# # Extract relevant details
# best_oof_preds = best_data["oof_preds"]
# best_test_preds = best_data["test_preds"]

# print(f"Best Model Name: {best_name}")
# print(f"OOF Predictions: {best_oof_preds}")
# print(f"Test Predictions: {best_test_preds}")


Training fold 1...
Fold 1 RMSE: 1.0338
Training fold 2...
Fold 2 RMSE: 1.0287
Training fold 3...
Fold 3 RMSE: 1.0284
Training fold 4...
Fold 4 RMSE: 1.0325
Training fold 5...
Fold 5 RMSE: 1.0320
Training fold 6...
Fold 6 RMSE: 1.0271
Training fold 7...
Fold 7 RMSE: 1.0322
Training fold 8...
Fold 8 RMSE: 1.0312
Training fold 9...
Fold 9 RMSE: 1.0337
Training fold 10...
Fold 10 RMSE: 1.0273
Mean CV RMSE: 1.0307 ± 0.0025
LightGBM OOF RMSE: 1.030690


In [5]:
# Show the stacking results: one entry per meta-model holding its OOF
# predictions, test predictions and mean CV score.
results

{'LightGBM': {'oof_preds': array([6.86604487, 6.61481786, 6.73409579, ..., 5.26747052, 6.79981696,
         5.67555292]),
  'test_preds': array([6.78876534, 6.82375446, 6.70490667, ..., 6.75423886, 6.8195356 ,
         6.71713966]),
  'score': 1.0306895513430856}}

# Submission

In [17]:
# Choose what gets submitted. Uncomment the two lines below to submit the
# LightGBM stacker instead of the weighted blend.
# test_preds = results['LightGBM']['test_preds']
# score = results['LightGBM']['score']
# Bare reference: it is not the last expression of the cell, so nothing is
# displayed; it only raises NameError if `test_preds` is undefined.
# NOTE(review): `test_preds` is whichever value was assigned last in the
# kernel — make sure it comes from the weight-optimization cell so that it
# matches `score` below.
test_preds
score = weight_ensembling_score

In [18]:
# Map the predictions back with expm1 (presumably the inverse of the log
# transform requested from extract_data — verify) and floor them at zero.
restored_preds = np.expm1(test_preds)
y_pred = np.maximum(0, restored_preds)

# One row per test sample, identified by the index of X_test.
submission_columns = {
    'id': X_test.index,
    'Premium Amount': y_pred,
}
submission = pd.DataFrame(submission_columns)

# The score is embedded in the file name so submissions can be told apart.
FILE_PATH = f"Stacking_v3_{score:.4f}.csv"

submission.to_csv(FILE_PATH, index=False)