# Random Forest Rangers - Predicting Car Sales Prices
## Training the models

### Prerequisites

In [None]:
# Install dependencies
# !pip install -q -r requirements.txt

# On macOS you also need to run the following command
# brew install libomp

In [None]:
import sys
import time

def progress_bar(current, total, start_time, name, bar_width=30):
    """
    Prints a progress bar to the shell.

    The line is redrawn in place (leading carriage return) on every call and
    terminated with a newline once `current` equals `total`.

    Parameters:
    - current (int): Current iteration.
    - total (int): Total number of iterations. When it is zero or negative
      there is nothing to iterate over and the bar is drawn as complete.
    - start_time (float): Start time of the process (from `time.time()`).
    - name (string): Name of the run.
    - bar_width (int): Width of the progress bar (default: 30).
    """
    elapsed_time = time.time() - start_time

    # Avoid a ZeroDivisionError for empty runs and keep the ratio inside
    # [0, 1] so the bar can neither underflow nor grow past `bar_width`.
    if total > 0:
        progress = min(max(current / total, 0.0), 1.0)
    else:
        progress = 1.0

    completed = int(bar_width * progress)
    remaining = bar_width - completed
    progress_percent = int(progress * 100)

    # Without any progress no estimate is possible; show 0 instead of a
    # negative remaining time.
    if progress > 0:
        est_remaining_time = max(elapsed_time / progress - elapsed_time, 0.0)
    else:
        est_remaining_time = 0.0

    # Create the progress bar
    bar = f"[{'#' * completed}{'-' * remaining}] {progress_percent}%"

    # Display current progress and ETA
    sys.stdout.write(
        f"\r{bar} ({current}/{total}) | Elapsed: {elapsed_time:.2f}s | ETA: {est_remaining_time:.2f}s / {name} "
    )
    sys.stdout.flush()

    # Print a newline when done
    if current == total:
        sys.stdout.write("\n")

In [None]:
# Import necessary modules
import pandas as pd
import numpy as np

### Load Data

In [None]:
# Read the three pre-processed CSV files; after each read, print the column
# dtypes, the first ten rows and a separator line.
train_dataset_original = pd.read_csv('data/1_Preprocessing/train.csv')
print(
    "Training Dataset Original",
    train_dataset_original.dtypes,
    train_dataset_original.head(10),
    '-' * 50,
    sep="\n",
)

train_dataset_mixed = pd.read_csv('data/1_Preprocessing/train_generated_and_original.csv')
print(
    "Training Dataset Mixed",
    train_dataset_mixed.dtypes,
    train_dataset_mixed.head(10),
    '-' * 50,
    sep="\n",
)

test_dataset = pd.read_csv('data/1_Preprocessing/test.csv')
print(
    "Test Dataset",
    test_dataset.dtypes,
    test_dataset.head(10),
    '-' * 50,
    sep="\n",
)

### Remove target variable

In [None]:
# Separate the 'price' target from the remaining feature columns of each dataset
y_train_original = train_dataset_original['price']
X_train_original = train_dataset_original.drop(columns='price')

y_train_mixed = train_dataset_mixed['price']
X_train_mixed = train_dataset_mixed.drop(columns='price')

y_test = test_dataset['price']
X_test = test_dataset.drop(columns='price')

In [127]:

from itertools import product
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.preprocessing import RobustScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

def adjusted_r2(r2, n, p):
    """
    Calculate Adjusted R².

    Parameters:
    - r2 (float): Coefficient of determination.
    - n (int): Number of samples.
    - p (int): Number of predictors.

    Returns:
    - float: 1 - (1 - r2) * (n - 1) / (n - p - 1), or NaN when
      n - p - 1 <= 0, where the statistic is undefined (the plain formula
      would divide by zero or produce values above 1).
    """
    degrees_of_freedom = n - p - 1
    if degrees_of_freedom <= 0:
        return float("nan")
    return 1 - (1 - r2) * ((n - 1) / degrees_of_freedom)

def _select_rows(data, indices):
    """Return the rows of `data` at the given positional indices (pandas object or array-like)."""
    if hasattr(data, "iloc"):
        return data.iloc[indices]
    return np.asarray(data)[indices]


def _print_mean_metrics(result):
    """Print the averaged cross-validation metrics stored in one result entry."""
    print(f"\tMean RMSE: {result['mean_rmse']:.4f}")
    print(f"\tMean MSE: {result['mean_mse']:.4f}")
    print(f"\tMean MAE: {result['mean_mae']:.4f}")
    print(f"\tMean MAPE: {result['mean_mape']:.4f}")
    print(f"\tMean R²: {result['mean_r2']:.4f}")
    print(f"\tMean Adjusted R²: {result['mean_adj_r2']:.4f}")
    print(f"\tMean PCC: {result['mean_pcc']:.4f}")


def hyperparameter_tuning_with_cv(X, y, param_grid, model_class=LinearRegression, n_splits=5, random_state=42, verbose=True, polynomial=False, degree=2, name="Linear Regression"):
    """
    Perform hyperparameter tuning with K-Fold cross-validation.

    Every combination of the values in `param_grid` is evaluated with a
    shuffled K-Fold split. Inside each fold the features are optionally
    expanded with polynomial terms and then scaled with a RobustScaler that
    is fitted on the training part of the fold only. The combination with
    the highest mean R² is reported as the best one.

    Parameters:
    - X (DataFrame or ndarray): Feature matrix.
    - y (Series or ndarray): Target vector.
    - param_grid (dict): Hyperparameter grid as a dictionary of parameter lists.
    - model_class (class): Model class to be instantiated (default: LinearRegression).
    - n_splits (int): Number of splits for K-Fold cross-validation.
    - random_state (int): Random state for reproducibility.
    - verbose (bool): Print the metrics of every combination; if False a
      progress bar is shown instead.
    - polynomial (bool): Whether to apply a polynomial feature expansion to the dataset.
    - degree (int): Degree of the polynomial features (default: 2).
    - name (string): Name of the run (default: "Linear Regression").

    Returns:
    - dict: Best hyperparameters and performance metrics.
    - list: All results for each hyperparameter combination.

    Raises:
    - ValueError: If `param_grid` yields no hyperparameter combination
      (i.e. one of its value lists is empty).
    """
    # Generate all combinations of hyperparameters
    param_array = [dict(zip(param_grid.keys(), combo)) for combo in product(*param_grid.values())]
    if not param_array:
        raise ValueError("param_grid does not yield any hyperparameter combination")

    # For tracking the progress in the shell
    total_combinations = len(param_array)
    start_time = time.time()

    # Collected results, one entry per hyperparameter combination
    results = []

    # Scalers and transformers (re-fitted on the training part of every fold)
    scaler = RobustScaler()
    poly_transformer = PolynomialFeatures(degree=degree, include_bias=False) if polynomial else None

    # The splitter does not depend on the hyperparameters, so build it once
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Order matters: it defines the key order of the result dictionaries
    metric_names = ('mse', 'rmse', 'mae', 'mape', 'r2', 'adj_r2', 'pcc')

    # Iterate over all hyperparameter combinations
    for index, params in enumerate(param_array):
        if verbose:
            print(f"Testing hyperparameters: {params}")
        else:
            progress_bar(index, total_combinations, start_time, name)

        fold_scores = {metric: [] for metric in metric_names}

        # Perform K-Fold cross-validation
        for train_index, test_index in kf.split(X):
            # Split the data (works for pandas objects and plain arrays)
            X_train_fold, X_test_fold = _select_rows(X, train_index), _select_rows(X, test_index)
            y_train_fold, y_test_fold = _select_rows(y, train_index), _select_rows(y, test_index)

            # Apply polynomial feature transformation if specified
            if polynomial:
                X_train_fold = poly_transformer.fit_transform(X_train_fold)
                X_test_fold = poly_transformer.transform(X_test_fold)

            # Scale the data
            X_train_fold = scaler.fit_transform(X_train_fold)
            X_test_fold = scaler.transform(X_test_fold)

            # Create a new model for this fold with the current hyperparameters
            model = model_class(**params)
            model.fit(X_train_fold, y_train_fold)

            # Predict on the test fold
            y_pred = model.predict(X_test_fold)

            # Calculate metrics
            mse = mean_squared_error(y_test_fold, y_pred)
            # R² can be negative, make it 0 in this case
            r2 = max(0, r2_score(y_test_fold, y_pred))
            # Adjusted R² is derived from the clamped R² and clamped again
            adj_r2 = max(0, adjusted_r2(r2, len(y_test_fold), X_test_fold.shape[1]))
            pcc, _ = pearsonr(y_test_fold, y_pred)

            # Store results for this fold
            fold_scores['mse'].append(mse)
            fold_scores['rmse'].append(np.sqrt(mse))
            fold_scores['mae'].append(mean_absolute_error(y_test_fold, y_pred))
            fold_scores['mape'].append(mean_absolute_percentage_error(y_test_fold, y_pred))
            fold_scores['r2'].append(r2)
            fold_scores['adj_r2'].append(adj_r2)
            fold_scores['pcc'].append(pcc)

        # Calculate mean metrics and store results
        result = {'params': params}
        result.update((f'mean_{metric}', np.mean(scores)) for metric, scores in fold_scores.items())
        results.append(result)

        if verbose:
            print(f"Metrics for {params}:")
            _print_mean_metrics(result)

    if not verbose:
        progress_bar(total_combinations, total_combinations, start_time, name)

    # Select the best hyperparameters based on R²
    best_result = max(results, key=lambda x: x['mean_r2'])
    print(f"\nBest hyperparameters: {best_result['params']} with:")
    _print_mean_metrics(best_result)

    return best_result, results

In [128]:
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns

# Visualization
def plot_results(y_true, y_pred):
    """
    Show four diagnostic figures for a set of regression predictions.

    Parameters:
    - y_true (array-like): Ground truth target values.
    - y_pred (array-like): Predicted target values.

    The figures are, in order: actual vs. predicted values with the identity
    line, residuals vs. predicted values, a histogram of the residuals and a
    normal Q-Q plot of the residuals.
    """
    figure_size = (10, 6)

    # 1. Actual vs Predicted
    plt.figure(figsize=figure_size)
    sns.scatterplot(x=y_true, y=y_pred, alpha=0.7, s=60)
    identity = [min(y_true), max(y_true)]
    plt.plot(identity, identity, 'r--', linewidth=2)
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.title('Actual vs Predicted with Regression Line')
    plt.show()

    # 2. Residuals
    residuals = y_true - y_pred
    plt.figure(figsize=figure_size)
    sns.scatterplot(x=y_pred, y=residuals, alpha=0.7)
    plt.axhline(0, color='red', linestyle='--', linewidth=2)
    plt.xlabel('Predicted Values')
    plt.ylabel('Residuals')
    plt.title('Residual Plot')
    plt.show()

    # 3. Residual Histogram
    plt.figure(figsize=figure_size)
    sns.histplot(residuals, kde=True, bins=30, color='blue', alpha=0.7)
    plt.axvline(0, color='red', linestyle='--', linewidth=2)
    plt.xlabel('Residuals')
    plt.ylabel('Frequency')
    plt.title('Histogram of Residuals')
    plt.show()

    # 4. Q-Q Plot
    plt.figure(figsize=figure_size)
    stats.probplot(residuals, dist="norm", plot=plt)
    plt.title('Q-Q Plot of Residuals')
    plt.show()

In [129]:
def plot_mileage_vs_price(mileage, actual_price, predicted_price, model_name="Model"):
    """
    Plots mileage vs. price as two overlaid scatter series: actual and predicted values.

    Args:
    - mileage (array-like): Feature values (e.g., mileage).
    - actual_price (array-like): Ground truth target values (e.g., price).
    - predicted_price (array-like): Predicted target values from the model.
    - model_name (str): Name of the model for the plot title.
    """
    plt.figure(figsize=(10, 6))

    # Scatter plot for actual vs. predicted
    plt.scatter(mileage, actual_price, color="blue", label="Actual values", alpha=0.6)
    plt.scatter(mileage, predicted_price, color="red", label="Predicted values", alpha=0.6)

    # Labeling
    plt.title(f"{model_name}: Mileage vs. Price", fontsize=16)
    plt.xlabel("Mileage", fontsize=14)
    plt.ylabel("Price", fontsize=14)
    plt.legend(fontsize=12)
    plt.grid(True)
    plt.show()

### Model Training

#### Linear Regression

In [None]:
# Plain linear regression only exposes the intercept switch for tuning
param_grid = {'fit_intercept': [True, False]}

#################
# Original data #
#################

# Cross-validated search on the original training data
best_result_linear_original, all_results_linear_original = hyperparameter_tuning_with_cv(
    X_train_original,
    y_train_original,
    param_grid,
    verbose=False,
    name="LR - Original",
)
best_params_linear_original = best_result_linear_original['params']

# Work on copies of the training and test data
X_train_original_copy, y_train_original_copy = X_train_original.copy(), y_train_original.copy()
X_test_copy, y_test_copy = X_test.copy(), y_test.copy()

# Fit the scaler on the training data and apply it to the test data
scaler = RobustScaler()
X_train_scaled_original = scaler.fit_transform(X_train_original_copy)
X_test_scaled_original = scaler.transform(X_test_copy)

# Train the final model with the best hyperparameters found above
final_linear_model_original = LinearRegression(**best_params_linear_original).fit(
    X_train_scaled_original, y_train_original_copy
)

# Generate predictions for evaluation
y_pred_linear_original = final_linear_model_original.predict(X_test_scaled_original)
plot_mileage_vs_price(
    mileage=X_test['milage'],
    actual_price=y_test,
    predicted_price=y_pred_linear_original,
    model_name="Linear Regression - Original",
)

#################
#   Mixed data  #
#################

# Cross-validated search on the mixed (generated + original) training data
best_result_linear_mixed, all_results_linear_mixed = hyperparameter_tuning_with_cv(X_train_mixed, y_train_mixed, param_grid, verbose=False, name="LR - Mixed")
best_params_linear_mixed = best_result_linear_mixed['params']
scaler = RobustScaler()

X_train_mixed_copy = X_train_mixed.copy()
y_train_mixed_copy = y_train_mixed.copy()
X_test_copy = X_test.copy()
y_test_copy = y_test.copy()

# Fit the scaler on the mixed training data and apply it to the test data
X_train_scaled_mixed = scaler.fit_transform(X_train_mixed_copy)
X_test_scaled_mixed = scaler.transform(X_test_copy)

final_linear_model_mixed = LinearRegression(**best_params_linear_mixed)
final_linear_model_mixed.fit(X_train_scaled_mixed, y_train_mixed_copy)

# Generate predictions for evaluation. The test features must be scaled with
# the scaler fitted on the mixed training data, i.e. the one the model was
# trained with.
y_pred_linear_mixed = final_linear_model_mixed.predict(X_test_scaled_mixed)
plot_mileage_vs_price(
    mileage=X_test['milage'],
    actual_price=y_test,
    predicted_price=y_pred_linear_mixed,
    model_name="Linear Regression - Mixed"
)

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

param_grid = {
    'fit_intercept': [True, False]
}

#################
# Original data #
#################

# Cross-validated search with polynomial feature expansion enabled
best_result_poly_original, all_results_poly_original = hyperparameter_tuning_with_cv(X_train_original, y_train_original, param_grid, polynomial=True, verbose=False, name="LR - Poly - Original")
best_params_poly_original = best_result_poly_original['params']

scaler = RobustScaler()
# Same expansion as the one applied during tuning (degree 2, no bias column)
poly = PolynomialFeatures(degree=2, include_bias=False)

X_train_original_copy = X_train_original.copy()
y_train_original_copy = y_train_original.copy()
X_test_copy = X_test.copy()
y_test_copy = y_test.copy()

# Apply polynomial feature transformation
X_train_poly = poly.fit_transform(X_train_original_copy)
X_test_poly = poly.transform(X_test_copy)

# Scale the polynomial features
X_train_scaled_poly = scaler.fit_transform(X_train_poly)
X_test_scaled_poly = scaler.transform(X_test_poly)

# Train on the scaled polynomial features; fitting on the plain scaled
# features would silently yield an ordinary linear model.
final_model_poly_original = LinearRegression(**best_params_poly_original)
final_model_poly_original.fit(X_train_scaled_poly, y_train_original_copy)

# Generate predictions for evaluation
y_pred_poly_original = final_model_poly_original.predict(X_test_scaled_poly)
plot_mileage_vs_price(
    mileage=X_test['milage'],
    actual_price=y_test,
    predicted_price=y_pred_poly_original,
    model_name="Linear Regression with Poly - Original"
)

#################
#   Mixed data  #
#################

# Cross-validated search with polynomial feature expansion on the mixed data
best_result_mixed, all_results_mixed = hyperparameter_tuning_with_cv(X_train_mixed, y_train_mixed, param_grid, polynomial=True, verbose=False, name="LR - Poly - Mixed")

best_params_poly_mixed = best_result_mixed['params']
scaler = RobustScaler()
# Fresh transformer so this section does not depend on state from the section above
poly = PolynomialFeatures(degree=2, include_bias=False)

X_train_mixed_copy = X_train_mixed.copy()
y_train_mixed_copy = y_train_mixed.copy()
X_test_copy = X_test.copy()
y_test_copy = y_test.copy()

# Apply polynomial feature transformation
X_train_poly = poly.fit_transform(X_train_mixed_copy)
X_test_poly = poly.transform(X_test_copy)

# Scale the polynomial features
X_train_scaled_poly = scaler.fit_transform(X_train_poly)
X_test_scaled_poly = scaler.transform(X_test_poly)

# Train on the scaled polynomial features of the mixed data
final_model_poly_mixed = LinearRegression(**best_params_poly_mixed)
final_model_poly_mixed.fit(X_train_scaled_poly, y_train_mixed_copy)

# Generate predictions for evaluation and plot the predictions of this
# (mixed) model, not those of the model trained on the original data.
y_pred_poly_mixed = final_model_poly_mixed.predict(X_test_scaled_poly)
plot_mileage_vs_price(
    mileage=X_test['milage'],
    actual_price=y_test,
    predicted_price=y_pred_poly_mixed,
    model_name="Linear Regression with Poly - Mixed"
)

#### Random Forest Regression

In [None]:
from sklearn.ensemble import RandomForestRegressor

param_grid = {
    'n_estimators': [10, 25, 50],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2'],
}

#################
# Original data #
#################

# Pass an explicit run name; otherwise the progress bar falls back to the
# default label "Linear Regression".
best_result_rfr_original, all_results_rfr_original = hyperparameter_tuning_with_cv(
    X=X_train_original, y=y_train_original, param_grid=param_grid,
    model_class=RandomForestRegressor, verbose=False, name="RFR - Original"
)
best_params_rfr_original = best_result_rfr_original['params']
scaler = RobustScaler()

X_train_original_copy = X_train_original.copy()
y_train_original_copy = y_train_original.copy()
X_test_copy = X_test.copy()
y_test_copy = y_test.copy()

# Fit the scaler on the training data and apply it to the test data
X_train_scaled_original = scaler.fit_transform(X_train_original_copy)
X_test_scaled_original = scaler.transform(X_test_copy)
final_model_rfr_original = RandomForestRegressor(**best_params_rfr_original)
final_model_rfr_original.fit(X_train_scaled_original, y_train_original_copy)

# Generate predictions for evaluation
y_pred_rfr_original = final_model_rfr_original.predict(X_test_scaled_original)
plot_mileage_vs_price(
    mileage=X_test['milage'],
    actual_price=y_test,
    predicted_price=y_pred_rfr_original,
    model_name="Random Forest Regressor - Original"
)

#################
#   Mixed data  #
#################

# Pass an explicit run name; otherwise the progress bar falls back to the
# default label "Linear Regression".
best_result_rfr_mixed, all_results_rfr_mixed = hyperparameter_tuning_with_cv(
    X=X_train_mixed, y=y_train_mixed, param_grid=param_grid,
    model_class=RandomForestRegressor, verbose=False, name="RFR - Mixed"
)
best_params_rfr_mixed = best_result_rfr_mixed['params']
scaler = RobustScaler()

X_train_mixed_copy = X_train_mixed.copy()
y_train_mixed_copy = y_train_mixed.copy()
X_test_copy = X_test.copy()
y_test_copy = y_test.copy()

# Fit the scaler on the mixed training data and apply it to the test data
X_train_scaled_mixed = scaler.fit_transform(X_train_mixed_copy)
X_test_scaled_mixed = scaler.transform(X_test_copy)

# Keep the mixed-data model under its own name so that the model trained on
# the original data is not overwritten.
final_model_rfr_mixed = RandomForestRegressor(**best_params_rfr_mixed)
final_model_rfr_mixed.fit(X_train_scaled_mixed, y_train_mixed_copy)

# Generate predictions for evaluation (test features scaled with the scaler
# fitted on the mixed training data)
y_pred_rfr_mixed = final_model_rfr_mixed.predict(X_test_scaled_mixed)
plot_mileage_vs_price(
    mileage=X_test['milage'],
    actual_price=y_test,
    predicted_price=y_pred_rfr_mixed,
    model_name="Random Forest Regressor - Mixed"
)

#### XGBoost

In [None]:
from xgboost import XGBRegressor

# Search space for the XGBoost regressor
param_grid = dict(
    n_estimators=[10, 25, 50, 100, 200],
    max_depth=[3, 6, 10],
    learning_rate=[0.1, 0.3],
    subsample=[0.8],
    colsample_bytree=[0.8],
    min_child_weight=[1, 3],
)

#################
# Original data #
#################

# Cross-validated search on the original training data
best_result_xg_original, all_results_xg_original = hyperparameter_tuning_with_cv(
    X=X_train_original,
    y=y_train_original,
    param_grid=param_grid,
    model_class=XGBRegressor,
    verbose=False,
    name="XGBoost - Original",
)
best_params_xg_original = best_result_xg_original['params']

# Work on copies of the training and test data
X_train_original_copy, y_train_original_copy = X_train_original.copy(), y_train_original.copy()
X_test_copy, y_test_copy = X_test.copy(), y_test.copy()

# Fit the scaler on the training data and apply it to the test data
scaler = RobustScaler()
X_train_scaled_original = scaler.fit_transform(X_train_original_copy)
X_test_scaled_original = scaler.transform(X_test_copy)

# Train the final model with the best hyperparameters found above
final_model_xg_original = XGBRegressor(**best_params_xg_original)
final_model_xg_original.fit(X_train_scaled_original, y_train_original_copy)

# Generate predictions for evaluation
y_pred_xg_original = final_model_xg_original.predict(X_test_scaled_original)
plot_mileage_vs_price(
    mileage=X_test['milage'],
    actual_price=y_test,
    predicted_price=y_pred_xg_original,
    model_name="XGBoost - Original",
)

#################
#   Mixed data  #
#################

# Cross-validated search on the mixed (generated + original) training data
best_result_xg_mixed, all_results_xg_mixed = hyperparameter_tuning_with_cv(
    X=X_train_mixed, y=y_train_mixed, param_grid=param_grid,
    model_class=XGBRegressor, verbose=False, name="XGBoost - Mixed"
)
best_params_xg_mixed = best_result_xg_mixed['params']
scaler = RobustScaler()

X_train_mixed_copy = X_train_mixed.copy()
y_train_mixed_copy = y_train_mixed.copy()
X_test_copy = X_test.copy()
y_test_copy = y_test.copy()

# Fit the scaler on the mixed training data and apply it to the test data
X_train_scaled_mixed = scaler.fit_transform(X_train_mixed_copy)
X_test_scaled_mixed = scaler.transform(X_test_copy)
final_model_xg_mixed = XGBRegressor(**best_params_xg_mixed)
final_model_xg_mixed.fit(X_train_scaled_mixed, y_train_mixed_copy)

# Generate predictions for evaluation. The test features must be scaled with
# the scaler fitted on the mixed training data, i.e. the one the model was
# trained with.
y_pred_xg_mixed = final_model_xg_mixed.predict(X_test_scaled_mixed)
plot_mileage_vs_price(
    mileage=X_test['milage'],
    actual_price=y_test,
    predicted_price=y_pred_xg_mixed,
    model_name="XGBoost - Mixed"
)

In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold

# Initialize the XGBoost regressor
# NOTE(review): this module-level `model` is not referenced again in this
# cell; training_XGBoost() works on the estimator passed to it and the run
# below passes `initial_model`. Confirm nothing else uses it before removing.
model = XGBRegressor(objective='reg:squarederror', random_state=42)

def training_XGBoost(initial_model, y, X):
    """
    Train and evaluate a regressor with 5-fold cross-validation.

    For every fold an unfitted clone of `initial_model` is trained, so the
    folds do not share state, `initial_model` itself is left unfitted and the
    returned models are five distinct objects.

    Parameters:
    - initial_model: scikit-learn compatible estimator (e.g. XGBRegressor)
      that serves as the template for the per-fold models.
    - y (Series): Target vector (indexed positionally via `.iloc`).
    - X (DataFrame): Feature matrix (indexed positionally via `.iloc`).

    Returns:
    - dict: Lists with one entry per fold under the keys 'mse', 'r2',
      'adjusted_r2', 'mape' (in percent), 'mae' and 'model'.
    """
    # Local import: `clone` is only needed here
    from sklearn.base import clone

    mse_scores = []          # Store MSE for each fold
    r2_scores = []           # Store R² for each fold
    adjusted_r2_scores = []  # Store Adjusted R² for each fold
    mape_scores = []         # Store MAPE for each fold
    mae_scores = []          # Store MAE for each fold
    model_list = []          # Store the models for each fold

    p = X.shape[1]  # Number of predictors

    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    for run, (train_index, val_index) in enumerate(kf.split(X), start=1):
        # Split the data
        X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
        y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]

        # Re-initialize the model for each fold: same hyperparameters, no
        # fitted state carried over from previous folds
        model = clone(initial_model)

        # Train the model
        model.fit(X_train_fold, y_train_fold)

        # Predict on the validation fold
        y_pred = model.predict(X_val_fold)

        # Calculate MSE
        mse = mean_squared_error(y_val_fold, y_pred)
        mse_scores.append(mse)

        # Calculate R²
        r2 = r2_score(y_val_fold, y_pred)

        # Negative R² is possible, however, this is treated as R² = 0
        if r2 < 0:
            r2 = 0
        r2_scores.append(r2)

        # Calculate Adjusted R² (undefined when the fold has too few samples
        # for the number of predictors; report NaN instead of dividing by zero)
        n_fold = len(y_val_fold)  # Number of samples in this fold
        degrees_of_freedom = n_fold - p - 1
        if degrees_of_freedom > 0:
            adjusted_r2 = 1 - ((1 - r2) * (n_fold - 1)) / degrees_of_freedom
        else:
            adjusted_r2 = float('nan')
        adjusted_r2_scores.append(adjusted_r2)

        # Calculate MAPE (in percent)
        mape = np.mean(np.abs((y_val_fold - y_pred) / y_val_fold)) * 100
        mape_scores.append(mape)

        # Calculate MAE
        mae = np.mean(np.abs(y_val_fold - y_pred))
        mae_scores.append(mae)

        print(f"Run {run}:\nMSE: {mse}\nRMSE: {np.sqrt(mse)}\nR2: {r2}\nAdjusted R2: {adjusted_r2}\n")

        # Store the model
        model_list.append(model)

    dict_of_results = {
        'mse': mse_scores,
        'r2': r2_scores,
        'adjusted_r2': adjusted_r2_scores,
        'mape': mape_scores,
        'mae': mae_scores,
        'model': model_list
    }

    return dict_of_results


# Run the 5-fold cross-validation with a default XGBoost regressor
initial_model = XGBRegressor(objective='reg:squarederror', random_state=42)
results = training_XGBoost(initial_model, y_train_original, X_train_original)

mse_scores = results['mse']
r2_scores = results['r2']
adjusted_r2_scores = results['adjusted_r2']
# Do not bind this list to the name `mean_absolute_percentage_error`: that
# would overwrite the sklearn.metrics function of the same name, which
# hyperparameter_tuning_with_cv() calls.
mape_scores = results['mape']
mae_scores = results['mae']

# Average metrics across folds
avg_mse = sum(mse_scores) / len(mse_scores)
avg_r2 = sum(r2_scores) / len(r2_scores)
avg_adjusted_r2 = sum(adjusted_r2_scores) / len(adjusted_r2_scores)
avg_mape = sum(mape_scores) / len(mape_scores)
avg_mae = sum(mae_scores) / len(mae_scores)

print(f"Average MSE: {avg_mse:.2f}")
print(f"Average R²: {avg_r2:.2f}")
print(f"Average Adjusted R²: {avg_adjusted_r2:.2f}")
print(f"Average MAPE: {avg_mape:.2f}")
print(f"Average MAE: {avg_mae:.2f}")
