In [1]:
import os
import numpy as np
import random
import warnings
import pandas as pd
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from skopt import BayesSearchCV
from skopt.space import Integer, Real

# Reproducibility: pin the Python and NumPy global RNGs to the same seed.
# NOTE: PYTHONHASHSEED is read when an interpreter starts, so assigning it
# here only reaches child processes launched afterwards; the hash seed of
# the already-running interpreter is not changed.
os.environ['PYTHONHASHSEED'] = '1'
random.seed(1)
np.random.seed(1)
# Silence every warning category for the remainder of the run.
warnings.filterwarnings('ignore')

# 1. Data Loading
# The last column of the sheet is the regression target; every other column
# is treated as a feature.
data = pd.read_excel('machine.xlsx')  # Ensure the file path is correct
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Ensure data is numeric.
# ``X.dtypes`` is a Series holding one dtype per column, so the check has to
# be done per column; handing the whole Series to ``np.issubdtype`` does not
# inspect the column dtypes at all.
if not all(pd.api.types.is_numeric_dtype(col_dtype) for col_dtype in X.dtypes):
    # Convert to numeric; values that cannot be parsed become NaN and are
    # rejected by the validity check below.
    X = X.apply(pd.to_numeric, errors='coerce')
if not pd.api.types.is_numeric_dtype(y):
    y = pd.to_numeric(y, errors='coerce')

# Check for invalid values.
# ``isnull`` only detects NaN/None, so ``np.isfinite`` is used to reject
# +/-inf as well, matching what the error message promises.
features_finite = np.isfinite(
    X.to_numpy(dtype=float, na_value=np.nan)).all()
target_finite = np.isfinite(
    y.to_numpy(dtype=float, na_value=np.nan)).all()
if not (features_finite and target_finite):
    raise ValueError(
        "Dataset contains invalid values (NaN or inf). Please check your data.")

# 2. Data Preprocessing

# Split into training and testing datasets BEFORE fitting the scaler, so the
# test rows never contribute to the mean/variance used for standardization
# (fitting on the full dataset leaks test-set statistics into training).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1)

# Standardize features using statistics learned from the training rows only,
# then apply that same transform to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so that any code still referring to ``X_scaled`` continues to work;
# it is the full feature matrix transformed with the train-fitted scaler.
X_scaled = scaler.transform(X)

# 3. Parameter Optimization (Bayesian Optimization)
# The search space is kept deliberately narrow to limit overfitting:
#   learning_rate     - reduced range
#   max_iter          - reduced number of boosting iterations
#   max_depth         - restricted tree depth
#   min_samples_leaf  - larger leaf requirement
#   l2_regularization - added regularization
hist_gb_model = HistGradientBoostingRegressor(random_state=1)
param_spaces = dict(
    learning_rate=Real(0.01, 0.1, prior='uniform'),
    max_iter=Integer(50, 150),
    max_depth=Integer(3, 10),
    min_samples_leaf=Integer(5, 20),
    l2_regularization=Real(0.1, 1.0, prior='uniform'),
)

# 32 search iterations, each scored by 5-fold cross-validated negative MSE.
optimizer = BayesSearchCV(
    estimator=hist_gb_model,
    search_spaces=param_spaces,
    scoring='neg_mean_squared_error',
    n_iter=32,
    cv=5,
    random_state=1,
)
optimizer.fit(X_train, y_train)

# 4. Model Training with Best Parameters
best_params = optimizer.best_params_
print(f'Best parameters: {best_params}')
# The search above is created with the default ``refit=True``, so it has
# already re-trained a copy of the estimator with the best parameters on the
# whole training set. Reuse that fitted model instead of training an
# identical one a second time.
# NOTE: if the search is ever configured with ``refit=False``,
# ``best_estimator_`` will not exist and an explicit fit is needed here.
hist_gb_optimized = optimizer.best_estimator_

# 5. Model Evaluation


def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Score a fitted model on the training and test splits.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training targets.
        y_test: Test targets.

    Returns:
        dict mapping ``'<Train|Test> <RMSE|R^2|MAE>'`` to the metric value,
        ordered by metric (RMSE, R^2, MAE) and, within each metric, Train
        before Test.
    """
    # Predict once per split and keep (truth, prediction) pairs together.
    pairs = {
        'Train': (y_train, model.predict(X_train)),
        'Test': (y_test, model.predict(X_test)),
    }

    def _rmse(truth, predicted):
        # Root of the mean squared error.
        return np.sqrt(mean_squared_error(truth, predicted))

    scorers = (
        ('RMSE', _rmse),
        ('R^2', r2_score),
        ('MAE', mean_absolute_error),
    )

    return {
        f'{split} {label}': scorer(truth, predicted)
        for label, scorer in scorers
        for split, (truth, predicted) in pairs.items()
    }


# Compute all train/test metrics for the tuned model and print one per line.
results = evaluate_model(
    hist_gb_optimized,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)
for metric, value in results.items():
    print(metric, value, sep=': ')

# 6. Export Results


def export_results(y_train, y_pred_train, y_test, y_pred_test, filename='Results_HistGB.xlsx'):
    """Write true vs. predicted targets for both splits to an Excel workbook.

    The workbook gets two sheets, ``'Train Results'`` and ``'Test Results'``,
    each with the columns ``y_true`` and ``y_pred`` and no index column.

    Args:
        y_train: True training targets (array-like).
        y_pred_train: Predicted training targets (array-like).
        y_test: True test targets (array-like).
        y_pred_test: Predicted test targets (array-like).
        filename: Path of the workbook to create.

    Raises:
        ValueError: If a true/predicted pair differs in length. This is
            raised before the output file is opened.
    """
    def _paired_frame(y_true, y_pred):
        # Flatten to plain 1-D arrays so rows are paired strictly by
        # position; pandas objects carrying different indexes would
        # otherwise be aligned by label and silently mis-paired.
        return pd.DataFrame({
            'y_true': np.asarray(y_true).ravel(),
            'y_pred': np.asarray(y_pred).ravel(),
        })

    # Build both frames up front so invalid input fails before any file
    # is created.
    train_results_df = _paired_frame(y_train, y_pred_train)
    test_results_df = _paired_frame(y_test, y_pred_test)

    with pd.ExcelWriter(filename) as writer:
        train_results_df.to_excel(
            writer, sheet_name='Train Results', index=False)
        test_results_df.to_excel(
            writer, sheet_name='Test Results', index=False)


# Persist true vs. predicted targets for both splits using the tuned model;
# the workbook name is left at the function's default.
export_results(
    y_train=y_train,
    y_pred_train=hist_gb_optimized.predict(X_train),
    y_test=y_test,
    y_pred_test=hist_gb_optimized.predict(X_test),
)



Best parameters: OrderedDict([('l2_regularization', 1.0), ('learning_rate', 0.01), ('max_depth', 8), ('max_iter', 116), ('min_samples_leaf', 5)])
Train RMSE: 0.9819217617321508
Test RMSE: 1.371661815688947
Train R^2: 0.8206873660911337
Test R^2: 0.7997442300479629
Train MAE: 0.7460648242606238
Test MAE: 1.2219722384209593
