# In [1]:
import os
import numpy as np
import random
import warnings
import pandas as pd
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from skopt import BayesSearchCV
from skopt.space import Integer, Real

# Pin every random number source the script relies on so repeated runs match.
# NOTE: PYTHONHASHSEED is read when an interpreter starts, so assigning it here
# only reaches processes launched after this point, not the running one.
SEED = 1
os.environ['PYTHONHASHSEED'] = f'{SEED}'
random.seed(SEED)
np.random.seed(SEED)

# Silence every warning category for the remainder of the run.
warnings.filterwarnings("ignore")

# 1. Data Loading
# The last column of the spreadsheet is used as the regression target and
# every other column as a feature.
data = pd.read_excel('machine.xlsx')  # Ensure the file path is correct
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Ensure data is numeric
# Coerce unconditionally: `X.dtypes` is a Series of per-column dtypes, so it
# cannot be handed to np.issubdtype as a single dtype. Coercion leaves columns
# that are already numeric untouched and turns unparseable entries into NaN,
# which the validation below rejects.
X = X.apply(pd.to_numeric, errors='coerce')
y = pd.to_numeric(y, errors='coerce')

# Check for invalid values
# np.isfinite is False for NaN as well as +/-inf, so this single test covers
# everything the error message promises (isnull() alone lets inf through).
features_finite = np.isfinite(X.to_numpy(dtype=float)).all()
target_finite = np.isfinite(y.to_numpy(dtype=float)).all()
if not (features_finite and target_finite):
    raise ValueError("Dataset contains invalid values (NaN or inf). Please check your data.")

# 2. Data Preprocessing

# Split into training and testing datasets
# The split happens on the raw features, before any scaling, so the held-out
# rows cannot influence the statistics the scaler learns (no test-set leakage).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Standardize features
# Mean and variance are estimated from the training rows only and then applied
# unchanged to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Full feature matrix on the training-derived scale; kept so that any code
# relying on the `X_scaled` name continues to work.
X_scaled = scaler.transform(X)

# 3. Parameter Optimization (Bayesian Optimization)
# scikit-learn renamed AdaBoost's `base_estimator` argument to `estimator`
# (deprecated in 1.2, removed in 1.4). Ask the installed version which keyword
# it exposes so the script runs on either side of the rename; when both exist,
# the new name is preferred.
base_param = (
    'estimator'
    if 'estimator' in AdaBoostRegressor().get_params(deep=False)
    else 'base_estimator'
)

adaboost_model = AdaBoostRegressor(
    **{base_param: DecisionTreeRegressor(random_state=1)},
    random_state=1
)
# Nested tree hyperparameters are addressed as "<keyword>__<parameter>", so the
# search-space keys have to follow the keyword chosen above.
param_spaces = {
    'n_estimators': Integer(50, 100),  # Reduced range to limit overfitting
    'learning_rate': Real(0.01, 0.1, prior='uniform'),  # Lower learning rate range
    f'{base_param}__max_depth': Integer(3, 10),  # Restrict tree depth
    f'{base_param}__min_samples_split': Integer(5, 20),  # Larger split requirement
    f'{base_param}__min_samples_leaf': Integer(3, 15)  # Larger leaf requirement
}

# 32 search iterations, each scored by 5-fold cross-validated negative MSE
# on the training split only.
optimizer = BayesSearchCV(
    estimator=adaboost_model,
    search_spaces=param_spaces,
    n_iter=32,
    scoring='neg_mean_squared_error',
    cv=5,
    random_state=1
)
optimizer.fit(X_train, y_train)

# 4. Model Training with Best Parameters
best_params = optimizer.best_params_
print(f'Best parameters: {best_params}')
# Rebuild the ensemble from scratch with the selected hyperparameters and fit
# it on the whole training split.
adaboost_optimized = AdaBoostRegressor(
    **{
        base_param: DecisionTreeRegressor(
            max_depth=best_params[f'{base_param}__max_depth'],
            min_samples_split=best_params[f'{base_param}__min_samples_split'],
            min_samples_leaf=best_params[f'{base_param}__min_samples_leaf'],
            random_state=1
        )
    },
    n_estimators=best_params['n_estimators'],
    learning_rate=best_params['learning_rate'],
    random_state=1
)
adaboost_optimized.fit(X_train, y_train)

# 5. Model Evaluation
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Score a fitted model on the training and test splits.

    Args:
        model: Fitted estimator; only its ``predict`` method is used.
        X_train: Features of the training split.
        X_test: Features of the test split.
        y_train: True targets of the training split.
        y_test: True targets of the test split.

    Returns:
        dict mapping ``'Train RMSE'``, ``'Test RMSE'``, ``'Train R^2'``,
        ``'Test R^2'``, ``'Train MAE'`` and ``'Test MAE'`` (in that order)
        to the corresponding score.
    """
    # Predict once per split; every metric below reuses these predictions.
    splits = (
        ('Train', y_train, model.predict(X_train)),
        ('Test', y_test, model.predict(X_test)),
    )
    scorers = (
        # RMSE is the square root of the mean squared error.
        ('RMSE', lambda actual, fitted: np.sqrt(mean_squared_error(actual, fitted))),
        ('R^2', r2_score),
        ('MAE', mean_absolute_error),
    )
    # Metric-major iteration keeps the Train/Test pair of each metric adjacent.
    return {
        f'{split} {label}': score(actual, fitted)
        for label, score in scorers
        for split, actual, fitted in splits
    }

# Score the tuned model on both splits and print one "name: value" line per metric.
results = evaluate_model(adaboost_optimized, X_train, X_test, y_train, y_test)
for metric_name, metric_value in results.items():
    print(f'{metric_name}: {metric_value}')

# 6. Export Results
def export_results(y_train, y_pred_train, y_test, y_pred_test, filename='Results_AdaBoost.xlsx'):
    """Write true and predicted targets of both splits to an Excel workbook.

    The workbook gets two sheets, ``'Train Results'`` and ``'Test Results'``,
    each with the columns ``y_true`` and ``y_pred`` and no index column.

    Args:
        y_train: True targets of the training split.
        y_pred_train: Predictions for the training split.
        y_test: True targets of the test split.
        y_pred_test: Predictions for the test split.
        filename: Path of the workbook to create.
    """
    sheets = {
        'Train Results': (y_train, y_pred_train),
        'Test Results': (y_test, y_pred_test),
    }
    with pd.ExcelWriter(filename) as writer:
        # Build both tables before writing anything.
        frames = {
            sheet: pd.DataFrame({'y_true': actual, 'y_pred': predicted})
            for sheet, (actual, predicted) in sheets.items()
        }
        for sheet, frame in frames.items():
            frame.to_excel(writer, sheet_name=sheet, index=False)

# Predict both splits with the tuned model, then write them next to the true
# targets using the default output file name.
train_predictions = adaboost_optimized.predict(X_train)
test_predictions = adaboost_optimized.predict(X_test)
export_results(y_train, train_predictions, y_test, test_predictions)




# Output:
# Best parameters: OrderedDict([('base_estimator__max_depth', 3), ('base_estimator__min_samples_leaf', 3), ('base_estimator__min_samples_split', 13), ('learning_rate', 0.09992549737446227), ('n_estimators', 50)])
# Train RMSE: 0.45047101104573817
# Test RMSE: 0.840456868125178
# Train R^2: 0.9622609628058011
# Test R^2: 0.9248166277557974
# Train MAE: 0.3704351468218659
# Test MAE: 0.7108159820252202
