# In [None]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import mean_squared_error, r2_score
import optuna

# Load the spreadsheet as a plain array: the first nine columns are the
# features, the tenth column is the target (kept 2-D via the 9:10 slice).
data = pd.read_excel('catalytic ozonation data.xlsx').values
x, y = data[:, :9], data[:, 9:10]

# 80/20 train/test split with a fixed seed so that runs are repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, test_size=0.2, random_state=0
)
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)  # split sizes

# Feature scaling. Both scalers are fitted on the training split only and
# then applied to the test split.
# Standardised features for the classical machine-learning models.
scaler_mlm = StandardScaler().fit(x_train)
x_train_scaled_mlm = scaler_mlm.transform(x_train)
x_test_scaled_mlm = scaler_mlm.transform(x_test)

# Min-max scaled features for the neural-network models.
scaler_nn = MinMaxScaler().fit(x_train)
x_train_scaled_nn = scaler_nn.transform(x_train)
x_test_scaled_nn = scaler_nn.transform(x_test)

# Shared accumulator for every trained model. Each key maps to its own list,
# and entry i of every list describes the i-th model that was recorded.
_RESULT_COLUMNS = (
    'Model',
    'Training Time',
    'MSE (Test)',
    'RMSE (Test)',
    'R² (Test)',
    'MSE (Train)',
    'RMSE (Train)',
    'R² (Train)',
    'Training y',
    'Predicted y (Train)',
    'Test y',
    'Predicted y (Test)',
    'Loss',
    'Training Loss',
    'Validation Loss',
    'Best Parameters',
)
results = {column: [] for column in _RESULT_COLUMNS}

# Train and evaluate a single model
def train_and_evaluate(model, x_train, y_train, x_test, y_test, model_name, scaler=None, callbacks=None, epochs=200, batch_size=16):
    """Fit ``model``, score it on both splits and append the outcome to ``results``.

    Args:
        model: When ``model_name == 'NeuralNetwork'`` a compiled Keras model
            (its ``fit`` must return a history object and it must provide
            ``get_config``); otherwise an estimator exposing ``fit``,
            ``predict`` and ``get_params``.
        x_train: Training features.
        y_train: Training targets; a column vector is accepted.
        x_test: Test features.
        y_test: Test targets; a column vector is accepted.
        model_name: Label stored in ``results['Model']``. The exact value
            ``'NeuralNetwork'`` selects the Keras training branch.
        scaler: Not used by this function; kept so existing callers that
            pass it keep working.
        callbacks: Keras callbacks (neural-network branch only).
        epochs: Maximum number of epochs (neural-network branch only).
        batch_size: Mini-batch size (neural-network branch only).

    Returns:
        None. One entry is appended to every list in the module-level
        ``results`` dict and a short report is printed.
    """
    is_neural_network = model_name == 'NeuralNetwork'

    # 1-D views of the targets: scikit-learn style estimators warn when they
    # are fitted on a column vector, and the stored lists must be flat.
    y_train_flat = np.ravel(y_train)
    y_test_flat = np.ravel(y_test)

    # perf_counter is monotonic, so the duration cannot be distorted by
    # system clock adjustments.
    started = time.perf_counter()

    if is_neural_network:
        history = model.fit(
            x_train, y_train,
            epochs=epochs,
            batch_size=batch_size,
            validation_split=0.2,
            callbacks=callbacks,
            verbose=0
        )
        training_time = time.perf_counter() - started

        # Keras returns predictions of shape (n, 1); flatten for the metrics.
        y_pred_train = model.predict(x_train).flatten()
        y_pred_test = model.predict(x_test).flatten()

        # Per-epoch loss curves plus the final training loss.
        training_loss = history.history['loss']
        validation_loss = history.history['val_loss']
        loss = training_loss[-1] if training_loss else 'N/A'
    else:
        model.fit(x_train, y_train_flat)
        training_time = time.perf_counter() - started

        y_pred_train = model.predict(x_train)
        y_pred_test = model.predict(x_test)

        # Classical estimators expose no epoch-wise loss history.
        training_loss = []
        validation_loss = []
        loss = 'N/A'

    # Evaluation metrics on both splits
    mse_train = mean_squared_error(y_train_flat, y_pred_train)
    rmse_train = np.sqrt(mse_train)
    r2_train = r2_score(y_train_flat, y_pred_train)

    mse_test = mean_squared_error(y_test_flat, y_pred_test)
    rmse_test = np.sqrt(mse_test)
    r2_test = r2_score(y_test_flat, y_pred_test)

    # Model configuration: Keras architecture config or estimator parameters.
    if is_neural_network:
        best_params = model.get_config()
    else:
        best_params = model.get_params()

    # Record everything under the same index in the shared accumulator.
    results['Model'].append(model_name)
    results['Training Time'].append(training_time)
    results['MSE (Test)'].append(mse_test)
    results['RMSE (Test)'].append(rmse_test)
    results['R² (Test)'].append(r2_test)
    results['MSE (Train)'].append(mse_train)
    results['RMSE (Train)'].append(rmse_train)
    results['R² (Train)'].append(r2_train)
    results['Training y'].append(y_train_flat.tolist())
    results['Predicted y (Train)'].append(y_pred_train.tolist())
    results['Test y'].append(y_test_flat.tolist())
    results['Predicted y (Test)'].append(y_pred_test.tolist())
    results['Loss'].append(loss)
    results['Training Loss'].append(training_loss)
    results['Validation Loss'].append(validation_loss)
    results['Best Parameters'].append(best_params)

    # Console report
    print(f"Model: {model_name}")
    print(f"Training R²: {r2_train:.4f}, Training RMSE: {rmse_train:.4f}")
    print(f"Test R²: {r2_test:.4f}, Test RMSE: {rmse_test:.4f}")
    print("-" * 50)

# Bayesian hyperparameter optimisation
def optimize_models():
    """Tune SVR, random forest, XGBoost and the neural network with Optuna.

    Every study runs 50 trials and minimises a validation MSE. The classical
    models are scored on a hold-out carved from the training split, and the
    neural network on Keras' ``validation_split``. The test split is never
    used for model selection, so the test metrics reported afterwards are
    not biased by the search.

    Side effects:
        * Appends a ``'NeuralNetwork_BayesOptimized'`` entry to the
          module-level ``results`` dict, followed by the three
          ``'*_BayesOptimized'`` classical models (via ``train_and_evaluate``).
        * Calls ``plot_loss_functions()`` at the end.

    Returns:
        None.
    """
    print("Starting Bayesian Optimization for all models...")

    n_trials = 50
    y_train_flat = y_train.ravel()  # scikit-learn estimators expect 1-D targets

    # Hold-out used only to rank hyperparameter candidates of the classical
    # models; the winners are refitted on the full training split below.
    x_fit, x_val, y_fit, y_val = train_test_split(
        x_train_scaled_mlm, y_train_flat, test_size=0.2, random_state=0
    )

    def holdout_mse(model):
        """Fit ``model`` on the tuning subset and return its hold-out MSE."""
        model.fit(x_fit, y_fit)
        return mean_squared_error(y_val, model.predict(x_val))

    def best_params_of(objective):
        """Run one minimising study and return its best parameter set."""
        study = optuna.create_study(direction='minimize')
        study.optimize(objective, n_trials=n_trials)
        return study.best_params

    # 1. SVR
    def objective_svr(trial):
        return holdout_mse(SVR(
            kernel='rbf',
            C=trial.suggest_float('C', 1e-3, 1e3, log=True),
            gamma=trial.suggest_float('gamma', 1e-3, 1e3, log=True),
            epsilon=trial.suggest_float('epsilon', 1e-3, 1e3, log=True),
        ))

    best_params_svr = best_params_of(objective_svr)
    svr_optimized = SVR(kernel='rbf', **best_params_svr)

    print("SVR Best Parameters:", best_params_svr)

    # 2. RandomForest
    def objective_rf(trial):
        return holdout_mse(RandomForestRegressor(
            n_estimators=trial.suggest_int('n_estimators', 50, 500),
            max_depth=trial.suggest_int('max_depth', 3, 20),
            min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
            min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 10),
            random_state=42,
        ))

    best_params_rf = best_params_of(objective_rf)
    rf_optimized = RandomForestRegressor(**best_params_rf, random_state=42)

    print("Random Forest Best Parameters:", best_params_rf)

    # 3. XGBoost
    def objective_xgb(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 500),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            'gamma': trial.suggest_float('gamma', 0, 1),
            'reg_alpha': trial.suggest_float('reg_alpha', 0, 1)
        }
        return holdout_mse(XGBRegressor(**params, random_state=42))

    best_params_xgb = best_params_of(objective_xgb)
    xgb_optimized = XGBRegressor(**best_params_xgb, random_state=42)

    print("XGBoost Best Parameters:", best_params_xgb)

    # 4. Neural Network
    n_features = x_train_scaled_nn.shape[1]

    def build_network(units, dropout_rate, learning_rate):
        """Build and compile the two-hidden-layer regression network."""
        network = Sequential()
        network.add(Dense(units, activation='relu', input_shape=(n_features,)))
        network.add(Dropout(dropout_rate))
        network.add(Dense(32, activation='relu'))
        network.add(Dropout(dropout_rate))
        network.add(Dense(1))
        network.compile(optimizer=Adam(learning_rate=learning_rate),
                        loss='mean_squared_error')
        return network

    def fit_network(network):
        """Train ``network`` with early stopping and LR decay; return the history."""
        # Fresh callback objects per fit so no state is shared between fits.
        early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001)
        return network.fit(x_train_scaled_nn, y_train,
                           epochs=200,
                           batch_size=32,
                           validation_split=0.2,
                           callbacks=[early_stopping, reduce_lr],
                           verbose=0)

    def objective_nn(trial):
        network = build_network(
            units=trial.suggest_categorical('units', [32, 64, 128]),
            dropout_rate=trial.suggest_float('dropout_rate', 0.2, 0.5),
            learning_rate=trial.suggest_float('learning_rate', 0.0001, 0.1, log=True),
        )
        history = fit_network(network)
        # Early stopping keeps the best-epoch weights, so the best validation
        # loss (not the last, already degraded one) describes the trial.
        return min(history.history['val_loss'])

    best_params_nn = best_params_of(objective_nn)

    # Rebuild the network from the best parameters and time only the final fit.
    model_nn_optimized = build_network(
        units=best_params_nn['units'],
        dropout_rate=best_params_nn['dropout_rate'],
        learning_rate=best_params_nn['learning_rate'],
    )
    fit_started = time.perf_counter()
    history = fit_network(model_nn_optimized)
    training_time = time.perf_counter() - fit_started

    # Per-epoch loss curves
    training_loss = history.history['loss']
    validation_loss = history.history['val_loss']

    # Predictions on both splits
    y_pred_train = model_nn_optimized.predict(x_train_scaled_nn).flatten()
    y_pred_test = model_nn_optimized.predict(x_test_scaled_nn).flatten()

    # Evaluation metrics
    mse_train = mean_squared_error(y_train, y_pred_train)
    rmse_train = np.sqrt(mse_train)
    r2_train = r2_score(y_train, y_pred_train)

    mse_test = mean_squared_error(y_test, y_pred_test)
    rmse_test = np.sqrt(mse_test)
    r2_test = r2_score(y_test, y_pred_test)

    # Record the tuned network
    results['Model'].append('NeuralNetwork_BayesOptimized')
    results['Training Time'].append(training_time)
    results['MSE (Test)'].append(mse_test)
    results['RMSE (Test)'].append(rmse_test)
    results['R² (Test)'].append(r2_test)
    results['MSE (Train)'].append(mse_train)
    results['RMSE (Train)'].append(rmse_train)
    results['R² (Train)'].append(r2_train)
    results['Training y'].append(y_train.flatten().tolist())
    results['Predicted y (Train)'].append(y_pred_train.tolist())
    results['Test y'].append(y_test.flatten().tolist())
    results['Predicted y (Test)'].append(y_pred_test.tolist())
    results['Loss'].append(training_loss[-1])
    results['Training Loss'].append(training_loss)
    results['Validation Loss'].append(validation_loss)
    results['Best Parameters'].append(best_params_nn)

    # Refit the tuned classical models on the full training split and record them.
    train_and_evaluate(svr_optimized, x_train_scaled_mlm, y_train_flat, x_test_scaled_mlm, y_test, 'SVR_BayesOptimized')
    train_and_evaluate(rf_optimized, x_train_scaled_mlm, y_train_flat, x_test_scaled_mlm, y_test, 'RandomForest_BayesOptimized')
    train_and_evaluate(xgb_optimized, x_train_scaled_mlm, y_train_flat, x_test_scaled_mlm, y_test, 'XGBoost_BayesOptimized')

    # Plot the loss curves
    plot_loss_functions()

# Plot the loss curves
def plot_loss_functions():
    """Save a training-vs-validation loss chart for each model with a loss history.

    Iterates over the entries recorded in the module-level ``results`` dict
    and writes ``<model name>_loss_function.png`` for each one whose loss
    lists are not both empty. Models without an epoch-wise history (their
    loss lists are empty) are skipped instead of producing a blank chart,
    and no particular set of model names is required to be present.

    Returns:
        None.
    """
    recorded = zip(results['Model'], results['Training Loss'], results['Validation Loss'])

    for model_name, training_loss, validation_loss in recorded:
        if not training_loss and not validation_loss:
            continue  # nothing to draw for this model

        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            ax.plot(training_loss, label='Training Loss')
            ax.plot(validation_loss, label='Validation Loss')
            ax.set_title(f'Loss Function for {model_name}')
            ax.set_xlabel('Epochs')
            ax.set_ylabel('Loss')
            ax.legend()
            fig.savefig(f'{model_name}_loss_function.png')
        finally:
            # Release the figure even if drawing or saving fails.
            plt.close(fig)

# Entry point
def main():
    """Train the baseline models, run the Optuna tuning and export all results.

    Files written to the current working directory:
        * ``<model>_train_results.xlsx`` / ``<model>_test_results.xlsx`` with
          actual vs. predicted targets for every recorded model,
        * ``model_summary.xlsx`` with the scalar metrics per model,
        * ``training_validation_loss.xlsx`` with the recorded loss histories.

    Returns:
        None.
    """
    # Baseline (untuned) models
    # 1. SVR
    svr = SVR(kernel='rbf')
    train_and_evaluate(svr, x_train_scaled_mlm, y_train, x_test_scaled_mlm, y_test, 'SVR')

    # 2. RandomForest
    random_forest = RandomForestRegressor(n_estimators=100, random_state=42)
    train_and_evaluate(random_forest, x_train_scaled_mlm, y_train, x_test_scaled_mlm, y_test, 'RandomForest')

    # 3. XGBoost
    xgboost = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
    train_and_evaluate(xgboost, x_train_scaled_mlm, y_train, x_test_scaled_mlm, y_test, 'XGBoost')

    # 4. Neural Network
    model_nn = Sequential()
    model_nn.add(Dense(64, activation='relu', input_shape=(9,)))
    model_nn.add(Dropout(0.3))
    model_nn.add(Dense(32, activation='relu'))
    model_nn.add(Dropout(0.3))
    model_nn.add(Dense(1))
    model_nn.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001)
    train_and_evaluate(model_nn, x_train_scaled_nn, y_train, x_test_scaled_nn, y_test, 'NeuralNetwork', callbacks=[early_stopping, reduce_lr])

    # Bayesian optimisation (appends the tuned models to ``results``)
    optimize_models()

    # Actual vs. predicted targets, one pair of workbooks per model. Walking
    # the parallel lists together keeps each model paired with its own rows.
    per_model = zip(
        results['Model'],
        results['Training y'],
        results['Predicted y (Train)'],
        results['Test y'],
        results['Predicted y (Test)'],
    )
    for model_name, train_actual, train_predicted, test_actual, test_predicted in per_model:
        train_results = pd.DataFrame({
            'Actual y (Train)': train_actual,
            'Predicted y (Train)': train_predicted
        })
        test_results = pd.DataFrame({
            'Actual y (Test)': test_actual,
            'Predicted y (Test)': test_predicted
        })

        train_results.to_excel(f'{model_name}_train_results.xlsx', index=False)
        test_results.to_excel(f'{model_name}_test_results.xlsx', index=False)

    # Scalar metrics for every model, in the column order of the report.
    summary_columns = (
        'Model',
        'MSE (Train)',
        'RMSE (Train)',
        'R² (Train)',
        'MSE (Test)',
        'RMSE (Test)',
        'R² (Test)',
        'Loss',
        'Training Time',
    )
    summary = pd.DataFrame({column: results[column] for column in summary_columns})
    summary.to_excel('model_summary.xlsx', index=False)

    # Training and validation loss histories
    loss_columns = ('Model', 'Training Loss', 'Validation Loss')
    loss_data = pd.DataFrame({column: results[column] for column in loss_columns})
    loss_data.to_excel('training_validation_loss.xlsx', index=False)

    # Print the stored parameters of every model
    for model_name, best_params in zip(results['Model'], results['Best Parameters']):
        print(f"{model_name} 最佳参数:")
        print(best_params)
        print()

    print("结果已保存到 model_summary.xlsx 和 training_validation_loss.xlsx 文件中！")

# Run the full pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()