In [None]:
import pandas as pd                                     #相关性热图
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt

# Load the training spreadsheet.
file_path = r"C:\Users\SS\Desktop\论文\Machine learning\Machine learning 数据集\无道砟训练集\有道砟训练集.xlsx"
df = pd.read_excel(file_path)

# Parse 'date' as datetime so the .dt accessors below are available.
df['date'] = pd.to_datetime(df['date'])

# Derive two time features from the date column.
start_date = df['date'].min()
df['DAY'] = (df['date'] - start_date).dt.days  # days elapsed since the earliest record
df['DOY'] = df['date'].dt.dayofyear  # day of the year

# Predictor columns (including the derived time features) and target columns.
features = ['WS2', 'WS10', 'AT2', 'RH2', 'VP2', 'AT10', 'RH10', 'VP10', 'AP', 'PR', 'PRR', 'DR', 'UR', 'DLR', 'ULR', 'Rn', 'ALB', 'TNR', 'SG5', 'SG15', 'DAY', 'DOY']
target_variable = ['ST5', 'SVWC5']

# Drop the raw date column, then mean-impute every remaining column.
df = df.drop(columns=['date'])
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# Min-max scale the predictors (kept as a separate array for the export below).
scaler_X = MinMaxScaler()
X_scaled = scaler_X.fit_transform(df_imputed[features])

# Min-max scale each target column in place; the scaler is refit per target.
scaler_y = MinMaxScaler()
for target in target_variable:
    df_imputed[target] = scaler_y.fit_transform(df_imputed[target].values.reshape(-1, 1))

# Pearson correlation between all predictors and targets.
correlation_matrix = df_imputed[features + target_variable].corr()

# Assemble the scaled predictors and scaled targets into one frame for export.
df_scaled = pd.DataFrame(X_scaled, columns=features)
for target in target_variable:
    df_scaled[target] = df_imputed[target]

# Write the correlation matrix and the scaled data to separate sheets.
# The output directory is created first so the writer does not fail on a
# machine where it does not exist yet (the later export cells do the same).
import os
output_file_path = r"D:\ML数据集\相关性热图绘图数据\相关性矩阵与标准化数据.xlsx"
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
with pd.ExcelWriter(output_file_path) as writer:
    # Sheet 1: correlation matrix (index kept so rows are labelled).
    correlation_matrix.to_excel(writer, sheet_name="Correlation Matrix")

    # Sheet 2: scaled data.
    df_scaled.to_excel(writer, sheet_name="Scaled Data", index=False)

# Figure size for the heatmap.
plt.figure(figsize=(18, 12))

# Draw the annotated heatmap on a fixed [-1, 1] colour scale.
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm", vmin=-1, vmax=1)
plt.title("Feature and Target Variable Correlation Heatmap")
plt.show()


In [None]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist

# Load the training spreadsheet.
file_path = r"C:\Users\SS\Desktop\论文\Machine learning\Machine learning 数据集\无道砟训练集\有道砟训练集.xlsx"
df = pd.read_excel(file_path)

# Parse 'date' as datetime.
df['date'] = pd.to_datetime(df['date'])


# Predictor and target column names.
features = ['WS2', 'WS10', 'AT2', 'RH2', 'VP2', 'AT10', 'RH10', 'VP10', 'AP', 'PR', 'PRR', 'DR', 'UR', 'DLR', 'ULR', 'Rn', 'ALB', 'TNR', 'SG5', 'SG15']
target_variable = ['ST5', 'SVWC5']

# Drop the date column.
df = df.drop(columns=['date'])

# Mean-impute missing values, then min-max scale the predictors.
imputer = SimpleImputer(strategy='mean')
scaler = MinMaxScaler()
X = df[features]
X_imputed = imputer.fit_transform(X)
X_scaled = scaler.fit_transform(X_imputed)

# Pearson correlation between features (columns are variables).
corr_matrix = np.corrcoef(X_scaled, rowvar=False)

# Hierarchical clustering on the correlation-based dissimilarity 1 - |r|.
# The square dissimilarity matrix must be converted to condensed form with
# squareform(); passing it to pdist() would instead treat each row as an
# observation and compute Euclidean distances between rows, which is not the
# correlation distance.
from scipy.spatial.distance import squareform
dist_matrix = squareform(1 - np.abs(corr_matrix), checks=False)
linkage_matrix = linkage(dist_matrix, method='average')

# Cut the tree at a distance threshold; with the 1 - |r| distance, 0.5 means
# clusters are merged while their average |r| stays above 0.5.
threshold = 0.5
clusters = fcluster(linkage_matrix, threshold, criterion='distance')

# Report the cluster assignment of every feature.
for feature, cluster in zip(features, clusters):
    print(f'Feature: {feature}, Cluster: {cluster}')

# Pick one representative per cluster: the first member in `features` order.
selected_features = []
for cluster_id in np.unique(clusters):
    cluster_features = np.where(clusters == cluster_id)[0]
    selected_feature = features[cluster_features[0]]
    selected_features.append(selected_feature)

print("Selected Features:")
print(selected_features)

# Keep what the dendrogram cell needs.
results = {
    "linkage_matrix": linkage_matrix,
    "features": features
}


In [None]:
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram

# Pull the linkage matrix and feature labels out of the results dictionary.
linkage_matrix = results["linkage_matrix"]
features = results["features"]

# Create the figure for the dendrogram.
plt.figure(figsize=(14, 8), linewidth=3)

# Global font family and size.
plt.rcParams.update({'font.size': 16, 'font.family': 'Arial'})

dendrogram(linkage_matrix, labels=features, leaf_rotation=90, leaf_font_size=15)
plt.title('Hierarchical Clustering', fontsize=18, fontweight='bold', family='Arial')
plt.xlabel('Features', fontsize=18, fontweight='bold', family='Arial')
plt.ylabel('Distance', fontsize=18, fontweight='bold', family='Arial')

# Tick label fonts on both axes.
plt.xticks(fontsize=15, fontfamily='Arial')
plt.yticks(fontsize=15, fontfamily='Arial')

# White axes background with a black frame.
plt.gca().patch.set_facecolor('white')
plt.gca().patch.set_edgecolor('black')
for spine in plt.gca().spines.values():
    spine.set_edgecolor('black')
    spine.set_linewidth(1.5)

# Note: this call sets the major tick label size last, so 16 is what is rendered.
plt.tick_params(axis='both', which='major', labelsize=16)
plt.grid(False)

# Save as JPEG at 300 dpi. The output directory is created first so savefig
# does not fail when it is missing (the later export cells do the same).
import os
output_file_path = r"D:\ML数据集\提交绘图源PPT\层次聚类树状图.jpeg"
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)
plt.savefig(output_file_path, format='jpeg', dpi=300, bbox_inches='tight')

plt.show()


In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
import time
from sklearn.inspection import permutation_importance
from sklearn.tree import DecisionTreeRegressor

# Start the wall-clock timer for this cell's run.
start_time = time.time()

# Load the dataset and parse its 'date' column as datetime.
file_path = r"D:\ML数据集\CPE\CPE_data.xlsx"
df = pd.read_excel(file_path)
df['date'] = pd.to_datetime(df['date'])

# Predictor and target column names.
features = [
    'WS2', 'WS10', 'AT2', 'RH2', 'VP2', 'AT10', 'RH10', 'VP10', 'AP', 'PR',
    'PRR', 'DR', 'UR', 'DLR', 'ULR', 'Rn', 'ALB', 'TNR', 'SG5', 'SG15',
]
target_variables = ['ST5', 'SVWC5']

# Keep only the non-datetime columns.
df = df.select_dtypes(exclude=['datetime'])

# Mean-impute missing values across all remaining columns.
imputer = SimpleImputer(strategy='mean')
imputed_values = imputer.fit_transform(df)
df_imputed = pd.DataFrame(imputed_values, columns=df.columns)

# Min-max scale every column (predictors and targets alike).
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df_imputed)
df_scaled = pd.DataFrame(scaled_values, columns=df_imputed.columns)

# Estimator used for the permutation-importance analysis.
model = DecisionTreeRegressor(random_state=42)

def process_target(target):
    """Compute permutation importances of all features for one target.

    Reads the module-level `df_scaled`, `features` and `model`. The data are
    split 70/30 (random_state=42), a decision tree is fitted on the training
    part and permutation importance (30 repeats) is evaluated on the test part.

    Args:
        target: Name of the target column in `df_scaled`.

    Returns:
        A one-element list holding a dict with keys 'Target', 'Features',
        'Importances' (mean importance per feature) and 'Std'.
    """
    # Local import: only this function needs clone().
    from sklearn.base import clone

    X = df_scaled[features].values
    y = df_scaled[target].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Fit a fresh copy of the shared estimator. This function is submitted to
    # a thread pool; fitting the module-level `model` directly would let
    # concurrent workers overwrite each other's fitted tree as soon as more
    # than one worker is used.
    estimator = clone(model)
    estimator.fit(X_train, y_train)

    # Permutation importance on the held-out split.
    result = permutation_importance(estimator, X_test, y_test, n_repeats=30, random_state=42, n_jobs=-1)

    return [{
        'Target': target,
        'Features': features,
        'Importances': result.importances_mean,
        'Std': result.importances_std
    }]

# Accumulator for the per-target importance records.
importance_results = []

# Number of worker threads.
num_cores = 1

# Submit one job per target and gather the results in submission order.
with ThreadPoolExecutor(max_workers=num_cores) as executor:
    futures = [executor.submit(process_target, name) for name in target_variables]
    for future in futures:
        importance_results.extend(future.result())

# One row per target.
importance_df = pd.DataFrame(importance_results)

# Stop the timer and report.
end_time = time.time()
total_time = end_time - start_time

print("模型计算完成")
print(f"总运行时间: {total_time:.2f}秒")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Requires `importance_df` from the permutation-importance cell.

# Reshape to {target: {feature: importance}} for plotting.
data = {}
targets = importance_df['Target'].unique()
for target in targets:
    # First (and only expected) row for this target.
    row = importance_df[importance_df['Target'] == target].iloc[0]
    data[target] = dict(zip(row['Features'], row['Importances']))

# One horizontal bar chart per target.
for target in targets:
    target_data = pd.DataFrame.from_dict(data[target], orient='index', columns=['Importance'])
    target_data = target_data.sort_values(by='Importance', ascending=False)

    # Create the figure and axes explicitly and draw on them. Calling
    # plt.figure() and then DataFrame.plot() without `ax` left the sized
    # figure empty and drew on a second, default-sized figure.
    fig, ax = plt.subplots(figsize=(12, 6))

    # One colour per bar from the Set3 palette. Drawing with ax.barh applies
    # the list bar by bar; DataFrame.plot applied it per column, so every bar
    # of the single 'Importance' column got the first colour only.
    colors = sns.color_palette("Set3", len(target_data))
    ax.barh(target_data.index, target_data['Importance'], color=colors, edgecolor='k')

    # Axis labels.
    ax.set_ylabel("Feature", fontsize=15, labelpad=20)
    ax.set_xlabel("Permutation Importance", fontsize=15, labelpad=20)

    # Title.
    ax.set_title(f"Permutation Importance of Features for {target}", fontsize=18, pad=20)

    # Tick label size on both axes.
    ax.tick_params(axis='both', labelsize=12)

    # Dashed grid.
    ax.grid(True, which='both', linestyle='--', linewidth=0.7)

    fig.tight_layout()

    plt.show()


In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.feature_selection import RFE
import time

# Start the wall-clock timer for this cell's run.
start_time = time.time()

# Load the training spreadsheet.
file_path = r"C:\Users\SS\Desktop\论文\Machine learning\Machine learning 数据集\无道砟训练集\有道砟训练集.xlsx"
df = pd.read_excel(file_path)

# Parse 'date' as datetime.
df['date'] = pd.to_datetime(df['date'])

# Predictor and target column names.
features = ['WS2', 'WS10', 'AT2', 'RH2', 'VP2', 'AT10', 'RH10', 'VP10', 'AP', 'PR', 'PRR', 'DR', 'UR', 'DLR', 'ULR', 'Rn', 'ALB', 'TNR', 'SG5', 'SG15']
target_variables = ['ST5']

# Drop the date column.
df = df.drop(columns=['date'])

# Mean-impute missing values, then min-max scale the predictors.
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
scaler_X = MinMaxScaler()
X_scaled = scaler_X.fit_transform(df_imputed[features])

# The target columns are intentionally left in their original units here.
# process_target() fits its own MinMaxScaler on df_imputed[target] and later
# calls inverse_transform to report metrics in original units; scaling the
# column in place beforehand made that inverse transform return values that
# were still in the [0, 1] range.

# Hyper-parameter search space.
param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Base estimator and grid search (3-fold CV, negative MSE scoring).
base_model = DecisionTreeRegressor(random_state=42)
grid_search = GridSearchCV(estimator=base_model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=3, n_jobs=-1)

def process_target(target):
    """Tune a model, rank features with RFE and evaluate nested feature subsets.

    Reads the module-level `df_imputed`, `X_scaled`, `features` and
    `grid_search`. The target is min-max scaled with a scaler fitted inside
    this function, the data are split 80/20 (random_state=42), the grid search
    is fitted on the training part, and RFE ranks every feature. For each
    subset size from all features down to one, the best estimator is scored
    with 5-fold CV on the training part and then refitted and scored on the
    train and test parts.

    Args:
        target: Name of the target column in `df_imputed`.

    Returns:
        (results, cross_val_results, predictions): three lists of dicts, one
        entry per subset size (per fold for the cross-validation list).
    """
    perf_rows = []
    pred_rows = []
    cv_rows = []

    # Scale the target with a scaler local to this call.
    # NOTE(review): the "unscaled" values below are inverse-transformed with
    # this scaler, so they are in the units df_imputed[target] had when this
    # function was called - confirm the column was not scaled beforehand.
    scaler_y = MinMaxScaler()
    y = scaler_y.fit_transform(df_imputed[target].values.reshape(-1, 1)).ravel()

    def _inverse_scale(values):
        """Map a 1-D array of scaled targets back through scaler_y."""
        return scaler_y.inverse_transform(values.reshape(-1, 1)).ravel()

    # Hold out 20% of the rows for testing.
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    # Hyper-parameter search on the training part.
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    print(f"Best Parameters for target {target}: {best_params}")

    # Rank all features by recursive elimination down to a single feature.
    rfe = RFE(estimator=best_model, n_features_to_select=1, step=1)
    rfe.fit(X_train, y_train)
    ranking = rfe.ranking_
    order = np.argsort(ranking)  # column indices, best-ranked first
    sorted_features = np.array(features)[order]

    for n_features in range(len(features), 0, -1):
        keep = order[:n_features]
        selected_features = sorted_features[:n_features]
        print(f'Target: {target}, Remaining features ({n_features}): {list(selected_features)}')

        X_train_rfe = X_train[:, keep]
        X_test_rfe = X_test[:, keep]

        # 5-fold cross-validation on the training part (scaled target units).
        kfold = KFold(n_splits=5, shuffle=True, random_state=42)
        for fold_index, (train_idx, test_idx) in enumerate(kfold.split(X_train_rfe), start=1):
            best_model.fit(X_train_rfe[train_idx], y_train[train_idx])
            y_holdout = y_train[test_idx]
            y_pred_fold = best_model.predict(X_train_rfe[test_idx])

            cv_rows.append({
                'Target': target,
                'n_features': n_features,
                'Fold': fold_index,
                'CV_RMSE': np.sqrt(mean_squared_error(y_holdout, y_pred_fold)),
                'CV_R²': r2_score(y_holdout, y_pred_fold),
                'CV_MAE': mean_absolute_error(y_holdout, y_pred_fold)
            })

        # Refit on the full training part, then predict test and train.
        best_model.fit(X_train_rfe, y_train)
        y_pred_test_unscaled = _inverse_scale(best_model.predict(X_test_rfe))
        y_pred_train_unscaled = _inverse_scale(best_model.predict(X_train_rfe))
        y_test_unscaled = _inverse_scale(y_test)
        y_train_unscaled = _inverse_scale(y_train)

        # Performance record for this subset size.
        perf_rows.append({
            'Target': target,
            'n_features': n_features,
            'Remaining Features': list(selected_features),
            'Test_RMSE': np.sqrt(mean_squared_error(y_test_unscaled, y_pred_test_unscaled)),
            'Test_R²': r2_score(y_test_unscaled, y_pred_test_unscaled),
            'Test_MAE': mean_absolute_error(y_test_unscaled, y_pred_test_unscaled),
            'Train_RMSE': np.sqrt(mean_squared_error(y_train_unscaled, y_pred_train_unscaled)),
            'Train_R²': r2_score(y_train_unscaled, y_pred_train_unscaled),
            'Train_MAE': mean_absolute_error(y_train_unscaled, y_pred_train_unscaled),
            'Best_Params': best_params
        })

        # Observed and predicted values for this subset size.
        pred_rows.append({
            'Target': target,
            'n_features': n_features,
            'y_test': y_test_unscaled.tolist(),
            'y_pred_test': y_pred_test_unscaled.tolist(),
            'y_train': y_train_unscaled.tolist(),
            'y_pred_train': y_pred_train_unscaled.tolist()
        })

    return perf_rows, cv_rows, pred_rows

# Accumulators across all targets.
all_results = []
all_cross_val_results = []
all_predictions = []

# Run the RFE evaluation for every target and merge its three outputs.
for target in target_variables:
    outputs = process_target(target)
    for collected, produced in zip((all_results, all_cross_val_results, all_predictions), outputs):
        collected.extend(produced)

# Convert the collected records to data frames.
results_df = pd.DataFrame(all_results)
cross_val_results_df = pd.DataFrame(all_cross_val_results)
predictions_df = pd.DataFrame(all_predictions)

# Export every frame to its own sheet of one workbook.
output_dir = r'D:\ML数据集\决策树递归消除和交叉验证结果'
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, '递归消除_交叉验证结果.xlsx')

sheets = {
    'Results': results_df,
    'Cross_Validation_Results': cross_val_results_df,
    'Predictions': predictions_df,
}
with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
    for sheet_name, frame in sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

# Stop the timer and report.
end_time = time.time()
total_time = end_time - start_time

print("模型计算完成")
print(f"总运行时间: {total_time:.2f}秒")
print(f"结果已保存到: {output_path}")


In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from concurrent.futures import ThreadPoolExecutor
import time

# Start the wall-clock timer for this cell's run.
start_time = time.time()

# Load the training spreadsheet.
file_path = r"C:\Users\SS\Desktop\论文\Machine learning\Machine learning 数据集\无道砟训练集\有道砟训练集.xlsx"
df = pd.read_excel(file_path)

# Parse 'date' as datetime.
df['date'] = pd.to_datetime(df['date'])

# Predictor and target column names.
features = ['WS2', 'WS10', 'AT2', 'RH2', 'VP2', 'AT10', 'RH10', 'VP10', 'AP', 'PR', 'PRR', 'DR', 'UR', 'DLR', 'ULR', 'Rn', 'ALB', 'TNR', 'SG5', 'SG15']
target_variables = ['ST5', 'SVWC5']

# Drop the date column.
df = df.drop(columns=['date'])

# Mean-impute missing values.
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# Min-max scale all predictors and write the scaled values back into
# df_imputed. process_target() reads its features from df_imputed, so without
# the write-back the scaled array was never used and the models were trained
# on unscaled predictors.
scaler_X = MinMaxScaler()
X_scaled = scaler_X.fit_transform(df_imputed[features])
df_imputed[features] = X_scaled

# Min-max scale each target column in place; the scaler is refit per target.
scaler_y = MinMaxScaler()
for target in target_variables:
    df_imputed[target] = scaler_y.fit_transform(df_imputed[target].values.reshape(-1, 1))

# Hyper-parameter search space.
param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Base estimator and grid search (3-fold CV, negative MSE scoring).
base_model = DecisionTreeRegressor(random_state=42)
grid_search = GridSearchCV(estimator=base_model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=3, n_jobs=-1)

def process_target(target, selected_features):
    """Tune and evaluate a decision tree on nested prefixes of a feature list.

    Reads the module-level `df_imputed` and `grid_search`. The data are split
    80/20 (random_state=42) and the grid search is fitted on the training part
    using all `selected_features`. Then, for n = len(selected_features) .. 1,
    the best estimator is refitted on the first n columns, scored on the train
    and test parts, and scored with 5-fold CV on the training part. All
    metrics are in the units of `df_imputed[target]`; no inverse scaling is
    applied here.

    Args:
        target: Name of the target column in `df_imputed`.
        selected_features: Ordered feature names; subsets are taken as the
            first n entries, so the order determines which features are
            dropped first.

    Returns:
        (local_results, local_cross_val_results, local_predictions): three
        lists of dicts.
    """
    # Local import: only this function needs clone().
    from sklearn.base import clone

    local_results = []
    local_cross_val_results = []
    local_predictions = []

    # Target and feature values as prepared by the setup code.
    y = df_imputed[target].values.ravel()
    X_selected = df_imputed[selected_features].values
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

    # Fit a private copy of the shared search object. This function is
    # submitted to a thread pool; fitting the module-level `grid_search`
    # directly would let concurrent workers overwrite each other's
    # best_estimator_ as soon as more than one worker is used.
    search = clone(grid_search)
    search.fit(X_train, y_train)
    best_model = search.best_estimator_
    best_params = search.best_params_

    print(f"Best Parameters for target {target}: {best_params}")

    # The same seeded splitter yields the same folds on every split() call.
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)

    for n_features in range(len(selected_features), 0, -1):
        X_train_rfe = X_train[:, :n_features]
        X_test_rfe = X_test[:, :n_features]
        best_model.fit(X_train_rfe, y_train)
        y_pred_test = best_model.predict(X_test_rfe)
        y_pred_train = best_model.predict(X_train_rfe)

        # Compute each metric once and reuse it in both records below.
        scores = {
            'RMSE_test': np.sqrt(mean_squared_error(y_test, y_pred_test)),
            'R²_test': r2_score(y_test, y_pred_test),
            'MAE_test': mean_absolute_error(y_test, y_pred_test),
            'RMSE_train': np.sqrt(mean_squared_error(y_train, y_pred_train)),
            'R²_train': r2_score(y_train, y_pred_train),
            'MAE_train': mean_absolute_error(y_train, y_pred_train),
        }

        # Performance record.
        local_results.append({
            'Model': 'Decision Tree',
            'Target': target,
            'n_features': n_features,
            **scores,
        })

        # Observed and predicted values, followed by the same metrics.
        local_predictions.append({
            'Model': 'Decision Tree',
            'Target': target,
            'n_features': n_features,
            'y_test': y_test.tolist(),
            'y_pred_test': y_pred_test.tolist(),
            'y_train': y_train.tolist(),
            'y_pred_train': y_pred_train.tolist(),
            **scores,
        })

        # 5-fold cross-validation on the training part.
        for fold, (train_idx, test_idx) in enumerate(kfold.split(X_train_rfe)):
            X_train_fold, X_test_fold = X_train_rfe[train_idx], X_train_rfe[test_idx]
            y_train_fold, y_test_fold = y_train[train_idx], y_train[test_idx]
            best_model.fit(X_train_fold, y_train_fold)
            y_pred_fold = best_model.predict(X_test_fold)
            local_cross_val_results.append({
                'Model': 'Decision Tree',
                'Target': target,
                'n_features': n_features,
                'Fold': fold + 1,
                'CV_RMSE': np.sqrt(mean_squared_error(y_test_fold, y_pred_fold)),
                'CV_R²': r2_score(y_test_fold, y_pred_fold),
                'CV_MAE': mean_absolute_error(y_test_fold, y_pred_fold)
            })

    return local_results, local_cross_val_results, local_predictions

# Accumulators across all targets.
results = []
cross_val_results = []
predictions = []

# Number of worker threads.
num_cores = 1

# Each target is evaluated with its own ordered feature list.
target_feature_pairs = [
    ('ST5', ['RH10',  'SG15', 'AT10','PRR']),
    ('SVWC5', ['RH10', 'PRR',  'SG15', 'VP2']),
]

# Submit one job per (target, feature list) and gather in submission order.
with ThreadPoolExecutor(max_workers=num_cores) as executor:
    futures = [executor.submit(process_target, name, feats) for name, feats in target_feature_pairs]
    for future in futures:
        local_results, local_cross_val_results, local_predictions = future.result()
        results += local_results
        cross_val_results += local_cross_val_results
        predictions += local_predictions

# Convert the collected records to data frames.
results_df = pd.DataFrame(results)
cross_val_df = pd.DataFrame(cross_val_results)
predictions_df = pd.DataFrame(predictions)

# Export every frame to its own sheet of one workbook.
output_dir = r'D:\ML数据集\聚类分析计算结果'
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, '决策树模型计算结果.xlsx')

sheets = {
    'Model_Performance': results_df,
    'Cross_Validation_Results': cross_val_df,
    'Predictions': predictions_df,
}
with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
    for sheet_name, frame in sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

# Stop the timer and report.
end_time = time.time()
total_time = end_time - start_time

print("模型计算完成")
print(f"总运行时间: {total_time:.2f}秒")
print(f"结果已保存到: {output_path}")


In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from concurrent.futures import ThreadPoolExecutor
import time

# Start the wall-clock timer for this cell's run.
start_time = time.time()

# Load the training spreadsheet.
file_path = r"C:\Users\SS\Desktop\论文\Machine learning\Machine learning 数据集\无道砟训练集\有道砟训练集.xlsx"
df = pd.read_excel(file_path)

# Parse 'date' as datetime.
df['date'] = pd.to_datetime(df['date'])

# Predictor and target column names.
features = ['WS2', 'WS10', 'AT2', 'RH2', 'VP2', 'AT10', 'RH10', 'VP10', 'AP', 'PR', 'PRR', 'DR', 'UR', 'DLR', 'ULR', 'Rn', 'ALB', 'TNR', 'SG5', 'SG15']
target_variables = ['ST5', 'SVWC5']

# Drop the date column.
df = df.drop(columns=['date'])

# Mean-impute missing values.
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# Min-max scale all predictors and write the scaled values back into
# df_imputed. process_target() reads its features from df_imputed, so without
# the write-back the scaled array was never used and the models were trained
# on unscaled predictors.
scaler_X = MinMaxScaler()
X_scaled = scaler_X.fit_transform(df_imputed[features])
df_imputed[features] = X_scaled

# Min-max scale each target column in place; the scaler is refit per target.
scaler_y = MinMaxScaler()
for target in target_variables:
    df_imputed[target] = scaler_y.fit_transform(df_imputed[target].values.reshape(-1, 1))

# Hyper-parameter search space.
param_grid = {
    'n_estimators': [100, 200, 300],  # number of trees in the forest
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Base estimator and grid search (3-fold CV, negative MSE scoring).
base_model = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(estimator=base_model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=3, n_jobs=-1)

def process_target(target, selected_features):
    """Tune and evaluate a random forest on nested prefixes of a feature list.

    Reads the module-level `df_imputed` and `grid_search`. The data are split
    80/20 (random_state=42) and the grid search is fitted on the training part
    using all `selected_features`. Then, for n = len(selected_features) .. 1,
    the best estimator is refitted on the first n columns, scored on the train
    and test parts, and scored with 5-fold CV on the training part. All
    metrics are in the units of `df_imputed[target]`; no inverse scaling is
    applied here.

    Args:
        target: Name of the target column in `df_imputed`.
        selected_features: Ordered feature names; subsets are taken as the
            first n entries, so the order determines which features are
            dropped first.

    Returns:
        (local_results, local_cross_val_results, local_predictions): three
        lists of dicts.
    """
    # Local import: only this function needs clone().
    from sklearn.base import clone

    local_results = []
    local_cross_val_results = []
    local_predictions = []

    # Target and feature values as prepared by the setup code.
    y = df_imputed[target].values.ravel()
    X_selected = df_imputed[selected_features].values
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

    # Fit a private copy of the shared search object. This function is
    # submitted to a thread pool; fitting the module-level `grid_search`
    # directly would let concurrent workers overwrite each other's
    # best_estimator_ as soon as more than one worker is used.
    search = clone(grid_search)
    search.fit(X_train, y_train)
    best_model = search.best_estimator_
    best_params = search.best_params_

    print(f"Best Parameters for target {target}: {best_params}")

    # The same seeded splitter yields the same folds on every split() call.
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)

    for n_features in range(len(selected_features), 0, -1):
        X_train_rfe = X_train[:, :n_features]
        X_test_rfe = X_test[:, :n_features]
        best_model.fit(X_train_rfe, y_train)
        y_pred_test = best_model.predict(X_test_rfe)
        y_pred_train = best_model.predict(X_train_rfe)

        # Compute each metric once and reuse it in both records below.
        scores = {
            'RMSE_test': np.sqrt(mean_squared_error(y_test, y_pred_test)),
            'R²_test': r2_score(y_test, y_pred_test),
            'MAE_test': mean_absolute_error(y_test, y_pred_test),
            'RMSE_train': np.sqrt(mean_squared_error(y_train, y_pred_train)),
            'R²_train': r2_score(y_train, y_pred_train),
            'MAE_train': mean_absolute_error(y_train, y_pred_train),
        }

        # Performance record.
        local_results.append({
            'Model': 'Random Forest',
            'Target': target,
            'n_features': n_features,
            **scores,
        })

        # Observed and predicted values, followed by the same metrics.
        local_predictions.append({
            'Model': 'Random Forest',
            'Target': target,
            'n_features': n_features,
            'y_test': y_test.tolist(),
            'y_pred_test': y_pred_test.tolist(),
            'y_train': y_train.tolist(),
            'y_pred_train': y_pred_train.tolist(),
            **scores,
        })

        # 5-fold cross-validation on the training part.
        for fold, (train_idx, test_idx) in enumerate(kfold.split(X_train_rfe)):
            X_train_fold, X_test_fold = X_train_rfe[train_idx], X_train_rfe[test_idx]
            y_train_fold, y_test_fold = y_train[train_idx], y_train[test_idx]
            best_model.fit(X_train_fold, y_train_fold)
            y_pred_fold = best_model.predict(X_test_fold)
            local_cross_val_results.append({
                'Model': 'Random Forest',
                'Target': target,
                'n_features': n_features,
                'Fold': fold + 1,
                'CV_RMSE': np.sqrt(mean_squared_error(y_test_fold, y_pred_fold)),
                'CV_R²': r2_score(y_test_fold, y_pred_fold),
                'CV_MAE': mean_absolute_error(y_test_fold, y_pred_fold)
            })

    return local_results, local_cross_val_results, local_predictions

# Accumulators across all targets.
results = []
cross_val_results = []
predictions = []

# Number of worker threads.
num_cores = 1

# Each target is evaluated with its own ordered feature list.
target_feature_pairs = [
    ('ST5', ['RH10',  'SG15', 'AT10','PRR']),
    ('SVWC5', ['RH10', 'PRR',  'SG15', 'VP2']),
]

# Submit one job per (target, feature list) and gather in submission order.
with ThreadPoolExecutor(max_workers=num_cores) as executor:
    futures = [executor.submit(process_target, name, feats) for name, feats in target_feature_pairs]
    for future in futures:
        local_results, local_cross_val_results, local_predictions = future.result()
        results += local_results
        cross_val_results += local_cross_val_results
        predictions += local_predictions

# Convert the collected records to data frames.
results_df = pd.DataFrame(results)
cross_val_df = pd.DataFrame(cross_val_results)
predictions_df = pd.DataFrame(predictions)

# Export every frame to its own sheet of one workbook.
output_dir = r'D:\ML数据集\聚类分析计算结果'
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, '随机森林模型计算结果.xlsx')

sheets = {
    'Model_Performance': results_df,
    'Cross_Validation_Results': cross_val_df,
    'Predictions': predictions_df,
}
with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
    for sheet_name, frame in sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

# Stop the timer and report.
end_time = time.time()
total_time = end_time - start_time

print("模型计算完成")
print(f"总运行时间: {total_time:.2f}秒")
print(f"结果已保存到: {output_path}")


In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from concurrent.futures import ThreadPoolExecutor
import time

# Start the wall-clock timer for this cell's run.
start_time = time.time()

# Load the training spreadsheet.
file_path = r"C:\Users\SS\Desktop\论文\Machine learning\Machine learning 数据集\无道砟训练集\有道砟训练集.xlsx"
df = pd.read_excel(file_path)

# Parse 'date' as datetime.
df['date'] = pd.to_datetime(df['date'])

# Predictor and target column names.
features = ['WS2', 'WS10', 'AT2', 'RH2', 'VP2', 'AT10', 'RH10', 'VP10', 'AP', 'PR', 'PRR', 'DR', 'UR', 'DLR', 'ULR', 'Rn', 'ALB', 'TNR', 'SG5', 'SG15']
target_variables = ['ST5', 'SVWC5']

# Drop the date column.
df = df.drop(columns=['date'])

# Mean-impute missing values.
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# Min-max scale all predictors and write the scaled values back into
# df_imputed. process_target() reads its features from df_imputed, so without
# the write-back the scaled array was never used and the models were trained
# on unscaled predictors.
scaler_X = MinMaxScaler()
X_scaled = scaler_X.fit_transform(df_imputed[features])
df_imputed[features] = X_scaled

# Min-max scale each target column in place; the scaler is refit per target.
scaler_y = MinMaxScaler()
for target in target_variables:
    df_imputed[target] = scaler_y.fit_transform(df_imputed[target].values.reshape(-1, 1))

# Hyper-parameter search space.
param_grid = {
    'n_estimators': [100, 200, 300],  # number of boosting stages
    'learning_rate': [0.01, 0.1, 0.2],  # learning rate
    'max_depth': [3, 5, 7],  # maximum depth of each tree
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Base estimator and grid search (3-fold CV, negative MSE scoring).
base_model = GradientBoostingRegressor(random_state=42)
grid_search = GridSearchCV(estimator=base_model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=3, n_jobs=-1)

def process_target(target, selected_features):
    """Tune and evaluate gradient boosting on nested prefixes of a feature list.

    Reads the module-level `df_imputed` and `grid_search`. The data are split
    80/20 (random_state=42) and the grid search is fitted on the training part
    using all `selected_features`. Then, for n = len(selected_features) .. 1,
    the best estimator is refitted on the first n columns, scored on the train
    and test parts, and scored with 5-fold CV on the training part. All
    metrics are in the units of `df_imputed[target]`; no inverse scaling is
    applied here.

    Args:
        target: Name of the target column in `df_imputed`.
        selected_features: Ordered feature names; subsets are taken as the
            first n entries, so the order determines which features are
            dropped first.

    Returns:
        (local_results, local_cross_val_results, local_predictions): three
        lists of dicts.
    """
    # Local import: only this function needs clone().
    from sklearn.base import clone

    local_results = []
    local_cross_val_results = []
    local_predictions = []

    # Target and feature values as prepared by the setup code.
    y = df_imputed[target].values.ravel()
    X_selected = df_imputed[selected_features].values
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

    # Fit a private copy of the shared search object. This function is
    # submitted to a thread pool; fitting the module-level `grid_search`
    # directly would let concurrent workers overwrite each other's
    # best_estimator_ as soon as more than one worker is used.
    search = clone(grid_search)
    search.fit(X_train, y_train)
    best_model = search.best_estimator_
    best_params = search.best_params_

    print(f"Best Parameters for target {target}: {best_params}")

    # The same seeded splitter yields the same folds on every split() call.
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)

    for n_features in range(len(selected_features), 0, -1):
        X_train_rfe = X_train[:, :n_features]
        X_test_rfe = X_test[:, :n_features]
        best_model.fit(X_train_rfe, y_train)
        y_pred_test = best_model.predict(X_test_rfe)
        y_pred_train = best_model.predict(X_train_rfe)

        # Compute each metric once and reuse it in both records below.
        scores = {
            'RMSE_test': np.sqrt(mean_squared_error(y_test, y_pred_test)),
            'R²_test': r2_score(y_test, y_pred_test),
            'MAE_test': mean_absolute_error(y_test, y_pred_test),
            'RMSE_train': np.sqrt(mean_squared_error(y_train, y_pred_train)),
            'R²_train': r2_score(y_train, y_pred_train),
            'MAE_train': mean_absolute_error(y_train, y_pred_train),
        }

        # Performance record.
        local_results.append({
            'Model': 'Gradient Boosting',
            'Target': target,
            'n_features': n_features,
            **scores,
        })

        # Observed and predicted values, followed by the same metrics.
        local_predictions.append({
            'Model': 'Gradient Boosting',
            'Target': target,
            'n_features': n_features,
            'y_test': y_test.tolist(),
            'y_pred_test': y_pred_test.tolist(),
            'y_train': y_train.tolist(),
            'y_pred_train': y_pred_train.tolist(),
            **scores,
        })

        # 5-fold cross-validation on the training part.
        for fold, (train_idx, test_idx) in enumerate(kfold.split(X_train_rfe)):
            X_train_fold, X_test_fold = X_train_rfe[train_idx], X_train_rfe[test_idx]
            y_train_fold, y_test_fold = y_train[train_idx], y_train[test_idx]
            best_model.fit(X_train_fold, y_train_fold)
            y_pred_fold = best_model.predict(X_test_fold)
            local_cross_val_results.append({
                'Model': 'Gradient Boosting',
                'Target': target,
                'n_features': n_features,
                'Fold': fold + 1,
                'CV_RMSE': np.sqrt(mean_squared_error(y_test_fold, y_pred_fold)),
                'CV_R²': r2_score(y_test_fold, y_pred_fold),
                'CV_MAE': mean_absolute_error(y_test_fold, y_pred_fold)
            })

    return local_results, local_cross_val_results, local_predictions

# Accumulators across all targets.
results = []
cross_val_results = []
predictions = []

# Number of worker threads.
num_cores = 1

# Each target is evaluated with its own ordered feature list.
target_feature_pairs = [
    ('ST5', ['RH10',  'SG15', 'AT10','PRR']),
    ('SVWC5', ['RH10', 'PRR',  'SG15', 'VP2']),
]

# Submit one job per (target, feature list) and gather in submission order.
with ThreadPoolExecutor(max_workers=num_cores) as executor:
    futures = [executor.submit(process_target, name, feats) for name, feats in target_feature_pairs]
    for future in futures:
        local_results, local_cross_val_results, local_predictions = future.result()
        results += local_results
        cross_val_results += local_cross_val_results
        predictions += local_predictions

# Convert the collected records to data frames.
results_df = pd.DataFrame(results)
cross_val_df = pd.DataFrame(cross_val_results)
predictions_df = pd.DataFrame(predictions)

# Export every frame to its own sheet of one workbook.
output_dir = r'D:\ML数据集\聚类分析计算结果'
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, 'GBDT模型计算结果.xlsx')

sheets = {
    'Model_Performance': results_df,
    'Cross_Validation_Results': cross_val_df,
    'Predictions': predictions_df,
}
with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
    for sheet_name, frame in sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

# Stop the timer and report.
end_time = time.time()
total_time = end_time - start_time

print("模型计算完成")
print(f"总运行时间: {total_time:.2f}秒")
print(f"结果已保存到: {output_path}")


In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from xgboost import XGBRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from concurrent.futures import ThreadPoolExecutor
import time

# Record the start time so the total runtime can be reported at the end of the cell
start_time = time.time()

# Load the training data
file_path = r"C:\Users\SS\Desktop\论文\Machine learning\Machine learning 数据集\无道砟训练集\有道砟训练集.xlsx"
df = pd.read_excel(file_path)

# Make sure the 'date' column is parsed as datetime
df['date'] = pd.to_datetime(df['date'])

# Candidate feature columns and target columns
features = ['WS2', 'WS10', 'AT2', 'RH2', 'VP2', 'AT10', 'RH10', 'VP10', 'AP', 'PR', 'PRR', 'DR', 'UR', 'DLR', 'ULR', 'Rn', 'ALB', 'TNR', 'SG5', 'SG15']
target_variables = ['ST5', 'SVWC5']

# Drop the date column so that only numeric columns reach the imputer
df = df.drop(columns=['date'])

# Fill missing values with the column mean.
# NOTE(review): the imputer and the scalers below are fitted on the full data
# set, i.e. before process_target performs its train/test split, so statistics
# of the held-out rows leak into the preprocessing. Confirm this is acceptable.
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# Min-max scale all features and write them back into df_imputed.
# process_target reads its inputs from df_imputed, so without the write-back
# the models would silently be trained on the unscaled features.
scaler_X = MinMaxScaler()
X_scaled = scaler_X.fit_transform(df_imputed[features])
df_imputed[features] = X_scaled

# Min-max scale each target with its own scaler. The fitted scalers are kept in
# target_scalers so any target can be inverse-transformed later; scaler_y still
# ends up bound to the scaler of the last target, as before.
target_scalers = {}
for target in target_variables:
    scaler_y = MinMaxScaler()
    df_imputed[target] = scaler_y.fit_transform(df_imputed[target].values.reshape(-1, 1))
    target_scalers[target] = scaler_y

# Hyperparameter search space
param_grid = {
    'n_estimators': [100, 200, 300],  # number of boosted trees
    'learning_rate': [0.01, 0.1, 0.2],  # shrinkage applied to each tree
    'max_depth': [3, 5, 7],  # maximum depth of each tree
    'min_child_weight': [1, 3, 5],  # minimum sum of instance weight required in a child
    'subsample': [0.6, 0.8, 1.0],  # row sampling ratio per tree
    'colsample_bytree': [0.6, 0.8, 1.0]  # column sampling ratio per tree
}

# Base model and exhaustive grid search (3-fold CV, scored by negative MSE)
base_model = XGBRegressor(random_state=42, objective='reg:squarederror')
grid_search = GridSearchCV(estimator=base_model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=3, n_jobs=-1)

def process_target(target, selected_features):
    """Tune, evaluate and cross-validate an XGBoost regressor for one target.

    Steps:
      1. Split ``df_imputed[selected_features]`` / ``df_imputed[target]`` 80/20
         with ``random_state=42``.
      2. Fit a fresh copy of the module-level ``grid_search`` on the training
         split and keep its best estimator. The hyperparameters are tuned once,
         on the full feature list, and reused for every feature subset.
      3. For ``n = len(selected_features) .. 1`` keep only the first ``n``
         columns of ``selected_features`` (features are dropped from the end of
         the list; no importance ranking is computed here), refit the tuned
         model and record train/test RMSE, R² and MAE plus the raw predictions.
      4. For every subset, run a 5-fold shuffled CV on the training split and
         record RMSE, R² and MAE per fold.

    Feature and target columns are read from ``df_imputed`` exactly as the
    module-level preprocessing left them; nothing is rescaled or
    inverse-transformed inside this function.

    Args:
        target: Name of the target column in ``df_imputed``.
        selected_features: Ordered list of feature column names; the order
            decides which features survive in the smaller subsets.

    Returns:
        Tuple ``(local_results, local_cross_val_results, local_predictions)``,
        each a list of dicts (one per feature subset, or per subset and fold
        for the cross-validation list).
    """
    # Local import: `clone` is not part of this cell's import block.
    from sklearn.base import clone

    def _scores(y_true, y_pred, suffix):
        # RMSE / R² / MAE for one split, keyed with the given suffix.
        return {
            f'RMSE_{suffix}': np.sqrt(mean_squared_error(y_true, y_pred)),
            f'R²_{suffix}': r2_score(y_true, y_pred),
            f'MAE_{suffix}': mean_absolute_error(y_true, y_pred),
        }

    model_name = 'XGBoost'
    local_results = []
    local_cross_val_results = []
    local_predictions = []

    y = df_imputed[target].values.ravel()
    X_selected = df_imputed[selected_features].values
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

    # Search on a private copy so calls running in other worker threads never
    # share (and overwrite) the fitted state of the module-level grid_search.
    search = clone(grid_search)
    search.fit(X_train, y_train)
    best_model = search.best_estimator_
    best_params = search.best_params_

    print(f"Best Parameters for target {target}: {best_params}")

    # An integer random_state makes every split() call yield the same folds,
    # so one splitter can serve all feature subsets.
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)

    for n_features in range(len(selected_features), 0, -1):
        X_train_rfe = X_train[:, :n_features]
        X_test_rfe = X_test[:, :n_features]

        # Fit an unfitted copy with the tuned hyperparameters instead of
        # refitting best_model in place.
        subset_model = clone(best_model)
        subset_model.fit(X_train_rfe, y_train)
        y_pred_test = subset_model.predict(X_test_rfe)
        y_pred_train = subset_model.predict(X_train_rfe)

        run_id = {'Model': model_name, 'Target': target, 'n_features': n_features}
        # Computed once and shared by the performance and prediction records.
        metrics = {**_scores(y_test, y_pred_test, 'test'), **_scores(y_train, y_pred_train, 'train')}

        # Model performance
        local_results.append({**run_id, **metrics})

        # Predictions and ground truth
        local_predictions.append({
            **run_id,
            'y_test': y_test.tolist(),
            'y_pred_test': y_pred_test.tolist(),
            'y_train': y_train.tolist(),
            'y_pred_train': y_pred_train.tolist(),
            **metrics,
        })

        # 5-fold cross-validation on the training split
        for fold, (train_idx, test_idx) in enumerate(kfold.split(X_train_rfe)):
            fold_model = clone(best_model)
            fold_model.fit(X_train_rfe[train_idx], y_train[train_idx])
            y_test_fold = y_train[test_idx]
            y_pred_fold = fold_model.predict(X_train_rfe[test_idx])
            local_cross_val_results.append({
                **run_id,
                'Fold': fold + 1,
                'CV_RMSE': np.sqrt(mean_squared_error(y_test_fold, y_pred_fold)),
                'CV_R²': r2_score(y_test_fold, y_pred_fold),
                'CV_MAE': mean_absolute_error(y_test_fold, y_pred_fold)
            })

    return local_results, local_cross_val_results, local_predictions

# Module-level accumulators that collect the output of every process_target call
results = []
cross_val_results = []
predictions = []

# Number of worker threads given to the pool
num_cores = 1

# Submit one process_target task per (target, feature list) pair.
# With max_workers=num_cores=1 the pool has a single worker thread.
# The order inside each feature list matters: process_target evaluates the
# first n features of the list for n = len(list) .. 1.
with ThreadPoolExecutor(max_workers=num_cores) as executor:
    target_feature_pairs = [
        ('ST5', ['RH10',  'SG15', 'AT10','PRR']),  
        ('SVWC5', ['RH10', 'PRR',  'SG15', 'VP2']), 
    ]
    futures = [executor.submit(process_target, target, features) for target, features in target_feature_pairs]
    # Collect in submission order; future.result() re-raises any exception raised in the worker
    for future in futures:
        local_results, local_cross_val_results, local_predictions = future.result()
        results.extend(local_results)
        cross_val_results.extend(local_cross_val_results)
        predictions.extend(local_predictions)

# Turn the collected records into data frames
results_df = pd.DataFrame(results)
cross_val_df = pd.DataFrame(cross_val_results)
predictions_df = pd.DataFrame(predictions)

# Save the results as an Excel workbook (the output folder is created if missing)
output_dir = r'D:\ML数据集\聚类分析计算结果'
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, 'XGBoost模型计算结果.xlsx')

# One sheet per table
with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
    results_df.to_excel(writer, sheet_name='Model_Performance', index=False)
    cross_val_df.to_excel(writer, sheet_name='Cross_Validation_Results', index=False)
    predictions_df.to_excel(writer, sheet_name='Predictions', index=False)

# Record the end time and report the total runtime
end_time = time.time()
total_time = end_time - start_time

print("模型计算完成")
print(f"总运行时间: {total_time:.2f}秒")
print(f"结果已保存到: {output_path}")


In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from concurrent.futures import ThreadPoolExecutor
import time

# Record the start time so the total runtime can be reported at the end of the cell
start_time = time.time()

# Load the training data
file_path = r"C:\Users\SS\Desktop\论文\Machine learning\Machine learning 数据集\无道砟训练集\有道砟训练集.xlsx"
df = pd.read_excel(file_path)

# Make sure the 'date' column is parsed as datetime
df['date'] = pd.to_datetime(df['date'])

# Candidate feature columns and target columns
features = ['WS2', 'WS10', 'AT2', 'RH2', 'VP2', 'AT10', 'RH10', 'VP10', 'AP', 'PR', 'PRR', 'DR', 'UR', 'DLR', 'ULR', 'Rn', 'ALB', 'TNR', 'SG5', 'SG15']
target_variables = ['ST5', 'SVWC5']

# Drop the date column so that only numeric columns reach the imputer
df = df.drop(columns=['date'])

# Fill missing values with the column mean.
# NOTE(review): the imputer and the scalers below are fitted on the full data
# set, i.e. before process_target performs its train/test split, so statistics
# of the held-out rows leak into the preprocessing. Confirm this is acceptable.
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# Min-max scale all features and write them back into df_imputed.
# process_target reads its inputs from df_imputed, so without the write-back
# the network would silently be trained on the unscaled features.
scaler_X = MinMaxScaler()
X_scaled = scaler_X.fit_transform(df_imputed[features])
df_imputed[features] = X_scaled

# Min-max scale each target with its own scaler. The fitted scalers are kept in
# target_scalers so any target can be inverse-transformed later; scaler_y still
# ends up bound to the scaler of the last target, as before.
target_scalers = {}
for target in target_variables:
    scaler_y = MinMaxScaler()
    df_imputed[target] = scaler_y.fit_transform(df_imputed[target].values.reshape(-1, 1))
    target_scalers[target] = scaler_y

# Hyperparameter search space
param_grid = {
    'hidden_layer_sizes': [(50, 50), (100,), (100, 50)],  # hidden-layer layout
    'activation': ['relu', 'tanh'],  # activation function
    'solver': ['adam', 'lbfgs'],  # optimizer
    'alpha': [0.0001, 0.001, 0.01],  # L2 regularization strength
    'learning_rate': ['constant', 'adaptive']  # learning-rate schedule
}

# Base model and exhaustive grid search (3-fold CV, scored by negative MSE)
base_model = MLPRegressor(max_iter=1000, random_state=42)
grid_search = GridSearchCV(estimator=base_model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=3, n_jobs=-1)

def process_target(target, selected_features):
    """Tune, evaluate and cross-validate an MLP regressor ("ANN") for one target.

    Steps:
      1. Split ``df_imputed[selected_features]`` / ``df_imputed[target]`` 80/20
         with ``random_state=42``.
      2. Fit a fresh copy of the module-level ``grid_search`` on the training
         split and keep its best estimator. The hyperparameters are tuned once,
         on the full feature list, and reused for every feature subset.
      3. For ``n = len(selected_features) .. 1`` keep only the first ``n``
         columns of ``selected_features`` (features are dropped from the end of
         the list; no importance ranking is computed here), refit the tuned
         model and record train/test RMSE, R² and MAE plus the raw predictions.
      4. For every subset, run a 5-fold shuffled CV on the training split and
         record RMSE, R² and MAE per fold.

    Feature and target columns are read from ``df_imputed`` exactly as the
    module-level preprocessing left them; nothing is rescaled or
    inverse-transformed inside this function.

    Args:
        target: Name of the target column in ``df_imputed``.
        selected_features: Ordered list of feature column names; the order
            decides which features survive in the smaller subsets.

    Returns:
        Tuple ``(local_results, local_cross_val_results, local_predictions)``,
        each a list of dicts (one per feature subset, or per subset and fold
        for the cross-validation list).
    """
    # Local import: `clone` is not part of this cell's import block.
    from sklearn.base import clone

    def _scores(y_true, y_pred, suffix):
        # RMSE / R² / MAE for one split, keyed with the given suffix.
        return {
            f'RMSE_{suffix}': np.sqrt(mean_squared_error(y_true, y_pred)),
            f'R²_{suffix}': r2_score(y_true, y_pred),
            f'MAE_{suffix}': mean_absolute_error(y_true, y_pred),
        }

    model_name = 'ANN'
    local_results = []
    local_cross_val_results = []
    local_predictions = []

    y = df_imputed[target].values.ravel()
    X_selected = df_imputed[selected_features].values
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

    # Search on a private copy so calls running in other worker threads never
    # share (and overwrite) the fitted state of the module-level grid_search.
    search = clone(grid_search)
    search.fit(X_train, y_train)
    best_model = search.best_estimator_
    best_params = search.best_params_

    print(f"Best Parameters for target {target}: {best_params}")

    # An integer random_state makes every split() call yield the same folds,
    # so one splitter can serve all feature subsets.
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)

    for n_features in range(len(selected_features), 0, -1):
        X_train_rfe = X_train[:, :n_features]
        X_test_rfe = X_test[:, :n_features]

        # Fit an unfitted copy with the tuned hyperparameters instead of
        # refitting best_model in place.
        subset_model = clone(best_model)
        subset_model.fit(X_train_rfe, y_train)
        y_pred_test = subset_model.predict(X_test_rfe)
        y_pred_train = subset_model.predict(X_train_rfe)

        run_id = {'Model': model_name, 'Target': target, 'n_features': n_features}
        # Computed once and shared by the performance and prediction records.
        metrics = {**_scores(y_test, y_pred_test, 'test'), **_scores(y_train, y_pred_train, 'train')}

        # Model performance
        local_results.append({**run_id, **metrics})

        # Predictions and ground truth
        local_predictions.append({
            **run_id,
            'y_test': y_test.tolist(),
            'y_pred_test': y_pred_test.tolist(),
            'y_train': y_train.tolist(),
            'y_pred_train': y_pred_train.tolist(),
            **metrics,
        })

        # 5-fold cross-validation on the training split
        for fold, (train_idx, test_idx) in enumerate(kfold.split(X_train_rfe)):
            fold_model = clone(best_model)
            fold_model.fit(X_train_rfe[train_idx], y_train[train_idx])
            y_test_fold = y_train[test_idx]
            y_pred_fold = fold_model.predict(X_train_rfe[test_idx])
            local_cross_val_results.append({
                **run_id,
                'Fold': fold + 1,
                'CV_RMSE': np.sqrt(mean_squared_error(y_test_fold, y_pred_fold)),
                'CV_R²': r2_score(y_test_fold, y_pred_fold),
                'CV_MAE': mean_absolute_error(y_test_fold, y_pred_fold)
            })

    return local_results, local_cross_val_results, local_predictions

# Module-level accumulators that collect the output of every process_target call
results = []
cross_val_results = []
predictions = []

# Number of worker threads given to the pool
num_cores = 1

# Submit one process_target task per (target, feature list) pair.
# With max_workers=num_cores=1 the pool has a single worker thread.
# The order inside each feature list matters: process_target evaluates the
# first n features of the list for n = len(list) .. 1.
with ThreadPoolExecutor(max_workers=num_cores) as executor:
    target_feature_pairs = [
        ('ST5', ['RH10',  'SG15', 'AT10','PRR']),  
        ('SVWC5', ['RH10', 'PRR',  'SG15', 'VP2']), 
    ]
    futures = [executor.submit(process_target, target, features) for target, features in target_feature_pairs]
    # Collect in submission order; future.result() re-raises any exception raised in the worker
    for future in futures:
        local_results, local_cross_val_results, local_predictions = future.result()
        results.extend(local_results)
        cross_val_results.extend(local_cross_val_results)
        predictions.extend(local_predictions)

# Turn the collected records into data frames
results_df = pd.DataFrame(results)
cross_val_df = pd.DataFrame(cross_val_results)
predictions_df = pd.DataFrame(predictions)

# Save the results as an Excel workbook (the output folder is created if missing)
output_dir = r'D:\ML数据集\聚类分析计算结果'
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, 'ANN模型计算结果.xlsx')

# One sheet per table
with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
    results_df.to_excel(writer, sheet_name='Model_Performance', index=False)
    cross_val_df.to_excel(writer, sheet_name='Cross_Validation_Results', index=False)
    predictions_df.to_excel(writer, sheet_name='Predictions', index=False)

# Record the end time and report the total runtime
end_time = time.time()
total_time = end_time - start_time

print("模型计算完成")
print(f"总运行时间: {total_time:.2f}秒")
print(f"结果已保存到: {output_path}")


In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from concurrent.futures import ThreadPoolExecutor
import time

# Record the start time so the total runtime can be reported at the end of the cell
start_time = time.time()

# Load the training data
file_path = r"C:\Users\SS\Desktop\论文\Machine learning\Machine learning 数据集\无道砟训练集\有道砟训练集.xlsx"
df = pd.read_excel(file_path)

# Make sure the 'date' column is parsed as datetime
df['date'] = pd.to_datetime(df['date'])

# Candidate feature columns and target columns
features = ['WS2', 'WS10', 'AT2', 'RH2', 'VP2', 'AT10', 'RH10', 'VP10', 'AP', 'PR', 'PRR', 'DR', 'UR', 'DLR', 'ULR', 'Rn', 'ALB', 'TNR', 'SG5', 'SG15']
target_variables = ['ST5', 'SVWC5']

# Drop the date column so that only numeric columns reach the imputer
df = df.drop(columns=['date'])

# Fill missing values with the column mean.
# NOTE(review): the imputer and the scalers below are fitted on the full data
# set, i.e. before process_target performs its train/test split, so statistics
# of the held-out rows leak into the preprocessing. Confirm this is acceptable.
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# Min-max scale all features and write them back into df_imputed.
# process_target reads its inputs from df_imputed, so without the write-back
# the distance-based KNN model would silently run on the unscaled features and
# be dominated by the columns with the largest numeric range.
scaler_X = MinMaxScaler()
X_scaled = scaler_X.fit_transform(df_imputed[features])
df_imputed[features] = X_scaled

# Min-max scale each target with its own scaler. The fitted scalers are kept in
# target_scalers so any target can be inverse-transformed later; scaler_y still
# ends up bound to the scaler of the last target, as before.
target_scalers = {}
for target in target_variables:
    scaler_y = MinMaxScaler()
    df_imputed[target] = scaler_y.fit_transform(df_imputed[target].values.reshape(-1, 1))
    target_scalers[target] = scaler_y

# Hyperparameter search space
param_grid = {
    'n_neighbors': [1, 2],  # number of neighbours
    'weights': ['uniform'],  # neighbour weighting
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],  # neighbour-search algorithm
    'p': [1, 2]  # Minkowski power: 1 = Manhattan distance, 2 = Euclidean distance
}

# Base model and exhaustive grid search (3-fold CV, scored by negative MSE)
base_model = KNeighborsRegressor()
grid_search = GridSearchCV(estimator=base_model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=3, n_jobs=-1)

def process_target(target, selected_features):
    """Tune, evaluate and cross-validate a KNN regressor for one target.

    Steps:
      1. Split ``df_imputed[selected_features]`` / ``df_imputed[target]`` 80/20
         with ``random_state=42``.
      2. Fit a fresh copy of the module-level ``grid_search`` on the training
         split and keep its best estimator. The hyperparameters are tuned once,
         on the full feature list, and reused for every feature subset.
      3. For ``n = len(selected_features) .. 1`` keep only the first ``n``
         columns of ``selected_features`` (features are dropped from the end of
         the list; no importance ranking is computed here), refit the tuned
         model and record train/test RMSE, R² and MAE plus the raw predictions.
      4. For every subset, run a 5-fold shuffled CV on the training split and
         record RMSE, R² and MAE per fold.

    Feature and target columns are read from ``df_imputed`` exactly as the
    module-level preprocessing left them; nothing is rescaled or
    inverse-transformed inside this function.

    Args:
        target: Name of the target column in ``df_imputed``.
        selected_features: Ordered list of feature column names; the order
            decides which features survive in the smaller subsets.

    Returns:
        Tuple ``(local_results, local_cross_val_results, local_predictions)``,
        each a list of dicts (one per feature subset, or per subset and fold
        for the cross-validation list).
    """
    # Local import: `clone` is not part of this cell's import block.
    from sklearn.base import clone

    def _scores(y_true, y_pred, suffix):
        # RMSE / R² / MAE for one split, keyed with the given suffix.
        return {
            f'RMSE_{suffix}': np.sqrt(mean_squared_error(y_true, y_pred)),
            f'R²_{suffix}': r2_score(y_true, y_pred),
            f'MAE_{suffix}': mean_absolute_error(y_true, y_pred),
        }

    model_name = 'KNN'
    local_results = []
    local_cross_val_results = []
    local_predictions = []

    y = df_imputed[target].values.ravel()
    X_selected = df_imputed[selected_features].values
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

    # Search on a private copy so calls running in other worker threads never
    # share (and overwrite) the fitted state of the module-level grid_search.
    search = clone(grid_search)
    search.fit(X_train, y_train)
    best_model = search.best_estimator_
    best_params = search.best_params_

    print(f"Best Parameters for target {target}: {best_params}")

    # An integer random_state makes every split() call yield the same folds,
    # so one splitter can serve all feature subsets.
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)

    for n_features in range(len(selected_features), 0, -1):
        X_train_rfe = X_train[:, :n_features]
        X_test_rfe = X_test[:, :n_features]

        # Fit an unfitted copy with the tuned hyperparameters instead of
        # refitting best_model in place.
        subset_model = clone(best_model)
        subset_model.fit(X_train_rfe, y_train)
        y_pred_test = subset_model.predict(X_test_rfe)
        y_pred_train = subset_model.predict(X_train_rfe)

        run_id = {'Model': model_name, 'Target': target, 'n_features': n_features}
        # Computed once and shared by the performance and prediction records.
        metrics = {**_scores(y_test, y_pred_test, 'test'), **_scores(y_train, y_pred_train, 'train')}

        # Model performance
        local_results.append({**run_id, **metrics})

        # Predictions and ground truth
        local_predictions.append({
            **run_id,
            'y_test': y_test.tolist(),
            'y_pred_test': y_pred_test.tolist(),
            'y_train': y_train.tolist(),
            'y_pred_train': y_pred_train.tolist(),
            **metrics,
        })

        # 5-fold cross-validation on the training split
        for fold, (train_idx, test_idx) in enumerate(kfold.split(X_train_rfe)):
            fold_model = clone(best_model)
            fold_model.fit(X_train_rfe[train_idx], y_train[train_idx])
            y_test_fold = y_train[test_idx]
            y_pred_fold = fold_model.predict(X_train_rfe[test_idx])
            local_cross_val_results.append({
                **run_id,
                'Fold': fold + 1,
                'CV_RMSE': np.sqrt(mean_squared_error(y_test_fold, y_pred_fold)),
                'CV_R²': r2_score(y_test_fold, y_pred_fold),
                'CV_MAE': mean_absolute_error(y_test_fold, y_pred_fold)
            })

    return local_results, local_cross_val_results, local_predictions

# Module-level accumulators that collect the output of every process_target call
results = []
cross_val_results = []
predictions = []

# Number of worker threads given to the pool
num_cores = 1

# Submit one process_target task per (target, feature list) pair.
# With max_workers=num_cores=1 the pool has a single worker thread.
# The order inside each feature list matters: process_target evaluates the
# first n features of the list for n = len(list) .. 1.
with ThreadPoolExecutor(max_workers=num_cores) as executor:
    target_feature_pairs = [
        ('ST5', ['RH10',  'SG15', 'AT10','PRR']),  
        ('SVWC5', ['RH10', 'PRR',  'SG15', 'VP2']), 
    ]
    futures = [executor.submit(process_target, target, features) for target, features in target_feature_pairs]
    # Collect in submission order; future.result() re-raises any exception raised in the worker
    for future in futures:
        local_results, local_cross_val_results, local_predictions = future.result()
        results.extend(local_results)
        cross_val_results.extend(local_cross_val_results)
        predictions.extend(local_predictions)

# Turn the collected records into data frames
results_df = pd.DataFrame(results)
cross_val_df = pd.DataFrame(cross_val_results)
predictions_df = pd.DataFrame(predictions)

# Save the results as an Excel workbook (the output folder is created if missing)
output_dir = r'D:\ML数据集\聚类分析计算结果'
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, 'KNN模型计算结果.xlsx')

# One sheet per table
with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
    results_df.to_excel(writer, sheet_name='Model_Performance', index=False)
    cross_val_df.to_excel(writer, sheet_name='Cross_Validation_Results', index=False)
    predictions_df.to_excel(writer, sheet_name='Predictions', index=False)

# Record the end time and report the total runtime
end_time = time.time()
total_time = end_time - start_time

print("模型计算完成")
print(f"总运行时间: {total_time:.2f}秒")
print(f"结果已保存到: {output_path}")


In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from concurrent.futures import ThreadPoolExecutor
import time

# Record the start time so the total runtime can be reported at the end of the cell
start_time = time.time()

# Load the training data
file_path = r"C:\Users\SS\Desktop\论文\Machine learning\Machine learning 数据集\无道砟训练集\有道砟训练集.xlsx"
df = pd.read_excel(file_path)

# Make sure the 'date' column is parsed as datetime
df['date'] = pd.to_datetime(df['date'])

# Candidate feature columns and target columns
features = ['WS2', 'WS10', 'AT2', 'RH2', 'VP2', 'AT10', 'RH10', 'VP10', 'AP', 'PR', 'PRR', 'DR', 'UR', 'DLR', 'ULR', 'Rn', 'ALB', 'TNR', 'SG5', 'SG15']
target_variables = ['ST5', 'SVWC5']

# Drop the date column so that only numeric columns reach the imputer
df = df.drop(columns=['date'])

# Fill missing values with the column mean.
# NOTE(review): the imputer and the scalers below are fitted on the full data
# set, i.e. before process_target performs its train/test split, so statistics
# of the held-out rows leak into the preprocessing. Confirm this is acceptable.
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# Min-max scale all features and write them back into df_imputed.
# process_target reads its inputs from df_imputed, so without the write-back
# the model would silently be trained on the unscaled features.
scaler_X = MinMaxScaler()
X_scaled = scaler_X.fit_transform(df_imputed[features])
df_imputed[features] = X_scaled

# Min-max scale each target with its own scaler. The fitted scalers are kept in
# target_scalers so any target can be inverse-transformed later; scaler_y still
# ends up bound to the scaler of the last target, as before.
target_scalers = {}
for target in target_variables:
    scaler_y = MinMaxScaler()
    df_imputed[target] = scaler_y.fit_transform(df_imputed[target].values.reshape(-1, 1))
    target_scalers[target] = scaler_y

# Model definition
base_model = LinearRegression()
# Note: no hyperparameters are tuned for the linear model, so GridSearchCV is
# not used here; cross-validation is done directly with KFold.

def process_target(target, selected_features):
    """Evaluate and cross-validate a linear regression for one target.

    Steps:
      1. Split ``df_imputed[selected_features]`` / ``df_imputed[target]`` 80/20
         with ``random_state=42``.
      2. For ``n = len(selected_features) .. 1`` keep only the first ``n``
         columns of ``selected_features`` (features are dropped from the end of
         the list; no importance ranking is computed here), fit a copy of the
         module-level ``base_model`` and record train/test RMSE, R² and MAE
         plus the raw predictions.
      3. For every subset, run a 5-fold shuffled CV on the training split and
         record RMSE, R² and MAE per fold.

    Feature and target columns are read from ``df_imputed`` exactly as the
    module-level preprocessing left them; nothing is rescaled or
    inverse-transformed inside this function.

    Args:
        target: Name of the target column in ``df_imputed``.
        selected_features: Ordered list of feature column names; the order
            decides which features survive in the smaller subsets.

    Returns:
        Tuple ``(local_results, local_cross_val_results, local_predictions)``,
        each a list of dicts (one per feature subset, or per subset and fold
        for the cross-validation list).
    """
    # Local import: `clone` is not part of this cell's import block.
    from sklearn.base import clone

    def _scores(y_true, y_pred, suffix):
        # RMSE / R² / MAE for one split, keyed with the given suffix.
        return {
            f'RMSE_{suffix}': np.sqrt(mean_squared_error(y_true, y_pred)),
            f'R²_{suffix}': r2_score(y_true, y_pred),
            f'MAE_{suffix}': mean_absolute_error(y_true, y_pred),
        }

    model_name = 'Linear Regression'
    local_results = []
    local_cross_val_results = []
    local_predictions = []

    y = df_imputed[target].values.ravel()
    X_selected = df_imputed[selected_features].values
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

    # An integer random_state makes every split() call yield the same folds,
    # so one splitter can serve all feature subsets.
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)

    for n_features in range(len(selected_features), 0, -1):
        X_train_rfe = X_train[:, :n_features]
        X_test_rfe = X_test[:, :n_features]

        # Fit a private copy: the module-level base_model is shared by every
        # call and must not be refitted in place. (The former up-front fit on
        # the full feature list was immediately overwritten by the first
        # iteration of this loop and has been dropped.)
        subset_model = clone(base_model)
        subset_model.fit(X_train_rfe, y_train)
        y_pred_test = subset_model.predict(X_test_rfe)
        y_pred_train = subset_model.predict(X_train_rfe)

        run_id = {'Model': model_name, 'Target': target, 'n_features': n_features}
        # Computed once and shared by the performance and prediction records.
        metrics = {**_scores(y_test, y_pred_test, 'test'), **_scores(y_train, y_pred_train, 'train')}

        # Model performance
        local_results.append({**run_id, **metrics})

        # Predictions and ground truth
        local_predictions.append({
            **run_id,
            'y_test': y_test.tolist(),
            'y_pred_test': y_pred_test.tolist(),
            'y_train': y_train.tolist(),
            'y_pred_train': y_pred_train.tolist(),
            **metrics,
        })

        # 5-fold cross-validation on the training split
        for fold, (train_idx, test_idx) in enumerate(kfold.split(X_train_rfe)):
            fold_model = clone(base_model)
            fold_model.fit(X_train_rfe[train_idx], y_train[train_idx])
            y_test_fold = y_train[test_idx]
            y_pred_fold = fold_model.predict(X_train_rfe[test_idx])
            local_cross_val_results.append({
                **run_id,
                'Fold': fold + 1,
                'CV_RMSE': np.sqrt(mean_squared_error(y_test_fold, y_pred_fold)),
                'CV_R²': r2_score(y_test_fold, y_pred_fold),
                'CV_MAE': mean_absolute_error(y_test_fold, y_pred_fold)
            })

    return local_results, local_cross_val_results, local_predictions

# Module-level accumulators that collect the output of every process_target call
results = []
cross_val_results = []
predictions = []

# Number of worker threads given to the pool
num_cores = 1

# Submit one process_target task per (target, feature list) pair.
# With max_workers=num_cores=1 the pool has a single worker thread.
# The order inside each feature list matters: process_target evaluates the
# first n features of the list for n = len(list) .. 1.
with ThreadPoolExecutor(max_workers=num_cores) as executor:
    target_feature_pairs = [
        ('ST5', ['RH10',  'SG15', 'AT10','PRR']),  
        ('SVWC5', ['RH10', 'PRR',  'SG15', 'VP2']), 
    ]
    futures = [executor.submit(process_target, target, features) for target, features in target_feature_pairs]
    # Collect in submission order; future.result() re-raises any exception raised in the worker
    for future in futures:
        local_results, local_cross_val_results, local_predictions = future.result()
        results.extend(local_results)
        cross_val_results.extend(local_cross_val_results)
        predictions.extend(local_predictions)

# Turn the collected records into data frames
results_df = pd.DataFrame(results)
cross_val_df = pd.DataFrame(cross_val_results)
predictions_df = pd.DataFrame(predictions)

# Save the results as an Excel workbook (the output folder is created if missing)
output_dir = r'D:\ML数据集\聚类分析计算结果'
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, '线性回归模型计算结果.xlsx')

# One sheet per table
with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
    results_df.to_excel(writer, sheet_name='Model_Performance', index=False)
    cross_val_df.to_excel(writer, sheet_name='Cross_Validation_Results', index=False)
    predictions_df.to_excel(writer, sheet_name='Predictions', index=False)

# Record the end time and report the total runtime
end_time = time.time()
total_time = end_time - start_time

print("模型计算完成")
print(f"总运行时间: {total_time:.2f}秒")
print(f"结果已保存到: {output_path}")


In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from alibi.explainers import ALE
import matplotlib.pyplot as plt
import time

# Record the start time so the total runtime can be reported at the end of the cell
start_time = time.time()

# Load the data
file_path = r"C:\Users\SS\Desktop\论文\Machine learning\Machine learning 数据集\无道砟训练集\有道砟训练集.xlsx"
df = pd.read_excel(file_path)

# Make sure the 'date' column is parsed as datetime
df['date'] = pd.to_datetime(df['date'])

# Feature list used for each target.
# Note that each target's list includes the other target as a predictor.
target_features_map = {
    'ST5': ['AT10', 'RH10', 'SG15', 'PRR', 'SVWC5'],
    'SVWC5': ['PRR', 'SG15', 'VP2', 'RH10', 'ST5']
}

# Drop the date column so that only numeric columns reach the imputer
df = df.drop(columns=['date'])

# Fill missing values with the column mean
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# Collected ALE curves, keyed by '<target>_<feature>'
ale_plot_data = {}

# For every target: train a model, compute ALE and keep the curve data
for target, features in target_features_map.items():
    # Split the data; no normalization is applied in this cell.
    # X_test / y_test are not used further below.
    X_train, X_test, y_train, y_test = train_test_split(df_imputed[features], df_imputed[target], test_size=0.2, random_state=42)
    
    # Convert X_train to a numpy array
    X_train_np = X_train.values
    
    # Train the random forest model
    model = RandomForestRegressor(n_estimators=100, max_depth=20, random_state=42)
    model.fit(X_train_np, y_train)
    
    # Compute ALE on the training data
    ale_explainer = ALE(model.predict, feature_names=features)
    ale_result = ale_explainer.explain(X_train_np)
    
    # Store and plot the curve of every feature
    for i, feature in enumerate(features):
        # presumably the grid points and ALE values of feature i — confirm
        # against the alibi ALE explanation schema; np.ravel flattens both to 1-D
        feature_values = np.ravel(ale_result.feature_values[i])
        ale_values = np.ravel(ale_result.ale_values[i])
        plot_data = pd.DataFrame({
            f'{feature}_values': feature_values,
            f'{feature}_ale': ale_values
        })
        ale_plot_data[f'{target}_{feature}'] = plot_data
    
        # Draw the ALE curve
        plt.figure(figsize=(8, 6))
        plt.plot(feature_values, ale_values)
        plt.title(f'ALE Plot for {target} - {feature}')
        plt.xlabel(f'{feature}')
        plt.ylabel('ALE')
        plt.grid(True)
        plt.show()

# Save the ALE curve data to an xlsx workbook (the output folder is created if missing)
output_dir = r'D:\ML数据集\聚类分析计算结果\ALE计算结果'
os.makedirs(output_dir, exist_ok=True)
output_xlsx_path = os.path.join(output_dir, 'ale_plot_data.xlsx')

# One sheet per '<target>_<feature>' curve
with pd.ExcelWriter(output_xlsx_path) as writer:
    for sheet_name, data in ale_plot_data.items():
        data.to_excel(writer, sheet_name=sheet_name, index=False)

# Record the end time and report the total runtime
end_time = time.time()
total_time = end_time - start_time

print(f"ALE绘图数据已保存为Excel文件：{output_xlsx_path}")
print(f"总运行时间: {total_time:.2f}秒")
# Summary statistics of the imputed RH10 column
print(df_imputed['RH10'].describe())


In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Load the data set
file_path = r"D:\ML数据集\CPE\CPE_事件数据\CPE_data.xlsx"
df = pd.read_excel(file_path)

# Predictor columns and the target column
features = ['VP2', 'RH10', 'APR', 'HF15']
target_variable = 'SVWC5'

# Split into 70% train / 30% test (no imputation or scaling is applied in this cell)
X_train, X_test, y_train, y_test = train_test_split(
    df[features], 
    df[target_variable], 
    test_size=0.3, 
    random_state=42
)

# Train the random forest model (default hyperparameters, fixed seed)
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)
import matplotlib.pyplot as plt
from sklearn.inspection import PartialDependenceDisplay
from matplotlib import font_manager

# Optional custom font.
# Make sure the font is installed on the system before enabling this.
# font_path = 'C:/Windows/Fonts/Arial.ttf'  # change to the font file you want
# font_prop = font_manager.FontProperties(fname=font_path)

# Global font size and family
plt.rcParams['font.size'] = 16
plt.rcParams['font.family'] = 'Arial'  # change as needed

# Font sizes for axis labels, titles and tick labels
label_fontsize = 16
title_fontsize = 16
tick_fontsize = 16

# One PDP + ICE figure per feature
for feature in features:
    fig, ax = plt.subplots(figsize=(8, 6), dpi=300)

    # Named pdp_display (not `display`) so the notebook's built-in display()
    # helper is not shadowed for the cells that follow.
    pdp_display = PartialDependenceDisplay.from_estimator(
        model,
        X_train,
        features=[feature],
        kind="both",
        ice_lines_kw={'color': 'lightblue', 'alpha': 0.3},  # ICE line style
        pd_line_kw={'color': '#4682B4', 'linewidth': 2},    # PDP line style
        ax=ax,
        line_kw={'label': 'PDP'},  # legend label for the PDP line
    )

    # When a single Axes is passed, scikit-learn treats it as a bounding box,
    # switches it off and draws the curves into a new Axes stored in
    # pdp_display.axes_. Labels, ticks and legend must be styled on that inner
    # Axes; setting them on `ax` has no visible effect.
    pd_ax = pdp_display.axes_.ravel()[0]

    # Title and axis labels
    pd_ax.set_title(f'Partial Dependence and ICE for {feature}', fontsize=title_fontsize)
    pd_ax.set_xlabel(f'{feature}', fontsize=label_fontsize)
    pd_ax.set_ylabel('Partial Dependence', fontsize=label_fontsize)

    # Tick label size
    pd_ax.tick_params(axis='both', which='major', labelsize=tick_fontsize)

    # Legend
    pd_ax.legend(fontsize=label_fontsize)

    # Tighten the layout and render
    plt.tight_layout()
    plt.show()


In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import PartialDependenceDisplay
import matplotlib.pyplot as plt
import seaborn as sns
import time

# Record the start time so the total runtime can be reported at the end of the cell
start_time = time.time()

# Load the data
file_path = r"C:\Users\SS\Desktop\论文\Machine learning\Machine learning 数据集\无道砟训练集\有道砟训练集.xlsx"
df = pd.read_excel(file_path)

# Show the column names
print(df.columns)

# Require a 'date' column and parse it as datetime; fail loudly otherwise
if 'date' in df.columns:
    df['date'] = pd.to_datetime(df['date'])
else:
    raise KeyError("'date' 列不存在于数据集中")

# Predictor columns and target columns used for the PDP analysis
features = ['PRR', 'AT10', 'RH10', 'SG15']
target_variables = ['ST5']

# Drop datetime columns, then fill missing values with the column mean
df = df.select_dtypes(exclude=['datetime64'])
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# Model definition
model = RandomForestRegressor(random_state=42, n_estimators=100, max_depth=20)

def process_target(target):
    """Train a random forest for one target and return it with the held-out features.

    The module-level ``model`` only serves as a template: an unfitted copy with
    the same hyperparameters is fitted for every call, so models trained for
    different targets never share (and overwrite) one estimator object.

    Args:
        target: Name of the target column in ``df_imputed``.

    Returns:
        Tuple ``(fitted_model, X_test)`` where ``X_test`` is the 20% hold-out
        part of ``df_imputed[features]`` as a NumPy array.
    """
    # Local import: `clone` is not part of this cell's import block.
    from sklearn.base import clone

    X = df_imputed[features].values
    y = df_imputed[target].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train a private copy of the template model
    fitted_model = clone(model)
    fitted_model.fit(X_train, y_train)

    return fitted_model, X_test

# Per-target containers for the trained models and their held-out feature rows
trained_models = {}
X_tests = {}

# Train one model per target variable.
# Note: the loop rebinds the module-level names `model` and `X_test`.
for target in target_variables:
    model, X_test = process_target(target)
    trained_models[target] = model
    X_tests[target] = X_test

# Helper that draws one PDP curve and stores its data
def plot_single_pdp(model, df, feature, target, writer):
    """Plot the partial dependence of ``target`` on one feature and save the curve.

    scikit-learn computes and draws the curve first; the x/y data of the drawn
    line are then copied into a table, the axes are cleared and the curve is
    redrawn with seaborn. The table is written to ``writer`` on a sheet named
    ``'<target>_<feature>_PDP'``.

    Args:
        model: Fitted estimator handed to ``PartialDependenceDisplay.from_estimator``.
        df: Data the partial dependence is evaluated on.
        feature: Name of the feature to analyse (looked up in the module-level
            ``features`` list).
        target: Target name; used only for the plot title and the sheet name.
        writer: Open ``pd.ExcelWriter`` receiving the curve data.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    pdp_display = PartialDependenceDisplay.from_estimator(
        model,
        df,
        [feature],
        feature_names=features,
        grid_resolution=1000,
        ax=ax,
    )

    # Pull the curve out of the line object scikit-learn has just drawn
    curve = pdp_display.lines_[0][0]
    pdp_df = pd.DataFrame({
        'Quantiles': curve.get_xdata(),
        'PDP': curve.get_ydata()
    })

    # Redraw on the current axes: wipe scikit-learn's rendering, then plot with seaborn
    current_ax = plt.gca()
    current_ax.cla()
    sns.lineplot(x='Quantiles', y='PDP', data=pdp_df, ax=current_ax)
    current_ax.set_title(f'PDP for {feature} (Target: {target})')
    current_ax.set_xlabel(feature)
    current_ax.set_ylabel('PDP')
    current_ax.grid(True)
    plt.show()

    # Store the curve data in the workbook
    pdp_df.to_excel(writer, sheet_name=f'{target}_{feature}_PDP', index=False)

# Destination workbook for the PDP curve data
output_file = r"D:\ML数据集\聚类分析计算PKL  ALE PDP结果\PDP计算结果\PDP_STresults.xlsx"

# Create the output folder first; otherwise the workbook cannot be written
# when the folder does not exist yet.
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Write one sheet per (target, feature) pair into a single workbook
with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
    for target in target_variables:
        model = trained_models[target]
        X_test = X_tests[target]
        for feature in features:
            # The partial dependence is evaluated on the full imputed feature
            # table, not on the held-out X_test rows.
            plot_single_pdp(model, df_imputed[features], feature, target, writer)

# Record the end time and report the total runtime
end_time = time.time()
total_time = end_time - start_time

print("PDP计算完成并保存至Excel文件")
print(f"总运行时间: {total_time:.2f}秒")


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import shap
import pandas as pd
from sklearn.impute import SimpleImputer

# Prevent shap from checking for transformers models: the three helpers in
# shap.utils.transformers are replaced with stubs that always return False.
# NOTE(review): this monkeypatches shap internals, presumably as a workaround
# for the installed shap/transformers combination — confirm it is still needed.
shap.utils.transformers.is_transformers_model = lambda x: False
shap.utils.transformers.is_transformers_lm = lambda x: False
shap.utils.transformers.safe_isinstance = lambda obj, class_path_str: False

# Model inputs and the column to predict
features = ['VP2', 'RH10', 'PRR', 'SG15']
target_variable = 'SVWC5'

# Load the workbook
file_path = r"D:\ML数据集\CPE\CPE_事件数据\春季.xlsx"
df = pd.read_excel(file_path)

# Parse 'date' as datetime, then remove the column before imputation
df['date'] = pd.to_datetime(df['date'])
df = df.drop('date', axis=1)

# Mean-impute the remaining columns and restore their labels
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(data=imputer.fit_transform(df), columns=df.columns)

# 70/30 train/test split on the un-normalised feature values
X_train, X_test, y_train, y_test = train_test_split(
    df_imputed[features],
    df_imputed[target_variable],
    random_state=42,
    test_size=0.3,
)

# Random forest regressor fitted on the training split
model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Compute SHAP values on the training set
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_train)

# Pick the two features of interest and their positions in `features`
feature_1 = 'VP2'
feature_2 = 'RH10'
feature_1_index = features.index(feature_1)
feature_2_index = features.index(feature_2)

# Build a 50 x 50 grid spanning each feature's observed training range
feature_1_values = np.linspace(X_train[feature_1].min(), X_train[feature_1].max(), 50)
feature_2_values = np.linspace(X_train[feature_2].min(), X_train[feature_2].max(), 50)
grid_x, grid_y = np.meshgrid(feature_1_values, feature_2_values)

# For every grid point: pin both features to the grid values for all training
# rows, then store the sum of the two features' mean SHAP values.
shap_interaction_grid = np.zeros_like(grid_x)

# One working copy is enough: both pinned columns are overwritten on every
# iteration and no other column is modified.
temp = X_train.copy()
for i in range(grid_x.shape[0]):
    for j in range(grid_x.shape[1]):
        temp[feature_1] = grid_x[i, j]
        temp[feature_2] = grid_y[i, j]
        # Evaluate the explainer once per grid point and reuse the result for
        # both features (it was previously evaluated twice).
        mean_shap = explainer.shap_values(temp).mean(axis=0)
        shap_interaction_grid[i, j] = mean_shap[feature_1_index] + mean_shap[feature_2_index]

# Filled contour plot of the grid, with a colour bar
plt.contourf(grid_x, grid_y, shap_interaction_grid, cmap='viridis')
plt.colorbar(label="SHAP Value")
plt.gca().set(
    xlabel=feature_1,
    ylabel=feature_2,
    title=f"SHAP Interaction for {feature_1} and {feature_2}",
)
plt.show()


In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.inspection import partial_dependence
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer

# Model inputs and the column to predict
features = ['VP2', 'RH10', 'PRR', 'SG15']
target_variable = 'SVWC5'

# Load the workbook
file_path = r"D:\ML数据集\CPE\CPE_事件数据\春季.xlsx"
df = pd.read_excel(file_path)

# Parse 'date' as datetime, then remove the column before imputation
df['date'] = pd.to_datetime(df['date'])
df = df.drop('date', axis=1)

# Mean-impute the remaining columns and restore their labels
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(data=imputer.fit_transform(df), columns=df.columns)

# 70/30 train/test split on the un-normalised feature values
X_train, X_test, y_train, y_test = train_test_split(
    df_imputed[features],
    df_imputed[target_variable],
    random_state=42,
    test_size=0.3,
)

# Random forest regressor fitted on the training split
model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Feature pair (positions in `features`) for the two-way partial dependence.
# The axis labels and title below are derived from it so they cannot drift
# from what is actually plotted.
feature_pair = (0, 1)
feature_names = [features[feature_pair[0]], features[feature_pair[1]]]

# Compute the partial dependence on a 50 x 50 grid
pdp_result = partial_dependence(
    model, X_train, features=[feature_pair], grid_resolution=50
)

# Grid coordinates: first feature along x, second feature along y
XX, YY = np.meshgrid(pdp_result['grid_values'][0], pdp_result['grid_values'][1])
Z = pdp_result['average'][0].T  # transpose so rows follow y, matching the meshgrid layout

# Tabular form of the surface: rows indexed by y values, columns by x values
df_grid = pd.DataFrame(Z, index=YY[:, 0], columns=XX[0, :])

# Plot the surface as a visual check of the computed partial dependence
plt.contourf(XX, YY, Z, cmap='viridis')
plt.colorbar(label="Partial Dependence")
plt.xlabel(feature_names[0])
plt.ylabel(feature_names[1])
plt.title(f'2D Partial Dependence Plot for {feature_names[0]} and {feature_names[1]}')
plt.show()


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from scipy.ndimage import gaussian_filter1d

# Model inputs and the column to predict
features = ['AT10', 'RH10', 'APR', 'HF15']
target_variable = 'ST5'

# Load the dataset
file_path = r"D:\ML数据集\CPE\CPE_事件数据\CPE_data.xlsx"
df = pd.read_excel(file_path)

# 70/30 train/test split
X_train, X_test, y_train, y_test = train_test_split(
    df[features], df[target_variable], random_state=42, test_size=0.3
)

# Random forest regressor fitted on the training split
model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Global matplotlib font size and family (change as needed)
plt.rcParams.update({'font.size': 16, 'font.family': 'Arial'})

# Font sizes for axis labels, titles and tick labels
label_fontsize = title_fontsize = tick_fontsize = 16

# Draw one derivative-ICE figure per feature
for feature in features:
    fig, ax = plt.subplots(figsize=(8, 6), dpi=300)

    # 100 evenly spaced values across the feature's observed training range
    feature_values = np.linspace(X_train[feature].min(), X_train[feature].max(), 100)

    # ICE matrix, computed once for all samples: row k holds the model's
    # predictions for every training row with `feature` pinned to
    # feature_values[k]. These predictions do not depend on which sample's
    # curve is being extracted, so there is no need to recompute them per sample.
    X_temp = X_train.copy()
    ice_values = []
    for val in feature_values:
        X_temp[feature] = val
        ice_values.append(model.predict(X_temp))
    ice_values = np.array(ice_values)  # shape: (len(feature_values), n_samples)

    # Smooth each sample's ICE curve along the feature-value axis
    smoothed_ice = gaussian_filter1d(ice_values, sigma=2, axis=0)

    # Derivative of every smoothed curve with respect to the feature value;
    # the result keeps the (len(feature_values), n_samples) layout.
    derivatives = np.gradient(smoothed_ice, feature_values, axis=0)

    # Mean and spread across samples at each feature value
    mean_derivative = np.mean(derivatives, axis=1)
    std_derivative = np.std(derivatives, axis=1)

    # Mean derivative curve with a +/- one standard deviation band
    ax.plot(feature_values, mean_derivative, color='black', linewidth=2, label='Mean Derivative ICE')
    ax.fill_between(feature_values, mean_derivative - std_derivative,
                    mean_derivative + std_derivative, color='gray', alpha=0.5)

    ax.set_xlabel(feature, fontsize=label_fontsize)
    ax.set_ylabel('Partial Derivative', fontsize=label_fontsize)
    ax.set_title(f'Derivative ICE for {feature}', fontsize=title_fontsize)
    ax.tick_params(axis='both', which='major', labelsize=tick_fontsize)
    ax.legend(fontsize=label_fontsize)

    plt.tight_layout()
    plt.show()
    # Release the 300-dpi figure before starting the next feature
    plt.close(fig)


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.inspection import partial_dependence
from sklearn.impute import SimpleImputer
from itertools import combinations

# Model inputs and the column to predict
features = [ 'PRR','AT10', 'SG15', 'RH10']
target_variable = 'ST5'

# Load the workbook
file_path = r"D:\ML数据集\CPE\CPE_事件数据\春季.xlsx"
df = pd.read_excel(file_path)

# Parse 'date' as datetime, then remove the column before imputation
df['date'] = pd.to_datetime(df['date'])
df = df.drop('date', axis=1)

# Mean-impute the remaining columns and restore their labels
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(data=imputer.fit_transform(df), columns=df.columns)

# 70/30 train/test split on the un-normalised feature values
X_train, X_test, y_train, y_test = train_test_split(
    df_imputed[features],
    df_imputed[target_variable],
    random_state=42,
    test_size=0.3,
)

# Random forest regressor fitted on the training split
model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Workbook that receives one sheet per feature pair
output_file = r"D:\ML数据集\CPE\二阶PDP\ST5春季all_contour_data.xlsx"
with pd.ExcelWriter(output_file) as writer:
    # Two-way PDP for every unordered pair of feature positions
    for feature_pair in combinations(range(len(features)), 2):
        # Column names of the current pair, in the same order as the indices
        feature_names = [features[idx] for idx in feature_pair]

        # Partial dependence on a 50 x 50 grid
        pdp_result = partial_dependence(
            model,
            X_train,
            features=[feature_pair],
            grid_resolution=50,
        )

        # Grid coordinates plus the averaged response, transposed so that
        # rows follow y and match the meshgrid layout
        XX, YY = np.meshgrid(pdp_result['grid_values'][0], pdp_result['grid_values'][1])
        Z = pdp_result['average'][0].T

        # Tabular form: rows indexed by the second feature, columns by the first
        df_grid = pd.DataFrame(data=Z, index=YY[:, 0], columns=XX[0, :])

        # One sheet per pair
        sheet_name = f"{feature_names[0]}_vs_{feature_names[1]}"
        df_grid.to_excel(writer, sheet_name=sheet_name)

        # Contour figure for the current pair
        plt.figure(figsize=(8, 6))
        plt.contourf(XX, YY, Z, cmap='viridis')
        plt.colorbar(label="Partial Dependence")
        plt.gca().set(
            xlabel=feature_names[0],
            ylabel=feature_names[1],
            title=f'2D Partial Dependence Plot for {feature_names[0]} and {feature_names[1]}',
        )
        plt.show()

print(f"所有绘图数据已保存为 {output_file}")
