#### 导入神经网络中间层特征

In [30]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# 调参随机种子设置
import random
import numpy as np
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


# 定义数据集类
class CustomDataset(Dataset):
    def __init__(self, data, labels):
        self.data = torch.tensor(data, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        return self.data[index], self.labels[index]


In [99]:
# 从Excel读取数据集
data_df = pd.read_excel('Pb1.xlsx', sheet_name='Sheet1',header=None)
data = data_df.iloc[0:, :2048].values
label_df = pd.read_excel('Pb1.xlsx',sheet_name='Sheet2',header=None)
labels = label_df.iloc[0:, 0].values

# 数据预处理：标准化
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
data_normalized = scaler.fit_transform(data)

# 数据预处理：归一化
from sklearn.preprocessing import MinMaxScaler
min_max_scaler = MinMaxScaler()
normalized_data = min_max_scaler.fit_transform(data)

from sklearn.decomposition import PCA
# 创建PCA模型并降维到十六维
pca = PCA(n_components=5)
data_pca = pca.fit_transform(data_normalized)

from sklearn.feature_selection import SelectKBest, f_regression
# 选择最好的K个特征
k_best = SelectKBest(score_func=f_regression, k=15)
data_selected = k_best.fit_transform(data_normalized, labels)

merged_matrix = np.hstack((data_pca, data_selected))

In [32]:
# 假设data是包含特征数据的数组，labels是包含对应标签的数组
data_train, data_test, labels_train, labels_test = train_test_split(merged_matrix, labels, test_size=0.1, random_state=42)

data_train_tensor = torch.tensor(data_train, dtype=torch.float32).clone().detach()
labels_train_tensor = torch.tensor(labels_train, dtype=torch.float32).clone().detach()
data_test_tensor = torch.tensor(data_test, dtype=torch.float32).clone().detach()
labels_test_tensor = torch.tensor(labels_test, dtype=torch.float32).clone().detach()

batch_size = 57

# 准备数据
train_dataset = CustomDataset(data_train, labels_train)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

print(labels.shape)
print(merged_matrix.shape)



(57,)
(57, 20)


In [33]:
# 定义神经网络模型
# 利用神经网络提取中间层特征
# 但是有一个问题是为什么要用这样一个三个线形层的神经网络：
# 前馈神经网络：一个输入层，两个隐藏层和一个输出层

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(20, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)

    def forward(self, x):
        intermediate_features = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(intermediate_features))  # 提取中间层特征
        x = self.fc3(x)
        return x, intermediate_features


In [89]:
def Set_Random_State():
    seed = 42
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    return

Set_Random_State()
# 存储最后一次训练的隐藏层特征
last_hidden_features = None

# 设置超参数

learning_rate = 0.006
batch_size = 57
num_epochs = 309

# 初始化模型和损失函数
model = Net()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# 训练神经网络模型
total_step = len(train_dataloader)
for epoch in range(num_epochs):
    optimizer.zero_grad()
    for inputs, targets in train_dataloader:
        outputs, intermediate_features = model(inputs)
        loss = criterion(outputs, targets.unsqueeze(1))
        loss.backward(retain_graph=True)
        optimizer.step()


        # 计算训练集上的预测精度差
        train_outputs,_ = model(data_train_tensor)
        train_rmse = mean_squared_error(labels_train_tensor, train_outputs.detach().numpy(), squared=False)
        train_r2 = r2_score(labels_train_tensor, train_outputs.detach().numpy())
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item()}, Train RMSE: {train_rmse}, Train R^2: {train_r2}")
        # 提取最后一次训练的隐藏层特征
    last_hidden_features = intermediate_features.detach().numpy().astype(np.float64)

Epoch [1/308], Loss: 158302.5, Train RMSE: 397.27398681640625, Train R^2: -0.10150604778481886
Epoch [2/308], Loss: 157826.609375, Train RMSE: 396.7463073730469, Train R^2: -0.09858190116005039
Epoch [3/308], Loss: 157407.640625, Train RMSE: 396.2071838378906, Train R^2: -0.09559842966344712
Epoch [4/308], Loss: 156980.15625, Train RMSE: 395.53753662109375, Train R^2: -0.09189780974872064
Epoch [5/308], Loss: 156449.90625, Train RMSE: 394.73590087890625, Train R^2: -0.08747638939868896
Epoch [6/308], Loss: 155816.40625, Train RMSE: 393.7745056152344, Train R^2: -0.08218563964463343
Epoch [7/308], Loss: 155058.328125, Train RMSE: 392.6207580566406, Train R^2: -0.07585353894298752
Epoch [8/308], Loss: 154151.046875, Train RMSE: 391.26153564453125, Train R^2: -0.06841729270017449
Epoch [9/308], Loss: 153085.578125, Train RMSE: 389.6529541015625, Train R^2: -0.059650296690682
Epoch [10/308], Loss: 151829.421875, Train RMSE: 387.7790222167969, Train R^2: -0.049482548312000274
Epoch [11/308]

In [90]:
# 直接通过神经网路进行预测
model_path = "8.11/model.pth"
# 将模型的参数保存到文件中
torch.save(model.state_dict(), model_path)
model1 = Net()
model1.load_state_dict(torch.load(model_path))

test_inputs = data_test_tensor
test_outputs,_ = model1(test_inputs)
test_outputs = test_outputs.detach().numpy()
test_outputs = np.squeeze(test_outputs)
labels_test = np.squeeze(labels_test)

print(test_outputs)
print(labels_test)
test_rmse = mean_squared_error(labels_test, test_outputs, squared=False)
test_r2 = r2_score(labels_test, test_outputs)
print("Test RMSE:", test_rmse)
print("Test R2:",test_r2)

if (test_r2>=0.96):
    good_model_path = "good_model.pth"+str(test_r2)
    torch.save(model.state_dict(),good_model_path)


[ 75.9813    47.88044  628.3527   115.92144    6.569015   4.629687]
[ 85.  19. 636.  79.  21.  58.]
Test RMSE: 29.982589386641166
Test R2: 0.9812540090109586


In [94]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
r2 = 0

# Set_Random_State()
new_data_train, new_data_test, new_labels_train, new_labels_test = train_test_split(data_selected, labels, test_size=0.2, random_state=42)
from sklearn.ensemble import RandomForestRegressor
# 使用随机森林回归模型进行回归
rf = RandomForestRegressor(n_estimators=120, random_state=42)
rf.fit(new_data_train, new_labels_train)

# 在测试集上进行预测
predictions = rf.predict(new_data_test)

# 评估模型
mse = mean_squared_error(new_labels_test, predictions)
r2 = r2_score(new_labels_test, predictions)
# 打印评估结果
# print("Mean Squared Error:", mse)
print("R2 Score:", r2)

R2 Score: 0.8462874284895459


In [106]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA

# Set_Random_State()
new_data_train, new_data_test, new_labels_train, new_labels_test = train_test_split(merged_matrix, labels, test_size=0.2, random_state=42)

print(data_selected.shape)
print(data_pca.shape)
print(merged_matrix.shape)
from sklearn.ensemble import RandomForestRegressor
# 使用随机森林回归模型进行回归
rf = RandomForestRegressor(n_estimators=120, random_state=42)
rf.fit(new_data_train, new_labels_train)

# 在测试集上进行预测
predictions = rf.predict(new_data_test)

# 评估模型
mse = mean_squared_error(new_labels_test, predictions)
r2 = r2_score(new_labels_test, predictions)
# 打印评估结果
# print("Mean Squared Error:", mse)
print("R2 Score:", r2)

(57, 15)
(57, 5)
(57, 20)
R2 Score: 0.7997280905400042


In [36]:
def Set_Random_State():
    seed = 42
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    return

from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
r2 = 0
while r2 < 0.7:
    # Set_Random_State()
    # 创建PCA模型并降维到十六维
    pca = PCA(n_components=10)
    data_pca = pca.fit_transform(data_normalized)
    new_data_train, new_data_test, new_labels_train, new_labels_test = train_test_split(data_pca, labels, test_size=0.2, random_state=42)
    from sklearn.ensemble import RandomForestRegressor
    # 使用随机森林回归模型进行回归
    rf = RandomForestRegressor(n_estimators=120, random_state=42)
    rf.fit(new_data_train, new_labels_train)

    # 在测试集上进行预测
    predictions = rf.predict(new_data_test)

    # 评估模型
    mse = mean_squared_error(new_labels_test, predictions)
    r2 = r2_score(new_labels_test, predictions)
    # 打印评估结果
    print("Mean Squared Error:", mse)
    print("R2 Score:", r2)

Mean Squared Error: 12480.519210821758
R2 Score: 0.5527078609462119
Mean Squared Error: 11049.507330671295
R2 Score: 0.6039942180337339
Mean Squared Error: 12410.040490451387
R2 Score: 0.555233763679882
Mean Squared Error: 11951.819016377316
R2 Score: 0.5716560662970082
Mean Squared Error: 11900.977219560189
R2 Score: 0.5734781969045194
Mean Squared Error: 11394.490909143518
R2 Score: 0.591630273862287
Mean Squared Error: 11967.417010127321
R2 Score: 0.5710970462857783
Mean Squared Error: 9359.492760821757
R2 Score: 0.6645630308540056
Mean Squared Error: 8315.291454918979
R2 Score: 0.7019863966475557


In [37]:
print(r2)

0.7019863966475557


In [38]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
r2 = 0

# Set_Random_State()
new_data_train, new_data_test, new_labels_train, new_labels_test = train_test_split(data_selected, labels, test_size=0.2, random_state=42)
from sklearn.ensemble import RandomForestRegressor
# 使用随机森林回归模型进行回归
rf = RandomForestRegressor(n_estimators=120, random_state=42)
rf.fit(new_data_train, new_labels_train)

# 在测试集上进行预测
predictions = rf.predict(new_data_test)

# 评估模型
mse = mean_squared_error(new_labels_test, predictions)
r2 = r2_score(new_labels_test, predictions)
# 打印评估结果
# print("Mean Squared Error:", mse)
print("R2 Score:", r2)

R2 Score: 0.8462874284895459


In [39]:
load_rf = joblib.load(model_filename)
pca = PCA(n_components=9)
data_pca = pca.fit_transform(data_normalized)
new_data_train, new_data_test, new_labels_train, new_labels_test = train_test_split(data_pca, labels, test_size=0.2, random_state=42)
load_rf = RandomForestRegressor(n_estimators=130, random_state=42)
load_rf.fit(new_data_train, new_labels_train)

# 在测试集上进行预测
predictions = load_rf.predict(new_data_test)

# 评估模型
mse = mean_squared_error(new_labels_test, predictions)
r2 = r2_score(new_labels_test, predictions)
print(r2)

NameError: name 'model_filename' is not defined

In [None]:
import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
r2 = 0
while True:
    # Set_Random_State()
    # 创建PCA模型并降维到十六维
    pca = PCA(n_components=9)
    data_pca = pca.fit_transform(data_normalized)
    new_data_train, new_data_test, new_labels_train, new_labels_test = train_test_split(data_pca, labels, test_size=0.2, random_state=42)
    from sklearn.ensemble import RandomForestRegressor
    # 使用随机森林回归模型进行回归
    rf = RandomForestRegressor(n_estimators=125, random_state=42)
    rf.fit(new_data_train, new_labels_train)

    # 在测试集上进行预测
    predictions = rf.predict(new_data_test)

    # 评估模型
    mse = mean_squared_error(new_labels_test, predictions)
    r2 = r2_score(new_labels_test, predictions)
    print(r2)
    if (r2 > 0.75):
        model_filename = 'random_forest_model.pkl'
        joblib.dump(rf, model_filename)
        break
print(r2)


0.7211554639684774
0.6529930497943028
0.7213502000534403
0.7217400195790483
0.7173203040395983
0.6512545252856485
0.6604211151621824
0.7264254928327523
0.7257901832634799
0.6430318387018237
0.7218502022671893
0.660676715053709
0.7203138855096038
0.7222260082868253
0.7226281229145254
0.6747683221618219
0.7245510368270338
0.7198891382181134
0.7244507975570391
0.6612433160400841
0.7126834595072107
0.6517009520035519
0.7156082201955627
0.7133235041740352
0.6625128817902088
0.6516266234574101
0.7280699517363507
0.6637520264624096
0.6547769300830076
0.6558857065858558
0.7228002962995999
0.7472683458255862
0.6433316947437793
0.6271999339570684
0.7154610402948374
0.6634366204458
0.6345149915023602
0.6727478472169048
0.6545126660769387
0.6276890058474482
0.6665102067090232
0.7236368900574424
0.6522296625166701
0.6614200181977249
0.6562937743516186
0.6662008825917334
0.7248116884341642
0.6540386615052873
0.655314382467451
0.6646250632685771
0.7283710808946267
0.6567251744928252
0.652182859646187

KeyboardInterrupt: 

In [None]:
load_rf = joblib.load(model_filename)
pca = PCA(n_components=9)
data_pca = pca.fit_transform(data_normalized)
new_data_train, new_data_test, new_labels_train, new_labels_test = train_test_split(data_normalized, labels, test_size=0.2, random_state=42)
load_rf = RandomForestRegressor(n_estimators=130, random_state=42)
load_rf.fit(new_data_train, new_labels_train)

# 在测试集上进行预测
predictions = load_rf.predict(new_data_test)

# 评估模型
mse = mean_squared_error(new_labels_test, predictions)
r2 = r2_score(new_labels_test, predictions)
print(r2)

0.2744458843633376
