In [1]:
from openfe import OpenFE, transform

In [2]:
# import sys
# sys.path.append('../')
import pandas as pd
from sklearn.datasets import fetch_california_housing
from openfe import OpenFE, tree_to_formula, transform
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn import preprocessing

def get_score(train_x, test_x, train_y, test_y):
    """Fit a LightGBM regressor and report its mean squared error on the test set.

    A further 20% of the training rows is held out (random_state=2020) and
    used only as the early-stopping evaluation set.

    Note: reads the module-level name ``n_jobs``, which must be defined
    before this function is called.

    Returns:
        (score, test_y, pred): the test MSE, the unchanged ``test_y`` and the
        predictions as a DataFrame indexed like ``test_x``.
    """
    fit_x, holdout_x, fit_y, holdout_y = train_test_split(
        train_x, train_y, test_size=0.2, random_state=2020
    )
    regressor = lgb.LGBMRegressor(n_estimators=1000, n_jobs=n_jobs, seed=1)
    regressor.fit(
        fit_x,
        fit_y,
        eval_set=[(holdout_x, holdout_y)],
        callbacks=[lgb.early_stopping(50, verbose=False)],
    )
    pred = pd.DataFrame(regressor.predict(test_x), index=test_x.index)
    return mean_squared_error(test_y, pred), test_y, pred

In [4]:
# Load the raw dataset and keep its column labels; they are re-attached after
# scaling, which returns a bare array.
data = pd.read_excel("./数据2.0.xlsx")
columns = data.columns

In [None]:
# Pairwise correlation of every column of the raw data.
correlation_matrix = data.corr()
# # Choose a correlation-coefficient threshold, e.g. 0.1
# threshold = 0.2
# # Find the features whose correlation is below the threshold
# low_correlation_features = [index for index in correlation_matrix["lattice_distortion"].index if abs(correlation_matrix["lattice_distortion"][index]) < threshold]
# # Drop those features
# data = data.drop(columns=low_correlation_features)
# NOTE(review): index=False leaves the row labels out of the sheet, so only
# the header row identifies the features - confirm this is intended.
correlation_matrix.to_excel("皮尔逊相关系数矩阵-原始特征.xlsx",index = False)

In [None]:
# data.to_excel("初步筛选后的特征.xlsx",index = False)

In [None]:
def data_generator(f_path, seed=88):
    """Standardize every column of an Excel sheet.

    Args:
        f_path: path of the Excel file to read.
        seed: accepted but not used by this function.

    Returns:
        (normalized, scaler): the transformed values and the fitted
        ``StandardScaler`` (despite the name, this is z-score scaling, not
        min-max scaling).
    """
    frame = pd.read_excel(f_path)
    scaler = preprocessing.StandardScaler()
    scaler.fit(frame)
    return scaler.transform(frame), scaler

In [None]:
# Standardize all columns of the raw sheet and keep the fitted scaler; later
# cells use scaler.scale_[-1] / scaler.mean_[-1] to map the last column back
# to its original units.
data_trans,scaler = data_generator("./数据2.0.xlsx", seed=42)
data = pd.DataFrame(data_trans,columns=columns)
data.to_excel("归一化后的数据.xlsx",index = False)

In [None]:
# data = pd.read_excel("./原始数据2.xlsx")

In [None]:
# Independent deep copy of the standardized frame, so later cells that modify
# or rebind `data` do not affect it; it is reused as the neural-network input.
aa = data.copy(deep = True)

In [None]:
if __name__ == '__main__':
    # Worker count for LightGBM and OpenFE; get_score() also reads this
    # module-level name.
    n_jobs = 4
    # Separate the target from the features WITHOUT mutating `data`, so this
    # cell can be re-run without a KeyError on an already-deleted "Y" column.
    label = data[["Y"]]
    features = data.drop(columns=["Y"])
    train_x, test_x, train_y, test_y = train_test_split(features, label, test_size=0.2, random_state=1)
    # get baseline score
    score, _, _ = get_score(train_x, test_x, train_y, test_y)
    print("The MSE before feature generation is", score)
    # feature generation
    ofe = OpenFE()
    ofe.fit(data=train_x, label=train_y, n_jobs=n_jobs, n_data_blocks=1)
    # OpenFE recommends a list of new features. We include the top 10
    # generated features to see how they influence the model performance
    train_x, test_x = transform(train_x, test_x, ofe.new_features_list[:10], n_jobs=n_jobs)
    score, test_y, pred = get_score(train_x, test_x, train_y, test_y)
    print("The MSE after feature generation is", score)
    print("The top 10 generated features are")
    for feature in ofe.new_features_list[:10]:
        print(tree_to_formula(feature))

In [None]:
# Start the column-name list with every column of the raw sheet except the last.
new_feature_name = []
origin_data = pd.read_excel("./数据2.0.xlsx")
new_feature_name.extend(list(origin_data.columns)[:-1])

In [None]:
# Append the formulas of the top-10 generated features, then the last column
# name of the raw sheet.
# NOTE: this cell appends to the existing list, so running it twice duplicates names.
new_feature_name.extend(tree_to_formula(generated) for generated in ofe.new_features_list[:10])
new_feature_name.append(list(origin_data.columns)[-1])

In [None]:
# Re-attach the targets to the feature frames column-wise.
train_data = pd.concat([train_x,train_y],axis=1)
test_data = pd.concat([test_x,test_y],axis=1)

In [None]:
# Stack the train and test rows, relabel the columns with the collected names
# (the list length must equal the column count) and save the result.
data_concat = pd.concat([train_data,test_data],axis=0)
data_concat.columns = new_feature_name
data_concat.to_excel("高价值特征生成.xlsx",index=False)

# 贪心算法特征选择

In [None]:
import numpy as np
import pandas as pd

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

In [None]:
def Liner_model(datasets,cols):
    """Cross-validated residual sum of squares of a linear fit on selected columns.

    Args:
        datasets: 2-D array whose last column is the target.
        cols: column indices used as predictors.

    Returns:
        The mean, over 5 shuffled folds (random_state=888), of the squared
        prediction errors summed over each validation fold.
    """
    predictors = datasets[:, cols]
    target = datasets[:, -1:]
    splitter = KFold(n_splits=5, shuffle=True, random_state=888)
    fold_rss = []
    for fit_rows, held_rows in splitter.split(predictors, target):
        # Fit on the training rows, score on the held-out rows.
        regressor = LinearRegression().fit(predictors[fit_rows], target[fit_rows])
        errors = regressor.predict(predictors[held_rows]) - target[held_rows]
        fold_rss.append(np.sum(errors ** 2))
    # Average residual sum of squares across the folds.
    return np.array(fold_rss).mean()

In [None]:
def bic(data, features):
    """BIC-style score for a feature subset, based on cross-validated RSS.

    Args:
        data: 2-D array whose last column is the target.
        features: column indices of the candidate predictors.

    Returns:
        (score, rss) where ``score = n*log(rss/n) + k*log(n)``, ``k`` is the
        number of features and ``rss`` comes from ``Liner_model``.
    """
    rss = Liner_model(data, features)
    # n is 80% of the row count - presumably the training share of the
    # 5-fold split used by Liner_model; TODO confirm.
    sample_size = len(data) * 0.8
    fit_term = sample_size * np.log(rss / sample_size)
    penalty = len(features) * np.log(sample_size)
    return fit_term + penalty, rss

In [None]:
def get_best_features(data):
    """Greedy forward selection over all predictor columns.

    Each round tries adding every not-yet-selected column index to the
    current subset, scores the candidates with ``bic`` and keeps the one
    with the smallest cross-validated RSS.

    Args:
        data: 2-D array whose last column is the target.

    Returns:
        (best_features, SSE): the subset kept after each round (sizes
        1..n_features) and the RSS recorded for each round.
    """
    n_features = data.shape[1] - 1
    selected = []
    best_features = []
    SSE = []
    for _ in range(n_features):
        # Every extension of the current subset by one unused column.
        candidates = [selected + [idx] for idx in range(n_features) if idx not in selected]
        scored = [bic(data, candidate) for candidate in candidates]
        bic_values = [pair[0] for pair in scored]
        rss_values = [pair[1] for pair in scored]
        # The subset is chosen by RSS; the recorded RSS is looked up at the
        # position of the smallest BIC, exactly as the two lookups were
        # originally written.
        selected = list(candidates[np.array(rss_values).argmin()])
        best_features.append(selected)
        SSE.append(rss_values[np.array(bic_values).argmin()])
    return best_features, SSE

In [None]:
import statsmodels.api as sm
import pandas as pd
def get_bis(features,path):
    """BIC of an OLS fit of column 'Y' on the chosen columns of an Excel sheet.

    Args:
        features: positional indices of the predictor columns.
        path: Excel file to read; missing values are replaced with 0.

    Returns:
        The ``bic`` attribute of the fitted statsmodels OLS result (an
        intercept is added via ``sm.add_constant``).
    """
    table = pd.read_excel(path).fillna(0)
    names = list(table.columns)
    chosen = table[[names[position] for position in features]]
    design = sm.add_constant(chosen)
    fitted = sm.OLS(table['Y'], design).fit()
    return fitted.bic

In [None]:
def bic_choose_feature(best_features,path):
    """Pick the candidate feature subset with the lowest OLS BIC.

    Args:
        best_features: list of feature-index subsets.
        path: Excel file passed on to ``get_bis``.

    Returns:
        (subset, bics): the subset with the smallest BIC and the BIC of
        every subset, in input order.
    """
    bics = [get_bis(subset, path) for subset in best_features]
    winner = np.array(bics).argmin()
    return best_features[winner], bics

In [None]:
# Load the dataset augmented with generated features; missing values become 0.
path = "高价值特征生成.xlsx"
data = pd.read_excel(path).fillna(0)

In [None]:
# Run greedy forward selection, save the nested candidate subsets, then pick
# the subset whose OLS fit has the lowest BIC.
best_features,SSE = get_best_features(np.array(data))
pd.DataFrame(best_features).to_excel("best_feature_candinate.xlsx")
bic_feature,bics = bic_choose_feature(best_features,path)

In [None]:
# bic_feature = best_features[4]

In [None]:
# Save the per-round SSE and BIC values side by side.
sse_bics = pd.concat((pd.DataFrame(SSE),pd.DataFrame(bics)),axis=1)
sse_bics.columns = ["SSE","BIC"]
sse_bics.to_excel("SSE_and_BIC.xlsx",index = False)

In [None]:
# Reload the augmented dataset and keep only the BIC-selected columns
# (bic_feature holds positional column indices).
org_data = pd.read_excel("高价值特征生成.xlsx").fillna(0)
columns = list(org_data.columns)
filter_data = org_data[[columns[i] for i in bic_feature]]

In [None]:
# Append the last column of the augmented dataset to the selected features and save.
data3 = pd.concat((filter_data,org_data.iloc[:,-1:]),axis = 1)
data3.to_excel("BIC筛选后特征数据集.xlsx",index = False)

In [None]:
# Save the correlation matrix of the selected features.
# NOTE(review): index=False leaves the row labels out of the sheet - confirm intended.
data3.corr().to_excel("皮尔逊相关系数矩阵-特征筛选后.xlsx",index = False)

In [None]:
# Display the correlation matrix of the selected features.
data3.corr()

# 比较不同模型的效果

In [None]:
#线性回归

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
import sklearn.metrics as metrics

def Liner_model(datasets):
    """5-fold cross-validated linear regression on all predictor columns.

    Note: this redefines ``Liner_model`` with a different signature and
    return value than the earlier definition in this notebook, which it
    shadows from here on.

    Args:
        datasets: 2-D array; the last column is the target, the rest are predictors.

    Returns:
        (intercepts, coefficients, r2s, pre_ys, val_ys): one entry per fold
        holding the fitted intercept, the coefficients, the validation R2,
        the validation predictions and the validation targets.
    """
    predictors = datasets[:, :-1]
    target = datasets[:, -1:]
    splitter = KFold(n_splits=5, shuffle=True, random_state=888)
    intercepts, coefficients, r2s, pre_ys, val_ys = [], [], [], [], []
    for fit_rows, held_rows in splitter.split(predictors, target):
        # Fit on the training rows of this fold.
        regressor = LinearRegression()
        regressor.fit(predictors[fit_rows], target[fit_rows])
        held_target = target[held_rows]
        held_pred = regressor.predict(predictors[held_rows])
        # Record the fitted parameters and the validation results.
        intercepts.append(regressor.intercept_)
        coefficients.append(regressor.coef_)
        r2s.append(metrics.r2_score(held_target, held_pred))
        pre_ys.append(held_pred)
        val_ys.append(held_target)
    return intercepts, coefficients, r2s, pre_ys, val_ys

In [None]:
# Load the BIC-filtered dataset; missing values become 0.
path = "BIC筛选后特征数据集.xlsx"
data = pd.read_excel(path).fillna(0)

In [None]:
# Display the loaded frame.
data

In [None]:
# Reload the BIC-filtered dataset, map "Y" back to original units with the
# scale/mean of the scaler's last column, and cross-validate a linear model.
path = "BIC筛选后特征数据集.xlsx"
data = pd.read_excel(path).fillna(0)
data["Y"] = data["Y"] * scaler.scale_[-1] + scaler.mean_[-1] 
intercepts,coefficients,r2s,pre_ys,val_ys = Liner_model(np.array(data))

# Import the metrics package
import sklearn.metrics as metrics
# Pool the per-fold validation targets and predictions into flat lists.
line_val = [a for i in val_ys for a in i]
line_pre = [a for i in pre_ys for a in i]

# Compute MAE, MSE and R2
line_mae = metrics.mean_absolute_error(line_val, line_pre)
line_mse = metrics.mean_squared_error(line_val, line_pre)
line_r2 = metrics.r2_score(line_val, line_pre)

# Print the results
print("line_MAE:", line_mae)
print("line_MSE:", line_mse)
print("line_R2:", line_r2)


# Save actual vs. predicted values side by side.
line_pre_data = pd.concat((pd.DataFrame(line_val),pd.DataFrame(line_pre)),axis = 1)
line_pre_data.columns = ["line_Actual","line_Predict"]
line_pre_data.to_excel("line_pre_data.xlsx",index =False)

In [None]:
#随机森林

In [None]:
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
import numpy as np
import optuna

In [None]:
def read_data(f_path):
    """Load a table from a CSV or Excel file, chosen by file extension.

    Args:
        f_path: path of a ``.csv``, ``.xls`` or ``.xlsx`` file; the
            extension is matched case-insensitively.

    Returns:
        The loaded ``pandas.DataFrame``.

    Raises:
        ValueError: if the extension is not one of the supported ones
            (previously this surfaced as an UnboundLocalError).
    """
    lowered = str(f_path).lower()
    if lowered.endswith(".csv"):
        return pd.read_csv(f_path)
    if lowered.endswith((".xls", ".xlsx")):
        return pd.read_excel(f_path)
    raise ValueError(f"Unsupported file type for {f_path!r}; expected .csv, .xls or .xlsx")

In [None]:
def RF_train(params,data,Predict = False,Output = False,test = None):
    """5-fold cross-validation of a random-forest regressor.

    Args:
        params: dict providing ``n_estimators`` and ``max_depth``.
        data: table whose last column is the target.
        Predict: if True, return the fold-averaged prediction for ``test``.
        Output: if True (and Predict is False), return the per-fold
            validation targets and predictions.
        test: feature matrix to predict when ``Predict`` is True.

    Returns:
        * Predict=True: averaged predictions for ``test``.
        * Output=True: ``(val_ys, pre_ys)`` lists with one array per fold.
        * otherwise: the mean validation MAE across the folds.

    Raises:
        ValueError: if ``Predict`` is True but ``test`` was not supplied
            (the original body referenced an undefined name here).
    """
    dataset = np.array(data)
    x = dataset[:,:-1]
    y = dataset[:,-1:].reshape(-1)
    if Predict is True:
        if test is None:
            raise ValueError("Predict=True requires the `test` feature matrix")
        test_x = np.array(test)
    n_splits = 5
    skf = KFold(n_splits=n_splits, shuffle=True, random_state=2020)
    min_mae = []
    pre = 0
    val_ys = []
    pre_ys = []
    for trn_idx, val_idx in skf.split(x, y):
        trn_x, trn_y = x[trn_idx], y[trn_idx]
        val_x, val_y = x[val_idx], y[val_idx]
        rf_model = RandomForestRegressor(n_estimators=params["n_estimators"],max_depth=params["max_depth"])
        rf_model.fit(trn_x,trn_y)
        val_pred = rf_model.predict(val_x)
        val_ys.append(val_y)
        pre_ys.append(val_pred)
        if Predict is True:
            pre += rf_model.predict(test_x)/n_splits
        else:
            # True mean absolute error. The previous |(|pred| - |y|)| form
            # under-reports the error whenever prediction and target differ
            # in sign, which happens with standardized targets.
            min_mae.append(np.abs(val_pred - val_y).mean())
    if Predict is True:
        return pre
    elif Output is True:
        return val_ys,pre_ys
    else:
        return np.array(min_mae).mean()

In [None]:
def rf_objective(trial,data):
    """Optuna objective: cross-validated MAE of a random forest.

    Args:
        trial: Optuna trial used to sample ``n_estimators`` (10..100, step 5)
            and ``max_depth`` (1..10).
        data: table passed on to ``RF_train``.

    Returns:
        The value returned by ``RF_train`` for the sampled parameters.
    """
    # `step` is passed by keyword: positional use is deprecated in recent
    # Optuna releases.
    params = {
        'n_estimators': trial.suggest_int("n_estimators", 10, 100, step=5),
        'max_depth': trial.suggest_int("max_depth", 1, 10, step=1),
    }
    return RF_train(params, data)

In [None]:
def optimizer_optuna(n_trials, algo,optuna_objective,data):
    """Minimise ``optuna_objective(trial, data)`` with Optuna.

    Args:
        n_trials: maximum number of trials (initial observations included).
        algo: ``"TPE"`` or ``"GP"`` to build the matching sampler; any
            non-string value is passed to ``optuna.create_study`` unchanged.
        optuna_objective: callable taking ``(trial, data)`` and returning the loss.
        data: forwarded to the objective on every trial.

    Returns:
        ``(best_params, best_values)`` of the best trial.

    Raises:
        ValueError: if ``algo`` is a string other than ``"TPE"`` or ``"GP"``.
    """
    # Resolve the sampler first so that a misspelt name fails immediately
    # instead of being handed to Optuna as if it were a sampler object.
    if isinstance(algo, str):
        if algo == "TPE":
            sampler = optuna.samplers.TPESampler(n_startup_trials = 10, n_ei_candidates = 24)
        elif algo == "GP":
            from optuna.integration import SkoptSampler
            sampler = SkoptSampler(skopt_kwargs={'base_estimator':'GP', # Gaussian process
                                                 'n_initial_points':10, # 10 initial observations
                                                 'acq_func':'EI'} # expected-improvement acquisition
                                   )
        else:
            raise ValueError(f"Unknown algo {algo!r}; expected 'TPE' or 'GP'")
    else:
        sampler = algo

    # Create the study; the search space is defined inside the objective.
    study = optuna.create_study(sampler = sampler, direction="minimize")
    study.optimize(lambda trial: optuna_objective(trial, data),
                   n_trials=n_trials,
                   show_progress_bar=True)

    # Report the best parameters and the best loss.
    print("\n","\n","best params: ", study.best_trial.params,
          "\n","\n","best score: ", study.best_trial.values,
          "\n")

    return study.best_trial.params, study.best_trial.values

In [None]:
# Dataset used for the random-forest experiments.
path = "./BIC筛选后特征数据集.xlsx"

In [None]:
# Load the dataset from `path`.
data = read_data(path)

In [None]:
# Hyper-parameter search for the random forest: 100 TPE trials.
rf_best_params,rf_best_mae = optimizer_optuna(n_trials = 100, algo="TPE", optuna_objective=rf_objective, data=data)

In [None]:
# Re-run the cross-validation with the best parameters and collect the
# per-fold validation targets and predictions.
val_ys,pre_ys= RF_train(params=rf_best_params,data=data,Output=True)

In [None]:
# Import the metrics package
import sklearn.metrics as metrics
# Pool the per-fold validation targets and predictions.
rf_val = [a for i in val_ys for a in i]
rf_pre = [a for i in pre_ys for a in i]

# Map both back to original units with the scale/mean of the scaler's last column.
rf_val = np.array(rf_val) * scaler.scale_[-1] + scaler.mean_[-1] 
rf_pre = np.array(rf_pre) * scaler.scale_[-1] + scaler.mean_[-1] 
# Compute MAE, MSE and R2
rf_mae = metrics.mean_absolute_error(rf_val, rf_pre)
rf_mse = metrics.mean_squared_error(rf_val, rf_pre)
rf_r2 = metrics.r2_score(rf_val, rf_pre)

# Print the results
print("RF_MAE:", rf_mae)
print("RF_MSE:", rf_mse)
print("RF_R2:", rf_r2)

In [None]:
# Save actual vs. predicted values side by side.
RF_pre_data = pd.concat((pd.DataFrame(rf_val),pd.DataFrame(rf_pre)),axis = 1)
RF_pre_data.columns = ["RF_Actual","RF_Predict"]
RF_pre_data.to_excel("RF_pre_data.xlsx",index =False)

# XGBOOST

In [None]:
import xgboost as xgb

In [None]:
def xgb_train(params, data, data2=None, Predict=False, Output=False):
    """5-fold cross-validation of an XGBoost booster with early stopping.

    Args:
        params: XGBoost parameter dict passed to ``xgb.train``.
        data: table whose last column is the target.
        data2: feature matrix to predict when ``Predict`` is True.
        Predict: if True, return the fold-averaged prediction for ``data2``.
        Output: if True (and Predict is False), return the per-fold
            validation targets and predictions.

    Returns:
        * Predict=True: averaged predictions for ``data2``.
        * Output=True: ``(val_ys, pre_ys)`` lists with one array per fold.
        * otherwise: the mean over folds of the lowest value of the first
          evaluation metric recorded on the validation set.

    Raises:
        ValueError: if ``Predict`` is True but ``data2`` was not supplied.
    """
    dataset = np.array(data)
    x = dataset[:, :-1]
    y = dataset[:, -1:]
    if Predict is True:
        if data2 is None:
            raise ValueError("Predict=True requires `data2` (the features to predict)")
        # Loop-invariant: build the prediction matrix once.
        dataTest = xgb.DMatrix(np.array(data2))
    n_splits = 5
    skf = KFold(n_splits=n_splits, shuffle=True, random_state=2020)
    min_mae = []
    pre = 0

    val_ys = []
    pre_ys = []
    for trn_idx, val_idx in skf.split(x, y):
        trn_x, trn_y = x[trn_idx], y[trn_idx]
        val_x, val_y = x[val_idx], y[val_idx]
        dataTrain = xgb.DMatrix(trn_x, trn_y)
        dataVal = xgb.DMatrix(val_x, val_y)
        watchlist = [(dataVal, 'test'), (dataTrain, 'train')]
        evals_result = {}
        # By default early stopping watches the LAST entry of `evals`, which
        # is the training set here; pin it to the validation set so training
        # stops when the held-out metric stops improving.
        early_stop = xgb.callback.EarlyStopping(rounds=50, data_name='test')
        bst = xgb.train(params=params, dtrain=dataTrain, num_boost_round=5000, evals=watchlist,
                        evals_result=evals_result, callbacks=[early_stop],
                        # Per-round logging stays on only in prediction mode, as before.
                        verbose_eval=Predict is True)
        if Predict is True:
            pre += bst.predict(dataTest) / n_splits
        elif Output is True:
            val_ys.append(val_y)
            pre_ys.append(bst.predict(xgb.DMatrix(val_x)))
        else:
            min_mae.append(min(list(evals_result["test"].values())[0]))

    if Predict is True:
        return pre
    elif Output is True:
        return val_ys, pre_ys
    else:
        return np.array(min_mae).mean()
    
    
    
def xgb_objective(trial, data):
    """Optuna objective: cross-validated validation metric of an XGBoost model.

    Args:
        trial: Optuna trial used to sample subsample, eta, max_depth,
            colsample_bytree and gamma.
        data: table passed on to ``xgb_train``.

    Returns:
        The value returned by ``xgb_train`` for the sampled parameters.
    """
    params = {
        'subsample': trial.suggest_float("subsample", 0.2, 0.8),
        'eta': trial.suggest_float("eta", 0.01, 0.1, step=0.01),
        # `step` is passed by keyword, matching the suggest_float calls.
        'max_depth': trial.suggest_int("max_depth", 1, 5, step=1),
        'colsample_bytree': trial.suggest_float("colsample_bytree", 0.2, 0.8, step=0.1),
        'gamma': trial.suggest_float("gamma", 0, 1),
        'eval_metric': "mae",
        # XGBoost's logging parameter is "verbosity"; "verbose" is not a
        # recognised booster parameter.
        "verbosity": 0
    }
    return xgb_train(params, data)

In [None]:
def get_prediction(params, data1, data2):
    """Train on ``data1`` with ``params`` and return ``xgb_train``'s prediction for ``data2``."""
    return xgb_train(params=params, data=data1, data2=data2, Predict=True)

In [None]:
# Load the BIC-filtered dataset for the XGBoost experiments.
file_path = "./BIC筛选后特征数据集.xlsx"
data = read_data(file_path)

In [None]:
# Hyper-parameter search for XGBoost: 100 TPE trials.
xgb_best_params, xgb_best_mae = optimizer_optuna(n_trials=100, algo="TPE", optuna_objective=xgb_objective,
                                               data=data)

In [None]:
# Re-run the cross-validation with the best sampled parameters and collect
# the per-fold validation targets and predictions.
# NOTE(review): xgb_best_params holds only the sampled values, so the
# 'eval_metric' set inside xgb_objective is not applied here - confirm intended.
val_ys, pre_ys = xgb_train(xgb_best_params, data, data2=None, Predict=False, Output=True)

In [None]:
# Import the metrics package
import sklearn.metrics as metrics
# Pool the per-fold validation targets and predictions.
xgb_val = [a for i in val_ys for a in i]
xgb_pre = [a for i in pre_ys for a in i]
# Map both back to original units with the scale/mean of the scaler's last column.
xgb_val = np.array(xgb_val) * scaler.scale_[-1] + scaler.mean_[-1] 
xgb_pre = np.array(xgb_pre) * scaler.scale_[-1] + scaler.mean_[-1] 
# Compute MAE, MSE and R2
xgb_mae = metrics.mean_absolute_error(xgb_val, xgb_pre)
xgb_mse = metrics.mean_squared_error(xgb_val, xgb_pre)
xgb_r2 = metrics.r2_score(xgb_val, xgb_pre)

# Print the results
print("XGB_MAE:", xgb_mae)
print("XGB_MSE:", xgb_mse)
print("XGB_R2:", xgb_r2)

In [None]:
# Save actual vs. predicted values side by side.
XGB_pre_data = pd.concat((pd.DataFrame(xgb_val),pd.DataFrame(xgb_pre)),axis = 1)
XGB_pre_data.columns = ["XGB_Actual","XGB_Predict"]
XGB_pre_data.to_excel("XGB_pre_data.xlsx",index =False)

# 神经网络

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import time
import pandas as pd
import copy
import numpy as np
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 搭建全连接神经网络回归
class MLPregression(nn.Module):
    """Fully connected regression network.

    Args:
        params: dict providing ``layber_number`` (how many times the shared
            hidden layer is applied), ``unit`` (hidden width) and
            ``drop_out`` (dropout probability).
        data: array whose last axis holds the features plus one target
            column; the input width is ``data.shape[-1] - 1``.
    """

    def __init__(self, params, data):
        super().__init__()
        self.layber_number = params["layber_number"]
        self.unit = params["unit"]
        width = self.unit
        n_inputs = data.shape[-1] - 1
        # Input projection.
        self.hidden1 = nn.Linear(in_features=n_inputs, out_features=width, bias=True)
        # Hidden layer; forward() applies this SAME layer `layber_number`
        # times, so its weights are shared across those applications.
        self.hidden2 = nn.Linear(width, width)
        # Not used by forward(); kept so the parameter set stays unchanged.
        self.hidden3 = nn.Linear(128, 256)
        # Regression head.
        self.hidden5 = nn.Linear(width, 64)
        self.predict = nn.Linear(64, 1)
        self.relu = nn.functional.relu
        self.dropout = nn.Dropout(params["drop_out"])

    def forward(self, x):
        """Return one prediction per input row as a 1-D tensor."""
        hidden = self.relu(self.hidden1(x))
        for _ in range(self.layber_number):
            hidden = self.dropout(self.relu(self.hidden2(hidden)))
        hidden = self.dropout(self.relu(self.hidden5(hidden)))
        # Drop the trailing singleton dimension.
        return self.predict(hidden)[:, 0]

In [None]:
class FastTensorDataLoader:
    """
    Batch iterator over a set of equally long tensors.

    Slices contiguous batches straight out of the tensors instead of
    fetching individual indices and concatenating them, which is what makes
    it faster than TensorDataset + DataLoader.
    Source: https://discuss.pytorch.org/t/dataloader-much-slower-than-manual-batching/27014/6
    """
    def __init__(self, *tensors, batch_size=32, shuffle=False):
        """
        Store the tensors and work out the number of batches.

        :param *tensors: tensors to store. Must have the same length @ dim 0.
        :param batch_size: batch size to load.
        :param shuffle: if True, re-order all stored tensors with one shared
            random permutation whenever an iterator is created.
        """
        lengths = {t.shape[0] for t in tensors}
        assert len(lengths) <= 1
        self.tensors = tensors

        self.dataset_len = self.tensors[0].shape[0]
        self.batch_size = batch_size
        self.shuffle = shuffle

        # A trailing partial batch counts as one more batch.
        full_batches, leftover = divmod(self.dataset_len, self.batch_size)
        self.n_batches = full_batches + 1 if leftover > 0 else full_batches

    def __iter__(self):
        if self.shuffle:
            order = torch.randperm(self.dataset_len)
            self.tensors = [t[order] for t in self.tensors]
        self.i = 0
        return self

    def __next__(self):
        start = self.i
        if start >= self.dataset_len:
            raise StopIteration
        stop = start + self.batch_size
        batch = tuple(t[start:stop] for t in self.tensors)
        self.i = stop
        return batch

    def __len__(self):
        return self.n_batches

In [None]:
def train_model(params, model, dataloaders, criterion, optimizer, num_epochs, i, savebest=False,Output=False):
    """Train ``model`` for ``num_epochs`` epochs, validating after each one.

    Args:
        params: accepted for interface compatibility; not used here.
        model: the network to train; it is moved to the module-level ``device``.
        dataloaders: dict with 'train' and 'valid' loaders; each must be
            iterable over ``(inputs, labels)`` batches and expose ``dataset_len``.
        criterion: loss function called as ``criterion(outputs, labels)``.
        optimizer: optimizer bound to the model's parameters.
        num_epochs: number of epochs to run.
        i: index appended to the checkpoint file name.
        savebest: if True, write a checkpoint whenever the validation loss
            improves and keep the validation predictions of that epoch.
        Output: accepted for interface compatibility; not used here.

    Returns:
        * savebest=True: ``(train_losses, valid_losses, best_loss, pre_y, val_y)``
          where ``pre_y`` / ``val_y`` are the predictions and labels of the
          validation set at the best epoch.
        * otherwise: the best validation loss.

    Raises:
        RuntimeError: with savebest=True, if the validation loss never went
            below the initial threshold, so there are no predictions to return.
    """
    best_loss = 10000000
    model.to(device)
    train_losses = []
    valid_losses = []
    pre_y = None
    val_y = None

    for _ in range(num_epochs):

        # Train, then validate.
        for phase in ['train', 'valid']:
            training = phase == 'train'
            if training:
                model.train()
            else:
                model.eval()

            running_loss = 0.0

            # One full pass over the data of this phase.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)
                optimizer.zero_grad()
                # Gradients are tracked and applied only while training.
                with torch.set_grad_enabled(training):
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)
                    if training:
                        loss.backward()
                        optimizer.step()

                # Weight the batch loss by the batch size so the epoch value
                # is a per-sample mean.
                running_loss += loss.item() * inputs.size(0)
            epoch_loss = running_loss / dataloaders[phase].dataset_len

            if training:
                train_losses.append(epoch_loss)
                continue

            valid_losses.append(epoch_loss)
            # Keep the best model seen so far.
            if epoch_loss < best_loss:
                best_loss = epoch_loss
                if savebest is True:
                    state = {
                        'state_dict': model.state_dict(),  # layer name -> trained weights
                        'best_loss': best_loss,
                        'optimizer': optimizer.state_dict(),  # optimizer state
                    }
                    filename = './NN_best' + str(i) + '.pth'
                    torch.save(state, filename)
                    inputs_list = []
                    labels_list = []
                    # Inference only: no autograd graph is needed, so the
                    # returned predictions do not keep one alive.
                    with torch.no_grad():
                        for inputs, labels in dataloaders[phase]:
                            inputs_list.append(inputs.to(device))
                            labels_list.append(labels.to(device))
                        pre_y = model(torch.cat(inputs_list, dim=0))
                    val_y = torch.cat(labels_list, dim=0)

    if savebest is True:
        if pre_y is None:
            raise RuntimeError(
                "validation loss never improved, so no best-epoch predictions "
                "are available (check for NaN losses or num_epochs == 0)"
            )
        return train_losses, valid_losses, best_loss, pre_y,val_y
    else:
        return best_loss

In [None]:
def model_cross_train(params, data, savebest=False):
    """5-fold cross-validation of the MLP regressor.

    Args:
        params: dict providing ``batch_size`` and ``lr`` plus the keys
            ``MLPregression`` reads.
        data: 2-D array; the last column is the target.
        savebest: if True, checkpoint each fold and collect loss curves and
            best-epoch validation predictions.

    Returns:
        * savebest=True: ``(train_losses, valid_losses, loss, val_ys, pre_ys)``
          with one entry per fold.
        * otherwise: the mean of the folds' best validation losses.
    """
    x = data[:, :-1]
    y = data[:, -1:]
    loss = []
    skf = KFold(n_splits=5, shuffle=True, random_state=42)
    train_losses = []
    valid_losses = []
    val_ys = []
    pre_ys = []
    for i, (trn_idx, val_idx) in enumerate(skf.split(x, y)):
        trn_x, trn_y = x[trn_idx], y[trn_idx]
        val_x, val_y = x[val_idx], y[val_idx]
        # Convert to tensors. reshape(-1) keeps the targets 1-D even when a
        # fold holds a single row; squeeze() would collapse that case to a
        # 0-d array, which the loader cannot index.
        X_train_t = torch.from_numpy(trn_x.astype(np.float32))
        y_train_t = torch.from_numpy(trn_y.reshape(-1).astype(np.float32))
        X_valid_t = torch.from_numpy(val_x.astype(np.float32))
        y_valid_t = torch.from_numpy(val_y.reshape(-1).astype(np.float32))

        train_loader = FastTensorDataLoader(X_train_t, y_train_t, batch_size=params['batch_size'],
                                            shuffle=True)
        val_loader = FastTensorDataLoader(X_valid_t, y_valid_t, batch_size=params['batch_size'],
                                          shuffle=True)
        # Fresh model, L1 loss and Adam optimizer for every fold.
        model = MLPregression(params, data)
        criterion = torch.nn.L1Loss()
        optimizer_ft = optim.Adam(model.parameters(), lr=params["lr"])
        dataloaders = {'train': train_loader, 'valid': val_loader}

        if savebest is True:
            train_loss, valid_loss, best_loss, pre_y,act_y = train_model(params, model, dataloaders, criterion, optimizer_ft,
                                                            num_epochs=100, i=i, savebest=True)
            train_losses.append(train_loss)
            valid_losses.append(valid_loss)
            loss.append(best_loss)
            val_ys.append(act_y)
            pre_ys.append(pre_y)
        else:
            best_loss = train_model(params, model, dataloaders, criterion, optimizer_ft, num_epochs=100, i=i)
            loss.append(best_loss)

    if savebest is True:
        return train_losses, valid_losses, loss,val_ys,pre_ys
    else:
        return np.array(loss).mean()


def model_objective(trial, data):
    """Optuna objective: mean best validation loss of the cross-validated MLP.

    Args:
        trial: Optuna trial used to sample lr, batch_size, drop_out, unit
            and layber_number.
        data: array passed on to ``model_cross_train``.

    Returns:
        The value returned by ``model_cross_train`` for the sampled parameters.
    """
    # `step` is passed by keyword: positional use is deprecated in recent
    # Optuna releases.
    params = {
        'lr': trial.suggest_float("lr", 1e-5, 1e-2),
        'batch_size': trial.suggest_int("batch_size", 16, 128, step=16),
        'drop_out': trial.suggest_float("drop_out", 0, 0.2),
        'unit': trial.suggest_int("unit", 16, 128, step=16),
        'layber_number': trial.suggest_int("layber_number", 1, 16, step=1),
    }
    return model_cross_train(params, data)

In [None]:
def data_generator(f_path, seed=42):
    """Standardize every column except the last of an Excel sheet.

    Note: this redefines ``data_generator``; unlike the earlier definition in
    this notebook it skips the last column and returns only the transformed
    values, not the fitted scaler.

    Args:
        f_path: path of the Excel file to read.
        seed: accepted but not used by this function.

    Returns:
        The z-score standardized feature values.
    """
    feature_frame = pd.read_excel(f_path).iloc[:, :-1]
    fitted = preprocessing.StandardScaler().fit(feature_frame)
    return fitted.transform(feature_frame)

In [None]:
# Standardize the feature columns of the BIC-filtered dataset and append the
# "Y" column read from the same file as the last column.
file_path = "./BIC筛选后特征数据集.xlsx"
# file_path = "./原始数据2.xlsx"
test_x = data_generator(file_path, seed=42)
label = np.array(pd.read_excel(file_path)["Y"]).reshape(-1,1)
data =np.hstack((test_x,label))

In [None]:
# file_path = "./BIC筛选后特征数据集.xlsx"
# data = pd.read_excel(path).fillna(0)

In [None]:
# NOTE(review): this rebinds `data` to the standardized copy `aa`, discarding
# the array assembled from the BIC-filtered file above - confirm intended.
data = np.array(aa)

In [None]:
# Hyper-parameter search for the neural network: 100 TPE trials.
nn_best_params, nn_best_mae = optimizer_optuna(n_trials=100, algo="TPE", optuna_objective=model_objective,
                                               data=data)

In [None]:
# Re-run the cross-validation with the best parameters, saving checkpoints
# and collecting loss curves and best-epoch validation predictions.
train_losses, valid_losses, loss,val_ys,pre_ys = model_cross_train(nn_best_params, data, savebest=True)

In [None]:
criterion = torch.nn.L1Loss()
# L1 loss of the second fold; the value is neither stored nor displayed.
criterion(pre_ys[1].cpu(),val_ys[1].cpu())
# Import the metrics package
import sklearn.metrics as metrics
# Pool the per-fold validation targets and predictions as NumPy values.
nn_val = [a for i in val_ys for a in i.cpu().data.numpy()]
nn_pre = [a for i in pre_ys for a in i.cpu().data.numpy()]

# Map both back to original units with the scale/mean of the scaler's last column.
nn_val = np.array(nn_val) * scaler.scale_[-1] + scaler.mean_[-1] 
nn_pre = np.array(nn_pre) * scaler.scale_[-1] + scaler.mean_[-1] 
# Compute MAE, MSE and R2
nn_mae = metrics.mean_absolute_error(nn_val, nn_pre)
nn_mse = metrics.mean_squared_error(nn_val, nn_pre)
nn_r2 = metrics.r2_score(nn_val, nn_pre)

# Print the results
print("NN_MAE:", nn_mae)
print("NN_MSE:", nn_mse)
print("NN_R2:", nn_r2)

In [None]:
# Save actual vs. predicted values side by side.
NN_pre_data = pd.concat((pd.DataFrame(nn_val),pd.DataFrame(nn_pre)),axis = 1)
NN_pre_data.columns = ["NN_Actual","NN_Predict"]
NN_pre_data.to_excel("NN_pre_data.xlsx",index =False)

In [None]:
# Number of pooled validation samples.
len(nn_val)