如果您想预测多个目标变量（'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'），那么确实可以分别建立多个模型，每个模型用于预测一个目标变量。这样做可以允许每个模型专注于一个特定的目标，并可能提高预测的准确性。

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
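# Note: the code below only trains and evaluates on historical data (the last rows are held out and re-predicted); it does not forecast the real future.
# To forecast the future, you need to feed the model new, unseen data as input.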
import matplotlib.pyplot as plt
import pandas as pd


def get_rf(feature_names,other_rows, N,feature_name,n_estimators,max_depth,min_samples_leaf,criterion,min_samples_split):
    """Fit a random forest that predicts one column from rolling means.

    Each training sample pairs the mean of ``feature_names`` over the N rows
    preceding a row with that row's value of ``feature_name`` (single-target,
    one-step-ahead prediction).

    Args:
        feature_names: Columns of ``other_rows`` used as model inputs.
        other_rows: DataFrame holding the training history.
        N: Size of the rolling window, in rows.
        feature_name: Column of ``other_rows`` to predict.
        n_estimators, max_depth, min_samples_leaf, criterion,
        min_samples_split: Forwarded unchanged to ``RandomForestRegressor``.

    Returns:
        The fitted ``RandomForestRegressor``.

    Raises:
        ValueError: If no complete (window, target) pair can be built, e.g.
            when ``other_rows`` has N rows or fewer.
    """
    # Row i holds the mean of rows i-N .. i-1; the first N rows are NaN.
    window_means = other_rows[feature_names].rolling(window=N).mean().shift(1)
    # Cast to float so the target dtype does not depend on the column
    # (integer columns would otherwise stay integer).
    target = other_rows[feature_name].astype(float)

    # Keep a row only when both its window mean and its target are complete.
    # Dropping NaNs from features and target separately and then pairing them
    # by position would misalign the samples if the data had missing values.
    complete = window_means.notna().all(axis=1).to_numpy() & target.notna().to_numpy()
    if not complete.any():
        raise ValueError(
            "not enough complete rows to build a training sample for window N=%r" % (N,)
        )

    X = window_means.to_numpy()[complete]
    y = target.to_numpy()[complete]

    # Fixed random_state keeps runs with equal hyperparameters reproducible.
    rf = RandomForestRegressor(n_estimators=n_estimators, random_state=42,max_depth=max_depth,min_samples_leaf=min_samples_leaf,criterion=criterion,min_samples_split=min_samples_split)

    rf.fit(X, y)

    return rf

# Assume we have a CSV file of stock data with the date, open, high, low, close, adjusted close and volume columns.
# The CSV file is structured as: Date, Open, High, Low, Close, Adj Close, Volume

def train_and_test(n_estimators,max_depth,min_samples_leaf,N=5,criterion='friedman_mse',min_samples_split=20,
                   data_path='/kaggle/input/lstm-stack/rlData.csv',N_problem=5):
    """Train one forest per column and score a recursive multi-step forecast.

    The last ``N_problem`` rows of the CSV are held out. One model per column
    is fitted on the remaining rows via ``get_rf``; the held-out rows are then
    predicted one step at a time, each prediction being appended to the
    history so that it feeds the rolling window of the next step.

    Args:
        n_estimators, max_depth, min_samples_leaf, criterion,
        min_samples_split: Random-forest hyperparameters passed to ``get_rf``.
        N: Rolling-window size (rows) used to build the model inputs.
        data_path: CSV file to load; it must contain a ``Date`` column and
            the six columns listed in ``feature_names`` below.
        N_problem: Number of trailing rows to hold out and predict.

    Returns:
        RMSE between the held-out rows and the predicted rows, pooled over
        all six columns.
        NOTE(review): the columns are not scaled before pooling, so
        large-magnitude columns (presumably Volume) dominate the score.

    Raises:
        ValueError: If ``N_problem`` is smaller than 1.
    """
    if N_problem < 1:
        # iloc[:-0] would yield an empty history instead of the full data.
        raise ValueError("N_problem must be at least 1, got %r" % (N_problem,))

    # NOTE: the file is re-read on every call; consider loading it once when
    # this function is invoked many times (e.g. from a hyperparameter search).
    data = pd.read_csv(data_path, parse_dates=['Date'])
    data.set_index('Date', inplace=True)

    feature_names = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']

    # Work only with the modelled columns so that any extra CSV column cannot
    # leak NaNs into the appended prediction rows or into the final metric.
    last_N_rows = data[feature_names].tail(N_problem)
    other_rows = data[feature_names].iloc[:-N_problem]

    # One independent model per target column.
    rfs = {}
    for name in feature_names:
        rfs[name] = get_rf(feature_names,other_rows, N,name,n_estimators,max_depth,min_samples_leaf,criterion,min_samples_split)

    # Predict the held-out steps recursively.
    for _ in range(N_problem):
        # Mean of the current last N rows (no rolling needed for one window).
        window_mean = other_rows.tail(N).mean().to_numpy().reshape(1, -1)

        # Every model sees the same window and predicts its own column.
        predicted = {name: rfs[name].predict(window_mean)[0] for name in feature_names}
        new_row = pd.DataFrame([predicted], columns=feature_names)

        # Append the prediction so it becomes part of the next window.
        other_rows = pd.concat([other_rows, new_row], ignore_index=True)

    # Compare the true held-out rows with the predicted rows.
    mse = mean_squared_error(last_N_rows, other_rows.tail(N_problem))
    rmse = np.sqrt(mse)

    return rmse

In [2]:

def objective(trial):
    """Optuna objective: sample hyperparameters and return the forecast RMSE.

    Args:
        trial: Optuna trial object used to draw the hyperparameter values.

    Returns:
        The RMSE reported by ``train_and_test`` for the sampled
        hyperparameters (lower is better).
    """
    # Search space explored by Optuna.
    n_estimators = trial.suggest_int("n_estimators", 10, 200, step=1)
    max_depth = trial.suggest_int("max_depth", 1, 50, step=1)
    criterion = trial.suggest_categorical("criterion", ['squared_error', 'friedman_mse', 'poisson', 'absolute_error'])
    N = trial.suggest_int("N", 1, 20, step=1)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 100, step=1)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 100, step=1)

    rmse = train_and_test(n_estimators,max_depth,min_samples_leaf,N,criterion,min_samples_split)

    # This is an error measure, not an accuracy.
    return rmse

In [3]:
import optuna  

# Create the study; the objective value (an RMSE) is to be minimized.
study = optuna.create_study(direction="minimize")

# Run the search. n_trials is the number of hyperparameter sets to try.
study.optimize(func=objective, n_trials=1000)

# Report the best hyperparameters and the score they achieved.
for label, value in (("Best params: ", study.best_params), ("Best score: ", study.best_value)):
    print(label, value)

[I 2024-07-04 04:03:32,135] A new study created in memory with name: no-name-6d4ed854-57da-4b7f-8ea4-5d00d747e34c
[I 2024-07-04 04:03:32,928] Trial 0 finished with value: 5330020.521350032 and parameters: {'n_estimators': 74, 'max_depth': 27, 'criterion': 'friedman_mse', 'N': 9, 'min_samples_leaf': 77, 'min_samples_split': 20}. Best is trial 0 with value: 5330020.521350032.
[I 2024-07-04 04:03:33,788] Trial 1 finished with value: 2596388.7486562766 and parameters: {'n_estimators': 66, 'max_depth': 36, 'criterion': 'poisson', 'N': 1, 'min_samples_leaf': 6, 'min_samples_split': 3}. Best is trial 1 with value: 2596388.7486562766.
[I 2024-07-04 04:03:34,556] Trial 2 finished with value: 5567425.344193884 and parameters: {'n_estimators': 77, 'max_depth': 6, 'criterion': 'friedman_mse', 'N': 15, 'min_samples_leaf': 84, 'min_samples_split': 61}. Best is trial 1 with value: 2596388.7486562766.
[I 2024-07-04 04:03:35,526] Trial 3 finished with value: 3914309.5644610096 and parameters: {'n_estim

Best params:  {'n_estimators': 52, 'max_depth': 25, 'criterion': 'absolute_error', 'N': 2, 'min_samples_leaf': 4, 'min_samples_split': 54}
Best score:  1732110.8469695896
