# 锂电池温度预测
锂离子电池材料的主要生产设备是电炉，研究烧结过程的数字化建模，通过电炉空间温度推测产品内部温度，设计烧结过程的温度场和浓度场的最优控制律，搭建产品制备过程运行平台，有望最终实现该过程的效率提升和协同优化，达到提高产品一致性，降低生产能耗的目标。初赛提供了电炉17个温区的实际生产数据，分别是电炉上部17组加热棒设定温度T1-1~T1-17，电炉下部17组加热棒设定温度T2-1~T2-17，底部17组进气口的设定进气流量V1-V17，选手需要根据提供的数据样本构建模型，预测电炉上下部空间17个测温点的测量温度值。初赛考核办法采用测试集各行数据的加热棒上部温度设定值、加热棒下部温度设定值、进气流量3类数据作为输入，选手分别预测上部空间测量温度、下部空间测量温度。将选手预测的上部空间测量温度、下部空间测量温度与测试集数据的测量值进行比较。采用MAE平均绝对误差作为评价指标。

## 环境配置

In [None]:
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
import optuna
import optuna.visualization as ov
from joblib import dump, load, Parallel, delayed
import numpy as np

In [None]:
# !conda install pandas numpy scikit-learn lightgbm optuna joblib -y

## 超参数配置

In [None]:
# Feature groups that are enabled for this run
USE_FEATURE_TYPES = ['timebase']
# Every feature group name that generate_features checks for
USE_FEATURE_TYPES_ALL = [
    'timebase',
    'time',
    'statistical',
    'overall',
    'interaction',
    'difference',
    'ratio',
    'rolling_window',
    'spatial_gradient',
    'time_gradient',
    'time_spatial_gradient',
]
# Number of boosting rounds used for each LightGBM fit
TRAIN_ROUND = 3

# Number of Optuna trials run per study
OPTUNA_ROUND = 50

## 数据准备

In [None]:
# Load the raw training and test tables
df_train = pd.read_csv("./datasets/train.csv")
df_test = pd.read_csv("./datasets/test.csv")

# Rename the columns: two identifier columns, then 17 columns for each of
# V (gas flow), T1-/T2- (upper/lower set temperatures) and
# T1R-/T2R- (upper/lower measured temperatures, the prediction targets)
df_train.columns = ['index', 'datetime'] + [
    f'{prefix}{zone}'
    for prefix in ('V', 'T1-', 'T2-', 'T1R-', 'T2R-')
    for zone in range(1, 18)
]
# The test table gets the names of the first 53 columns only
# (2 identifiers + 3 * 17 inputs), i.e. everything except the targets
df_test.columns = df_train.columns[:2 + 3 * 17]
df_train

## 特征工程

In [None]:
def generate_features(df, feature_types: 'list[str]'):
    """
    Build the requested groups of engineered features on top of the raw columns.

    The rows are ordered by the parsed ``datetime`` column first, so the
    row-to-row features (rolling means, diffs) follow time order.

    :param df: frame holding a ``datetime`` column plus, for every group other
        than the two time-based ones, the raw columns ``T1-1..T1-17``,
        ``T2-1..T2-17`` and ``V1..V17``; the passed frame is not modified
    :param feature_types: names of the feature groups to generate (see
        ``USE_FEATURE_TYPES_ALL``); names that match no group are ignored
    :return: a new frame sorted by ``timestamp`` with the feature columns appended
    """
    # assign() returns a new frame, so the caller's frame does not silently
    # gain a 'timestamp' column
    df = df.assign(timestamp=pd.to_datetime(df['datetime'])).sort_values('timestamp')

    # Raw column names per zone, reused by the groups below
    t1_cols = [f'T1-{i}' for i in range(1, 18)]
    t2_cols = [f'T2-{i}' for i in range(1, 18)]
    v_cols = [f'V{i}' for i in range(1, 18)]

    # Time features (baseline set)
    if 'timebase' in feature_types:
        ts = df['timestamp'].dt
        df['month'] = ts.month
        df['day'] = ts.day
        df['hour'] = ts.hour
        df['minute'] = ts.minute
        df['dayofweek'] = ts.dayofweek
        df["dayofyear"] = ts.dayofyear
        # dayofweek is 0=Monday .. 6=Sunday, so the weekend is 5 and 6
        # (the former `// 6` flagged Sunday only)
        df["is_weekend"] = (ts.dayofweek >= 5).astype(int)
        # Use the parsed 'timestamp' column; the raw time column was renamed
        # to 'datetime', so the old lookup by its original name raised KeyError
        df["weekofyear"] = ts.isocalendar().week.astype(int)

    # Time features (overlaps with the baseline set, adds the year)
    if 'time' in feature_types:
        df['year'] = df['timestamp'].dt.year
        df['month'] = df['timestamp'].dt.month
        df['day'] = df['timestamp'].dt.day
        df['hour'] = df['timestamp'].dt.hour
        df['minute'] = df['timestamp'].dt.minute
        df['dayofweek'] = df['timestamp'].dt.dayofweek

    # Cumulative statistics over zones 1..i
    # (the std over a single zone, i == 1, is NaN)
    if 'statistical' in feature_types:
        for i in range(1, 18):
            df[f'T1-{i}_mean'] = df[t1_cols[:i]].mean(axis=1)
            df[f'T1-{i}_std'] = df[t1_cols[:i]].std(axis=1)

            df[f'T2-{i}_mean'] = df[t2_cols[:i]].mean(axis=1)
            df[f'T2-{i}_std'] = df[t2_cols[:i]].std(axis=1)

            df[f'V{i}_mean'] = df[v_cols[:i]].mean(axis=1)
            df[f'V{i}_std'] = df[v_cols[:i]].std(axis=1)

    # Overall statistics of the upper and lower temperature zones
    if 'overall' in feature_types:
        # Average, min, max, and standard deviation
        df['T1_mean'] = df[t1_cols].mean(axis=1)
        df['T1_min'] = df[t1_cols].min(axis=1)
        df['T1_max'] = df[t1_cols].max(axis=1)
        df['T1_std'] = df[t1_cols].std(axis=1)

        df['T2_mean'] = df[t2_cols].mean(axis=1)
        df['T2_min'] = df[t2_cols].min(axis=1)
        df['T2_max'] = df[t2_cols].max(axis=1)
        df['T2_std'] = df[t2_cols].std(axis=1)

        # Difference and ratio between upper and lower heating rods
        df['T1_T2_diff'] = df['T1_mean'] - df['T2_mean']
        df['T1_T2_ratio'] = df['T1_mean'] / df['T2_mean']

    # Interaction between the upper and lower value of the same zone
    if 'interaction' in feature_types:
        for i in range(1, 18):
            df[f'T1-{i}_T2-{i}_diff'] = df[f'T1-{i}'] - df[f'T2-{i}']
            df[f'T1-{i}_T2-{i}_ratio'] = df[f'T1-{i}'] / df[f'T2-{i}']
            df[f'T1-{i}_T2-{i}_interaction'] = df[f'T1-{i}'] * df[f'T2-{i}']

    # Difference between neighbouring zones
    if 'difference' in feature_types:
        for i in range(1, 17):
            df[f'T1-{i}_T1-{i + 1}_diff'] = df[f'T1-{i}'] - df[f'T1-{i + 1}']
            df[f'T2-{i}_T2-{i + 1}_diff'] = df[f'T2-{i}'] - df[f'T2-{i + 1}']
            df[f'V{i}_V{i + 1}_diff'] = df[f'V{i}'] - df[f'V{i + 1}']

    # Temperature-to-flow ratio and product per zone
    if 'ratio' in feature_types:
        for i in range(1, 18):
            df[f'T1-{i}_V{i}_ratio'] = df[f'T1-{i}'] / df[f'V{i}']
            df[f'T2-{i}_V{i}_ratio'] = df[f'T2-{i}'] / df[f'V{i}']
            df[f'T1-{i}_V{i}_interaction'] = df[f'T1-{i}'] * df[f'V{i}']
            df[f'T2-{i}_V{i}_interaction'] = df[f'T2-{i}'] * df[f'V{i}']

    # Rolling-window means over consecutive (time-ordered) rows
    if 'rolling_window' in feature_types:
        window_sizes = list(range(1, 5))  # Change this list according to your needs
        for window_size in window_sizes:
            for i in range(1, 18):
                df[f'T1-{i}_rolling_mean_{window_size}'] = df[f'T1-{i}'].rolling(window_size).mean()
                df[f'T2-{i}_rolling_mean_{window_size}'] = df[f'T2-{i}'].rolling(window_size).mean()
                df[f'V{i}_rolling_mean_{window_size}'] = df[f'V{i}'].rolling(window_size).mean()

    # Spatial gradient between neighbouring zones
    # (the heating rods are assumed to be evenly spaced)
    # NOTE(review): dividing by the zone index i does not match the
    # even-spacing assumption (that would be a constant divisor); kept as-is
    # so existing feature values do not change - confirm the intent.
    if 'spatial_gradient' in feature_types:
        for i in range(1, 17):
            df[f'T1-{i}_T1-{i + 1}_gradient'] = (df[f'T1-{i + 1}'] - df[f'T1-{i}']) / i
            df[f'T2-{i}_T2-{i + 1}_gradient'] = (df[f'T2-{i + 1}'] - df[f'T2-{i}']) / i

    # Time gradient: change relative to the previous row (first row is NaN)
    if 'time_gradient' in feature_types:
        for i in range(1, 18):
            df[f'T1-{i}_time_gradient'] = df[f'T1-{i}'].diff()
            df[f'T2-{i}_time_gradient'] = df[f'T2-{i}'].diff()
            df[f'V{i}_time_gradient'] = df[f'V{i}'].diff()

    # Time-space gradient: difference of the time gradients of neighbouring zones
    if 'time_spatial_gradient' in feature_types:
        for i in range(1, 17):
            df[f'T1-{i}_T1-{i + 1}_time_gradient'] = df[f'T1-{i + 1}'].diff() - df[f'T1-{i}'].diff()
            df[f'T2-{i}_T2-{i + 1}_time_gradient'] = df[f'T2-{i + 1}'].diff() - df[f'T2-{i}'].diff()
            df[f'V{i}_V{i + 1}_time_gradient'] = df[f'V{i + 1}'].diff() - df[f'V{i}'].diff()

    return df


# Engineered features for the training set
df_add = generate_features(df_train.copy(), USE_FEATURE_TYPES)
# Engineered features for the test set. These must be built from df_test;
# building them from df_train would make inference run on training rows.
df_add_test = generate_features(df_test.copy(), USE_FEATURE_TYPES)

df_add.columns

## 训练前预处理

In [None]:
# The 34 measured-temperature columns (T1R-*, T2R-*) sit right after the
# 53 identifier/input columns
target_cols = list(df_add.columns[53:53 + 17 * 2])
# Everything that is neither a target nor an identifier/time column is a feature
feature_cols = [
    col for col in df_add.columns
    if col not in ('index', 'datetime', 'timestamp') and col not in target_cols
]

# Training labels
Y = df_add[target_cols]

# Training features
X = df_add[feature_cols]

# Test features
X_test = df_add_test[feature_cols]

# TODO: reduce memory usage
X.shape, Y.shape

## 模型训练

### 超参数优化

In [None]:
def objective(trial, x, y_target):
    """
    Optuna objective: 5-fold cross-validated MAE of a LightGBM regressor.

    :param trial: Optuna trial used to sample the tunable hyperparameters
    :param x: feature matrix (indexed positionally with ``.iloc``)
    :param y_target: label vector of a single target (also indexed with ``.iloc``)
    :return: MAE averaged over the five validation folds
    """

    # Search space; entries without a trial.suggest_* call are held fixed
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': 'regression_l1',
        'metric': 'mae',
        'min_child_weight': 5,
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'lambda_l2': 10,
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'seed': 2023,
        'nthread': 16,
        'verbose': -1,
        'device': 'cpu'
    }

    def fold_mae(fit_idx, holdout_idx):
        # Fit on one split and score the held-out rows with MAE
        fit_set = lgb.Dataset(x.iloc[fit_idx], label=y_target.iloc[fit_idx])
        holdout_x = x.iloc[holdout_idx]
        holdout_y = y_target.iloc[holdout_idx]
        holdout_set = lgb.Dataset(holdout_x, label=holdout_y, reference=fit_set)

        booster = lgb.train(lgb_params, fit_set, valid_sets=holdout_set, num_boost_round=TRAIN_ROUND)
        holdout_pred = booster.predict(holdout_x, num_iteration=booster.best_iteration)
        return mean_absolute_error(holdout_y, holdout_pred)

    # Shuffled 5-fold cross-validation with a fixed seed
    splitter = KFold(n_splits=5, random_state=42, shuffle=True)
    fold_scores = [fold_mae(fit_idx, holdout_idx) for fit_idx, holdout_idx in splitter.split(x)]

    # Report the fold average to Optuna and use it as the trial value
    mean_mae = np.mean(fold_scores)
    trial.report(mean_mae, step=0)

    return mean_mae


### 多目标训练

In [None]:

# Fixed LightGBM settings for the final fit. They mirror the non-tuned entries
# of the params dict in objective(): Optuna's best_trial.params only contains
# the values obtained through trial.suggest_*, so without these the final
# model would fall back to LightGBM defaults (e.g. L2 instead of L1 loss).
_FIXED_LGB_PARAMS = {
    'boosting_type': 'gbdt',
    'objective': 'regression_l1',
    'metric': 'mae',
    'min_child_weight': 5,
    'lambda_l2': 10,
    'seed': 2023,
    'nthread': 16,
    'verbose': -1,
    'device': 'cpu',
}


def _tune_target(x, y_target):
    """Run one Optuna study (OPTUNA_ROUND trials) for a single target and return it."""
    study = optuna.create_study(direction='minimize')
    study.optimize(lambda trial: objective(trial, x, y_target), n_trials=OPTUNA_ROUND)
    return study


def _best_params_for_target(x, y_target):
    """Return only the tuned parameter dict, which is cheap to send back from a worker."""
    return _tune_target(x, y_target).best_trial.params


def _fit_and_save(x, y_target, tuned_params, model_path):
    """Train on all rows with fixed + tuned parameters and dump the model to model_path."""
    import os

    # dump() does not create missing directories
    os.makedirs(os.path.dirname(model_path) or '.', exist_ok=True)

    params = {**_FIXED_LGB_PARAMS, **tuned_params}
    train_data = lgb.Dataset(x, label=y_target)
    # Same number of rounds as used while tuning, so the tuned values still apply
    gbm_best = lgb.train(params, train_data, num_boost_round=TRAIN_ROUND)
    dump(gbm_best, model_path)
    return gbm_best


def sequential_optimization(x, y):
    """
    Tune, train and save one model per target column, one target after another.

    Models are written to ``./models/model_{i}.joblib`` where ``i`` is the
    position of the target in ``y.columns``.

    :param x: training feature matrix
    :param y: training label frame, one column per target
    """
    for i, target in enumerate(y.columns):
        y_target = y[target]

        study = _tune_target(x, y_target)

        # Train and save a model using the best hyperparameters
        _fit_and_save(x, y_target, study.best_trial.params, f'./models/model_{i}.joblib')

        # Visualisation of the search for this target
        ov.plot_optimization_history(study).show()
        ov.plot_param_importances(study).show()


# Parallel version
def parallel_optimization(x, y, ):
    """
    Tune all targets in parallel, then train and save one model per target.

    Models are written to ``./models/model_parallel_{i}.joblib``.

    :param x: training feature matrix
    :param y: training label frame, one column per target
    """
    # One study per target, spread over all cores.
    # NOTE(review): objective() also asks LightGBM for 16 threads, so the
    # machine may be oversubscribed when many studies run at once.
    best_params_list = Parallel(n_jobs=-1)(
        delayed(_best_params_for_target)(x, y[col]) for col in y.columns)

    # Train and save a model for each target using the best hyperparameters
    for i, (target, best_params) in enumerate(zip(y.columns, best_params_list)):
        _fit_and_save(x, y[target], best_params, f'./models/model_parallel_{i}.joblib')


# Tune and train one model per target column, one target at a time
sequential_optimization(X, Y)

## 模型推理

In [None]:
def predict_by_multi_targets(x, y):
    """
    Predict every target with its saved model and write the results to predictions.csv.

    :param x: test-set feature matrix
    :param y: training label frame; only its number of columns is used, to
        know how many per-target models to load
    :return: frame with one ``target_{i}`` column per model, in the row order
        of ``x`` (the same frame that is written to predictions.csv)
    """
    predictions = {}

    # For each target, load the corresponding model and make predictions
    for i in range(y.shape[1]):
        # sequential_optimization saves its models under ./models/, so load
        # them from there rather than from the working directory
        gbm_best_saved = load(f'./models/model_{i}.joblib')
        predictions[f'target_{i}'] = gbm_best_saved.predict(x)

    # Build the frame in one go instead of inserting column by column
    df_predictions = pd.DataFrame(predictions)

    # Save the predictions to a csv file
    df_predictions.to_csv('predictions.csv', index=False)
    return df_predictions


# Predict every target for X_test and write the results to predictions.csv
predict_by_multi_targets(X_test, Y)