In [1]:
import pandas as pd  # 用于处理数据的工具
import lightgbm as lgb  # 机器学习模型 LightGBM
from sklearn.metrics import mean_absolute_error  # 评分 MAE 的计算函数
from sklearn.model_selection import train_test_split  # 拆分训练集与验证集工具
from tqdm import tqdm  # 显示循环的进度条工具

## 数据准备

In [47]:
# 数据准备
df_train = pd.read_csv("./datasets/train.csv")  # 原始训练数据。
# test_dataset = pd.read_csv("./datasets/test.csv")  # 原始测试数据（用于提交）。


# 重命名
df_train.columns = (
        ['index', 'datetime'] +
        [f'V{i + 1}' for i in range(17)] + [f'T1-{i + 1}' for i in range(17)] + [f'T2-{i + 1}' for i in range(17)] +
        [f'T1R-{i + 1}' for i in range(17)] + [f'T2R-{i + 1}' for i in range(17)]

)
df_train

Unnamed: 0,index,datetime,V1,V2,V3,V4,V5,V6,V7,V8,...,T2R-8,T2R-9,T2R-10,T2R-11,T2R-12,T2R-13,T2R-14,T2R-15,T2R-16,T2R-17
0,1,2022/11/6 9:08,35.668999,36.146000,25.558001,26.195000,25.670000,15.702,16.690001,15.991,...,827,827,827,827,827,827,827,827,827,750
1,2,2022/11/6 9:09,35.995998,36.347000,25.382000,26.348000,26.131001,15.523,16.825001,15.871,...,827,827,827,827,827,827,827,827,827,750
2,3,2022/11/6 9:11,35.340000,36.311001,25.469999,26.093000,25.639000,15.564,15.564000,15.947,...,827,827,827,827,827,827,827,827,827,750
3,4,2022/11/6 9:12,35.585999,36.091000,25.250000,26.127001,25.670000,15.575,16.775999,15.936,...,827,827,827,827,827,827,827,827,827,750
4,5,2022/11/6 9:13,35.946999,36.256001,25.163000,26.399000,25.837999,15.460,16.580999,15.795,...,827,827,827,827,827,827,827,827,827,750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26651,26652,2023/3/1 3:49,24.594000,24.377001,29.191999,25.551001,27.016001,4.377,21.929001,24.459,...,837,837,837,837,837,837,837,837,837,750
26652,26653,2023/3/1 3:54,24.379000,24.424999,29.253000,25.652000,27.188000,4.415,22.017000,24.534,...,837,837,837,837,837,837,837,837,837,750
26653,26654,2023/3/1 4:00,24.407000,24.312000,29.010000,25.382000,26.813000,4.354,21.726000,24.204,...,837,837,837,837,837,837,837,837,837,750
26654,26655,2023/3/1 4:05,24.636000,24.409000,29.162001,25.551001,27.032000,4.362,21.813000,21.813,...,837,837,837,837,837,837,837,837,837,750


## 特征工程

In [57]:
def generate_features(df, feature_types: [str, ...]):
    # Convert 'timestamp' to datetime
    df['timestamp'] = pd.to_datetime(df['datetime'])
    df = df.sort_values('timestamp')

    # 时间特征
    if 'time' in feature_types:
        df['year'] = df['timestamp'].dt.year
        df['month'] = df['timestamp'].dt.month
        df['day'] = df['timestamp'].dt.day
        df['hour'] = df['timestamp'].dt.hour
        df['minute'] = df['timestamp'].dt.minute
        df['dayofweek'] = df['timestamp'].dt.dayofweek

    # 累积统计特征
    if 'statistical' in feature_types:
        for i in range(1, 18):
            df[f'T1-{i}_mean'] = df[[f'T1-{j}' for j in range(1, i + 1)]].mean(axis=1)
            df[f'T1-{i}_std'] = df[[f'T1-{j}' for j in range(1, i + 1)]].std(axis=1)

            df[f'T2-{i}_mean'] = df[[f'T2-{j}' for j in range(1, i + 1)]].mean(axis=1)
            df[f'T2-{i}_std'] = df[[f'T2-{j}' for j in range(1, i + 1)]].std(axis=1)

            df[f'V{i}_mean'] = df[[f'V{j}' for j in range(1, i + 1)]].mean(axis=1)
            df[f'V{i}_std'] = df[[f'V{j}' for j in range(1, i + 1)]].std(axis=1)

    # 上下温度区总体特征
    if 'overall' in feature_types:
        # Average, min, max, and standard deviation
        df['T1_mean'] = df[[f'T1-{i}' for i in range(1, 18)]].mean(axis=1)
        df['T1_min'] = df[[f'T1-{i}' for i in range(1, 18)]].min(axis=1)
        df['T1_max'] = df[[f'T1-{i}' for i in range(1, 18)]].max(axis=1)
        df['T1_std'] = df[[f'T1-{i}' for i in range(1, 18)]].std(axis=1)

        df['T2_mean'] = df[[f'T2-{i}' for i in range(1, 18)]].mean(axis=1)
        df['T2_min'] = df[[f'T2-{i}' for i in range(1, 18)]].min(axis=1)
        df['T2_max'] = df[[f'T2-{i}' for i in range(1, 18)]].max(axis=1)
        df['T2_std'] = df[[f'T2-{i}' for i in range(1, 18)]].std(axis=1)

        # Difference and ratio between upper and lower heating rods
        df['T1_T2_diff'] = df['T1_mean'] - df['T2_mean']
        df['T1_T2_ratio'] = df['T1_mean'] / df['T2_mean']

    # 上下温度区交互特征
    if 'interaction' in feature_types:
        for i in range(1, 18):
            df[f'T1-{i}_T2-{i}_diff'] = df[f'T1-{i}'] - df[f'T2-{i}']
            df[f'T1-{i}_T2-{i}_ratio'] = df[f'T1-{i}'] / df[f'T2-{i}']
            df[f'T1-{i}_T2-{i}_interaction'] = df[f'T1-{i}'] * df[f'T2-{i}']

    # 相邻数据点的差值
    if 'difference' in feature_types:
        for i in range(1, 17):
            df[f'T1-{i}_T1-{i + 1}_diff'] = df[f'T1-{i}'] - df[f'T1-{i + 1}']
            df[f'T2-{i}_T2-{i + 1}_diff'] = df[f'T2-{i}'] - df[f'T2-{i + 1}']
            df[f'V{i}_V{i + 1}_diff'] = df[f'V{i}'] - df[f'V{i + 1}']

    # 比例特征
    if 'ratio' in feature_types:
        for i in range(1, 18):
            df[f'T1-{i}_V{i}_ratio'] = df[f'T1-{i}'] / df[f'V{i}']
            df[f'T2-{i}_V{i}_ratio'] = df[f'T2-{i}'] / df[f'V{i}']
            df[f'T1-{i}_V{i}_interaction'] = df[f'T1-{i}'] * df[f'V{i}']
            df[f'T2-{i}_V{i}_interaction'] = df[f'T2-{i}'] * df[f'V{i}']

    # 滑动窗口特征
    if 'rolling_window' in feature_types:
        window_sizes = [i for i in range(1, 5)]  # Change this list according to your needs
        for window_size in window_sizes:
            for i in range(1, 18):
                df[f'T1-{i}_rolling_mean_{window_size}'] = df[f'T1-{i}'].rolling(window_size).mean()
                df[f'T2-{i}_rolling_mean_{window_size}'] = df[f'T2-{i}'].rolling(window_size).mean()
                df[f'V{i}_rolling_mean_{window_size}'] = df[f'V{i}'].rolling(window_size).mean()

    # 空间梯度特征
    # 假设加热棒之间的空间距离是均匀的
    if 'spatial_gradient' in feature_types:
        for i in range(1, 17):
            df[f'T1-{i}_T1-{i + 1}_gradient'] = (df[f'T1-{i + 1}'] - df[f'T1-{i}']) / i
            df[f'T2-{i}_T2-{i + 1}_gradient'] = (df[f'T2-{i + 1}'] - df[f'T2-{i}']) / i

    # 时间梯度特征
    if 'time_gradient' in feature_types:
        for i in range(1, 18):
            df[f'T1-{i}_time_gradient'] = df[f'T1-{i}'].diff()
            df[f'T2-{i}_time_gradient'] = df[f'T2-{i}'].diff()
            df[f'V{i}_time_gradient'] = df[f'V{i}'].diff()

    # 时空梯度特征
    if 'time_spatial_gradient' in feature_types:
        for i in range(1, 17):
            df[f'T1-{i}_T1-{i + 1}_time_gradient'] = df[f'T1-{i + 1}'].diff() - df[f'T1-{i}'].diff()
            df[f'T2-{i}_T2-{i + 1}_time_gradient'] = df[f'T2-{i + 1}'].diff() - df[f'T2-{i}'].diff()
            df[f'V{i}_V{i + 1}_time_gradient'] = df[f'V{i + 1}'].diff() - df[f'V{i}'].diff()

    return df


df_add = (generate_features(
        df_train.copy(),
        ['time', 'statistical', 'overall', 'interaction', 'difference', 'ratio',
         'rolling_window',
         'spatial_gradient', 'time_gradient', 'time_spatial_gradient'])
)
# df_new[df_new.isnull().all(axis=1)]
df_add

  df[f'T1-{i}_mean'] = df[[f'T1-{j}' for j in range(1, i + 1)]].mean(axis=1)
  df[f'T1-{i}_std'] = df[[f'T1-{j}' for j in range(1, i + 1)]].std(axis=1)
  df[f'T2-{i}_mean'] = df[[f'T2-{j}' for j in range(1, i + 1)]].mean(axis=1)
  df[f'T2-{i}_std'] = df[[f'T2-{j}' for j in range(1, i + 1)]].std(axis=1)
  df[f'V{i}_mean'] = df[[f'V{j}' for j in range(1, i + 1)]].mean(axis=1)
  df[f'V{i}_std'] = df[[f'V{j}' for j in range(1, i + 1)]].std(axis=1)
  df[f'T1-{i}_mean'] = df[[f'T1-{j}' for j in range(1, i + 1)]].mean(axis=1)
  df[f'T1-{i}_std'] = df[[f'T1-{j}' for j in range(1, i + 1)]].std(axis=1)
  df[f'T2-{i}_mean'] = df[[f'T2-{j}' for j in range(1, i + 1)]].mean(axis=1)
  df[f'T2-{i}_std'] = df[[f'T2-{j}' for j in range(1, i + 1)]].std(axis=1)
  df[f'V{i}_mean'] = df[[f'V{j}' for j in range(1, i + 1)]].mean(axis=1)
  df[f'V{i}_std'] = df[[f'V{j}' for j in range(1, i + 1)]].std(axis=1)
  df['T1_mean'] = df[[f'T1-{i}' for i in range(1, 18)]].mean(axis=1)
  df['T1_min'] = df[[f'T1-{i}' for 

Unnamed: 0,index,datetime,V1,V2,V3,V4,V5,V6,V7,V8,...,V13_V14_time_gradient,T1-14_T1-15_time_gradient,T2-14_T2-15_time_gradient,V14_V15_time_gradient,T1-15_T1-16_time_gradient,T2-15_T2-16_time_gradient,V15_V16_time_gradient,T1-16_T1-17_time_gradient,T2-16_T2-17_time_gradient,V16_V17_time_gradient
0,1,2022/11/6 9:08,35.668999,36.146000,25.558001,26.195000,25.670000,15.702,16.690001,15.991,...,,,,,,,,,,
1,2,2022/11/6 9:09,35.995998,36.347000,25.382000,26.348000,26.131001,15.523,16.825001,15.871,...,1.999854e-02,0.0,0.0,0.001000,0.0,0.0,-0.011000,0.0,0.0,0.063999
2,3,2022/11/6 9:11,35.340000,36.311001,25.469999,26.093000,25.639000,15.564,15.564000,15.947,...,-2.899933e-02,0.0,0.0,-0.011000,0.0,0.0,0.011000,0.0,0.0,-0.063999
3,4,2022/11/6 9:12,35.585999,36.091000,25.250000,26.127001,25.670000,15.575,16.775999,15.936,...,1.900006e-02,0.0,0.0,0.000000,0.0,0.0,-0.048000,0.0,0.0,-0.124001
4,5,2022/11/6 9:13,35.946999,36.256001,25.163000,26.399000,25.837999,15.460,16.580999,15.795,...,-3.800011e-02,0.0,0.0,-0.032999,0.0,0.0,0.153999,0.0,0.0,0.137003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26651,26652,2023/3/1 3:49,24.594000,24.377001,29.191999,25.551001,27.016001,4.377,21.929001,24.459,...,0.000000e+00,0.0,0.0,0.077000,0.0,0.0,-0.177000,0.0,0.0,-0.011000
26652,26653,2023/3/1 3:54,24.379000,24.424999,29.253000,25.652000,27.188000,4.415,22.017000,24.534,...,2.299976e-02,0.0,0.0,-0.096001,0.0,0.0,0.035002,0.0,0.0,-0.258003
26653,26654,2023/3/1 4:00,24.407000,24.312000,29.010000,25.382000,26.813000,4.354,21.726000,24.204,...,-3.499889e-02,0.0,0.0,0.065999,0.0,0.0,0.056000,0.0,0.0,0.097002
26654,26655,2023/3/1 4:05,24.636000,24.409000,29.162001,25.551001,27.032000,4.362,21.813000,21.813,...,-9.600000e-07,0.0,0.0,-0.037998,0.0,0.0,0.212999,0.0,0.0,0.142998


## 训练前预处理

In [85]:
target_cols = df_add.columns[53: 53 + 17 * 2].tolist()
feature_cols = [col for col in df_add.columns if col not in target_cols + ['index', 'datetime', 'timestamp']]
X = df_add[feature_cols]
Y = df_add[target_cols]
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2, random_state=42)

# 准备 LightGBM 数据
train_data = lgb.Dataset(X_train, label=Y_train)
val_data = lgb.Dataset(X_val, label=Y_val, reference=train_data)

['V1',
 'V2',
 'V3',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8',
 'V9',
 'V10',
 'V11',
 'V12',
 'V13',
 'V14',
 'V15',
 'V16',
 'V17',
 'T1-1',
 'T1-2',
 'T1-3',
 'T1-4',
 'T1-5',
 'T1-6',
 'T1-7',
 'T1-8',
 'T1-9',
 'T1-10',
 'T1-11',
 'T1-12',
 'T1-13',
 'T1-14',
 'T1-15',
 'T1-16',
 'T1-17',
 'T2-1',
 'T2-2',
 'T2-3',
 'T2-4',
 'T2-5',
 'T2-6',
 'T2-7',
 'T2-8',
 'T2-9',
 'T2-10',
 'T2-11',
 'T2-12',
 'T2-13',
 'T2-14',
 'T2-15',
 'T2-16',
 'T2-17',
 'year',
 'month',
 'day',
 'hour',
 'minute',
 'dayofweek',
 'T1-1_mean',
 'T1-1_std',
 'T2-1_mean',
 'T2-1_std',
 'V1_mean',
 'V1_std',
 'T1-2_mean',
 'T1-2_std',
 'T2-2_mean',
 'T2-2_std',
 'V2_mean',
 'V2_std',
 'T1-3_mean',
 'T1-3_std',
 'T2-3_mean',
 'T2-3_std',
 'V3_mean',
 'V3_std',
 'T1-4_mean',
 'T1-4_std',
 'T2-4_mean',
 'T2-4_std',
 'V4_mean',
 'V4_std',
 'T1-5_mean',
 'T1-5_std',
 'T2-5_mean',
 'T2-5_std',
 'V5_mean',
 'V5_std',
 'T1-6_mean',
 'T1-6_std',
 'T2-6_mean',
 'T2-6_std',
 'V6_mean',
 'V6_std',
 'T1-7_mean',
 'T1-7_std',
 

## 超参数设定

In [None]:
params = {
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'mae',
        'min_child_weight': 5,
        'num_leaves': 2 ** 5,
        'lambda_l2': 10,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 4,
        'learning_rate': 0.05,
        'seed': 2023,
        'nthread': 16,
        'verbose': -1,
}

## 模型训练

In [None]:
# Train the model
gbm = lgb.train(params,
                train_data,
                valid_sets=val_data,
                num_boost_round=5000,
                early_stopping_rounds=100)

## 模型预测

In [None]:
# Predict on the validation set
Y_val_pred = gbm.predict(X_val, num_iteration=gbm.best_iteration)

# Calculate MAE
mae = mean_absolute_error(Y_val, Y_val_pred)

print(f'Validation MAE: {mae}')

## 模型推理