In [72]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error

In [73]:
def make_train_test(X, Y, seed, rate):
    idx = int(rate * X.shape[0])
    X_train = X[:idx]
    Y_train = Y[:idx]
    X_test = X[idx:]
    Y_test = Y[idx:]
    shuffled_indices = np.arange(X_train.shape[0])
    np.random.seed(seed)
    np.random.shuffle(shuffled_indices)
    X_train, Y_train = X_train[shuffled_indices], Y_train[shuffled_indices]
    return (X_train, Y_train), (X_test, Y_test)

In [74]:
data_iscp = pd.read_csv('train_data_iscp.csv', index_col='Unnamed: 0')
data_iscp.shape, data_iscp.head()

((988941, 44),
       county   target  year  month  day  hour  target_used  eic_count  \
 5857       0  107.129  2021      9    3     0       96.590      108.0   
 5859       0   19.630  2021      9    3     0       17.314       17.0   
 5861       0  690.908  2021      9    3     0      656.859      688.0   
 5863       0   64.100  2021      9    3     0       59.000        5.0   
 5865       0  529.972  2021      9    3     0      501.760       43.0   
 
       installed_capacity  lowest_price_per_mwh  ...  \
 5857              952.89                 45.62  ...   
 5859              166.40                 45.62  ...   
 5861             7207.88                 45.62  ...   
 5863              400.00                 45.62  ...   
 5865             1411.00                 45.62  ...   
 
       direct_solar_radiation_fw  surface_solar_radiation_downwards_fw  \
 5857                        0.0                                   0.0   
 5859                        0.0                     

In [75]:
data_nocp = pd.read_csv('train_data_nocp.csv', index_col='Unnamed: 0')
data_nocp.shape, data_nocp.head()

((988941, 44),
       county  target  year  month  day  hour  target_used  eic_count  \
 5856       0   0.793  2021      9    3     0        0.713      108.0   
 5858       0   0.000  2021      9    3     0        0.000       17.0   
 5860       0   0.977  2021      9    3     0        2.904      688.0   
 5862       0   0.000  2021      9    3     0        0.000        5.0   
 5864       0   0.000  2021      9    3     0        0.000       43.0   
 
       installed_capacity  lowest_price_per_mwh  ...  \
 5856              952.89                 45.62  ...   
 5858              166.40                 45.62  ...   
 5860             7207.88                 45.62  ...   
 5862              400.00                 45.62  ...   
 5864             1411.00                 45.62  ...   
 
       direct_solar_radiation_fw  surface_solar_radiation_downwards_fw  \
 5856                        0.0                                   0.0   
 5858                        0.0                           

In [76]:
data_selected_iscp = pd.read_csv('train_data_selected_iscp.csv', index_col='Unnamed: 0')
data_selected_iscp.shape, data_selected_iscp.head()

((988941, 33),
       county   target  year  month  day  hour  target_used  eic_count  \
 5857       0  107.129  2021      9    3     0       96.590      108.0   
 5859       0   19.630  2021      9    3     0       17.314       17.0   
 5861       0  690.908  2021      9    3     0      656.859      688.0   
 5863       0   64.100  2021      9    3     0       59.000        5.0   
 5865       0  529.972  2021      9    3     0      501.760       43.0   
 
       installed_capacity  lowest_price_per_mwh  ...  cloudcover_total_fw  \
 5857              952.89                 45.62  ...             0.773093   
 5859              166.40                 45.62  ...             0.773093   
 5861             7207.88                 45.62  ...             0.773093   
 5863              400.00                 45.62  ...             0.773093   
 5865             1411.00                 45.62  ...             0.773093   
 
       direct_solar_radiation_fw  surface_solar_radiation_downwards_fw  \
 

In [77]:
data_selected_nocp = pd.read_csv('train_data_selected_nocp.csv', index_col='Unnamed: 0')
data_selected_nocp.shape, data_selected_nocp.head()

((988941, 33),
       county  target  year  month  day  hour  target_used  eic_count  \
 5856       0   0.793  2021      9    3     0        0.713      108.0   
 5858       0   0.000  2021      9    3     0        0.000       17.0   
 5860       0   0.977  2021      9    3     0        2.904      688.0   
 5862       0   0.000  2021      9    3     0        0.000        5.0   
 5864       0   0.000  2021      9    3     0        0.000       43.0   
 
       installed_capacity  lowest_price_per_mwh  ...  cloudcover_total_fw  \
 5856              952.89                 45.62  ...             0.773093   
 5858              166.40                 45.62  ...             0.773093   
 5860             7207.88                 45.62  ...             0.773093   
 5862              400.00                 45.62  ...             0.773093   
 5864             1411.00                 45.62  ...             0.773093   
 
       direct_solar_radiation_fw  surface_solar_radiation_downwards_fw  \
 5856  

In [78]:
model = lgb.LGBMRegressor()

### 先来个没有筛选特征
##### iscp训练的模型计算mae，和nocp训练的模型计算的mae

In [79]:
X_iscp = np.array(data_iscp.drop('target', axis=1))
y_iscp = np.array(data_iscp.target)
(X_train_iscp, y_train_iscp), (X_test_iscp, y_test_iscp) = make_train_test(X_iscp, y_iscp, 1, 0.7)
model.fit(X_train_iscp, y_train_iscp)
y_pred_iscp = model.predict(X_test_iscp)
mae_iscp = mean_absolute_error(y_test_iscp, y_pred_iscp)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021471 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8106
[LightGBM] [Info] Number of data points in the train set: 692258, number of used features: 43
[LightGBM] [Info] Start training from score 420.218698


In [80]:
X_nocp = np.array(data_nocp.drop('target', axis=1))
y_nocp = np.array(data_nocp.target)
(X_train_nocp, y_train_nocp), (X_test_nocp, y_test_nocp) = make_train_test(X_nocp, y_nocp, 1, 0.7)
model.fit(X_train_nocp, y_train_nocp)
y_pred_nocp = model.predict(X_test_nocp)
mae_nocp = mean_absolute_error(y_test_nocp, y_pred_nocp)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020289 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8106
[LightGBM] [Info] Number of data points in the train set: 692258, number of used features: 43
[LightGBM] [Info] Start training from score 82.693246


In [81]:
mae_iscp, mae_nocp, (mae_iscp+mae_nocp)/2

(137.38071858599616, 45.954090035536524, 91.66740431076634)

### 再来个筛选特征后的
##### selected_iscp训练的模型计算mae，和selected_nocp训练的模型计算的mae

In [82]:
X_selected_iscp = np.array(data_selected_iscp.drop('target', axis=1))
y_selected_iscp = np.array(data_selected_iscp.target)
(X_train_selected_iscp, y_train_selected_iscp), (X_test_selected_iscp, y_test_selected_iscp) = make_train_test(X_selected_iscp, y_selected_iscp, 1, 0.7)
model.fit(X_train_selected_iscp, y_train_selected_iscp)
y_pred_selected_iscp = model.predict(X_test_selected_iscp)
mae_selected_iscp = mean_absolute_error(y_test_selected_iscp, y_pred_selected_iscp)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013013 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5317
[LightGBM] [Info] Number of data points in the train set: 692258, number of used features: 32
[LightGBM] [Info] Start training from score 420.218698


In [83]:
X_selected_nocp = np.array(data_selected_nocp.drop('target', axis=1))
y_selected_nocp = np.array(data_selected_nocp.target)
(X_train_selected_nocp, y_train_selected_nocp), (X_test_selected_nocp, y_test_selected_nocp) = make_train_test(X_selected_nocp, y_selected_nocp, 1, 0.7)
model.fit(X_train_selected_nocp, y_train_selected_nocp)
y_pred_selected_nocp = model.predict(X_test_selected_nocp)
mae_selected_nocp = mean_absolute_error(y_test_selected_nocp, y_pred_selected_nocp)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014289 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5317
[LightGBM] [Info] Number of data points in the train set: 692258, number of used features: 32
[LightGBM] [Info] Start training from score 82.693246


In [84]:
mae_selected_iscp, mae_selected_nocp, (mae_selected_iscp+mae_selected_nocp)/2

(138.70618581169347, 47.09456928756863, 92.90037754963106)

* 我们可以注意到每个模型训练只用了整体数据的一半，即训练iscp模型就只用iscp的数据。

In [85]:
data = pd.concat([data_iscp,data_nocp], axis=0)
X = np.array(data.drop('target', axis=1))
y = np.array(data.target)
(X_train, y_train), (X_test, y_test) = make_train_test(X, y, 1, 0.7)
model.fit(X_train,y_train)
y_pred_iscp = model.predict(X_test_iscp)
y_pred_nocp = model.predict(X_test_nocp)
mae_iscp = mean_absolute_error(y_test_iscp, y_pred_iscp)
mae_nocp = mean_absolute_error(y_test_nocp, y_pred_nocp)
mae_iscp, mae_nocp, (mae_iscp+mae_nocp)/2

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033251 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8158
[LightGBM] [Info] Number of data points in the train set: 1384517, number of used features: 43
[LightGBM] [Info] Start training from score 344.039993


(92.47473632532711, 79.68547576994806, 86.08010604763759)

In [86]:
data_selected = pd.concat([data_selected_iscp,data_selected_nocp], axis=0)
X_selected = np.array(data_selected.drop('target', axis=1))
y_selected = np.array(data_selected.target)
(X_train_selected, y_train_selected), (X_test_selected, y_test_selected) = make_train_test(X_selected, y_selected, 1, 0.7)
model.fit(X_train_selected,y_train_selected)
y_pred_selected_iscp = model.predict(X_test_selected_iscp)
y_pred_selected_nocp = model.predict(X_test_selected_nocp)
mae_selected_iscp = mean_absolute_error(y_test_selected_iscp, y_pred_selected_iscp)
mae_selected_nocp = mean_absolute_error(y_test_selected_nocp, y_pred_selected_nocp)
mae_selected_iscp, mae_selected_nocp, (mae_selected_iscp+mae_selected_nocp)/2

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.024576 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5373
[LightGBM] [Info] Number of data points in the train set: 1384517, number of used features: 32
[LightGBM] [Info] Start training from score 344.039993


(92.50119178572898, 81.44163920024458, 86.97141549298678)