In [None]:
# 本地运行cpu环境运行请添加以下被注释的命令，gpu环境请参考官网gpu版paddle安装
#!pip install paddlepaddle==2.4.0
!pip install lightgbm==3.3.5
!pip install scikit-learn==1.0.2
!pip install joblib==1.2.0

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple


In [2]:
#导入需要的包
%matplotlib inline
import os
import warnings
import pandas as pd
import numpy as np
import shutil
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor, early_stopping
from paddle.io import Dataset
from sklearn.metrics import mean_squared_error,mean_absolute_error
import paddle
import paddle.nn as nn
import paddle.optimizer as optim
warnings.filterwarnings('ignore')

# 1 数据集分析
## 1.1 数据预处理（此处数据预处理采用原作者基线方案，链接：https://aistudio.baidu.com/aistudio/projectdetail/5866171?contributionType=1&sUid=90149&shared=1&ts=1680491732413）

In [7]:
# # 解压缩数据集
# # !unzip -O GBK 区域赛训练集.zip
# !unzip -O GBK 处理后数据.zip

本次比赛重点侧重于模型算法的落地应用，因此，数据集高度贴近风电企业实际情况，选手开发数据集处理方法，需要同时适配不同格式的风机数据。预选赛提供10个风场的数据，区域赛会提供新的10个风场数据用于新的训练和预测，数据格式的情况可能会更为复杂，本文总结了风场数据可能存在的各种情况：

![](https://ai-studio-static-online.cdn.bcebos.com/6ff0922de0f548de9d1c0483d24421dc84439a6eabbb44f78acfcb8a4adac0d0)


针对以上问题，可以确定如下的数据集处理思路：
1. 将风机数据统一为csv格式
2. 将同一个风机的多条数据进行拼接，按时间戳进行排序，合成一个文件

In [8]:
def data_preprocess(data_dir):
    files = os.listdir(data_dir)
    # 第一步，完成数据格式统一
    for f in files:
        # 获取文件路径
        data_file = os.path.join(data_dir, f)
        # 获取文件名后缀
        data_type = os.path.splitext(data_file)[-1]
        # 获取文件名前缀
        data_name = os.path.splitext(data_file)[0]
        # 如果是excel文件，进行转换
        if data_type == '.xlsx':
            # 需要特别注意的是，在读取excel文件时要指定空值的显示方式，否则会在保存时以字符“.”代替，影响后续的数据分析
            data_xls = pd.read_excel(data_file, index_col=0, na_values='')
            data_xls.to_csv(data_name + '.csv', encoding='utf-8')
            # 顺便删除原文件
            os.remove(data_file)
    # 第二步，完成多文件的合并，文件目录要重新更新一次
    files = os.listdir(data_dir)
    for f in files:
        # 获取文件路径
        data_file = os.path.join(data_dir, f)
        # 获取文件名前缀
        data_basename = os.path.basename(data_file)
        # 检查风机数据是否有多个数据文件
        if len(data_basename.split('-')) > 1:
            merge_list = []
            # 找出该风机的所有数据文件
            matches = [f for f in files if (f.find(data_basename.split('-')[0] + '-') > -1)]
            for i in matches:
                # 读取风机这部分数据
                data_df = pd.read_csv(os.path.join(data_dir, i), index_col=False, keep_default_na=False)
                merge_list.append(data_df)
            if len(merge_list) > 0:
                all_data = pd.concat(merge_list, axis=0, ignore_index=True).fillna(".")
                all_data.to_csv(os.path.join(data_dir, data_basename.split('-')[0] + '.csv'), index=False)
            for i in matches:
                # 删除这部分数据文件
                os.remove(os.path.join(data_dir, i))
            # 更新文件目录
            files = os.listdir(data_dir)


In [9]:
data_preprocess('区域赛训练集')

至此，我们完成了数据集的批量预处理，将各种风机不同格式、不同数量文件的数据集进行了统一。这样，在后续的时序数据预测中，我们只需要对csv格式的时序数据进行处理即可。

In [10]:
os.listdir('区域赛训练集')

['19.csv',
 '17.csv',
 '14.csv',
 '12.csv',
 '18.csv',
 '13.csv',
 '11.csv',
 '15.csv',
 '16.csv',
 '20.csv']

# 2 模型应用
## 2.1 数据集准备
在数据集分析时，我们可以不对数据缺失值进行处理以保持数据原貌。但是在模型训练时，则必须指定缺失值的处理方式，否则训练会报错。

In [11]:
# 创建model目录，保存文件
folder_path = "./model"
if os.path.exists(folder_path):
    shutil.rmtree(folder_path)
os.mkdir(folder_path)

In [12]:
class RegressionDataset(Dataset):
    def __init__(self, x, y):
        super().__init__()
        self.features = x
        self.targets = y

    def __getitem__(self, index):
        x = paddle.to_tensor(self.features[index], dtype='float32')
        y = paddle.to_tensor(self.targets[index], dtype='float32')
        return x, y

    def __len__(self):
        return len(self.features)


In [13]:
# # 这里采用一号风机做样例
# tur_id = 1
# n = 2 - len(str(tur_id))
# filename = '0' * n + str(tur_id) + '.csv'
# data = pd.read_csv("功率预测竞赛赛题与数据集/" + filename)

# # ===========异常值处理===========
# # 当实际风速为0时，功率置为0
# data.loc[data['ROUND(A.WS,1)']==0, 'YD15'] = 0

# # TODO 风速过大但功率为0的异常：先设计函数拟合出：实际功率=f(风速)，
# # 然后代入异常功率的风速获取理想功率，替换原异常功率

# # TODO 对于在特定风速下的离群功率（同时刻用IQR检测出来），做功率修正（如均值修正）
# for sp in data['WINDSPEED'].unique():
#     dfs = data.loc[data['WINDSPEED']==sp,'YD15']
#     dfs=dfs.sort_values()
#     Q1=dfs.quantile(0.25)
#     Q3=dfs.quantile(0.75)
#     IQR=Q3-Q1;
#     down=Q1-1.5*IQR
#     up=Q3+1.5*IQR
#     data.loc[(data['WINDSPEED']==sp)&(data['YD15']<down),'YD15']=down
#     data.loc[(data['WINDSPEED']==sp)&(data['YD15']>up),'YD15']=up

# data.dropna(subset=["YD15"], inplace=True) #删除YD15为空的行
# data_mean = data.mean(axis=0)
# data = data.fillna(value=data_mean)  #用平均值填充其它列的空值
# new_data = data.copy()
# # 回归数据集自身特点，还是可以看中回归任务
# train = new_data[["WINDSPEED", "PREPOWER", "WINDDIRECTION", "TEMPERATURE", "HUMIDITY", "PRESSURE", "ROUND(A.WS,1)"]]
# print(train)

## 3.1 方案一：单模型预测
### 3.1.1 GCN单模型训练

In [21]:
# 定义GCN模型
class GCN(nn.Layer):

    def __init__(self, input_size, hidden_size, output_size):
        super(GCN, self).__init__()
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()

    def forward(self, x, adj):
        x = paddle.matmul(adj, x) #计算乘积
        x = self.linear1(x)
        x = self.relu(x)
        x = paddle.matmul(adj, x)
        x = self.linear2(x)
        return x


# 定义数据集
class GraphDataset(paddle.io.Dataset):
    def __init__(self, x, y, use_gpu):
        super().__init__()
        self.x = x
        self.y = y
        self.gpukry = use_gpu
        if use_gpu:
            self.x = paddle.to_tensor(x,place='gpu')
            self.y = paddle.to_tensor(y,place='gpu')

    def __getitem__(self, index):
        return self.x[index], self.y[index]

    def __len__(self):
        return len(self.x)


In [22]:
def matplot_loss(train_loss):
    plt.plot(train_loss, label='train_loss')
    plt.legend(loc='best')
    plt.ylabel('loss', fontsize=12)
    plt.xlabel('epoch', fontsize=12)
    plt.title("loss")
    plt.show()

In [23]:
# 注意这里的参数请自行设定，笔者为了加速验证过程调小了batch_size,epochs数量，建议batch_size为2的n次方，epochs建议高于20
def trainModel(dataset, input_size=7, hidden_size=64, output_size=1, learning_rate=0.01, batch_size=2, epochs=30,
               use_gpu=False):
    """

    :param input_size: 输入特征大小默认是7
    :param hidden_size: 隐藏层大小默认是7
    :param output_size: 输出层大小默认是1
    :param learning_rate: 学习率默认是0.01
    :param batch_size: batch数据大小默认是2
    :param epochs: 训练批次默认是10
    :use_gpu: 是否使用gpu默认是False
    :return: 返回训练好的model
    """

    # 构建GCN模型
    model = GCN(input_size, hidden_size, output_size)
    if use_gpu:
        paddle.set_device('gpu')
        model=model.to('gpu')
    else:
        paddle.set_device('cpu')

    x_train, x_test, y_train, y_test = dataset
    # 将特征值和目标值转换为float32类型
    x_train = x_train.astype('float32')
    y_train = y_train.astype('float32')
    x_test = x_test.astype('float32')
    y_test = y_test.astype('float32')
    # 定义数据加载器
    train_dataset = GraphDataset(x_train, y_train, use_gpu)
    train_loader = paddle.io.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # 定义优化器和损失函数
    optimizer = optim.Adam(parameters=model.parameters(), learning_rate=learning_rate)
    criterion = nn.MSELoss()

    # 模型训练
    for epoch in range(epochs):
        loss = None
        for x, y in train_loader:
            adj = paddle.ones((x.shape[0], x.shape[0]))  # 假设所有节点之间都有连接
            output = model(x, adj)
            loss = criterion(output, y)
            loss.backward()
            optimizer.step()
            optimizer.clear_grad()
        print(f'model epoch{epoch} loss is {loss}')

    # 模型评估
    test_dataset = GraphDataset(x_test, y_test, False)
    test_loader = paddle.io.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    model.eval()
    total_loss = 0
    with paddle.no_grad():
        for x, y in test_loader:
            adj = paddle.ones((x.shape[0], x.shape[0]))  # 假设所有节点之间都有连接
            output = model(x, adj)
            # print(output)
            # print(output.shape)
            loss = criterion(output, y)
            total_loss += loss.numpy().item() * x.shape[0]

    mean_loss = total_loss / len(test_dataset)
    print("Mean Loss on Test Set: {:.4f}".format(mean_loss))
    # matplot_loss(mean_loss)
    return model

In [24]:
# # 模型评估
# model = trainModel(dataset=(x_train, x_test, y_train, y_test),epochs=2,use_gpu=False)
# with paddle.no_grad():
#     adj = paddle.ones((x_test.shape[0], x_test.shape[0]))  # 假设所有节点之间都有连接
#     output = model(paddle.to_tensor(x_test, dtype="float32"), adj)
#     print(output)
#     print(output.shape)

In [25]:
# # 模型保存
# paddle.save(model.state_dict(), "model/" + str(tur_id) + ".pdparams")

### 3.1.2 GCN单模型批量训练

In [26]:
def mutiTrain_GCN(start, end):
    for i in range(start, end+1):
        n = 2 - len(str(i))
        filename = '0' * n + str(i) + '.csv'
        data = pd.read_csv("区域赛训练集/" + filename)

        # # ===========异常值处理===========
        # # 当实际风速为0时，功率置为0
        # data.loc[data['ROUND(A.WS,1)']==0, 'YD15'] = 0

        # # TODO 风速过大但功率为0的异常：先设计函数拟合出：实际功率=f(风速)，
        # # 然后代入异常功率的风速获取理想功率，替换原异常功率

        # # TODO 对于在特定风速下的离群功率（同时刻用IQR检测出来），做功率修正（如均值修正）
        # for sp in data['WINDSPEED'].unique():
        #     dfs = data.loc[data['WINDSPEED']==sp,'YD15']
        #     dfs=dfs.sort_values()
        #     Q1=dfs.quantile(0.25)
        #     Q3=dfs.quantile(0.75)
        #     IQR=Q3-Q1;
        #     down=Q1-1.5*IQR
        #     up=Q3+1.5*IQR
        #     data.loc[(data['WINDSPEED']==sp)&(data['YD15']<down),'YD15']=down
        #     data.loc[(data['WINDSPEED']==sp)&(data['YD15']>up),'YD15']=up
        
        data.dropna(subset=["YD15"], inplace=True)
        data_mean = data.mean(axis=0)
        data = data.fillna(value=data_mean)
        new_data = data.copy()
        train = new_data[
            ["WINDSPEED", "PREPOWER", "WINDDIRECTION", "TEMPERATURE", "HUMIDITY", "PRESSURE", "ROUND(A.WS,1)"]]
        print(train.shape)
        target = new_data[['YD15']].values
        x_train, x_test, y_train, y_test = train_test_split(train.values, target, test_size=0.1)
        
        model = trainModel(dataset=(x_train, x_test, y_train, y_test), batch_size=256, epochs=40,use_gpu=True)
        paddle.save(model.state_dict(), "model/" + str(i) + ".pdparams")

In [27]:
%%time
mutiTrain_GCN(11,20)

(23136, 7)


W0614 11:14:58.376343   220 gpu_resources.cc:61] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 11.2, Runtime API Version: 11.2
W0614 11:14:58.381611   220 gpu_resources.cc:91] device: 0, cuDNN Version: 8.2.


model epoch0 loss is Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=False,
       [2648242432.])
model epoch1 loss is Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=False,
       [10338301952.])
model epoch2 loss is Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=False,
       [2079151104.])
model epoch3 loss is Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=False,
       [2351676160.])
model epoch4 loss is Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=False,
       [4640833536.])
model epoch5 loss is Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=False,
       [8925767680.])
model epoch6 loss is Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=False,
       [24264019968.])
model epoch7 loss is Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=False,
       [3700633600.])
model epoch8 loss is Tensor(shape=[1], dtype=f

## 3.2 方案一：单模型预测
### 3.2.1 gbm单模型训练

In [28]:
# # 这里采用一号风机做样例
# tur_id = 1
# n = 2 - len(str(tur_id))
# filename = '0' * n + str(tur_id) + '.csv'
# data = pd.read_csv("区域赛训练集/" + filename)
# data.dropna(subset=["YD15"], inplace=True)
# data_mean = data.mean(axis=0)
# data = data.fillna(value=data_mean)
# new_data = data.copy()
# # 回归数据集自身特点，还是可以看中回归任务
# train = new_data[["WINDSPEED", "PREPOWER", "WINDDIRECTION", "TEMPERATURE", "HUMIDITY", "PRESSURE", "ROUND(A.WS,1)"]]
# print(train.shape)
# target = new_data[['YD15']].values
# # %%
# x_train, x_test, y_train, y_test = train_test_split(train.values, target, test_size=0.1)

In [29]:
# #定义模型 这里参数自行设定，获取不同调优结果，也可采用CV来调参
# gbm = LGBMRegressor(
#     boosting_type = 'gbdt', 
#     objective = 'regression', 
# 	learning_rate = 0.01, 
#     n_estimators = 20000,
#     n_jobs = -1,
#     num_leaves = 50, 
#     max_depth = 14,
# 	min_child_samples = 21,   			#min_child_samples
# 	min_child_weight = 0.001,	        #min_child_weight
# 	subsample = 1.0,					#bagging_fraction
# 	colsample_bytree = 0.8,		    	#feature_fraction
# 	reg_alpha = 0.01,					#reg_alpha
# 	reg_lambda = 0,						#reg_lambd
#     )

In [30]:
# # 模型训练
# # 指定训练集，验证集，采用l1正则化验证评估，10rounds训练损失没有下降停止，可以增至200~2000左右，这个本地训练更快一些
# gbm.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric="l1",
#         callbacks=[early_stopping(stopping_rounds=500)])
        

In [31]:
# # 模型预测
# y_pre_gbm = gbm.predict(x_test)
# # 模型评估


# print(f"gbm {tur_id} mean_absolute_error:{mean_squared_error(y_test, y_pre_gbm)}")
# print(f"gbm {tur_id} score:{gbm.score(x_test, y_test)}")

# 调参

In [3]:
tur_id = 11
n = 2 - len(str(tur_id))
filename = '0' * n + str(tur_id) + '.csv'
data = pd.read_csv("区域赛训练集/" + filename)
data.dropna(subset=["YD15"], inplace=True)
data_median= data.median(axis=0)
data_mean = data.mean(axis=0)
data = data.fillna(value=data_median)
new_data = data.copy()
# 回归数据集自身特点，还是可以看中回归任务
train = new_data[["WINDSPEED", "PREPOWER", "WINDDIRECTION", "TEMPERATURE", "HUMIDITY", "PRESSURE", "ROUND(A.WS,1)"]]
print(train.shape)
target = new_data[['YD15']].values
# %%
x_train, x_test, y_train, y_test = train_test_split(train.values, target, test_size=0.1)

(23136, 7)


In [5]:
import lightgbm as lgb

params = {    
  'boosting_type': 'gbdt', 
  'objective': 'regression', 
  'learning_rate': 0.1, 
  'num_leaves': 50, 
	'max_depth': 13,    
	'subsample': 0.8, 
  'colsample_bytree': 0.8, 
  }
data_train = lgb.Dataset(x_train, y_train, silent=True)
#lightgbm.cv官方文档https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.cv.html
cv_results = lgb.cv(
  params, 
  data_train, 
  num_boost_round=5000, 
  nfold=5, 
  stratified=False, 
  shuffle=True, 
  metrics='rmse',
  early_stopping_rounds=50, 
  verbose_eval=50, 
  show_stdv=True, 
  seed=0)
print('best n_estimators:', len(cv_results['rmse-mean']))
print('best cv score:', cv_results['rmse-mean'][-1])
n_estimators = len(cv_results['rmse-mean'])
best_cv_score = cv_results['rmse-mean'][-1]


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1192
[LightGBM] [Info] Number of data points in the train set: 16656, number of used features: 7
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1192
[LightGBM] [Info] Number of data points in the train set: 16656, number of used features: 7
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1192
[LightGBM] [Info] Number of data points in the train set: 16656, number of used features: 7
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1192
[LightGBM] [Info] Number of data points in the train set: 16656, number of used features: 7
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1192
[LightGBM] [Info] Number of data points in the train set: 16656, number of used features: 7


KeyboardInterrupt: 

In [11]:
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
# 粗调
### 我们可以创建lgb的sklearn模型，使用上面选择的(学习率，评估器数目)
model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=50,
                              learning_rate=0.1, n_estimators=1716, max_depth=10,
                              metric='rmse', bagging_fraction = 0.8,
                              feature_fraction = 0.8)
params_test1={    
			'max_depth': range(16,19,1),     #16,17,18
			'num_leaves':range(56, 60, 1)   #56,57,58,59
			}   #一共12个组合
gsearch1 = GridSearchCV(estimator=model_lgb, 
						param_grid=params_test1, 
						scoring='neg_mean_squared_error', 
						cv=5, 	   #交叉验证参数，默认None，使用三折交叉验证。指定fold数量，默认为3，也可以是yield训练/测试数据的生成器。
						verbose=1, #日志冗长度，int：冗长度，0：不输出训练过程，1：偶尔输出，>1：对每个子模型都输出。
						n_jobs=-1  #并行数，-1即跟cpu核数一致
						)
gsearch1.fit(x_train, y_train)
means = gsearch1.cv_results_['mean_test_score']
params = gsearch1.cv_results_['params']
print(means)
print(params)
print(gsearch1.best_params_)
print(gsearch1.best_score_)


Fitting 5 folds for each of 12 candidates, totalling 60 fits


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_o

[-1.93994642e+08 -1.96183097e+08 -1.93447617e+08 -1.94779838e+08
 -1.93040949e+08 -1.94808509e+08 -1.94633941e+08 -1.95644824e+08
 -1.95895187e+08 -1.95127796e+08 -1.93356587e+08 -1.95915240e+08]
[{'max_depth': 16, 'num_leaves': 56}, {'max_depth': 16, 'num_leaves': 57}, {'max_depth': 16, 'num_leaves': 58}, {'max_depth': 16, 'num_leaves': 59}, {'max_depth': 17, 'num_leaves': 56}, {'max_depth': 17, 'num_leaves': 57}, {'max_depth': 17, 'num_leaves': 58}, {'max_depth': 17, 'num_leaves': 59}, {'max_depth': 18, 'num_leaves': 56}, {'max_depth': 18, 'num_leaves': 57}, {'max_depth': 18, 'num_leaves': 58}, {'max_depth': 18, 'num_leaves': 59}]
{'max_depth': 17, 'num_leaves': 56}
-193040949.18770856


In [34]:
# import lightgbm as lgb
# from sklearn.model_selection import GridSearchCV
# params_test2={
#     # 'max_depth': range(13,20,3),
#     'max_depth': [15,16,17],
#     'num_leaves':[50,55,60,65,70,75,80,85]
# }
# gsearch2 = GridSearchCV(estimator=model_lgb, 
# 						param_grid=params_test2, 
# 						scoring='neg_mean_squared_error', 
# 						cv=5, verbose=1, n_jobs=4)
# gsearch2.fit(x_train, y_train)
# means = gsearch2.cv_results_['mean_test_score']
# params = gsearch2.cv_results_['params']
# print(means)
# print(params)
# print(gsearch2.best_params_)
# print(gsearch2.best_score_)


In [13]:
# import lightgbm as lgb
# from sklearn.model_selection import GridSearchCV
params_test3={
    'min_child_samples': [15,16,17,18,19],
    'min_child_weight':[0.001, 0.002]
}
model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=56,
                              learning_rate=0.1, n_estimators=1716, max_depth=17, 
                              metric='rmse', bagging_fraction = 0.8, feature_fraction = 0.8)
gsearch3 = GridSearchCV(estimator=model_lgb, 
                        param_grid=params_test3, 
                        scoring='neg_mean_squared_error', 
                        cv=5, verbose=1, n_jobs=-1)
gsearch3.fit(x_train, y_train)
means = gsearch3.cv_results_['mean_test_score']
params = gsearch3.cv_results_['params']
print(means)
print(params)
print(gsearch3.best_params_)
print(gsearch3.best_score_)



Fitting 5 folds for each of 10 candidates, totalling 50 fits
[-1.91829149e+08 -1.91829149e+08 -1.93643367e+08 -1.93643367e+08
 -1.93192683e+08 -1.93192683e+08 -1.92753440e+08 -1.92753440e+08
 -1.93940560e+08 -1.93940560e+08]
[{'min_child_samples': 15, 'min_child_weight': 0.001}, {'min_child_samples': 15, 'min_child_weight': 0.002}, {'min_child_samples': 16, 'min_child_weight': 0.001}, {'min_child_samples': 16, 'min_child_weight': 0.002}, {'min_child_samples': 17, 'min_child_weight': 0.001}, {'min_child_samples': 17, 'min_child_weight': 0.002}, {'min_child_samples': 18, 'min_child_weight': 0.001}, {'min_child_samples': 18, 'min_child_weight': 0.002}, {'min_child_samples': 19, 'min_child_weight': 0.001}, {'min_child_samples': 19, 'min_child_weight': 0.002}]
{'min_child_samples': 15, 'min_child_weight': 0.001}
-191829148.7725261


In [14]:
params_test4={
		    'feature_fraction': [0.5, 0.6, 0.7, 0.8, 0.9],
		    'bagging_fraction': [0.6, 0.7, 0.8, 0.9, 1.0]
			}
model_lgb = lgb.LGBMRegressor(objective='regression',
								num_leaves=56,
                             	learning_rate=0.1, 
								n_estimators=1716, 
								max_depth=17, 
                             	metric='rmse',
								bagging_freq = 5,  
								min_child_samples=15)
gsearch4 = GridSearchCV(estimator=model_lgb, 
						param_grid=params_test4, 
						scoring='neg_mean_squared_error', 
						cv=5, verbose=1, n_jobs=-1)
gsearch4.fit(x_train, y_train)
means = gsearch4.cv_results_['mean_test_score']
params = gsearch4.cv_results_['params']
print(means)
print(params)
print(gsearch4.best_params_)
print(gsearch4.best_score_)

Fitting 5 folds for each of 25 candidates, totalling 125 fits

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_o



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_o

[-2.03129702e+08 -2.03129702e+08 -2.01059551e+08 -1.98886344e+08
 -1.98886344e+08 -1.99898931e+08 -1.99898931e+08 -1.97180812e+08
 -1.96106335e+08 -1.96106335e+08 -2.01426581e+08 -2.01426581e+08
 -1.94547450e+08 -1.92146162e+08 -1.92146162e+08 -2.00249296e+08
 -2.00249296e+08 -1.91526763e+08 -1.92417579e+08 -1.92417579e+08
 -1.99152899e+08 -1.99152899e+08 -1.95010679e+08 -1.91829149e+08
 -1.91829149e+08]
[{'bagging_fraction': 0.6, 'feature_fraction': 0.5}, {'bagging_fraction': 0.6, 'feature_fraction': 0.6}, {'bagging_fraction': 0.6, 'feature_fraction': 0.7}, {'bagging_fraction': 0.6, 'feature_fraction': 0.8}, {'bagging_fraction': 0.6, 'feature_fraction': 0.9}, {'bagging_fraction': 0.7, 'feature_fraction': 0.5}, {'bagging_fraction': 0.7, 'feature_fraction': 0.6}, {'bagging_fraction': 0.7, 'feature_fraction': 0.7}, {'bagging_fraction': 0.7, 'feature_fraction': 0.8}, {'bagging_fraction': 0.7, 'feature_fraction': 0.9}, {'bagging_fraction': 0.8, 'feature_fraction': 0.5}, {'bagging_fr

In [17]:
params_test5={
    'feature_fraction': [0.62, 0.65, 0.68, 0.7, 0.72, 0.75, 0.78 ]
			}
model_lgb = lgb.LGBMRegressor(objective='regression',	
								num_leaves=56,
                             	learning_rate=0.1, 
								n_estimators=1716, 
								max_depth=17, 
                              	metric='rmse',  
								min_child_samples=15)
gsearch5 = GridSearchCV(estimator=model_lgb, 
						param_grid=params_test5, 
						scoring='neg_mean_squared_error', 
						cv=5, verbose=1, n_jobs=-1)
gsearch5.fit(x_train, y_train)
# print(gsearch5.grid_scores_, gsearch5.best_params_, gsearch5.best_score_)
means = gsearch5.cv_results_['mean_test_score']
params = gsearch5.cv_results_['params']
print(means)
print(params)
print(gsearch5.best_params_)
print(gsearch5.best_score_)

Fitting 5 folds for each of 7 candidates, totalling 35 fits
[-1.99152899e+08 -1.95010679e+08 -1.95010679e+08 -1.95010679e+08
 -1.95010679e+08 -1.95010679e+08 -1.95010679e+08]
[{'feature_fraction': 0.62}, {'feature_fraction': 0.65}, {'feature_fraction': 0.68}, {'feature_fraction': 0.7}, {'feature_fraction': 0.72}, {'feature_fraction': 0.75}, {'feature_fraction': 0.78}]
{'feature_fraction': 0.65}
-195010678.9639312


In [18]:
params_test6={
    'reg_alpha': [0, 0.001, 0.01, 0.03, 0.08, 0.3, 0.5],
    'reg_lambda': [0, 0.001, 0.01, 0.03, 0.08, 0.3, 0.5]
}
model_lgb = lgb.LGBMRegressor(objective='regression',
								num_leaves=56,
								learning_rate=0.1, 
								n_estimators=1716, 
								max_depth=17, 
								metric='rmse',  
								min_child_samples=15, 
								feature_fraction=0.65)
gsearch6 = GridSearchCV(estimator=model_lgb, 
						param_grid=params_test6, 
						scoring='neg_mean_squared_error', 
						cv=5, verbose=1, n_jobs=-1)
gsearch6.fit(x_train, y_train)

means = gsearch6.cv_results_['mean_test_score']
params = gsearch6.cv_results_['params']
print(means)
print(params)
print(gsearch6.best_params_)
print(gsearch6.best_score_)


Fitting 5 folds for each of 49 candidates, totalling 245 fits


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_o



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_o

[-1.95010679e+08 -1.94031108e+08 -1.94616921e+08 -1.96078572e+08
 -1.96457057e+08 -1.95935124e+08 -1.94569895e+08 -1.95035064e+08
 -1.94031103e+08 -1.94615113e+08 -1.96168468e+08 -1.96457006e+08
 -1.95935394e+08 -1.94567973e+08 -1.95033266e+08 -1.94112630e+08
 -1.94615112e+08 -1.96168302e+08 -1.96457087e+08 -1.95936164e+08
 -1.94569982e+08 -1.95127650e+08 -1.94034211e+08 -1.94582950e+08
 -1.96168396e+08 -1.96457339e+08 -1.95935593e+08 -1.94458084e+08
 -1.95191102e+08 -1.94004925e+08 -1.94697000e+08 -1.96244533e+08
 -1.96423103e+08 -1.95869887e+08 -1.94457346e+08 -1.95152901e+08
 -1.94593675e+08 -1.94968783e+08 -1.96182597e+08 -1.96544445e+08
 -1.95384312e+08 -1.94844421e+08 -1.95035952e+08 -1.94381436e+08
 -1.94830779e+08 -1.96527992e+08 -1.96392124e+08 -1.95446413e+08
 -1.95112500e+08]
[{'reg_alpha': 0, 'reg_lambda': 0}, {'reg_alpha': 0, 'reg_lambda': 0.001}, {'reg_alpha': 0, 'reg_lambda': 0.01}, {'reg_alpha': 0, 'reg_lambda': 0.03}, {'reg_alpha': 0, 'reg_lambda': 0.08}, 

In [39]:
params = {
		    'boosting_type': 'gbdt', 
		    'objective': 'regression', 
		    'learning_rate': 0.01, 
		    'num_leaves': 50, 
		    'max_depth': 14,
			'min_child_samples': 21,   			#min_child_samples
			'min_child_weight': 0.001,	#min_child_weight
			'subsample': 1.0,					#bagging_fraction
			'colsample_bytree': 0.8,			#feature_fraction
			'reg_alpha': 0.01,					#reg_alpha
			'reg_lambd': 0,						#reg_lambd
    }
data_train = lgb.Dataset(x_train, y_train, silent=True)
cv_results = lgb.cv(params, data_train, 
					num_boost_round=50000, 
					nfold=5, 
					stratified=False, 
					shuffle=True, 
					metrics='rmse',    #均方误差
					early_stopping_rounds=50, 
					verbose_eval=100, 
					show_stdv=True)
print('best n_estimators:', len(cv_results['rmse-mean']))
print('best cv score:', cv_results['rmse-mean'][-1])


### 3.2.1 gbm单模型批量训练

In [45]:
import joblib


def mutiTrain_GBM(start, end):
    for i in range(start, end+1):
        n = 2 - len(str(i))
        filename = '0' * n + str(i) + '.csv'
        data = pd.read_csv("区域赛训练集/" + filename)
        data.dropna(subset=["YD15"], inplace=True)
        data_mean = data.mean(axis=0)
        data = data.fillna(value=data_mean)
        new_data = data.copy()
        train = new_data[
            ["WINDSPEED", "PREPOWER", "WINDDIRECTION", "TEMPERATURE", "HUMIDITY", "PRESSURE", "ROUND(A.WS,1)"]]
        print(train.shape)
        target = new_data[['YD15']].values
        # %%
        x_train, x_test, y_train, y_test = train_test_split(train.values, target, test_size=0.1)
        print(type(x_train))
        #定义模型 这里参数自行设定，获取不同调优结果，也可采用CV来调参
        gbm = LGBMRegressor(
            boosting_type = 'gbdt', 
		    objective = 'regression', 
		    learning_rate = 0.01, 
            n_estimators = 2,
            n_jobs = -1,
		    num_leaves = 50, 
		    max_depth = 14,
			min_child_samples = 21,   			#min_child_samples
			min_child_weight = 0.001,	        #min_child_weight
			subsample = 1.0,					#bagging_fraction
			colsample_bytree = 0.8,		    	#feature_fraction
			reg_alpha = 0.01,					#reg_alpha
			reg_lambd = 0,						#reg_lambd
        )
        # 模型训练
        # 指定训练集，验证集，采用l1正则化验证评估，10rounds训练损失没有下降停止，可以增至200~2000左右，这个本地训练更快一些
        gbm = gbm.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric="l1",callbacks=[early_stopping(stopping_rounds=1000)])
        # 模型预测
        y_pre_gbm = gbm.predict(x_test)
        # 模型评估
        print(f"gbm {i} mean_absolute_error:{mean_squared_error(y_test, y_pre_gbm)}")
        print(f"gbm {i} score:{gbm.score(x_test, y_test)}")
        joblib.dump(gbm, 'model/gbm' + str(i) + '.pkl')


In [46]:
%%time
mutiTrain_GBM(11, 20)

(23136, 7)
<class 'numpy.ndarray'>
Training until validation scores don't improve for 1000 rounds
Did not meet early stopping. Best iteration is:
[2]	valid_0's l1: 42855	valid_0's l2: 2.78684e+09
gbm 11 mean_absolute_error:2786836431.766496
gbm 11 score:0.03422910209864927
(23232, 7)
<class 'numpy.ndarray'>
Training until validation scores don't improve for 1000 rounds
Did not meet early stopping. Best iteration is:
[2]	valid_0's l1: 29573.8	valid_0's l2: 1.25154e+09
gbm 12 mean_absolute_error:1251543443.7958374
gbm 12 score:0.03678970577068219
(37824, 7)
<class 'numpy.ndarray'>
Training until validation scores don't improve for 1000 rounds
Did not meet early stopping. Best iteration is:
[2]	valid_0's l1: 24129.9	valid_0's l2: 8.08221e+08
gbm 13 mean_absolute_error:808221411.993305
gbm 13 score:0.03669223454696735
(37728, 7)
<class 'numpy.ndarray'>
Training until validation scores don't improve for 1000 rounds
Did not meet early stopping. Best iteration is:
[2]

## 4.1 方案一：预测
### 4.1.1 GCN单模型加载

In [42]:
from sklearn.metrics import mean_squared_error

for i in range(1, 11):
    n = 2 - len(str(i))
    filename = '0' * n + str(i) + '.csv'
    data = pd.read_csv("功率预测竞赛赛题与数据集/" + filename)

    # ===========异常值处理===========
    # 当实际风速为0时，功率置为0
    data.loc[data['ROUND(A.WS,1)']==0, 'YD15'] = 0

    # TODO 风速过大但功率为0的异常：先设计函数拟合出：实际功率=f(风速)，
    # 然后代入异常功率的风速获取理想功率，替换原异常功率

    # TODO 对于在特定风速下的离群功率（同时刻用IQR检测出来），做功率修正（如均值修正）
    for sp in data['WINDSPEED'].unique():
        dfs = data.loc[data['WINDSPEED']==sp,'YD15']
        dfs=dfs.sort_values()
        Q1=dfs.quantile(0.25)
        Q3=dfs.quantile(0.75)
        IQR=Q3-Q1;
        down=Q1-1.5*IQR
        up=Q3+1.5*IQR
        data.loc[(data['WINDSPEED']==sp)&(data['YD15']<down),'YD15']=down
        data.loc[(data['WINDSPEED']==sp)&(data['YD15']>up),'YD15']=up

    data.dropna(subset=["YD15"], inplace=True)
    data_mean = data.mean(axis=0)
    data = data.fillna(value=data_mean)
    new_data = data.copy()
    train = new_data[["WINDSPEED", "PREPOWER", "WINDDIRECTION", "TEMPERATURE", "HUMIDITY", "PRESSURE", "ROUND(A.WS,1)"]]
    print(train.shape)
    target = new_data[['YD15']].values
    x_train, x_test, y_train, y_test = train_test_split(train.values, target, test_size=0.1)
    model_dict = paddle.load("model/" + str(i) + ".pdparams")
    model = GCN(input_size=7, hidden_size=64, output_size=1)
    model.set_state_dict(model_dict)
    # 开启评估模式
    model.eval()
    with paddle.no_grad():
        adj = paddle.ones((x_test.shape[0], x_test.shape[0]))  # 假设所有节点之间都有连接
        output = model(paddle.to_tensor(x_test, dtype="float32"), adj).numpy()
        # print(output)
        # print(output.shape)
        print(f"GCN_MODEL {i} mean_absolute_error:{mean_absolute_error(y_test, output)}")

(34398, 7)


ValueError: The ``path`` (model/1.pdparams) to load model not exists.

## 4.1 方案一：预测
### 4.1.2 GBM单模型加载

In [None]:
for i in range(1, 11):
    n = 2 - len(str(i))
    filename = '0' * n + str(i) + '.csv'
    data = pd.read_csv("功率预测竞赛赛题与数据集/" + filename)
    data.dropna(subset=["YD15"], inplace=True)
    data_mean = data.mean(axis=0)
    data = data.fillna(value=data_mean)
    new_data = data.copy()
    train = new_data[["WINDSPEED", "PREPOWER", "WINDDIRECTION", "TEMPERATURE", "HUMIDITY", "PRESSURE", "ROUND(A.WS,1)"]]
    print(train.shape)
    target = new_data[['YD15']].values
    x_train, x_test, y_train, y_test = train_test_split(train.values, target, test_size=0.1)
    gbm = joblib.load('model/gbm' + str(i) + '.pkl')
    y_pre_gbm = gbm.predict(x_test)
    print(f"gbm {i} mean_absolute_error:{mean_absolute_error(y_test, y_pre_gbm)}")
    print(f"gbm {i} score:{gbm.score(x_test, y_test)}")

## 4.2 模型组合
### 4.2.1 GCN+GBM模型加载

In [None]:
def load_models(turb_id):
    model_dict = paddle.load("model/" + str(turb_id) + ".pdparams")
    model1 = GCN(input_size=7, hidden_size=64, output_size=1)
    model2 = joblib.load("model/gbm" + str(turb_id) + '.pkl')
    model1.set_state_dict(model_dict)
    # 开启评估模式
    model1.eval()
    return [model1, model2]

In [None]:
for i in range(1, 11):
    n = 2 - len(str(i))
    filename = '0' * n + str(i) + '.csv'
    data = pd.read_csv("功率预测竞赛赛题与数据集/" + filename)
    data.dropna(subset=["YD15"], inplace=True)
    data_mean = data.mean(axis=0)
    data = data.fillna(value=data_mean)
    new_data = data.copy()
    train = new_data[["WINDSPEED", "PREPOWER", "WINDDIRECTION", "TEMPERATURE", "HUMIDITY", "PRESSURE", "ROUND(A.WS,1)"]]
    print(train.shape)
    target = new_data[['YD15']]
    x_train, x_test, y_train, y_test = train_test_split(train.values, target, test_size=0.1)
    models=load_models(i)
    with paddle.no_grad():
        adj = paddle.ones((x_test.shape[0], x_test.shape[0]))  # 假设所有节点之间都有连接
        output1 = models[0](paddle.to_tensor(x_test, dtype="float32"), adj).numpy().flatten()
        output2 = models[1].predict(x_test)
        # 惩罚权重自行设定，建议outputs1小，output2大
        output = (-0.01 * output1 + 0.99 * output2)
        print(f"composition model {i} mean_absolute_error:{mean_absolute_error(y_test, output)}")

## 5 Tip
### 模型参数请自行组合，注释部分有建议值
### 提交脚本可参照上述4.2编写，也可以参考笔者的predict.py文件，为了测试笔者在基线作者infile目录下拷贝了一下0002in.csv重命名为0001in.csv

In [None]:
data = pd.read_csv("区域赛训练集/" + turb_id)
# data = pd.read_csv("区域赛训练集/" + filename)
data.dropna(subset=["YD15"], inplace=True)
data_mean = data.mean(axis=0)
data = data.fillna(value=data_mean)
new_data = data.copy()
train = new_data[
    ["WINDSPEED", "PREPOWER", "WINDDIRECTION", "TEMPERATURE", "HUMIDITY", "PRESSURE", "ROUND(A.WS,1)"]]
print(train.shape)
target = new_data[['YD15']].values
# %%
x_train, x_test, y_train, y_test = train_test_split(train.values, target, test_size=0.1)
print(type(x_train))
#定义模型 这里参数自行设定，获取不同调优结果，也可采用CV来调参
gbm = LGBMRegressor(
    boosting_type = 'gbdt', 
	objective = 'regression', 
	learning_rate = 0.01, 
    n_estimators = 2,
    n_jobs = -1,
    num_leaves = 50, 
    max_depth = 14,
	min_child_samples = 21,   			#min_child_samples
	min_child_weight = 0.001,	        #min_child_weight
	subsample = 1.0,					#bagging_fraction
	colsample_bytree = 0.8,		    	#feature_fraction
	reg_alpha = 0.01,					#reg_alpha
	reg_lambd = 0,						#reg_lambd
)
# 模型训练
# 指定训练集，验证集，采用l1正则化验证评估，10rounds训练损失没有下降停止，可以增至200~2000左右，这个本地训练更快一些
gbm = gbm.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric="l1",callbacks=[early_stopping(stopping_rounds=1000)])
# 模型预测
y_pre_gbm = gbm.predict(x_test)
# 模型评估
print(f"gbm {i} mean_absolute_error:{mean_squared_error(y_test, y_pre_gbm)}")
print(f"gbm {i} score:{gbm.score(x_test, y_test)}")
joblib.dump(gbm, 'model/gbm' + str(i) + '.pkl')