# LightGBM Model
作者：艾宏峰<br>
创建时间：2020.11.15<br>

In [55]:
import gc
import pandas as pd
import lightgbm as lgb
import numpy as np
from datetime import datetime
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, TimeSeriesSplit
from sklearn.metrics import accuracy_score
import copy

import warnings
warnings.filterwarnings("ignore")

LGBMRegressor有下列参数可调整：
- boosting_type(gbdt)：提升方法，默认是gbdt。有四种：（1）gbdt:传统梯度提升决策树。（2）rf:随机森林。（3）dart:Dropouts meet Multiple Additive Regression Trees。（4）goss:Gradient-based One-Side Sampling。<br>
- num_leaves(31)：每棵树的最大叶子节点数。<br>
- max_depth(-1)：树的最大深度。<br>
- learning_rate(0.1)：学习率。<br>
- n_estimators(100)：提升树数量。<br>
- subsample_for_bin(200000)：构建bin时的样本数。<br>
- objective：学习目标，LGBMRegressor默认为regression。<br>
- class_weight(None)：类别权重，仅用于多分类任务；这里做的是回归，无需设置。<br>
- min_split_gain(0)：进一步拆分树的叶子节点的最小损失减少量。<br>
- min_child_weight(1e-3)：在子叶子上需要的最小实例权重之和。<br>
- min_child_samples(20)：在子叶子上需要的数据最小量。<br>
- subsample(1)：训练集下采样率。<br>
- subsample_freq(0)：下采样频率，<=0代表不启用下采样。<br>
- colsample_bytree(1)：构造每棵树时列的子采样率。
- reg_alpha(0)：有关权重的L1正则化值。
- reg_lambda(0)：权重的L2正则化项值。
- random_state(None)：种子数。
- importance_type('split')：feature_importances_所采用的特征重要性类型。如果为'split'，则结果为该特征在模型中被用于分裂的次数；如果为'gain'，则结果为使用该特征的所有分裂带来的总增益。

In [56]:
# LightGBM hyper-parameters. The dict is unpacked into lgb.LGBMRegressor and
# run_lgb_qid additionally reads 'seed', 'metric' and 'early_stopping_rounds'
# from it directly.
params = {
    'metric': 'mse',
    'objective': 'regression',
    'seed': 2022,
    # Other boosting types can be tried one by one; 'dart' does not support
    # early stopping.
    'boosting_type': 'gbdt',
    'early_stopping_rounds': 10,
    'feature_fraction': 0.75,
    # Only the canonical key is set. 'subsample' is a LightGBM alias of
    # 'bagging_fraction'; when both are supplied with different values
    # LightGBM keeps the canonical key and ignores the alias with a warning.
    # NOTE(review): LightGBM only performs bagging when bagging_freq
    # (alias subsample_freq) is non-zero. It is not set here, so confirm
    # whether row subsampling is actually intended.
    'bagging_fraction': 0.75,
    'reg_lambda': 10,
}


verbose_flag = False  # whether to print per-fold training/validation details
folds = 5  # 5-fold cross-validation

# Queues whose features contain missing values, as found in eda.ipynb.
# Currently disabled (empty list); the candidate list was:
# miss_qids = [297, 298, 20889, 21487, 21671, 21673, 81221, 82695, 82697, 82929, 83109, 83609]
miss_qids = []

# Load the train/test feature tables and the submission template.
data_path = r'/media/alvinai/Documents/serverless/data/'
# Non-feature columns of the training set: 'QUEUE_ID', 'NEXT_5_CPU_USAGE', 'NEXT_5_LAUNCHING_JOB_NUMS'
# Non-feature columns of the test set: 'ID', 'QUEUE_ID', 'NEXT_5_CPU_USAGE', 'NEXT_5_LAUNCHING_JOB_NUMS'
df_train, df_test, sub_sample = (
    pd.read_csv(data_path + csv_name)
    for csv_name in ('train_v30b1.csv', 'test_v30b1.csv', 'submit_example.csv')
)

# The same two columns are removed from both tables, in place.
for _frame in (df_train, df_test):
    _frame.drop(columns=['DOTTING_MINUTE_4', 'CPU_USAGE_3_std'], inplace=True)

# # Load the NEXT_5_LAUNCHING_JOB_NUMS predictions produced by lightgbm_ljn.ipynb
# ljn_predictions = pd.read_csv(r'/media/alvinai/Documents/serverless/result/lgb_ljn_sub_20201108_2156.csv')

In [57]:
def cu_error(y, y_pred):
    '''Return the CPU_USAGE error score: absolute error scaled by 0.9.

    Per the original author's note, this follows the evaluation formula
    published on the competition site. Accepts anything that supports
    subtraction and ``np.abs`` (scalars, arrays, Series).
    '''
    deviation = y - y_pred
    return np.abs(deviation) * 0.9

In [58]:
# def get_import_feats(X_train, Y_train, X_val, Y_val, import_feat_num, params):
#         model = lgb.LGBMRegressor(**params)
#         lgb_model = model.fit(X_train, 
#                           Y_train,
#                           eval_names=['train', 'valid'],
#                           eval_set=[(X_train, Y_train), (X_val, Y_val)],
#                           verbose=0,
#                           eval_metric=params['metric'],
#                           early_stopping_rounds=params['early_stopping_rounds'])
#         import_feat_df = pd.DataFrame({
#                                         'feature': list(X_train),
#                                         'importance': lgb_model.feature_importances_,
#                                         }).sort_values(by='importance',ascending=False)
#         import_feats = list(import_feat_df['feature'].values)[:import_feat_num]
# #         print(import_feat_df['feature'].values)
# #         print(import_feats)
# #         print(Y_train[import_feats])
#         return X_train[import_feats], Y_train, X_val[import_feats], Y_val, import_feats

In [59]:
def run_lgb_qid(df_train, df_test, target, qid, params):
    '''Train, validate and score a LightGBM regressor for one target on one queue.

    The rows of ``df_train``/``df_test`` whose ``QUEUE_ID`` equals ``qid`` are
    selected, an ``lgb.LGBMRegressor`` is fitted once per cross-validation
    fold (``folds`` splits, module-level setting) and the per-fold test
    predictions are averaged.

    Args:
        df_train (pd.DataFrame): Training set; must contain ``QUEUE_ID``,
            ``target`` and the feature columns.
        df_test (pd.DataFrame): Test set; must contain ``ID``, ``QUEUE_ID``
            and the feature columns.
        target (str): Name of the target column to predict.
        qid (int): Queue id to model.
        params (dict): Model parameters. Unpacked into ``lgb.LGBMRegressor``;
            ``seed``, ``metric`` and ``early_stopping_rounds`` are also read
            directly.

    Returns:
        tuple: ``(prediction, mean_score, formal_score)`` where

        * prediction (pd.DataFrame): ``ID``, ``QUEUE_ID`` and
          ``pred_<target>``, the fold-averaged test prediction.
        * mean_score (float): Mean over folds of the best validation ``l2``
          (MSE) score.
        * formal_score (float): Mean ``cu_error`` of the out-of-fold
          predictions over the queue's training rows.
    '''
    # Columns that are never model inputs: the queue key and the five targets.
    excluded = {'QUEUE_ID'} | {f'cpu_{i}' for i in range(1, 6)}
    if qid in miss_qids:
        # Queues listed in miss_qids have features with missing values:
        # additionally leave out every DISK_USAGE* column.
        excluded |= {col for col in df_train.columns if col.startswith('DISK_USAGE')}
    feature_names = [col for col in df_train.columns if col not in excluded]

    # Keep only the rows that belong to the requested queue.
    df_train = df_train[df_train['QUEUE_ID'] == qid]
    df_test = df_test[df_test['QUEUE_ID'] == qid]

    model = lgb.LGBMRegressor(**params)

    pred_col = 'pred_' + target
    # Explicit copy: a column is added below, and assigning into a slice of
    # df_test is chained assignment (SettingWithCopyWarning in pandas).
    prediction = df_test[['ID', 'QUEUE_ID']].copy()
    prediction[pred_col] = 0.0  # running fold average of the test predictions
    scores = []  # best validation score of each fold
    pred_valid = np.zeros(len(df_train))  # out-of-fold predictions

    # NOTE(review): StratifiedKFold stratifies on the distinct target values
    # and scikit-learn rejects continuous targets, so this presumably relies
    # on the target being integer-valued. KFold may be the more natural
    # choice for regression; it is kept here so results stay reproducible.
    kfold = StratifiedKFold(n_splits=folds, shuffle=True, random_state=params['seed'])

    for trn_idx, val_idx in kfold.split(df_train, df_train[target]):
        X_train = df_train.iloc[trn_idx][feature_names]
        Y_train = df_train.iloc[trn_idx][target]
        X_val = df_train.iloc[val_idx][feature_names]
        Y_val = df_train.iloc[val_idx][target]

        # NOTE(review): newer LightGBM releases configure early stopping and
        # logging through callbacks instead of these fit() arguments; check
        # the installed version before upgrading.
        lgb_model = model.fit(X_train,
                              Y_train,
                              eval_names=['train', 'valid'],
                              eval_set=[(X_train, Y_train), (X_val, Y_val)],
                              verbose=0,
                              eval_metric=params['metric'],
                              early_stopping_rounds=params['early_stopping_rounds'])

        # Predict the queue's test rows and this fold's validation rows with
        # the best iteration found by early stopping.
        pred_test = lgb_model.predict(df_test[feature_names], num_iteration=lgb_model.best_iteration_)
        pred_valid[val_idx] = lgb_model.predict(X_val, num_iteration=lgb_model.best_iteration_)

        # LightGBM reports the 'mse' metric under its canonical name 'l2'.
        scores.append(lgb_model.best_score_['valid']['l2'])
        prediction[pred_col] += pred_test / kfold.n_splits

        # Drop per-fold references before the next fold.
        del lgb_model, pred_test, X_train, Y_train, X_val, Y_val
        gc.collect()

    # Competition-style score over all out-of-fold predictions (vectorised).
    formal_score = np.mean(cu_error(df_train[target].values.ravel(), pred_valid))
    mean_score = np.mean(scores)

    if verbose_flag:
        print("每折下的MSE分数：{}, 平均每折MSE分数：{:.4f}".format([np.round(v,2) for v in scores], mean_score))
        print("-"*60)
    return prediction, mean_score, formal_score


In [60]:
# Accumulators across all queues: one prediction frame per queue, and one
# score pair per (queue, target) combination.
predictions = []
scores = []
formal_scores = []

target_cols = [f'cpu_{i}' for i in range(1, 6)]

for qid in tqdm(df_test['QUEUE_ID'].unique()):
    df = pd.DataFrame()
    for t in target_cols:
        prediction, score, formal_score = run_lgb_qid(df_train, df_test, t, qid, params)
        scores.append(score)
        formal_scores.append(formal_score)
        # The first target seeds the per-queue frame; each later target is
        # joined on as an extra pred_<target> column.
        if t != 'cpu_1':
            df = pd.merge(df, prediction, on=['ID', 'QUEUE_ID'], how='left')
        else:
            df = prediction.copy()

    predictions.append(df)

100%|██████████| 23/23 [03:20<00:00,  8.83s/it]


In [62]:
# Report the averages over every (queue, target) model.
mean_mse = np.mean(scores)
mean_formal = np.mean(formal_scores)
print('mean MSE score: ', mean_mse)
print('mean 测评 score：', mean_formal)

mean MSE score:  38.951149911096294
mean 测评 score： 2.3944517282261444


In [63]:
# Stack the per-queue frames, order by ID and discard the queue key.
sub = (
    pd.concat(predictions)
    .sort_values(by='ID')
    .reset_index(drop=True)
    .drop(columns=['QUEUE_ID'])
)

steps = range(1, 6)
# Positional rename: the remaining columns are ID followed by the five
# prediction columns in target order.
sub.columns = ['ID'] + [f'CPU_USAGE_{i}' for i in steps]

# Author's note: setting these all to 0 scored better than trained predictions.
for i in steps:
    sub[f'LAUNCHING_JOB_NUMS_{i}'] = 0

# Interleave CPU_USAGE_i / LAUNCHING_JOB_NUMS_i after ID.
ordered_cols = ['ID']
for i in steps:
    ordered_cols += [f'CPU_USAGE_{i}', f'LAUNCHING_JOB_NUMS_{i}']
sub = sub[ordered_cols]

print(sub.shape)
sub.head()

(2996, 11)


Unnamed: 0,ID,CPU_USAGE_1,LAUNCHING_JOB_NUMS_1,CPU_USAGE_2,LAUNCHING_JOB_NUMS_2,CPU_USAGE_3,LAUNCHING_JOB_NUMS_3,CPU_USAGE_4,LAUNCHING_JOB_NUMS_4,CPU_USAGE_5,LAUNCHING_JOB_NUMS_5
0,1.0,91.037775,0,97.71954,0,98.027287,0,96.020144,0,95.824671,0
1,2.0,36.019274,0,31.632744,0,33.248617,0,40.212485,0,33.76655,0
2,3.0,52.013235,0,65.901737,0,35.397701,0,9.217707,0,3.714534,0
3,4.0,25.168423,0,16.661991,0,7.857895,0,6.640153,0,6.608788,0
4,5.0,2.806551,0,15.944253,0,23.637981,0,17.576932,0,8.468232,0


In [64]:
# Note: the submission requires non-negative integer predictions, and ID
# must be an integer as well.
sub['ID'] = sub['ID'].astype(int)

# Round to the nearest integer, clamp into [0, 100], then cast to int.
# clip() is the vectorised form of the two element-wise clamps; no floor is
# needed because rounded values are already integral.
# NOTE(review): the 100 upper bound is applied to every non-ID column,
# including LAUNCHING_JOB_NUMS_*; those are all 0 here, but confirm the bound
# is intended for them if real predictions are ever used.
for col in [i for i in sub.columns if i != 'ID']:
    sub[col] = sub[col].round().clip(lower=0, upper=100).astype(int)

sub.head(10)

Unnamed: 0,ID,CPU_USAGE_1,LAUNCHING_JOB_NUMS_1,CPU_USAGE_2,LAUNCHING_JOB_NUMS_2,CPU_USAGE_3,LAUNCHING_JOB_NUMS_3,CPU_USAGE_4,LAUNCHING_JOB_NUMS_4,CPU_USAGE_5,LAUNCHING_JOB_NUMS_5
0,1,91,0,98,0,98,0,96,0,96,0
1,2,36,0,32,0,33,0,40,0,34,0
2,3,52,0,66,0,35,0,9,0,4,0
3,4,25,0,17,0,8,0,7,0,7,0
4,5,3,0,16,0,24,0,18,0,8,0
5,6,9,0,17,0,16,0,24,0,19,0
6,7,10,0,17,0,13,0,22,0,19,0
7,8,1,0,1,0,1,0,1,0,1,0
8,9,3,0,4,0,4,0,4,0,4,0
9,10,19,0,8,0,6,0,5,0,7,0


In [65]:
# Save the final result under a name stamped with the current local time.
current_time = datetime.now().strftime('%Y%m%d_%H%M')
result_dir = r'/media/alvinai/Documents/serverless/result/'
result_name = 'lgb_cu_ljn_sub_' + current_time + '_seed2022.csv'
sub.to_csv(result_dir + result_name, index=False)