In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import os
from sklearn.metrics import mutual_info_score
from sklearn.model_selection import KFold,TimeSeriesSplit

import warnings
warnings.filterwarnings("ignore")

import xgboost as xgb
import lightgbm as lgb

def load_dataset(data_dir='../data'):
    """Load the four per-plant training and test CSV files.

    Reads ``train_<i>.csv`` and ``test_<i>.csv`` for ``i = 1..4`` from
    ``data_dir`` and parses the "时间" column as datetimes. Exact duplicate
    rows are dropped from the training frames (index reset afterwards); the
    test frames are returned exactly as read.

    :param data_dir: directory that holds the CSV files (defaults to the
        location that used to be hard-coded)
    :return: ``(train_frames, test_frames)`` - two lists of four DataFrames,
        ordered by plant number
    """
    def read(kind, plant):
        # One place to build the path and apply the shared parsing options.
        path = os.path.join(data_dir, f'{kind}_{plant}.csv')
        return pd.read_csv(path, parse_dates=["时间"])

    plants = range(1, 5)
    train_frames = [
        read('train', i).drop_duplicates().reset_index(drop=True) for i in plants
    ]
    # Test rows are deliberately left untouched (no de-duplication).
    test_frames = [read('test', i) for i in plants]
    return train_frames, test_frames

# Unpack the per-plant training (t*) and test (p*) frames into notebook globals.
(t1, t2, t3, t4), (p1, p2, p3, p4) = load_dataset()

In [4]:
# Per-plant power figure keyed by plant number. The scoring functions use it to
# normalise the error and to derive the 3% inclusion threshold.
# NOTE(review): presumably the rated capacity of each plant - confirm units.
plant_power = {1: 10, 2: 10, 3: 40, 4: 50}


def mae_d(df_groupby, plant):
    """Normalised MAE for one group of rows (one day when called from ``mae_m``).

    The computation is identical to ``score2``, so it is delegated there to
    keep the threshold / normalisation rule in a single place.

    :param df_groupby: DataFrame with columns ``pm`` and ``pp``
        (presumably measured and predicted power - confirm)
    :param plant: key into ``plant_power``
    :return: sum of ``|pm - pp|`` over rows where ``pm`` is at least 3% of
        ``plant_power[plant]``, divided by (number of such rows *
        ``plant_power[plant]``); NaN when no row reaches the threshold
    """
    return score2(df_groupby['pm'].values, df_groupby['pp'].values, plant)


def mae_m(df, plant):
    """Mean of the per-day ``mae_d`` values in ``df``.

    Rows are grouped by calendar day (timestamp truncated to midnight) rather
    than by day-of-month, so a frame that spans several months or years no
    longer merges e.g. the 1st of every month into one "day". For a frame
    covering a single month the grouping is the same as before.

    :param df: DataFrame with a datetime column ``datetime`` plus ``pm`` / ``pp``
    :param plant: key into ``plant_power``
    :return: average of the daily scores; days whose score is NaN are skipped
        by the pandas mean
    """
    calendar_day = df['datetime'].dt.normalize()
    daily = df.groupby(calendar_day).apply(lambda day_df: mae_d(day_df, plant))
    return daily.mean()


def score(df, plant):
    """Average of the monthly ``mae_m`` scores over every month present in ``df``.

    Months are identified by year *and* month, so data spanning more than one
    year no longer pools e.g. January of two different years into one month.
    For data within a single year the result is the same as before.

    :param df: DataFrame with columns ``datetime`` (datetime dtype), ``pm``, ``pp``
    :param plant: key into ``plant_power``
    :return: mean of the per-month scores; NaN if ``df`` has no rows
    """
    year_month = df['datetime'].dt.to_period('M')
    monthly = [mae_m(month_df, plant) for _, month_df in df.groupby(year_month)]
    if not monthly:
        # np.mean([]) would also give NaN, but with a RuntimeWarning.
        return np.nan
    return np.mean(monthly)


def score2(pm, pp, plant):
    """Normalised MAE over the samples whose ``pm`` reaches the plant threshold.

    Inputs are converted to NumPy arrays, so lists work and pandas Series are
    compared positionally instead of being aligned by index label.

    :param pm: reference values; only entries >= 3% of ``plant_power[plant]``
        are scored (presumably measured power - confirm)
    :param pp: values compared against ``pm``, same length and order
    :param plant: key into ``plant_power``
    :return: ``sum(|pm - pp|) / (n_scored * plant_power[plant])``, or NaN when
        no sample reaches the threshold
    """
    pm = np.asarray(pm)
    pp = np.asarray(pp)
    capacity = plant_power[plant]
    scored = pm >= capacity * 0.03
    n_scored = np.count_nonzero(scored)
    if n_scored == 0:
        # Same value the 0/0 division used to produce, without the warning.
        return np.nan
    return np.abs(pm[scored] - pp[scored]).sum() / (n_scored * capacity)


In [6]:
# Baseline: 5 contiguous (unshuffled) folds, default LGBMRegressor, plant 1.
kf = KFold(n_splits=5, shuffle=False)
x = t1.iloc[:, [1, 2, 3, 4, 5, 6]].values  # features picked by column position
y = t1["实际功率"].values
ret = []
for train, valid in kf.split(x, y):
    mdl = lgb.LGBMRegressor()
    mdl.fit(x[train], y[train])
    pred = mdl.predict(x[valid])
    # Evaluate once and reuse the value for both the log line and the average.
    fold_score = score2(y[valid], pred, 1)
    print(f'score {fold_score}')
    ret.append(fold_score)
print(np.mean(ret))

score 0.11550962195409474
score 0.12523116172011847
score 0.12270395293794245
score 0.12548133395741365
score 0.12037888007214703
0.121860990128


In [42]:
from sklearn.model_selection import KFold, ParameterGrid
import lightgbm as lgb


def lgb_cv(params, x, y, metric, k=3, **kwargs):
    """K-fold cross-validation of a LightGBM model with per-sample weights.

    Samples whose target is above 0.03 get weight 5, all others weight 1. A
    fresh booster is trained per fold with ``lgb.train`` and scored on the
    held-out part with ``metric``.

    :param params: LightGBM parameter dict passed to ``lgb.train``
    :param x: feature array, indexed along axis 0 by the fold index arrays
    :param y: target array aligned with ``x``
    :param metric: callable ``metric(y_true, y_pred)`` returning the fold score
    :param k: number of folds
    :param kwargs: extra options for ``KFold`` only (e.g. ``shuffle``,
        ``random_state``)
    :return: list with one metric value per fold
    """
    # kwargs configure the splitter only. They used to be forwarded to
    # lgb.Dataset as well, which accepts none of KFold's options, so any
    # non-empty kwargs raised TypeError.
    kf = KFold(k, **kwargs)

    # NOTE(review): 0.03 is an absolute cut-off here, whereas score2 uses
    # 0.03 * plant_power[plant] - confirm which one is intended.
    weights = np.ones_like(y)
    weights[y > 0.03] = 5

    fold_scores = []
    for train, valid in kf.split(x):
        train_set = lgb.Dataset(x[train], y[train], weight=weights[train])
        valid_set = lgb.Dataset(x[valid], y[valid], weight=weights[valid])
        # NOTE(review): `verbose_eval` is not accepted by lgb.train in
        # LightGBM >= 4.0 (log_evaluation callback instead) - confirm the
        # installed version before upgrading.
        mdl = lgb.train(params, train_set, valid_sets=[train_set, valid_set], verbose_eval=-1)
        fold_scores.append(metric(y[valid], mdl.predict(x[valid])))
    return fold_scores


def lgb_grid_search_cv(paras_grid, x, y, k=3, **kwargs):
    """Exhaustive grid search that keeps the parameters with the lowest mean CV score.

    :param paras_grid: mapping understood by ``ParameterGrid``
        (parameter name -> candidate values)
    :param x: features, forwarded to ``lgb_cv``
    :param y: targets, forwarded to ``lgb_cv``
    :param k: number of folds
    :param kwargs: forwarded to ``lgb_cv``; must contain ``metric`` because
        ``lgb_cv`` has no default for it
    :return: ``(best_param, best_score)``; ``(None, inf)`` when no candidate
        scored below infinity
    """
    best_param, lowest = None, np.inf
    for step, candidate in enumerate(ParameterGrid(paras_grid)):
        mean_cv = np.mean(lgb_cv(candidate, x, y, k=k, **kwargs))
        if not mean_cv < lowest:
            continue
        # New best: remember it and log the improvement.
        best_param, lowest = candidate, mean_cv
        print(f'step {step}, best score: {lowest: .4f}')
    return best_param, lowest

In [8]:
# Single hand-picked LightGBM configuration, cross-validated on plant 1.
# NOTE(review): LightGBM applies bagging_fraction only when bagging_freq > 0;
# bagging_freq is not set here, so bagging_fraction is presumably inert - confirm.
param = {
    'objective': 'regression_l1',
    'task': 'train',
    'boosting': 'gbdt',
    'n_iter': 75,
    'learning_rate': 0.1,
    'num_leaves': 31,
    'bagging_fraction': 0.5,
    'feature_fraction': 1.,
    # 'early_stopping_round': 50
}

x = t1.iloc[:, [1, 2, 3, 4, 5, 6]].values
y = t1["实际功率"].values
# Mean of the 5 fold scores; left as the last expression so the cell displays it.
np.mean(lgb_cv(param, x, y, k=5, metric=lambda actual, predicted: score2(actual, predicted, 1)))

0.1200158700697465

In [41]:
# Candidate values for the grid search on plant 1 (one list per parameter).
# NOTE(review): bagging_freq is not part of the grid; LightGBM applies
# bagging_fraction only when bagging_freq > 0, so the two bagging_fraction
# values presumably train identical models - confirm.
param_grid = {
    'objective': ['regression_l1', 'regression'],
    'task': ['train'],
    'boosting': ['gbdt'],
    'n_iter': range(50, 150, 25),
    'learning_rate': [0.1, 0.05],
    'num_leaves': [31, 50],
    'bagging_fraction': [0.7, 0.5],
    'feature_fraction': [1.],
    # 'early_stopping_round': 50
}

x = t1.iloc[:, [1, 2, 3, 4, 5, 6]].values
y = t1["实际功率"].values
best_p, best_score = lgb_grid_search_cv(
    param_grid, x, y, k=5,
    metric=lambda actual, predicted: score2(actual, predicted, 1),
)

step 0, best score: 0.11989611516358614
step 22, best score: 0.11988685933944729
step 24, best score: 0.11987231223650266
step 26, best score: 0.11979942496935041
step 30, best score: 0.11978035390259034


In [43]:
# Show the best parameter combination found by the grid search and its mean CV score.
print(best_p, best_score)

{'bagging_fraction': 0.7, 'boosting': 'gbdt', 'feature_fraction': 1.0, 'learning_rate': 0.05, 'n_iter': 125, 'num_leaves': 50, 'objective': 'regression_l1', 'task': 'train'} 0.119780353903
