In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import os
from sklearn.metrics import mutual_info_score
from sklearn.model_selection import KFold,TimeSeriesSplit

import warnings
warnings.filterwarnings("ignore")

import xgboost as xgb
import lightgbm as lgb

def load_dataset(data_dir='../data'):
    """Load the four numbered train and test CSV files.

    Parameters
    ----------
    data_dir : str or path-like, default '../data'
        Directory containing ``train_1.csv`` .. ``train_4.csv`` and
        ``test_1.csv`` .. ``test_4.csv``. The default keeps the original
        hard-coded location, so ``load_dataset()`` behaves as before.

    Returns
    -------
    tuple of (list of DataFrame, list of DataFrame)
        The four train frames followed by the four test frames, each ordered
        by file number 1-4. The "时间" column is parsed as datetimes in both.
        Exact duplicate rows are dropped from the train frames only (and their
        index is reset); test frames are returned exactly as read.
    """
    file_ids = range(1, 5)
    train_frames = [
        pd.read_csv(f'{data_dir}/train_{i}.csv', parse_dates=["时间"])
        .drop_duplicates()
        .reset_index(drop=True)
        for i in file_ids
    ]
    # Test frames are deliberately left untouched: no de-duplication.
    test_frames = [
        pd.read_csv(f'{data_dir}/test_{i}.csv', parse_dates=["时间"])
        for i in file_ids
    ]
    return train_frames, test_frames

# t1..t4 are the train frames, p1..p4 the test frames, in file order.
(t1, t2, t3, t4), (p1, p2, p3, p4) = load_dataset()

In [24]:
def score2(pm, pp, plant):
    """Capacity-normalised mean absolute error over above-threshold samples.

    Only positions where ``pm`` is at least 3% of ``plant_power[plant]`` are
    scored. The absolute differences between ``pm`` and ``pp`` at those
    positions are summed and divided by (number of scored positions *
    ``plant_power[plant]``).

    Parameters
    ----------
    pm : array-like supporting boolean-mask indexing
        Reference values; callers in this notebook pass the actual targets.
    pp : array-like supporting boolean-mask indexing
        Values compared against ``pm``; callers pass model predictions.
    plant : key into the module-level ``plant_power`` mapping
        Selects the capacity used for the threshold and the normalisation.
        # NOTE(review): ``plant_power`` is defined outside this cell.
    """
    capacity = plant_power[plant]
    mask = pm >= capacity * 0.03
    abs_error = np.abs(pm[mask] - pp[mask])
    scored_count = np.sum(mask)
    return abs_error.sum() / (scored_count * capacity)

In [33]:
# 5-fold CV with shuffled folds on the first train frame.
# A fixed random_state makes the fold assignment, and therefore the printed
# scores, reproducible across kernel restarts.
kf = KFold(5, shuffle=True, random_state=42)
# Features are taken by position (columns 1-6); the target is "实际功率".
x, y = t1.iloc[:, [1,2,3,4,5,6]].values, t1["实际功率"].values

for train, valid in kf.split(x, y):
    mdl = lgb.LGBMRegressor()
    mdl.fit(x[train], y[train])
    pred = mdl.predict(x[valid])
    print(f'score {score2(y[valid], pred, 1)}')

score 0.10361088170005413
score 0.10335815372745352
score 0.10440236137714709
score 0.10249219573140987
score 0.1048609828861999


In [57]:
# 5-fold CV with contiguous (unshuffled) folds on the first train frame;
# per-fold scores are printed and their mean reported at the end.
kf = KFold(5, shuffle=False)
# Features are taken by position (columns 1-6); the target is "实际功率".
x, y = t1.iloc[:, [1,2,3,4,5,6]].values, t1["实际功率"].values
ret = []
for train, valid in kf.split(x, y):
    mdl = lgb.LGBMRegressor()
    mdl.fit(x[train], y[train])
    pred = mdl.predict(x[valid])
    # Evaluate the metric once per fold and reuse it for both the log line
    # and the running list.
    fold_score = score2(y[valid], pred, 1)
    print(f'score {fold_score}')
    ret.append(fold_score)
print(np.mean(ret))

score 0.11550962195409474
score 0.12523116172011847
score 0.12270395293794245
score 0.12548133395741365
score 0.12037888007214703
0.121860990128


In [120]:
from sklearn.model_selection import KFold, ParameterGrid
import lightgbm as lgb


def lgb_cv(params, x, y, metric, k=3, **kwargs):
    """Run K-fold cross-validation of a LightGBM model and collect fold scores.

    Parameters
    ----------
    params : dict
        Parameter dict passed unchanged to ``lgb.train``.
    x : array indexable by integer index arrays
        Feature matrix.
    y : numpy array
        Targets, aligned with ``x``.
    metric : callable
        Called as ``metric(y_valid, predictions)`` for each fold.
    k : int, default 3
        Number of folds.
    **kwargs
        Extra options for the ``KFold`` splitter (e.g. ``shuffle``,
        ``random_state``). They are forwarded to ``KFold`` only; forwarding
        them to ``lgb.Dataset`` as well would hand it splitter options it
        does not accept.

    Returns
    -------
    list
        One ``metric`` value per fold, in fold order.
    """
    kf = KFold(k, **kwargs)
    # Samples with a target above 0.03 get 5x weight; all others weight 1.
    # Float dtype keeps the weights well-defined even for integer targets.
    # NOTE(review): 0.03 is an absolute cut-off here — confirm it matches the
    # units of y.
    weights = np.ones_like(y, dtype=float)
    weights[y > 0.03] = 5
    ret = []
    for train, valid in kf.split(x):
        train_set = lgb.Dataset(x[train], y[train], weight=weights[train])
        valid_set = lgb.Dataset(x[valid], y[valid], weight=weights[valid])
        # NOTE(review): `verbose_eval` is not accepted by `lgb.train` in
        # LightGBM >= 4.0 — confirm against the installed version.
        mdl = lgb.train(params, train_set, valid_sets=[train_set, valid_set], verbose_eval=-1)
        ret.append(metric(y[valid], mdl.predict(x[valid])))
    return ret

In [121]:
# LightGBM parameters for the L1-objective cross-validation run below.
# Early stopping is left disabled ('early_stopping_round': 50 was commented out).
# NOTE(review): per the LightGBM parameter docs, 'bagging_fraction' only takes
# effect when 'bagging_freq' is non-zero, and it is not set here — confirm intent.
param = {
    'objective': 'regression_l1',
    'task': 'train',
    'boosting': 'gbdt',
    'n_iter': 75,
    'learning_rate': 0.1,
    'num_leaves': 31,
    'bagging_fraction': 0.5,
    'feature_fraction': 1.,
}

# Features are taken by position (columns 1-6); the target is "实际功率".
x = t1.iloc[:, [1, 2, 3, 4, 5, 6]].values
y = t1["实际功率"].values

# Mean score2 (plant 1) across 5 folds; the lambda arguments are named so they
# do not shadow the global x / y.
np.mean(
    lgb_cv(
        param,
        x,
        y,
        k=5,
        metric=lambda actual, predicted: score2(actual, predicted, 1),
    )
)

0.1200158700697465

In [124]:
# Quick check of ParameterGrid: expands the option lists into every combination.
list(ParameterGrid(dict(a=[1, 2, 3], b=[2, 3])))

[{'a': 1, 'b': 2},
 {'a': 1, 'b': 3},
 {'a': 2, 'b': 2},
 {'a': 2, 'b': 3},
 {'a': 3, 'b': 2},
 {'a': 3, 'b': 3}]