In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import os
from sklearn.metrics import mutual_info_score

import warnings
warnings.filterwarnings("ignore")

import xgboost as xgb
import lightgbm as lgb

plant_power = {
    1: 10,
    2: 10,
    3: 40,
    4: 50
}


def mae_d(df_groupby, plant):
    pm = df_groupby['pm'].values
    pp = df_groupby['pp'].values
    threshold = plant_power[plant] * 0.03
    index = pm >= threshold
    return np.abs(pm[index] - pp[index]).sum() / (np.sum(index) * plant_power[plant])


def mae_m(df, plant):
    return df.groupby(df['datetime'].dt.day).apply(lambda x: mae_d(x, plant)).mean()


def score(df, plant):
    """

    :param df: datetime, pm, pp
    :param plant:
    :return:
    """
    month = df['datetime'].dt.month.unique()
    ret = []
    for m in month:
        ret.append(mae_m(df[df['datetime'].dt.month == m], plant))
    return np.mean(ret)

In [3]:
def load_dataset():
    [t1, t2, t3, t4] = [pd.read_csv(f'../data/train_{i}.csv', parse_dates=["时间"]) for i in range(1, 5)]
    [p1, p2, p3, p4] = [pd.read_csv(f'../data/test_{i}.csv', parse_dates=["时间"]) for i in range(1, 5)]
    [t1, t2, t3, t4] = [t.drop_duplicates().reset_index(drop=True) for t in [t1, t2, t3, t4]]
    return [t1, t2, t3, t4], [p1, p2, p3, p4]

[t1, t2, t3, t4], [p1, p2, p3, p4] = load_dataset()

In [22]:
def get_x(t):
    t['month_cat'] = t["时间"].dt.month
    t['hour_cat'] = t["时间"].dt.month
    t['irr'] = t["辐照度"]
    t['ws'] = t["风速"]
    t['wd_cat'] = pd.cut(t1["风向"], bins=list(range(0, 360, 30)))
    t['pr'] = t["压强"]
    t['hm'] = t["湿度"]
    return t.loc[:, ['month_cat', 'hour_cat', 'irr', 'ws', 'wd_cat', 'pr', 'hm']]

train_x = get_x(t1)
train_y = t1["实际功率"].values

from sklearn.cross_validation import train_test_split

x1, x2, y1, y2 = train_x[:60000], train_x[60000:], train_y[:60000], train_y[60000:]

In [47]:
train_x = get_x(t4)
train_y = t4["实际功率"].values
k = int(train_x.shape[0] * 0.8)
x1, x2, y1, y2 = train_x[:k], train_x[k:], train_y[:k], train_y[k:]

In [48]:
param = {
    'bagging_fraction': 0.95, 
    'feature_fraction': 1.0, 
    'learning_rate': 0.01, 
    'num_boost_round': 400, 
    'num_leaves': 31, 
    'reg_alpha': 0.1
}

model = lgb.LGBMRegressor(boosting_type='gbdt' ,bagging_freq=5, categorical_feature=[0,1,4], **param)


In [58]:
model.fit(x1, y1)
pred_y = model.predict(x2)
df = pd.DataFrame({'datetime': t4['时间'][k:], 'pm': y2, 'pp': pred_y})
score(df, 4)

0.13702701146535712

In [57]:
score2(y2, pred_y, plant=4)

0.1461389218028768

In [54]:
def score2(pm, pp, plant):
    threshold = plant_power[plant] * 0.03
    index = pm >= threshold
    return np.abs(pm[index] - pp[index]).sum() / (np.sum(index) * plant_power[plant])

In [62]:
def score3(pm, pp):
#     threshold = plant_power[plant] * 0.03
#     index = pm >= threshold
    return np.abs(pm - pp).mean()

In [63]:
score3(y2, pred_y)

3.4947231919566994