In [10]:
# coding: utf-8
# Author: Zhirui Zhou
# Mail  : evilpsycho42@gmail.com
# Time  : 11/5/18
import numpy as np


plant_power = {
    1: 10,
    2: 10,
    3: 40,
    4: 50
}


def mae_d(df_groupby, plant):
    pm = df_groupby['pm'].values
    pp = df_groupby['pp'].values
    threshold = plant_power[plant] * 0.03
    index = pm >= threshold
    return np.abs(pm[index] - pp[index]).sum() / (np.sum(index) * plant_power[plant])


def mae_m(df, plant):
    return df.groupby(df['datetime'].dt.day).apply(lambda x: mae_d(x, plant)).mean()


def score(df, plant):
    """

    :param df: datetime, pm, pp
    :param plant:
    :return:
    """
    month = df['datetime'].dt.month.unique()
    ret = []
    for m in month:
        ret.append(mae_m(df[df['datetime'].dt.month == m], plant))
    return np.mean(ret)


def score2(pm, pp, plant):
    threshold = plant_power[plant] * 0.03
    index = pm >= threshold
    return np.abs(pm[index] - pp[index]).sum() / (np.sum(index) * plant_power[plant])


# coding:utf8
# @Time    : 18-11-6 下午9:02
# @Author  : evilpsycho
# @Mail    : evilpsycho42@gmail.com
import numpy as np
from chinese_calendar import is_holiday


def arithmetic_mapping(field_1, field_2, df):
    ret = []
    features = []
    feature_cat = ['num'] * 4
    for act in '+-*/':
        ret.append(eval(f'df[field_1] {act} df[field_2]').values)
        features.append(f'{field_1}{act}{field_2}')
    ret = np.stack(ret, axis=1)
    return ret, features, feature_cat


def arithmetic_field_mapping(fields_1, fields_2, df):
    field_combination = [(f1, f2) for f1 in fields_1 for f2 in fields_2]
    ret, features, feature_cat = [], [], []
    for f1, f2 in field_combination:
        r, fs, fc = arithmetic_mapping(f1, f2, df)
        ret.append(r)
        features += fs
        feature_cat += fc
    return np.concatenate(ret, axis=1), features, feature_cat


def origin_feature(df):
    feature = df[['hm', 'irr', 'pr', 'temp', 'wd', 'ws']]
    feature.columns = ['org '+col for col in feature.columns]
    return feature.values, feature.columns.tolist(), ['num'] * 6

def date_feature(df, time=True, hour=True, month=True,
         weekday=True, holiday=True, year=True):
    ret, features, feature_cat = [], [], []
    if time:
        ret.append((df.time.dt.minute + df.time.dt.hour * 60).values)
        features.append('date_time')
        feature_cat.append('num')
    if hour:
        ret.append(df.time.dt.hour)
        features.append('date_hour')
        feature_cat.append('num')
    if month:
        ret.append(df.time.dt.month)
        features.append('date_month')
        feature_cat.append('cat')
    if weekday:
        ret.append(df.time.dt.weekday)
        features.append('date_weekday')
        feature_cat.append('cat')
    if year:
        ret.append(df.time.dt.year)
        features.append('date_year')
        feature_cat.append('cat')
    if holiday:
        ret.append(df.time.apply(is_holiday).astype(np.int).values)
        features.append('date_holiday')
        feature_cat.append('cat')
    if len(ret) == 0:
        raise ValueError('必须输入至少一个特征')
    return np.stack(ret, axis=1), features, feature_cat


# coding:utf8
# @Time    : 18-11-6 下午6:19
# @Author  : evilpsycho
# @Mail    : evilpsycho42@gmail.com
from sklearn.model_selection import KFold, ParameterGrid
import lightgbm as lgb
import numpy as np
import pandas as pd


def _get_sample_weight(y, plant, w=5):
    plant_power = {
        1: 10,
        2: 10,
        3: 40,
        4: 50
    }
    weights = np.ones_like(y)
    weights[y > plant_power[plant] * 0.03] = w
    return weights


def _score2(pm, pp, plant):
    plant_power = {
        1: 10,
        2: 10,
        3: 40,
        4: 50
    }
    threshold = plant_power[plant] * 0.03
    index = pm >= threshold
    return np.abs(pm[index] - pp[index]).sum() / (np.sum(index) * plant_power[plant])


def lgb_cv(params, x, y, plant, k=3, **kwargs):
    kf = KFold(k, **kwargs)
    weights = _get_sample_weight(y, plant)
    ret = []

    def metric(t, p):
        return _score2(t, p, plant)

    for train, valid in kf.split(x):
        train_set = lgb.Dataset(x[train], y[train], weight=weights[train], **kwargs)
        valid_set = lgb.Dataset(x[valid], y[valid], weight=weights[valid], **kwargs)
        mdl = lgb.train(params, train_set, valid_sets=[train_set, valid_set], verbose_eval=-1)
        ret.append(metric(y[valid], mdl.predict(x[valid])))
    return ret


def lgb_grid_search_cv(param_grid, x, y, plant, k=5, **kwargs):
    grid = list(ParameterGrid(param_grid))
    max_score = np.inf
    best_param = None
    n_step = len(grid)
    for step, p in enumerate(grid):
        score = np.mean(lgb_cv(p, x, y, plant=plant, k=k, **kwargs))
        if score < max_score:
            best_param = p
            max_score = score
            print(f'step {step / n_step * 100: .1f}%, best cv score: {max_score: .4f}')
    return best_param, max_score


def lgb_train(param, x, y, plant, **kwargs):
    weights = _get_sample_weight(y, plant)
    train_set = lgb.Dataset(x, y, weight=weights, **kwargs)
    model = lgb.train(param, train_set, verbose_eval=10)
    print(f'Plant {plant} trainset score: {_score2(y, model.predict(x), plant):.4f}')
    return model


def lgb_predict(model, x, idx):
    y = model.predict(x)
    pred = pd.DataFrame({"id": idx, "predicition": y})
    pred['id'] = pred['id'].astype(int)
    return pred


In [11]:
param_grid = {
    'bagging_fraction': [0.7, 0.8], 
    'boosting': ['gbdt'], 
    'feature_fraction': [0.8, 0.9], 
    'learning_rate': [0.05, 0.1, 0.03] ,
    'n_iter': range(75, 600, 50) ,
    'num_leaves': [50, 31], 
    'objective': ['regression', 'regression_l1'],
    'task': ['train']
}

param_feature_selector = {'bagging_fraction': 0.7, 'boosting': 'gbdt', 'feature_fraction': 0.8, 
                          'learning_rate': 0.1, 'n_iter': 100, 'num_leaves': 31, 'objective': 'regression_l1', 
                          'task': 'train'}

In [13]:
featureTopN = 40
sampleWeight = 5


plants = [1, 2, 3, 4]
ret = []
for plant in plants:
    data = load_dataset(plant)
    train = data.id.isnull()
    test = data.power.isnull()

    X = []
    train_y = data[train].power.values
    feature_name = []
    feature_category = []

    X_date, date_name, date_cate = date_feature(data)
    X_org, org_name, org_cate = origin_feature(data)
    X_map, map_name, map_cate = arithmetic_field_mapping(['hm', 'irr', 'pr', 'temp'], 
                                                         ['hm', 'irr', 'pr', 'temp'], data)

    X = np.concatenate([X_date, X_org, X_map], axis=1)
    feature_name = date_name + org_name + map_name
    feature_category = date_cate + org_cate + map_cate

    train_X = X[train]
    test_X = X[test]
    
    feature_model = lgb_train(param_feature_selector, x=train_X, y=train_y, plant=plant)
    feature_mask = np.where(np.argsort(-feature_model.feature_importance())<=featureTopN)[0]
    best_param, best_score = lgb_grid_search_cv(param_grid=param_grid, 
                       x=train_X[: ,feature_mask],
                       y=train_y,
                       k=5, plant=plant)
    print(best_param, best_score)
    model = lgb_train(param=best_param, x=train_X[: ,feature_mask], y=train_y, plant=plant)
    ret_plant = lgb_predict(model, x=test_X[: ,feature_mask], idx=data[test]['id'].values)
    ret.append(ret_plant)

loading plant 1 data
Plant 1 trainset score: 0.0895
step  0.0%, best cv score:  0.1178
step  0.2%, best cv score:  0.1159
step  0.6%, best cv score:  0.1152
step  18.8%, best cv score:  0.1151
Plant 1 trainset score: 0.0946
loading plant 2 data
Plant 2 trainset score: 0.0956
step  0.0%, best cv score:  0.1372
step  0.2%, best cv score:  0.1329
step  0.6%, best cv score:  0.1328
step  1.3%, best cv score:  0.1322
step  18.8%, best cv score:  0.1319
step  19.5%, best cv score:  0.1318
step  20.3%, best cv score:  0.1317
Plant 2 trainset score: 0.0984
loading plant 3 data
Plant 3 trainset score: 0.0766
step  0.0%, best cv score:  0.1315
step  0.2%, best cv score:  0.1230
step  0.6%, best cv score:  0.1214
Plant 3 trainset score: 0.0858
loading plant 4 data
Plant 4 trainset score: 0.0819
step  0.0%, best cv score:  0.1250
step  0.2%, best cv score:  0.1235
step  0.9%, best cv score:  0.1222
step  8.9%, best cv score:  0.1220
step  44.1%, best cv score:  0.1219
Plant 4 trainset score: 0.079

In [14]:
ret = pd.concat(ret)
ret.to_csv("/home/zhouzr/桌面/submit_20181107v2.csv", index=None)