In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import os
from sklearn.metrics import mutual_info_score

import warnings
warnings.filterwarnings("ignore")

# import xgboost as xgb
import lightgbm as lgb
import logging

from sklearn.model_selection import KFold, ParameterGrid
import lightgbm as lgb
import numpy as np
# coding: utf-8
# Author: Zhirui Zhou
# Mail  : evilpsycho42@gmail.com
# Time  : 11/5/18
import numpy as np


# Per-plant power constant keyed by plant id. The scoring functions use it both
# to derive the 3% inclusion threshold and to normalise the absolute error.
# NOTE(review): presumably the rated capacity of each plant — confirm units.
plant_power = {
    1: 10,
    2: 10,
    3: 40,
    4: 50
}


def mae_d(df_groupby, plant):
    """Normalised MAE of one group of rows (one day when called via ``mae_m``).

    :param df_groupby: frame with columns ``pm`` and ``pp``
    :param plant: key into ``plant_power``
    :return: same value as ``score2`` on the ``pm`` / ``pp`` columns
    """
    # Single source of truth for the formula: delegate to score2.
    return score2(df_groupby['pm'].values, df_groupby['pp'].values, plant)


def mae_m(df, plant):
    """Mean of the per-day ``mae_d`` values over the rows of ``df``.

    Rows are grouped by full calendar date (not by day-of-month), so rows that
    share a day number but belong to different months or years are never
    pooled into the same "day". Days whose ``mae_d`` is NaN are skipped by the
    pandas mean.

    :param df: frame with columns ``datetime``, ``pm`` and ``pp``
    :param plant: key into ``plant_power``
    :return: average daily error
    """
    calendar_day = df['datetime'].dt.normalize()
    return df.groupby(calendar_day).apply(lambda x: mae_d(x, plant)).mean()


def score(df, plant):
    """Average of the monthly errors (``mae_m``) over every month in ``df``.

    Months are identified by (year, month). Selecting by month number alone
    would merge e.g. January 2017 with January 2018 when the data spans more
    than one year.

    :param df: frame with columns ``datetime``, ``pm`` and ``pp``
    :param plant: key into ``plant_power``
    :return: mean of the per-month ``mae_m`` values
    """
    stamp = df['datetime']
    # yyyymm integer: unique per calendar month across years.
    month_key = stamp.dt.year * 100 + stamp.dt.month
    ret = [mae_m(df[month_key == key], plant) for key in month_key.unique()]
    return np.mean(ret)


def score2(pm, pp, plant):
    """Normalised MAE between two aligned arrays for one plant.

    Only positions where ``pm`` is at least 3% of ``plant_power[plant]`` are
    scored; the summed absolute error is divided by the number of scored
    positions times ``plant_power[plant]``.

    :param pm: array-like of reference values (they decide which samples count)
    :param pp: array-like of values compared against ``pm``, same length
    :param plant: key into ``plant_power`` (unknown keys raise ``KeyError``)
    :return: the normalised error, or NaN when no sample reaches the threshold
    """
    pm = np.asarray(pm)
    pp = np.asarray(pp)
    capacity = plant_power[plant]
    index = pm >= capacity * 0.03
    n_scored = np.sum(index)
    if n_scored == 0:
        # Nothing to score: return NaN explicitly instead of dividing 0 by 0.
        return np.nan
    return np.abs(pm[index] - pp[index]).sum() / (n_scored * capacity)



def lgb_cv(params, x, y, metric, k=3, **kwargs):
    """Run k-fold cross-validation of a LightGBM model and score every fold.

    Training and validation samples whose target exceeds 0.03 get weight 5,
    all others weight 1.

    :param params: LightGBM parameter dict passed to ``lgb.train``
    :param x: feature array, indexable by the integer index arrays from KFold
    :param y: target array aligned with ``x``
    :param metric: callable ``metric(y_true, y_pred)`` returning a fold score
    :param k: number of folds
    :param kwargs: ``shuffle`` / ``random_state`` are forwarded to ``KFold``;
        any other keyword is forwarded to ``lgb.Dataset``
    :return: list with one metric value per fold
    """
    # KFold and lgb.Dataset share no keyword arguments, so handing the same
    # kwargs to both fails as soon as any keyword is supplied. Route each
    # keyword to the constructor that understands it.
    kfold_keys = ('shuffle', 'random_state')
    kfold_kwargs = {key: val for key, val in kwargs.items() if key in kfold_keys}
    dataset_kwargs = {key: val for key, val in kwargs.items() if key not in kfold_keys}

    kf = KFold(k, **kfold_kwargs)
    # NOTE(review): this threshold is an absolute 0.03, whereas score2 uses
    # 3% of plant_power — confirm the weighting threshold is intended.
    weights = np.ones_like(y)
    weights[y > 0.03] = 5
    ret = []
    for train, valid in kf.split(x):
        train_set = lgb.Dataset(x[train], y[train], weight=weights[train], **dataset_kwargs)
        valid_set = lgb.Dataset(x[valid], y[valid], weight=weights[valid], **dataset_kwargs)
        mdl = lgb.train(params, train_set, valid_sets=[train_set, valid_set], verbose_eval=-1)
        ret.append(metric(y[valid], mdl.predict(x[valid])))
    return ret


def lgb_grid_search_cv(paras_grid, x, y, k=3, **kwargs):
    """Exhaustive grid search that keeps the parameter set with the lowest mean CV score.

    Every combination produced by ``ParameterGrid(paras_grid)`` is evaluated
    with ``lgb_cv``; a progress line is printed each time a new best (lowest)
    score is found.

    :param paras_grid: mapping of parameter name -> candidate values
    :param x: feature array handed to ``lgb_cv``
    :param y: target array handed to ``lgb_cv``
    :param k: number of CV folds
    :param kwargs: forwarded to ``lgb_cv`` (must supply its ``metric`` argument)
    :return: ``(best_param, best_score)``; ``best_param`` stays ``None`` when no
        candidate scores below infinity
    """
    candidates = list(ParameterGrid(paras_grid))
    total = len(candidates)
    best_param, best_score = None, np.inf
    for idx, candidate in enumerate(candidates):
        cv_mean = np.mean(lgb_cv(candidate, x, y, k=k, **kwargs))
        # Only a strictly lower score replaces the incumbent (NaN never does).
        if not cv_mean < best_score:
            continue
        best_param, best_score = candidate, cv_mean
        print(f'step {idx / total * 100: .1f}%, best cv score: {best_score: .4f}')
    return best_param, best_score

def load_dataset(plant):
    """Read the train and test CSV files of one plant and stack them.

    The training file is de-duplicated; both files get short positional column
    names. Because the training frame has no ``id`` column and the test frame
    has no ``power`` / ``mirr`` columns, those cells are NaN for the other
    part after stacking.

    :param plant: plant id used to build the file names under ``../data``
    :return: one DataFrame with the train rows followed by the test rows,
        indexed 0..n-1
    """
    print(f'loading plant {plant} data')
    train = pd.read_csv(f'../data/train_{plant}.csv', parse_dates=["时间"]).drop_duplicates().reset_index(drop=True)
    test = pd.read_csv(f'../data/test_{plant}.csv', parse_dates=["时间"])
    # Columns are renamed by position, so this relies on the CSV column order.
    train.columns = ['time', 'irr', 'ws', 'wd', 'temp', 'pr', 'hm', 'mirr', 'power']
    test.columns = ['id', 'time', 'irr', 'ws', 'wd', 'temp', 'pr', 'hm']
    # ignore_index: both frames are indexed from 0, so a plain concat would
    # leave duplicate index labels and make label-based lookups ambiguous.
    data = pd.concat([train, test], ignore_index=True)
    return data


In [12]:
# Search space for the grid search: every key maps to the candidate values to try.
# NOTE(review): per the LightGBM docs bagging_fraction only takes effect when
# bagging_freq > 0, which is not set here — confirm whether bagging is wanted.
param = dict(
    bagging_fraction=[0.7],
    boosting=['gbdt'],
    feature_fraction=[1.0],
    learning_rate=[0.07, 0.05, 0.03],
    # 75, 125, ..., 325
    n_iter=range(75, 350, 50),
    num_leaves=[50, 31],
    objective=['regression', 'regression_l1'],
    task=['train'],
)

In [18]:
# Model inputs, shared by the grid search, the final fit and the prediction.
features = ['hm', 'pr', 'temp', 'wd', 'ws', 'irr', 'month', 'hour']

plants = [1, 2, 3, 4]
ret = []  # one prediction frame per plant
for plant in plants:
    data = load_dataset(plant=plant)
    # Rows without an id come from the training file, rows without power from the test file.
    train = data.id.isnull()
    test = data.power.isnull()
    # Calendar features derived from the timestamp.
    data['month'] = data['time'].dt.month
    data['hour'] = data['time'].dt.hour

    train_x = data.loc[train, features].values
    train_y = data.loc[train, 'power'].values
    print(train_x.shape, train_y.shape)

    # Pick the parameter set with the lowest 5-fold CV error for this plant.
    best_param, best_score = lgb_grid_search_cv(
        paras_grid=param,
        x=train_x,
        y=train_y,
        k=5,
        metric=lambda pm, pp: score2(pm, pp, plant),
    )
    print(best_param, best_score)

    # Refit on all training rows with the selected parameters.
    # NOTE(review): this fit passes no sample weights, unlike the folds inside
    # lgb_cv — confirm that is intended.
    trainset = lgb.Dataset(data.loc[train, features], label=data.loc[train, 'power'])
    model = lgb.train(best_param, trainset)

    pred = model.predict(data.loc[test, features])
    pred = pd.DataFrame({"id": data.loc[test, 'id'], "predicition": pred})
    assert len(pred) == test.sum()
    ret.append(pred)

loading plant 1 data
(66858, 8) (66858,)
step  0.0%, best cv score:  0.1177
step  1.4%, best cv score:  0.1171
step  4.2%, best cv score:  0.1168
step  37.5%, best cv score:  0.1165
step  43.1%, best cv score:  0.1165
{'bagging_fraction': 0.7, 'boosting': 'gbdt', 'feature_fraction': 1.0, 'learning_rate': 0.05, 'n_iter': 125, 'num_leaves': 31, 'objective': 'regression_l1', 'task': 'train'} 0.1164864648
loading plant 2 data
(43755, 8) (43755,)
step  0.0%, best cv score:  0.1354
step  1.4%, best cv score:  0.1329
step  4.2%, best cv score:  0.1323
step  9.7%, best cv score:  0.1318
{'bagging_fraction': 0.7, 'boosting': 'gbdt', 'feature_fraction': 1.0, 'learning_rate': 0.07, 'n_iter': 125, 'num_leaves': 31, 'objective': 'regression_l1', 'task': 'train'} 0.13183237243
loading plant 3 data
(29792, 8) (29792,)
step  0.0%, best cv score:  0.1325
step  1.4%, best cv score:  0.1264
step  4.2%, best cv score:  0.1242
step  37.5%, best cv score:  0.1236
{'bagging_fraction': 0.7, 'boosting': 'gbdt'

In [20]:
# Merge the per-plant prediction frames into a single submission table.
# The guard keeps the cell re-runnable: once `ret` is a DataFrame, passing it
# to pd.concat again would raise.
if not isinstance(ret, pd.DataFrame):
    ret = pd.concat(ret, ignore_index=True)
ret['id'] = ret['id'].astype(int)

# NOTE(review): absolute, user-specific output path — consider a configurable
# output directory.
ret.to_csv("/home/zhouzr/桌面/submit_20181107.csv", index=False)