In [118]:
import numpy as np
from chinese_calendar import is_holiday
from sklearn.model_selection import KFold, ParameterGrid
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import pandas as pd
import datetime as dt

plant_power = {
    1: 10,
    2: 10,
    3: 40,
    4: 50
}


def score2(pm, pp, plant):
    threshold = plant_power[plant] * 0.03
    index = pm >= threshold
    return np.abs(pm[index] - pp[index]).sum() / (np.sum(index) * plant_power[plant])



def _get_sample_weight(y, plant, w=5):
    plant_power = {
        1: 10,
        2: 10,
        3: 40,
        4: 50
    }
    weights = np.ones_like(y)
    weights[y > plant_power[plant] * 0.03] = w
    return weights


def _score2(pm, pp, plant):
    plant_power = {
        1: 10,
        2: 10,
        3: 40,
        4: 50
    }
    threshold = plant_power[plant] * 0.03
    index = pm >= threshold
    return np.abs(pm[index] - pp[index]).sum() / (np.sum(index) * plant_power[plant])


def lgb_cv(params, x, y, plant, shuffle, k=5, weight=5, **kwargs):
    kf = KFold(k, shuffle=shuffle, **kwargs)
    weights = _get_sample_weight(y, plant, w=weight)
    ret = []

    def metric(t, p):
        return _score2(t, p, plant)

    for train, valid in kf.split(x):
        train_set = lgb.Dataset(x[train], y[train], weight=weights[train], **kwargs)
        valid_set = lgb.Dataset(x[valid], y[valid], weight=weights[valid], **kwargs)
        mdl = lgb.train(params, train_set, valid_sets=[train_set, valid_set], verbose_eval=-1)
        ret.append(metric(y[valid], mdl.predict(x[valid])))
    return ret


def lgb_grid_search_cv(param_grid, x, y, plant, shuffle, k=5, **kwargs):
    grid = list(ParameterGrid(param_grid))
    max_score = np.inf
    best_param = None
    n_step = len(grid)
    for step, p in enumerate(grid):
        score = np.mean(lgb_cv(p, x, y, shuffle, plant=plant, k=k, **kwargs))
        if score < max_score:
            best_param = p
            max_score = score
            print(f'step {step / n_step * 100: .1f}%, best cv score: {max_score: .4f}')
    return best_param, max_score



def lgb_train(param, x, y, plant, **kwargs):
    weights = _get_sample_weight(y, plant)
    train_set = lgb.Dataset(x, y, weight=weights, **kwargs)
    model = lgb.train(param, train_set, verbose_eval=10)
    print(f'Plant {plant} trainset score: {_score2(y, model.predict(x), plant):.4f}')
    return model


def lgb_predict(model, x, idx):
    y = model.predict(x)
    pred = pd.DataFrame({"id": idx, "predicition": y})
    pred['id'] = pred['id'].astype(int)
    return pred


In [119]:
def load_dataset(plant):
    train = pd.read_csv(f'../data/train_{plant}.csv', parse_dates=["时间"]).drop_duplicates().reset_index(drop=True)
    test = pd.read_csv(f'../data/test_{plant}.csv', parse_dates=["时间"])
    train.columns = ['time', 'irr', 'ws', 'wd', 'temp', 'pr', 'hm', 'mirr', 'power']
    test.columns = ['id', 'time', 'irr', 'ws', 'wd', 'temp', 'pr', 'hm']
    data = pd.concat([train, test])
    return data

def data_preprocessing(data):
    print(f'data preprocessing ...')
    data['time'] = data['time'] + dt.timedelta(seconds=1.)
    # 处理异常power
    # 处理异常其他值
    return data


def date_feature(df, night=False, hour=True, month=True, month_hour=True,
         weekday=False, holiday=False, year=False):
    ret, features, feature_cat = [], [], []
    night_idx = ((df.time.dt.hour <= 5) | (df.time.dt.hour >= 20))
    if night:
        ret.append(night_idx.astype(int))
        features.append('date_night')
        feature_cat.append('cat')
    if hour:
        hour = df.time.dt.hour + df.time.dt.minute / 60
        hour = hour.apply(lambda x: '0' if x <= 5 or x >=20 else f'{x}')
        hour = LabelEncoder().fit_transform(hour)
        ret.append(hour)
        features.append('date_hour')
        feature_cat.append('cat')
    if month:
        ret.append(df.time.dt.month)
        features.append('date_month')
        feature_cat.append('cat')
    if month_hour:
        hour_map = lambda x: '0' if x <= 5 or x >=20 else f'{x}'
        hour = df.time.dt.hour.apply(hour_map)
        month = df.time.dt.month.astype(str)
        hour_month = LabelEncoder().fit_transform(hour + month)
        ret.append(hour_month)
        features.append('date_hour_month')
        feature_cat.append('cat')
    if weekday:
        ret.append(df.time.dt.weekday)
        features.append('date_weekday')
        feature_cat.append('cat')
    if year:
        ret.append(df.time.dt.year)
        features.append('date_year')
        feature_cat.append('cat')
    if holiday:
        ret.append(df.time.apply(is_holiday).astype(np.int).values)
        features.append('date_holiday')
        feature_cat.append('cat')
    if len(ret) == 0:
        raise ValueError('必须输入至少一个特征')
    return np.stack(ret, axis=1), features, feature_cat


def cross_irr_month(df):
    # month & irr_lvl
    irr_lvl = pd.cut(df['irr'], 20, labels=[f'irr_{i}' for i in range(20)])
    month = df.time.dt.month.astype(str)
    irr_month = LabelEncoder().fit_transform(irr_lvl + month)
    return irr_month.reshape(-1, 1), ['cross_irr_month'], ['cat']


def arithmetic_mapping(field_1, field_2, df):
    ret = []
    features = []
    feature_cat = ['num'] * 4
    for act in '+-*/':
        ret.append(eval(f'df[field_1] {act} df[field_2]').values)
        features.append(f'{field_1}{act}{field_2}')
    ret = np.stack(ret, axis=1)
    return ret, features, feature_cat


def arithmetic_field_mapping(fields_1, fields_2, df):
    field_combination = [(f1, f2) for f1 in fields_1 for f2 in fields_2]
    ret, features, feature_cat = [], [], []
    for f1, f2 in field_combination:
        r, fs, fc = arithmetic_mapping(f1, f2, df)
        ret.append(r)
        features += fs
        feature_cat += fc
    return np.concatenate(ret, axis=1), features, feature_cat


def origin_feature(df):
    feature = df[['hm', 'irr', 'pr', 'temp', 'wd', 'ws']]
    feature.columns = ['org '+col for col in feature.columns]
    return feature.values, feature.columns.tolist(), ['num'] * 6

In [120]:
featureTopN = 40
sampleWeight = 5

param_grid_full = {
    'boosting': ['gbdt'], 
    'learning_rate': [0.05, 0.03, 0.01] ,
    'num_iterations': range(500, 1000, 50) ,
    'num_leaves': [50, 60, 70],
    'max_depth': [7],
    'bagging_fraction': [0.7, 0.9],
    'feature_fraction': [0.7, 0.9],
    'objective': ['regression'],
    'task': ['train']
}

param_feature_selector = {'bagging_fraction': 0.7, 'boosting': 'gbdt', 'feature_fraction': 0.8, 
                          'learning_rate': 0.1, 'n_iter': 100, 'num_leaves': 31, 'objective': 'regression_l1', 
                          'task': 'train'}

plant = 1
choice_n = 5000

data = load_dataset(plant)
train = data.id.isnull()
test = data.power.isnull()

X = []
train_y = data[train].power.values
feature_name = []
feature_category = []

X_date, date_name, date_cate = date_feature(data)
X_org, org_name, org_cate = origin_feature(data)
X_cross1, cross1_name, cross1_cate = cross_irr_month(data)
X_map, map_name, map_cate = arithmetic_field_mapping(['hm', 'irr', 'pr', 'temp'], 
                                                     ['hm', 'irr', 'pr', 'temp'], data)

X = np.concatenate([X_date, X_org, X_map, X_cross1], axis=1)
feature_name = date_name + org_name + map_name + cross1_name
feature_category = date_cate + org_cate + map_cate + cross1_cate

train_X = X[train]
test_X = X[test]

choice = np.random.choice(train_X.shape[0], size=choice_n, replace=False)

feature_model = lgb_train(param_feature_selector, x=train_X, y=train_y, plant=plant)
feature_mask = np.where(np.argsort(-feature_model.feature_importance())<=featureTopN)[0]
feature_importance = \
pd.Series(feature_model.feature_importance(), index=feature_name).sort_values(ascending=False)
print(feature_importance[: featureTopN])
best_param, best_score = lgb_grid_search_cv(param_grid=param_grid_full, 
                   x=train_X[choice, :][:, feature_mask],
                   y=train_y[choice], shuffle=True,
                   k=5, plant=plant)
print(best_param, best_score)



Plant 1 trainset score: 0.0915
date_hour_month    261
date_hour          233
date_month         193
org irr            140
org wd             118
org ws             113
irr-temp           111
pr+temp             99
hm+temp             97
irr*irr             91
org hm              86
cross_irr_month     70
irr/temp            59
irr+temp            58
temp-irr            58
hm-pr               57
pr*temp             53
hm/pr               52
hm*temp             48
hm/temp             48
hm*pr               44
org temp            44
pr-temp             43
temp/pr             41
irr*temp            40
pr-hm               38
pr/temp             38
irr+pr              33
temp-hm             32
hm-temp             32
irr+irr             31
irr-hm              30
temp*temp           30
temp+hm             30
temp/irr            28
hm+pr               28
temp-pr             28
hm+irr              26
hm-irr              25
temp+pr             22
dtype: int64


TypeError: lgb_cv() got multiple values for argument 'plant'

In [100]:
featureTopN = 40
sampleWeight = 5

max_score = 10
best = None

for f in range(20,45,5):
    for w in range(4,12,1):
        feature_model = lgb_train(param_feature_selector, x=train_X, y=train_y, plant=plant)
        feature_mask = np.where(np.argsort(-feature_model.feature_importance())<=featureTopN)[0]
        s = lgb_cv(best_param, train_X[:, feature_mask], train_y, k=5, plant=1, weight=w)
        s = np.mean(s)
        if s <= max_score:
            best = [f, w]
            max_score = s
            print(max_score, best)



Plant 1 trainset score: 0.0912
0.118954573838 [20, 4]
Plant 1 trainset score: 0.0912
Plant 1 trainset score: 0.0912
Plant 1 trainset score: 0.0912
Plant 1 trainset score: 0.0912
Plant 1 trainset score: 0.0912
0.118954573838 [20, 9]
Plant 1 trainset score: 0.0912
0.118954573838 [20, 10]
Plant 1 trainset score: 0.0912
0.118954573838 [20, 11]
Plant 1 trainset score: 0.0912
Plant 1 trainset score: 0.0912
Plant 1 trainset score: 0.0912
Plant 1 trainset score: 0.0912
0.118954573838 [25, 7]
Plant 1 trainset score: 0.0912
Plant 1 trainset score: 0.0912
Plant 1 trainset score: 0.0912
Plant 1 trainset score: 0.0912
Plant 1 trainset score: 0.0912
Plant 1 trainset score: 0.0912
Plant 1 trainset score: 0.0912
Plant 1 trainset score: 0.0912
Plant 1 trainset score: 0.0912
Plant 1 trainset score: 0.0912
Plant 1 trainset score: 0.0912
Plant 1 trainset score: 0.0912
Plant 1 trainset score: 0.0912
Plant 1 trainset score: 0.0912
Plant 1 trainset score: 0.0912
Plant 1 trainset score: 0.0912
Plant 1 trainse

In [108]:
featureTopN = 40
sampleWeight = 5

param_grid = {
    'bagging_fraction': [0.7, 0.8], 
    'boosting': ['gbdt'], 
    'feature_fraction': [0.8, 0.9], 
    'learning_rate': [0.05, 0.1, 0.03] ,
    'n_iter': range(75, 600, 50) ,
    'num_leaves': [50, 31], 
    'objective': ['regression', 'regression_l1'],
    'task': ['train']
}

param_feature_selector = {'bagging_fraction': 0.7, 'boosting': 'gbdt', 'feature_fraction': 0.8, 
                          'learning_rate': 0.1, 'n_iter': 100, 'num_leaves': 31, 'objective': 'regression_l1', 
                          'task': 'train'}


plants = [1, 2, 3, 4]
ret = []
for plant in plants:
    data = load_dataset(plant)
    train = data.id.isnull()
    test = data.power.isnull()

    X = []
    train_y = data[train].power.values
    feature_name = []
    feature_category = []

    X_date, date_name, date_cate = date_feature(data)
    X_org, org_name, org_cate = origin_feature(data)
    X_cross1, cross1_name, cross1_cate = cross_irr_month(data)
    X_map, map_name, map_cate = arithmetic_field_mapping(['hm', 'irr', 'pr', 'temp', 'wd', 'ws'], 
                                                         ['hm', 'irr', 'pr', 'temp', 'wd', 'ws'], data)

    X = np.concatenate([X_date, X_org, X_map, X_cross1], axis=1)
    feature_name = date_name + org_name + map_name + cross1_name
    feature_category = date_cate + org_cate + map_cate + cross1_cate

    train_X = X[train]
    test_X = X[test]
    
    feature_model = lgb_train(param_feature_selector, x=train_X, y=train_y, plant=plant)
    feature_mask = np.where(np.argsort(-feature_model.feature_importance())<=featureTopN)[0]
    best_param, best_score = lgb_grid_search_cv(param_grid=param_grid, 
                       x=train_X[: ,feature_mask],
                       y=train_y,
                       k=5, plant=plant)
    print(best_param, best_score)
    model = lgb_train(param=best_param, x=train_X[: ,feature_mask], y=train_y, plant=plant)
    ret_plant = lgb_predict(model, x=test_X[: ,feature_mask], idx=data[test]['id'].values)
    ret.append(ret_plant)



Plant 1 trainset score: 0.0900
step  0.0%, best cv score:  0.1178
step  0.2%, best cv score:  0.1152
step  0.6%, best cv score:  0.1152
step  0.9%, best cv score:  0.1151
step  1.3%, best cv score:  0.1149
step  2.1%, best cv score:  0.1149
step  18.0%, best cv score:  0.1148
step  18.8%, best cv score:  0.1146
step  19.5%, best cv score:  0.1145
{'bagging_fraction': 0.7, 'boosting': 'gbdt', 'feature_fraction': 0.8, 'learning_rate': 0.03, 'n_iter': 225, 'num_leaves': 31, 'objective': 'regression_l1', 'task': 'train'} 0.114459876371
Plant 1 trainset score: 0.0931
Plant 2 trainset score: 0.0947
step  0.0%, best cv score:  0.1360
step  0.2%, best cv score:  0.1339
step  1.3%, best cv score:  0.1338
step  2.1%, best cv score:  0.1336
step  18.0%, best cv score:  0.1332
step  18.8%, best cv score:  0.1328


KeyboardInterrupt: 

In [117]:
p = {
    'bagging_fraction': 0.9, 'boosting': 'gbdt', 
    'feature_fraction': 1.0, 
    'learning_rate': 0.05, 
    'n_iter': 200, 
    'num_leaves': 20, 
    'max_depth': 6,
    'objective': 'regression_l1', 
    'task': 'train'
}

plants = [1, 2, 3, 4]
ret = []
for plant in plants:
    data = load_dataset(plant)
    train = data.id.isnull()
    test = data.power.isnull()

    X = []
    train_y = data[train].power.values
    feature_name = []
    feature_category = []

    X_date, date_name, date_cate = date_feature(data)
    X_org, org_name, org_cate = origin_feature(data)
    X_cross1, cross1_name, cross1_cate = cross_irr_month(data)
    X_map, map_name, map_cate = arithmetic_field_mapping(['hm', 'irr', 'pr', 'temp', 'wd', 'ws'], 
                                                         ['hm', 'irr', 'pr', 'temp', 'wd', 'ws'], data)

    X = np.concatenate([X_date, X_org, X_map, X_cross1], axis=1)
    feature_name = date_name + org_name + map_name + cross1_name
    feature_category = date_cate + org_cate + map_cate + cross1_cate

    train_X = X[train]
    test_X = X[test]
    
    feature_model = lgb_train(param_feature_selector, x=train_X, y=train_y, plant=plant)
    feature_mask = np.where(np.argsort(-feature_model.feature_importance())<=20)[0]
#     best_param, best_score = lgb_grid_search_cv(param_grid=param_grid, 
#                        x=train_X[: ,feature_mask],
#                        y=train_y,
#                        k=5, plant=plant)
#     print(best_param, best_score)
    s = lgb_cv(p, train_X[:, feature_mask], train_y, k=5, plant=plant)
    print(np.mean(s))
    model = lgb_train(param=p, x=train_X[: ,feature_mask], y=train_y, plant=plant)
    ret_plant = lgb_predict(model, x=test_X[: ,feature_mask], idx=data[test]['id'].values)
    ret.append(ret_plant)



Plant 1 trainset score: 0.0900
0.117100306285
Plant 1 trainset score: 0.1016
Plant 2 trainset score: 0.0947
0.133799048207
Plant 2 trainset score: 0.1092
Plant 3 trainset score: 0.0742


KeyboardInterrupt: 

In [110]:
ret = pd.concat(ret)
ret.to_csv("/home/zhouzr/桌面/submit_20181109v2.csv", index=None)