In [2]:
import joblib
import itertools as it

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm

mpl.style.use('seaborn-dark-palette')

font = {'family': 'normal',
        'weight': 'normal',
        'size'  : 16}
plt.tight_layout()

mpl.rc('font', **font)

sns.set_style("whitegrid")

pd.options.display.max_rows = 999
pd.options.display.max_columns = 999

pd.options.display.float_format = '{:,.4f}'.format

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


## Load solar power data

In [3]:
df_solar_power = pd.read_csv( '../data/solar/solar_power_train.csv',
                             header=0,
                             index_col=0,
                             parse_dates=True)

df_solar_meteo = pd.read_csv('../data/solar/solar_meteo_train.csv',
                             header=[0, 1],
                             skiprows=[2],
                             index_col=0,
                             parse_dates=True)

df_solar_meteo_test = pd.read_csv('../data/solar/solar_meteo_test.csv',
                             header=[0, 1],
                             skiprows=[2],
                             index_col=0,
                             parse_dates=True)

In [4]:
df_solar_meteo.head()

ZONEID,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3
Unnamed: 0_level_1,VAR134,VAR134,VAR134,VAR157,VAR157,VAR157,VAR164,VAR164,VAR164,VAR165,VAR165,VAR165,VAR166,VAR166,VAR166,VAR167,VAR167,VAR167,VAR169,VAR169,VAR169,VAR175,VAR175,VAR175,VAR178,VAR178,VAR178,VAR228,VAR228,VAR228,VAR78,VAR78,VAR78,VAR79,VAR79,VAR79
2012-04-01 01:00:00,94843.625,94290.625,92667.625,60.2219,57.3743,57.0032,0.2446,0.191,0.2381,1.0393,0.8579,0.9862,-2.503,-2.8016,-3.1334,294.4485,294.3933,293.9036,2577830.0,2595302.0,2606438.0,1202532.0,1192092.0,1158284.0,2861797.0,2884677.0,2901861.0,0.0,0.0,0.0,0.002,0.0015,0.0035,0.0036,0.0054,0.0247
2012-04-01 02:00:00,94757.9375,94217.6875,92622.6875,54.6786,57.1298,64.9608,0.4571,0.5171,0.5616,2.4829,2.5941,2.6884,-2.9933,-2.6532,-2.5749,295.6514,294.9624,293.1965,5356093.0,5374973.0,5352637.0,2446757.0,2420485.0,2341093.0,5949378.0,5984578.0,5980930.0,0.0,0.0,0.0008,0.0055,0.0183,0.0275,0.0336,0.1016,0.1405
2012-04-01 03:00:00,94732.8125,94201.0625,92598.8125,61.2949,63.0761,66.7959,0.7714,0.7052,0.6616,3.3399,2.8814,2.8845,-1.9825,-1.6478,-1.8937,294.4546,293.8767,292.5925,7921788.0,7858492.0,7714300.0,3681336.0,3643656.0,3531304.0,8939176.0,8929064.0,8822632.0,0.0013,0.0013,0.0017,0.0301,0.0353,0.0324,0.132,0.1129,0.1366
2012-04-01 04:00:00,94704.0625,94156.0625,92542.0625,67.7753,62.9755,65.1776,0.9659,0.9934,0.9416,3.1061,3.0248,3.1757,-1.4461,-1.4968,-1.6399,293.2615,293.0713,292.1748,9860520.0,9719720.0,9619368.0,4921504.0,4885120.0,4740096.0,11331679.0,11258463.0,11178847.0,0.0025,0.0017,0.0021,0.0572,0.0645,0.032,0.1106,0.1078,0.0974
2012-04-01 05:00:00,94675.0,94124.25,92508.0,70.173,65.0538,65.7443,0.9447,0.9376,0.9447,2.6011,2.5216,2.6973,-1.9045,-1.8366,-1.8801,292.7329,292.5444,291.5845,11143097.0,11054009.0,11054009.0,6254380.0,6211372.0,6014412.0,13105558.0,13079318.0,13079958.0,0.0033,0.0021,0.0025,0.051,0.0595,0.0485,0.1896,0.1576,0.1638


In [18]:
df_solar_power.describe()

Unnamed: 0,1,2,3
count,15360.0,15360.0,15360.0
mean,0.1735,0.1923,0.1992
std,0.2628,0.2795,0.288
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0027,0.0024,0.003
75%,0.2848,0.3575,0.3707
max,0.9162,0.9778,1.0035


In [19]:
val_set_size = 3000

In [20]:
len(df_solar_meteo)

15359

In [21]:
len(df_solar_power)

15360

In [22]:
plant_cols = df_solar_power.columns

In [23]:
plant_cols

Index(['1', '2', '3'], dtype='object')

In [24]:
meteo_cols = df_solar_meteo['1']

In [25]:
dfs = {}
for plant in plant_cols:
    tmpdf = df_solar_meteo[plant].copy()
    tmpdf['label'] = df_solar_power[plant].iloc[:-1]
    tmpdf['split'] = ['train' for _ in range(len(df_solar_meteo)-val_set_size)] + ['val' for _ in range(val_set_size)]
    
    tmpdf_test = df_solar_meteo_test[plant].copy()
    tmpdf_test['split'] = 'test'
    dfs[plant] = pd.concat([tmpdf, tmpdf_test])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if __name__ == '__main__':


In [26]:
df = pd.concat([df_solar_meteo, df_solar_power.iloc[:-1]], axis=1)
df['split'] = ['train' for _ in range(len(df_solar_meteo)-val_set_size)] + ['val' for _ in range(val_set_size)]


In [324]:
# Tests on how easy it is to get good predictions:

In [128]:
df_power = df_solar_power.copy()
for col in plant_cols:
    df_power[col + '_shifted'] = df_power[col].shift(1).fillna(0)

In [129]:
from sklearn.metrics import *

In [55]:
mean_squared_error(df_power[plant_cols], df_power[plant_cols + '_shifted'], multioutput='raw_values')

array([0.01301517, 0.01323873, 0.01361373])

In [67]:
avg_prediction = df_power[plant_cols].mean().values.reshape([1, -1]).repeat(len(df_power), axis=0)

In [68]:
mean_squared_error(df_power[plant_cols], avg_prediction, multioutput='raw_values')

array([0.06903618, 0.07810757, 0.08293667])

In [52]:
((df_power[plant_cols].values - df_power[plant_cols + '_shifted'].values)**2).mean(axis=0)

array([0.01301517, 0.01323873, 0.01361373])

In [None]:
((df_power[plant_cols].values - df_power[plant_cols + '_shifted'].values)**2).mean(axis=0)

In [18]:
for k in dfs:
    dfs[k]['hour'] =  dfs[k].index.hour

In [19]:
# does not help:
# for k in dfs:
#     for col in dfs[k].columns.drop(['label', 'split']):
#         dfs[k][col + '_is_zero'] = dfs[k][col] == 0.0

In [None]:
# Groupby is used to separate days. Can be used for many nice tricks.

In [21]:
# last and next
for k in dfs:
    for col in list(meteo_cols) + ['hour']:
        day = ((dfs[k].index.astype('int64')-dfs[k].index.astype('int64').min())/1e9/60/60/24).astype('int64')
        for i in range(1,5):
            dfs[k][col+'_+{}'.format(i)] = dfs[k].groupby(day)[col].shift(i).fillna(0)
            dfs[k][col+'_-{}'.format(i)] = dfs[k].groupby(day)[col].shift(-i).fillna(0)        
            dfs[k][col+'_minus_+{}'.format(i)] = dfs[k][col] - dfs[k][col+'_+{}'.format(i)]
            dfs[k][col+'_minus_-{}'.format(i)] = dfs[k][col] - dfs[k][col+'_-{}'.format(i)]
# It seems like the explicit 

In [417]:
# col = list(meteo_cols)[0]
# dfs[k].groupby(day)[col].apply(lambda x: [])

'VAR134'

In [None]:
# for k in dfs:
#     for col in meteo_cols:
#         day = ((dfs[k].index.astype('int64')-dfs[k].index.astype('int64').min())/1e9/60/60/24).astype('int64')
#         dfs[k][col] = dfs[k].groupby(day)[col].


In [34]:
# Note that the LGBM API has updated from this
m = lightgbm.LGBMRegressor(
    random_state=0, bagging_fraction=0.5, 
    num_iterations=1000, learning_rate=0.05, early_stopping_rounds=50)

In [35]:
tmpdf = dfs['1']
# I dropped columns from feature set, could also have used a selection like so:
# selection = ['hour', 'VAR137', ...]
# features = tmpdf[selection]

m.fit(
    tmpdf[tmpdf.split == 'train'].drop(['label', 'split'], axis=1),
    tmpdf[tmpdf.split == 'train']['label'],
    eval_set=[(tmpdf[tmpdf.split == 'val'].drop(['label', 'split'], axis=1), tmpdf[tmpdf.split == 'val']['label'])],
#     categorical_feature=[col for col in tmpdf.columns if col.endswith('_is_zero')],# ['hour']
)

# [100]	valid_0's l2: 0.0116614
# [100]	valid_0's l2: 0.00536193 with hour
# [1000]	valid_0's l2: 0.00531883 1000 iterations, 0.01 learning rate
# [1000]	valid_0's l2: 0.00525209 1000 iterations, 0.01 learning rate, 0.5 bagging
# [280]	valid_0's l2: 0.00517255 0.05 learning rate, 0.5 bagging
#[110]	valid_0's l2: 0.00481502, as above
# [110]	valid_0's l2: 0.00478648 (fixed bug?) and prev hour
# [133]	valid_0's l2: 0.00449676 with prev minus multiple times and prev hour

# [127]	valid_0's l2: 0.00475489 with prev and prev minus
# [183]	valid_0's l2: 0.00457361 with hour previous and duplicates etc
# [219]	valid_0's l2: 0.00466977 with 2 diffs and previous

# [225]	valid_0's l2: 0.00519401 with 12 previous 
# [115]	valid_0's l2: 0.00481267 with 12 previous and diffs


# [100]	valid_0's l2: 0.00574346 with hour as categorical

# [100]	valid_0's l2: 0.0141626  using dart
#[100]	valid_0's l2: 0.00814383 using dart with hour




[1]	valid_0's l2: 0.0774502
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's l2: 0.0717287
[3]	valid_0's l2: 0.0665136
[4]	valid_0's l2: 0.0619163
[5]	valid_0's l2: 0.0576785
[6]	valid_0's l2: 0.0537926
[7]	valid_0's l2: 0.0503358
[8]	valid_0's l2: 0.047131
[9]	valid_0's l2: 0.0441876
[10]	valid_0's l2: 0.0414777
[11]	valid_0's l2: 0.0390307
[12]	valid_0's l2: 0.0368628
[13]	valid_0's l2: 0.0348443
[14]	valid_0's l2: 0.0329815
[15]	valid_0's l2: 0.0312686
[16]	valid_0's l2: 0.0296602
[17]	valid_0's l2: 0.0281615
[18]	valid_0's l2: 0.0268999
[19]	valid_0's l2: 0.0257031
[20]	valid_0's l2: 0.0245615
[21]	valid_0's l2: 0.0235347
[22]	valid_0's l2: 0.0226736
[23]	valid_0's l2: 0.0218193
[24]	valid_0's l2: 0.0210258
[25]	valid_0's l2: 0.0203185
[26]	valid_0's l2: 0.0196519
[27]	valid_0's l2: 0.0190369
[28]	valid_0's l2: 0.018456
[29]	valid_0's l2: 0.0179304
[30]	valid_0's l2: 0.0174376
[31]	valid_0's l2: 0.0169945
[32]	valid_0's l2: 0.0165904
[33]	valid_0's l2: 0.

LGBMRegressor(bagging_fraction=0.5, boosting_type='gbdt', class_weight=None,
       colsample_bytree=1.0, early_stopping_rounds=50,
       importance_type='split', learning_rate=0.05, max_depth=-1,
       min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
       n_estimators=100, n_jobs=-1, num_iterations=1000, num_leaves=31,
       objective=None, random_state=0, reg_alpha=0.0, reg_lambda=0.0,
       silent=True, subsample=1.0, subsample_for_bin=200000,
       subsample_freq=0)

In [36]:
m.predict(tmpdf[tmpdf.split == 'test'].drop(['label', 'split'], axis=1))

array([0.70539424, 0.55306566, 0.33989707, ..., 0.02170833, 0.03495289,
       0.31230635])

In [40]:
# run for all plants
predicted = {}
for k, tmpdf in dfs.items():
    m = lightgbm.LGBMRegressor(
    random_state=0, bagging_fraction=0.5, 
    num_iterations=1000, learning_rate=0.05, early_stopping_rounds=50)
    
    print(k)
    m.fit(
        tmpdf[tmpdf.split == 'train'].drop(['label', 'split'], axis=1),
        tmpdf[tmpdf.split == 'train']['label'],
        eval_set=[(tmpdf[tmpdf.split == 'val'].drop(['label', 'split'], axis=1), tmpdf[tmpdf.split == 'val']['label'])],
    )
    
    best_iteration = np.argmin(m.evals_result_['valid_0']['l2'])
    print('best iteration at ', best_iteration)
    # Now we know the number of iterations to run for, we used that to train on all data WITHOUT 
    # validation data (that could be used for early stopping)
    
    m = lightgbm.LGBMRegressor(
    random_state=0, bagging_fraction=0.5, 
    num_iterations=best_iteration, learning_rate=0.05)
    
    m.fit(
        tmpdf[tmpdf.split != 'test'].drop(['label', 'split'], axis=1),
        tmpdf[tmpdf.split != 'test']['label'],
    )
    
    predicted[k] = m.predict(tmpdf[tmpdf.split == 'test'].drop(['label', 'split'], axis=1))

1
[1]	valid_0's l2: 0.0774502
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's l2: 0.0717287
[3]	valid_0's l2: 0.0665136
[4]	valid_0's l2: 0.0619163
[5]	valid_0's l2: 0.0576785




[6]	valid_0's l2: 0.0537926
[7]	valid_0's l2: 0.0503358
[8]	valid_0's l2: 0.047131
[9]	valid_0's l2: 0.0441876
[10]	valid_0's l2: 0.0414777
[11]	valid_0's l2: 0.0390307
[12]	valid_0's l2: 0.0368628
[13]	valid_0's l2: 0.0348443
[14]	valid_0's l2: 0.0329815
[15]	valid_0's l2: 0.0312686
[16]	valid_0's l2: 0.0296602
[17]	valid_0's l2: 0.0281615
[18]	valid_0's l2: 0.0268999
[19]	valid_0's l2: 0.0257031
[20]	valid_0's l2: 0.0245615
[21]	valid_0's l2: 0.0235347
[22]	valid_0's l2: 0.0226736
[23]	valid_0's l2: 0.0218193
[24]	valid_0's l2: 0.0210258
[25]	valid_0's l2: 0.0203185
[26]	valid_0's l2: 0.0196519
[27]	valid_0's l2: 0.0190369
[28]	valid_0's l2: 0.018456
[29]	valid_0's l2: 0.0179304
[30]	valid_0's l2: 0.0174376
[31]	valid_0's l2: 0.0169945
[32]	valid_0's l2: 0.0165904
[33]	valid_0's l2: 0.0162757
[34]	valid_0's l2: 0.015966
[35]	valid_0's l2: 0.01565
[36]	valid_0's l2: 0.0153774
[37]	valid_0's l2: 0.0151344
[38]	valid_0's l2: 0.0149001
[39]	valid_0's l2: 0.0146351
[40]	valid_0's l2: 0.01



2
[1]	valid_0's l2: 0.0888543
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's l2: 0.081897
[3]	valid_0's l2: 0.0755652
[4]	valid_0's l2: 0.0698103
[5]	valid_0's l2: 0.0645864
[6]	valid_0's l2: 0.0598724
[7]	valid_0's l2: 0.0556635
[8]	valid_0's l2: 0.0516856
[9]	valid_0's l2: 0.0480965
[10]	valid_0's l2: 0.0447063




[11]	valid_0's l2: 0.0417232
[12]	valid_0's l2: 0.0389776
[13]	valid_0's l2: 0.03648
[14]	valid_0's l2: 0.0343007
[15]	valid_0's l2: 0.032348
[16]	valid_0's l2: 0.0305005
[17]	valid_0's l2: 0.0287822
[18]	valid_0's l2: 0.0273098
[19]	valid_0's l2: 0.0258801
[20]	valid_0's l2: 0.024628
[21]	valid_0's l2: 0.0234397
[22]	valid_0's l2: 0.0223747
[23]	valid_0's l2: 0.021396
[24]	valid_0's l2: 0.0205452
[25]	valid_0's l2: 0.0197435
[26]	valid_0's l2: 0.0189938
[27]	valid_0's l2: 0.0182623
[28]	valid_0's l2: 0.0176591
[29]	valid_0's l2: 0.0171112
[30]	valid_0's l2: 0.016603
[31]	valid_0's l2: 0.0161357
[32]	valid_0's l2: 0.0156979
[33]	valid_0's l2: 0.0153205
[34]	valid_0's l2: 0.0149803
[35]	valid_0's l2: 0.0146151
[36]	valid_0's l2: 0.0143031
[37]	valid_0's l2: 0.0140298
[38]	valid_0's l2: 0.0137425
[39]	valid_0's l2: 0.0135002
[40]	valid_0's l2: 0.0132534
[41]	valid_0's l2: 0.0130537
[42]	valid_0's l2: 0.0128469
[43]	valid_0's l2: 0.012657
[44]	valid_0's l2: 0.0124868
[45]	valid_0's l2: 0



3
[1]	valid_0's l2: 0.0937569
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's l2: 0.0866288
[3]	valid_0's l2: 0.0801582
[4]	valid_0's l2: 0.0742837
[5]	valid_0's l2: 0.068896
[6]	valid_0's l2: 0.0639952
[7]	valid_0's l2: 0.0596254
[8]	valid_0's l2: 0.0557192
[9]	valid_0's l2: 0.052049
[10]	valid_0's l2: 0.0487556




[11]	valid_0's l2: 0.0457578
[12]	valid_0's l2: 0.043072
[13]	valid_0's l2: 0.0405818
[14]	valid_0's l2: 0.0384339
[15]	valid_0's l2: 0.0363473
[16]	valid_0's l2: 0.0344817
[17]	valid_0's l2: 0.0328015
[18]	valid_0's l2: 0.0312461
[19]	valid_0's l2: 0.0297931
[20]	valid_0's l2: 0.0284724
[21]	valid_0's l2: 0.0272866
[22]	valid_0's l2: 0.0262354
[23]	valid_0's l2: 0.0251658
[24]	valid_0's l2: 0.0242763
[25]	valid_0's l2: 0.023455
[26]	valid_0's l2: 0.0226739
[27]	valid_0's l2: 0.0219901
[28]	valid_0's l2: 0.0213061
[29]	valid_0's l2: 0.0206337
[30]	valid_0's l2: 0.0200378
[31]	valid_0's l2: 0.0195173
[32]	valid_0's l2: 0.019023
[33]	valid_0's l2: 0.0185677
[34]	valid_0's l2: 0.0181617
[35]	valid_0's l2: 0.0177832
[36]	valid_0's l2: 0.017372
[37]	valid_0's l2: 0.0170609
[38]	valid_0's l2: 0.0167309
[39]	valid_0's l2: 0.0164195
[40]	valid_0's l2: 0.0161172
[41]	valid_0's l2: 0.0158623
[42]	valid_0's l2: 0.0156782
[43]	valid_0's l2: 0.0154323
[44]	valid_0's l2: 0.0152278
[45]	valid_0's l2:



In [None]:
df_predicted = pd.DataFrame(predicted, index=df_solar_meteo_test.index)

In [None]:
df_predicted.to_msgpack('anton2.msgpack')

In [372]:
df_predicted.head()

Unnamed: 0,1,2,3
2014-01-01 00:00:00,0.6677,0.4877,0.6043
2014-01-01 01:00:00,0.5133,0.5313,0.7179
2014-01-01 02:00:00,0.4505,0.4627,0.4849
2014-01-01 03:00:00,0.3478,0.3415,0.3877
2014-01-01 04:00:00,0.2378,0.3154,0.2517


In [314]:
np.mean([0.0048, 0.0066, 0.00608])

0.0058266666666666675

In [304]:
for k in dfs:
    dfs[k]['plant_id'] = k
    dfs[k]['plant_id'] = pd.Categorical(dfs[k]['plant_id'])

In [308]:
# concatenating all powerplants -> Gets worse results!

df_long = pd.concat(dfs.values())
df_long['plant_id'] = pd.Categorical(df_long['plant_id'])

In [309]:
m = lightgbm.LGBMRegressor(
    random_state=0, bagging_fraction=0.5, 
    num_iterations=1000, learning_rate=0.05, early_stopping_rounds=50)

In [381]:

m.fit(
    df_long[df_long.split == 'train'].drop(['label', 'split'], axis=1),
    df_long[df_long.split == 'train']['label'],
    eval_set=[(df_long[df_long.split == 'val'].drop(['label', 'split'], axis=1), df_long[df_long.split == 'val']['label'])],
#     categorical_feature=[col for col in tmpdf.columns if col.endswith('_is_zero')],# ['hour']
)
x



[1]	valid_0's l2: 0.0860239
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's l2: 0.0784936
[3]	valid_0's l2: 0.0718002
[4]	valid_0's l2: 0.065743
[5]	valid_0's l2: 0.0602294
[6]	valid_0's l2: 0.0553351
[7]	valid_0's l2: 0.0508355
[8]	valid_0's l2: 0.0467617
[9]	valid_0's l2: 0.0430605
[10]	valid_0's l2: 0.0397445
[11]	valid_0's l2: 0.0367052
[12]	valid_0's l2: 0.0340408
[13]	valid_0's l2: 0.0315289
[14]	valid_0's l2: 0.0293373
[15]	valid_0's l2: 0.0273432
[16]	valid_0's l2: 0.0255453
[17]	valid_0's l2: 0.0238878
[18]	valid_0's l2: 0.0224247
[19]	valid_0's l2: 0.0210844
[20]	valid_0's l2: 0.0198841
[21]	valid_0's l2: 0.0187538
[22]	valid_0's l2: 0.0176686
[23]	valid_0's l2: 0.0167196
[24]	valid_0's l2: 0.0158362
[25]	valid_0's l2: 0.0150287
[26]	valid_0's l2: 0.0142868
[27]	valid_0's l2: 0.0136727
[28]	valid_0's l2: 0.0130493
[29]	valid_0's l2: 0.0124372
[30]	valid_0's l2: 0.011913
[31]	valid_0's l2: 0.0114717
[32]	valid_0's l2: 0.0110472
[33]	valid_0's l2: 0.

In [249]:
df['hour'] = df.index.hour

In [253]:
m = lightgbm.LGBMRegressor(
    random_state=0, bagging_fraction=0.5, 
    num_iterations=1000, learning_rate=0.05, early_stopping_rounds=50)

In [254]:
non_features = ['1', '2', '3', 'split']

m.fit(
    df[df.split == 'train'].drop(non_features, axis=1),
    df[df.split == 'train']['1'],
    eval_set=[(df[df.split == 'val'].drop(non_features, axis=1), df[df.split == 'val']['1'])],
#     categorical_feature=[col for col in tmpdf.columns if col.endswith('_is_zero')],# ['hour']
)
# [100]	valid_0's l2: 0.0103299
# [100]	valid_0's l2: 0.00541201 with hour
# [132]	valid_0's l2: 0.00537534 best settings from before




[1]	valid_0's l2: 0.0765213
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's l2: 0.0700206
[3]	valid_0's l2: 0.0642298
[4]	valid_0's l2: 0.0588397
[5]	valid_0's l2: 0.0539443
[6]	valid_0's l2: 0.0496665
[7]	valid_0's l2: 0.0456841
[8]	valid_0's l2: 0.0420387
[9]	valid_0's l2: 0.0388143
[10]	valid_0's l2: 0.0358017
[11]	valid_0's l2: 0.0331023
[12]	valid_0's l2: 0.0306652
[13]	valid_0's l2: 0.0284459
[14]	valid_0's l2: 0.0264925
[15]	valid_0's l2: 0.0246686
[16]	valid_0's l2: 0.0230668
[17]	valid_0's l2: 0.0215824
[18]	valid_0's l2: 0.0202598
[19]	valid_0's l2: 0.0190033
[20]	valid_0's l2: 0.0178729
[21]	valid_0's l2: 0.0168186
[22]	valid_0's l2: 0.0158781
[23]	valid_0's l2: 0.0150231
[24]	valid_0's l2: 0.0142236
[25]	valid_0's l2: 0.013541
[26]	valid_0's l2: 0.0128717
[27]	valid_0's l2: 0.0123097
[28]	valid_0's l2: 0.0118021
[29]	valid_0's l2: 0.0113095
[30]	valid_0's l2: 0.0108627
[31]	valid_0's l2: 0.0104858
[32]	valid_0's l2: 0.0101096
[33]	valid_0's l2: 0

LGBMRegressor(bagging_fraction=0.5, boosting_type='gbdt', class_weight=None,
       colsample_bytree=1.0, early_stopping_rounds=50, learning_rate=0.05,
       max_depth=-1, min_child_samples=20, min_child_weight=0.001,
       min_split_gain=0.0, n_estimators=100, n_jobs=-1,
       num_iterations=1000, num_leaves=31, objective=None, random_state=0,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)

In [375]:
df['hour'] = df.index.hour

In [378]:
pd.get_dummies(df, columns=['hour']).head()

Unnamed: 0_level_0,"(1, VAR134)","(2, VAR134)","(3, VAR134)","(1, VAR157)","(2, VAR157)","(3, VAR157)","(1, VAR164)","(2, VAR164)","(3, VAR164)","(1, VAR165)","(2, VAR165)","(3, VAR165)","(1, VAR166)","(2, VAR166)","(3, VAR166)","(1, VAR167)","(2, VAR167)","(3, VAR167)","(1, VAR169)","(2, VAR169)","(3, VAR169)","(1, VAR175)","(2, VAR175)","(3, VAR175)","(1, VAR178)","(2, VAR178)","(3, VAR178)","(1, VAR228)","(2, VAR228)","(3, VAR228)","(1, VAR78)","(2, VAR78)","(3, VAR78)","(1, VAR79)","(2, VAR79)","(3, VAR79)",1,2,3,split,hour_0,hour_1,hour_2,hour_3,hour_4,hour_5,hour_6,hour_7,hour_8,hour_9,hour_10,hour_11,hour_12,hour_13,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23
TIMESTAMP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1
2012-04-01 01:00:00,94843.625,94290.625,92667.625,60.2219,57.3743,57.0032,0.2446,0.191,0.2381,1.0393,0.8579,0.9862,-2.503,-2.8016,-3.1334,294.4485,294.3933,293.9036,2577830.0,2595302.0,2606438.0,1202532.0,1192092.0,1158284.0,2861797.0,2884677.0,2901861.0,0.0,0.0,0.0,0.002,0.0015,0.0035,0.0036,0.0054,0.0247,0.7541,0.6473,0.799,train,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2012-04-01 02:00:00,94757.9375,94217.6875,92622.6875,54.6786,57.1298,64.9608,0.4571,0.5171,0.5616,2.4829,2.5941,2.6884,-2.9933,-2.6532,-2.5749,295.6514,294.9624,293.1965,5356093.0,5374973.0,5352637.0,2446757.0,2420485.0,2341093.0,5949378.0,5984578.0,5980930.0,0.0,0.0,0.0008,0.0055,0.0183,0.0275,0.0336,0.1016,0.1405,0.555,0.6218,0.8175,train,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2012-04-01 03:00:00,94732.8125,94201.0625,92598.8125,61.2949,63.0761,66.7959,0.7714,0.7052,0.6616,3.3399,2.8814,2.8845,-1.9825,-1.6478,-1.8937,294.4546,293.8767,292.5925,7921788.0,7858492.0,7714300.0,3681336.0,3643656.0,3531304.0,8939176.0,8929064.0,8822632.0,0.0013,0.0013,0.0017,0.0301,0.0353,0.0324,0.132,0.1129,0.1366,0.4384,0.4727,0.5715,train,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2012-04-01 04:00:00,94704.0625,94156.0625,92542.0625,67.7753,62.9755,65.1776,0.9659,0.9934,0.9416,3.1061,3.0248,3.1757,-1.4461,-1.4968,-1.6399,293.2615,293.0713,292.1748,9860520.0,9719720.0,9619368.0,4921504.0,4885120.0,4740096.0,11331679.0,11258463.0,11178847.0,0.0025,0.0017,0.0021,0.0572,0.0645,0.032,0.1106,0.1078,0.0974,0.1454,0.0989,0.0638,train,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2012-04-01 05:00:00,94675.0,94124.25,92508.0,70.173,65.0538,65.7443,0.9447,0.9376,0.9447,2.6011,2.5216,2.6973,-1.9045,-1.8366,-1.8801,292.7329,292.5444,291.5845,11143097.0,11054009.0,11054009.0,6254380.0,6211372.0,6014412.0,13105558.0,13079318.0,13079958.0,0.0033,0.0021,0.0025,0.051,0.0595,0.0485,0.1896,0.1576,0.1638,0.112,0.1044,0.1037,train,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
lr_df = dfs['1'].copy()
for col in lr_df.columns.drop(['split', 'hour', 'label']):
    lr_df[col] = (lr_df[col] - lr_df[col].mean()) / lr_df[col].std()

m.fit(
    tmpdf[tmpdf.split == 'train'].drop(['label', 'split'], axis=1),
    tmpdf[tmpdf.split == 'train']['label'],
    eval_set=[(tmpdf[tmpdf.split == 'val'].drop(['label', 'split'], axis=1), tmpdf[tmpdf.split == 'val']['label'])],
#     categorical_feature=[col for col in tmpdf.columns if col.endswith('_is_zero')],# ['hour']
)
#[100]	valid_0's l2: 0.0116614
# [100]	valid_0's l2: 0.00536193 with hour
# [1000]	valid_0's l2: 0.00531883 1000 iterations, 0.01 learning rate


# [100]	valid_0's l2: 0.00574346 with hour as categorical

# [100]	valid_0's l2: 0.0141626  using dart
#[100]	valid_0's l2: 0.00814383 using dart with hour


In [1]:
sorted(list(zip(m.feature_importances_, dfs[k].columns.drop(['label', 'split']))), reverse=True)

NameError: name 'm' is not defined

In [418]:
sorted(list(zip(m.feature_importances_, dfs[k].columns.drop(['label', 'split']))), reverse=True)

[(71, 'VAR165'),
 (68, 'hour'),
 (65, 'VAR157'),
 (57, 'VAR79'),
 (54, 'VAR164'),
 (52, 'VAR166'),
 (38, 'VAR78'),
 (37, 'VAR167'),
 (30, 'VAR175'),
 (28, 'VAR134'),
 (26, 'VAR169'),
 (22, 'VAR228'),
 (20, 'VAR178')]

In [8]:
import hnswlib
import numpy as np
                                                                    
dim = len(meteo_cols)
num_elements = 20000

# Declaring index
p = hnswlib.Index(space = 'l2', dim = 24) # possible options are l2, cosine or ip
                                                                                                                                                                                                                                                                  
# Initing index - the maximum number of elements should be known beforehand
p.init_index(max_elements = num_elements, ef_construction = 200, M = 16)
x
# Element insertion (can be called several times):
p.add_items(data, data_labels)

# Controlling the recall by setting ef:
p.set_ef(50) # ef should always be > k

# Query dataset, k - number of closest elements (returns 2 numpy arrays)
labels, distances = p.knn_query(data, k = 1)


NameError: name 'meteo_cols' is not defined