In [1]:
import pandas as pd
from enefit_challenge.dataset.dataset import EnefitDataset
from enefit_challenge.models.lightgbm.lightgbm_forecaster import LightGBMForecaster

  from .autonotebook import tqdm as notebook_tqdm


## Training

In [2]:
# Project dataset helper; later cells call it to load the training frame
# (`load_enefit_training_data`) and to prepare the example test files
# (`prepare_enefit_new_data`).
dataset = EnefitDataset()

In [3]:
# Load the training frame through the dataset helper.
train_df = dataset.load_enefit_training_data()

# Print the (rows, columns) tuple, then preview the first five rows
# (5 is the default of `DataFrame.head`).
train_shape = train_df.shape
print(train_shape)
train_df.head(5)

(2017824, 80)


Unnamed: 0,county,is_business,product_type,target,is_consumption,datetime,data_block_id,row_id,prediction_unit_id,date,...,surface_solar_radiation_downwards_max_f,surface_solar_radiation_downwards_std_f,snowfall_min_f,snowfall_mean_f,snowfall_max_f,snowfall_std_f,total_precipitation_min_f,total_precipitation_mean_f,total_precipitation_max_f,total_precipitation_std_f
0,0,0,1,0.713,0,2021-09-01,0,0,0,2021-09-01,...,,,,,,,,,,
1,11,0,2,7.62,1,2021-09-01,0,89,44,2021-09-01,...,,,,,,,,,,
2,11,0,2,0.0,0,2021-09-01,0,88,44,2021-09-01,...,,,,,,,,,,
3,11,0,1,21.099,1,2021-09-01,0,87,43,2021-09-01,...,,,,,,,,,,
4,11,0,1,0.0,0,2021-09-01,0,86,43,2021-09-01,...,,,,,,,,,,


In [4]:
# Identifier / timestamp columns; the training cell passes them to the
# forecaster as part of `exclude_cols`.
not_feature_columns = [
    "datetime",
    "row_id",
    "prediction_unit_id",
    "date",
    "time",
    "data_block_id",
]

# Columns passed to the forecaster as `categorical_features`.
cat_columns = ["county", "product_type"]

# Further columns excluded from training (appended to `not_feature_columns`
# when building `exclude_cols`). Order is kept as originally authored.
to_drop_cols = [
    "10_metre_u_wind_component_mean_f",
    "10_metre_v_wind_component_min_f",
    "cloudcover_low_mean_f",
    "dayofweek_sine",
    "direct_solar_radiation_max_f",
    "eic_count",
    "euros_per_mwh",  # TODO: unsure whether this one should be dropped
    "hour_sine",
    "temperature_min_f",
    "total_precipitation_max_f",
    "week_sine",
    "10_metre_u_wind_component_min_f",
    "10_metre_v_wind_component_max_f",
    "10_metre_v_wind_component_std_f",
    "cloudcover_high_mean_f",
    "cloudcover_high_std_f",
    "cloudcover_low_min_f",
    "cloudcover_low_std_f",
    "cloudcover_mid_std_f",
    "cloudcover_total_std_f",
    "county_12",  # TODO: find a way to drop
    "county_3",  # TODO: find a way to drop
    "county_9",  # TODO: find a way to drop
    "direct_solar_radiation_min_f",
    "direct_solar_radiation_std_f",
    "highest_price_per_mwh",  # TODO: unsure whether this one should be dropped
    # Open question: should "installed_capacity" also be dropped? (currently kept)
    "month_cosine",
    "product_type_3",  # TODO: find a way to drop
    "snowfall_max_f",
    "temperature_std_f",
    "total_precipitation_min_f",
    "total_precipitation_std_f",
    # Candidate that is currently kept: "year"
]

In [5]:
# Create the forecaster and fit it on the training frame.
lgbf = LightGBMForecaster()

# Everything that must not be used as a feature: identifier columns plus the
# manually selected drop list.
excluded_columns = not_feature_columns + to_drop_cols

# NOTE(review): n_trials=1 presumably limits hyper-parameter search to a single
# trial (the captured log shows only "Trial 0") — confirm against
# LightGBMForecaster.train_model.
lgbf.train_model(
    train_df=train_df,
    target_col="target",
    exclude_cols=excluded_columns,
    categorical_features=cat_columns,
    n_trials=1,
)

[I 2023-12-30 11:50:19,034] A new study created in memory with name: lightgbm
2023/12/30 11:50:19 INFO mlflow.tracking.fluent: Experiment with name 'lightgbm' does not exist. Creating a new experiment.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006870 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8120
[LightGBM] [Info] Number of data points in the train set: 551778, number of used features: 59
[LightGBM] [Info] Start training from score 242.074828
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022061 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8332
[LightGBM] [Info] Number of data points in the train set: 1042444, number of used features: 59
[LightGBM] [Info] Start training from score 247.723599
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.035184 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is no

[I 2023-12-30 11:50:32,918] Trial 0 finished with value: 85.00109526111159 and parameters: {'n_estimators': 121, 'boosting_type': 'gbdt', 'eta': 0.11957168053633911, 'max_depth': 2, 'min_child_weight': 0.38333321561566636, 'colsample_bytree': 0.2738969595234697, 'subsample': 0.927727492754704, 'lambda': 7.155682161754866, 'alpha': 0.03417952912061012}. Best is trial 0 with value: 85.00109526111159.


## Testing 

In [6]:
# Directory holding the example test files. Defined once so the data location
# can be changed in a single place instead of in six separate paths.
TEST_FILES_DIR = "../../input/example_test_files"

# Raw example inputs, read as-is from CSV.
test = pd.read_csv(f"{TEST_FILES_DIR}/test.csv")
revealed_targets = pd.read_csv(f"{TEST_FILES_DIR}/revealed_targets.csv")
client = pd.read_csv(f"{TEST_FILES_DIR}/client.csv")
ee = pd.read_csv(f"{TEST_FILES_DIR}/electricity_prices.csv")  # electricity prices
gas = pd.read_csv(f"{TEST_FILES_DIR}/gas_prices.csv")
# The weather forecast is the only file whose datetime columns are parsed on read.
fcst_weather = pd.read_csv(
    f"{TEST_FILES_DIR}/forecast_weather.csv",
    parse_dates=["origin_datetime", "forecast_datetime"],
)

# Build the prediction frame from the raw inputs; the training frame is also
# passed in (NOTE(review): presumably for history-based features — confirm
# against EnefitDataset.prepare_enefit_new_data).
test_df = dataset.prepare_enefit_new_data(
    new_df=test,
    revealed_targets=revealed_targets,
    df_client=client,
    df_electricity=ee,
    df_gas=gas,
    df_weather_fc=fcst_weather,
    train_df=train_df,
)

# Print the (rows, columns) tuple, then preview the first rows.
print(test_df.shape)
test_df.head()

(12480, 80)


Unnamed: 0,county,is_business,product_type,is_consumption,datetime,data_block_id,row_id,prediction_unit_id,date,time,...,snowfall_min_f,snowfall_mean_f,snowfall_max_f,snowfall_std_f,total_precipitation_min_f,total_precipitation_mean_f,total_precipitation_max_f,total_precipitation_std_f,target_1_days_ago,target_7_days_ago
0,0,0,1,0,2023-05-28,634,2005872,0,2023-05-28,00:00:00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.821,3.77
1,0,0,1,1,2023-05-28,634,2005873,0,2023-05-28,00:00:00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,537.429,588.634
2,0,0,2,0,2023-05-28,634,2005874,1,2023-05-28,00:00:00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,2,1,2023-05-28,634,2005875,1,2023-05-28,00:00:00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.593,3.817
4,0,0,3,0,2023-05-28,634,2005876,2,2023-05-28,00:00:00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.473,19.662


In [7]:
# Score the prepared test frame with the trained forecaster.
# NOTE(review): use_best_from_run=True presumably loads the best model from the
# tracked run (the output shows artifacts being downloaded) — confirm.
preds = lgbf.predict(input_data=test_df, use_best_from_run=True)

# NOTE(review): the displayed predictions include negative values; if the
# target cannot be negative, consider clipping at zero downstream — confirm.
preds_frame = pd.DataFrame(preds)
preds_frame

Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 190.10it/s] 
Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 325.29it/s]
Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 247.03it/s]


Unnamed: 0,0
0,53.472995
1,611.261178
2,-21.145550
3,51.744400
4,-171.629982
...,...
12475,389.833871
12476,-18.199176
12477,92.124877
12478,43.240886
