In [1]:
# Offline install: --no-index disables the package index, so pvlib is resolved only from
# the wheel directory passed to --find-links (a directory under /kaggle/input).
!pip install --no-index --find-links /kaggle/input/pvlib-dependency/package pvlib

Looking in links: /kaggle/input/pvlib-dependency/package
Processing /kaggle/input/pvlib-dependency/package/pvlib-0.10.3-py3-none-any.whl
Installing collected packages: pvlib
Successfully installed pvlib-0.10.3


In [2]:
import gc  # Garbage collection for memory management
import os  # Operating system-related functions
import time  # Time-related functions
import warnings  # Handling warnings
from itertools import combinations, product  # For creating combinations of elements
from warnings import simplefilter  # Simplifying warning handling

# 📦 Importing machine learning libraries
import joblib  # For saving and loading models
import lightgbm as lgb  # LightGBM gradient boosting framework
import numpy as np  # Numerical operations
import pandas as pd  # Data manipulation and analysis
from sklearn.metrics import mean_absolute_error  # Metric for evaluation
from sklearn.model_selection import KFold, TimeSeriesSplit  # Cross-validation techniques
from catboost import CatBoostRegressor, Pool
from catboost import EShapCalcType, EFeaturesSelectionAlgorithm

from tqdm.notebook import tqdm  # Progress bar
import seaborn as sns
import matplotlib.pyplot as plt
import numba
import plotly.express as px
import math
import itertools
import holidays
import pvlib

# Silence every warning category for the rest of the session to keep cell output short.
# NOTE(review): this blanket filter may also hide pandas deprecation / chained-assignment
# warnings emitted by the feature code in later cells.
warnings.filterwarnings("ignore")
# Explicitly ignore pandas PerformanceWarning as well.
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

In [3]:
# Notebook run-mode switches (consumed by later cells).
IS_OFFLINE, IS_SUBMIT = False, True
IS_TRAIN, IS_GENFEATS = False, True
K_FOLD = False

# Which model families are enabled.
IS_CONS_MODEL = False
IS_CONS_DIFF_MODEL = True
IS_CONS_DIFF7_MODEL = False

# Numeric weight attached to each model name
# (presumably blending weights - confirm against the cell that reads this dict).
weights2name = {
    'cons-0': 0,
    'cons-1': 0,
    'diff-cons-0': 1,
    'diff-cons-1': 0.5,
    'diff7-cons-0': 0,
    'diff7-cons-1': 0,
}

split_data_block = 500
# Run label: the offline label embeds the split block id, otherwise plain 'online'.
mode = f'offline-{split_data_block}' if IS_OFFLINE else 'online'

In [4]:
root = "/kaggle/input/predict-energy-behavior-of-prosumers"


def _read_input_csv(file_name):
    """Read one CSV file located directly under ``root`` into a DataFrame."""
    return pd.read_csv(os.path.join(root, file_name))


# Every table is loaded unconditionally, whatever the notebook mode flags say.
df_data = _read_input_csv("train.csv")
df_client = _read_input_csv("client.csv")
df_gas_prices = _read_input_csv("gas_prices.csv")
df_electricity_prices = _read_input_csv("electricity_prices.csv")
df_forecast_weather = _read_input_csv("forecast_weather.csv")
df_historical_weather = _read_input_csv("historical_weather.csv")
df_weather_station_to_county_mapping = _read_input_csv("weather_station_to_county_mapping.csv")

# Data Analysis

In [5]:
def plot_data_groupby(df, x, y, groupby, n_col=4):
    """Draw one line plot of ``y`` against ``x`` per group on a grid of subplots.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. It is left untouched: the datetime parsing of ``x`` happens
        on a derived frame, not on the caller's object.
    x : str
        Column shown on the horizontal axis. When the name contains ``'date'`` the
        column is parsed with ``pd.to_datetime`` first.
    y : str
        Column shown on the vertical axis.
    groupby : str or list of str
        Key(s) passed to ``DataFrame.groupby``; every group gets its own panel.
    n_col : int, default 4
        Number of panels per grid row.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    x_values = pd.to_datetime(df[x]) if 'date' in x else df[x]
    grouped = df.assign(**{x: x_values}).groupby(groupby)
    num_of_groups = len(grouped)
    if num_of_groups == 0:
        # Nothing to draw; a grid with zero rows cannot be created.
        return
    n_row = math.ceil(num_of_groups / n_col)
    # squeeze=False keeps `axs` two-dimensional even for a single row or column.
    fig, axs = plt.subplots(n_row, n_col, figsize=(24, 4 * n_row), squeeze=False)
    for i, (group_id, group) in enumerate(grouped):
        ax = axs[i // n_col, i % n_col]
        ax.plot(group[x], group[y])
        ax.set_title(str(groupby) + ': ' + str(group_id) + ' | n=' + str(len(group)), fontsize=10)
    plt.show()

# Data Preparation

In [6]:
class ClientCompiler:
    """Accumulates client blocks and exposes them as merge-ready feature tables.

    Two tables are maintained:

    * ``client_last`` - one row per segment code
      ``C_{county}|PT_{product_type}|B_{is_business}`` (16 counties x 4 product types x
      2 business flags), overwritten with the newest non-missing values on every
      ``update``. Rows of segments never seen stay all-NaN.
    * ``client_relevant`` - the rows of every recent block, stamped with the ``date`` of
      the block they arrived in; used for day-lagged joins.
    """

    def __init__(self, days_to_save=30):
        """Create the empty tables.

        Parameters
        ----------
        days_to_save : int, default 30
            On ``update``, rows of ``client_relevant`` whose ``date`` lies
            ``days_to_save`` days or more before the newest block are discarded.
        """
        self.days_to_save = days_to_save
        all_codes = [
            f'C_{county}|PT_{product_type}|B_{is_business}'
            for county, product_type, is_business
            in itertools.product(range(16), range(4), range(2))
        ]
        self.client_last = pd.DataFrame({'product_type': pd.Series(dtype='int'),
                                         'county': pd.Series(dtype='int'),
                                         'eic_count': pd.Series(dtype='float'),
                                         'installed_capacity': pd.Series(dtype='float'),
                                         'is_business': pd.Series(dtype='int')},
                                        index=all_codes)
        self.client_relevant = pd.DataFrame({'product_type': pd.Series(dtype='int'),
                                             'county': pd.Series(dtype='int'),
                                             'eic_count': pd.Series(dtype='float'),
                                             'installed_capacity': pd.Series(dtype='float'),
                                             'is_business': pd.Series(dtype='int'),
                                             # Holds timestamps, so declare it as such
                                             # instead of a generic object column.
                                             'date': pd.Series(dtype='datetime64[ns]'),
                                            })

    def update(self, client_block):
        """Fold one client block into ``client_last`` and ``client_relevant``.

        Parameters
        ----------
        client_block : pandas.DataFrame
            Must contain ``county``, ``product_type``, ``is_business`` and ``date``;
            ``data_block_id`` is dropped when present. The first ``date`` value is
            used as the date of the whole block. The frame is not modified.
        """
        cols_to_int = ['is_business', 'product_type', 'county']
        # astype returns a new frame, so nothing below writes into the caller's object.
        client_block = client_block.astype({col: int for col in cols_to_int})
        date_value = pd.to_datetime(client_block['date'].iloc[0])
        # Vectorised construction of the segment code used as index of `client_last`.
        client_block['code'] = ('C_' + client_block['county'].astype(str)
                                + '|PT_' + client_block['product_type'].astype(str)
                                + '|B_' + client_block['is_business'].astype(str))
        client_block = client_block.set_index('code').drop(columns=['date', 'data_block_id'], errors='ignore')
        # DataFrame.update aligns on the code index and copies non-missing values.
        self.client_last.update(client_block)
        client_block['date'] = date_value
        self.client_relevant = pd.concat([self.client_relevant, client_block], ignore_index=True)
        expired = self.client_relevant['date'] <= date_value - pd.Timedelta(days=self.days_to_save)
        self.client_relevant = self.client_relevant.loc[~expired]

    def add_client(self, data):
        """Left-join the latest known client attributes onto ``data``.

        The join keys are ``county``, ``product_type`` and ``is_business``; a new
        frame is returned.
        """
        return pd.merge(data, self.client_last, on=['county', 'product_type', 'is_business'], how='left')

    def add_client_daylag(self, data, days):
        """Left-join the client rows recorded ``days`` days before each row's date.

        Parameters
        ----------
        data : pandas.DataFrame
            Must contain ``datetime`` plus the three segment key columns. It is not
            modified (the temporary ``date`` join key lives on a derived frame).
            Assumes tz-naive timestamps - TODO confirm against the caller.
        days : int
            Lag in days. Columns that clash with existing ones get the suffix
            ``__lag_{days}d``.

        Returns
        -------
        pandas.DataFrame
            ``data`` with the lagged client columns appended.
        """
        client_block = self.client_relevant.copy()
        client_block['date'] = client_block['date'] + pd.Timedelta(days=days)
        keyed = data.assign(date=pd.to_datetime(data['datetime']).dt.normalize())
        merged = pd.merge(keyed, client_block, on=['date', 'county', 'product_type', 'is_business'],
                          how='left', suffixes=['', f'__lag_{days}d'])
        return merged.drop(columns=['date'])

In [7]:
class HistoricalWeatherCompiler:
    """Rolling store of observed weather, averaged per county and over all stations.

    ``historical_relevant`` holds one row per (county, datetime) and
    ``historical_allsta`` one row per datetime (mean over every station). Both keep
    only rows newer than ``days_to_save`` days relative to the latest block.
    The ``add_*`` methods left-join time-shifted copies of these tables onto a
    feature frame.
    """

    # Columns that every lag method renames explicitly after the merge, appending the
    # lag suffix. Presumably `data` never carries same-named columns, so the merge
    # suffix alone would leave them unsuffixed - confirm against the caller.
    _PLAIN_COLUMNS = ('shortwave_radiation', 'diffuse_radiation', 'surface_pressure', 'rain')

    def __init__(self, days_to_save=30):
        """Create empty stores and the station -> county lookup.

        Reads the module-level ``df_weather_station_to_county_mapping`` frame.
        Coordinates are stored as integers of (degrees * 10) so they can be joined
        exactly.
        """
        self.days_to_save = days_to_save
        self.historical_relevant = pd.DataFrame(columns=[
                'county', 'datetime', 'temperature', 'dewpoint', 'snowfall', 'cloudcover_high',
                 'cloudcover_mid', 'cloudcover_low', 'cloudcover_total',
                 'windspeed_total', 'u_wind_component', 'v_wind_component', 'winddirection',
                 'shortwave_radiation', 'direct_solar_radiation', 'diffuse_radiation',
                 'surface_pressure', 'rain',
                 'solar_angle_hor_cos', 'solar_angle_ideal_cos'
                ])

        self.historical_allsta = pd.DataFrame(columns=[
                 'datetime', 'temperature', 'dewpoint', 'snowfall', 'cloudcover_high',
                 'cloudcover_mid', 'cloudcover_low', 'cloudcover_total',
                 'windspeed_total', 'u_wind_component', 'v_wind_component', 'winddirection',
                 'shortwave_radiation', 'direct_solar_radiation', 'diffuse_radiation',
                 'surface_pressure', 'rain',
                 'solar_angle_hor_cos', 'solar_angle_ideal_cos'
                ])

        # Explicit copy: the scaled coordinates are written to this private frame,
        # not to a slice of the shared mapping table.
        self.county_mapping = df_weather_station_to_county_mapping[['longitude', 'latitude', 'county']].copy()
        self.county_mapping['longitude'] = (self.county_mapping['longitude'] * 10).round(0).astype(int)
        self.county_mapping['latitude'] = (self.county_mapping['latitude'] * 10).round(0).astype(int)

    @staticmethod
    def _select(frame, key_cols, feats):
        """Return a private copy of ``frame`` restricted to ``key_cols`` + ``feats``.

        ``feats=None`` keeps every column. Duplicates are removed while preserving
        order, so the resulting column order is deterministic.
        """
        if feats is None:
            return frame.copy()
        return frame[list(dict.fromkeys(list(key_cols) + list(feats)))].copy()

    @classmethod
    def _suffix_plain_columns(cls, data, suffix):
        """Append ``suffix`` to the columns listed in ``_PLAIN_COLUMNS`` (missing ones are ignored)."""
        return data.rename(columns={name: name + suffix for name in cls._PLAIN_COLUMNS})

    def _drop_expired(self, frame, newest):
        """Return ``frame`` without rows dated ``days_to_save`` days or more before ``newest``."""
        cutoff = newest - pd.Timedelta(days=self.days_to_save)
        return frame.loc[~(frame['datetime'] <= cutoff)]

    def update(self, historical_block):
        """Derive features from one block of station observations and store them.

        Parameters
        ----------
        historical_block : pandas.DataFrame
            Station-level rows with at least ``datetime``, ``latitude``,
            ``longitude``, ``temperature``, ``windspeed_10m`` and
            ``winddirection_10m``. The frame is not modified; the method works on a
            copy because it overwrites ``datetime`` and rescales the coordinates,
            which would otherwise make a second call on the same frame fail to match
            any county.
        """
        historical_block = historical_block.copy()
        date_value = pd.to_datetime(historical_block['datetime'].iloc[0])

        # HISTORICAL
        historical_block['datetime'] = pd.to_datetime(historical_block['datetime'])
        cols = pvlib.solarposition.get_solarposition(historical_block['datetime'],
                                                     historical_block['latitude'],
                                                     historical_block['longitude'],
                                                     temperature=historical_block['temperature']
                                                    )[['zenith', 'azimuth']].reset_index()
        # Cosine of the sun's angle of incidence on two reference surfaces; presumably
        # aoi(0, 180, ...) is a horizontal plane and aoi(41, 180, ...) a 41-degree tilted
        # one - confirm the argument order against the pvlib documentation.
        historical_block['solar_angle_hor_cos'] = np.cos(np.radians(pvlib.irradiance.aoi(0, 180, cols['zenith'], cols['azimuth']).values))
        historical_block['solar_angle_ideal_cos'] = np.cos(np.radians(pvlib.irradiance.aoi(41, 180, cols['zenith'], cols['azimuth']).values))

        # Same integer (degrees * 10) encoding as `county_mapping`, for an exact join.
        historical_block['longitude'] = (historical_block['longitude'] * 10).round(0).astype(int)
        historical_block['latitude'] = (historical_block['latitude'] * 10).round(0).astype(int)
        historical_block = historical_block.merge(self.county_mapping, on=['longitude', 'latitude'], how='left')
        historical_block['windspeed_total'] = historical_block['windspeed_10m']
        historical_block['u_wind_component'] = historical_block['windspeed_10m'] * np.sin(historical_block['winddirection_10m'] / 180 * np.pi)
        historical_block['v_wind_component'] = historical_block['windspeed_10m'] * np.cos(historical_block['winddirection_10m'] / 180 * np.pi)
        historical_block['winddirection'] = np.sin(historical_block['winddirection_10m'] / 180 * np.pi)

        # Mean over all stations per timestamp, and mean per (county, timestamp).
        historical_block_avg = historical_block.drop(columns=['county']).groupby(['datetime']).mean().reset_index()
        historical_block = historical_block.groupby(['county', 'datetime']).mean().reset_index()

        raw_columns = ['latitude', 'longitude', 'windspeed_10m', 'winddirection_10m', 'data_block_id']
        historical_block_avg = historical_block_avg.drop(columns=raw_columns, errors='ignore')
        historical_block = historical_block.drop(columns=raw_columns, errors='ignore')

        self.historical_allsta = self._drop_expired(
            pd.concat([self.historical_allsta, historical_block_avg], ignore_index=True), date_value)
        self.historical_relevant = self._drop_expired(
            pd.concat([self.historical_relevant, historical_block], ignore_index=True), date_value)

    def add_weather_daylag(self, data, days, feats=None): #  historical data is used
        """Left-join per-county observations from ``days`` days earlier onto ``data``.

        Parameters
        ----------
        data : pandas.DataFrame
            Must contain ``county`` and ``datetime``.
        days : int
            Lag in days, at least 1. For ``days == 1`` only observations taken at
            hour 10 or earlier are used.
        feats : list of str, optional
            Subset of stored columns to join; all columns when omitted.

        Returns
        -------
        pandas.DataFrame
            New frame; clashing columns get the suffix ``__lag_{days}d``.
        """
        assert days >= 1
        historical_block = self._select(self.historical_relevant, ['datetime', 'county'], feats)
        if days == 1:
            historical_block = historical_block[pd.to_datetime(historical_block['datetime']).dt.hour <= 10]
        historical_block = historical_block.assign(datetime=historical_block['datetime'] + pd.Timedelta(days=days))
        data = pd.merge(data, historical_block, on=['county', 'datetime'], how='left', suffixes=['', f'__lag_{days}d'])
        return self._suffix_plain_columns(data, f'__lag_{days}d')

    def add_weather_allsta_daylag(self, data, days, feats=None): #  historical data is used
        """Same as ``add_weather_daylag`` but joins the all-station mean on ``datetime`` only.

        Clashing columns get the suffix ``__lag_{days}d_allsta``.
        """
        assert days >= 1
        historical_block = self._select(self.historical_allsta, ['datetime'], feats)
        if days == 1:
            historical_block = historical_block[pd.to_datetime(historical_block['datetime']).dt.hour <= 10]
        historical_block = historical_block.assign(datetime=historical_block['datetime'] + pd.Timedelta(days=days))
        data = pd.merge(data, historical_block, on=['datetime'], how='left', suffixes=['', f'__lag_{days}d_allsta'])
        return self._suffix_plain_columns(data, f'__lag_{days}d_allsta')

    def add_weather_daylag_daymean(self, data, days, feats=None):
        """Left-join the per-county daily mean of observations from ``days`` days earlier.

        Parameters
        ----------
        data : pandas.DataFrame
            Must contain ``county`` and ``datetime``. It is not modified: the
            temporary ``date`` join key is created on a derived frame.
            Assumes tz-naive timestamps - TODO confirm against the caller.
        days : int
            Lag in days, at least 2.
        feats : list of str, optional
            Subset of stored columns to average; all columns when omitted.

        Returns
        -------
        pandas.DataFrame
            New frame; clashing columns get the suffix ``__meanlag_{days}d``.
        """
        assert days >= 2
        historical_block = self._select(self.historical_relevant, ['datetime', 'county'], feats)
        shifted = pd.to_datetime(historical_block['datetime']) + pd.Timedelta(days=days)
        historical_block = historical_block.drop(columns=['datetime']).assign(date=shifted.dt.normalize())
        historical_block = historical_block.groupby(['date', 'county']).mean().reset_index()
        keyed = data.assign(date=pd.to_datetime(data['datetime']).dt.normalize())
        data = pd.merge(keyed, historical_block, on=['county', 'date'], how='left', suffixes=['', f'__meanlag_{days}d']).drop(columns=['date'])
        return self._suffix_plain_columns(data, f'__meanlag_{days}d')

In [8]:
class ForecastedWeatherCompiler:
    """Rolling store of weather forecasts, averaged per county and over all stations.

    ``forecasted_relevant`` holds one row per (county, datetime) and
    ``forecasted_allsta`` one row per datetime (mean over every station). Both keep
    only rows newer than ``days_to_save`` days relative to the latest block.
    """

    def __init__(self, days_to_save=30):
        """Create empty stores and the station -> county lookup.

        Reads the module-level ``df_weather_station_to_county_mapping`` frame.
        Coordinates are stored as integers of (degrees * 10) so they can be joined
        exactly.
        """
        self.days_to_save = days_to_save

        self.forecasted_relevant = pd.DataFrame({   'datetime': pd.Series(dtype='object'),
                                                    'county': pd.Series(dtype='float'),
                                                    'temperature': pd.Series(dtype='float'),
                                                    'dewpoint': pd.Series(dtype='float'),
                                                    'cloudcover_high': pd.Series(dtype='float'),
                                                    'cloudcover_low': pd.Series(dtype='float'),
                                                    'cloudcover_mid': pd.Series(dtype='float'),
                                                    'cloudcover_total': pd.Series(dtype='float'),
                                                    'direct_solar_radiation': pd.Series(dtype='float'),
                                                    'surface_solar_radiation_downwards': pd.Series(dtype='float'),
                                                    'snowfall': pd.Series(dtype='float'),
                                                    'total_precipitation': pd.Series(dtype='float'),
                                                    'windspeed_total': pd.Series(dtype='float'),
                                                    'u_wind_component': pd.Series(dtype='float'),
                                                    'v_wind_component': pd.Series(dtype='float'),
                                                    'winddirection': pd.Series(dtype='float'),
                                                    'solar_angle_hor_cos': pd.Series(dtype='float'),
                                                    'solar_angle_ideal_cos': pd.Series(dtype='float'),
                                                }) #  'origin_datetime', 'hours_ahead'

        self.forecasted_allsta = self.forecasted_relevant.drop(columns=['county'])

        # Explicit copy: the scaled coordinates are written to this private frame,
        # not to a slice of the shared mapping table.
        self.county_mapping = df_weather_station_to_county_mapping[['longitude', 'latitude', 'county']].copy()
        self.county_mapping['longitude'] = (self.county_mapping['longitude'] * 10).round(0).astype(int)
        self.county_mapping['latitude'] = (self.county_mapping['latitude'] * 10).round(0).astype(int)

    @staticmethod
    def _select(frame, key_cols, feats):
        """Return a private copy of ``frame`` restricted to ``key_cols`` + ``feats``.

        ``feats=None`` keeps every column. Duplicates are removed while preserving
        order, so the resulting column order is deterministic.
        """
        if feats is None:
            return frame.copy()
        return frame[list(dict.fromkeys(list(key_cols) + list(feats)))].copy()

    def _drop_expired(self, frame, newest):
        """Return ``frame`` without rows dated ``days_to_save`` days or more before ``newest``."""
        cutoff = newest - pd.Timedelta(days=self.days_to_save)
        return frame.loc[~(frame['datetime'] <= cutoff)]

    def update(self, forecasted_block, do_mean=False):
        """Derive features from one block of station forecasts and store them.

        Parameters
        ----------
        forecasted_block : pandas.DataFrame
            Station-level rows with at least ``forecast_datetime``, ``latitude``,
            ``longitude``, ``temperature``, ``hours_ahead``,
            ``10_metre_u_wind_component`` and ``10_metre_v_wind_component``.
            The frame is not modified; the method works on a copy because it adds
            columns and rescales the coordinates.
        do_mean : bool, default False
            When False only rows with ``22 <= hours_ahead < 46`` are kept. When True
            every horizon is kept and horizons sharing a target ``datetime`` are
            averaged together by the group-by below.
        """
        forecasted_block = forecasted_block.copy()

        # FORECASTED
        forecasted_block['datetime'] = pd.to_datetime(forecasted_block['forecast_datetime'])
        # Retention is anchored on the first forecast target time of the block.
        date_value = pd.to_datetime(forecasted_block['datetime'].iloc[0])

        cols = pvlib.solarposition.get_solarposition(forecasted_block['datetime'],
                                                     forecasted_block['latitude'],
                                                     forecasted_block['longitude'],
                                                     temperature=forecasted_block['temperature']
                                                    )[['zenith', 'azimuth']].reset_index()  # indexes = datetime
        # Cosine of the sun's angle of incidence on two reference surfaces; presumably
        # aoi(0, 180, ...) is a horizontal plane and aoi(41, 180, ...) a 41-degree tilted
        # one - confirm the argument order against the pvlib documentation.
        forecasted_block['solar_angle_hor_cos'] = np.cos(np.radians(pvlib.irradiance.aoi(0, 180, cols['zenith'], cols['azimuth']).values))
        forecasted_block['solar_angle_ideal_cos'] = np.cos(np.radians(pvlib.irradiance.aoi(41, 180, cols['zenith'], cols['azimuth']).values))

        # Same integer (degrees * 10) encoding as `county_mapping`, for an exact join.
        forecasted_block['longitude'] = (forecasted_block['longitude'] * 10).round(0).astype(int)
        forecasted_block['latitude'] = (forecasted_block['latitude'] * 10).round(0).astype(int)
        forecasted_block = forecasted_block.merge(self.county_mapping, on=['longitude', 'latitude'], how='left')
        forecasted_block['windspeed_total'] = (forecasted_block['10_metre_u_wind_component']**2 + forecasted_block['10_metre_v_wind_component']**2)**0.5
        forecasted_block['u_wind_component'] = forecasted_block['10_metre_u_wind_component']
        forecasted_block['v_wind_component'] = forecasted_block['10_metre_v_wind_component']
        # Normalised u component; division by a zero wind speed yields NaN/inf as before.
        forecasted_block['winddirection'] = forecasted_block['10_metre_u_wind_component'] / forecasted_block['windspeed_total']
        if not do_mean:
            forecasted_block = forecasted_block[(forecasted_block['hours_ahead'] >= 22) & (forecasted_block['hours_ahead'] < 46)]
        forecasted_block = forecasted_block.drop(columns=['latitude', 'longitude', 'forecast_datetime', 'origin_datetime',
                                                          '10_metre_u_wind_component', '10_metre_v_wind_component', 'data_block_id'],
                                                 errors='ignore')
        forecasted_block = forecasted_block.drop(columns=['hours_ahead'])
        # Mean over all stations per timestamp, and mean per (county, timestamp).
        forecasted_block_avg = forecasted_block.drop(columns=['county']).groupby(['datetime']).mean().reset_index()
        forecasted_block = forecasted_block.groupby(['county', 'datetime']).mean().reset_index()

        self.forecasted_relevant = self._drop_expired(
            pd.concat([self.forecasted_relevant, forecasted_block], ignore_index=True), date_value)
        self.forecasted_allsta = self._drop_expired(
            pd.concat([self.forecasted_allsta, forecasted_block_avg], ignore_index=True), date_value)

    def add_weather(self, data) :
        """Join the per-county forecast valid at each row's ``datetime`` (zero-hour lag)."""
        return self.add_weather_hourlag(data, 0)

    def add_weather_allsta(self, data) :
        """Join the all-station mean forecast valid at each row's ``datetime`` (zero-hour lag)."""
        return self.add_weather_allsta_hourlag(data, 0)

    def add_weather_hourlag(self, data, hours, feats=None): #  forecasted data is used
        """Left-join the per-county forecast valid ``hours`` hours before each row.

        Parameters
        ----------
        data : pandas.DataFrame
            Must contain ``county`` and ``datetime``.
        hours : int
            Shift applied to the stored forecast times before joining.
        feats : list of str, optional
            Subset of stored columns to join; all columns when omitted.

        Returns
        -------
        pandas.DataFrame
            New frame; clashing columns get the suffix ``__lag_{hours}h``.
        """
        forecasted_block = self._select(self.forecasted_relevant, ['datetime', 'county'], feats)
        forecasted_block['datetime'] = forecasted_block['datetime'] + pd.Timedelta(hours=hours)
        return pd.merge(data, forecasted_block, on=['county', 'datetime'], how='left', suffixes=['', f'__lag_{hours}h'])

    def add_weather_allsta_hourlag(self, data, hours, feats=None): #  forecasted data is used
        """Same as ``add_weather_hourlag`` but joins the all-station mean on ``datetime`` only.

        Clashing columns get the suffix ``__lag_{hours}h_allsta``.
        """
        forecasted_block = self._select(self.forecasted_allsta, ['datetime'], feats)
        forecasted_block['datetime'] = forecasted_block['datetime'] + pd.Timedelta(hours=hours)
        return pd.merge(data, forecasted_block, on=['datetime'], how='left', suffixes=['', f'__lag_{hours}h_allsta'])


In [9]:
class GasCompiler:
    """Rolling history of gas prices, one stored row per ``update`` call."""

    def __init__(self, days_to_save=30):
        """Start with an empty history that retains roughly ``days_to_save`` days."""
        self.days_to_save = days_to_save
        self.gas_relevant = pd.DataFrame(columns=['date', 'gas_price_low', 'gas_price_high'])

    def update(self, gas_block):
        """Append the last row of ``gas_block`` and prune rows that became too old.

        ``forecast_date`` / ``lowest_price_per_mwh`` / ``highest_price_per_mwh`` are
        stored as ``date`` / ``gas_price_low`` / ``gas_price_high``; ``origin_date``
        and ``data_block_id`` are discarded when present.
        """
        column_names = {'forecast_date': 'date',
                        'lowest_price_per_mwh': 'gas_price_low',
                        'highest_price_per_mwh': 'gas_price_high'}
        newest_row = (
            gas_block.iloc[[-1]]
            .drop(columns=['origin_date', 'data_block_id'], errors='ignore')
            .rename(columns=column_names)
        )
        self.gas_relevant = pd.concat([self.gas_relevant, newest_row], ignore_index=True)
        cutoff = pd.to_datetime(newest_row['date'].iloc[0]) - pd.Timedelta(days=self.days_to_save)
        too_old = pd.to_datetime(self.gas_relevant['date']) <= cutoff
        self.gas_relevant.drop(self.gas_relevant[too_old].index, axis=0, inplace=True)

    def add_gas(self, data):
        """Write the most recently stored prices into ``data`` (lag of zero rows)."""
        return self.add_gas_daylag(data, 0)

    def add_gas_daylag(self, data, days):
        """Write the prices stored ``days`` rows before the newest one into ``data``.

        The columns ``gas_price_low__lag_{days}d`` and ``gas_price_high__lag_{days}d``
        are added to ``data`` in place with one constant value each (NaN when the
        history holds ``days`` rows or fewer). ``data`` itself is returned.
        """
        low_col = f'gas_price_low__lag_{days}d'
        high_col = f'gas_price_high__lag_{days}d'
        if days >= len(self.gas_relevant):
            data[low_col] = np.nan
            data[high_col] = np.nan
            return data
        # Position counted from the end of the history: 0 -> newest row.
        lagged_row = self.gas_relevant.iloc[-days - 1, :]
        data[low_col] = lagged_row['gas_price_low']
        data[high_col] = lagged_row['gas_price_high']
        return data

In [10]:
class ElectricityCompiler:
    """Rolling history of electricity prices, one stored row per ``update`` call."""

    def __init__(self, days_to_save=30):
        """Start with an empty history that retains roughly ``days_to_save`` days."""
        self.days_to_save = days_to_save
        self.electricity_relevant = pd.DataFrame(columns=['date', 'electricity_price'])

    def update(self, electricity_block):
        """Append the block's mean price as one row and prune rows that became too old.

        The stored row is the last row of ``electricity_block`` with ``euros_per_mwh``
        replaced by the mean over the whole block; ``forecast_date`` /
        ``euros_per_mwh`` are stored as ``date`` / ``electricity_price``.
        ``origin_date`` and ``data_block_id`` are discarded when present.
        """
        trimmed = electricity_block.drop(columns=['origin_date', 'data_block_id'], errors='ignore')
        block_mean = trimmed['euros_per_mwh'].mean()
        newest_row = (
            trimmed.iloc[[-1]]
            .assign(euros_per_mwh=block_mean)
            .rename(columns={'forecast_date': 'date', 'euros_per_mwh': 'electricity_price'})
        )
        self.electricity_relevant = pd.concat([self.electricity_relevant, newest_row], ignore_index=True)
        cutoff = pd.to_datetime(newest_row['date'].iloc[0]) - pd.Timedelta(days=self.days_to_save)
        too_old = pd.to_datetime(self.electricity_relevant['date']) <= cutoff
        self.electricity_relevant.drop(self.electricity_relevant[too_old].index, axis=0, inplace=True)

    def add_electricity(self, data):
        """Write the most recently stored price into ``data`` (lag of zero rows)."""
        return self.add_electricity_daylag(data, 0)

    def add_electricity_daylag(self, data, days):
        """Write the price stored ``days`` rows before the newest one into ``data``.

        The column ``electricity_price__lag_{days}d`` is added to ``data`` in place
        with one constant value (NaN when the history holds ``days`` rows or fewer).
        ``data`` itself is returned.
        """
        price_col = f'electricity_price__lag_{days}d'
        if days >= len(self.electricity_relevant):
            data[price_col] = np.nan
        else:
            # Position counted from the end of the history: 0 -> newest row.
            data[price_col] = self.electricity_relevant.iloc[-days - 1, :]['electricity_price']
        return data

In [11]:
class TargetCompiler:
    """Rolling store of revealed targets plus lag-based feature builders.

    ``target_relevant`` accumulates the frames passed to ``update`` and keeps only
    rows newer than ``days_to_save`` days relative to the latest block. Every
    ``add_*`` method joins time-shifted views of that store onto a feature frame
    using the keys ``county``, ``product_type``, ``is_business``,
    ``is_consumption`` and ``datetime``.
    """

    def __init__(self, days_to_save=30):
        """Start with an empty store that retains roughly ``days_to_save`` days."""
        self.days_to_save = days_to_save
        self.target_relevant = pd.DataFrame(columns=['county', 'is_business', 'product_type', 'target', 'is_consumption', 'datetime'])

    def update(self, previous_target_block):
        """Append a block of revealed targets and prune rows that became too old.

        Parameters
        ----------
        previous_target_block : pandas.DataFrame
            Rows with the store's columns; ``datetime`` must be comparable with a
            ``pd.Timestamp``. The first ``datetime`` value anchors the retention
            cut-off.

        Notes
        -----
        Expired rows are removed with a boolean mask on a freshly numbered index.
        Removing them by index label is unsafe here: incoming blocks may reuse the
        same labels (for example each one starting at 0), in which case dropping the
        label of an old row would also delete new rows carrying that label.
        """
        self.target_relevant = pd.concat([self.target_relevant, previous_target_block], ignore_index=True)
        date_value = pd.to_datetime(previous_target_block['datetime'].iloc[0])
        expired = self.target_relevant['datetime'] <= date_value - pd.Timedelta(days=self.days_to_save)
        self.target_relevant = self.target_relevant.loc[~expired].reset_index(drop=True)

    def add_agg_target_daylag(self, data, days, agg_trough=[['county'], ['product_type'], ['county', 'product_type']], agg_fns=['mean', 'sum', 'max', 'min', 'std']):
        """Join the target from ``days`` days earlier and aggregates of it across segments.

        Parameters
        ----------
        data : pandas.DataFrame
            Frame with the five join keys.
        days : int
            Lag in days, greater than 1.
        agg_trough : list of list of str
            Each entry names key columns to aggregate *over*: rows are grouped by
            the remaining keys and the aggregate is broadcast back to every row.
            The default list is only read, never modified.
        agg_fns : list of str
            Aggregation names passed to ``GroupBy.transform``. Only read.

        Returns
        -------
        pandas.DataFrame
            New frame with ``target__lag_{days}d`` (only if ``data`` does not have
            it yet) and one ``target__lag_{days}d__all_{keys}_{fn}`` column per
            combination.
        """
        assert days > 1
        ALL_GROUP_COLS = ['county', 'product_type', 'is_business', 'is_consumption', 'datetime']
        col_name = f'target__lag_{days}d'
        target_block = self.target_relevant.copy()
        target_block['datetime'] = target_block['datetime'] + pd.Timedelta(days=days)
        if col_name not in data.columns:
            target_block[col_name] = target_block['target']
        for by in agg_trough:
            # Group by every key that is not being aggregated over.
            grby_cols = [col for col in ALL_GROUP_COLS if col not in by]
            for func in agg_fns:
                new_name = col_name + '__all_' + '_'.join(by) + '_' + func
                target_block[new_name] = target_block.groupby(grby_cols)['target'].transform(func)
        target_block = target_block.drop(columns=['target'])
        data = pd.merge(data, target_block, on=['county', 'product_type', 'is_business', 'is_consumption', 'datetime'], how='left') #  equal is_consumption
        return data

    def add_inverted_target_daylag(self, data, days):
        """Join the ``days``-day-lagged target of the opposite ``is_consumption`` flag.

        The stored flag is flipped (0 <-> 1) before the join, so each row receives
        the value of its counterpart series as ``inv_target__lag_{days}d``.
        ``days`` must be greater than 1. Returns a new frame.
        """
        assert days > 1
        target_block = self.target_relevant.copy()
        target_block['datetime'] = target_block['datetime'] + pd.Timedelta(days=days)
        target_block = target_block.rename(columns={'target': f'inv_target__lag_{days}d'})
        target_block['is_consumption'] = (target_block['is_consumption'] + 1) % 2
        data = pd.merge(data, target_block, on=['county', 'product_type', 'is_business', 'is_consumption', 'datetime'], how='left') #  invert is_consumption
        return data

    def add_target_return(self, data, nominator_daylag, denominator_daylag):
        """Add the ratio between two lagged targets.

        The new column ``target_return_{nominator_daylag}_{denominator_daylag}`` is
        ``target(lag nominator) / (target(lag denominator) + 1e-3)``; missing lagged
        values count as 0. Requires ``1 < nominator_daylag < denominator_daylag``.
        Returns a new frame.
        """
        assert nominator_daylag > 1 and denominator_daylag > nominator_daylag
        join_keys = ['county', 'product_type', 'is_business', 'is_consumption', 'datetime']
        nominator_block = self.target_relevant.copy()
        nominator_block['datetime'] = nominator_block['datetime'] + pd.Timedelta(days=nominator_daylag)
        nominator_block.rename(columns={'target': 'target_nominator'}, inplace=True)
        denominator_block = self.target_relevant.copy()
        denominator_block['datetime'] = denominator_block['datetime'] + pd.Timedelta(days=denominator_daylag)
        denominator_block.rename(columns={'target': 'target_denominator'}, inplace=True)
        data = pd.merge(data, nominator_block, on=join_keys, how='left')
        data['target_nominator'] = data['target_nominator'].fillna(value=0)
        data = pd.merge(data, denominator_block, on=join_keys, how='left')
        data['target_denominator'] = data['target_denominator'].fillna(value=0)
        # The small constant keeps the ratio finite when the denominator is 0.
        data[f'target_return_{nominator_daylag}_{denominator_daylag}'] = data['target_nominator'] / (data['target_denominator'] + 1e-3)
        data = data.drop(columns=['target_nominator', 'target_denominator'])
        return data

    def add_target_sma_ema(self, data, day_period):
        """Add a simple and an exponential moving average of lagged targets.

        Lags of 3, 4, ..., ``day_period + 2`` days are used (the store is shifted by
        two days and then by one more day per step); missing values count as 0.
        The columns ``target__sma_{day_period}`` and ``target__ema_{day_period}``
        are written into ``data`` in place and ``data`` is returned.

        NOTE(review): the EMA recursion runs from the most recent lag to the oldest,
        so the oldest lag receives the largest weight. Left unchanged because it
        defines the values of an existing feature - confirm whether this is intended.
        """
        assert day_period > 1
        join_keys = ['county', 'product_type', 'is_business', 'is_consumption', 'datetime']
        target_block = self.target_relevant.copy()
        target_block['datetime'] = target_block['datetime'] + pd.Timedelta(days=2)
        sma_block = data[join_keys]
        if len(target_block) > 0:
            for day in range(day_period):
                target_block['datetime'] = target_block['datetime'] + pd.Timedelta(days=1)
                sma_block = pd.merge(sma_block, target_block, on=join_keys, how='left')
                sma_block.rename(columns={'target': f'target_day_{day}'}, inplace=True)
                sma_block[f'target_day_{day}'] = sma_block[f'target_day_{day}'].fillna(value=0)

        sma_block['target_sma'] = 0
        sma_block['target_ema'] = 0
        alpha = 2 / (1 + day_period)
        if len(target_block) > 0:
            for day in range(day_period):
                sma_block['target_sma'] = sma_block['target_sma'] + sma_block[f'target_day_{day}']
                sma_block['target_ema'] = (1 - alpha) * sma_block['target_ema'] + alpha * sma_block[f'target_day_{day}']

        data[f'target__sma_{day_period}'] = list(sma_block['target_sma'] / (day_period + 1e-9))
        data[f'target__ema_{day_period}'] = list(sma_block['target_ema'] / 1)  # div by 1 -> int to float in case of day_period=0
        return data
    
#     def add_target_daylag_daymean(self, data, days):
#         self.update_counter += 1
#         col_names = [f'target__lag_{days}d', f'inv_target__lag_{days}d']
#         target_block = self.target_relevant.copy()
#         target_block['datetime'] = target_block['datetime'] + pd.Timedelta(days=days)
#         target_block = target_block.rename(columns={'target': col_names[0]})
#         data = pd.merge(data, target_block, on=['county', 'product_type', 'is_business', 'is_consumption', 'datetime'], how='left') #  equal is_consumption
#         target_block['is_consumption'] = (target_block['is_consumption'] + 1) % 2
#         target_block = target_block.rename(columns={col_names[0]: col_names[1]})
#         data = pd.merge(data, target_block, on=['county', 'product_type', 'is_business', 'is_consumption', 'datetime'], how='left') #  invert is_consumption
#         return data

In [12]:
class Compiler:
    """Builds the model feature table from the per-source compilers.

    Holds one compiler per input table (forecast weather, historical weather,
    client, gas, electricity, target), feeds them new data through ``update``
    and joins their outputs onto a frame of prediction rows through
    ``generate_features``.
    """

    # Offset added to the temperature before radiation is divided by it
    # (presumably a Celsius -> Kelvin conversion; confirm the weather units).
    _KELVIN_OFFSET = 273.15

    def __init__(self, days_to_save=30):
        """Create the per-source compilers and the Estonian holiday lookup.

        Args:
            days_to_save: forwarded unchanged to every per-source compiler.
        """
        self.days_to_save = days_to_save
        self.forecasted_weather_compiler = ForecastedWeatherCompiler(days_to_save=days_to_save)
        self.historical_weather_compiler = HistoricalWeatherCompiler(days_to_save=days_to_save)
        self.client_compiler = ClientCompiler(days_to_save=days_to_save)
        self.gas_compiler = GasCompiler(days_to_save=days_to_save)
        self.electricity_compiler = ElectricityCompiler(days_to_save=days_to_save)
        self.target_compiler = TargetCompiler(days_to_save=days_to_save)

        # Dates of Estonian ('EE') holidays for 2021-2025, used by `is_holiday`.
        self.holidays = holidays.country_holidays('EE', years=range(2021, 2026)).keys()

    def _check_valid(self, block):
        """Return True only for a block that is neither None nor empty."""
        return block is not None and len(block) > 0

    def _scaled_by_capacity(self, radiation, capacity, temperature):
        """Return ``radiation * capacity / (temperature + 273.15)`` element-wise."""
        return radiation * capacity / (temperature + self._KELVIN_OFFSET)

    def update(self, historical_weather_block=None, forecasted_weather_block=None,
                     client_block=None,
                     gas_block=None, electricity_block=None,
                     previous_target=None):
        """Forward every non-empty block to the compiler of its data source.

        Blocks that are None or have length 0 are ignored, so a caller may pass
        only the tables it has for the current step.
        """
        routed_blocks = (
            (forecasted_weather_block, self.forecasted_weather_compiler),
            (historical_weather_block, self.historical_weather_compiler),
            (client_block, self.client_compiler),
            (gas_block, self.gas_compiler),
            (electricity_block, self.electricity_compiler),
            (previous_target, self.target_compiler),
        )
        for block, source_compiler in routed_blocks:
            if self._check_valid(block):
                source_compiler.update(block)

    def add_base_features(self, data):
        """Add calendar, forecast-weather, client and price features.

        The calendar columns are written onto ``data`` itself; the returned
        frame is whatever the per-source compilers hand back.

        Args:
            data: frame with at least ``datetime``, ``county``, ``is_business``,
                ``product_type`` and ``is_consumption`` columns.

        Returns:
            The frame with the added feature columns.
        """
        data['datetime'] = pd.to_datetime(data['datetime'])
        data['segment'] = data['county'].astype(int).astype(str) + '_' + \
                          data['is_business'].astype(int).astype(str) + '_' + \
                          data['product_type'].astype(int).astype(str) + '_' + \
                          data['is_consumption'].astype(int).astype(str)

        # Vectorized datetime accessors instead of a Python call per row.
        stamp = data['datetime'].dt
        data['hour'] = stamp.hour
        data['hour_sin'] = np.sin(data['hour'] / 12 * np.pi)
        data['hour_cos'] = np.cos(data['hour'] / 12 * np.pi)
        data['day_of_week'] = stamp.weekday
        data['month'] = stamp.month
        data['month_sin'] = np.sin(data['month'] / 6 * np.pi)
        data['month_cos'] = np.cos(data['month'] / 6 * np.pi)
        data['is_weekend'] = data['day_of_week'].isin([5, 6])
        data['is_holiday'] = stamp.date.isin(self.holidays)
        data['day_of_year'] = stamp.dayofyear
        data['day_of_year_sin'] = np.sin(data['day_of_year'] / 183 * np.pi)
        data['day_of_year_cos'] = np.cos(data['day_of_year'] / 183 * np.pi)

        data = self.forecasted_weather_compiler.add_weather(data)
        data = self.forecasted_weather_compiler.add_weather_allsta(data)
        data = self.client_compiler.add_client(data)
        data = self.gas_compiler.add_gas(data)
        data = self.electricity_compiler.add_electricity(data)

        data['direct_solar_radiation__effective'] = self._scaled_by_capacity(
            data['direct_solar_radiation'], data['installed_capacity'], data['temperature'])
        data['direct_solar_radiation__eff_solar_ideal'] = data['direct_solar_radiation__effective'] * data['solar_angle_ideal_cos']
        data['direct_solar_radiation__eff_solar'] = data['direct_solar_radiation__effective'] * data['solar_angle_hor_cos']
        data['surface_solar_radiation_downwards__effective'] = self._scaled_by_capacity(
            data['surface_solar_radiation_downwards'], data['installed_capacity'], data['temperature'])
        return data

    def add_weather_client_daycombolag(self, data, days, feats=None):
        """Add ``days``-lagged historical weather / client columns and their combinations.

        Both the per-hour lag (``__lag_{days}d``) and the day-mean lag
        (``__meanlag_{days}d``) radiation columns are scaled with the lagged
        installed capacity and the per-hour lagged temperature.
        """
        suffix = f'__lag_{days}d'
        suffix_mean = f'__meanlag_{days}d'
        data = self.historical_weather_compiler.add_weather_daylag(data, days, feats)
        data = self.historical_weather_compiler.add_weather_daylag_daymean(data, days, feats)
        data = self.client_compiler.add_client_daylag(data, days)

        capacity = data['installed_capacity' + suffix]
        temperature = data['temperature' + suffix]

        data['shortwave_radiation__effective' + suffix] = self._scaled_by_capacity(
            data['shortwave_radiation' + suffix], capacity, temperature)
        data['direct_solar_radiation__effective' + suffix] = self._scaled_by_capacity(
            data['direct_solar_radiation' + suffix], capacity, temperature)
        data['direct_solar_radiation__eff_solar_ideal' + suffix] = data['direct_solar_radiation__effective' + suffix] * data['solar_angle_ideal_cos' + suffix]
        data['direct_solar_radiation__eff_solar' + suffix] = data['direct_solar_radiation__effective' + suffix] * data['solar_angle_hor_cos' + suffix]
        data['diffuse_radiation__effective' + suffix] = self._scaled_by_capacity(
            data['diffuse_radiation' + suffix], capacity, temperature)

        # NOTE(review): the day-mean columns are scaled by the per-hour lagged
        # temperature, not a day-mean temperature -- kept as is because saved
        # models were fitted on these values; confirm this is intended.
        for radiation in ('shortwave_radiation', 'direct_solar_radiation', 'diffuse_radiation'):
            data[radiation + '__effective' + suffix_mean] = self._scaled_by_capacity(
                data[radiation + suffix_mean], capacity, temperature)

        return data

    def add_weather_client_hourcombolag(self, data, hours, feats=None):
        """Add ``hours``-lagged forecast weather and its capacity-scaled radiation.

        Unlike the day-lag variant, the current (unlagged) ``installed_capacity``
        is used for scaling.
        """
        suffix = f'__lag_{hours}h'
        data = self.forecasted_weather_compiler.add_weather_hourlag(data, hours, feats)

        capacity = data['installed_capacity']
        temperature = data['temperature' + suffix]

        data['direct_solar_radiation__effective' + suffix] = self._scaled_by_capacity(
            data['direct_solar_radiation' + suffix], capacity, temperature)
        data['direct_solar_radiation__eff_solar_ideal' + suffix] = data['direct_solar_radiation__effective' + suffix] * data['solar_angle_ideal_cos' + suffix]
        data['direct_solar_radiation__eff_solar' + suffix] = data['direct_solar_radiation__effective' + suffix] * data['solar_angle_hor_cos' + suffix]
        data['surface_solar_radiation_downwards__effective' + suffix] = self._scaled_by_capacity(
            data['surface_solar_radiation_downwards' + suffix], capacity, temperature)
        return data

    def add_lag_features(self, data):
        """Add lagged weather, lagged target aggregates, ratios and rolling statistics.

        Column creation order is kept stable on purpose: the training cells fit
        on the frame as produced here.
        """
        data = self.forecasted_weather_compiler.add_weather_hourlag(data, 7*24)
        data = self.forecasted_weather_compiler.add_weather_allsta_hourlag(data, 7*24)

        # The 2-day historical lag comes through the combo helper, which itself
        # calls add_weather_daylag and then derives the combined columns.
        data = self.add_weather_client_daycombolag(data, 2)
        data = self.historical_weather_compiler.add_weather_daylag(data, 7)
        for days in (1, 2, 7):
            data = self.historical_weather_compiler.add_weather_allsta_daylag(data, days)

        # Plain target lags for every day from 2 to 14 ...
        for lag in range(2, 15):
            data = self.target_compiler.add_agg_target_daylag(data, lag,
                                                              agg_trough=[],
                                                              agg_fns=[])
        # ... and aggregated / inverted lags for a few selected days.
        for lag in [2, 3, 7, 14]:
            data = self.target_compiler.add_agg_target_daylag(data, lag,
                                                              agg_trough=[['product_type'], ['county', 'product_type']],
                                                              agg_fns=['sum', 'std'])
            data = self.target_compiler.add_inverted_target_daylag(data, lag)

        for first, second in ((2, 3), (3, 10), (2, 9), (7, 14)):
            data = self.target_compiler.add_target_return(data, first, second)

        # Ratios of aggregated lagged sums; 1e-3 keeps the denominator off zero.
        for group in ('product_type', 'county_product_type'):
            for first, second in ((2, 3), (7, 14)):
                data[f'target__all_{group}_sum__return_{first}d_{second}d'] = (
                    data[f'target__lag_{first}d__all_{group}_sum']
                    / (data[f'target__lag_{second}d__all_{group}_sum'] + 1e-3)
                )

        # Row-wise mean / std over the target lags 2..14 and 2..7.
        for window in (14, 7):
            target_lags = [f'target__lag_{lag}d' for lag in range(2, window + 1)]
            data[f'target_mean_{window}'] = data[target_lags].mean(axis=1)
            data[f'target_std_{window}'] = data[target_lags].std(axis=1)

        data = self.target_compiler.add_target_sma_ema(data, 7)

        # First difference of the weather columns between consecutive rows of a segment.
        segment_keys = ["county", 'is_consumption', 'product_type', 'is_business']
        for col in ['temperature',
                    'dewpoint',
                    'u_wind_component',
                    'v_wind_component',
            ]:
            data[f"{col}_diff_1"] = data.groupby(segment_keys)[col].diff(1)

        return data

    def generate_features(self, data):
        """Run the full pipeline (base + lag features) and normalise key dtypes.

        Returns:
            Feature frame with integer segment key columns and a categorical
            ``segment`` column.
        """
        data = self.add_base_features(data)
        data = self.add_lag_features(data)
        segment_columns = ['county', 'is_business', 'product_type', 'is_consumption']
        data[segment_columns] = data[segment_columns].astype(int)
        data['segment'] = data['segment'].astype('category')
        return data

In [13]:
if IS_GENFEATS:
    # Replay the data block by block: feed each block's tables to the compiler,
    # then build the features for the rows of that block.
    compiler = Compiler(days_to_save=18)
    blocks = np.sort(df_data['data_block_id'].unique())
    if IS_SUBMIT:
        # Keep only a trailing window of 20 blocks (the last 4 are excluded).
        # NOTE(review): presumably just enough to warm up the compiler history
        # before submission -- confirm the window bounds.
        blocks = blocks[-24:-4]

    df_feats_full = []
    for b in tqdm(blocks):
        # Filter first, then drop: avoids copying the whole of df_data twice per block.
        df_block = (
            df_data[df_data['data_block_id'] == b]
            .drop(columns=['row_id', 'prediction_unit_id', 'target'])
        )

        # Targets are taken from the block two ids earlier than the rows being featurised.
        df_block_target = (
            df_data[df_data['data_block_id'] == b - 2]
            .drop(columns=['data_block_id', 'row_id', 'prediction_unit_id'])
            .assign(datetime=lambda frame: pd.to_datetime(frame['datetime']))
        )

        df_client_block = df_client[df_client['data_block_id'] == b]
        df_gas_prices_block = df_gas_prices[df_gas_prices['data_block_id'] == b]
        df_electricity_prices_block = df_electricity_prices[df_electricity_prices['data_block_id'] == b]
        df_forecast_weather_block = df_forecast_weather[df_forecast_weather['data_block_id'] == b]
        df_historical_weather_block = df_historical_weather[df_historical_weather['data_block_id'] == b]

        compiler.update(df_historical_weather_block, df_forecast_weather_block,
                        df_client_block, df_gas_prices_block, df_electricity_prices_block,
                        df_block_target)

        df_feats_full.append(compiler.generate_features(df_block))

    df_feats = pd.concat(df_feats_full).reset_index(drop=True)

    # Attach the true target of every feature row; rows without one are dropped.
    df_target = (
        df_data
        .drop(columns=['data_block_id', 'row_id', 'prediction_unit_id'])
        .assign(datetime=lambda frame: pd.to_datetime(frame['datetime']))
    )
    df_feats_target = pd.merge(df_feats, df_target, on=['county', 'product_type', 'is_business',
                                                        'is_consumption', 'datetime'], how='left')
    df_feats_target = df_feats_target.dropna(subset=['target'])
    # TODO: decide whether the remaining NaN feature values should be filled with 0.

    # Drop the large intermediates but keep the names bound to empty objects.
    del df_feats_full, df_target, df_feats
    df_feats_full, df_target, df_feats = [], pd.DataFrame(), pd.DataFrame()
    gc.collect()
else:
    # Load a precomputed feature table instead of regenerating it.
    df_feats_target = pd.read_csv('/kaggle/input/enefit-dataset-187feats/df_187feats_target.csv',
                                 index_col = [0])
    df_feats_target['datetime'] = pd.to_datetime(df_feats_target['datetime'])

df_feats_target['segment'] = df_feats_target['segment'].astype('category')

  0%|          | 0/20 [00:00<?, ?it/s]

In [14]:
# Shrink numeric columns to narrower dtypes (skipped when submitting).
if not IS_SUBMIT:
    numeric_cols = df_feats_target.columns.tolist()
    for non_numeric in ('datetime', 'segment'):
        numeric_cols.remove(non_numeric)
    # Two passes in this order: float downcast first, integer downcast second.
    for downcast_kind in ('float', 'integer'):
        df_feats_target[numeric_cols] = df_feats_target[numeric_cols].apply(pd.to_numeric, downcast=downcast_kind)

In [15]:
# Release the raw weather tables, then rebind both names to empty frames so
# the names stay defined after this cell.
del df_forecast_weather
del df_historical_weather
df_forecast_weather = pd.DataFrame()
df_historical_weather = pd.DataFrame()
gc.collect()

0

## LightGBMs

In [16]:
# Directory that newly trained boosters are written to.
model_save_path = 'enefit-final'
# Directories searched for previously saved boosters when not training.
model_load_paths = ['/kaggle/input/enefit-final/']
# exist_ok avoids the check-then-create race and keeps the cell re-runnable.
os.makedirs(model_save_path, exist_ok=True)
# Model-group name (e.g. 'diff-cons-0') -> list of boosters in that group.
models = dict()

### LightGBM models, one group per `is_consumption` value

In [17]:
if IS_CONS_MODEL:
    # Direct-target LightGBM ensembles: one model group per is_consumption value,
    # each group holding one booster per seed (plus per-fold boosters with K_FOLD).
    cat_features = ['segment', 'hour','day_of_week', 'month','is_weekend','is_holiday', 
                    'county', 'is_business', 'product_type']
    for is_cons in [0,1]:
        name = f"cons-{is_cons}"
        models[name] = []
        for seed in [0, 1, 2, 3, 4, 42]:
            if IS_TRAIN:
                print(f'Train. {name}')
                if IS_OFFLINE or K_FOLD:
                    # Upper bound only: these modes fit with early stopping.
                    n_estimators = 4000
                else:
                    # Fixed tree counts for the full-data fit (no validation set to stop on).
                    n_estimators = 2300 if is_cons == 0 else 1700
                lgb_params = {
                        "random_state": seed,
                        "objective": "mae",
                        "n_estimators": n_estimators,
                        "num_leaves": 256,
                        # NOTE(review): LightGBM applies row subsampling only when
                        # subsample_freq > 0; with the default of 0 this value likely
                        # has no effect -- confirm before tuning it.
                        "subsample": 0.6, #0.8,
                        "colsample_bynode": 0.6, #0.8,
                        "colsample_bytree": 0.9,
                        "learning_rate": 0.02,
                        'max_depth': 10,
                        "n_jobs": 4,
                        "device": "gpu",
                        "verbose": -1,
                        "importance_type": "gain",
                }
                model = lgb.LGBMRegressor(**lgb_params)

                # Rows of this is_consumption value only; the constant column is dropped.
                mask = (df_feats_target['is_consumption'].astype(int) == is_cons)
                df = df_feats_target[mask]
                df = df.drop(['is_consumption'], axis=1)

                if IS_OFFLINE:
                    # Hold-out validation: blocks after split_data_block are the validation set.
                    X_train = df[df['data_block_id'] <= split_data_block].drop(['data_block_id', 'datetime', 'target'], axis=1)
                    y_train = df[df['data_block_id'] <= split_data_block]['target']
                    X_valid = df[df['data_block_id'] > split_data_block].drop(['data_block_id', 'datetime', 'target'], axis=1)
                    y_valid = df[df['data_block_id'] > split_data_block]['target']
                    gc.collect()
                    model.fit(X_train, y_train,
                        eval_set=[(X_valid, y_valid)],
                        categorical_feature=cat_features,
                        callbacks=[lgb.callback.early_stopping(stopping_rounds=100),
                                   lgb.callback.log_evaluation(period=100)
                                  ],
                    )

                    model_filename = os.path.join(model_save_path, f'{mode}_{name}_seed-{seed}.txt')
                    model.booster_.save_model(model_filename)
                    models[name].append(model.booster_)
                    print(f"Final model saved to {model_filename}")

                elif not K_FOLD:  # not IS_OFFLINE
                    # Final fit on every available row.
                    X_train = df.drop(['data_block_id', 'datetime', 'target'], axis=1)
                    y_train = df['target']
                    gc.collect()
                    model.fit(X_train, y_train,
                        categorical_feature=cat_features,
                    )

                    model_filename = os.path.join(model_save_path, f'{mode}_{name}_seed-{seed}.txt')
                    model.booster_.save_model(model_filename)
                    print(f"Final model saved to {model_filename}")
                    models[name].append(model.booster_)

                else:  # K_FOLD and not IS_OFFLINE
                    block_ids = df['data_block_id'].values
                    df_train = df.drop(['data_block_id', 'datetime'], axis=1)
                    X_train, y_train = df_train.drop(['target'], axis=1), df_train['target']
                    del df_train
                    gc.collect()
                    # NOTE(review): 638 is presumably the number of data blocks; only the
                    # first 5 of the 10 contiguous block ranges are held out -- confirm.
                    num_folds = 10
                    fold_size = 638 // num_folds
                    for i in range(5):
                        start = i * fold_size
                        end = start + fold_size
                        test_indices = (block_ids >= start) & (block_ids < end)
                        model = lgb.LGBMRegressor(**lgb_params)
                        model.fit(
                            X_train[~test_indices], y_train[~test_indices],
                            eval_set=[(X_train[test_indices], y_train[test_indices])],
                            categorical_feature=cat_features,
                            callbacks=[
                                lgb.callback.early_stopping(stopping_rounds=100),
                                lgb.callback.log_evaluation(period=100),
                            ],
                        )
                        model_filename = os.path.join(model_save_path, f'{name}_10folds-{i+1}_seed-{seed}.txt')
                        model.booster_.save_model(model_filename)
                        print(f"Model for fold {i+1} saved to {model_filename}")
                        models[name].append(model.booster_)

            else:  # not IS_TRAIN: load previously saved boosters
                n_loaded_before = len(models[name])
                for model_load_path in model_load_paths:
                    # Same lookup order as before: the seed model first, then its folds.
                    candidate_files = [f'{mode}_{name}_seed-{seed}.txt']
                    if K_FOLD:
                        candidate_files += [f'{name}_10folds-{i+1}_seed-{seed}.txt' for i in range(5)]
                    for candidate in candidate_files:
                        model_load_file = os.path.join(model_load_path, candidate)
                        if os.path.exists(model_load_file):
                            models[name].append(lgb.Booster(model_file=model_load_file))
                if len(models[name]) == n_loaded_before:
                    # Missing files used to be skipped silently, leaving a short or
                    # empty ensemble that is only noticed at prediction time.
                    print(f'WARNING: no saved model found for {name}, seed {seed} in {model_load_paths}')
        print(f'{len(models[name])} models are ready')                    
    print(f'{len(models)} groups of models are ready')

In [18]:
if not IS_SUBMIT:
    # Save and plot the feature importances of the last booster of each group.
    for name in ['cons-0', 'cons-1']:
        # The group is absent when its model flag is off and empty when nothing
        # was trained or loaded; indexing it would raise KeyError / IndexError.
        if not models.get(name):
            print(f'No models available for {name}; skipping feature importance')
            continue
        model = models[name][-1]
        # NOTE(review): Booster.feature_importance() defaults to split counts; pass
        # importance_type='gain' if gain (as requested in the training params) was intended.
        fea_imp = pd.DataFrame({'Feature Id':model.feature_name(), 'Importances':model.feature_importance()})
        feature_importances_all = fea_imp.loc[fea_imp['Importances'] > 0].sort_values(by=['Importances'], ascending = False)
        feature_importances_all.to_csv(f'fea_imp_{name}.csv')
        plt.figure(figsize=(12, 12))
        sns.barplot(x="Importances", y="Feature Id", data=feature_importances_all[:30])
        plt.title(f'LightGBM[{name}] features importance ')

### Difference LightGBM models (target minus its 2-day lag), one group per `is_consumption` value

In [19]:
if IS_CONS_DIFF_MODEL:
    # Difference models: the label is target minus its 2-day lag (missing lag -> 0),
    # one model group per is_consumption value, one booster per seed.
    cat_features = ['segment', 'hour','day_of_week', 'month','is_weekend','is_holiday', 
                    'county', 'is_business', 'product_type']
    for is_cons in [0,1]:
        name = f"diff-cons-{is_cons}"
        models[name] = []
        for seed in [0, 1, 2, 3, 4, 42]:
            if IS_TRAIN:
                print(f'Train. {name}')
                if IS_OFFLINE or K_FOLD:
                    # Upper bound only: the offline fit uses early stopping.
                    n_estimators = 4000
                else:
                    # Fixed tree counts for the full-data fit (no validation set to stop on).
                    n_estimators = 1700 if is_cons == 0 else 1600
                lgb_params = {
                        "random_state": seed,
                        "objective": "mae",
                        "n_estimators": n_estimators,
                        "num_leaves": 256,
                        # NOTE(review): LightGBM applies row subsampling only when
                        # subsample_freq > 0; with the default of 0 this value likely
                        # has no effect -- confirm before tuning it.
                        "subsample": 0.6,
                        "colsample_bynode": 0.6,
                        "colsample_bytree": 0.9,
                        "learning_rate": 0.038,
                        'max_depth': 10,
                        "n_jobs": 4,
                        "device": "gpu",
                        "verbose": -1,
                        "importance_type": "gain",
                }
                model = lgb.LGBMRegressor(**lgb_params)

                # Rows of this is_consumption value only; the constant column is dropped.
                mask = (df_feats_target['is_consumption'].astype(int) == is_cons)
                df = df_feats_target[mask]
                df = df.drop(['is_consumption'], axis=1)

                if IS_OFFLINE:
                    # Hold-out validation: blocks after split_data_block are the validation set.
                    df_train = df[df['data_block_id'] <= split_data_block].drop(columns=['data_block_id', 'datetime'])
                    df_valid = df[df['data_block_id'] > split_data_block].drop(columns=['data_block_id', 'datetime'])
                    gc.collect()
                    model.fit(df_train.drop(columns=['target']), 
                              df_train['target'] - df_train['target__lag_2d'].fillna(0),
                        eval_set=[(
                            df_valid.drop(columns=['target']), 
                            df_valid['target'] - df_valid['target__lag_2d'].fillna(0)
                        )],
                        categorical_feature=cat_features,
                        callbacks=[lgb.callback.early_stopping(stopping_rounds=100),
                                   lgb.callback.log_evaluation(period=100)
                                  ],
                    )

                    model_filename = os.path.join(model_save_path, f'{mode}_{name}_seed-{seed}.txt')
                    model.booster_.save_model(model_filename)
                    models[name].append(model.booster_)
                    print(f"Final model saved to {model_filename}")

                elif not K_FOLD:  # not IS_OFFLINE
                    # Final fit on every available row.
                    df_train = df.drop(columns=['data_block_id', 'datetime'])
                    gc.collect()
                    model.fit(df_train.drop(columns=['target']), 
                              df_train['target'] - df_train['target__lag_2d'].fillna(0),
                        categorical_feature=cat_features,
                    )

                    model_filename = os.path.join(model_save_path, f'{mode}_{name}_seed-{seed}.txt')
                    model.booster_.save_model(model_filename)
                    print(f"Final model saved to {model_filename}")
                    models[name].append(model.booster_)

                else:  # K_FOLD and not IS_OFFLINE
                    # This combination used to fall through without training anything.
                    print(f'WARNING: K_FOLD training is not implemented for {name}; no model trained for seed {seed}')

            else:  # not IS_TRAIN: load previously saved boosters
                n_loaded_before = len(models[name])
                for model_load_path in model_load_paths:
                    model_load_file = os.path.join(model_load_path, f'{mode}_{name}_seed-{seed}.txt')
                    if os.path.exists(model_load_file):
                        models[name].append(lgb.Booster(model_file=model_load_file))
                if len(models[name]) == n_loaded_before:
                    # Missing files used to be skipped silently, leaving a short or
                    # empty ensemble that is only noticed at prediction time.
                    print(f'WARNING: no saved model found for {name}, seed {seed} in {model_load_paths}')
                    
        print(f'{len(models[name])} models are ready')                    
    print(f'{len(models)} groups of models are ready')

6 models are ready
6 models are ready
2 groups of models are ready


In [20]:
if not IS_SUBMIT:
    # Save and plot the feature importances of the last booster of each group.
    for name in ['diff-cons-0', 'diff-cons-1']:
        # The group is absent when its model flag is off and empty when nothing
        # was trained or loaded; indexing it would raise KeyError / IndexError.
        if not models.get(name):
            print(f'No models available for {name}; skipping feature importance')
            continue
        model = models[name][-1]
        # NOTE(review): Booster.feature_importance() defaults to split counts; pass
        # importance_type='gain' if gain (as requested in the training params) was intended.
        fea_imp = pd.DataFrame({'Feature Id':model.feature_name(), 'Importances':model.feature_importance()})
        feature_importances_all = fea_imp.loc[fea_imp['Importances'] > 0].sort_values(by=['Importances'], ascending = False)
        feature_importances_all.to_csv(f'fea_imp_{name}.csv')
        plt.figure(figsize=(12, 12))
        sns.barplot(x="Importances", y="Feature Id", data=feature_importances_all[:30])
        plt.title(f'LightGBM[{name}] features importance ')

### 7-day difference LightGBM models (target minus its 7-day lag), one group per `is_consumption` value

In [21]:
if IS_CONS_DIFF7_MODEL:
    # 7-day difference models: the label is target minus its 7-day lag (missing lag -> 0),
    # one model group per is_consumption value, one booster per seed.
    cat_features = ['segment', 'hour','day_of_week', 'month','is_weekend','is_holiday', 
                    'county', 'is_business', 'product_type']
    for is_cons in [0,1]:
        name = f"diff7-cons-{is_cons}"
        models[name] = []
        for seed in [0, 1, 2, 3, 4, 42]:
            if IS_TRAIN:
                print(f'Train. {name}')
                if IS_OFFLINE or K_FOLD:
                    # Upper bound only: the offline fit uses early stopping.
                    n_estimators = 4000
                else:
                    # Fixed tree counts for the full-data fit (no validation set to stop on).
                    n_estimators = 3000 if is_cons == 0 else 1200
                lgb_params = {
                        "random_state": seed,
                        "objective": "mae",
                        "n_estimators": n_estimators,
                        "num_leaves": 256,
                        # NOTE(review): LightGBM applies row subsampling only when
                        # subsample_freq > 0; with the default of 0 this value likely
                        # has no effect -- confirm before tuning it.
                        "subsample": 0.6,
                        "colsample_bynode": 0.6,
                        "colsample_bytree": 0.9,
                        "learning_rate": 0.03,
                        'max_depth': 10,
                        "n_jobs": 4,
                        "device": "gpu",
                        "verbose": -1,
                        "importance_type": "gain",
                }
                model = lgb.LGBMRegressor(**lgb_params)

                # Rows of this is_consumption value only; the constant column is dropped.
                mask = (df_feats_target['is_consumption'].astype(int) == is_cons)
                df = df_feats_target[mask]
                df = df.drop(['is_consumption'], axis=1)

                if IS_OFFLINE:
                    # Hold-out validation: blocks after split_data_block are the validation set.
                    df_train = df[df['data_block_id'] <= split_data_block].drop(columns=['data_block_id', 'datetime'])
                    df_valid = df[df['data_block_id'] > split_data_block].drop(columns=['data_block_id', 'datetime'])
                    gc.collect()
                    model.fit(df_train.drop(columns=['target']), 
                              df_train['target'] - df_train['target__lag_7d'].fillna(0),
                        eval_set=[(
                            df_valid.drop(columns=['target']), 
                            df_valid['target'] - df_valid['target__lag_7d'].fillna(0)
                        )],
                        categorical_feature=cat_features,
                        callbacks=[lgb.callback.early_stopping(stopping_rounds=100),
                                   lgb.callback.log_evaluation(period=100)
                                  ],
                    )

                    model_filename = os.path.join(model_save_path, f'{mode}_{name}_seed-{seed}.txt')
                    model.booster_.save_model(model_filename)
                    models[name].append(model.booster_)
                    print(f"Final model saved to {model_filename}")

                elif not K_FOLD:  # not IS_OFFLINE
                    # Final fit on every available row.
                    df_train = df.drop(columns=['data_block_id', 'datetime'])
                    gc.collect()
                    model.fit(df_train.drop(columns=['target']), 
                              df_train['target'] - df_train['target__lag_7d'].fillna(0),
                        categorical_feature=cat_features,
                    )

                    model_filename = os.path.join(model_save_path, f'{mode}_{name}_seed-{seed}.txt')
                    model.booster_.save_model(model_filename)
                    print(f"Final model saved to {model_filename}")
                    models[name].append(model.booster_)

                else:  # K_FOLD and not IS_OFFLINE
                    # This combination used to fall through without training anything.
                    print(f'WARNING: K_FOLD training is not implemented for {name}; no model trained for seed {seed}')

            else:  # not IS_TRAIN: load previously saved boosters
                n_loaded_before = len(models[name])
                for model_load_path in model_load_paths:
                    model_load_file = os.path.join(model_load_path, f'{mode}_{name}_seed-{seed}.txt')
                    if os.path.exists(model_load_file):
                        models[name].append(lgb.Booster(model_file=model_load_file))
                if len(models[name]) == n_loaded_before:
                    # Missing files used to be skipped silently, leaving a short or
                    # empty ensemble that is only noticed at prediction time.
                    print(f'WARNING: no saved model found for {name}, seed {seed} in {model_load_paths}')
                    
        print(f'{len(models[name])} models are ready')                    
    print(f'{len(models)} groups of models are ready')

In [22]:
if not IS_SUBMIT:
    # Save and plot the feature importances of the last booster of each group.
    for name in ['diff7-cons-0', 'diff7-cons-1']:
        # The group is absent when its model flag is off and empty when nothing
        # was trained or loaded; indexing it would raise KeyError / IndexError.
        if not models.get(name):
            print(f'No models available for {name}; skipping feature importance')
            continue
        model = models[name][-1]
        # NOTE(review): Booster.feature_importance() defaults to split counts; pass
        # importance_type='gain' if gain (as requested in the training params) was intended.
        fea_imp = pd.DataFrame({'Feature Id':model.feature_name(), 'Importances':model.feature_importance()})
        feature_importances_all = fea_imp.loc[fea_imp['Importances'] > 0].sort_values(by=['Importances'], ascending = False)
        feature_importances_all.to_csv(f'fea_imp_{name}.csv')
        plt.figure(figsize=(12, 12))
        sns.barplot(x="Importances", y="Feature Id", data=feature_importances_all[:30])
        plt.title(f'LightGBM[{name}] features importance ')

In [23]:
def predict(models, df_data, sample_prediction=None):
    """Blend the predictions of every available model family into one target.

    Three families are supported, each split by ``is_consumption`` (0 / 1):
    ``cons-*`` predicts the target directly, ``diff-cons-*`` predicts the
    difference to ``target__lag_2d`` and ``diff7-cons-*`` the difference to
    ``target__lag_7d`` (a missing lag counts as 0). A family is used only when
    both of its groups are present in ``models``. Within a group the boosters
    are averaged, the average is clipped at 0 and multiplied by the group's
    entry in the module-level ``weights2name`` before being added to the result.

    Args:
        models: dict mapping group name to a list of objects exposing
            ``feature_name()`` and ``predict(frame)``.
        df_data: feature frame; must contain ``is_consumption``, every model
            feature and the lag column of each family in use.
        sample_prediction: optional frame whose ``target`` column receives the
            result; it is modified in place and must be row-aligned with
            ``df_data``. Its existing ``target`` values are added to.

    Returns:
        The ``target`` column (float) holding the blended prediction.
    """
    if sample_prediction is None:
        # Only the target column is needed; copying all features is wasted work.
        sample_prediction = pd.DataFrame({'target': 0.0}, index=df_data.index)
    else:
        # Float column up front, so adding float predictions never has to
        # upcast an integer column during the masked assignment.
        sample_prediction['target'] = sample_prediction['target'].astype('float64')

    # (group-name prefix, lag column added back to the model output, or None)
    families = (
        ('cons', None),
        ('diff-cons', 'target__lag_2d'),
        ('diff7-cons', 'target__lag_7d'),
    )
    non_feature_cols = ['is_consumption', 'data_block_id', 'datetime',
                        'target', 'prediction', 'currently_scored',
                        'row_id', 'prediction_unit_id']

    for prefix, lag_col in families:
        if f'{prefix}-0' not in models or f'{prefix}-1' not in models:
            continue
        for is_cons in [0, 1]:
            name = f"{prefix}-{is_cons}"
            group_models = models[name]
            if len(group_models) == 0:
                # Nothing was trained or loaded for this group: it contributes
                # nothing, instead of failing on a division by zero.
                continue
            mask = (df_data['is_consumption'].astype(int) == is_cons)
            df = df_data[mask]
            if df.shape[0] == 0:
                continue
            df = df.drop(non_feature_cols, axis=1, errors='ignore')
            baseline = None if lag_col is None else df[lag_col].fillna(0).to_numpy()

            weight = 1 / len(group_models)
            prediction = np.zeros(df.shape[0])
            for model in group_models:
                raw = model.predict(df[model.feature_name()])
                if baseline is not None:
                    # Difference models predict target minus lag; add the lag back.
                    raw = raw + baseline
                prediction += weight * np.nan_to_num(raw)
            sample_prediction.loc[mask.values, 'target'] += weights2name[name] * prediction.clip(0)

    return sample_prediction['target']

In [24]:
def refit(models, df_data, refit_decay=0.995):
    """Refit every available model family on ``df_data`` and return ``models``.

    Model lists are looked up under the keys ``"<family>-<is_consumption>"``.
    Three families are handled, differing only in the label they are fit on:

    * ``cons``       -> ``target``
    * ``diff-cons``  -> ``target - target__lag_2d`` (missing lag counts as 0)
    * ``diff7-cons`` -> ``target - target__lag_7d`` (missing lag counts as 0)

    A family is processed only when both its ``-0`` and ``-1`` entries exist
    in ``models``; a consumption flag with no rows in ``df_data`` is skipped.

    Args:
        models: dict mapping model name to a list of fitted models. Each model
            must expose ``feature_name()`` and ``refit(X, y, decay_rate=...)``
            (presumably LightGBM Boosters -- confirm against the training cell).
        df_data: feature frame containing at least ``is_consumption`` and
            ``target``, plus the lag column of every family being refit.
        refit_decay: forwarded to each model's ``refit`` as ``decay_rate``.

    Returns:
        The same ``models`` dict; its lists are updated in place with the
        objects returned by ``refit``.
    """
    # Bookkeeping / label columns that must never reach the model as features.
    non_feature_cols = ['is_consumption', 'data_block_id', 'datetime',
                        'target', 'prediction', 'currently_scored',
                        'row_id', 'prediction_unit_id']
    # (family prefix, lag column subtracted from the target; None = raw target)
    families = (('cons', None),
                ('diff-cons', 'target__lag_2d'),
                ('diff7-cons', 'target__lag_7d'))

    for family, lag_col in families:
        # Both the production (-0) and consumption (-1) models are required.
        if not {f"{family}-0", f"{family}-1"} <= models.keys():
            continue
        for flag in (0, 1):
            key = f"{family}-{flag}"
            rows = df_data[df_data['is_consumption'].astype(int) == flag]
            if len(rows) == 0:
                continue
            labels = rows['target']
            if lag_col is not None:
                labels = labels - rows[lag_col].fillna(0)
            features = rows.drop(columns=non_feature_cols, errors='ignore')
            boosters = models[key]
            for idx, booster in enumerate(boosters):
                # Each model selects (and orders) its own feature columns.
                boosters[idx] = booster.refit(features[booster.feature_name()],
                                              labels,
                                              decay_rate=refit_decay)
    return models

In [25]:
if IS_OFFLINE and not IS_SUBMIT:
    # Baseline hold-out score without any online refit. The hold-out set is
    # every block after `split_data_block`, the same split the refit
    # validation cell uses, so the two printed scores are directly comparable.
    # `.copy()` makes the filtered frame independent before a column is added.
    df_valid1 = df_feats_target[df_feats_target['data_block_id'] > split_data_block].copy()
    df_valid1['prediction'] = predict(models, df_valid1)
    val_score = mean_absolute_error(df_valid1['target'], df_valid1['prediction'])
    print(f'validation mae score = {val_score}')

In [26]:
# Set 'verbose' to -1 on every model (presumably to silence LightGBM logging
# during later refits -- confirm) and store back whatever reset_parameter
# returns.
for name, boosters in models.items():
    for i, booster in enumerate(boosters):
        boosters[i] = booster.reset_parameter({'verbose':-1})

In [27]:
if IS_OFFLINE and not IS_SUBMIT:
    # Walk-forward validation: predict one window of `days_to_refit` blocks,
    # then refit the models on a window shifted back by two blocks, and repeat.
    # `.copy()` makes the hold-out frame independent before columns are added.
    df_valid = df_feats_target[df_feats_target['data_block_id'] > split_data_block].copy()
    # Float NaN (not None) keeps the prediction column numeric.
    df_valid['prediction'] = np.nan
    days_to_refit = 28
    min_block = int(df_valid['data_block_id'].min())
    max_block = int(df_valid['data_block_id'].max())
    for block in tqdm(range(min_block, max_block+1, days_to_refit)):
        # Predict the current window with the models as they stand now.
        mask = (df_valid['data_block_id'] >= block) & \
               (df_valid['data_block_id'] < block + days_to_refit)
        df_valid.loc[mask, 'prediction'] = predict(models, df_valid[mask])
        # Refit on the window shifted back by two blocks
        # (presumably mirroring the delay with which targets are revealed
        # online -- confirm).
        mask = (df_valid['data_block_id'] >= block-2) & \
               (df_valid['data_block_id'] < block-2 + days_to_refit)
        models = refit(models, df_valid[mask], refit_decay=0.996)
    val_score = mean_absolute_error(df_valid['target'], df_valid['prediction'])
    print(f'After refit: validation mae score = {val_score}')

## API Submission

In [28]:
# Build the environment exposed by the `enefit` module (presumably the
# competition's submission API -- confirm) and grab its test iterator, which
# the submission loop below consumes one item at a time.
import enefit
env = enefit.make_env()
iter_test = env.iter_test()

In [29]:
def is_prediction_needed(test):
    """Return True when at least one row of ``test`` is currently scored.

    Args:
        test: mapping/frame whose ``'currently_scored'`` entry is an iterable
            of flags.

    Returns:
        ``False`` when every flag equals ``False`` (or there are no rows),
        ``True`` otherwise.
    """
    unscored_flags = test['currently_scored'] == False
    return any(not is_unscored for is_unscored in unscored_flags)

In [30]:
# Online-refit configuration. The refit branch below only runs when
# refit_decay < 1, so the value 2. acts as an "online refit disabled" switch.
refit_decay = 2.
days_to_refit = 28
assert days_to_refit > 5

# Block counter for the streamed test iterations: starts at 1000 and is
# incremented once per iteration of the loop below.
first_data_block = 1000
cur_data_block = first_data_block
cache_data = pd.DataFrame()
target_cols = ['target', 'county', 'is_business', 'product_type', 
               'is_consumption', 'datetime']
cache_target = pd.DataFrame()

for (test, revealed_targets, client, 
     historical_weather, forecast_weather, 
     electricity_prices, gas_prices, 
     sample_prediction) in iter_test:
    
    test = test.rename(columns={"prediction_datetime": "datetime"})
    revealed_targets['datetime'] = pd.to_datetime(revealed_targets['datetime'])
    revealed_targets = revealed_targets.drop(columns=['data_block_id', 'row_id', 
                                                      'prediction_unit_id'],
                                             errors='ignore')
    compiler.update(historical_weather, forecast_weather, 
                    client, gas_prices, electricity_prices, 
                    revealed_targets)
    test_feats = compiler.generate_features(test)
    
    # == online training ==
    if refit_decay < 1.:
        # Only the last 2 * days_to_refit blocks are kept in the caches.
        oldest_kept_block = cur_data_block - days_to_refit*2

        # `.copy()` so the block id is written to an independent frame.
        revealed_targets = revealed_targets[target_cols].copy()
        revealed_targets['data_block_id'] = cur_data_block
        cache_target = pd.concat([cache_target, revealed_targets], ignore_index=True, axis=0)
        # Trim by boolean filtering. DataFrame.drop expects index labels, so
        # passing a boolean mask to it does not remove the masked rows.
        cache_target = cache_target[cache_target['data_block_id'] >= oldest_kept_block]

        test_feats['data_block_id'] = cur_data_block
        cache_data = pd.concat([cache_data, test_feats], ignore_index=True, axis=0)
        cache_data = cache_data[cache_data['data_block_id'] >= oldest_kept_block]

        # Refit once every `days_to_refit` blocks, at offset 2 from the start
        # (presumably because targets are revealed with a delay -- confirm).
        blocks_elapsed = cur_data_block - first_data_block
        if blocks_elapsed > 5 and blocks_elapsed % days_to_refit == 2:
            # Attach the revealed targets to the cached feature rows.
            cache_merged = pd.merge(cache_data, cache_target.drop(columns=['data_block_id']), 
                            on=['datetime', 'county', 'is_business', 
                                'product_type', 'is_consumption'], 
                            how='left')
            refit_data = cache_merged[cache_merged['data_block_id'] >= cur_data_block - days_to_refit - 1]
            # Rows whose target has not been revealed yet cannot be trained on.
            refit_data = refit_data.dropna(subset=['target'])
            models = refit(models, refit_data, refit_decay=refit_decay)
    # ==-------------==
    
    # prediction
    if is_prediction_needed(test):
        sample_prediction['target'] = predict(models, test_feats, sample_prediction)
    else:
        # No row is currently scored: submit zeros and skip model inference.
        sample_prediction['target'] = 0.0
    
    env.predict(sample_prediction)
    
    cur_data_block += 1

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
