In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import wandb
import os
os.environ["WANDB_SILENT"] = "true"

import gc

import math

from xgboost import XGBRegressor as XGB
from lightgbm import LGBMRegressor as LGB
from sklearn.metrics import mean_absolute_error as mae
SEED=42

from src.correlations import *
from src.features_train import get_features

from src.styles import *
set_styles()

import warnings
warnings.filterwarnings("ignore")

In [4]:
class SplitKFold:
    def __init__(self, n_splits=3, test_len=3*24*30):
        self.n_splits = n_splits
        self.test_len = test_len
        
    def split(self, X):        
        for fold in range(self.n_splits):
            offset = X['time_id'].max() - (self.n_splits - fold) * self.test_len
            idx_train = X.loc[X['time_id'] < offset].index
            idx_test = X.loc[(X['time_id'] >= offset) & (X['time_id'] < offset + self.test_len)].index            
            yield idx_train, idx_test
            
    def get_n_splits(self, X, y):
        return self.n_splits




def eval(X, Y, splitter, model):
    scores = []
    
    df_oof = pd.DataFrame()
    
    for fold, (idx_train, idx_val) in enumerate(splitter.split(X)):
        X_train = X.loc[idx_train].dropna()
        Y_train = Y.loc[X_train.index]
        Y_val = Y.loc[idx_val].dropna()
        X_val = X.loc[Y_val.index]
        
        model.fit(X_train, Y_train)
        preds = model.predict(X_val)
        score = mae(Y_val, preds)
        scores.append(score)
        
        df = X_val[['time_id', 'is_consumption', 'is_business', 'product_type', 'county']]
        df['preds'] = preds
        df['target'] = Y_val
        df_oof = pd.concat([df_oof, df], axis=0)
        
    return np.array(scores), df_oof


def score_cv(X, Y, factor, splitter, model):
    scores = []
    
    df_oof = pd.DataFrame()
    
    for fold, (idx_train, idx_val) in enumerate(splitter.split(X)):
        X_train = X.loc[idx_train].dropna()
        Y_train = Y.loc[X_train.index]
        Y_val = Y.loc[idx_val].dropna()
        X_val = X.loc[Y_val.index]
        factor_val = factor.loc[Y_val.index]
        
        model.fit(X_train, Y_train)
        preds = model.predict(X_val)
        score = mae(Y_val*factor_val, preds*factor_val)
        scores.append(score)
        
        df = X_val[['time_id', 'is_consumption', 'is_business', 'product_type', 'county']]
        df['preds'] = preds*factor_val
        df['target'] = Y_val*factor_val
        df_oof = pd.concat([df_oof, df], axis=0)
        
    return np.array(scores), df_oof



def score_dif(X, Y, A, splitter, model):
    scores = []
    
    df_oof = pd.DataFrame()
    
    for fold, (idx_train, idx_val) in enumerate(splitter.split(X)):
        X_train = X.loc[idx_train].dropna()
        Y_train = Y.loc[X_train.index]
        Y_val = Y.loc[idx_val].dropna()
        X_val = X.loc[Y_val.index]
        a = A.loc[Y_val.index]
        
        model.fit(X_train, Y_train)
        preds = model.predict(X_val)
        score = mae((Y_val+a)*b, (preds+a)*b)
        scores.append(score)
        
        df = X_val[['time_id', 'is_consumption', 'is_business', 'product_type', 'county']]
        df['preds'] = preds+a
        df['target'] = Y_val+a
        df_oof = pd.concat([df_oof, df], axis=0)
        
    return np.array(scores), df_oof


def print_mae_sep(df_oof):
    for c in [0,1]:
        for b in [0,1]:
            df = df_oof.query(f'(is_consumption=={c}) & (is_business=={b})')    
            print(c, b, end='       ')
            for fold in range(4):
                offset = df['time_id'].max() - (4 - fold) * 3*24*30
                dd = df.loc[(df['time_id'] >= offset) & (df['time_id'] < offset + 3*24*30)]
                print(f"{mae(dd['target'], dd['preds']):.3f}".rjust(10), end='    ')
            print(f"        {mae(df['target'], df['preds']):.3f}")

In [5]:
%%time
df_weather_station_to_county_mapping = pd.read_csv('data/weather_station_to_county_mapping.csv').dropna()
    
df_weather_station_to_county_mapping['lat_lon'] = df_weather_station_to_county_mapping.apply(lambda row: \
                                                     f'{row["latitude"]:.1f}_{row["longitude"]:.1f}', axis=1)
dict_county = df_weather_station_to_county_mapping[['lat_lon', 'county']].set_index('lat_lon').to_dict()['county']

CPU times: total: 0 ns
Wall time: 4.91 ms


In [6]:
%%time

df_gas_prices = pd.read_csv('data/gas_prices.csv')
df_electricity_prices = pd.read_csv('data/electricity_prices.csv')
df_client = pd.read_csv('data/client.csv')
df_train = pd.read_csv('data/train.csv')
df_forecast_weather = pd.read_csv('data/forecast_weather.csv')
df_historical_weather = pd.read_csv('data/historical_weather.csv')
df = get_features(df_train, df_client, df_gas_prices, df_electricity_prices, df_forecast_weather, df_historical_weather, dict_county)
del df_forecast_weather, df_historical_weather
gc.collect()

CPU times: total: 1min 22s
Wall time: 1min 36s


53

In [7]:
df

Unnamed: 0,time_id,data_block_id,is_consumption,is_business,product_type,county,eic_count,installed_capacity,target_48,target_168,...,cloudcover_mid_y_1,cloudcover_total_y_1,10_metre_u_wind_component_y_1,10_metre_v_wind_component_y_1,direct_solar_radiation_y_1,surface_solar_radiation_downwards_y_1,snowfall_y_1,total_precipitation_y_1,target_ratio,target
5856,48,2,0,0,1,0,108.0,952.89,0.713,,...,0.661287,0.728578,7.299380,-0.873767,0.000000,0.000000,0.0,0.00022,,0.793
5857,48,2,1,0,1,0,108.0,952.89,96.590,,...,0.661287,0.728578,7.299380,-0.873767,0.000000,0.000000,0.0,0.00022,,107.129
5858,48,2,0,0,2,0,17.0,166.40,0.000,,...,0.661287,0.728578,7.299380,-0.873767,0.000000,0.000000,0.0,0.00022,,0.000
5859,48,2,1,0,2,0,17.0,166.40,17.314,,...,0.661287,0.728578,7.299380,-0.873767,0.000000,0.000000,0.0,0.00022,,19.630
5860,48,2,0,0,3,0,688.0,7207.88,2.904,,...,0.661287,0.728578,7.299380,-0.873767,0.000000,0.000000,0.0,0.00022,,0.977
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018347,15311,637,1,1,0,15,15.0,620.00,188.167,415.530,...,0.027475,0.456015,4.466787,-2.069000,3.715556,0.413333,0.0,0.00000,0.985352,197.233
2018348,15311,637,0,1,1,15,20.0,624.50,0.000,0.000,...,0.027475,0.456015,4.466787,-2.069000,3.715556,0.413333,0.0,0.00000,0.000000,0.000
2018349,15311,637,1,1,1,15,20.0,624.50,31.484,31.286,...,0.027475,0.456015,4.466787,-2.069000,3.715556,0.413333,0.0,0.00000,0.942943,28.404
2018350,15311,637,0,1,3,15,55.0,2188.20,0.000,0.000,...,0.027475,0.456015,4.466787,-2.069000,3.715556,0.413333,0.0,0.00000,0.000000,0.000


In [8]:
df_corr = compute_correlations(df.iloc[:, :-1], df['target'])

In [9]:
df_corr.sort_values('corr', ascending=False).head(30)

Unnamed: 0,feature,corr
9,target_168,0.967732
10,target_336,0.962427
11,target_504,0.956631
17,target_144,0.951987
8,target_48,0.936987
16,target_120,0.932641
12,target_49,0.932501
14,target_72,0.929452
15,target_96,0.927719
13,target_50,0.92187


# Single model

## no target transform

In [10]:
%%time
print('-'*100)

X = df.copy()
Y = X.pop('target')

splitter = SplitKFold(n_splits=4)
model = LGB(random_state=SEED, n_estimators=500, max_depth=5, num_leaves=31, objective='mae', device='gpu', verbose=-100)
scores, df_oof_single_orig = eval(X, Y, splitter, model)

for fold, s in enumerate(scores):
    print(f'\t{TXT_ACC} Fold {fold} {TXT_RESET}      {s:.3f}')
print(f'Mean score {scores.mean():.3f}')
print_mae_sep(df_oof_single_orig)

----------------------------------------------------------------------------------------------------
	[1m[38;5;254m[48;5;240m Fold 0 [0m      58.235
	[1m[38;5;254m[48;5;240m Fold 1 [0m      41.052
	[1m[38;5;254m[48;5;240m Fold 2 [0m      42.768
	[1m[38;5;254m[48;5;240m Fold 3 [0m      79.628
Mean score 55.421
0 0           50.245        23.542         9.139        96.067            44.369
0 1           57.129        19.334         5.785        83.567            41.291
1 0            9.849        20.013        35.600        32.137            24.411
1 1          106.866        95.063       114.728       101.710            104.518
CPU times: total: 7min 19s
Wall time: 4min 12s


## target / installed_capacity

In [11]:
%%time
print('-'*100)

X = df.copy()
Y = X.pop('target') / X['installed_capacity']

splitter = SplitKFold(n_splits=4)
model = LGB(random_state=SEED, n_estimators=500, max_depth=5, num_leaves=31, objective='mae', device='gpu', verbose=-100)
scores, df_oof_single = score_cv(X, Y, X['installed_capacity'], splitter, model)

for fold, s in enumerate(scores):
    print(f'\t{TXT_ACC} Fold {fold} {TXT_RESET}      {s:.3f}')
print(f'Mean score {scores.mean():.3f}')
print_mae_sep(df_oof_single)

----------------------------------------------------------------------------------------------------
	[1m[38;5;254m[48;5;240m Fold 0 [0m      59.973
	[1m[38;5;254m[48;5;240m Fold 1 [0m      42.897
	[1m[38;5;254m[48;5;240m Fold 2 [0m      47.644
	[1m[38;5;254m[48;5;240m Fold 3 [0m      82.348
Mean score 58.216
0 0           46.136        25.560        10.801        65.970            36.868
0 1           56.608        21.550         9.093        72.716            39.846
1 0           12.687        20.758        36.544        34.393            26.095
1 1          114.819        97.345       127.379       145.920            121.176
CPU times: total: 7min 55s
Wall time: 3min 54s


# Dual model

1. Model for is_consumption==0
2. Model for is_consumption==1

## original target

In [12]:
%%time
X = df.query('is_consumption==0')
Y = X.pop('target') 

splitter = SplitKFold(n_splits=4)
model = LGB(random_state=SEED, n_estimators=500, max_depth=5, num_leaves=31, objective='mae', device='gpu', verbose=-100)

scores_0, df_oof_dual_0 = eval(X, Y, splitter, model)



X = df.query('is_consumption==1')
Y = X.pop('target') 

splitter = SplitKFold(n_splits=4)
model = LGB(random_state=SEED, n_estimators=500, max_depth=5, num_leaves=31, objective='mae', device='gpu', verbose=-100)

scores_1, df_oof_dual_1 = eval(X, Y, splitter, model)

df_oof_dual_orig = pd.concat([df_oof_dual_0, df_oof_dual_1], axis=0)
print_mae_sep(df_oof_dual_orig)

0 0           48.961        19.808         8.352        93.306            42.226
0 1           52.343        16.109         5.357        79.261            38.104
1 0           11.125        18.104        32.518        30.530            23.070
1 1          101.115        87.398        98.664        97.795            96.177
CPU times: total: 8min 38s
Wall time: 4min 3s


## target / installed_capacity for is_consumption==0

In [13]:
%%time
X = df.query('is_consumption==0')
Y = X.pop('target') / X['installed_capacity']

splitter = SplitKFold(n_splits=4)
model = LGB(random_state=SEED, n_estimators=500, max_depth=5, num_leaves=31, objective='mae', device='gpu', verbose=-100)

scores_0, df_oof_dual_0 = score_cv(X, Y, X['installed_capacity'], splitter, model)



X = df.query('is_consumption==1')
Y = X.pop('target') 

splitter = SplitKFold(n_splits=4)
model = LGB(random_state=SEED, n_estimators=500, max_depth=5, num_leaves=31, objective='mae', device='gpu', verbose=-100)

scores_1, df_oof_dual_1 = eval(X, Y, splitter, model)

df_oof_dual = pd.concat([df_oof_dual_0, df_oof_dual_1], axis=0)
print_mae_sep(df_oof_dual)

0 0           44.966        19.615         7.428        65.636            34.140
0 1           52.033        17.399         6.073        68.707            35.909
1 0           11.125        18.104        32.518        30.530            23.070
1 1          101.115        87.398        98.664        97.795            96.177
CPU times: total: 8min 54s
Wall time: 4min 43s


## new_target = target - target_lag_48h

In [18]:
%%time

splitter = SplitKFold(n_splits=4)
model = LGB(random_state=SEED, n_estimators=500, max_depth=5, num_leaves=31, objective='mae', device='gpu', verbose=-100)

X = df.query('is_consumption==0')
Y = X.pop('target') - X['target_48']
scores_0, df_oof_dual_0_dif = score_dif(X, Y, X['target_48'], splitter, model)


X = df.query('is_consumption==1')
Y = X.pop('target') - X['target_48']
scores_1, df_oof_dual_1_dif = score_dif(X, Y, X['target_48'], splitter, model)

df_dual_diff = pd.concat([df_oof_dual_0_dif, df_oof_dual_1_dif], axis=0)
print_mae_sep(df_dual_diff) 

0 0           43.630        20.250         8.486        82.804            38.452
0 1           51.167        17.345         5.453        69.986            35.837
1 0            8.372        17.955        32.984        26.912            21.576
1 1           93.143        84.839        95.275        96.603            92.402
CPU times: total: 7min 41s
Wall time: 4min 19s


# 4-model solution

## original target

In [15]:
%%time

df_oofs = []
splitter = SplitKFold(n_splits=4)

for c in [0,1]:
    for b in [0,1]:

        X = df.query(f'(is_consumption=={c}) & (is_business=={b})')
        Y = X.pop('target') 
        scores, df_oof = eval(X, Y, splitter, LGB(random_state=SEED, n_estimators=100, max_depth=5, num_leaves=31, objective='mae', device='gpu', verbose=-100))
        df_oofs.append(df_oof)

df_oof_quad_orig = pd.concat(df_oofs, axis=0)
print_mae_sep(df_oof_quad_orig)

0 0           43.887        20.968         8.447        83.423            38.854
0 1           51.320        16.387         5.569        70.101            35.701
1 0            7.824        23.114        45.637        24.067            25.243
1 1          104.210        96.220       120.922       105.136            106.543
CPU times: total: 4min 9s
Wall time: 2min 26s


## transformed target

In [16]:
%%time

df_oofs = []
splitter = SplitKFold(n_splits=4)

for c in [0,1]:
    for b in [0,1]:

        X = df.query(f'(is_consumption=={c}) & (is_business=={b})')

        if not ((c==1) and (b==1)):
            Y = X.pop('target') / X['installed_capacity'] 
            scores, df_oof = score_cv(X, Y, X['installed_capacity'], splitter, LGB(random_state=SEED, n_estimators=100, max_depth=5, num_leaves=31, objective='mae', device='gpu', verbose=-100))
            df_oofs.append(df_oof)
        else:
            Y = X.pop('target') 
            scores, df_oof = eval(X, Y, splitter, LGB(random_state=SEED, n_estimators=100, max_depth=5, num_leaves=31, objective='mae', device='gpu', verbose=-100))
            df_oofs.append(df_oof)

df_oof_quad = pd.concat(df_oofs, axis=0)
print_mae_sep(df_oof_quad)

0 0           42.948        17.417         7.198        50.399            29.273
0 1           46.927        15.543         5.184        61.323            32.112
1 0           15.462        18.974        26.892        20.975            20.594
1 1          104.210        96.220       120.922       105.136            106.543
CPU times: total: 4min 6s
Wall time: 2min 28s


## new_target = target - target_lag_48h

In [17]:
%%time

df_oofs = []
splitter = SplitKFold(n_splits=4)

for c in [0,1]:
    for b in [0,1]:

        X = df.query(f'(is_consumption=={c}) & (is_business=={b})')
        Y = X.pop('target') - X['target_48']
        scores, df_oof = score_dif(X, Y, X['target_48'], splitter, LGB(random_state=SEED, n_estimators=100, max_depth=5, num_leaves=31, objective='mae', device='gpu', verbose=-100))
        df_oofs.append(df_oof)

df_oof_quad_orig = pd.concat(df_oofs, axis=0)
print_mae_sep(df_oof_quad_orig)

0 0           45.390        22.287         8.358        72.204            36.767
0 1           51.733        17.727         5.437        63.342            34.427
1 0            8.110        18.504        34.813        28.412            22.480
1 1           98.408        98.022       107.836       105.944            102.507
CPU times: total: 4min 2s
Wall time: 2min 23s
