In [121]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import sklearn
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


## Data Import

In [122]:
x_train = pd.read_csv('data/x_train.csv')
y_train = pd.read_csv('data/y_train.csv')
x_test = pd.read_csv('data/x_test.csv')

In [123]:
x_train['target'] = y_train['TARGET']
def clean_df(df, country):
    df = df.set_index(keys='ID')

    values = df[df['COUNTRY'] == country.upper()]
    res = pd.DataFrame()
    for c in values.columns:
        if c[:3] == f'{country.upper()}_':
            res[c[3:]] = values[c]

    for data_c in ['GAS_RET', 'COAL_RET', 'CARBON_RET']:
        res[data_c] = values[data_c]

    if 'target' in df.columns:
        res['target'] = df['target']

    if country == 'de':
        res['LIGNITE'] = values['DE_LIGNITE']

    return res

df = clean_df(x_train, 'de')

# change FR_EXCHANGE to DE_EXCHANGE if country == 'fr' and viceversa
independent_variables = ['CONSUMPTION', 'FR_EXCHANGE', 'NET_EXPORT', 'GAS', 'COAL', 'HYDRO', 'NUCLEAR', 'SOLAR', 'WINDPOW', 'RESIDUAL_LOAD', 'RAIN', 'WIND', 'TEMP', 'GAS_RET', 'COAL_RET', 'CARBON_RET']
dependent_variables = ['target']

### Calculate output value for ranges

### Stratify into train/test sets

In [124]:
def generate_quantiles(df, n):

    step = 1 / n

    intervals_raw = [np.quantile(df['target'], i) for i in np.arange(step, 1.0, step)]
    intervals_ = [float('-inf')] + intervals_raw + [float('inf')]


    intervals = dict()

    for b_val, t_val in zip(intervals_, intervals_[1:]):
        indices = np.where(np.logical_and(df['target'] > b_val, df['target'] <= t_val))
        interval_mean = round(df['target'].iloc[indices].mean(), 3)
        intervals[interval_mean] = b_val, t_val
    
    return intervals



def discretize_df(df,n):
        for index, value in df['target'].items():
            for disc, (b_range, t_range) in generate_quantiles(df,n).items():
                if b_range < value <= t_range:
                    df.loc[index, 'disc_target'] = disc
                    break


### GridSearchCV for parameter hypertuning

In [126]:
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV

# might take a while
# ----------------------
param_grid = {
    'objective': ['mape', 'mae'],
    'boosting_type': ['gbdt', 'dart'],  
    'bagging_fraction': [0.5, 0.7],
    'learning_rate': [0.1, 0.05],
    'max_bin': [20, 25, 30],
    'max_depth': [10, 15, 20],
    'min_data_in_leaf': [30, 40, 50],
    'min_sum_hessian_in_leaf': [50, 100.0],
    'num_leaves': [50, 75],
    'subsample': [0.5, 0.2],
}

lgb_model = lgb.LGBMRegressor(verbose=-1, is_unbalance=True, boost_from_average=False)

grid_search = GridSearchCV(lgb_model, param_grid, scoring='neg_mean_absolute_percentage_error', cv=5, verbose=1)

grid_search.fit(df[independent_variables],df['target']) 

best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

Fitting 5 folds for each of 3456 candidates, totalling 17280 fits
Best Hyperparameters: {'bagging_fraction': 0.5, 'boosting_type': 'dart', 'learning_rate': 0.05, 'max_bin': 20, 'max_depth': 10, 'min_data_in_leaf': 30, 'min_sum_hessian_in_leaf': 100.0, 'num_leaves': 50, 'objective': 'mae', 'subsample': 0.5}


### Estimate model performance by iteratively fit the model on different validation sets

In [130]:

best_params['verbose'] = -1
best_params['is_unbalance'] = True
best_params['boost_from_average'] = False

n = 30
res = []
for i in range(n):

    
    discretize_df(df, 12)

    train, test = train_test_split(df, test_size=0.2, stratify=df['disc_target'])
    train_d = lgb.Dataset(df[independent_variables], label = df['target'])
    test_d = lgb.Dataset(df[independent_variables], label = df['target']) 



    num_boost_round = 20  # Keep low to prevent overfitting
    model = lgb.train(best_params, train_d, num_boost_round=num_boost_round)

    outputs = model.predict(test[independent_variables], num_iteration=model.best_iteration)


    from scipy.stats import spearmanr

    spearman_ = spearmanr(outputs, test['target']).correlation
    pearson_ = np.corrcoef(outputs,  test['target'])[0,1]
    res.append(spearman_)

    print(f'Spearman: {spearman_}, Pearson: {pearson_}')


# Average spearman correlation over n fits
print(np.mean(res))



Spearman: 0.5163865836389324, Pearson: 0.29238692063635807
Spearman: 0.42173445742527554, Pearson: 0.37718516091837845
Spearman: 0.43171243013582344, Pearson: 0.2826871855933726
Spearman: 0.36811153259466406, Pearson: 0.35107481107632094
Spearman: 0.4754605832783587, Pearson: 0.38410025769416084
Spearman: 0.37011530092551786, Pearson: 0.21278716335651993
Spearman: 0.46064808397822987, Pearson: 0.3684393966059429
Spearman: 0.44252836935411044, Pearson: 0.3578060855814162
Spearman: 0.4461626294882926, Pearson: 0.33885870386743133
Spearman: 0.5057840068365139, Pearson: 0.34085322345274144
Spearman: 0.437620955153927, Pearson: 0.40947807960083143
Spearman: 0.3636387836077498, Pearson: 0.18900686063377586
Spearman: 0.31627480795756496, Pearson: 0.21460506323581408
Spearman: 0.4929285714185081, Pearson: 0.3406725161933845
Spearman: 0.44385210131101616, Pearson: 0.2734997420909929
Spearman: 0.5640968571903996, Pearson: 0.41318504078315776
Spearman: 0.5533298581136771, Pearson: 0.3817573558434

## Disclaimer
This notebook shows the model for Germany only. We used different parameters for France but since we were in a rush, so we did not store them.


## Takeaways
- CVGridSearch can be further optimized by sampling the uniform distribution
- Increasing the number of boosting iterations will overfit the model for this dataset
- Low learning rates will get the model stuck in a local minimum

## Further Improvements
- Try to add lags of the most significant predictors
- It can be interesting to use unsupervised learning to cluster the data and try to use cluster labels as predictors
- Using both models and take the average might lead to a more generalized model

