#### Auxiliary notebook, for filling in the NaN weather values

# QRT ENS Data Challenge 2023 (Filling NaN Weather)
## Data Specs (Copied from Official)
- `X_train` and `X_test` both have $35$ columns that represent the same explanatory variables but over different time periods. 

- `X_train` and `Y_train` share the same column `ID` - each row corresponds to a unique ID associated with a day and a country. 

- The target of this challenge `TARGET` in `Y_train` corresponds to the price change for daily futures contracts of 24H electricity baseload. 

- **You will notice some columns have missing values**.

Input data sets comprise 35 columns:

ID: Unique row identifier, associated with a day (DAY_ID) and a country (COUNTRY),

DAY_ID: Day identifier - dates have been anonymized, but all data corresponding to a specific day is consistent,

COUNTRY: Country identifier - DE = Germany, FR = France,
and then contains daily commodity price variations,

GAS_RET: European gas,

COAL_RET: European coal,

CARBON_RET: Carbon emissions futures,

#### Weather measures (daily, in the country x)

x_TEMP: Temperature,

x_RAIN: Rainfall,

x_WIND: Wind,


#### Energy production measures (daily, in the country x)

x_GAS: Natural gas,

x_COAL: Hard coal,

x_HYDRO: Hydro reservoir,

x_NUCLEAR: Daily nuclear production,

x_SOLAR: Photovoltaic,

x_WINDPOW: Wind power,

x_LIGNITE: Lignite,

#### Electricity use metrics (daily, in the country x)

x_CONSUMPTION: Total electricity consumption,

x_RESIDUAL_LOAD: Electricity consumption after using all renewable energies,

x_NET_IMPORT: Imported electricity from Europe,

x_NET_EXPORT: Exported electricity to Europe,

DE_FR_EXCHANGE: Total daily electricity exchange between Germany and France,

FR_DE_EXCHANGE: Total daily electricity exchange between France and Germany.

Output data sets are composed of two columns:

ID: Unique row identifier - corresponding to the input identifiers,

TARGET: Daily price variation for futures of 24H electricity baseload.

In [None]:
import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing (stand-in for ``warnings.warn``)."""
    return None


# Swap the library's warning hook for the no-op above so that code calling
# ``warnings.warn`` through the module attribute stays silent in this notebook.
warnings.warn = warn

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import itertools as it
from scipy.stats import spearmanr
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import mutual_info_regression as mir
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import OneHotEncoder as onehot
from sklearn import linear_model
from sklearn.cluster import KMeans
from sklearn.neural_network import MLPRegressor
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from statsmodels.tsa.deterministic import DeterministicProcess
import xgboost as xgb
import lightgbm as lgb
from itertools import product
from scipy import signal
from utils import *

In [None]:
# Show every column when a frame is displayed.
pd.set_option('display.max_columns', None)

# Training features/targets are indexed by ID and sorted; the held-out test
# features are indexed by ID but left in file order.
df = pd.read_csv('x_train.csv').set_index('ID').sort_index()
y = pd.read_csv('y_train.csv').set_index('ID').sort_index()
true_test = pd.read_csv('x_test.csv').set_index('ID')

# Per-country views of the training data, with the matching target rows.
de = df[df['COUNTRY'] == 'DE']
fr = df[df['COUNTRY'] == 'FR']
y_de = y[y.index.isin(de.index)]
y_fr = y[y.index.isin(fr.index)]

# Seeded random hold-out split over both countries.
bx_train, bx_test, by_train, by_test = train_test_split(
    df,
    y,
    test_size=0.33,
    random_state=88,
)

# Numeric-only frame with per-column median imputation.
# NOTE(review): the dropped exchange/export columns are presumably redundant
# with their DE_FR_EXCHANGE / *_NET_IMPORT counterparts - confirm.
non_numeric_or_redundant = ['DAY_ID', 'COUNTRY', 'FR_DE_EXCHANGE', 'FR_NET_EXPORT', 'DE_NET_EXPORT']
numeric_raw = df.drop(columns=non_numeric_or_redundant)
df_numeric = numeric_raw.fillna(numeric_raw.median(numeric_only=True))

# The six weather columns whose missing values this notebook fills in.
weather_vars = ['DE_RAIN', 'DE_WIND', 'DE_TEMP', 'FR_RAIN', 'FR_WIND', 'FR_TEMP']

In [None]:
# Report every place where two consecutive German row IDs are more than 2
# apart. Each printed line is "<position>, <ID before gap>, <ID after gap>".
ls = de.index
for i, (id_before, id_after) in enumerate(zip(ls[:-1], ls[1:])):
    if id_after - id_before > 2:
        print(f'{i}, {id_before}, {id_after}')

In [None]:
# Scatter FR_TEMP against row ID for all training rows (both countries).
sns.scatterplot(x=df.index, y=df['FR_TEMP'])

In [None]:
# Scatter DE_WIND against row ID for the German training rows only.
sns.scatterplot(x=de.index, y=de['DE_WIND'])

In [None]:
# DE_WIND against row ID for the German rows of the held-out test set.
test_de_rows = true_test[true_test['COUNTRY'] == 'DE']
sns.scatterplot(x=test_de_rows.index, y=test_de_rows['DE_WIND'])

In [None]:
# Working copy of the German rows with the target attached (aligned on ID).
temp_de = de.assign(TARGET=y_de['TARGET'])

# One separate figure per German weather variable, plotted against row ID.
for col in ('DE_RAIN', 'DE_WIND', 'DE_TEMP'):
    plt.figure()
    sns.scatterplot(x=temp_de.index, y=temp_de[col])

In [None]:
# Look up DE_RAIN at ID 439, the upper bound of the fit window used below.
temp_de['DE_RAIN'].loc[439]

In [None]:
# Seasonal baseline for German rainfall: regress DE_RAIN on deterministic
# terms (constant, linear trend, two Fourier pairs with period 52) and keep
# the residual as the de-seasoned series.
df_rain = de['DE_RAIN'].copy()
dp = DeterministicProcess(
    index=df_rain.index,
    constant=True,
    order=1,
    period=52,
    fourier=2,
)

# IDs up to 439 form the fit window, IDs from 509 onwards the evaluation window.
seasonal_terms = dp.in_sample()
x_fit, y_fit = seasonal_terms.loc[:439], df_rain.loc[:439]
x_test, y_test = seasonal_terms.loc[509:], df_rain.loc[509:]

ridge = linear_model.Ridge(alpha=3)
ridge.fit(x_fit, y_fit)
y_deseasoned = y_fit - ridge.predict(x_fit)

# Optional diagnostic (disabled): de-seasoned series above the raw series.
# plt.clf()
# fig, ax = plt.subplots(2)
# ax[0].set_ylabel('DE_RAIN'), ax[1].set_ylabel('DE_RAIN')
# sns.lineplot(x=x_fit.index, y=y_deseasoned, ax=ax[0])
# sns.lineplot(x=x_fit.index, y=y_fit, ax=ax[1])

In [None]:
# Predict German rainfall on the later ID window from three feature groups:
# the deterministic seasonal terms, a one-step lag of rainfall itself, and
# every other column of the German frame.
#
# The seasonal slices are copied explicitly: LAG_1 is added to them below, and
# assigning a column onto a bare .loc slice is the SettingWithCopy pattern
# (the warning is only invisible here because warnings are silenced).
x_fit = dp.in_sample().loc[:439].copy()
y_fit = df_rain.loc[:439]
x_test = dp.in_sample().loc[509:].copy()
y_test = df_rain.loc[509:]

# Previous row's rainfall; gaps in the lag (including the first row of each
# window) are filled with that window's own mean.
x_fit['LAG_1'] = y_fit.shift(1).fillna(y_fit.mean())
x_test['LAG_1'] = y_test.shift(1).fillna(y_test.mean())

# Remaining German columns, minus identifiers and the target itself.
# NOTE(review): assumes these columns have no NaNs inside the two windows,
# otherwise Ridge.fit / predict will reject them - confirm.
de_rain_fit = de.loc[:439].drop(['DAY_ID', 'COUNTRY', 'DE_RAIN'], axis=1)
de_rain_test = de.loc[509:].drop(['DAY_ID', 'COUNTRY', 'DE_RAIN'], axis=1)
x_fit = pd.concat([de_rain_fit, x_fit], axis=1)
x_test = pd.concat([de_rain_test, x_test], axis=1)

ridge = linear_model.Ridge(alpha=1)
ridge.fit(x_fit, y_fit)
print(metric_train(ridge.predict(x_test), y_test))
sns.lineplot(x=y_fit.index, y=y_fit)

In [None]:
# Evaluate a ridge model together with an XGBoost model (passed as model_1)
# on the rainfall windows, with detailed output and a residual plot.
xgb_model = xgb.XGBRegressor(
    objective="reg:squarederror",
    gamma=1,
    random_state=88,
)
ridge = linear_model.Ridge(alpha=1)
test_model(
    ridge,
    x_fit,
    x_test,
    y_fit,
    y_test,
    model_1=xgb_model,
    detailed=True,
    graph_residuals=True,
)

In [None]:
# Display the German training rows.
de

In [None]:
# Mutual information between every German-frame column and each French
# weather variable, tabulated next to the column names.
#
# Features and target are taken from the SAME NaN-free frame. Dropping NaNs
# from X (any column) and from y (one column) independently can leave them
# with different row sets, which mutual_info_regression cannot accept.
features = de.drop(['COUNTRY'], axis=1).dropna()
cols = pd.DataFrame(features.columns)
for w in ['RAIN', 'WIND', 'TEMP']:
    # NOTE: the target column is itself one of the features, so its own row
    # in the table is the variable's information with itself.
    cols[f'MI_FR_{w}'] = mir(X=features, y=features[f'FR_{w}'])
cols

In [None]:
# Lomb-Scargle periodogram of every weather variable, computed on the French
# rows (top panel) and on the German rows (bottom panel), using the row ID as
# the sampling position. Frequencies whose power exceeds 5 are printed.
fs = np.linspace(0.1, 500, 1000)
for col in weather_vars:
    # plt.subplots creates its own figure; a separate plt.figure() call here
    # would only open an additional empty figure on every iteration.
    fig, axs = plt.subplots(2)

    for ax, (label, frame) in zip(axs, (('FR', fr), ('DE', de))):
        series = frame[col].dropna()  # drop NaNs once, reuse for x and y
        pgram = signal.lombscargle(x=series.index, y=series, freqs=fs)
        print([fs[i] for i in np.argwhere(pgram > 5).ravel()])
        panel = sns.lineplot(x=fs, y=pgram, ax=ax)
        panel.set(xlabel=f'{col} for {label}')

    plt.show()

In [None]:
# Non-null count per column of the German rows (shows which columns have gaps).
de.count()

In [None]:
# Columns to be imputed; they are removed from the feature frames below.
weather_targets = ['DE_WIND', 'DE_RAIN', 'DE_TEMP', 'FR_WIND', 'FR_RAIN', 'FR_TEMP']

# German rows after the project helpers fourier_features / country_flow,
# without the identifier columns.
df_weather = country_flow(fourier_features(de)).drop(['DAY_ID', 'COUNTRY'], axis=1)

# Rows with a known DE_RAIN are used for training; rows where it is missing
# are the ones to predict.
# NOTE(review): presumably all six weather columns are missing on the same
# rows, so DE_RAIN serves as the marker for all of them - confirm.
has_rain = df_weather['DE_RAIN'].notna()

y = df_weather.loc[has_rain, weather_targets]
weather_train = df_weather[has_rain].drop(weather_targets, axis=1)
weather_test = df_weather[~has_rain].drop(weather_targets, axis=1)

### Feature Engineering

In [None]:
# Display the weather targets of the training rows.
y

### Models

In [None]:
# Weather column evaluated by the model cells that follow.
curr = 'FR_TEMP'
# Full table of weather targets; the model cells pick out target[curr].
target = y
# Feature matrix from the project helper lag_shift, called with [0, 1, 2]
# (presumably the lags to add - confirm against utils).
x = lag_shift(weather_train, np.arange(3))
# de: ridge alpha 40, xgb gamma 25

In [None]:
# Display the training feature frame (weather columns removed).
weather_train

In [None]:
# x = (x - x.mean()) / x.std()

In [None]:
# Split only after the features have been computed. shuffle=False keeps the
# rows in their existing order, so the final 33% of rows become the test set
# (random_state plays no role when nothing is shuffled).
x_train, x_test, y_train, y_test = train_test_split(
    x,
    target[curr],
    test_size=0.33,
    shuffle=False,
    random_state=88,
)

#### Baseline Linear Regression

In [None]:
# Plain least-squares baseline, scored with the project's test_model helper.
lr = LinearRegression()
train_result, test_result = test_model(
    lr,
    x_train,
    x_test,
    y_train,
    y_test,
)

#### Ridge Regression

In [None]:
# Lightly regularised ridge model, scored without printing or plotting.
ridge = linear_model.Ridge(alpha=0.1)
train_result, test_result = test_model(
    ridge,
    x_train,
    x_test,
    y_train,
    y_test,
    print_output=False,
    graph_residuals=False,
)

#### XGB Regression

In [None]:
# Gradient-boosted trees with squared-error objective and a fixed seed,
# scored without printing or plotting.
xgb_model = xgb.XGBRegressor(
    objective="reg:squarederror",
    gamma=10,
    random_state=88,
)
train_result, test_result = test_model(
    xgb_model,
    x_train,
    x_test,
    y_train,
    y_test,
    print_output=False,
    graph_residuals=False,
)

In [None]:
# 5-fold shuffled cross-validation of the XGBoost model on the current
# target column. The splitter is seeded with the same value (88) used by the
# other randomised components of this notebook, so that the folds - and
# therefore the reported scores - are reproducible between runs.
kf = KFold(n_splits=5, shuffle=True, random_state=88)
kf_test_model(kf, xgb_model, x, target, wind_excess=False, target_col=curr)

#### Ridge Regression - XGB Hybrid

In [None]:
# Ridge model evaluated together with an XGBoost model (passed as model_1),
# with detailed results and a residual plot but no printed output.
xgb_model = xgb.XGBRegressor(
    objective="reg:squarederror",
    gamma=6.8,
    random_state=88,
)
ridge = linear_model.Ridge(alpha=0.1)
train_result, test_result = test_model(
    ridge,
    x_train,
    x_test,
    y_train,
    y_test,
    model_1=xgb_model,
    detailed=True,
    print_output=False,
    graph_residuals=True,
)

### Output

In [None]:
# Container for the imputed values: the weather columns of the German rows
# whose DE_WIND is missing. A single .loc selection plus an explicit copy
# gives an independent frame, so the column assignments made further down
# are not writes into a chained-indexing slice of `de`.
# NOTE(review): the rows to predict are chosen elsewhere by DE_RAIN being
# missing; this assumes DE_WIND is missing on exactly the same rows - confirm.
weather_forecast = de.loc[de['DE_WIND'].isna(), weather_vars].copy()

#### DE_WIND

In [None]:
# Fit on every row with known weather, then fill DE_WIND for the missing rows.
xgb_model = xgb.XGBRegressor(
    objective="reg:squarederror",
    gamma=3,
    random_state=88,
)
xgb_model.fit(weather_train, y['DE_WIND'])
de_wind_pred = xgb_model.predict(weather_test)
weather_forecast['DE_WIND'] = de_wind_pred

#### FR_WIND

In [None]:
# Fit on every row with known weather, then fill FR_WIND for the missing rows.
xgb_model = xgb.XGBRegressor(
    objective="reg:squarederror",
    gamma=0.5,
    random_state=88,
)
xgb_model.fit(weather_train, y['FR_WIND'])
fr_wind_pred = xgb_model.predict(weather_test)
weather_forecast['FR_WIND'] = fr_wind_pred

In [None]:
# Display the forecast frame after the wind columns have been filled in.
weather_forecast

In [None]:
# Observed DE_WIND values combined with the imputed ones, ordered by ID, to
# check visually that the filled-in points blend with the observed series.
# pd.concat is used because Series.append was removed in pandas 2.0.
de_wind = pd.concat(
    [de.loc[de['DE_WIND'].notna(), 'DE_WIND'], weather_forecast['DE_WIND']]
).sort_index()
sns.scatterplot(x=de_wind.index, y=de_wind)

In [None]:
# Observed FR_WIND values (taken from the German rows) combined with the
# imputed ones, ordered by ID, for the same visual check.
# pd.concat is used because Series.append was removed in pandas 2.0.
fr_wind = pd.concat(
    [de.loc[de['FR_WIND'].notna(), 'FR_WIND'], weather_forecast['FR_WIND']]
).sort_index()
sns.scatterplot(x=fr_wind.index, y=fr_wind)