In [None]:
import os

import numpy as np
import pandas as pd

import warnings
import joblib
from tqdm.notebook import tqdm

warnings.filterwarnings("ignore")

from utils import *

In [None]:
# Input and output locations.
# Each one can be overridden with an environment variable so the notebook can
# run on another machine without editing this cell; when the variable is not
# set, the original hard-coded location is used unchanged.
data_folder = os.environ.get(
    "CHALLENGE_DATA_FOLDER",
    "/home/goubetcle/Documents/Pôle/Challenge 2022/data_challenge/data",
)
output_folder = os.environ.get(
    "CHALLENGE_OUTPUT_FOLDER",
    "/home/goubetcle/Documents/Pôle/Challenge 2022/data_challenge/outputs",
)


In [None]:
# Load the forecast table and keep only the four lead times handled by this
# notebook (0.5, 1, 2 and 4).
df_prev = pd.read_feather(os.path.join(data_folder, "df_prev_sans_obs2020.feather"))

kept_horizons = [0.5, 1, 2, 4]
df_prev = df_prev.loc[df_prev.echeance.isin(kept_horizons)].assign(
    # Time of day of the target timestamp as a decimal hour (e.g. 13:30 -> 13.5).
    hh_mm_cible=lambda d: d.date_cible.dt.hour + d.date_cible.dt.minute / 60,
    # Signed difference forecast - observation.
    prev_obs_gap=lambda d: d.prev.values - d.obs.values,
)
df_prev.head(10)

In [None]:
# Reshape the `pi` column of the wind and solar rows with make_zone_columns
# (one output column per `type`); the `pi_eolien` / `pi_photovoltaique`
# columns of the result are merged into the features further down.
# NOTE(review): `pi` is presumably the installed capacity — confirm in the data dictionary.
enr_types = ["eolien", "photovoltaique"]
capacity_cols = ['date_cible', 'echeance', 'type', 'pi']

df_enr_capacity = make_zone_columns(
    df=df_prev.loc[df_prev.type.isin(enr_types), capacity_cols],
    columns='type',
)
df_enr_capacity.describe()

In [None]:
# National weather table: read the CSV, align the timestamp column name with
# the forecast table and parse it as timezone-aware UTC.
meteo_csv = os.path.join(data_folder, "meteo_fr.csv")
df_meteo_fr = pd.read_csv(meteo_csv, header=0, sep=";")
df_meteo_fr = df_meteo_fr.rename(columns={"utc_datetime": "date_cible"})
df_meteo_fr["date_cible"] = pd.to_datetime(df_meteo_fr.date_cible, utc=True)

# The 0.5 lead time is not taken from the file: the rows with lead time 1 are
# duplicated and relabelled 0.5 (1 * 0.5).
df_meteo_halfhour = df_meteo_fr.loc[df_meteo_fr.echeance == 1].copy()
df_meteo_halfhour["echeance"] = df_meteo_halfhour.echeance * 0.5

# Final table: lead times 1, 2 and 4 from the file, followed by the relabelled 0.5 rows.
hourly_rows = df_meteo_fr.loc[df_meteo_fr.echeance.isin([1, 2, 4])]
df_meteo_fr = pd.concat([hourly_rows, df_meteo_halfhour], ignore_index=True)

df_meteo_fr.head()

In [None]:
# Zone-level weather tables (one for wind, one for solar), restricted to the
# lead times handled by this notebook.
zone_horizons = [0.5, 1, 2, 4]

df_meteo_zone_eol = pd.read_feather(os.path.join(data_folder, "meteo_zone_echeance12_2016_2020_HRES_piEOL_smooth.feather"))
df_meteo_zone_pv = pd.read_feather(os.path.join(data_folder, "meteo_zone_echeance12_2016_2020_HRES_piPV_smooth.feather"))

# .copy() so the columns added below are written to an independent frame
# rather than to a slice of the frame that was just read.
df_meteo_zone_eol = df_meteo_zone_eol.loc[df_meteo_zone_eol.echeance.isin(zone_horizons)].copy()
df_meteo_zone_pv = df_meteo_zone_pv.loc[df_meteo_zone_pv.echeance.isin(zone_horizons)].copy()

# Wind direction at 100 m converted from degrees to radians, then encoded as
# cosine / sine. np.pi is used explicitly: no `pi` name is defined in this
# notebook, so the previous bare `pi` could only resolve through the
# `from utils import *` star import.
# NOTE(review): assumes compute_dirwind returns degrees, as implied by the pi / 180 factor.
dirwindeol = compute_dirwind(df_meteo_zone_eol.u100.values, df_meteo_zone_eol.v100.values) * np.pi / 180
df_meteo_zone_eol["cosphi100"] = np.cos(dirwindeol)
df_meteo_zone_eol["sinphi100"] = np.sin(dirwindeol)
# Cube of the 100 m wind speed.
df_meteo_zone_eol["ff100_cubic"] = df_meteo_zone_eol.ff100.values ** 3
df_meteo_zone_eol.head()

In [None]:
# Clear-sky PV reference table, keyed by day of year / hour / minute.
df_prodpv_fc_q90 = pd.read_feather(os.path.join(data_folder, "productionPV_FC_cielclair_q90.feather"))

# Distinct target timestamps of the consumption, wind and solar forecasts at
# the four lead times of interest.
is_kept_horizon = df_prev.echeance.isin([0.5, 1, 2, 4])
is_kept_type = df_prev.type.isin(['consommation', 'eolien', 'photovoltaique'])
target_dates = np.unique(df_prev.loc[is_kept_horizon & is_kept_type][["date_cible"]].values)

# Calendar keys used to look each timestamp up in the clear-sky table.
df_fcpv = pd.DataFrame({"date_cible": target_dates})
df_fcpv = df_fcpv.assign(
    yday_cible=df_fcpv.date_cible.dt.day_of_year,
    hour_cible=df_fcpv.date_cible.dt.hour,
    minute_cible=df_fcpv.date_cible.dt.minute,
)

# Left join: every target timestamp is kept even when the table has no match.
clear_sky_keys = ["yday_cible", "hour_cible", "minute_cible"]
df_fcpv = df_fcpv.merge(df_prodpv_fc_q90, how="left", on=clear_sky_keys)

df_fcpv.head()

In [None]:
def prepare_features(pred_type=None):
    """Build the feature table used to model one forecast type.

    The base table is taken from the notebook-level ``df_prev`` (all types and
    lead times), completed with the target month and with whatever
    ``expand_calendarfeatures`` adds. Extra columns are then left-joined
    depending on ``pred_type``:

    * ``'eolien'``: ``pi_eolien`` from ``df_enr_capacity``.
    * ``'photovoltaique'``: ``pi_photovoltaique`` from ``df_enr_capacity`` and
      ``clear_sky_FC`` from ``df_fcpv``.
    * ``'consommation'``: the national weather table ``df_meteo_fr``.
    * ``'consommation_residuelle'``: ``df_meteo_fr``, every column of
      ``df_enr_capacity`` and ``clear_sky_FC``.
    * anything else (including ``None``): no extra column.

    Parameters
    ----------
    pred_type : str or None
        Forecast type selecting the extra features to join.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df_prev`` itself is not modified. Rows are NOT filtered
        by type or lead time here.

    Notes
    -----
    * Zone-level weather features (``make_zone_columns`` applied to
      ``df_meteo_zone_eol``) were previously sketched for the wind, solar and
      residual branches but are disabled.
    * NOTE(review): ``df_enr_capacity`` must expose ``date_cible`` and
      ``echeance`` as merge keys even after selecting a single column, which
      only works if they are index levels — confirm against
      ``make_zone_columns``.
    * NOTE(review): after the weather merge, the back-fill / forward-fill is
      applied to every column of the merged frame (``obs`` included) and runs
      across rows of different types and lead times.
    """
    merge_keys = ['date_cible', 'echeance']

    # Explicit copy: the column added below must not be written to a slice of df_prev.
    df_features = df_prev[['date_cible','type', 'prev', 'echeance', 'hh_mm_cible', 'obs']].copy()
    df_features['mois_cible'] = df_prev.date_cible.dt.month
    df_features = expand_calendarfeatures(df_features)

    if pred_type == 'eolien':
        df_features = df_features.merge(
            df_enr_capacity[['pi_eolien']], on=merge_keys, how="left", sort=False)
    elif pred_type == 'photovoltaique':
        df_features = df_features.merge(
            df_enr_capacity[['pi_photovoltaique']], on=merge_keys, how="left", sort=False)
        df_features = df_features.merge(df_fcpv[["date_cible", "clear_sky_FC"]], how="left", sort=False, on="date_cible")
    elif pred_type == 'consommation':
        # .bfill().ffill() replaces fillna(method=...), which is deprecated in
        # recent pandas; the filled values are the same.
        df_features = df_features.merge(df_meteo_fr, on=merge_keys, how="left", sort=False).bfill().ffill()
    elif pred_type == 'consommation_residuelle':
        # The fill is applied right after the weather merge, before the
        # capacity and clear-sky columns are joined, so those are not filled.
        df_features = df_features.merge(df_meteo_fr, on=merge_keys, how="left", sort=False).bfill().ffill()
        df_features = df_features.merge(
            df_enr_capacity, on=merge_keys, how="left", sort=False)
        df_features = df_features.merge(df_fcpv[["date_cible", "clear_sky_FC"]], how="left", sort=False, on="date_cible")
    return df_features

In [None]:
# Probability levels of the quantiles to predict: start at 0.005, step 0.005,
# upper bound 1 excluded.
# NOTE: the levels are generated with a floating-point step, and their string
# form is part of the saved-model file names further down, so changing how
# they are generated changes those names.
quantile_target = np.arange(0.005, 1, 0.005)

# Distinct forecast types, lead times and decimal target hours present in
# df_prev (np.unique returns each of them sorted).
pred_types, echeances, hh_mm_target = (
    np.unique(df_prev[column].values) for column in ("type", "echeance", "hh_mm_cible")
)

In [None]:
# Template of the output table: every 2020 forecast row repeated once per
# quantile level, with a placeholder `prev_q` column set to 0.
is_2020 = df_prev.date_cible.dt.year == 2020
rows_2020 = df_prev.loc[is_2020, ['date_cible', 'date_lancement','type', 'echeance', 'hh_mm_cible', 'prev']]

n_examples = rows_2020.shape[0]
n_levels = len(quantile_target)

# The frame is stacked n_levels times, so each level is repeated n_examples
# times in a row (np.repeat) to line up with one full copy of the 2020 rows.
df_quantile_prev_4x199 = pd.concat([rows_2020] * n_levels, ignore_index=True).assign(
    quantile_niveau=np.repeat(quantile_target, n_examples),
    prev_q=np.zeros(n_examples * n_levels),
)
df_quantile_prev_4x199 = df_quantile_prev_4x199[['date_cible', 'date_lancement', 'echeance', 'type', 'prev_q', 'quantile_niveau']]
df_quantile_prev_4x199.head()

# Apprentissage de l'écart à la prévision

In [None]:
import random 

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import make_scorer

# Seeds Python's built-in `random` generator only. The gradient-boosting
# estimators built in this notebook are given random_state=42 explicitly.
# NOTE(review): nothing in the visible cells draws from `random` — confirm whether this seed is still needed.
random.seed(42)

In [None]:
# One "situation" = one (lead time, forecast type, quantile level) triple for
# which a model is trained or loaded.
# The distinct triples are sorted into a list instead of being left in a set:
# a set of tuples containing strings iterates in an order that can change
# from one Python process to the next, and the training loop reuses the
# hyper-parameters found for the first situation it meets for each type.
situations = sorted(set(zip(
    df_quantile_prev_4x199.echeance.values,
    df_quantile_prev_4x199.type.values,
    df_quantile_prev_4x199.quantile_niveau.values,
)))

len(situations)

In [None]:
from dataclasses import dataclass

def prepare_gbreg(quantile):
    """Return an unfitted 5-fold grid search around a quantile gradient-boosting regressor.

    Parameters
    ----------
    quantile : float
        Quantile level; passed as ``alpha`` to the regressor and as ``q`` to
        ``QuantileScore``.

    Returns
    -------
    GridSearchCV
        Searches learning rate, tree depth, minimum split size and feature
        sub-sampling (4 x 2 x 2 x 3 combinations), scores candidates with
        ``QuantileScore`` treated as a loss (``greater_is_better=False``),
        and refits the best candidate on the full training data.
    """
    search_space = {
        "learning_rate": [0.01, 0.035, 0.07, 0.1],
        "max_depth": [3, 7],
        "min_samples_split": [2, 8],
        "max_features": [1.0, 'sqrt', 'log2'],
    }

    # Up to 300 boosting stages, with early stopping after 33 iterations
    # without improvement.
    base_estimator = GradientBoostingRegressor(
        loss='quantile',
        alpha=quantile,
        n_estimators=300,
        n_iter_no_change=33,
        random_state=42,
    )

    # NOTE(review): QuantileScore is not defined in this notebook; presumably
    # the pinball loss at level q — confirm in utils.
    quantile_scorer = make_scorer(QuantileScore(q=quantile), greater_is_better=False)

    return GridSearchCV(
        base_estimator,
        search_space,
        scoring=quantile_scorer,
        cv=5,
        refit=True,
        n_jobs=-2,
        verbose=1,
    )

# Estimator settings that are never tuned; they are combined with the tuned
# hyper-parameters when a regressor is rebuilt without a grid search.
fixed_params = {
    "loss": 'quantile',
    "n_estimators": 300,
    "random_state": 42,
    "n_iter_no_change": 33,
}

@dataclass
class BestParams:
    """Dataclass with a single float field ``q``.

    NOTE(review): not referenced by any of the visible notebook cells; the
    training loop stores tuned hyper-parameters in a plain dict instead.
    """
    # Presumably a quantile level, by analogy with the `quantile` arguments used nearby — TODO confirm.
    q: float

In [None]:
# Train (or reload) one quantile gradient-boosting model per situation and
# collect its 2020 predictions.
list_df_pred = []   # one prediction frame per situation
best_params = {}    # forecast type -> hyper-parameters found by the first grid search for that type

# False: load previously saved models. True: fit the models and save them.
DO_TRAIN = False

# Columns that identify a row or hold the target; they are not model inputs.
index_cols = ['date_cible', 'echeance', 'type', 'hh_mm_cible', 'obs']

# One model file per situation lives in <output_folder>/models/gb.
model_folder = os.path.join(output_folder, "models", "gb")
if DO_TRAIN:
    # Make sure the folder exists before the first model is written.
    os.makedirs(model_folder, exist_ok=True)

# prepare_features only depends on the forecast type, so its result is built
# once per type and reused for every lead time and quantile level instead of
# being recomputed for each situation.
features_by_type = {}

for situation in tqdm(situations):
    print(f"Learning of situation {situation}")
    echeance, pred_type, quantile = situation

    # Features of the current type, restricted to the current lead time.
    if pred_type not in features_by_type:
        features_by_type[pred_type] = prepare_features(pred_type=pred_type)
    df_features = features_by_type[pred_type]
    in_situation = (df_features.type.values == pred_type) & (df_features.echeance.values == echeance)
    df_features = df_features.loc[in_situation]

    # Years before 2020 are used for fitting; 2020 is the year to predict.
    df_features_train = df_features.loc[df_features.date_cible.dt.year < 2020]
    df_features_pred = df_features.loc[df_features.date_cible.dt.year == 2020]

    # Output rows for this situation (.copy() so columns can be added safely).
    df_modelpred = df_features_pred[['date_cible', 'echeance', 'type']].copy()
    df_modelpred["quantile_niveau"] = np.ones(df_modelpred.shape[0]) * quantile

    x_train = df_features_train.drop(columns=index_cols).values
    x_pred = df_features_pred.drop(columns=index_cols).values

    y_train = df_features_train.obs.values

    # File name pattern: <echeance>_<type>_<quantile>_gbregressor.sav
    filename = "_".join(str(it) for it in situation)+"_gbregressor.sav"
    model_path = os.path.join(model_folder, filename)

    if DO_TRAIN:
        tuned_params = best_params.get(pred_type)
        if tuned_params:
            # Hyper-parameters already tuned for this type: reuse them as is
            # for the current lead time and quantile level.
            model = GradientBoostingRegressor(
                alpha=quantile,
                **fixed_params,
                **tuned_params
            )
            model.fit(x_train, y_train)
            print(model.score(x_train, y_train))
            best_reg = model
        else:
            # First situation met for this type: run the grid search and
            # remember the winning hyper-parameters.
            model = prepare_gbreg(quantile)
            model.fit(x_train, y_train)
            print(model.score(x_train, y_train))
            best_reg = model.best_estimator_
            best_params[pred_type] = model.best_params_

        joblib.dump(best_reg, model_path)
    else:
        # joblib.load unpickles arbitrary Python objects: only load model
        # files that come from a trusted source.
        best_reg = joblib.load(model_path)

    y_pred = best_reg.predict(x_pred)
    df_modelpred["prev_q"] = y_pred
    list_df_pred.append(df_modelpred)
    

In [None]:
# Stack the per-situation prediction frames row-wise into one long table.
df_pred_final = pd.concat(list_df_pred)
df_pred_final.head()

In [None]:
# Replace the placeholder `prev_q` of the 2020 template with the predicted
# quantiles (left join, so every template row is kept).
# NOTE: `echeance` is dropped from the result, so this cell cannot be run a
# second time without rebuilding the template first.
template_cols = ['date_cible', 'date_lancement', 'echeance', 'type', 'quantile_niveau']
join_keys = ['date_cible', 'echeance',  'type', 'quantile_niveau']
output_cols = ['date_cible', 'date_lancement', 'type', 'prev_q', 'quantile_niveau']

df_quantile_prev_4x199 = pd.merge(
    df_quantile_prev_4x199[template_cols],
    df_pred_final,
    how='left',
    on=join_keys,
    sort=False,
)[output_cols]

df_quantile_prev_4x199.head()

In [None]:
# Total number of missing values in the merged table, all columns together.
df_quantile_prev_4x199.isna().values.sum()

### Mise en forme des données

In [None]:
# Extract the 1 % and 99 % quantiles of the residual-consumption forecasts.
# The quantile levels were generated with a floating-point step, so they are
# matched with a small tolerance rather than by exact equality; neighbouring
# levels are 0.005 apart, far more than the tolerance.
is_residual = (df_quantile_prev_4x199.type == "consommation_residuelle").values
is_q1_or_q99 = np.isclose(
    df_quantile_prev_4x199.quantile_niveau.values[:, None],
    [0.01, 0.99],
    rtol=0,
    atol=1e-9,
).any(axis=1)

df_quantile_prev_Q1Q99 = df_quantile_prev_4x199.loc[is_residual & is_q1_or_q99].reset_index(drop=True)
df_quantile_prev_Q1Q99.head()

In [None]:
# Write both result tables to the output folder in Feather format; only the
# full table is written with zstd compression.
path_4x199 = os.path.join(output_folder, "CG_BrutGradientBoosting_4x199.feather")
path_q1q99 = os.path.join(output_folder, "CG_BrutGradientBoosting_Q1Q99.feather")

df_quantile_prev_4x199.to_feather(path_4x199, compression="zstd")
df_quantile_prev_Q1Q99.to_feather(path_q1q99)