In [None]:
# Standard library, numerical stack and plotting libraries used across the notebook.
import os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply the seaborn theme with a default figure size of 11.7 x 8.27 (inches).
sn.set(rc={'figure.figsize':(11.7,8.27)})
# Reload imported project modules before each cell execution so edits are picked up live.
%load_ext autoreload
%autoreload 2
import warnings

# NOTE(review): this silences *every* warning for the whole session, including
# pandas deprecation / merge warnings that may point at real problems in the
# cells below. Consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [None]:
import datetime


# Weather variables present in the meteo files; they are the `values` of the
# long-to-wide pivots below (presumably ECMWF short names -- TODO confirm).
other_cols = [
    "tcc",
    "t2m",
    "ssrd",
    "ff100",
    "u100",
    "v100",
]
# Folder containing the challenge input files, relative to the notebook.
data_folder = "./data_challenge/data"
def percent_rows_na(df):
    """Return the percentage (0-100) of rows of ``df`` holding at least one missing value.

    Raises:
        ZeroDivisionError: if ``df`` has no rows.
    """
    total_rows = len(df)
    complete_rows = len(df.dropna(axis=0))
    incomplete_rows = total_rows - complete_rows
    return incomplete_rows * 100 / total_rows

def fix_echeance(df):
    """Recompute the ``echeance`` column of ``df`` in place, in hours.

    The lead time is ``date_cible - date_lancement``. ``total_seconds()`` is
    used rather than the ``.dt.seconds`` component, which drops whole days
    (a 30 h lead time would read as 6 h) and wraps negative durations.

    Args:
        df: DataFrame with datetime columns ``date_cible`` and ``date_lancement``.

    Returns:
        None. ``df`` gains or overwrites the ``echeance`` column.
    """
    lead_time = df.date_cible - df.date_lancement
    df['echeance'] = lead_time.dt.total_seconds() / 3600
    
def add_datetime_features(df):
    """Add cyclic-time features derived from ``date_cible`` to ``df`` in place.

    Columns written:
        tiy: "time in year" -- elapsed time since 1 January 00:00 of the
            target year, divided by a fixed 365-day year. In a leap year the
            value can therefore slightly exceed 1 at the very end of December.
        tid: "time in day" -- seconds since local midnight divided by 86400.

    The start of the year is built with vectorised pandas operations and
    localised with ``tz_localize``. This avoids one Python ``datetime`` per
    row and avoids passing a timezone object straight to the ``datetime``
    constructor, which yields a local-mean-time offset for pytz zones.

    Args:
        df: DataFrame with a datetime column ``date_cible`` (naive or tz-aware).

    Returns:
        None. ``df`` gains or overwrites the ``tiy`` and ``tid`` columns.
    """
    target = df.date_cible
    tz = target.dt.tz

    # time in the year
    year_start = pd.to_datetime(
        pd.DataFrame({"year": target.dt.year, "month": 1, "day": 1})
    )
    if tz is not None:
        year_start = year_start.dt.tz_localize(tz)
    df['tiy'] = (target - year_start).dt.total_seconds() / (365 * 24 * 60 * 60)

    # time in the day
    seconds_since_midnight = (
        target.dt.hour * 3600 + target.dt.minute * 60 + target.dt.second
    )
    df['tid'] = seconds_since_midnight / (24 * 60 * 60)
    # TODO: type of day for consumption


In [None]:
# The station list is a semicolon-separated CSV whose first row is the header.
stations_csv = os.path.join(data_folder, "liste_stations.csv")
df_list_station = pd.read_csv(stations_csv, sep=";", header=0)
df_list_station.head()

In [None]:
# Forecast / observation table for every series type.
prev_path = os.path.join(data_folder, "df_prev_sans_obs2020.feather")
df_prev_sans_obs2020 = pd.read_feather(prev_path)
print(df_prev_sans_obs2020.echeance.unique()) # echeance 30min - 7h
print(df_prev_sans_obs2020.isnull().sum()) # Missing 417844 observations (for 2020)

# Fake FC for conso: rows whose type contains 'conso' get a constant pseudo
# installed power (largest forecast among those rows plus 10**4) written to
# `pi`, so the same obs / pi ratio can be computed for every series type.
is_conso = df_prev_sans_obs2020["type"].str.contains('conso')
fake_pi = df_prev_sans_obs2020.loc[is_conso, "prev"].max() + 10**4
df_prev_sans_obs2020.loc[is_conso, 'pi'] = fake_pi

# Target variable: observation divided by `pi`.
df_prev_sans_obs2020['fc'] = df_prev_sans_obs2020['obs'] / df_prev_sans_obs2020['pi']

# Both helpers add / overwrite columns in place.
fix_echeance(df_prev_sans_obs2020)
add_datetime_features(df_prev_sans_obs2020)
df_prev_sans_obs2020



In [None]:
# Climate-zone grid file (the file name refers to end of 2018).
grid_path = os.path.join(data_folder, "grille_zone_climatique_fin2018.feather")
df_grille_zoneclimat_fin18 = pd.read_feather(grid_path)
df_grille_zoneclimat_fin18.head(10)

In [None]:
# Per-zone weather forecasts from the "piEOL" file, loaded in long format.
eol_meteo_path = os.path.join(
    data_folder, "meteo_zone_echeance12_2016_2020_HRES_piEOL_smooth.feather"
)
df_meteo_zone_eol = pd.read_feather(eol_meteo_path)
print(sorted(df_meteo_zone_eol.echeance.unique())) # echeance 0min - 11h30
assert df_meteo_zone_eol.isnull().sum().sum() == 0 # No missing value

# Align the launch-date column name with the forecast table, then recompute
# the lead time from the two dates.
df_meteo_zone_eol = df_meteo_zone_eol.rename(
    columns={"date_lancement_meteo": "date_lancement"}
)
fix_echeance(df_meteo_zone_eol)

# Long to wide: one column per (weather variable, zone) pair. The resulting
# frame has two-level column labels.
df_meteo_zone_eol = (
    df_meteo_zone_eol
    .pivot(
        index=["date_lancement", "date_cible", "echeance"],
        columns="zone",
        values=other_cols,
    )
    .reset_index()
)
assert df_meteo_zone_eol.isnull().sum().sum() == 0 # No missing value
df_meteo_zone_eol

In [None]:
# Per-zone weather forecasts from the "piPV" file, loaded in long format.
pv_meteo_path = os.path.join(
    data_folder, "meteo_zone_echeance12_2016_2020_HRES_piPV_smooth.feather"
)
df_meteo_zone_pv = pd.read_feather(pv_meteo_path)
print(f"echeances:{sorted(df_meteo_zone_pv.echeance.unique())}") # echeance 0min - 11h30
print(f"zones:{sorted(df_meteo_zone_pv.zone.unique())}") # list of zone identifiers
assert df_meteo_zone_pv.isnull().sum().sum() == 0 # No missing value

# Align the launch-date column name with the forecast table, then recompute
# the lead time from the two dates.
df_meteo_zone_pv = df_meteo_zone_pv.rename(
    columns={"date_lancement_meteo": "date_lancement"}
)
fix_echeance(df_meteo_zone_pv)

# Long to wide: one column per (weather variable, zone) pair.
# Same list as the one declared with the helper functions; re-declared here.
other_cols = ["tcc", "t2m", "ssrd", "ff100", "u100", "v100"]
df_meteo_zone_pv = (
    df_meteo_zone_pv
    .pivot(
        index=["date_lancement", "date_cible", "echeance"],
        columns="zone",
        values=other_cols,
    )
    .reset_index()
)
assert df_meteo_zone_pv.isnull().sum().sum() == 0 # No missing value
df_meteo_zone_pv

In [None]:
# Table read from "productionPV_FC_cielclair_q90.feather".
prodpv_q90_path = os.path.join(data_folder, "productionPV_FC_cielclair_q90.feather")
df_prodpv_fc_q90 = pd.read_feather(prodpv_q90_path)
df_prodpv_fc_q90.head()

# Preprocessing

In [None]:

# DROP ECHEANCES > 4: keep only lead times of at most 4 hours.
df = df_prev_sans_obs2020.loc[df_prev_sans_obs2020.echeance <= 4.0]

# One frame per series type; the now-constant `type` column is dropped.
df_conso = df.loc[df["type"] == 'consommation'].drop(columns='type')
df_pv = df.loc[df["type"] == 'photovoltaique'].drop(columns='type')
df_conso_res = df.loc[df["type"] == 'consommation_residuelle'].drop(columns='type')
df_eol = df.loc[df["type"] == 'eolien'].drop(columns='type')

# No missing data in year < 2020 (the training period), for any series.
for _series_df in (df_eol, df_pv, df_conso_res, df_conso):
    _before_2020 = _series_df[_series_df.date_cible.dt.year < 2020]
    assert percent_rows_na(_before_2020) == 0.0 # No missing value in train

df_pv


## PV

In [None]:
# PV: wind variables are dropped from the weather features.
PV_USELESS_COLS = ['ff100', 'u100', 'v100']

# NOTE(review): the weather frame has two-level column labels after the
# pivot while df_pv has flat labels; recent pandas versions may refuse such a
# merge -- confirm against the pandas version in use.
pv_weather = df_meteo_zone_pv.drop(columns=PV_USELESS_COLS)
df_pv_meteo = pd.merge(
    df_pv,
    pv_weather,
    how='inner',
    on=['date_cible', 'date_lancement'],
)
# NOTE(review): an inner join does not create missing values by itself; the
# rows counted here presumably lack observations for 2020 -- confirm.
print(f"""
      {percent_rows_na(df_pv_meteo)} % rows with missing values.
      They come from merging meteo and prod/conso time series
      """) 
df_pv_meteo

## EOL

In [None]:
# EOL: cloud cover, solar radiation and temperature are dropped from the
# weather features.
EOL_USELESS_COLS = ['tcc', 'ssrd', 't2m']
eol_weather = df_meteo_zone_eol.drop(columns=EOL_USELESS_COLS)
df_eol_meteo = pd.merge(
    df_eol,
    eol_weather,
    how='inner',
    on=['date_cible', 'date_lancement'],
)
# TODO check how many values are lost during inner join
df_eol_meteo

## CONSO

In [None]:
# Weather columns dropped for the consumption series. This cell uses its own
# constant instead of silently depending on PV_USELESS_COLS from the PV cell.
CONSO_USELESS_COLS = ['ff100', 'u100', 'v100']

# WARNING: TODO USE REAL WEATHER DATA
# The consumption series currently reuse the weather frame built from the
# "piPV" file as a stand-in. The drop is done once and shared by both merges.
conso_weather = df_meteo_zone_pv.drop(columns=CONSO_USELESS_COLS)

df_conso_meteo = df_conso.merge(
    conso_weather, on=['date_cible', 'date_lancement'], how='inner'
)
df_conso_res_meteo = df_conso_res.merge(
    conso_weather, on=['date_cible', 'date_lancement'], how='inner'
)

# Only the last expression of a cell is rendered, so the first frame is shown
# explicitly (`display` is provided by IPython in notebook sessions).
display(df_conso_meteo)
df_conso_res_meteo


## Save features

In [None]:
# Persist one feature table per series type under ./features/<type>.hdf
# (key "data"); the inference cell reads the files back by the same names.
# Create the folder first so the cell also works when it does not exist yet.
os.makedirs("./features", exist_ok=True)
for _obs_type, _features in (
    ("photovoltaique", df_pv_meteo),
    ("eolien", df_eol_meteo),
    ("consommation", df_conso_meteo),
    ("consommation_residuelle", df_conso_res_meteo),
):
    _features.to_hdf(f"./features/{_obs_type}.hdf", key="data")

## Training

In [None]:
# Run train.py

## Inference

In [None]:
# NOTE(review): `test` is not referenced anywhere in the visible notebook and
# the `cgi` module was removed from the standard library in Python 3.13, so
# this line fails on recent interpreters -- likely an accidental IDE
# auto-import; confirm and remove.
from cgi import test
from pathlib import Path
import time
# NOTE(review): `result` is not referenced in the visible notebook either
# (only `results` is) -- likely another accidental auto-import; confirm.
from unittest import result
# NOTE(review): `Trainer` is not used in the visible cells -- confirm it is needed.
from pytorch_lightning import Trainer
from ray import tune
import torch

# Project-local training module providing the data pipeline and the model.
from train import DataModule, Regressor

# Series types for which a submission is produced; each name is also the stem
# of the feature file (./features/<type>.hdf) and of the Ray folder (./ray/<type>/).
OBS_TYPES = ['photovoltaique','eolien','consommation','consommation_residuelle']

def prepare_submission(obs_type, results):
    """Build the long-format quantile forecasts for one series type.

    Loads the best checkpoint of a Ray Tune experiment into a ``Regressor``,
    predicts the quantiles on the test split of ``./features/{obs_type}.hdf``,
    and reshapes the predictions to one row per
    (date_cible, date_lancement, quantile level).

    Args:
        obs_type: Series type name; also the stem of the feature file.
        results: Ray Tune analysis object exposing ``best_config`` and
            ``best_checkpoint``.

    Returns:
        DataFrame with columns ``date_cible``, ``date_lancement``,
        ``quantile_niveau``, ``prev_q`` and ``type``. ``prev_q`` is the
        predicted ratio multiplied by ``pi`` and floored at 0.
    """
    best_config = results.best_config
    net = Regressor(best_config)

    # Load the trained weights. `map_location="cpu"` lets a checkpoint saved
    # on a GPU be restored on a machine without one (the model lives on CPU).
    with results.best_checkpoint.as_directory() as loaded_checkpoint_dir:
        ckp = torch.load(Path(loaded_checkpoint_dir) / "checkpoint", map_location="cpu")
        net.load_state_dict(ckp['state_dict'])

    df = pd.read_hdf(f'./features/{obs_type}.hdf')
    dm = DataModule(df, label='fc', batch_size=best_config['batch_size'])
    dm.prepare_data()

    # Predict quantiles; no gradients are needed at inference time.
    net.eval()
    with torch.no_grad():
        outs = net(dm.x_test)

    # One column per quantile level, named by the level with 3 decimals.
    # The levels are read from the model already built rather than from a
    # second throw-away instance (assumes `quantile_levels` depends only on
    # the config -- TODO confirm against train.py).
    quantiles_cols = [f"{level:.3f}" for level in np.array(net.quantile_levels)]
    quantiles_df = pd.DataFrame(
        outs.detach().cpu().numpy(),
        columns=quantiles_cols,
        index=dm.df_test.index,
    )

    # Concat to original DF (row order follows dm.df_test).
    results_df = pd.concat([dm.df_test, quantiles_df], axis=1)

    # Remove useless echeances: only these lead times (hours) are submitted.
    results_df = results_df[results_df.echeance.isin([0.5, 1, 2, 4])]

    # Wide to long: one row per quantile level.
    id_cols = ['date_cible', 'date_lancement', 'pi']
    results_df = pd.melt(
        results_df[id_cols + quantiles_cols],
        id_vars=id_cols,
        value_vars=quantiles_cols,
        var_name="quantile_niveau",
        value_name="prev_q",
    )
    results_df['quantile_niveau'] = pd.to_numeric(results_df['quantile_niveau'])
    results_df['type'] = obs_type

    # Scale the predicted ratio back by `pi` (for the consumption series `pi`
    # is the constant stored in the feature file), then zero out negative values.
    results_df['prev_q'] = (results_df['prev_q'] * results_df['pi']).clip(lower=0)
    return results_df.drop(columns='pi')
        

# Build one long-format prediction table per series type, then stack them.
outs = []
for obs_type in OBS_TYPES:
    # Pick the most recent experiment folder. `getctime` is the creation time
    # on Windows but the last metadata change on Unix, so "latest" is best effort.
    experiment_candidates = list(Path(f"./ray/{obs_type}/").glob('*experiment*'))
    if not experiment_candidates:
        raise FileNotFoundError(
            f"No Ray Tune experiment found under ./ray/{obs_type}/ "
            f"-- run train.py for '{obs_type}' first."
        )
    exp_path = max(experiment_candidates, key=os.path.getctime)
    print(f"""
          Preparing submission for {obs_type}...
          Using Experiment {exp_path}""")
    results = tune.ExperimentAnalysis(experiment_checkpoint_path=exp_path,default_metric="val/loss",default_mode="min")
    print(results.best_config)

    outs.append(prepare_submission(obs_type, results))

# `ignore_index=True` gives the fresh 0..n-1 index that feather requires, so
# no leftover `index` column has to be dropped afterwards.
submission = pd.concat(outs, axis=0, ignore_index=True)

# Write a timestamped file, creating the output folder if needed.
submissions_dir = Path("./submissions")
submissions_dir.mkdir(parents=True, exist_ok=True)
submission.to_feather(submissions_dir / f"AR_{time.time()}.feather")


In [None]:
# Use plotly as the pandas plotting backend for interactive figures.
pd.options.plotting.backend = "plotly"
import plotly.express as px

# Predicted quantiles of the residual consumption over time, one line per level.
conso_res_quantiles = submission[submission.type == 'consommation_residuelle']
px.line(conso_res_quantiles, x='date_cible', y='prev_q', color='quantile_niveau')



In [None]:
# Use plotly as the pandas plotting backend for interactive figures.
pd.options.plotting.backend = "plotly"
import plotly.express as px

# Photovoltaic rows whose target date falls in 2019.
is_2019 = df_prev_sans_obs2020.date_cible.dt.year == 2019
is_pv = df_prev_sans_obs2020.type == "photovoltaique"
df_pv_plot = df_prev_sans_obs2020[is_2019 & is_pv]
# NOTE(review): the result of this filter is neither stored nor displayed, so
# the plot below still contains every lead time -- confirm what was intended.
df_pv_plot[df_pv_plot.echeance == 0.0]
px.line(df_pv_plot, x='date_cible', y='obs')


## IDEAs
- Load factor (facteur de charge)
- Per-weather-zone coefficient? Altitude? Longitude?
- Is it harmful to scale the features for quantile prediction?

## TODO
- Add DVC (data version control)
- Compute the score of the best models on the validation set
- Add `prev` to the visualisation of the outputs

### features

- ssrd
- tcc
- t2m
- ff100
- u100
- v100
- echeance
- prod_installée
- puissance_installee
- zone_climatique
- zone
- clear_sky_FC
