In [None]:
import os
from itertools import combinations
from omegaconf import OmegaConf

def get_config(yaml_name):
    """Load the YAML file ``yaml_name`` with OmegaConf and return the resulting config object."""
    return OmegaConf.load(yaml_name)

def get_combinations_string(lista,combo):
    """Build the folder-style labels for every combination of ``lista`` up to size ``combo``.

    The returned list always starts with the placeholder "#". It is followed by
    the combinations of size 1, then size 2, ... up to ``combo``, each one
    rendered as its items joined by a hyphen (e.g. "month-hour").
    """
    labels = ["#"]
    for size in range(1, combo + 1):
        # itertools.combinations yields tuples; join each one into a single label.
        labels += ["-".join(group) for group in combinations(lista, size)]
    return labels

def mkdir_(destination_folder,name_folders):
    """Create one sub-directory of ``destination_folder`` for each name in ``name_folders``.

    Paths are built with ``os.path.join`` so the sub-directories land inside
    ``destination_folder`` whether or not it ends with a path separator.
    Directories that already exist are left alone. A failure on one name is
    reported on stdout and does not stop the remaining names from being created.
    """
    for name_dir in name_folders:
        try:
            os.makedirs(os.path.join(destination_folder, name_dir), exist_ok=True)
        except Exception as e:
            # Best effort: report the problem and carry on with the next folder.
            print(f"Erro ao criar diretório {name_dir}: {str(e)}")

def create_folders():
    """Create one folder per covariate combination under each output root.

    Reads the module-level ``config`` and ``combinacoes``; the roots are the
    ``eval_folder``, ``results_folder`` and ``pred_folder`` entries of
    ``config['paths']``, processed in that order.
    """
    for root_key in ('eval_folder', 'results_folder', 'pred_folder'):
        mkdir_(config['paths'][root_key], combinacoes)

# Candidate covariate column names from which the combinations are built.
lista = ['month', 'dayofweek_num', 'hour', 'holiday', 'bool_weather_missing_values', 'precipType']
# combo=1: the "#" placeholder plus each covariate on its own.
combinacoes = get_combinations_string(lista,1)

# create_folders() reads the output folder locations from config['paths'].
config = get_config("lstm_config.yaml")

# Create one folder per combination under the eval, results and pred folders.
create_folders()

In [None]:
import pandas as pd
import os
import warnings
import time
warnings.filterwarnings("ignore")

import copy
from pathlib import Path
import warnings
import shutil

import lightning.pytorch as pl
from lightning.pytorch.callbacks import EarlyStopping, LearningRateMonitor
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.loggers import TensorBoardLogger
import numpy as np
import pandas as pd
import torch

import matplotlib.pyplot as plt

from pytorch_forecasting import Baseline, TemporalFusionTransformer, TimeSeriesDataSet, RecurrentNetwork
from pytorch_forecasting.data import GroupNormalizer, NaNLabelEncoder
from pytorch_forecasting.metrics import MAE, SMAPE, PoissonLoss, QuantileLoss, MAPE, RMSE

import pickle
from pytorch_forecasting.models.temporal_fusion_transformer.tuning import optimize_hyperparameters
from omegaconf import OmegaConf

# Explicitly switch autograd gradient tracking on for the whole process.
torch.set_grad_enabled(True)

In [None]:
def get_csv_file_list(path):
    """Return the names of all entries found directly inside ``path``.

    No filtering is applied: sub-directories and non-CSV files are included,
    in whatever order ``os.listdir`` reports them.
    """
    return os.listdir(path)

def get_csv(path):
    """Read one CSV file and prepare its columns for modelling.

    * ``temperature`` and ``windSpeed`` are forward-filled, so a missing value
      takes the last observed value above it (leading gaps stay missing).
    * The calendar, weather and household identifier columns listed below are
      cast to ``str``.

    The forward fill is assigned back to the column instead of calling
    ``fillna(..., inplace=True)`` on a column selection: that chained in-place
    form does not update the DataFrame under pandas Copy-on-Write, and the
    ``method=`` argument of ``fillna`` is deprecated.

    Raises ``KeyError`` if any of the expected columns is absent.
    """
    df = pd.read_csv(path)

    for column in ('temperature', 'windSpeed'):
        df[column] = df[column].ffill()

    string_columns = [
        'year',
        'hour',
        'month',
        'day',
        'dayofweek_num',
        'house_hold',
        'precipType',
        'icon',
        'holiday',
        'summary',
        'bool_weather_missing_values',
    ]
    return df.astype({column: str for column in string_columns})

def evaluation_metrics(val_dataloader,best_model,householde_name,execution_time):
    """Score ``best_model`` on ``val_dataloader`` and return the results as a dict.

    Predictions are computed with the CPU accelerator. The returned dict holds:

    * ``House_Hold`` - ``householde_name`` up to its first ".", upper-cased;
    * ``MAE``, ``MAPE``, ``SMAPE``, ``RMSE`` - one-element lists with the metric
      value rounded to 3 decimals;
    * ``Time_execution`` - ``execution_time`` formatted with two decimals and
      an "s" suffix.
    """
    predictions = best_model.predict(val_dataloader, return_y=True, trainer_kwargs=dict(accelerator="cpu"))

    # Same evaluation order as the report columns: MAE, MAPE, SMAPE, RMSE.
    metric_classes = (('MAE', MAE), ('MAPE', MAPE), ('SMAPE', SMAPE), ('RMSE', RMSE))
    scores = {}
    for label, metric_cls in metric_classes:
        value = metric_cls()(predictions.output, predictions.y)
        scores[label] = [value.to('cpu').numpy().round(3)]

    report = {'House_Hold': householde_name.split(".")[0].upper()}
    report.update(scores)
    report['Time_execution'] = f"{execution_time:.2f}s"
    return report


def get_attention_values(interpretation,householde_name,encoder_list,decoder_list):
    """Turn the encoder/decoder variable weights of ``interpretation`` into labelled percentages.

    ``interpretation['encoder_variables']`` and ``interpretation['decoder_variables']``
    are moved to CPU and converted to NumPy. Each value is paired positionally
    with a name from ``encoder_list`` / ``decoder_list`` (extra values or names
    are dropped by ``zip``) and rendered as a one-element list such as
    ``["12.34%"]``. Both dicts also get a ``House_Hold`` entry holding
    ``householde_name`` up to its first ".", upper-cased.

    Returns the tuple ``(encoder_dict, decoder_dict)``.
    """
    def to_percent_columns(weights, names):
        # One single-row column per variable, value formatted as a percentage.
        return {name: [f"{np.round(weight,4)*100:.2f}%"] for weight, name in zip(weights, names)}

    encoder_weights = interpretation['encoder_variables'].to('cpu').numpy()
    decoder_weights = interpretation['decoder_variables'].to('cpu').numpy()
    household = householde_name.split(".")[0].upper()

    encoder_dict = to_percent_columns(encoder_weights, encoder_list)
    encoder_dict['House_Hold'] = [household]

    decoder_dict = to_percent_columns(decoder_weights, decoder_list)
    decoder_dict['House_Hold'] = [household]

    return encoder_dict, decoder_dict


def cleaning_eval_metrics_results(path_origin, path_destiny,model_name):
    """Merge every file found in ``path_origin`` into a single metrics CSV.

    Each regular file directly inside ``path_origin`` is read with
    ``pd.read_csv`` (its name is printed first); anything that is not a file
    is skipped with a printed notice. The frames are concatenated and written,
    without the index, to ``<path_destiny>/<model_name>_metrics_results.csv``.
    """
    frames = []
    for entry in os.listdir(path_origin):
        if not os.path.isfile(os.path.join(path_origin, entry)):
            print('Nao e csv')
            continue
        print(entry)
        frames.append(pd.read_csv(f"{path_origin}/{entry}"))

    merged = pd.concat(frames)
    merged.to_csv(f"{path_destiny}/{model_name}_metrics_results.csv", index=False)


def cleaning_attention_results(path_origin, path_destiny):
    """Merge the per-household attention CSVs into one encoder and one decoder file.

    Every regular file directly inside ``path_origin`` is read with
    ``pd.read_csv``. Files whose name starts with ``decoder_`` are collected
    into the decoder result; every other file is collected into the encoder
    result. (The previous implementation had the two lists swapped, so decoder
    files ended up in ``encoder_attention_results.csv`` and vice versa.)
    Entries that are not files are skipped with a printed notice.

    The merged frames are written, without the index, to
    ``encoder_attention_results.csv`` and ``decoder_attention_results.csv``
    inside ``path_destiny``.

    Raises ``ValueError`` (from ``pd.concat``) if either group is empty.
    """
    df_encoder_list = []
    df_decoder_list = []
    for csv in os.listdir(path_origin):
        csv_path = os.path.join(path_origin, csv)
        if not os.path.isfile(csv_path):
            print('Nao e csv')
            continue
        # Route by file-name prefix: "decoder_*" -> decoder, anything else -> encoder.
        if csv.split("_")[0] == 'decoder':
            df_decoder_list.append(pd.read_csv(csv_path))
        else:
            df_encoder_list.append(pd.read_csv(csv_path))

    # Concatenate both groups before writing so a failure leaves no partial output.
    concat_df_encoder = pd.concat(df_encoder_list)
    concat_df_decoder = pd.concat(df_decoder_list)
    concat_df_encoder.to_csv(os.path.join(path_destiny, "encoder_attention_results.csv"), index=False)
    concat_df_decoder.to_csv(os.path.join(path_destiny, "decoder_attention_results.csv"), index=False)

In [None]:
def get_config(yaml_name):
    """Load the YAML file ``yaml_name`` with OmegaConf and return the resulting config object."""
    return OmegaConf.load(yaml_name)

def get_covariate_combination(path):
    """Map every entry name found in ``path`` to the list of covariates it encodes.

    Each entry directly inside ``path`` (one per covariate combination) is
    split on "-", e.g. ``"month-hour"`` -> ``["month", "hour"]`` and
    ``"#"`` -> ``["#"]``.

    The ``path`` argument is honoured; it used to be overwritten with the
    global ``config.paths.eval_folder``, which made the parameter meaningless.

    Returns a dict ``{entry_name: [covariate, ...]}``.
    """
    return {entry: entry.split('-') for entry in os.listdir(path)}


# Load the experiment configuration used by the cells below.
config = get_config("lstm_config.yaml")

# Entry names of the eval folder mapped to their "-"-separated parts.
feature_combination = get_covariate_combination(config.paths.eval_folder)

In [None]:
def ensure_requires_grad(model):
    """Helper that switches ``requires_grad`` on for every parameter of ``model``.

    The model is modified in place and the same object is returned.
    """
    for parameter in model.parameters():
        parameter.requires_grad_(True)
    return model

def process_batch(batch):
    """Return a copy of ``batch`` in which every tensor has ``requires_grad=True``.

    Dicts, lists and tuples are walked recursively and rebuilt as a plain
    dict, list or tuple. Each tensor is cloned and detached before the flag is
    set, so the input tensors are not modified. Any other value is returned
    unchanged.
    """
    if isinstance(batch, dict):
        return {name: process_batch(entry) for name, entry in batch.items()}
    if isinstance(batch, list):
        return [process_batch(entry) for entry in batch]
    if isinstance(batch, tuple):
        return tuple(process_batch(entry) for entry in batch)
    if isinstance(batch, torch.Tensor):
        return batch.clone().detach().requires_grad_(True)
    return batch

In [None]:
def run_RNN_model(df,
                  csv_file_name,
                  cell_type,
                  path_pred,
                  path_metrics_val,
                  covariates,
                  learning_rate = .1,
                  hidden_size = 15,
                  dropout = .2,
                  loss = MAE(),
                  optimizer = "Ranger",
                  rnn_layers  = 2,
                  patience=10,
                  max_prediction_length = 168,
                  max_encoder_length = 720,
                  batch_size = 128,
                  seed = 81):
    """Train a ``RecurrentNetwork`` on one household and save its predictions and metrics.

    Steps:
      1. Cast ``Energy_kwh`` to float32 and ``time_idx`` to int32 on a copy of
         ``df``, so the caller's DataFrame is not modified.
      2. Build the training ``TimeSeriesDataSet`` from the rows whose
         ``time_idx`` is at most ``max(time_idx) - max_prediction_length``, and
         a validation set (``predict=True``) from the full frame.
      3. Fit the network with early stopping on ``val_loss`` and reload the
         best checkpoint.
      4. Write the predictions of the first validation sample to
         ``path_pred/csv_file_name`` and the evaluation metrics to
         ``path_metrics_val/csv_file_name``.

    Parameters:
        df: DataFrame with at least ``Energy_kwh``, ``time_idx``,
            ``house_hold`` and the columns named in ``covariates``.
        csv_file_name: file name used for both output CSVs and for the
            household label in the metrics.
        cell_type: forwarded to ``RecurrentNetwork.from_dataset`` and used as
            the prefix of the TensorBoard log directory.
        path_pred, path_metrics_val: output directories.
        covariates: ``["time_idx"]`` to use ``time_idx`` as the only known
            real input; otherwise a list of column names passed as
            ``time_varying_known_categoricals``.
        learning_rate, hidden_size, dropout, loss, optimizer, rnn_layers:
            forwarded to ``RecurrentNetwork.from_dataset``.
        patience: early-stopping patience.
        max_prediction_length, max_encoder_length, batch_size: dataset and
            dataloader settings.
        seed: passed to ``pl.seed_everything``.

    Returns ``None``; the results are the two CSV files.

    NOTE(review): the default ``loss=MAE()`` is created once when the function
    is defined, so the same metric object is shared by every call that relies
    on the default - confirm this is acceptable.
    NOTE(review): training is pinned to ``accelerator='gpu'`` and
    ``max_epochs=350``; TensorBoard logs and checkpoints are left on disk.
    """
    # Cast on a copy so the caller's frame keeps its original dtypes.
    df = df.assign(Energy_kwh=df['Energy_kwh'].astype('float32'),
                   time_idx=df['time_idx'].astype('int32'))
    training_cutoff = df["time_idx"].max() - max_prediction_length

    pl.seed_everything(seed)

    # The two configurations differ only in how the known inputs are declared.
    if covariates == ["time_idx"]:
        known_inputs = dict(time_varying_known_reals=['time_idx'])
    else:
        known_inputs = dict(time_varying_known_categoricals=covariates)

    training = TimeSeriesDataSet(
        df[df.time_idx <= training_cutoff],
        time_idx = 'time_idx',
        target = 'Energy_kwh',
        group_ids = ['house_hold'],
        time_varying_unknown_reals = ['Energy_kwh'],
        static_categoricals = ['house_hold'],
        min_encoder_length = max_encoder_length // 2,
        max_encoder_length = max_encoder_length,
        min_prediction_length = 1,
        max_prediction_length = max_prediction_length,
        categorical_encoders = {'house_hold': NaNLabelEncoder(add_nan=True, warn=True)},
        **known_inputs,
    )

    # Validation reuses the training set's parameters on the full frame.
    validation = TimeSeriesDataSet.from_dataset(training,
                                                df,
                                                predict = True,
                                                stop_randomization = True)

    train_dataloader = training.to_dataloader(train = True,
                                              batch_size = batch_size,
                                              num_workers = 1)

    val_dataloader = validation.to_dataloader(train = False,
                                              batch_size = batch_size,
                                              num_workers = 1)

    rnn = RecurrentNetwork.from_dataset(
        training,
        cell_type = cell_type,
        learning_rate = learning_rate,
        hidden_size = hidden_size,
        dropout = dropout,
        loss = loss,
        optimizer = optimizer,
        rnn_layers = rnn_layers,
    )

    early_stop_callback = EarlyStopping(monitor = "val_loss",
                                        min_delta = 0.0001,
                                        patience = patience,
                                        verbose = True,
                                        mode = "min")

    lr_logger = LearningRateMonitor()
    logger_LSTM = TensorBoardLogger(f"{cell_type}_logs")

    checkpoint_callback = ModelCheckpoint(monitor="val_loss")

    trainer = pl.Trainer(
        max_epochs = 350,
        accelerator = 'gpu',
        enable_model_summary = True,
        limit_train_batches = 300,
        gradient_clip_val = 0.1,
        callbacks = [lr_logger, early_stop_callback, checkpoint_callback],
        logger = logger_LSTM,
        enable_progress_bar = False,
    )

    # Only the fit itself is timed; dataset construction and evaluation are excluded.
    start_time = time.time()
    trainer.fit(rnn,
                train_dataloaders = train_dataloader,
                val_dataloaders = val_dataloader)
    execution_time = time.time() - start_time

    # Load the best checkpoint once (it used to be loaded twice in a row).
    best_model_path = str(trainer.checkpoint_callback.best_model_path)
    best_rnn = RecurrentNetwork.load_from_checkpoint(best_model_path)

    predictions = best_rnn.predict(val_dataloader, mode = "raw", return_x = True)

    # Index [0] keeps the first sample of the prediction batch only.
    df_predictions = pd.DataFrame({'time_idx':predictions.x['decoder_time_idx'][0].to('cpu').numpy(),
                                   'Real':predictions.x['decoder_target'][0].to('cpu').numpy().round(3),
                                   'predict':predictions.output[0][0].to('cpu').numpy().round(3).squeeze()})

    # os.path.join works whether or not the folder ends with a separator.
    df_predictions.to_csv(os.path.join(path_pred, csv_file_name), index=False)

    eval_dict = evaluation_metrics(val_dataloader,best_rnn,csv_file_name,execution_time)
    df_eval_metrics = pd.DataFrame(eval_dict)
    df_eval_metrics.to_csv(os.path.join(path_metrics_val, csv_file_name), index=False)

In [None]:
# Dry run: for each combination, print the prediction path built for the
# first two entries listed in the data directory.
for folder_name,comb_features in feature_combination.items():
    # The "#" placeholder entry is swapped for ['time_idx'].
    if comb_features[0] == "#":
        comb_features = ['time_idx']
    for csv in os.listdir(config.paths.data_dir)[:2]:
        print(config.paths.pred_folder+folder_name+'/'+csv)

In [None]:
# Size of the [:20] slice of the data directory listing.
len(os.listdir(config.paths.data_dir)[:20])

In [None]:
# Size of the [20:40] slice of the data directory listing.
len(os.listdir(config.paths.data_dir)[20:40])

In [None]:
# Hand-picked subset of combinations to train (folder name -> covariate list).
# Only the "#" entry is active; the commented entries are switched off.
new =  {
 '#': ['#'],
 #'hour':['hour'],
 #'precipType': ['precipType'],
 #'month': ['month'],
 #'bool_weather_missing_values': ['bool_weather_missing_values'],
 #'holiday': ['holiday'],
 #'precipType': ['precipType']
 }

In [None]:
# Data files excluded from the training loop below.
csv_problematicos = ["stretchedSociety_block_91_MAC000350.csv","stretchedSociety_block_107_MAC001785.csv"]

In [None]:
# Entries already present in eval_metrics/#; the training loop below skips any data file with the same name.
csv_treinados = os.listdir("eval_metrics/#")

In [None]:
# Train one LSTM per remaining data file for every combination selected in `new`.
for folder_name, comb_features in new.items():
    # The "#" placeholder entry is swapped for ['time_idx'].
    if comb_features[0] == "#":
        comb_features = ['time_idx']

    for csv in os.listdir(config.paths.data_dir)[7:]:
        # Skip files that are excluded or already have results.
        if csv in csv_problematicos + csv_treinados:
            continue

        df = get_csv(config.paths.data_dir + csv)

        print(f"Training model for {folder_name} using {csv}")
        print(len(df)-168)

        # The encoder length is the whole series minus the 168-step prediction horizon.
        run_RNN_model(
            df,
            csv_file_name = csv,
            cell_type = "LSTM",
            path_pred = config.paths.pred_folder + folder_name+'/',
            path_metrics_val = config.paths.eval_folder + folder_name+'/',
            covariates = comb_features,
            max_prediction_length = 168,
            max_encoder_length = len(df)-168,
        )
        