In [1]:
import os
import pickle
import json
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import *
from tensorflow.keras.layers import *
from tensorflow.keras import optimizers

from functools import partial
from multiprocessing import Pool
from collections import namedtuple
from urllib.request import urlopen
from sklearn.preprocessing import MinMaxScaler

In [2]:
def _window_columns(region_data, var_names, first_row, width, n_samples, label_template, label_offsets):
    """Build one wide row per sample from sliding windows over each variable.

    Sample ``i`` covers rows ``first_row + i`` .. ``first_row + i + width - 1``
    (positional) of ``region_data``. Columns are grouped by variable, and within
    a variable ordered by time step, each named
    ``label_template.format(var, offset)``.
    """
    # (n_samples, width) matrix of positional row indices, one row per window.
    positions = first_row + np.arange(n_samples)[:, None] + np.arange(width)[None, :]

    columns = {}
    for var in var_names:
        windows = region_data[var].to_numpy()[positions]
        for step, offset in enumerate(label_offsets):
            columns[label_template.format(var, offset)] = windows[:, step]

    # Explicit index keeps n_samples rows even when there are no columns.
    return pd.DataFrame(columns, index=pd.RangeIndex(n_samples))


def generate_single_region_dataset(key, region_data, look_back, look_forward, x_columns, y_columns=None, gen_x=True, gen_y=True):
    """Turn one region's time series into wide look-back / look-forward samples.

    Parameters
    ----------
    key : str
        Name of the column holding the region identifier.
    region_data : pandas.DataFrame or None
        Rows of a single region, in time order.
    look_back : int
        Number of consecutive rows in each input window.
    look_forward : int
        Number of consecutive rows in each output window (starting right after
        the input window).
    x_columns : list of str
        Columns used to build the inputs. Input columns are named
        ``'<var>_t<offset>'`` with offsets ``1 - look_back .. 0``.
    y_columns : list of str, optional
        Columns used to build the outputs (required when ``gen_y`` is True).
        Output columns are named ``'<var>_t+<step>'`` with steps
        ``1 .. look_forward``.
    gen_x, gen_y : bool
        Whether to generate the inputs / outputs.

    Returns
    -------
    tuple
        ``(inputs, outputs)``; each element is a DataFrame with one row per
        sample and a fresh 0..n-1 index, or ``None`` when it was not requested.
        ``(None, None)`` is returned (after printing a message) when the data is
        missing, empty, holds more than one region, is too short for one sample,
        or when ``y_columns`` is missing while ``gen_y`` is True.
    """
    # Check region dataframe
    if region_data is None:
        print('generate_single_region_dataset error: Region data is None!')
        return (None, None)

    # Check number of regions
    region_names = region_data[key].unique()
    if len(region_names) > 1:
        print('generate_single_region_dataset error: More than one region in the dataframe!')
        return (None, None)
    if len(region_names) == 0:
        # An empty frame has no region name to report; bail out instead of
        # failing with an IndexError.
        print('generate_single_region_dataset error: Region data is empty!')
        return (None, None)
    region_name = region_names[0]

    # Drop the region column
    region_data = region_data.drop(columns=key)

    # Check the number of samples available to
    # generate the look back and look forward windows
    if len(region_data) < (look_back + look_forward):
        print('generate_single_region_dataset error: Not enough samples '+
              'in {} to generate the windows!'.format(region_name))
        return (None, None)

    # Validate before doing any work
    if gen_y and y_columns is None:
        print('generate_single_region_dataset error: Need to specify column labels!')
        return (None, None)

    n_samples = len(region_data) - look_back - look_forward + 1

    inputs = None
    outputs = None

    # Generate inputs
    if gen_x:
        inputs = _window_columns(
            region_data, x_columns, 0, look_back, n_samples,
            '{}_t{}', range(1 - look_back, 1)
        )

    # Generate outputs
    if gen_y:
        outputs = _window_columns(
            region_data, y_columns, look_back, look_forward, n_samples,
            '{}_t+{}', range(1, look_forward + 1)
        )

    # Always a 2-tuple, so callers can unpack even when nothing was requested.
    return (inputs, outputs)

In [3]:
def _get_single_region_dataset(args, region_name):
    """Build the dataset of one region out of the multi-region frame.

    ``args`` is the positional argument list of
    ``generate_single_region_dataset`` with the full multi-region DataFrame in
    slot 1 and the region column name in slot 0. A copy of the list is made so
    the caller's list is left untouched.

    Returns the ``(inputs, outputs)`` pair produced for ``region_name``.
    """
    region_column, all_regions = args[0], args[1]

    # Keep only the rows that belong to the requested region.
    only_this_region = all_regions[all_regions[region_column]==region_name]

    # Same arguments, but with the region's rows in place of the full frame.
    region_args = args.copy()
    region_args[1] = only_this_region

    x_part, y_part = generate_single_region_dataset(*region_args)
    return (x_part, y_part)

In [4]:
def generate_regions_dataset(key, regions_data, look_back, look_forward, x_columns, y_columns=None, 
                             gen_x=True, gen_y=True):
    """Build look-back / look-forward samples for every region, in parallel.

    Each distinct value of ``regions_data[key]`` is processed by
    ``_get_single_region_dataset`` in a process pool, and the per-region
    results are stacked in the order of first appearance of each region.
    Regions for which no dataset could be built (result ``None``) are skipped.

    Parameters
    ----------
    key : str
        Name of the column holding the region identifier.
    regions_data : pandas.DataFrame
        Rows of all regions.
    look_back, look_forward : int
        Sizes of the input and output windows.
    x_columns, y_columns : list of str
        Columns used for the inputs / outputs.
    gen_x, gen_y : bool
        Whether to generate the inputs / outputs.

    Returns
    -------
    tuple
        ``(all_regions_x, all_regions_y)``; an element is ``None`` when it was
        not requested, and an empty DataFrame when it was requested but no
        region produced data.
    """
    regions_names = regions_data[key].unique()

    args = [key, regions_data, look_back, look_forward, x_columns, y_columns, gen_x, gen_y]

    func_gen_region_dataset = partial(
        _get_single_region_dataset,
        args
    )

    # os.cpu_count() returns None when the count cannot be determined.
    n_jobs = os.cpu_count() or 1

    chunk = max(1, len(regions_names) // n_jobs)

    with Pool(n_jobs) as pool:
        datasets_list = pool.map(func_gen_region_dataset, regions_names, chunksize=chunk)

    # Collect the per-region frames and concatenate once, instead of growing
    # the result frame region by region.
    x_parts = [region_x for region_x, _ in datasets_list if region_x is not None]
    y_parts = [region_y for _, region_y in datasets_list if region_y is not None]

    # pd.concat raises on an empty list, so fall back to an empty frame.
    all_regions_x = pd.concat(x_parts, ignore_index=True) if x_parts else pd.DataFrame()
    all_regions_y = pd.concat(y_parts, ignore_index=True) if y_parts else pd.DataFrame()

    # Always a 2-tuple, so callers can unpack even when nothing was requested.
    return (
        all_regions_x if gen_x else None,
        all_regions_y if gen_y else None,
    )

### Funções para carregamento dos dados de entrada dos modelos treinados com dados sintéticos

In [5]:
def input_ID(df):
    """Return a copy of ``df`` with an integer ``"ID"`` column per region.

    Regions are the distinct values of ``df["Country/Region"]``; they are
    numbered 0, 1, 2, ... in order of first appearance. The result has the
    ``"ID"`` column inserted at position 0 and its rows grouped by region (in
    that same order), keeping the original index labels. ``df`` itself is not
    modified.
    """
    region_frames = []

    for region_id, region in enumerate(df["Country/Region"].unique()):
        # Copy the slice so that the insert below works on an independent
        # frame rather than on a possible view of the caller's data.
        region_frame = df.loc[df["Country/Region"] == region].copy()
        region_frame.insert(0, "ID", region_id)
        region_frames.append(region_frame)

    if not region_frames:
        # No rows at all: pd.concat would raise on an empty list, so return an
        # empty frame that still carries the "ID" column.
        empty_frame = df.copy()
        empty_frame.insert(0, "ID", np.array([], dtype=np.int64))
        return empty_frame

    return pd.concat(region_frames)

In [6]:
def create_PT_multi(sequences, n_steps_in, n_steps_out):
    """Slice a 2-D sequence array into predictor / target windows.

    For each valid start row, the predictors are ``n_steps_in`` consecutive
    rows with every column except the last, and the targets are the last
    column of the ``n_steps_out`` rows that follow.

    Returns ``(X, y)`` as NumPy arrays built from the lists of windows.
    """
    total_rows = len(sequences)
    span = n_steps_in + n_steps_out

    # A window starting at row `start` fits while start + span - 1 < total_rows;
    # start itself never goes past the last row.
    n_windows = min(total_rows, max(0, total_rows - span + 1))

    predictor_windows = [
        sequences[start:start + n_steps_in, :-1]
        for start in range(n_windows)
    ]
    target_windows = [
        sequences[start + n_steps_in:start + span, -1]
        for start in range(n_windows)
    ]

    return np.array(predictor_windows), np.array(target_windows)

In [7]:
def preditores_targets(np_treino, vp, vf):
    """Build predictor / target windows for every series in ``np_treino``.

    Column 0 of ``np_treino`` identifies the series each row belongs to. For
    every distinct identifier, the remaining columns are windowed with
    ``create_PT_multi(series, vp, vf)``. Series with fewer than ``vp + vf``
    rows are skipped and a message is printed.

    Returns the predictors and targets of all series concatenated along the
    first axis.
    """
    predictor_blocks, target_blocks = [], []

    for series_id in np.unique(np_treino[:,0], axis=0):

        # Rows of this series, without the leading identifier column.
        series_rows = np_treino[np_treino[:,0] == series_id][:,1:]

        if series_rows.shape[0] < vp + vf:
            print("A serie temporal com identificação "+ str(series_id) + " não foi considerada (poucos dados)")
            continue

        series_x, series_y = create_PT_multi(series_rows, vp, vf)
        predictor_blocks.append(series_x)
        target_blocks.append(series_y)

    return np.concatenate(predictor_blocks), np.concatenate(target_blocks)

### Classe para gerenciamento dos modelos de base

In [8]:
class BaseModelsDeaths:
    """Loads the base models found under a directory and runs predictions.

    Every sub-directory of ``base_path`` is treated as one model and must
    contain ``x_scaler.pkl``, ``y_scaler.pkl`` and ``model.h5``. The loaded
    artifacts are exposed as ``self.models.<directory name>``.
    """

    # Feature columns of the models trained on real data. The same list
    # (prefixed by "ID") is used for the models trained on synthetic data.
    _X_COLUMNS = ["Deaths", "Confirmed", "C1","C2","C3","C4","C5","C6","C7","C8"]
    _Y_COLUMNS = ["Deaths"]

    _REAL_MODELS = ('LSTM_real', 'CNN_LSTM_real')
    _SYNTHETIC_MODELS = ('LSTM_sintetico', 'CNN_LSTM_sintetico')

    def __init__(self, base_path):
        """Load every model stored under ``base_path``."""
        self.load_models(base_path)

    def _load_artifacts(self, model_path):
        """Load the scalers and the Keras model stored in ``model_path``.

        Returns a namedtuple with fields ``x_scaler``, ``y_scaler``, ``model``.
        """
        # NOTE(security): pickle.load can execute arbitrary code; only point
        # this at artifact directories you trust.
        with open('{}/x_scaler.pkl'.format(model_path), 'rb') as scaler_file:
            x_scaler = pickle.load(scaler_file)

        with open('{}/y_scaler.pkl'.format(model_path), 'rb') as scaler_file:
            y_scaler = pickle.load(scaler_file)

        model = keras.models.load_model('{}/model.h5'.format(model_path))

        artifacts_type = namedtuple('Artifacts', ['x_scaler', 'y_scaler', 'model'])

        return artifacts_type(x_scaler, y_scaler, model)

    def load_models(self, base_path):
        """Load the artifacts of every model directory under ``base_path``.

        Plain files and hidden entries (names starting with ``'.'``, such as
        ``.ipynb_checkpoints``) are skipped. The result is stored in
        ``self.models`` as a namedtuple with one field per model directory.
        """
        models = dict()

        # NOTE(review): os.listdir returns entries in arbitrary order, and
        # predict() stacks the models in this order. Confirm that it matches
        # the order the metamodel was trained with.
        for model_name in os.listdir(base_path):

            model_path = '{}/{}'.format(base_path, model_name)

            # Only directories can hold model artifacts.
            if model_name.startswith('.') or not os.path.isdir(model_path):
                continue

            models[model_name] = self._load_artifacts(model_path)

        self.models = namedtuple('Models', models.keys())(*models.values())

    def _get_artifacts(self, model_name):
        """Return the artifacts namedtuple of ``model_name``."""
        return getattr(self.models, model_name)

    def transform_x_data(self, x_data, model_name):
        """Scale ``x_data`` with the input scaler of ``model_name``."""
        return self._get_artifacts(model_name).x_scaler.transform(x_data)

    def inverse_transform_y_data(self, y_data_scaled, model_name):
        """Undo the output scaling of ``model_name`` on ``y_data_scaled``."""
        return self._get_artifacts(model_name).y_scaler.inverse_transform(y_data_scaled)

    def reshape_x_data(self, x_data, model_name, lookback=4, lookforward=0):
        """Reshape scaled inputs into the layout expected by ``model_name``.

        * ``'CNN_LSTM_real'``: the flat columns, grouped by feature and ordered
          by time step within each feature, become an array of shape
          ``(n_samples, lookback, n_features)``.
        * ``'LSTM_real'``: a length-1 axis is inserted, giving
          ``(n_samples, 1, n_columns)``.
        * ``'LSTM_sintetico'`` / ``'CNN_LSTM_sintetico'``: the predictors
          returned by ``preditores_targets(x_data, lookback, lookforward)``.

        Prints an error and returns ``None`` for any other model name.
        """
        if model_name == 'CNN_LSTM_real':
            n_features = len(self._X_COLUMNS)

            # Column i * lookback + j holds feature i at time step j, so a
            # (feature, step) reshape followed by an axis swap gives
            # new_x_data[:, j, i] == x_data[:, i * lookback + j].
            flat = np.asarray(x_data[:, :n_features * lookback], dtype=np.float64)
            new_x_data = np.ascontiguousarray(
                flat.reshape(flat.shape[0], n_features, lookback).transpose(0, 2, 1)
            )

        elif model_name == 'LSTM_real':

            new_x_data = np.reshape(x_data, (x_data.shape[0], 1, x_data.shape[1]))

        elif model_name in self._SYNTHETIC_MODELS:

            new_x_data, _ = preditores_targets(x_data, lookback, lookforward)

        else:
            print('BaseModelsDeaths.reshape_x_data error. Model name not defined.')
            return None

        return new_x_data

    def model_predict(self, dataset, model_name, lookback=4, lookforward=0, scale=True):
        """Run one base model on ``dataset``.

        The inputs are built from ``dataset`` according to the model family,
        scaled with the model's input scaler, reshaped and passed to the
        model. When ``scale`` is True the predictions are mapped back through
        the model's output scaler.

        Prints an error and returns ``None`` for an unknown model name.
        """
        if model_name in self._REAL_MODELS:

            # Build the model inputs with the look-back columns
            x_data, _ = generate_regions_dataset(
                "Country/Region", dataset, lookback, lookforward, self._X_COLUMNS,
                y_columns=self._Y_COLUMNS, gen_x=True, gen_y=False
            )

        elif model_name in self._SYNTHETIC_MODELS:

            # Add an ID column with one identifier per region
            dataset_id = input_ID(dataset)

            # Columns of interest in the expected order: ID, features, target.
            # "Deaths" therefore appears twice (as a feature and as the
            # trailing target column).
            cols_sequence = ["ID"] + self._X_COLUMNS + self._Y_COLUMNS
            x_data = dataset_id[cols_sequence]

        else:
            print('BaseModelsDeaths.model_predict error. Model name not defined.')
            return None

        x_data_scaled = self.transform_x_data(x_data, model_name)

        new_x_data = self.reshape_x_data(x_data_scaled, model_name, lookback, lookforward)

        y_pred = self._get_artifacts(model_name).model.predict(new_x_data)

        if scale:
            y_pred = self.inverse_transform_y_data(y_pred, model_name)

        return y_pred

    def predict(self, dataset, lookback=4, lookforward=0, scale=False):
        """Run every loaded base model and stack their predictions.

        Returns an array of shape ``(n_samples, horizon, n_models)`` where
        ``[:, :, m]`` holds the predictions of the m-th model in
        ``self.models._fields``. The horizon is taken from the models' output
        width, so all models must predict the same number of steps.
        """
        models_names = self.models._fields
        n_models = len(models_names)

        models_preds = [
            self.model_predict(
                dataset, model_name, lookback=lookback, lookforward=lookforward, scale=scale
            )
            for model_name in models_names
        ]

        y_preds = np.hstack(models_preds)

        # Derive the forecast horizon from the stacked width rather than
        # assuming a fixed number of steps.
        horizon = y_preds.shape[1] // n_models

        # Fortran order maps column m * horizon + h to [:, h, m].
        y_preds = np.reshape(y_preds, (y_preds.shape[0], horizon, n_models), order='F')

        return y_preds

### Classe para gerenciamento do metamodelo

In [9]:
class MetaModelDeaths:
    """Metamodel that combines the predictions of the base models.

    Holds a ``BaseModelsDeaths`` instance, the metamodel itself and the
    scaler used to map the metamodel output back to the original scale.
    """

    def __init__(self, base_path, metamodel_path):
        """Load the base models from ``base_path`` and the metamodel from ``metamodel_path``."""
        self.base_models = BaseModelsDeaths(base_path)
        self.load_model(metamodel_path)

    def load_model(self, metamodel_path):
        """Load ``meta_y_scaler.pkl`` and ``metamodel.h5`` from ``metamodel_path``."""
        scaler_location = '{}/meta_y_scaler.pkl'.format(metamodel_path)
        model_location = '{}/metamodel.h5'.format(metamodel_path)

        with open(scaler_location, 'rb') as scaler_file:
            self.meta_y_scaler = pickle.load(scaler_file)

        self.metamodel = keras.models.load_model(model_location)

    def predict(self, x_data, lookback=4, lookforward=0, scale=True):
        """Predict with the metamodel on top of the base models' predictions.

        The base models are run on ``x_data`` and their stacked output is fed
        to the metamodel. With ``scale=True`` the result is passed through the
        inverse transform of the metamodel's output scaler; otherwise the raw
        metamodel output is returned.
        """
        stacked_base_output = self.base_models.predict(
            x_data, lookback=lookback, lookforward=lookforward
        )
        raw_output = self.metamodel.predict(stacked_base_output)

        if not scale:
            return raw_output

        return self.meta_y_scaler.inverse_transform(raw_output)

### Carregando dados atualizados

Carregando arquivo CSV com dataset atualizado.

In [10]:
# Load the dataset snapshot from a CSV file; the path is relative to the
# notebook's working directory.
complete_dataset = pd.read_csv('../dataset/complete_20200821.csv')

Carregando dados atualizados a partir de endpoint.

In [11]:
# Fetch the current case data for Brazil from the endpoint and load the
# 'Cases' entry of the JSON payload into a DataFrame.
url="https://covid19.fieb.org.br:9050/api/return/calculate/cases/?country=Brazil"
# The context manager closes the HTTP response once it has been read, and the
# timeout keeps the cell from hanging forever on an unresponsive server.
with urlopen(url, timeout=60) as response:
    request = json.loads(response.read())
dataframe = pd.DataFrame(request['Cases'])

Modificando DataFrame gerado com dados do endpoint para formato esperado.

In [12]:
# Drop the eight 'c<N>1valor' columns and 'recovered', then relabel the
# remaining columns by position.
# NOTE(review): the positional relabel assumes the endpoint returns the
# remaining columns in exactly this order — confirm against the API.
cols_to_drop = ['c{}1valor'.format(i) for i in range(1,9)] + ['recovered']
dataframe = dataframe.drop(columns=cols_to_drop)
dataframe.columns = [
    'C1', 'C2', 'C3', 'C4',
    'C5', 'C6', 'C7', 'C8',
    'Confirmed',
    'Deaths',
    'Date',
    'Country/Region',
]

Carregando metamodelo para fazer previsões.

In [13]:
# Load the base models from the 'base_models' directory and the metamodel
# artifacts from 'CNN_LSTM_metamodel' (both relative to the working directory).
metamodel = MetaModelDeaths('base_models', 'CNN_LSTM_metamodel')



Criando dataframe para a Bahia.

In [14]:
# Independent copies of the Bahia rows from each source; the endpoint labels
# the state 'Bahia' while the CSV dataset labels it 'BA'.
bahia_endpoint = dataframe.loc[dataframe['Country/Region'].eq('Bahia')].copy()
bahia_dataset = complete_dataset.loc[complete_dataset['Country/Region'].eq('BA')].copy()

Eliminando últimos dados dos DataFrames apenas para fazer a comparação das previsões.

In [15]:
# Drop the last 43 rows of the endpoint data and the last 38 rows of the
# dataset. NOTE: each name is reassigned from itself, so re-running this cell
# trims the frames again.
bahia_endpoint = bahia_endpoint.iloc[:-43]
bahia_dataset = bahia_dataset.iloc[:-38]

O modelo possui lookback de 4 dias; por isso, apenas os últimos 4 dias de dados são selecionados como entrada do modelo.

In [16]:
# Keep only the final 4 rows of each frame (one look-back window).
last_bahia_endpoint = bahia_endpoint.iloc[-4:]
last_bahia_dataset = bahia_dataset.iloc[-4:]

Gerando previsões do metamodelo para o conjunto de dados da Bahia.

In [17]:
# Run the metamodel on each 4-row input, endpoint data first and dataset second.
pred_metamodel_endpoint, pred_metamodel_dataset = [
    metamodel.predict(recent_rows)
    for recent_rows in (last_bahia_endpoint, last_bahia_dataset)
]

Previsões para os próximos 30 dias usando dados do endpoint na entrada.

In [18]:
# Wrap the endpoint-based predictions in a DataFrame (one column per forecast
# step) and render it.
pred_endpoint = pd.DataFrame(pred_metamodel_endpoint)
display(pred_endpoint)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,43.765167,47.473778,48.893234,54.261185,53.254818,49.888027,56.788425,55.138222,59.242661,61.555618,...,62.96949,65.91568,64.428535,61.30611,62.507931,65.645279,67.06739,64.187859,64.163872,68.330368


Previsões para os próximos 30 dias usando dados do dataset na entrada.

In [19]:
# Wrap the dataset-based predictions in a DataFrame (one column per forecast
# step) and render it.
pred_dataset = pd.DataFrame(pred_metamodel_dataset)
display(pred_dataset)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,43.765167,47.473778,48.893234,54.261185,53.254818,49.888027,56.788425,55.138222,59.242661,61.555618,...,62.96949,65.91568,64.428535,61.30611,62.507931,65.645279,67.06739,64.187859,64.163872,68.330368


Verificando se previsões são iguais.

In [20]:
# True only when both prediction frames have the same shape and elements.
pred_endpoint.equals(pred_dataset)

True

Previsões na escala normalizada (sem aplicar a transformação inversa do scaler).

In [21]:
# scale=False returns the raw metamodel output, skipping the inverse
# transform of the metamodel's output scaler.
metamodel.predict(last_bahia_endpoint, scale=False)

array([[0.01678484, 0.01817539, 0.01870762, 0.02072035, 0.02034301,
        0.01908062, 0.02166795, 0.0210492 , 0.02258817, 0.01608941,
        0.01284448, 0.01555829, 0.01416678, 0.01531472, 0.01659657,
        0.01687943, 0.01537516, 0.01591045, 0.01692881, 0.01691584,
        0.01645306, 0.01721082, 0.01682833, 0.01602523, 0.01633434,
        0.01714128, 0.01750704, 0.01676643, 0.01676025, 0.01783188]],
      dtype=float32)