# Notebook to update data for the forecast

In [1]:
import numpy as np
import pandas as pd
from epiweeks import Week
import matplotlib.pyplot as plt
import os
# Show the current working directory: the relative paths used in this notebook are resolved from it
os.getcwd()

'/Users/eduardoaraujo/Documents/Github/transfer-learning-forecast/forecast'

## LSTM models for the states and "Macroregionais de saúde"

### Path where the cases and climate data are saved:  

In [2]:
# Root folder of the data files, relative to the working directory of the notebook
PATH = '../data'

### The dataframe below is used by the functions to map each municipality geocode to its health macroregion code:
    

In [3]:
# Table with one row per municipality (geocode, health region, health macroregion and state);
# it is kept in the global `dfs`, which is read again by the state-level functions
dfs = pd.read_csv('../macro_saude.csv')

dfs.head()

Unnamed: 0,geocode,name_muni,name_region,code_region,name_macro,code_macro,state
0,1100015,Alta Floresta D'Oeste,Zona da Mata,11005,Cacoal,1101,RO
1,1100023,Ariquemes,Vale do Jamari,11001,Porto Velho,1102,RO
2,1100031,Cabixi,Cone Sul,11006,Cacoal,1101,RO
3,1100049,Cacoal,Café,11002,Cacoal,1101,RO
4,1100056,Cerejeiras,Cone Sul,11006,Cacoal,1101,RO


In [4]:
def add_new_columns(df):
    '''
    Add the month, the epidemiological week and the first difference of the cases as new columns.

    The input dataframe is not modified; a new dataframe is returned with:
    - `month`: month number taken from the index;
    - `SE`: week number of each index date, computed with epiweeks `Week.fromdate`;
    - `diff_<col>`: first difference of every column whose name starts with `casos`
      (the first row is NaN because it has no previous value).

    :param df: pd.DataFrame indexed by date (the code reads `df.index.month`) and with
               string column names.
    :returns: pd.DataFrame with the original columns followed by the new ones.
    '''

    # work on a copy so the dataframe of the caller is not silently changed
    df = df.copy()

    df['month'] = df.index.month

    df['SE'] = [Week.fromdate(date).weektuple()[1] for date in df.index]

    # manual override for this specific date
    # NOTE(review): the reason for forcing week 15 on 2018-04-04 is not visible here — confirm
    df.loc[df.index == '2018-04-04', 'SE'] = 15

    # the selection is taken before the diff columns exist, so only the original series are used
    case_columns = df.columns[df.columns.str.startswith('casos')]

    diff_series = [df[col].diff().rename(f'diff_{col}') for col in case_columns]

    return pd.concat([df] + diff_series, axis = 1, join = 'outer')
    
    
    

In [5]:
def get_geocodes_and_state(macro): 
    '''
    Get the geocodes and the state that refer to a specific health macroregion code.

    The mapping is read from `../macro_saude.csv` (columns `code_macro`, `geocode` and `state`).

    :param macro: int. A four-digit number
    :returns: tuple (geocodes, state). `geocodes` is an array with the unique geocodes of the
              macroregion and `state` is the value of the `state` column in its first row.
    :raises IndexError: if `macro` does not appear in the `code_macro` column.
    '''

    dfs = pd.read_csv('../macro_saude.csv')

    # filter once and reuse the selection for both outputs
    df_macro = dfs.loc[dfs.code_macro == macro]

    if df_macro.empty:
        # IndexError is what the `.values[0]` lookup below raises on an empty selection;
        # the type is kept and only the message is made explicit
        raise IndexError(f'health macroregion code {macro!r} not found in ../macro_saude.csv')

    geocodes = df_macro.geocode.unique()
    state = df_macro.state.values[0]

    return geocodes, state

In [6]:
def split_geocodes(geocodes):
    
    '''
    Split the geocodes between the cities with population above and up to 30k in 2022.

    The populations are read from `poptcu2010-2022_rgi.csv` (columns `CODMUN7` and `POP22`).
    Geocodes that are not selected as small (including the ones that are not listed in that
    file) are returned in the "above 30k" group.

    :param geocodes: list or array of int. Seven-digit IBGE codes of Brazilian cities
    :returns: tuple (g_up, g_low). `g_up` is the sorted array of geocodes not classified as
              small and `g_low` is the array of geocodes with `POP22 <= 30000`.
    '''

    # accept plain lists as documented: the size check below needs `.shape`
    geocodes = np.asarray(geocodes)

    dfpop = pd.read_csv('poptcu2010-2022_rgi.csv')

    is_small = dfpop.CODMUN7.isin(geocodes) & (dfpop.POP22 <= 30000)

    g_low = dfpop.loc[is_small].CODMUN7.unique()

    g_up = np.setdiff1d(geocodes, g_low)

    # setdiff1d returns unique values, so the sizes only disagree when `geocodes` has repeated codes
    if geocodes.shape[0] != g_low.shape[0] + g_up.shape[0]:

        print('Error subtracting geocodes')

    return g_up, g_low

In [7]:
def transform_data(df, geocode, geo_col = 'municipio_geocodigo'): 
    '''
    Select the rows of a single region and return them as a separate dataframe.

    The column used as filter is removed from the result and the name of every remaining
    column receives the suffix `_{geocode}`.

    :param df: pd.DataFrame.
    :param geocode: value to select. Must have the same type as the values of geo_col
    :param geo_col: str. Name of the column of df used to filter the geocode value
    :returns: pd.DataFrame with the selected rows and the renamed columns.
    '''

    region_rows = df.loc[df[geo_col] == geocode]

    region_data = region_rows.drop(columns = geo_col)

    suffix = f'_{geocode}'
    region_data.columns = region_data.columns + suffix

    return region_data

In [8]:
# Names of the climate features. They match the columns that predictors_clim_macro and
# predictors_clim_state aggregate by week.
predictors_clim = ['temp_min', 'temp_max', 'umid_min', 'umid_max',
                   'pressao_min', 'pressao_max', 'precip_tot', 'rainy_days',
                   'temp_mean', 'temp_amp','umid_mean','umid_amp',
                   'pressao_mean']

def predictors_ep_macro(macro): 
    '''
    Organize in a table the epidemiological predictors related to a specific health macroregion.

    The table is the column-wise (outer) concatenation of:
    - the series of each city that `split_geocodes` returns in the "above 30k" group,
      with the city geocode as column suffix;
    - the weekly total of `casos_est` over all the cities of the macroregion, with the
      suffix `_{macro}`;
    - the weekly aggregate of the small cities, with the suffix `_small` (`casos_est` is
      summed, `p_rt1` and `Rt` are averaged).
    Columns with only missing values are removed.

    :params macro: int. A four digit number
    :returns: pd.DataFrame indexed by week.
    '''

    geocodes, state = get_geocodes_and_state(macro)

    # get epidemiological factors
    # NOTE(review): the weekly resampling below needs a datetime index — presumably the file
    # is stored with `data_iniSE` as index; confirm
    df_ep = pd.read_parquet(f'{PATH}/cases/{state}_dengue.parquet',
                           columns = ['data_iniSE', 'casos_est', 'municipio_geocodigo', 'p_rt1', 'Rt', 'p_inc100k'])

    # select only the geocodes included in the health macroregion
    df_ep = df_ep.loc[df_ep.municipio_geocodigo.isin(geocodes)]

    df_ep = df_ep.sort_index()

    # split the geocodes between cities with population above and up to 30k in 2022
    g_up, g_low = split_geocodes(geocodes)

    # get the data of each city with population above 30k
    list_data_ep = [transform_data(df_ep, g) for g in g_up]

    # get the total weekly cases of this health macroregion
    data_macro_ep = df_ep[['casos_est']].resample('W-SUN').sum()

    data_macro_ep.columns = data_macro_ep.columns + f'_{macro}'

    list_data_ep.append(data_macro_ep)

    # aggregate the data from small cities
    # the aggregations are given by name: passing the numpy callables (np.sum, np.mean) to
    # `agg` is deprecated in recent pandas versions
    df_small = df_ep.loc[df_ep.municipio_geocodigo.isin(g_low), ['casos_est', 'p_rt1', 'Rt']]

    data_small_cities = df_small.resample('W-SUN').agg({'casos_est': 'sum',
                                                        'p_rt1': 'mean',
                                                        'Rt': 'mean'})

    data_small_cities.columns = data_small_cities.columns + '_small'

    list_data_ep.append(data_small_cities)

    data_ep = pd.concat(list_data_ep, axis=1, join='outer')

    # remove columns with all values nan
    data_ep = data_ep.dropna(axis =1, how = 'all')

    return data_ep


def predictors_clim_macro(macro):
    '''
    Organize in a table the climate predictors related to a specific health macroregion.

    The daily climate series of the state are filtered to the cities of the macroregion and
    to the years from 2010 on, the derived features (means, amplitudes and rainy days) are
    computed and everything is aggregated by week (`W-SUN`) for each city. The table is the
    column-wise (outer) concatenation of:
    - the weekly series of each city that `split_geocodes` returns in the "above 30k"
      group, with the city geocode as column suffix;
    - the weekly mean over the small cities, with the suffix `_small`.
    Missing values are forward filled and columns with only missing values are removed.

    :params macro: int. A four digit number
    :returns: pd.DataFrame indexed by week.
    '''
    geocodes, state = get_geocodes_and_state(macro)

    # get climate factors
    # Every column of the file is read on purpose: `precip_max` and `index` are used below.
    # Passing `predictors_clim.append('geocodigo')` as `columns` has the same effect on the
    # read (list.append returns None) but adds one more 'geocodigo' to the module-level
    # list on every call, so it is not used.
    df_clim = pd.read_parquet(f'{PATH}/climate/{state}_climate.parquet')

    # select only the geocodes included in the health macroregion
    df_clim = df_clim.loc[df_clim.geocodigo.isin(geocodes)]

    df_clim = df_clim.loc[df_clim.index.year >= 2010]

    df_clim = df_clim.drop(columns = 'index')

    # compute other climate features (all of them depend only on the columns read from the file)
    df_clim = df_clim.assign(
        temp_mean = (df_clim.temp_max + df_clim.temp_min)/2,
        pressao_mean = (df_clim.pressao_max + df_clim.pressao_min)/2,
        umid_mean = (df_clim.umid_max + df_clim.umid_min)/2,
        temp_amp = df_clim.temp_max - df_clim.temp_min,
        # Rainy days
        rainy_days = df_clim.precip_max > 0,
        # Humidity amplitude
        umid_amp = df_clim.umid_max - df_clim.umid_min,
    )

    # weekly aggregation of each feature; the aggregations are given by name because passing
    # the numpy callables (np.mean, np.sum) to `agg` is deprecated in recent pandas versions
    weekly_agg = {'temp_min': 'mean', 'temp_max': 'mean',
                  'umid_min': 'mean', 'umid_max': 'mean',
                  'pressao_min': 'mean', 'pressao_max': 'mean',
                  'precip_tot': 'sum', 'rainy_days': 'sum',
                  'temp_mean': 'mean', 'temp_amp': 'mean',
                  'umid_mean': 'mean', 'umid_amp': 'mean',
                  'pressao_mean': 'mean'}

    # agg data by week since that's the time scale of the cases
    df_clim = (df_clim.groupby('geocodigo').resample('W-SUN').agg(weekly_agg)
                      .reset_index().set_index('date'))

    # split the geocodes between cities with population above and up to 30k in 2022
    g_up, g_low = split_geocodes(geocodes)

    # get the predictors of each city with population above 30k
    list_data_clim = [transform_data(df_clim, g, 'geocodigo') for g in g_up]

    # aggregate the data from small cities and save the mean as predictor
    df_small = df_clim.loc[df_clim.geocodigo.isin(g_low), list(weekly_agg)]

    data_small_cities = df_small.resample('W-SUN').mean()

    data_small_cities.columns = data_small_cities.columns + '_small'

    list_data_clim.append(data_small_cities)

    # `.ffill()` replaces the deprecated `.fillna(method='ffill')`
    data_clim = pd.concat(list_data_clim, axis=1, join='outer').ffill()

    # remove columns with all values nan
    data_clim = data_clim.dropna(axis =1, how = 'all')

    return data_clim


def get_data_macro(macro):
    '''
    Build the table with the epidemiological and climate predictors of a specific health
    macroregion, joined side by side, plus the columns created by `add_new_columns`.

    :params macro: int. A four-digit number
    :returns: pd.DataFrame returned by `add_new_columns`.
    '''

    # the epidemiological table is built first, then the climate one
    tables = [predictors_ep_macro(macro), predictors_clim_macro(macro)]

    return add_new_columns(pd.concat(tables, axis = 1, join = 'outer'))


def predictors_ep_state(state): 
    
    '''
    Organize in a table the epidemiological predictors related to a specific state.

    The cases of the state are linked to the health macroregions through the global `dfs`
    table and aggregated by week (`W-SUN`) for each macroregion (`casos_est` is summed,
    `p_rt1` and `Rt` are averaged). The table is the column-wise (outer) concatenation of:
    - the weekly series of each macroregion, with the macroregion code as column suffix;
    - the weekly total of `casos_est` of the whole state, with the suffix `_{state}`.
    Columns with only missing values are removed.

    Note: the link with `dfs` is an inner merge, so cities without a match in `dfs` are left
    out of the macroregion columns but are still counted in the state total.

    :params state: str. Two-letter code
    :returns: pd.DataFrame indexed by week.
    '''

    # get epidemiological factors
    # NOTE(review): the weekly resampling below needs a datetime index — presumably the file
    # is stored with `data_iniSE` as index; confirm
    df_ep = pd.read_parquet(f'{PATH}/cases/{state}_dengue.parquet',
                           columns = ['data_iniSE', 'casos_est', 'municipio_geocodigo', 'p_rt1', 'Rt'])

    # city-level data, used later to compute the target for the whole state; no copy is
    # needed because this frame is never modified (the steps below build new frames)
    df_ep_cities = df_ep.sort_index()

    # link the geocode and the health macroregion code
    geocode_to_macro = dfs[['code_macro', 'geocode']].rename(columns = {'geocode':'municipio_geocodigo'})

    df_ep = (df_ep_cities.reset_index()
                         .merge(geocode_to_macro, on = 'municipio_geocodigo')
                         .set_index('data_iniSE')
                         .drop(columns = 'municipio_geocodigo'))

    # resample the data by macroregion; the aggregations are given by name because passing
    # the numpy callables (np.sum, np.mean) to `agg` is deprecated in recent pandas versions
    df_ep = df_ep.groupby('code_macro').resample('W-SUN').agg({'casos_est': 'sum',
                                                               'p_rt1': 'mean',
                                                               'Rt': 'mean'}).reset_index().set_index('data_iniSE')

    df_ep.index = pd.to_datetime(df_ep.index)

    # transform in column the data of each predictor by macroregion
    list_data_ep = [transform_data(df_ep, m, 'code_macro') for m in df_ep.code_macro.unique()]

    # get the total weekly cases of the state (it will be used as target)
    data_state_ep = df_ep_cities[['casos_est']].resample('W-SUN').sum()

    data_state_ep.columns = data_state_ep.columns + f'_{state}'

    list_data_ep.append(data_state_ep)

    # final dataframe
    data_ep = pd.concat(list_data_ep, axis=1, join='outer')

    # remove columns with all values nan
    data_ep = data_ep.dropna(axis =1, how = 'all')

    return data_ep

def predictors_clim_state(state):
    '''
    Organize in a table the climate predictors related to a specific state.

    The daily climate series of the state are filtered to the years from 2010 on, the derived
    features (means, amplitudes and rainy days) are computed, the cities are linked to the
    health macroregions through the global `dfs` table and everything is aggregated by week
    (`W-SUN`) for each macroregion. The table is the column-wise (outer) concatenation of
    the weekly series of each macroregion, with the macroregion code as column suffix.
    Missing values are forward filled and columns with only missing values are removed.

    :params state: str. Two-letter code
    :returns: pd.DataFrame indexed by week.
    '''

    # get climate factors
    # Every column of the file is read on purpose: `precip_max` and `index` are used below.
    # Passing `predictors_clim.append('geocodigo')` as `columns` has the same effect on the
    # read (list.append returns None) but adds one more 'geocodigo' to the module-level
    # list on every call, so it is not used.
    df_clim = pd.read_parquet(f'{PATH}/climate/{state}_climate.parquet')

    df_clim = df_clim.loc[df_clim.index.year >= 2010]

    df_clim = df_clim.drop(columns = 'index')

    # compute other climate features (all of them depend only on the columns read from the file)
    df_clim = df_clim.assign(
        temp_mean = (df_clim.temp_max + df_clim.temp_min)/2,
        pressao_mean = (df_clim.pressao_max + df_clim.pressao_min)/2,
        umid_mean = (df_clim.umid_max + df_clim.umid_min)/2,
        temp_amp = df_clim.temp_max - df_clim.temp_min,
        # Rainy days
        rainy_days = df_clim.precip_max > 0,
        # Humidity amplitude
        umid_amp = df_clim.umid_max - df_clim.umid_min,
    )

    # link the geocode and the health macroregion code
    geocode_to_macro = dfs[['code_macro', 'geocode']].rename(columns = {'geocode':'geocodigo'})

    df_clim = (df_clim.reset_index()
                      .merge(geocode_to_macro, on = 'geocodigo')
                      .set_index('date')
                      .drop(columns = 'geocodigo'))

    # weekly aggregation of each feature; the aggregations are given by name because passing
    # the numpy callables (np.mean, np.sum) to `agg` is deprecated in recent pandas versions
    weekly_agg = {'temp_min': 'mean', 'temp_max': 'mean',
                  'umid_min': 'mean', 'umid_max': 'mean',
                  'pressao_min': 'mean', 'pressao_max': 'mean',
                  'precip_tot': 'sum', 'rainy_days': 'sum',
                  'temp_mean': 'mean', 'temp_amp': 'mean',
                  'umid_mean': 'mean', 'umid_amp': 'mean',
                  'pressao_mean': 'mean'}

    # resample the data by macroregion
    df_clim = (df_clim.groupby('code_macro').resample('W-SUN').agg(weekly_agg)
                      .reset_index().set_index('date'))

    # transform in column the data of each predictor by macroregion
    list_data_clim = [transform_data(df_clim, m, 'code_macro') for m in df_clim.code_macro.unique()]

    # final dataframe
    data_clim = pd.concat(list_data_clim, axis=1, join='outer').ffill()

    # remove columns with all values nan
    data_clim = data_clim.dropna(axis =1, how = 'all')

    return data_clim


def get_data_state(state):
    '''
    Build the table with the epidemiological and climate predictors of a specific state,
    joined side by side, plus the columns created by `add_new_columns`.

    :params state: str. Two-letter code
    :returns: pd.DataFrame returned by `add_new_columns`.
    '''

    # the epidemiological table is built first, then the climate one
    tables = [predictors_ep_state(state), predictors_clim_state(state)]

    return add_new_columns(pd.concat(tables, axis = 1, join = 'outer'))


Get the data for a single health macroregion (example: 3524):

In [11]:
macro = 3524
# build the table of this macroregion and keep only the complete rows
df1 = get_data_macro(macro).dropna()
df1.tail()

Unnamed: 0,casos_est_3509007,p_rt1_3509007,Rt_3509007,p_inc100k_3509007,casos_est_3509205,p_rt1_3509205,Rt_3509205,p_inc100k_3509205,casos_est_3516309,p_rt1_3516309,...,umid_amp_3528502,pressao_mean_3528502,month,SE,diff_casos_est_3509007,diff_casos_est_3509205,diff_casos_est_3516309,diff_casos_est_3516408,diff_casos_est_3528502,diff_casos_est_3524
2023-12-24,11.0,0.849614,1.626461,10.702992,5.0,0.531612,1.049846,6.415685,1.0,0.561013,...,44.306006,0.998822,12,52,9.0,5.0,-1.0,26.0,7.0,46.0
2023-12-31,48.0,1.0,5.556517,46.703964,17.0,0.998727,4.261405,21.81333,1.0,0.449917,...,38.311157,1.001046,12,1,37.0,12.0,0.0,78.0,16.0,143.0
2024-01-07,48.0,0.999794,2.449435,46.703964,24.0,0.998628,3.092819,30.795288,3.0,0.823479,...,35.258153,0.999859,1,2,0.0,7.0,2.0,5.0,26.0,40.0
2024-01-14,123.0,1.0,3.058544,119.67891,32.0,0.992491,2.057293,41.060383,5.0,0.914892,...,36.338773,1.00044,1,3,75.0,8.0,2.0,50.0,4.0,139.0
2024-01-21,102.0,0.996929,1.530675,99.245926,22.0,0.446684,0.961129,28.229013,0.0,0.0,...,30.937168,0.999843,1,4,-21.0,-10.0,-5.0,94.5,-23.0,35.5


In [14]:
# columns of df1 that still have missing values (the count per column is computed only once)
df1.isnull().sum().loc[lambda counts: counts > 0]

Series([], dtype: int64)

In [15]:

# Build and save the table of every health macroregion listed in `dfs`.
# To update a single state, iterate over e.g. dfs.loc[dfs.state=='MG'].code_macro.unique() instead.
for macro in dfs.code_macro.unique():

    # keep only the complete rows
    df1 = get_data_macro(macro).dropna()

    df1.to_csv(f'../data/dengue_{macro}.csv.gz')

In [16]:
# `df1` holds the table of the last macroregion processed by the loop
df1.tail()

Unnamed: 0,casos_est_5300108,p_rt1_5300108,Rt_5300108,p_inc100k_5300108,casos_est_5302,temp_min_5300108,temp_max_5300108,umid_min_5300108,umid_max_5300108,pressao_min_5300108,...,rainy_days_5300108,temp_mean_5300108,temp_amp_5300108,umid_mean_5300108,umid_amp_5300108,pressao_mean_5300108,month,SE,diff_casos_est_5300108,diff_casos_est_5302
2023-12-24,3710.0,1.0,1.497524,121.43434,3710.0,19.553248,27.436691,54.491029,87.907621,0.99686,...,6.0,23.494969,7.883443,71.199325,33.416592,0.999133,12,52,738.0,738.0
2023-12-31,7222.5,1.0,2.250844,236.40417,7222.5,19.730083,28.39201,57.310584,91.638233,0.998582,...,8.0,24.061047,8.661926,74.474408,34.327649,1.000773,12,1,3512.5,3512.5
2024-01-07,7915.5,1.0,1.749926,259.0872,7915.5,19.584608,25.075889,71.175181,94.354639,0.997495,...,7.0,22.330248,5.491281,82.76491,23.179458,0.999493,1,2,693.0,693.0
2024-01-14,11792.0,1.0,1.813226,385.97134,11792.0,19.405648,25.900037,66.06918,95.123479,0.999285,...,7.0,22.652843,6.494389,80.59633,29.054298,1.001286,1,3,3876.5,3876.5
2024-01-21,17397.5,1.0,1.952161,569.4485,17397.5,18.981236,29.644913,46.036609,90.327574,0.998762,...,8.0,24.313074,10.663677,68.182091,44.290965,1.001174,1,4,5605.5,5605.5


In [20]:
state = 'MG'

# build the table of the state (predictors by macroregion and the state total of cases)
df2 = get_data_state(state)

# unlike the macroregion tables, this one is saved without dropping the rows with missing values
df2.to_csv(f'../data/dengue_{state}.csv.gz')

df2.head()

Unnamed: 0,casos_est_3101,p_rt1_3101,Rt_3101,casos_est_3102,p_rt1_3102,Rt_3102,casos_est_3103,p_rt1_3103,Rt_3103,casos_est_3104,...,diff_casos_est_3106,diff_casos_est_3107,diff_casos_est_3108,diff_casos_est_3109,diff_casos_est_3110,diff_casos_est_3111,diff_casos_est_3112,diff_casos_est_3113,diff_casos_est_3114,diff_casos_est_MG
2010-01-03,12.0,0.0,0.0,4.0,0.0,0.0,614.0,0.0,0.0,74.0,...,,,,,,,,,,
2010-01-10,10.0,0.0,0.0,3.0,0.0,0.0,892.0,0.0,0.0,78.0,...,56.0,103.0,211.0,61.0,3.0,44.0,40.0,69.0,9.0,935.0
2010-01-17,28.0,0.0,0.0,6.0,0.0,0.0,1092.0,0.0,0.0,85.0,...,50.0,-26.0,284.0,63.0,-7.0,34.0,46.0,-20.0,22.0,927.0
2010-01-24,55.0,0.0,0.0,6.0,0.0,0.0,1673.0,0.0,0.0,84.0,...,14.0,8.0,571.0,147.0,4.0,42.0,17.0,-32.0,7.0,1539.0
2010-01-31,74.0,0.0,0.0,2.0,0.0,0.0,2298.0,0.0,0.0,34.0,...,22.0,70.0,-65.0,161.0,20.0,64.0,-44.0,59.0,9.0,984.0


In [21]:
macro = 1101 

filename_data = f'../data/dengue_{macro}.csv.gz'

# read only the first data row (nrows = 1) of the file saved for this macroregion
df = pd.read_csv(filename_data, index_col='Unnamed: 0', nrows = 1)

df

Unnamed: 0,casos_est_1100049,p_rt1_1100049,Rt_1100049,p_inc100k_1100049,casos_est_1100189,p_rt1_1100189,Rt_1100189,p_inc100k_1100189,casos_est_1100288,p_rt1_1100288,...,umid_amp_small,pressao_mean_small,month,SE,diff_casos_est_1100049,diff_casos_est_1100189,diff_casos_est_1100288,diff_casos_est_1100304,diff_casos_est_1101,diff_casos_est_small
2010-01-03,164.0,0.0,0.0,190.93523,52.0,0.0,0.0,140.994,186.0,0.0,...,21.13686,0.994904,1,1,,,,,,


In [23]:
# number of rows that were read
df.shape[0]

1