# Notebook to update data for the forecast

In [1]:
import numpy as np
import pandas as pd
from epiweeks import Week
import matplotlib.pyplot as plt
import os
os.getcwd()

'/Users/eduardoaraujo/Documents/Github/transfer-learning-forecast/forecast'

## LSTM models for the states and "Macroregionais de saúde"

### Path where the cases and climate data are saved:  

In [2]:
# Base folder (relative to the notebook's working directory) of the input data;
# predictors_ep_macro reads the cases files from f'{PATH}/cases/'.
PATH = '../data'

### The dataframe below is used by the functions to map the municipality geocodes to the health macroregion codes:
    

In [3]:
# Lookup table with one row per municipality: geocode, health region, health
# macroregion (code_macro) and state. It is kept as the global `dfs`, which the
# state-level functions and the loops further down read directly.
dfs = pd.read_csv('../macro_saude.csv')

dfs.head()

Unnamed: 0,geocode,name_muni,name_region,code_region,name_macro,code_macro,state
0,1100015,Alta Floresta D'Oeste,Zona da Mata,11005,Cacoal,1101,RO
1,1100023,Ariquemes,Vale do Jamari,11001,Porto Velho,1102,RO
2,1100031,Cabixi,Cone Sul,11006,Cacoal,1101,RO
3,1100049,Cacoal,Café,11002,Cacoal,1101,RO
4,1100056,Cerejeiras,Cone Sul,11006,Cacoal,1101,RO


In [4]:
def add_new_columns(df):
    '''
    Add calendar features and the first difference of the case series to the table.

    The columns `month` (month number of the index) and `SE` (epidemiological week number
    given by `epiweeks.Week`) are written directly into `df`. For every column whose name
    starts with `casos`, a column `diff_<name>` holding its first difference is appended;
    the first row of these columns is NaN, since the difference is undefined there.

    :param df: pd.DataFrame. Must be indexed by dates (a DatetimeIndex) and have string column names.
    :returns: pd.DataFrame. A new table with the original columns, `month`, `SE` and the `diff_*` columns.
    '''

    df['month'] = df.index.month
    df['SE'] = [Week.fromdate(day).weektuple()[1] for day in df.index]

    # manual override of the week number for this specific date
    df.loc[df.index == '2018-04-04', 'SE'] = 15

    case_columns = df.columns[df.columns.str.startswith('casos')]

    # each difference has one element less than the series, so it is indexed from the second date on
    pieces = [df] + [
        pd.DataFrame(data = np.diff(df[f'{name}'], 1), index = df.index[1:], columns = [f'diff_{name}'])
        for name in case_columns
    ]

    return pd.concat(pieces, axis = 1, join = 'outer')
    
    
    

In [5]:
def get_geocodes_and_state(macro): 
    '''
    Look up the municipalities and the state of a health macroregion.

    The link table is read from '../macro_saude.csv' at every call.

    :param macro: int. A four-digit health macroregion code.
    :returns: tuple (geocodes, state). `geocodes` is an array with the unique geocodes of the
              macroregion and `state` is the state value of the first matching row.
              An IndexError is raised if no row matches `macro`.
    '''

    macro_table = pd.read_csv('../macro_saude.csv')

    # keep only the rows of the requested macroregion and read both answers from them
    rows = macro_table[macro_table['code_macro'] == macro]

    return rows['geocode'].unique(), rows['state'].values[0]

In [6]:
def split_geocodes(geocodes):
    '''
    Split the geocodes according to the 2022 population of each city.

    The population table is read from 'poptcu2010-2022_rgi.csv' (columns CODMUN7 and POP22).
    Cities with POP22 <= 30000 go to the "low" group; every other geocode, including the ones
    that are not in the population table, goes to the "up" group.

    :param geocodes: np.array of int. Seven-digit IBGE codes of brazilian cities.
    :returns: tuple (g_up, g_low) of arrays. `g_up` is sorted and without repetitions.
    '''

    population = pd.read_csv('poptcu2010-2022_rgi.csv')

    is_small = population['CODMUN7'].isin(geocodes) & (population['POP22'] <= 30000)
    g_low = population.loc[is_small, 'CODMUN7'].unique()

    g_up = np.setdiff1d(geocodes, g_low)

    # sanity check: the two groups together must have as many codes as the input
    n_split = g_low.shape[0] + g_up.shape[0]
    if n_split != geocodes.shape[0]:
        print('Error subtracting geocodes')

    return g_up, g_low

In [7]:
def transform_data(df, geocode, geo_col = 'municipio_geocodigo'): 
    '''
    Extract the rows of one region and tag the remaining columns with the region code.

    :param df: pd.DataFrame. Table with string column names, including `geo_col`.
    :param geocode: Value to look for in `geo_col`. Must be of the same type as the values of `geo_col`.
    :param geo_col: str. Name of the column of `df` used to filter the region.
    :returns: pd.DataFrame. The matching rows without `geo_col` and with `_{geocode}` appended
              to every column name. `df` itself is not changed.
    '''

    selected = df[df[geo_col] == geocode].drop(columns = [geo_col])

    selected.columns = selected.columns + f'_{geocode}'

    return selected

In [8]:
# Names of the climate features handled for each location. They are the same names
# used as keys of the weekly aggregation in predictors_clim_macro and predictors_clim_state.
predictors_clim = ['temp_min', 'temp_max', 'umid_min', 'umid_max',
                   'pressao_min', 'pressao_max', 'precip_tot', 'rainy_days',
                   'temp_mean', 'temp_amp','umid_mean','umid_amp',
                   'pressao_mean']

def predictors_ep_macro(macro): 
    '''
    Build the table of epidemiological predictors of a health macroregion.

    The table puts side by side, indexed by date:
      * the series of every city of the macroregion with more than 30k inhabitants
        (columns suffixed with the city geocode);
      * the weekly (W-SUN) total of `casos_est` of the whole macroregion (`casos_est_{macro}`);
      * the weekly aggregate of the cities up to 30k inhabitants (suffix `_small`):
        sum of `casos_est` and mean of `p_rt1` and `Rt`.

    :param macro: int. A four-digit health macroregion code.
    :returns: pd.DataFrame without the columns that are entirely NaN.
    '''

    geocodes, state = get_geocodes_and_state(macro)

    # get epidemiological factors
    df_ep = pd.read_parquet(f'{PATH}/cases/{state}_dengue.parquet',
                            columns = ['data_iniSE', 'casos_est', 'municipio_geocodigo', 'p_rt1', 'Rt', 'p_inc100k'])

    # select only the geocodes included in the health macroregion
    df_ep = df_ep.loc[df_ep.municipio_geocodigo.isin(geocodes)].sort_index()

    # split the geocodes between cities with population above and up to 30k in 2022
    g_up, g_low = split_geocodes(geocodes)

    # one block of columns for each city with population above 30k
    list_data_ep = [transform_data(df_ep, g) for g in g_up]

    # total weekly cases of the health macroregion
    data_macro_ep = df_ep[['casos_est']].resample('W-SUN').sum()
    data_macro_ep.columns = data_macro_ep.columns + f'_{macro}'
    list_data_ep.append(data_macro_ep)

    # aggregate the data from small cities.
    # The aggregations are named by string: passing np.sum / np.mean to `agg` is deprecated
    # in recent pandas versions, and the strings select the same pandas implementations.
    small_cities = df_ep.loc[df_ep.municipio_geocodigo.isin(g_low), ['casos_est', 'p_rt1', 'Rt']]
    data_small_cities = small_cities.resample('W-SUN').agg({'casos_est': 'sum',
                                                            'p_rt1': 'mean',
                                                            'Rt': 'mean'})
    data_small_cities.columns = data_small_cities.columns + '_small'
    list_data_ep.append(data_small_cities)

    data_ep = pd.concat(list_data_ep, axis = 1, join = 'outer')

    # remove columns with all values nan (e.g. the `_small` block when there is no small city)
    data_ep = data_ep.dropna(axis = 1, how = 'all')

    return data_ep


def predictors_clim_macro(macro):
    '''
    Build the table of climate predictors of a health macroregion.

    Daily climate records are turned into weekly (W-SUN) values per city: precipitation and
    rainy days are summed, every other feature is averaged. The table puts side by side:
      * the weekly series of every city with more than 30k inhabitants
        (columns suffixed with the city geocode);
      * the weekly mean over the cities up to 30k inhabitants (suffix `_small`).
    Gaps are forward filled.

    :param macro: int. A four-digit health macroregion code.
    :returns: pd.DataFrame indexed by date, without the columns that are entirely NaN.
    '''
    geocodes, state = get_geocodes_and_state(macro)

    # get climate factors.
    # All columns of the file are read on purpose: the code below needs `index` and
    # `precip_max`, and most names in `predictors_clim` are derived here rather than stored.
    # (The former `columns = predictors_clim.append('geocodigo')` evaluated to None, which also
    # read everything, but it added 'geocodigo' to the global list again at every call.)
    df_clim = pd.read_parquet(f'{PATH}/climate/{state}_climate.parquet')

    # select only the geocodes included in the health macroregion
    df_clim = df_clim.loc[df_clim.geocodigo.isin(geocodes)]

    # `drop` returns a new table, so the features below are not written into a filtered slice
    df_clim = df_clim.loc[df_clim.index.year >= 2010].drop(columns = 'index')

    # compute other climate features
    df_clim = df_clim.assign(
        temp_mean = (df_clim.temp_max + df_clim.temp_min) / 2,
        pressao_mean = (df_clim.pressao_max + df_clim.pressao_min) / 2,
        umid_mean = (df_clim.umid_max + df_clim.umid_min) / 2,
        temp_amp = df_clim.temp_max - df_clim.temp_min,
        # Rainy days
        rainy_days = df_clim.precip_max > 0,
        # Humidity amplitude
        umid_amp = df_clim.umid_max - df_clim.umid_min,
    )

    # Aggregations named by string: passing np.sum / np.mean to `agg` is deprecated in
    # recent pandas versions, and the strings select the same pandas implementations.
    weekly_agg = {'temp_min': 'mean', 'temp_max': 'mean',
                  'umid_min': 'mean', 'umid_max': 'mean',
                  'pressao_min': 'mean', 'pressao_max': 'mean',
                  'precip_tot': 'sum', 'rainy_days': 'sum',
                  'temp_mean': 'mean', 'temp_amp': 'mean',
                  'umid_mean': 'mean', 'umid_amp': 'mean',
                  'pressao_mean': 'mean'}

    # agg data by week since that's the time scale of the cases
    df_clim = (df_clim.groupby('geocodigo')
                      .resample('W-SUN')
                      .agg(weekly_agg)
                      .reset_index()
                      .set_index('date'))

    # split the geocodes between cities with population above and up to 30k in 2022
    g_up, g_low = split_geocodes(geocodes)

    # get the predictors of each city with population above 30k
    list_data_clim = [transform_data(df_clim, g, 'geocodigo') for g in g_up]

    # aggregate the data from small cities and save the mean as predictor
    small_cities = df_clim.loc[df_clim.geocodigo.isin(g_low), list(weekly_agg)]
    data_small_cities = small_cities.resample('W-SUN').mean()
    data_small_cities.columns = data_small_cities.columns + '_small'

    list_data_clim.append(data_small_cities)

    # `.ffill()` replaces the deprecated `fillna(method='ffill')`
    data_clim = pd.concat(list_data_clim, axis = 1, join = 'outer').ffill()

    # remove columns with all values nan
    data_clim = data_clim.dropna(axis = 1, how = 'all')

    return data_clim


def get_data_macro(macro):
    '''
    Assemble the full table (epidemiological and climate predictors) of a health macroregion.

    The two tables are joined side by side on their index (outer join) and then passed to
    `add_new_columns`, which adds the `month`, `SE` and `diff_*` columns.

    :param macro: int. A four-digit health macroregion code.
    :returns: pd.DataFrame.
    '''

    frames = [predictors_ep_macro(macro), predictors_clim_macro(macro)]

    return add_new_columns(pd.concat(frames, axis = 1, join = 'outer'))


def predictors_ep_state(state): 
    '''
    Build the table of epidemiological predictors of a state.

    For every health macroregion of the state the weekly (W-SUN) sum of `casos_est` and the
    weekly mean of `p_rt1` and `Rt` are computed and placed side by side, with the macroregion
    code as column suffix. The total weekly `casos_est` of the state (`casos_est_{state}`),
    used as target, is added as the last column.

    The link between geocodes and macroregions comes from the global table `dfs`. Since the
    merge keeps only matching rows, a municipality that is not in `dfs` does not enter the
    macroregion series, but it is still counted in the state total.

    :param state: str. Two-letter code of the state.
    :returns: pd.DataFrame without the columns that are entirely NaN.
    '''

    # get epidemiological factors
    df_ep = pd.read_parquet(f'{PATH}/cases/{state}_dengue.parquet',
                            columns = ['data_iniSE', 'casos_est', 'municipio_geocodigo', 'p_rt1', 'Rt'])

    df_ep = df_ep.sort_index()

    # total weekly cases of the state (it will be used as target). It is computed before the
    # merge below, which makes a copy of the whole table unnecessary.
    data_state_ep = df_ep[['casos_est']].resample('W-SUN').sum()
    data_state_ep.columns = data_state_ep.columns + f'_{state}'

    # link the geocode and the health macroregion code
    geo_to_macro = dfs[['code_macro', 'geocode']].rename(columns = {'geocode': 'municipio_geocodigo'})
    df_ep = (df_ep.reset_index()
                  .merge(geo_to_macro, on = 'municipio_geocodigo')
                  .set_index('data_iniSE')
                  .drop(columns = 'municipio_geocodigo'))

    # resample the data by macroregion.
    # Aggregations named by string: passing np.sum / np.mean to `agg` is deprecated in
    # recent pandas versions, and the strings select the same pandas implementations.
    df_ep = (df_ep.groupby('code_macro')
                  .resample('W-SUN')
                  .agg({'casos_est': 'sum', 'p_rt1': 'mean', 'Rt': 'mean'})
                  .reset_index()
                  .set_index('data_iniSE'))

    df_ep.index = pd.to_datetime(df_ep.index)

    # turn the data of each macroregion into its own block of columns
    list_data_ep = [transform_data(df_ep, m, 'code_macro') for m in df_ep.code_macro.unique()]

    list_data_ep.append(data_state_ep)

    # final dataframe
    data_ep = pd.concat(list_data_ep, axis = 1, join = 'outer')

    # remove columns with all values nan
    data_ep = data_ep.dropna(axis = 1, how = 'all')

    return data_ep

def predictors_clim_state(state):
    '''
    Build the table of climate predictors of a state.

    Daily climate records from 2010 on are grouped by health macroregion and turned into weekly
    (W-SUN) values: precipitation and rainy days are summed, every other feature is averaged.
    The series of each macroregion are placed side by side, with the macroregion code as column
    suffix, and gaps are forward filled.

    The link between geocodes and macroregions comes from the global table `dfs`; records of
    a geocode that is not in `dfs` are dropped by the merge.

    :param state: str. Two-letter code of the state.
    :returns: pd.DataFrame indexed by date, without the columns that are entirely NaN.
    '''

    # get climate factors.
    # All columns of the file are read on purpose: the code below needs `index` and
    # `precip_max`, and most names in `predictors_clim` are derived here rather than stored.
    # (The former `columns = predictors_clim.append('geocodigo')` evaluated to None, which also
    # read everything, but it added 'geocodigo' to the global list again at every call.)
    df_clim = pd.read_parquet(f'{PATH}/climate/{state}_climate.parquet')

    # `drop` returns a new table, so the features below are not written into a filtered slice
    df_clim = df_clim.loc[df_clim.index.year >= 2010].drop(columns = 'index')

    # compute other climate features
    df_clim = df_clim.assign(
        temp_mean = (df_clim.temp_max + df_clim.temp_min) / 2,
        pressao_mean = (df_clim.pressao_max + df_clim.pressao_min) / 2,
        umid_mean = (df_clim.umid_max + df_clim.umid_min) / 2,
        temp_amp = df_clim.temp_max - df_clim.temp_min,
        # Rainy days
        rainy_days = df_clim.precip_max > 0,
        # Humidity amplitude
        umid_amp = df_clim.umid_max - df_clim.umid_min,
    )

    # link the geocode and the health macroregion code
    geo_to_macro = dfs[['code_macro', 'geocode']].rename(columns = {'geocode': 'geocodigo'})
    df_clim = (df_clim.reset_index()
                      .merge(geo_to_macro, on = 'geocodigo')
                      .set_index('date')
                      .drop(columns = 'geocodigo'))

    # Aggregations named by string: passing np.sum / np.mean to `agg` is deprecated in
    # recent pandas versions, and the strings select the same pandas implementations.
    weekly_agg = {'temp_min': 'mean', 'temp_max': 'mean',
                  'umid_min': 'mean', 'umid_max': 'mean',
                  'pressao_min': 'mean', 'pressao_max': 'mean',
                  'precip_tot': 'sum', 'rainy_days': 'sum',
                  'temp_mean': 'mean', 'temp_amp': 'mean',
                  'umid_mean': 'mean', 'umid_amp': 'mean',
                  'pressao_mean': 'mean'}

    # resample the data by macroregion
    df_clim = (df_clim.groupby('code_macro')
                      .resample('W-SUN')
                      .agg(weekly_agg)
                      .reset_index()
                      .set_index('date'))

    # turn the data of each macroregion into its own block of columns
    list_data_clim = [transform_data(df_clim, m, 'code_macro') for m in df_clim.code_macro.unique()]

    # final dataframe
    data_clim = pd.concat(list_data_clim, axis = 1, join = 'outer').ffill()

    # remove columns with all values nan
    data_clim = data_clim.dropna(axis = 1, how = 'all')

    return data_clim


def get_data_state(state):
    '''
    Assemble the full table (epidemiological and climate predictors) of a state.

    The two tables are joined side by side on their index (outer join) and then passed to
    `add_new_columns`, which adds the `month`, `SE` and `diff_*` columns.

    :param state: str. Two-letter code of the state.
    :returns: pd.DataFrame.
    '''

    frames = [predictors_ep_state(state), predictors_clim_state(state)]

    return add_new_columns(pd.concat(frames, axis = 1, join = 'outer'))


Get the data for one health macroregion (example below), then for every macroregion:

In [9]:
# Example with a single macroregion: build its table and keep only the complete rows
macro = 3524
df1 = get_data_macro(macro).dropna()
df1.tail()

Unnamed: 0,casos_est_3509007,p_rt1_3509007,Rt_3509007,p_inc100k_3509007,casos_est_3509205,p_rt1_3509205,Rt_3509205,p_inc100k_3509205,casos_est_3516309,p_rt1_3516309,...,umid_amp_3528502,pressao_mean_3528502,month,SE,diff_casos_est_3509007,diff_casos_est_3509205,diff_casos_est_3516309,diff_casos_est_3516408,diff_casos_est_3528502,diff_casos_est_3524
2023-12-31,48.0,1.0,5.556517,46.703964,18.0,0.999216,4.451825,23.096466,2.0,0.694782,...,38.311157,1.001046,12,1,37.0,13.0,1.0,79.0,16.0,146.0
2024-01-07,47.0,0.999709,2.405032,45.730965,24.0,0.998334,2.999294,30.795288,7.0,0.974084,...,35.258153,0.999859,1,2,-1.0,6.0,5.0,2.0,25.0,37.0
2024-01-14,130.0,1.0,3.221716,126.48991,44.0,0.999848,2.650402,56.458027,13.0,0.993219,...,36.338773,1.00044,1,3,83.0,20.0,6.0,43.0,10.0,162.0
2024-01-21,139.0,0.999999,1.987708,135.2469,77.0,0.999999,2.654727,98.80155,16.0,0.967595,...,30.937168,0.999843,1,4,9.0,33.0,3.0,92.5,1.0,138.5
2024-01-28,161.0,0.999389,1.487955,156.65288,124.5,1.0,2.504608,159.75056,20.0,0.926808,...,22.88671,1.001558,1,5,22.0,47.5,4.0,73.5,-22.0,125.0


In [10]:
# Columns that still have missing values (expected to be none after dropna)
null_counts = df1.isnull().sum()
null_counts[null_counts > 0]

Series([], dtype: int64)

In [11]:

# Build and save the table of every health macroregion listed in `dfs`.
# `df1` keeps the table of the last macroregion processed; it is inspected in the next cell.
for macro in dfs.code_macro.unique():

    # only the rows without missing values are saved
    df1 = get_data_macro(macro).dropna()

    df1.to_csv(f'{PATH}/dengue_{macro}.csv.gz')

In [12]:
# Last rows of `df1`, i.e. of the last macroregion processed by the loop above
df1.tail()

Unnamed: 0,casos_est_5300108,p_rt1_5300108,Rt_5300108,p_inc100k_5300108,casos_est_5302,temp_min_5300108,temp_max_5300108,umid_min_5300108,umid_max_5300108,pressao_min_5300108,...,rainy_days_5300108,temp_mean_5300108,temp_amp_5300108,umid_mean_5300108,umid_amp_5300108,pressao_mean_5300108,month,SE,diff_casos_est_5300108,diff_casos_est_5302
2023-12-31,7856.5,1.0,2.409659,257.15604,7856.5,19.730083,28.39201,57.310584,91.638233,0.998582,...,8.0,24.061047,8.661926,74.474408,34.327649,1.000773,12,1,4055.5,4055.5
2024-01-07,8215.0,1.0,1.738111,268.89032,8215.0,19.584608,25.075889,71.175181,94.354639,0.997495,...,7.0,22.330248,5.491281,82.76491,23.179458,0.999493,1,2,358.5,358.5
2024-01-14,12394.5,1.0,1.800837,405.69217,12394.5,19.405648,25.900037,66.06918,95.123479,0.999285,...,7.0,22.652843,6.494389,80.59633,29.054298,1.001286,1,3,4179.5,4179.5
2024-01-21,17864.5,1.0,1.908464,584.7342,17864.5,18.981236,29.644913,46.036609,90.327574,0.998762,...,8.0,24.313074,10.663677,68.182091,44.290965,1.001174,1,4,5470.0,5470.0
2024-01-28,22088.5,1.0,1.713725,722.99255,22088.5,20.090332,26.234539,66.416177,93.977923,0.997492,...,8.0,23.162435,6.144207,80.19705,27.561746,0.999577,1,5,4224.0,4224.0


In [13]:

# Build and save the table of every state listed in `dfs`.
# `df2` keeps the table of the last state processed; it is inspected in the next cell.
# NOTE(review): unlike the macroregion loop, rows with missing values are not dropped
# here before saving - confirm this is intentional.
for state in dfs.state.unique():
    df2 = get_data_state(state)

    df2.to_csv(f'{PATH}/dengue_{state}.csv.gz')

In [14]:
# Last rows of `df2`, i.e. of the last state processed by the loop above
df2.tail()

Unnamed: 0,casos_est_5302,p_rt1_5302,Rt_5302,casos_est_DF,temp_min_5302,temp_max_5302,umid_min_5302,umid_max_5302,pressao_min_5302,pressao_max_5302,...,rainy_days_5302,temp_mean_5302,temp_amp_5302,umid_mean_5302,umid_amp_5302,pressao_mean_5302,month,SE,diff_casos_est_5302,diff_casos_est_DF
2024-01-07,8215.0,1.0,1.738111,8215.0,19.584608,25.075889,71.175181,94.354639,0.997495,1.001491,...,7,22.330248,5.491281,82.76491,23.179458,0.999493,1,2,358.5,358.5
2024-01-14,12394.5,1.0,1.800837,12394.5,19.405648,25.900037,66.06918,95.123479,0.999285,1.003288,...,7,22.652843,6.494389,80.59633,29.054298,1.001286,1,3,4179.5,4179.5
2024-01-21,17864.5,1.0,1.908464,17864.5,18.981236,29.644913,46.036609,90.327574,0.998762,1.003587,...,8,24.313074,10.663677,68.182091,44.290965,1.001174,1,4,5470.0,5470.0
2024-01-28,22088.5,1.0,1.713725,22088.5,20.090332,26.234539,66.416177,93.977923,0.997492,1.001663,...,8,23.162435,6.144207,80.19705,27.561746,0.999577,1,5,4224.0,4224.0
2024-02-04,,,,,20.156189,26.814667,61.65046,94.29798,1.000725,1.0056,...,1,23.485428,6.658478,77.97422,32.64752,1.003163,2,6,,


In [21]:
# Read back only the first row of a saved macroregion table to check its columns.
macro = 1101 

filename_data = f'../data/dengue_{macro}.csv.gz'

# the dates were saved as an unnamed index, which pandas reads back as 'Unnamed: 0'
df = pd.read_csv(filename_data, index_col='Unnamed: 0', nrows = 1)

df

Unnamed: 0,casos_est_1100049,p_rt1_1100049,Rt_1100049,p_inc100k_1100049,casos_est_1100189,p_rt1_1100189,Rt_1100189,p_inc100k_1100189,casos_est_1100288,p_rt1_1100288,...,umid_amp_small,pressao_mean_small,month,SE,diff_casos_est_1100049,diff_casos_est_1100189,diff_casos_est_1100288,diff_casos_est_1100304,diff_casos_est_1101,diff_casos_est_small
2010-01-03,164.0,0.0,0.0,190.93523,52.0,0.0,0.0,140.994,186.0,0.0,...,21.13686,0.994904,1,1,,,,,,


In [23]:
# Number of rows read above (limited by nrows = 1)
df.shape[0]

1

In [33]:
# Scratch list of two-letter state codes, used by the membership check in the next cell
s = ['AC', 'AL', 'AP', 'DF', 'RN', 'RO', 'RR', 'SE', 'TO']

In [34]:
# Membership check against the list `s` defined in the previous cell
'AC' in s

True

In [15]:
# Inspect a saved forecast table (file of macroregion 5302); the path is relative to
# the notebook's working directory.
pd.read_csv('forecast_tables/forecast_5302.csv.gz')

Unnamed: 0.1,Unnamed: 0,date,lower_2_5,lower_25,forecast,upper_75,upper_97_5,macroregion,prob_high,prob_low,HT,LT,HTinc,LTinc
0,0,2024-02-04,8392.25998,10541.763995,11164.928782,11749.171762,12911.138032,5302,100.0,0.0,658.800198,252.089879,21.563603,8.251312
1,1,2024-02-11,8764.017637,10505.634319,11326.954672,12074.482518,13420.374424,5302,100.0,0.0,666.253411,239.835754,21.807559,7.850215
2,2,2024-02-18,8977.330297,10390.177082,11399.544155,12325.275662,13396.019853,5302,100.0,0.0,787.000132,260.607695,25.759795,8.530114
3,3,2024-02-25,8958.071949,10279.996686,11047.885728,11608.991509,13148.232289,5302,100.0,0.0,908.66331,331.099791,29.742029,10.837435
4,4,2024-03-03,8427.867945,10234.003049,11027.127584,12017.441482,13755.418635,5302,100.0,0.0,984.478794,388.902864,32.223593,12.729424
5,5,2024-03-10,7801.468246,9295.84437,10424.235618,11170.054217,12463.743739,5302,100.0,0.0,1065.511249,441.752748,34.875918,14.459287
6,6,2024-03-17,7289.096145,8862.899973,10040.468176,10724.493884,12162.54129,5302,100.0,0.0,1378.318612,520.621707,45.114612,17.040796
7,7,2024-03-24,6917.312025,8499.924577,9082.556561,9687.321029,10947.436401,5302,100.0,0.0,1588.558081,577.431732,51.996092,18.900281
8,8,2024-03-31,6307.542885,7556.473094,8390.133066,9165.049009,10114.666764,5302,100.0,0.0,1784.461285,650.966682,58.408323,21.307199
9,9,2024-04-07,5636.194779,6930.381885,7577.106821,8243.608063,9495.361902,5302,100.0,0.0,1978.927462,715.63191,64.773517,23.423797
