This notebook downloads the data used to train the models. The data is organized according to the clusters computed in the `compute_clusters.ipynb` notebook.

In [1]:
import pickle
import getpass, os
import numpy as np
import pandas as pd
from epiweeks import Week
from dotenv import load_dotenv

# Load the environment variables from the .env file
load_dotenv()

# The database connection settings must be present in the environment
# (presumably they are read by the `upload_data` helpers imported below --
# TODO confirm). The previous `os.environ[k] = os.getenv(k)` re-assignment was
# a no-op when the variable existed and raised an opaque
# "TypeError: str expected, not NoneType" when it did not, so fail early with
# a message that names what is missing instead.
_REQUIRED_ENV_VARS = ('PSQL_USER', 'PSQL_PASSWORD', 'PSQL_HOST', 'PSQL_DB', 'PSQL_PORT')
_missing_env_vars = [name for name in _REQUIRED_ENV_VARS if os.getenv(name) is None]
if _missing_env_vars:
    raise RuntimeError(
        'Missing required environment variables (set them in the .env file): '
        + ', '.join(_missing_env_vars)
    )

In [2]:
import matplotlib.pyplot as plt
from upload_data import get_cluster_data,combined_data

List of predictors used to forecast the cases: 

In [3]:
# Predictor column names, one group per line (grouped by name prefix).
predictors = [
    'casos', 'p_rt1', 'Rt', 'p_inc100k',
    'temp_min', 'temp_mean', 'temp_max', 'temp_amp',
    'umid_min', 'umid_max', 'umid_mean', 'umid_amp',
    'pressao_min', 'pressao_max', 'pressao_mean',
    'precip_tot',
]


In [4]:
def get_table_data_cluster(city, state, doenca = 'dengue', data_types = ['alerta', 'weather'],
    predictors = ['casos', 'p_rt1','Rt', 'p_inc100k', 'temp_min',
                                      'temp_max', 'umid_min', 'umid_max',
                  'pressao_min', 'pressao_max', 'precip_tot'], temp = 'cop'):
    """
    Generate a table with all the predictors computed for each city in the cluster.

    On top of the columns returned by `get_cluster_data`, the following columns
    are added:

    * per city in the cluster: `temp_mean_<geocode>`, `temp_amp_<geocode>`,
      `umid_mean_<geocode>`, `umid_amp_<geocode>` and `pressao_mean_<geocode>`,
      derived from the corresponding `*_min_<geocode>` / `*_max_<geocode>` columns;
    * `month` and `SE` (epidemiological week number), derived from the index;
    * `diff_<col>` for every column whose name starts with `casos`, holding the
      first difference of that column (NaN on the first row).

    Parameters
    ----------
    city : int
        IBGE 7-digit code.
    state : str
        UF; used to locate the cluster file `clusters/clusters_<state>.pkl`.
    doenca : str
        Name of the disease whose cases will be returned.
    data_types : list of str
        List with the sources of the predictors.
    predictors : list of str
        List with the predictor names. It must include the min/max temperature,
        humidity and pressure columns, since the derived columns are built from
        them.
    temp : str
        Source of the weather data.

    Returns
    -------
    pd.DataFrame
        A dataframe with the predictors for all regions in the cluster.

    Raises
    ------
    IndexError
        If no cluster in the file contains `city`.
    KeyError
        If one of the min/max weather columns is missing for a city in the cluster.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only load cluster
    # files generated by a trusted source (e.g. `compute_clusters.ipynb`).
    with open(f'clusters/clusters_{state}.pkl', 'rb') as fp:
        clusters = pickle.load(fp)

    # The second value returned by `get_cluster_data` is not needed here.
    data, _ = get_cluster_data(city, clusters=clusters, data_types=data_types,
                               cols=predictors, doenca=doenca, temp=temp)

    # Cluster (array of geocodes stored as strings) that contains the requested city.
    cluster_geocodes = [cluster for cluster in clusters if str(city) in cluster][0].astype(int)

    # A dedicated loop variable is used so the `city` argument is not shadowed.
    for geocode in cluster_geocodes:
        temp_min, temp_max = data[f'temp_min_{geocode}'], data[f'temp_max_{geocode}']
        umid_min, umid_max = data[f'umid_min_{geocode}'], data[f'umid_max_{geocode}']
        pressao_min, pressao_max = data[f'pressao_min_{geocode}'], data[f'pressao_max_{geocode}']

        data[f'temp_mean_{geocode}'] = (temp_min + temp_max) / 2
        data[f'temp_amp_{geocode}'] = temp_max - temp_min

        data[f'umid_mean_{geocode}'] = (umid_min + umid_max) / 2
        data[f'umid_amp_{geocode}'] = umid_max - umid_min

        # Fixed: this used to be (max - min) / 2, i.e. half the amplitude rather
        # than the mean. Files generated before the fix contain the old values.
        data[f'pressao_mean_{geocode}'] = (pressao_min + pressao_max) / 2

    data['month'] = data.index.month

    # Epidemiological week number (second element of the (year, week) tuple).
    data['SE'] = [Week.fromdate(date).weektuple()[1] for date in data.index]

    # Manual override kept from the original notebook: the row dated 2018-04-04
    # is forced to week 15. NOTE(review): the reason is not documented -- confirm.
    data.loc[data.index == '2018-04-04', 'SE'] = 15

    # First difference of every case series; the first row has no predecessor,
    # so it is NaN.
    case_columns = data.columns[data.columns.str.startswith('casos')]
    case_diffs = data[case_columns].diff().add_prefix('diff_')

    return pd.concat([data, case_diffs], axis=1)

In [5]:
# Load the table of cities; the CSV column named 'Unnamed: 0' is used as the index.
df = pd.read_csv('./notebooks/s_cities.csv', index_col='Unnamed: 0')

# Optional exclusion filter, currently disabled. Uncomment to skip these geocodes:
# df = df.loc[~df.geocode.isin([2914802, 2905701, 1721000, 2311405, 2806701, 2925303, 2207702,
#                               3119401, 2211001, 2408003])]

df.head()

Unnamed: 0,geocode,muni_name,year,peak_week,beta,gamma,R0,total_cases,alpha,sum_res,ep_ini,ep_end,ep_dur,uf,state,n_last_ep,year_first_ep,year_last_ep_before_2023,start_train_chik,end_train_chik
30,3304102,Porciúncula,2023,7.40843,0.742929,0.301735,2.462195,510.383085,0.593858,0.746891,202302,202316,14,33,RJ,3,2016,2020,2015-10-01,2020-11-01
8,2303501,Cascavel,2023,21.031323,0.448789,0.300973,1.491123,196.764406,0.329365,1.124633,202302,202339,37,23,CE,3,2017,2022,2016-10-01,2022-11-01
86,2311405,Quixeramobim,2023,18.37503,0.5347,0.304761,1.754491,466.379506,0.430034,0.93029,202302,202332,30,23,CE,3,2016,2022,2015-10-01,2022-11-01
88,2306801,Jaguaribara,2023,16.405845,0.742924,0.308515,2.408069,302.646554,0.58473,1.125556,202309,202325,16,23,CE,1,2017,2017,2016-10-01,2017-11-01
122,2305407,Icó,2023,13.68278,0.429925,0.3001,1.432607,87.124092,0.301972,1.434495,202302,202334,32,23,CE,2,2017,2022,2016-10-01,2022-11-01


In [6]:
%%time 
for geocode, state in zip(df.geocode, df.state):
    dfd = get_table_data_cluster(geocode, state, doenca = 'dengue')

    dfd.to_csv(f'./data/dengue_{geocode}_cluster.csv.gz')

    dfc = get_table_data_cluster(geocode, state, doenca = 'chik')

    dfc.to_csv(f'./data/chik_{geocode}_cluster.csv.gz')

CPU times: user 1min 18s, sys: 3.1 s, total: 1min 21s
Wall time: 1h 15min 41s
