This notebook was used to download the data used to train the models based on the clusters computed in the `compute_clusters.ipynb` notebook:

In [1]:
import pickle
import getpass, os
import numpy as np
import pandas as pd
from dotenv import load_dotenv

# Carrega as variáveis de ambiente do arquivo .env
load_dotenv()

os.environ['PSQL_USER']=os.getenv('PSQL_USER')
os.environ['PSQL_PASSWORD']=os.getenv('PSQL_PASSWORD')
os.environ['PSQL_HOST']=os.getenv('PSQL_HOST')
os.environ['PSQL_DB']=os.getenv('PSQL_DB')
os.environ['PSQL_PORT']=os.getenv('PSQL_PORT')

In [2]:
import matplotlib.pyplot as plt
from upload_data import get_cluster_data,combined_data

List of predictors used to forecast the cases: 

In [3]:
predictors = ['casos', 'p_rt1','Rt', 'p_inc100k', 'temp_min', 'temp_mean',
                                      'temp_max', 'temp_amp', 
              'umid_min', 'umid_max', 'umid_mean', 'umid_amp', 'pressao_min', 'pressao_max', 'pressao_mean', 
              'precip_tot']


In [4]:
def get_table_data_cluster(city, state, doenca = 'dengue', data_types = ['alerta', 'weather'], 
    predictors = ['casos', 'p_rt1','Rt', 'p_inc100k', 'temp_min',
                                      'temp_max', 'umid_min', 'umid_max', 
                  'pressao_min', 'pressao_max', 'precip_tot'], temp = 'cop'):
    
    """
    Generate a table with all the predictors computed for each city in the cluster 

    Parameters:
    -----------
    city : int
        ibge 7 digit code
    state : str
        UF, it will be used to find the file with the clusters 
    doenca : str
        Name of the disease which cases will be returned.
    data_types : list of str
        List with the source of the predictors
    predictors : list of str
        List with the predictors name
    temp : str
        source of the weather data.

    Retorns:
    --------
    pd.DataFrame
        A dataframe with the predictors for all regions in the cluster

    """
    

    

    with open(f'clusters/clusters_{state}.pkl', 'rb') as fp:
            clusters = pickle.load(fp)

    data, group = get_cluster_data(city, clusters=clusters, data_types= data_types, cols= predictors, doenca = doenca, temp = temp)   
    
    cluster_city = (list(filter(lambda x: str(city) in x, clusters))[0]).astype(int)
    
    for city in cluster_city:
        
        data[f'temp_mean_{city}'] = (data[f'temp_min_{city}'] + data[f'temp_max_{city}'])/2

        data[f'temp_amp_{city}'] = data[f'temp_max_{city}'] - data[f'temp_min_{city}']

        data[f'umid_mean_{city}'] = (data[f'umid_min_{city}'] + data[f'umid_max_{city}'])/2

        data[f'umid_amp_{city}'] = data[f'umid_max_{city}'] - data[f'umid_min_{city}']

        data[f'pressao_mean_{city}'] = (data[f'pressao_max_{city}'] - data[f'pressao_min_{city}'])/2
        
        
    
    return data

In [5]:
df = pd.read_csv('./notebooks/selected_cities.csv',index_col = 'Unnamed: 0')

df.head()

Unnamed: 0,geocode,state,sum_cases,POP22,inc,n_last_ep,year_first_ep,year_f_ep_2,region,region_codes,start_train_chik,start_train_chik2
0,2211001,PI,2325,868523,267.695847,4,2015,2016,ne,1,2014-10-01,2015-10-01
1,2913606,BA,1858,197163,942.367483,2,2015,2015,ne,1,2014-10-01,2014-10-01
2,2914802,BA,1846,185500,995.148248,2,2016,2015,ne,1,2015-10-01,2014-10-01
3,2111300,MA,1788,1061374,168.460882,2,2016,2016,ne,1,2015-10-01,2015-10-01
4,2925303,BA,1687,158736,1062.77089,2,2016,2015,ne,1,2015-10-01,2014-10-01


In [6]:
%%time 
for geocode, state in zip(df.geocode, df.state):
    dfd = get_table_data_cluster(geocode, state, doenca = 'dengue')

    dfd.to_csv(f'./data/dengue_{geocode}_cluster.csv')

    dfc = get_table_data_cluster(geocode, state, doenca = 'chik')

    dfc.to_csv(f'./data/chik_{geocode}_cluster.csv')

  weather = weather.resample('W').apply(np.nanmean)
  full_data = pd.concat(to_concat, axis=1, join='inner').fillna(method='ffill')
  full_data = pd.concat([tmp, full_data], axis=1).fillna(method='ffill')
  weather = weather.resample('W').apply(np.nanmean)
  full_data = pd.concat(to_concat, axis=1, join='inner').fillna(method='ffill')
  full_data = pd.concat([tmp, full_data], axis=1).fillna(method='ffill')
  weather = weather.resample('W').apply(np.nanmean)
  full_data = pd.concat(to_concat, axis=1, join='inner').fillna(method='ffill')
  full_data = pd.concat([tmp, full_data], axis=1).fillna(method='ffill')
  weather = weather.resample('W').apply(np.nanmean)
  full_data = pd.concat(to_concat, axis=1, join='inner').fillna(method='ffill')
  full_data = pd.concat([tmp, full_data], axis=1).fillna(method='ffill')
  weather = weather.resample('W').apply(np.nanmean)
  full_data = pd.concat(to_concat, axis=1, join='inner').fillna(method='ffill')
  full_data = pd.concat([tmp, full_data], axis=1

CPU times: user 25.8 s, sys: 1 s, total: 26.8 s
Wall time: 2h 1min 15s


  weather = weather.resample('W').apply(np.nanmean)
  full_data = pd.concat(to_concat, axis=1, join='inner').fillna(method='ffill')
  full_data = pd.concat([tmp, full_data], axis=1).fillna(method='ffill')


In [7]:
geocode = 2211001
state = 'PI'

In [8]:
%%time
dfd = get_table_data_cluster(geocode, state, doenca = 'dengue')

dfd.to_csv(f'./data/dengue_{geocode}_cluster.csv')
    
dfd.head()

CPU times: user 154 ms, sys: 30.6 ms, total: 184 ms
Wall time: 32.8 s


Unnamed: 0,casos_2202083,p_rt1_2202083,Rt_2202083,p_inc100k_2202083,temp_min_2202083,temp_max_2202083,umid_min_2202083,umid_max_2202083,pressao_min_2202083,pressao_max_2202083,...,temp_mean_2200400,temp_amp_2200400,umid_mean_2200400,umid_amp_2200400,pressao_mean_2200400,temp_mean_2202083,temp_amp_2202083,umid_mean_2202083,umid_amp_2202083,pressao_mean_2202083
2010-01-03,0,0.0,0.0,0.0,24.977604,30.859375,56.61949,89.780958,0.995561,0.998614,...,26.995492,6.281704,79.470976,28.831833,0.001931,27.918489,5.88177,73.200224,33.161468,0.001527
2010-01-10,0,0.0,0.0,0.0,24.670175,32.149815,51.042092,89.410381,0.995509,0.998474,...,28.317963,7.939793,73.378837,39.355617,0.001775,28.409995,7.479641,70.226236,38.36829,0.001483
2010-01-17,0,0.0,0.0,0.0,25.216902,33.797145,44.254512,86.297968,0.99642,0.99986,...,28.899942,9.408936,72.761208,42.420207,0.001926,29.507023,8.580244,65.27624,42.043456,0.00172
2010-01-24,0,0.0,0.0,0.0,25.130397,33.668836,45.381766,89.111741,0.99524,0.998741,...,29.311761,10.213296,70.112416,44.847404,0.002156,29.399617,8.538439,67.246754,43.729975,0.00175
2010-01-31,0,0.0,0.0,0.0,25.397287,32.782745,51.685666,89.466513,0.994337,0.997688,...,27.906749,8.127524,78.964517,36.802842,0.001999,29.090016,7.385459,70.576089,37.780847,0.001675


In [14]:
%%time 
dfc = get_table_data_cluster(geocode, state, doenca = 'chik')

dfc.to_csv(f'./data/chik_{geocode}_cluster.csv')

dfd.head()

CPU times: user 463 ms, sys: 71.9 ms, total: 535 ms
Wall time: 36.8 s


Unnamed: 0,casos_2202083,p_rt1_2202083,Rt_2202083,p_inc100k_2202083,temp_min_2202083,temp_max_2202083,umid_min_2202083,umid_max_2202083,pressao_min_2202083,pressao_max_2202083,...,temp_mean_2200400,temp_amp_2200400,umid_mean_2200400,umid_amp_2200400,pressao_mean_2200400,temp_mean_2202083,temp_amp_2202083,umid_mean_2202083,umid_amp_2202083,pressao_mean_2202083
2010-01-03,0,0.0,0.0,0.0,24.977604,30.859375,56.61949,89.780958,0.995561,0.998614,...,26.995492,6.281704,79.470976,28.831833,0.001931,27.918489,5.88177,73.200224,33.161468,0.001527
2010-01-10,0,0.0,0.0,0.0,24.670175,32.149815,51.042092,89.410381,0.995509,0.998474,...,28.317963,7.939793,73.378837,39.355617,0.001775,28.409995,7.479641,70.226236,38.36829,0.001483
2010-01-17,0,0.0,0.0,0.0,25.216902,33.797145,44.254512,86.297968,0.99642,0.99986,...,28.899942,9.408936,72.761208,42.420207,0.001926,29.507023,8.580244,65.27624,42.043456,0.00172
2010-01-24,0,0.0,0.0,0.0,25.130397,33.668836,45.381766,89.111741,0.99524,0.998741,...,29.311761,10.213296,70.112416,44.847404,0.002156,29.399617,8.538439,67.246754,43.729975,0.00175
2010-01-31,0,0.0,0.0,0.0,25.397287,32.782745,51.685666,89.466513,0.994337,0.997688,...,27.906749,8.127524,78.964517,36.802842,0.001999,29.090016,7.385459,70.576089,37.780847,0.001675
