In [13]:
import os.path
import os
import pickle
import numpy as np
import pandas as pd
import geopandas as gpd 
import matplotlib.pyplot as plt
from clustering import compute_clusters, lag_ccf

Path to the directory where the parquet files with the case data are saved:

In [2]:
# Directory with the case files; get_data reads f'{PATH}/{state}_{disease}.parquet' from it.
PATH = 'data/cases'

In [3]:
def get_data(state, disease = 'dengue'):
    """
    Create a dataframe with the disease cases in all the cities of the state.

    The long-format table stored in ``{PATH}/{state}_{disease}.parquet`` (one row per
    city and date) is reshaped into a wide table with one column per city.

    Parameters:
    -----------
    state : str
        UF name
    disease: str
        disease name options: 'dengue' or 'chik'

    Returns:
    --------
    pd.DataFrame
        Dataframe with datetime index where each column represents the time series of
        cases for a city of the state. Columns are named ``casos_<geocode>`` and follow
        the order in which the geocodes first appear in the file. If the file has no
        rows, an empty dataframe is returned.
    """

    df = pd.read_parquet(f'{PATH}/{state}_{disease}.parquet',
                    columns = ['municipio_geocodigo', 'casos'])

    geocodes = df.municipio_geocodigo.unique()

    # Nothing to reshape: return an empty frame instead of failing on `None`.
    if len(geocodes) == 0:
        return pd.DataFrame(index = df.index)

    # Collect one series per city and concatenate them once; concatenating inside the
    # loop would copy the growing frame at every iteration.
    series_per_city = [df.loc[df.municipio_geocodigo == geo, 'casos'] for geo in geocodes]

    # axis=1 aligns the series on their index (outer join), one column per city.
    df_ = pd.concat(series_per_city, axis = 1)
    df_.columns = [f'casos_{geo}' for geo in geocodes]

    return df_

In [8]:
# Example: wide table of cases for MG (default disease 'dengue'), one column per city.
get_data('MG')

Unnamed: 0_level_0,casos_3131802,casos_3138302,casos_3150604,casos_3164472,casos_3162104,casos_3131901,casos_3150703,casos_3132008,casos_3138401,casos_3150802,...,casos_3124104,casos_3104601,casos_3105608,casos_3107901,casos_3102902,casos_3103306,casos_3103207,casos_3103405,casos_3103504,casos_3103009
data_iniSE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-03,0,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,10,35,0
2010-01-10,0,0,0,0,0,0,0,0,0,0,...,3,1,0,0,0,0,0,13,61,0
2010-01-17,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,17,34,0
2010-01-24,0,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,34,0,0
2010-01-31,1,0,1,0,0,0,0,0,0,0,...,3,0,0,0,0,0,0,9,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-06-25,2,0,5,0,3,6,3,0,18,0,...,11,1,1,0,0,0,0,18,94,0
2023-07-02,1,0,0,0,4,0,3,0,6,0,...,8,0,3,0,1,0,0,16,71,2
2023-07-09,0,0,0,0,0,1,2,0,10,0,...,4,0,0,0,0,0,0,7,86,2
2023-07-16,0,0,2,0,0,0,3,0,5,0,...,0,0,0,0,0,0,0,6,38,3


The cell below applies hierarchical clustering to the time series of dengue cases in the state set in `state` (MG below). Only the data up to and including 2020 are used, so as not to introduce bias into the 2023 predictions. 

In [9]:
state = 'MG'
df = get_data(state)

# Keep only the observations up to (and including) 2020.
df = df.loc[df.index.year <= 2020]

# Cities with 50 or fewer cases in the period are discarded. skipna=False makes a
# column containing NaN sum to NaN, so the comparison is False and the column is
# kept, which is what summing the column with Python's built-in `sum` would do.
low_count = df.sum(skipna = False) <= 50

# Build a new frame instead of dropping/renaming in place on a `.loc` slice; the
# remaining columns are labelled by the last 7 characters of their name (the geocode).
df = df.loc[:, ~low_count].rename(columns = lambda col: col[-7:])

clusters, fig = compute_clusters(df, lags = 10, t = 2, plot = False)

# Make sure the output directory exists before writing the pickle.
os.makedirs('clusters', exist_ok = True)
with open(f'clusters/clusters_{state}.pkl', 'wb') as f:
    pickle.dump(clusters, f)


  corr = correlate(x, y, mode="full") / np.sqrt(np.dot(x, x) * np.dot(y, y))


In [None]:
# Inspect the first entry of the clusters computed above.
clusters[0]

In [59]:
# Polygons of the municipalities, read from the local GeoPackage file.
brmap = gpd.read_file('muni_br.gpkg', driver='GPKG')

# Store the municipality code as int so it can be matched against integer geocodes.
brmap['code_muni'] = brmap['code_muni'].astype(int)

In [60]:
def plot_map(state, city, brmap = brmap):
    '''
    Map of the state highlighting the cities that compose the cluster computed

    The clusters are loaded from ``clusters/clusters_{state}.pkl`` and the first
    cluster that contains `city` is highlighted in orange over the state map.

    Parameters:
    -----------
    state : str
        UF name
    city : int
        7 digit ibge code 
    brmap: geopandas Dataframe
        contains the polygons for all the country. 

    Returns:
    --------
    None

    Raises:
    -------
    IndexError
        If no cluster saved for the state contains `city`.

    '''

    # NOTE: pickle can execute arbitrary code on load; only open cluster files
    # produced by this notebook.
    with open(f'clusters/clusters_{state}.pkl', 'rb') as fp:
        clusters = pickle.load(fp)

    # The city is looked up by the string form of its code; stop at the first match.
    cluster_city = next((cluster for cluster in clusters if str(city) in cluster), None)

    if cluster_city is None:
        # Same exception type as the bare `[0]` lookup would raise, but explicit.
        raise IndexError(
            f'City {city} is not in any cluster of clusters/clusters_{state}.pkl; '
            'it may have been dropped for having too few cases, or the file may be out of date.'
        )

    # Cast to int to match the integer `code_muni` column of the map.
    cluster_city = cluster_city.astype(int)

    statemap = brmap[brmap.abbrev_state==state]

    fig, ax = plt.subplots()

    # Whole state in gray, cluster members on top in orange.
    statemap.plot(ax = ax, color = 'lightgray')
    statemap.loc[statemap.code_muni.isin(cluster_city)].plot(ax = ax, color = 'tab:orange')
    ax.set_axis_off()
    ax.set_title(f'Cities clusterized with {city}')
    plt.show()


Cities clustered with FOZ DO IGUAÇU (4108304):

In [61]:
# NOTE(review): the recorded run of this cell ended in `IndexError: list index out of range`;
# presumably no cluster in clusters/clusters_PR.pkl contains 4108304 - confirm the file is up to date.
plot_map('PR',4108304)

IndexError: list index out of range

### Computing the clusters for all the states: 

In [16]:
# Abbreviation -> full name of every state whose clusters are computed.
estados = {'RJ': 'Rio de Janeiro', 'ES': 'Espírito Santo', 'PR': 'Paraná', 'CE': 'Ceará',
               'MA': 'Maranhão', 'MG': 'Minas Gerais', 'SC': 'Santa Catarina', 'PE': 'Pernambuco', 
               'PB': 'Paraíba', 'RN': 'Rio Grande do Norte', 'PI': 'Piauí', 'AL': 'Alagoas',
               'SE': 'Sergipe', 'SP': 'São Paulo', 'RS': 'Rio Grande do Sul','PA': 'Pará',
               'AP': 'Amapá', 'RR': 'Roraima', 'RO': 'Rondônia', 'AM': 'Amazonas', 'AC': 'Acre',
               'MT': 'Mato Grosso', 'MS': 'Mato Grosso do Sul', 'GO': 'Goiás', 'TO': 'Tocantins',
               # 'DF': 'Distrito Federal', # the DF is left out of the clustering because it is not a state
               'BA': 'Bahia'
               }

# Make sure the output directory exists before writing any pickle.
os.makedirs('clusters', exist_ok = True)

for state in estados:
    # The pickle works as a cache: states already processed are skipped.
    if os.path.exists(f'clusters/clusters_{state}.pkl'):
        continue

    df = get_data(state)

    # Keep only the observations up to (and including) 2020.
    df = df.loc[df.index.year <= 2020]

    # Cities with 50 or fewer cases in the period are discarded. skipna=False makes a
    # column containing NaN sum to NaN, so the comparison is False and the column is
    # kept, which is what summing the column with Python's built-in `sum` would do.
    low_count = df.sum(skipna = False) <= 50

    # Build a new frame instead of dropping/renaming in place on a `.loc` slice; the
    # remaining columns are labelled by the last 7 characters of their name (the geocode).
    df = df.loc[:, ~low_count].rename(columns = lambda col: col[-7:])

    clusters, fig = compute_clusters(df, lags = 10, t = 2, plot = False)
    with open(f'clusters/clusters_{state}.pkl', 'wb') as f:
        pickle.dump(clusters, f)