# Data fusion for Brazil

## **1.** Authentications and imports

In [None]:
# Point the working directory at the shared `data` folder, two levels above the
# directory the notebook was started in. Edit the target if your data lives elsewhere.
import os
from os.path import join

# Remember the launch directory the first time this cell runs, so that re-running
# the cell resolves the same target instead of climbing two more levels up.
if '_notebook_dir' not in globals():
    _notebook_dir = os.getcwd()

project_path = os.path.dirname(_notebook_dir)
os.chdir(join(_notebook_dir, '..', '..', 'data'))
os.getcwd()

Import the necessary Python libraries.

In [None]:
!pip install pyreadr geopandas geoplot --quiet
import geopandas as gpd
import geoplot as gplt
import geoplot.crs as gcrs
import pyreadr
import numpy as np
import glob
import pandas as pd
import pprint
import seaborn as sns
import matplotlib
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from matplotlib.colors import Normalize

pp = pprint.PrettyPrinter(indent=4)
%matplotlib inline

Set folder structure.

In [None]:
# Sub-folders of the working directory, one entry per country.
config = {
    'main_brazil': 'Brazil',
    'main_peru': 'Peru'
}

# Create the folder structure. A plain loop is used because the calls are made only
# for their side effect (a list comprehension would also echo `[None, None]`).
for folder in config.values():
    os.makedirs(folder, exist_ok=True)

## **2.** Load and clean data

### 2.0. Regions data

In [None]:
# State (UF) lookup table; `CD_UF` is read as text because it is used as a join key.
UF_df = pd.read_csv(
    join(config['main_brazil'], "BR_UF_2020.csv"),
    encoding='iso-8859-1',
    converters={'CD_UF': str},
)
UF_df

In [None]:
# Municipality lookup table; codes are read as text and cut down to their first
# six characters so they match the trimmed `CD_MUN` keys of the other tables.
MUN_df = pd.read_csv(
    join(config['main_brazil'], "BR_Municipios_2020.csv"),
    encoding='iso-8859-1',
    converters={'CD_MUN': str},
)
MUN_df["CD_MUN"] = MUN_df["CD_MUN"].str[:6]
MUN_df.head()

In [None]:
# Attach the state attributes to every municipality through the shared state abbreviation.
regions_df = UF_df.merge(MUN_df, how='inner', on='SIGLA_UF')
regions_df.head()

### 2.1. Population data

In [None]:
# Total population per municipality, renamed to the notebook's column scheme and
# ordered by municipality code.
pop_columns = {
    'Cod6': 'CD_MUN',
    'Pop_Total_0_19': 'Pop0_19',
    'Pop_Total_20More': 'Pop20_99',
    'Pop_Total': 'PopTotal',
}
pop_df = (
    pd.read_excel(join(config['main_brazil'], "Pop_Total_2010.xlsx"))
      .rename(columns=pop_columns)
      .drop('Município', axis=1)
      .sort_values('CD_MUN', ignore_index=True)
)
pop_df.head()

In [None]:
# Urban population per municipality, renamed to the notebook's column scheme.
urban_df = (
    pd.read_excel(join(config['main_brazil'], "Pop_Urban_2010.xlsx"))
      .rename(columns={'Cod6': 'CD_MUN',
                       'Pop_Urban_0_19': 'Pop0_19_Urban',
                       'Pop_Urban_20More': 'Pop20_99_Urban',
                       'Total_Pop_Urban': 'PopTotal_Urban'})
      .drop('Município', axis=1)
)

# Rural population per municipality. The workbook is read from the Brazil data
# folder like every other input; the bare file name only resolved when a copy
# happened to sit directly in the working directory.
rural_df = (
    pd.read_excel(join(config['main_brazil'], "Pop_Rural_2010.xlsx"))
      .rename(columns={'Cod6': 'CD_MUN',
                       'Pop_Rural_0_19': 'Pop0_19_Rural',
                       'Pop_Rural_20More': 'Pop20_99_Rural',
                       'Total': 'PopTotal_Rural'})
      .drop('Município', axis=1)
)

# Soft check: report, but do not stop, when the two workbooks list a different
# number of municipalities (the missing rural rows are rebuilt further down).
if len(urban_df.CD_MUN.unique()) != len(rural_df.CD_MUN.unique()):
    print('AssertionError: Number of municipalities is not the same in both dataframes.')

# Urban + rural side by side, then aligned to the municipalities of the totals table.
popUrbRur_df = (
    pd.merge(urban_df, rural_df, on='CD_MUN', how='left')
      .sort_values('CD_MUN', ignore_index=True)
      .merge(pop_df, on='CD_MUN', how='right')
)
popUrbRur_df.head()

In [None]:
# Number of missing values per column after the urban/rural/total merge.
popUrbRur_df.isna().sum()

In [None]:
# Rows that have no rural figures.
popUrbRur_df.loc[popUrbRur_df['Pop0_19_Rural'].isna()]

In [None]:
# Where rural figures exist, each total must equal urban + rural.
temp_df = popUrbRur_df[popUrbRur_df['Pop0_19_Rural'].notnull()]
for total_col in ('Pop0_19', 'Pop20_99', 'PopTotal'):
    assert all(temp_df[total_col] == temp_df[total_col + '_Urban'] + temp_df[total_col + '_Rural'])

def fillNa_pop(row):
    """Rebuild missing rural population figures for one municipality row.

    When `Pop0_19_Rural` is NaN, the three rural columns are set to the matching
    total minus the matching urban value. The row is modified in place.

    Parameters
    ----------
    row : pandas.Series
        One row holding the `<group>`, `<group>_Urban` and `<group>_Rural` columns
        for the groups `Pop0_19`, `Pop20_99` and `PopTotal`.

    Returns
    -------
    pandas.Series
        The row with every value cast to `int`.
    """
    if not np.isnan(row['Pop0_19_Rural']):
        return row.astype(int)

    # Only the 0-19 rural column is inspected; all three rural columns are rebuilt together.
    for group in ('Pop0_19', 'Pop20_99', 'PopTotal'):
        row[group + '_Rural'] = row[group] - row[group + '_Urban']
    return row.astype(int)

# Work on a copy, rebuild the missing rural values row by row, and store the
# municipality code as text so it can be joined with the other tables.
population_df = (
    popUrbRur_df.copy()
                .apply(fillNa_pop, axis=1)
                .astype({'CD_MUN': str})
)
population_df.info()

In [None]:
# Add the state code of every municipality and move it to the first column.
population_df = population_df.merge(regions_df[['CD_UF', 'CD_MUN']], on='CD_MUN')
uf_first = ['CD_UF'] + [col for col in population_df.columns if col != 'CD_UF']
population_df = population_df[uf_first]
population_df.head()

In [None]:
# Aggregate the municipal population counts to state (UF) level.
# `numeric_only=True` keeps the text column `CD_MUN` out of the sum; without it,
# pandas >= 2.0 concatenates the municipality codes into a meaningless `CD_MUN_UF`
# column that would travel into the final table.
popByUF_df = (
    population_df.groupby('CD_UF')
                 .sum(numeric_only=True)
                 .add_suffix('_UF')
                 .reset_index()
)

# assertions: spot-check one state, then urban + rural == total for every state
assert popByUF_df[popByUF_df['CD_UF']=='11']['PopTotal_UF'].values[0] == population_df[population_df['CD_UF']=='11']['PopTotal'].sum()
assert all(popByUF_df['Pop0_19_UF'] == popByUF_df['Pop0_19_Urban_UF']+popByUF_df['Pop0_19_Rural_UF'])
assert all(popByUF_df['Pop20_99_UF'] == popByUF_df['Pop20_99_Urban_UF']+popByUF_df['Pop20_99_Rural_UF'])
assert all(popByUF_df['PopTotal_UF'] == popByUF_df['PopTotal_Urban_UF']+popByUF_df['PopTotal_Rural_UF'])

# The 20+ columns are not kept in the state table.
popByUF_df = popByUF_df.drop(columns=['Pop20_99_Urban_UF', 'Pop20_99_Rural_UF', 'Pop20_99_UF'])
popByUF_df

### 2.2. Dengue data

In [None]:
# Monthly notified dengue cases per state. Helper columns are dropped and the
# 20+ case count is derived as total minus the 0-19 count.
dengue_df = (
    pd.read_csv(join(config['main_brazil'], "data_CaseDengue_month_notified_limpo.csv"),
                encoding='iso-8859-1', converters={'UF': str})
      .rename(columns={'UF': 'CD_UF', 'totalcase': 'cases_total', 'case0to19': 'cases0_19'})
      .drop(columns=['Unnamed: 0', 'Name', 'date', 'time'])
      .assign(cases20_99=lambda cases: cases['cases_total'] - cases['cases0_19'])
)
dengue_df

In [None]:
# Column dtypes and non-null counts of the dengue table.
dengue_df.info()

#### 2.2.1. Plots

In [None]:
# Monthly case counts labelled with the state name and a sortable 'YYYY-MM' key.
tot_df = dengue_df.merge(UF_df, on='CD_UF', how='left')
tot_df = tot_df[['CD_UF','Year','Month','cases_total','NM_UF']]
year_month = tot_df['Year'].astype(str) + '-' + tot_df['Month'].astype(str)
tot_df['year-month'] = pd.to_datetime(year_month, format='%Y-%m').dt.strftime('%Y-%m')
df = tot_df[['NM_UF', 'cases_total', 'year-month']]

# create pivot table: one row per state, one column per month
result = df.pivot(index='NM_UF', columns='year-month', values='cases_total')

fig, ax = plt.subplots(1, 1, figsize=(65, 9))
colorbar_kws = {'label': 'Dengue cases',
                'pad': .01,
                'shrink': .85}
sns.heatmap(data=result, ax=ax, fmt="g",
            robust=True, square=True,
            cmap="Blues", linewidth=0.01,
            cbar_kws=colorbar_kws)
ax.set_title('Monthly dengue cases for Total Population by State')
ax.set_ylabel('State')
plt.show()

In [None]:
# Preview of the state-level population table used for the incidence rates below.
popByUF_df.head()

In [None]:
# Monthly dengue incidence rate (DIR): cases per 100 000 inhabitants of the state.
aux = dengue_df.merge(popByUF_df[['CD_UF', 'PopTotal_UF']], on='CD_UF', how='left')
aux = aux[['CD_UF','Year','Month','cases_total','PopTotal_UF']]
aux['DIR'] = (aux['cases_total'] / aux['PopTotal_UF'] * 10**5).round(2)
aux.head()

In [None]:
ufs = dengue_df['CD_UF'].unique()

# Position of every state panel in the 8x6 grid: state code -> (row, col).
# Named `uf_grid` rather than `dict` so the built-in `dict` type is not shadowed.
uf_grid = {'UF': ['14','16','13','15','21','23','12','11','17','22','25','24','51','52','29','26','27','50','53','31','28','35','33','32','41','42','43'],
           'row': [0,0,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,4,4,4,4,5,5,5,6,6,7],
           'col': [1,2,1,2,3,4,0,1,2,3,4,5,1,2,3,4,5,1,2,3,4,2,3,4,2,3,2]}

# Grid cells that hold no state; they are hidden further down.
dict2 = {'row': [0,0,0,0,1,1,3,4,4,5,5,5,6,6,6,6,7,7,7,7,7],
         'col': [0,3,4,5,0,5,0,0,5,0,1,5,0,1,4,5,0,1,3,4,5]}

# Panels whose x tick labels are blanked.
no_xaxis = {'row': [0,0,1,1,1,1,2,2,2,2,2,3,3,3,3,4,4,4,5,5,6],
            'col': [1,2,1,2,3,4,1,2,3,4,5,1,2,3,4,2,3,4,2,3,2]}

# Panels whose y tick labels are blanked.
no_yaxis = {'row': [0,1,1,1,2,2,2,2,2,3,3,3,3,4,4,4,5,5,6],
            'col': [2,2,3,4,1,2,3,4,5,2,3,4,5,2,3,4,3,4,3]}

months = {0:'Jan', 1:'Feb', 2:'Mar', 3:'Apr', 4:'May', 5:'Jun', 
          6:'Jul', 7:'Aug', 8:'Sep', 9:'Oct', 10:'Nov', 11:'Dec'}

fig, axes = plt.subplots(8, 6, figsize=(19,19))

# `plt.get_cmap` replaces `cm.get_cmap`, which was deprecated in Matplotlib 3.7
# and removed in 3.9.
cmap = plt.get_cmap('PuRd')
# One colour scale shared by all panels. Alternatives that were tried:
#normalizer = Normalize(0, aux['DIR'].max())
#normalizer = colors.LogNorm(vmin=aux['DIR'].min(), vmax=aux['DIR'].max())
normalizer = colors.PowerNorm(gamma=0.2, vmin=aux['DIR'].min(), vmax=aux['DIR'].max())
im = cm.ScalarMappable(norm=normalizer, cmap=cmap)

for i in range(len(ufs)):
    uf = uf_grid['UF'][i]
    panel = axes[uf_grid['row'][i], uf_grid['col'][i]]
    name = UF_df[UF_df['CD_UF'] == uf]['NM_UF'].values[0]
    temp_df = aux[aux['CD_UF'] == uf][['Year','Month','DIR']]

    # Year x Month table of incidence rates, most recent year on top.
    result = temp_df.pivot(index='Year', columns='Month', values='DIR')\
                    .sort_index(ascending=False)
    sns.heatmap(data=result, ax=panel, cbar=False, robust=False, cmap=cmap, norm=normalizer, label=name)
    panel.set_title(name)

# set new labels for the Months, using the first drawn panel as the template
N = 3  # 1 tick every 3
xticks_pos = axes[0,1].get_xticks()
xticks_labels = axes[0,1].get_xticklabels()
xticks_newlabels = [months[k] for k in range(len(xticks_labels))]
myticks = list(xticks_pos[::N])      # positions of the selected ticks
newlabels = xticks_newlabels[::N]    # month names of the selected ticks

for ax in axes.flat:
    ax.set_xticks(myticks)
    ax.set_xticklabels(newlabels)
    ax.tick_params(axis='both', labelsize=10)
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.tick_params(axis="x", which="both", rotation=55)
axes[2,0].set_ylabel('Year')
axes[7,2].set_xlabel('Month')

for r, c in zip(dict2['row'], dict2['col']):
    axes[r, c].set_visible(False)
for r, c in zip(no_xaxis['row'], no_xaxis['col']):
    axes[r, c].xaxis.set_ticklabels([])
for r, c in zip(no_yaxis['row'], no_yaxis['col']):
    axes[r, c].yaxis.set_ticklabels([])

cbar = plt.colorbar(im, ax=axes.ravel().tolist(), pad=0.03, shrink=0.5, aspect=50, ticks=[0.01,10,100,300,1000])
cbar.set_label(label='Dengue Incidence Rate (per 100 000 people)', size=12, labelpad=8)

#plt.tight_layout()
plt.show()

In [None]:
years = dengue_df.Year.unique()

# Yearly cases per state and the matching incidence rate per 100 000 inhabitants.
aux2 = dengue_df[['CD_UF','Year','Month','cases_total']]\
        .groupby(['CD_UF','Year'], as_index=False).sum()[['CD_UF','Year','cases_total']]\
        .merge(popByUF_df[['CD_UF', 'PopTotal_UF']], on=['CD_UF'], how='left')
aux2['DIR'] = round(aux2['cases_total'].div(aux2['PopTotal_UF'], axis=0).multiply(np.power(10,5), axis=0), 2)

# possible schemes: 'BoxPlot', 'EqualInterval', 'FisherJenks', 'FisherJenksSampled',
#                   'HeadTailBreaks', 'JenksCaspall', 'JenksCaspallForced', 
#                   'JenksCaspallSampled', 'MaxP', 'MaximumBreaks', 'NaturalBreaks', 
#                   'Quantiles', 'Percentiles', 'StdMean', 'UserDefined', 'Categorical'

# NOTE(review): `brazil` is not defined anywhere in the visible part of this notebook.
# It is presumably a GeoDataFrame of state boundaries with a `CD_UF` column - confirm,
# and load it explicitly (e.g. with `gpd.read_file`) before this cell so that
# Restart & Run All works.
fig, axes = plt.subplots(2,4, figsize=(24,12))
# One map per year on the first seven panels. `zip` stops at the shorter sequence,
# so a data set with fewer than seven years no longer raises an IndexError.
for ax, year in zip(axes.flat[:-1], years):

    temp_df = aux2[aux2.Year == year][['CD_UF', 'DIR']]
    bd = brazil.merge(temp_df, on=['CD_UF'])

    bd.plot(column='DIR', edgecolor='black', ax=ax, legend=True,  scheme='quantiles', linewidth=0.5, cmap='YlOrRd')
    ax.set_title(f'DIR by State in {year}')

for ax in axes.flat:
    ax.axis('off')

plt.tight_layout()
plt.show()

### 2.3. Forest/land data

#### 2.3.1. Elevation data

In [None]:
# Elevation statistics per municipality, without the export helper columns and the area.
elev_df = (
    pd.read_csv(join(config['main_brazil'], "Brazil_Elevation_stats_by_municipality.csv"),
                converters={'CD_MUN': str})
      .drop(columns=['system:index', '.geo', 'Municipality_Area'])
)
elev_df["CD_MUN"] = elev_df["CD_MUN"].str[:6]  # keep the first six characters of the code
elev_df

In [None]:
# Municipality areas, taken from the same elevation export.
mun_area_df = pd.read_csv(
    join(config['main_brazil'], "Brazil_Elevation_stats_by_municipality.csv"),
    converters={'CD_MUN': str},
)
mun_area_df = mun_area_df[['CD_MUN', 'Municipality_Area']]
mun_area_df["CD_MUN"] = mun_area_df["CD_MUN"].str[:6]  # keep the first six characters of the code
mun_area_df

In [None]:
# Number of missing values per column of the elevation table.
elev_df.isna().sum()

#### 2.3.2. Urban/Rural data

In [None]:
# Area statistics per municipality, tagged with the state code. The municipality
# code and the export helper columns are dropped once the state is attached.
urbRur = pd.read_csv(
    join(config['main_brazil'], "Brazil_Urban_Rural_stats_by_municipality.csv"),
    converters={'CD_MUN': str},
)
urbRur["CD_MUN"] = urbRur["CD_MUN"].str[:6]
urbRur = (
    urbRur.merge(regions_df[['CD_UF', 'CD_MUN']], on='CD_MUN', how='left')
          .rename(columns={'LC_Type1': 'urb_area',
                           'Municipality_Area': 'region_area'})
          .drop(columns=['CD_MUN', 'system:index', '.geo'])
)
urbRur.head()

In [None]:
# State-level (UF) totals of urb_area and region_area
urbRur_by_uf = urbRur.groupby(by='CD_UF', as_index=False)
urbRur_df = urbRur_by_uf.sum()
urbRur_df.head()

In [None]:
# urb_area as a percentage of region_area, per state; only the code and the share are kept.
urbRur_df['Urban_Cover_Percent'] = urbRur_df['urb_area'].div(urbRur_df['region_area']).mul(100)
urbRur_df = urbRur_df.loc[:, ['CD_UF', 'Urban_Cover_Percent']]
urbRur_df

#### 2.3.3. Forest Cover data

In [None]:
# Forest-cover percentages per municipality, extended with the state code and the
# municipality area (both appended as the last two columns by the merges).
fCover_df = (
    pd.read_csv(join(config['main_brazil'], "Forest_Cover_Percent_Brazil_Municipalities.csv"),
                converters={'CD_MUN': str})
      .drop(columns=['Unnamed: 0'])
)
fCover_df["CD_MUN"] = fCover_df["CD_MUN"].str[:6]
fCover_df = (
    fCover_df.merge(regions_df[['CD_UF', 'CD_MUN']], on='CD_MUN', how='left')
             .merge(mun_area_df, on='CD_MUN', how='left')
)
fCover_df

In [None]:
# Everything except the first column and the two columns appended by the merges
# (CD_UF, Municipality_Area).
perc_cols = fCover_df.columns[1:-2]

# Turn each percentage into an area, (percent / 100) * Municipality_Area, so the
# values can be summed per state.
covered_area = fCover_df.loc[:, perc_cols].div(100, axis=0).multiply(fCover_df['Municipality_Area'], axis=0)
fCover_df.loc[:, perc_cols] = covered_area
fCover_df = fCover_df.drop(columns='CD_MUN')
fCover_df

In [None]:
# Number of missing values per column before aggregating to state level.
fCover_df.isna().sum()

In [None]:
# State-level (UF) sums of the per-municipality values (areas at this point) and
# of Municipality_Area.
fCover_by_uf = fCover_df.groupby(by='CD_UF', as_index=False)
fCover_df1 = fCover_by_uf.sum()
fCover_df1.head()

In [None]:
# Number of missing values per column of the state-level sums.
fCover_df1.isna().sum()

In [None]:
# Columns that are converted back to percentages in the next cell.
perc_cols

In [None]:
# Convert the summed areas back to a percentage of each state's summed
# Municipality_Area, then drop the area column.
state_area = fCover_df1['Municipality_Area']
fCover_df1.loc[:, perc_cols] = fCover_df1.loc[:, perc_cols].div(state_area, axis=0).multiply(100, axis=0)
fCover_df1 = fCover_df1.drop(columns=['Municipality_Area'])
fCover_df1

In [None]:
# Number of missing values per column after converting back to percentages.
fCover_df1.isna().sum()

In [None]:
years = np.arange(2001, 2021)

# Reshape the wide state table (one `Forest_Cover_Percent_<year>` column per year)
# into long form: one record per state and year, states in table order.
forest_records = [
    {'Year': year,
     'CD_UF': row.CD_UF,
     'Forest_Cover_Percent': row['Forest_Cover_Percent_%i' %year]}
    for _, row in fCover_df1.iterrows()
    for year in years
]

fCover_df2 = pd.DataFrame(forest_records, columns=['Year', 'CD_UF', 'Forest_Cover_Percent'])
fCover_df2

### 2.4. Climate data

In [None]:
# Monthly climate/NDVI values per municipality. Codes are read as text, the
# municipality code is cut to six characters and `Date` is parsed into datetimes.
era5land = pd.read_csv(
    join(config['main_brazil'], 'ERA5land_NDVI_monthly_cities_Brazil.csv'),
    converters={'CD_UF': str, 'CD_MUN': str},
)
era5land["CD_MUN"] = era5land["CD_MUN"].str[:6]
era5land['Date'] = pd.to_datetime(era5land['Date'])
era5land.head()

In [None]:
# Number of missing values per column of the raw climate table.
era5land.isna().sum()

In [None]:
# Fill the NDVI NaNs of January 2000 with the value from February 2000.
# `DataFrame.bfill()` is used because `fillna(method=...)` is deprecated since
# pandas 2.1; rebinding the name also avoids the in-place mutation.
# NOTE(review): the back-fill takes the next row of the frame regardless of
# municipality, so it matches the intent only if the rows are ordered by
# municipality and then by date - confirm against the CSV.
era5land = era5land.bfill()
era5land.head()

In [None]:
# Missing values per column after the back-fill.
era5land.isna().sum()

In [None]:
# Add the elevation statistics of each municipality to every monthly climate row.
era5land_df = pd.merge(era5land, elev_df, how='left', on='CD_MUN')
era5land_df.head()

In [None]:
# Number of missing values per column after adding the elevation statistics.
era5land_df.isna().sum()

### 2.5. Social data

In [None]:
# Workbook column -> short name used in the final table. The same mapping selects
# the columns to keep (its keys, in this order) and renames them (its values).
social_columns = {'UF': 'CD_UF',
                  'IVS':'ivs',
                  'IVS Infraestrutura Urbana': 'ivs_infraestrutura_urbana',
                  'IVS Capital Humano': 'ivs_capital_humano',
                  'IVS Renda e Trabalho': 'ivs_renda_e_trabalho',
                  '% de pessoas em domicílios com abastecimento de água e esgotamento sanitário inadequados':'t_sem_agua_esgoto',
                  '% da população que vive em domicílios urbanos sem o serviço de coleta de lixo':'t_sem_lixo',
                  '% de pessoas que vivem em domicílios com renda per capita inferior a meio salário mínimo (de 2010) e que gastam mais de uma hora até o trabalho':'t_vulner_mais1h',
                  'Taxa de analfabetismo da população de 15 anos ou mais de idade':'t_analf_15m',
                  '% de crianças que vivem em domicílios em que nenhum dos moradores tem o ensino fundamental completo':'t_cdom_fundin',
                  '% de pessoas de 15 a 24 anos que não estudam, não trabalham e possuem renda domiciliar per capita igual ou inferior a meio salário mínimo (de 2010)':'t_p15a24_nada',
                  'Porcentagem de pessoas com renda domiciliar per capita igual ou inferior a meio salário mínimo (de 2010)':'t_vulner',
                  'Taxa de desocupação da população de 18 anos ou mais de idade':'t_desocup18m',
                  '% de pessoas de 18 anos ou mais sem fundamental completo e em ocupação informal':'t_p18m_fundin_informal',
                  'IDHM':'idhm',
                  'IDHM Longevidade':'idhm_long',
                  'IDHM Educação':'idhm_educ',
                  'IDHM Renda':'idhm_renda',
                  'Subíndice de escolaridade - IDHM Educação':'idhm_educ_sub_esc',
                  '% de 18 anos ou mais com fundamental completo':'t_pop18m_fundc',
                  'Subíndice de frequência escolar - IDHM Educação':'idhm_educ_sub_freq',
                  'Renda per capita':'renda_per_capita',
                  'PEA - 10 a 14 anos':'pea10a14',
                  'PEA - 15 a 17 anos':'pea15a17',
                  'PEA - 18 anos ou mais':'pea18m',
                  '% da população em domicílios com energia elétrica':'t_eletrica',
                  '% da população em domicílios com densidade > 2':'t_densidadem2',
                  'Renda per capita dos vulneráveis à pobreza':'rdpc_def_vulner',
                  'Taxa de analfabetismo - 18 anos ou mais':'t_analf_18m',
                  'Grau de formalização dos ocupados - 18 anos ou mais':'t_formal_18m',
                  '% dos ocupados com fundamental completo - 18 anos ou mais':'t_fundc_ocup18m',
                  '% dos ocupados com médio completo - 18 anos ou mais':'t_medioc_ocup18m'}

# State-level social indicators; the state code is read as text to serve as join key.
social_df = pd.read_excel(join(config['main_brazil'], "dadosUF.xlsx"), converters={'UF':str})
social_df = social_df[list(social_columns)]
social_df = social_df.rename(columns=social_columns)
social_df.head()

## **3.** Create final dataframe

### 3.1. Merge all data

In [None]:
# Look the municipality up once; print its name and show its full record.
missing_in_era5 = MUN_df[MUN_df['CD_MUN'] == '260545']
print('Município not in ERA5-land:', missing_in_era5['NM_MUN'].values[0])
missing_in_era5

In [None]:
# Municipality codes present in the population table and in the climate table.
list1 = population_df.CD_MUN.unique()
print(len(list1))
list2 = era5land.CD_MUN.unique()
print(len(list2))

# Codes with climate data but no population data, in climate-table order. A set is
# used for the lookup so each code is not compared against the whole array.
codes_with_population = set(list1)
mun_notin_population_df = [code for code in list2 if code not in codes_with_population]
#print(mun_notin_population_df)
print('Municipios without population data:', [MUN_df[MUN_df['CD_MUN'] == m]['NM_MUN'].values[0] for m in mun_notin_population_df])

In [None]:
# Keep only the climate rows of municipalities that have a total population figure.
df = era5land_df.merge(population_df[['CD_MUN', 'PopTotal']], how='inner', on='CD_MUN')
df.head()

In [None]:
# Missing values per column of the merged climate/population table.
df.isna().sum()

Create population-weighted monthly averages of climatic variables.

In [None]:
# Variables that get a population-weighted state average.
# Named `weighted_vars` rather than `vars` so the built-in `vars()` is not shadowed.
weighted_vars = ['NDVI', 'dewpoint_temperature_2m', 'humidity',
                 'max_temperature_2m', 'min_temperature_2m', 'surface_pressure',
                 'temperature_2m', 'total_precipitation', 
                 'u_component_of_wind_10m', 'v_component_of_wind_10m',
                 'max_elevation', 'mean_elevation', 'min_elevation',
                 'stdDev_elevation', 'variance_elevation']

# Numerator of the weighted mean: value * municipality population. The `_d` suffix
# marks the weighted columns; they are divided by the summed population per state
# in a later cell.
df.loc[:, weighted_vars] = df.loc[:, weighted_vars].multiply(df['PopTotal'], axis=0)
df = df.rename(columns={col: col+'_d' for col in weighted_vars})
df.head()

In [None]:
# Sum the weighted values and the population per month and state.
# `numeric_only=True` keeps text columns such as `CD_MUN` out of the sum; without
# it, pandas >= 2.0 concatenates the codes of all municipalities of a state into
# one long string that would be carried into the final table.
df1 = df.groupby(['Date','CD_UF'], as_index=False).sum(numeric_only=True)
df1.head()

In [None]:
cols = ['NDVI_d', 'dewpoint_temperature_2m_d', 'humidity_d',
        'max_temperature_2m_d', 'min_temperature_2m_d', 'surface_pressure_d',
        'temperature_2m_d', 'total_precipitation_d',
        'u_component_of_wind_10m_d', 'v_component_of_wind_10m_d',
        'max_elevation_d', 'mean_elevation_d', 'min_elevation_d',
        'stdDev_elevation_d', 'variance_elevation_d']

# Population-weighted mean = summed (value * population) / summed population.
# The columns are replaced as a whole, not written in place through `.loc`, so the
# float result is stored as-is even if a summed column happened to be integer-typed.
df1[cols] = df1[cols].div(df1['PopTotal'], axis=0)
df1.head()

In [None]:
# The summed population is no longer needed; expose year and month of `Date` as
# the second and third columns so the table can be joined on them.
df1 = df1.drop(columns='PopTotal')
df1.insert(1, 'Year', df1['Date'].dt.year)
df1.insert(2, 'Month', df1['Date'].dt.month)
df1.head()

In [None]:
# Final table: state-month climate averages plus population, dengue cases, forest
# cover, urban cover and social indicators. The right join keeps every row of the
# dengue table.
df2 = (
    df1.merge(popByUF_df, on='CD_UF', how='left')
       .merge(dengue_df, on=['CD_UF', 'Year', 'Month'], how='right')
       .merge(fCover_df2, on=['CD_UF', 'Year'], how='left')
       .merge(urbRur_df, on='CD_UF', how='left')
       .merge(social_df, on='CD_UF', how='left')
)
df2

In [None]:
# Number of missing values per column of the final merged table.
df2.isna().sum()

In [None]:
# Write the final table to the Brazil folder as CSV, without the row index.
output_csv = join(config['main_brazil'], 'Brazil_UF_dengue_monthly.csv')
df2.to_csv(output_csv, index=False)

### 3.2. Plots

In [None]:
ufs = dengue_df['CD_UF'].unique()

# Position of every state panel in the 8x6 grid: state code -> (row, col).
# Named `uf_grid` rather than `dict` so the built-in `dict` type is not shadowed.
uf_grid = {'UF': ['14','16','13','15','21','23','12','11','17','22','25','24','51','52','29','26','27','50','53','31','28','35','33','32','41','42','43'],
           'row': [0,0,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,4,4,4,4,5,5,5,6,6,7],
           'col': [1,2,1,2,3,4,0,1,2,3,4,5,1,2,3,4,5,1,2,3,4,2,3,4,2,3,2]}

# Grid cells that hold no state; they are hidden in every figure.
dict2 = {'row': [0,0,0,0,1,1,3,4,4,5,5,5,6,6,6,6,7,7,7,7,7],
         'col': [0,3,4,5,0,5,0,0,5,0,1,5,0,1,4,5,0,1,3,4,5]}

# Panels whose x tick labels are blanked.
no_xaxis = {'row': [0,0,1,1,1,1,2,2,2,2,2,3,3,3,3,4,4,4,5,5,6],
            'col': [1,2,1,2,3,4,1,2,3,4,5,1,2,3,4,2,3,4,2,3,2]}

# Panels whose y tick labels are blanked.
no_yaxis = {'row': [0,1,1,1,2,2,2,2,2,3,3,3,3,4,4,4,5,5,6],
            'col': [2,2,3,4,1,2,3,4,5,2,3,4,5,2,3,4,3,4,3]}

months = {0:'Jan', 1:'Feb', 2:'Mar', 3:'Apr', 4:'May', 5:'Jun', 
          6:'Jul', 7:'Aug', 8:'Sep', 9:'Oct', 10:'Nov', 11:'Dec'}

from matplotlib.colors import ListedColormap

# One figure per variable; labels, colormaps and gammas are matched by position.
# Named `plot_vars` rather than `vars` so the built-in `vars()` is not shadowed.
plot_vars=['NDVI_d', 'humidity_d', 'max_temperature_2m_d', 'total_precipitation_d', 'mean_elevation_d']
lbls=['Normalized Difference Vegetation Index (NDVI)', 'Relative humidity', 'Maximum air temperature at 2m above the surface (°C)', 'Total precipitation (m)', 'Mean elevation (m)']
# `plt.get_cmap` replaces `cm.get_cmap`, which was deprecated in Matplotlib 3.7
# and removed in 3.9.
viridisBig = plt.get_cmap('viridis', 512)
newcmp = ListedColormap(viridisBig(np.linspace(0.15, 0.80, 256))) # blue to green colormap
colormaps = [plt.get_cmap('Greens'), plt.get_cmap('Blues'), plt.get_cmap('coolwarm'), plt.get_cmap('Blues'), plt.get_cmap('Oranges')]
gammas = [0.3, 0.6, 1, 0.5, 0.5]

for var, lbl, cmap, gamma in zip(plot_vars, lbls, colormaps, gammas):

    if var == 'max_temperature_2m_d':
        # The temperature is shifted by -273.15 (Kelvin -> °C, matching its label)
        # on a copy, so `df2` itself stays untouched.
        df3 = df2.copy()
        df3[var] -= 273.15
        plot_df = df3
    else:
        plot_df = df2

    # One colour scale per variable, shared by all state panels of the figure.
    normalizer = colors.PowerNorm(gamma=gamma, vmin=plot_df[var].min(), vmax=plot_df[var].max())

    fig, axes = plt.subplots(8, 6, figsize=(19,19))
    im = cm.ScalarMappable(norm=normalizer, cmap=cmap)

    for i in range(len(ufs)):
        uf = uf_grid['UF'][i]
        panel = axes[uf_grid['row'][i], uf_grid['col'][i]]
        name = UF_df[UF_df['CD_UF'] == uf]['NM_UF'].values[0]
        temp_df = plot_df[plot_df['CD_UF'] == uf][['Year','Month', var]]

        # Year x Month table of the variable, most recent year on top.
        result = temp_df.pivot(index='Year', columns='Month', values=var)\
                        .sort_index(ascending=False)
        sns.heatmap(data=result, ax=panel, cbar=False, robust=False, cmap=cmap, norm=normalizer, label=name)
        panel.set_title(name)

    # set new labels for the Months, using the first drawn panel as the template
    N = 3  # 1 tick every 3
    xticks_pos = axes[0,1].get_xticks()
    xticks_labels = axes[0,1].get_xticklabels()
    xticks_newlabels = [months[k] for k in range(len(xticks_labels))]
    myticks = list(xticks_pos[::N])      # positions of the selected ticks
    newlabels = xticks_newlabels[::N]    # month names of the selected ticks

    for ax in axes.flat:
        ax.set_xticks(myticks)
        ax.set_xticklabels(newlabels)
        ax.tick_params(axis='both', labelsize=10)
        ax.set_xlabel('')
        ax.set_ylabel('')
        ax.tick_params(axis="x", which="both", rotation=55)
    axes[2,0].set_ylabel('Year')
    axes[7,2].set_xlabel('Month')

    for r, c in zip(dict2['row'], dict2['col']):
        axes[r, c].set_visible(False)
    for r, c in zip(no_xaxis['row'], no_xaxis['col']):
        axes[r, c].xaxis.set_ticklabels([])
    for r, c in zip(no_yaxis['row'], no_yaxis['col']):
        axes[r, c].yaxis.set_ticklabels([])

    cbar = plt.colorbar(im, ax=axes.ravel().tolist(), pad=0.03, shrink=0.5, aspect=50)
    cbar.set_label(label=lbl, size=12, labelpad=8)

    #plt.tight_layout()
    plt.show()
    # Optional export (note the f-string prefix, which the earlier draft of this line lacked):
    #fig.savefig(join(config['main_brazil'], "plots", f'Brazil_UF_{var[:-2]}.png'), bbox_inches='tight')