In [1]:
import numpy as np
import pandas as pd 
from numpy import log, exp
from itertools import product 
import matplotlib.pyplot as plt 
from scipy.stats import lognorm
from scoringrules import crps_lognormal
from mosqlient.scoring import compute_wis

Load data: 

In [2]:
df = pd.read_csv('data/dengue_agg.csv.gz')
df.date = pd.to_datetime(df.date)
df.head()

Unnamed: 0,date,uf,casos
0,2010-01-03,AC,760
1,2010-01-03,TO,231
2,2010-01-03,SP,1628
3,2010-01-03,SE,3
4,2010-01-03,SC,10


Parametros estimados pela CDF:

In [3]:
df_cdf = pd.read_csv('results/preds_CDF.csv')

df_cdf.loc[df_cdf.mu.isna(), ['mu', 'sigma']] = [0.01, 1/2]
df_cdf.head()

Unnamed: 0,date,lower_95,lower_90,lower_80,lower_50,pred,upper_50,upper_80,upper_90,upper_95,valid_test,state,model_id,mu,sigma
0,2024-10-06,22.422222,40.304679,79.16297,244.274988,853.700012,2982.915845,9196.810733,18041.732884,32366.797486,3,RS,108,6.748348,1.855412
1,2024-10-13,18.743651,34.187301,68.26913,216.515891,779.997873,2809.263104,8900.886361,17748.532118,32294.740637,3,RS,108,6.65783,1.900458
2,2024-10-20,6.581569,12.226861,24.874887,81.165077,301.351891,1118.132411,3638.484729,7371.869282,13600.483733,3,RS,108,5.704178,1.945792
3,2024-10-27,17.097895,32.060474,66.082749,220.93438,843.905436,3222.679992,10763.205908,22149.610067,41420.387215,3,RS,108,6.736465,1.987314
4,2024-11-03,18.724941,35.572182,74.438413,255.270334,1003.039414,3940.40738,13500.169745,28209.239339,53454.977878,3,RS,108,6.909363,2.029238


Parâmetros estimados pela KL:

In [4]:
df_kl = pd.read_csv('results/preds_KL.csv')

#df_kl.loc[df_kl.mu.isna(), ['mu', 'sigma']] = [0.01, 1/2]
df_kl.head()

Unnamed: 0,date,lower_95,lower_90,lower_80,lower_50,pred,upper_50,upper_80,upper_90,upper_95,valid_test,state,model_id,mu,sigma
0,2024-10-06,22.422222,40.304679,79.16297,244.274988,853.700012,2982.915845,9196.810733,18041.732884,32366.797486,3,RS,108,6.195184,1.748367
1,2024-10-13,18.743651,34.187301,68.26913,216.515891,779.997873,2809.263104,8900.886361,17748.532118,32294.740637,3,RS,108,6.091297,1.790893
2,2024-10-20,6.581569,12.226861,24.874887,81.165077,301.351891,1118.132411,3638.484729,7371.869282,13600.483733,3,RS,108,5.124968,1.834564
3,2024-10-27,17.097895,32.060474,66.082749,220.93438,843.905436,3222.679992,10763.205908,22149.610067,41420.387215,3,RS,108,6.144043,1.872778
4,2024-11-03,18.724941,35.572182,74.438413,255.270334,1003.039414,3940.40738,13500.169745,28209.239339,53454.977878,3,RS,108,6.30438,1.912227


Concatenating results:

In [5]:
df_aprox = df_cdf.merge(df_kl, left_on = ['date', 'lower_95', 'lower_90', 'lower_80', 'lower_50', 'pred',
       'upper_50', 'upper_80', 'upper_90', 'upper_95', 'valid_test', 'state',
       'model_id'], right_on =['date', 'lower_95', 'lower_90', 'lower_80', 'lower_50', 'pred',
       'upper_50', 'upper_80', 'upper_90', 'upper_95', 'valid_test', 'state',
       'model_id'],suffixes=('_cdf', '_kl') 
            )

df_aprox.date = pd.to_datetime(df_aprox.date)

df_aprox = df_aprox.loc[df_aprox.state != 'ES']

df_aprox.head()

Unnamed: 0,date,lower_95,lower_90,lower_80,lower_50,pred,upper_50,upper_80,upper_90,upper_95,valid_test,state,model_id,mu_cdf,sigma_cdf,mu_kl,sigma_kl
0,2024-10-06,22.422222,40.304679,79.16297,244.274988,853.700012,2982.915845,9196.810733,18041.732884,32366.797486,3,RS,108,6.748348,1.855412,6.195184,1.748367
1,2024-10-13,18.743651,34.187301,68.26913,216.515891,779.997873,2809.263104,8900.886361,17748.532118,32294.740637,3,RS,108,6.65783,1.900458,6.091297,1.790893
2,2024-10-20,6.581569,12.226861,24.874887,81.165077,301.351891,1118.132411,3638.484729,7371.869282,13600.483733,3,RS,108,5.704178,1.945792,5.124968,1.834564
3,2024-10-27,17.097895,32.060474,66.082749,220.93438,843.905436,3222.679992,10763.205908,22149.610067,41420.387215,3,RS,108,6.736465,1.987314,6.144043,1.872778
4,2024-11-03,18.724941,35.572182,74.438413,255.270334,1003.039414,3940.40738,13500.169745,28209.239339,53454.977878,3,RS,108,6.909363,2.029238,6.30438,1.912227


In [6]:
df_aprox.valid_test.unique()

array([3, 2, 1])

Cálculo do WIS, do CRPS usando os parâmetros obtidos pela CDF e os parâmetros obtidos pela KL:


In [50]:
def compute_metrics(model, df_w, df_preds_, state, valid_test, metric, sufix = None):
    '''
    Function to compute the score for the entire validation test or only around the peak (if peak=True).
    '''

    df_preds_model = df_preds_.loc[(df_preds_.model_id == model) & (df_preds_.state == state) & (df_preds_.valid_test == valid_test)].reset_index(drop = True)
        
    df_preds_to_score = df_w.loc[df_w.uf ==state].merge(df_preds_model, left_on = ['date', 'uf'], right_on = ['date', 'state'])

    if metric == 'wis':
        score = np.mean(compute_wis( 
                        df_preds_to_score[['date',  'lower_95', 'lower_90', 'lower_80', 'lower_50',
                           'pred', 'upper_50', 'upper_80', 'upper_90', 'upper_95']],
                        observed_value = df_preds_to_score['casos'].values)) 

    if metric == 'crps':
        score = np.mean(crps_lognormal(df_preds_to_score['casos'].values,
                                      df_preds_to_score[f'mu_{sufix}'].values,
                                      df_preds_to_score[f'sigma_{sufix}'].values)) 

    return score

In [8]:
compute_metrics(model= 155, df_w = df,
                df_preds_ = df_aprox,
                state = 'PR', 
                valid_test = 1, metric = 'wis', sufix = None)

2230.412212185296

In [9]:
compute_metrics(model= 155, df_w = df,
                df_preds_ = df_aprox,
                state = 'PR', 
                valid_test = 1, metric = 'crps', sufix = 'cdf')

2563.4105611388836

In [10]:
compute_metrics(model= 155, df_w = df,
                df_preds_ = df_aprox,
                state = 'PR', 
                valid_test = 1, metric = 'crps', sufix = 'kl')

2973.2265057633203

In [11]:
list_scores = []

for state, model, val_test in product(df_aprox.state.unique(), df_aprox.model_id.unique(), [1,2,3]):

    wis = compute_metrics(model= model, df_w = df,
                df_preds_ = df_aprox,
                state = state, 
                valid_test = val_test, metric = 'wis', sufix = None)

    crps_cdf = compute_metrics(model= model, df_w = df,
                df_preds_ = df_aprox,
                state = state, 
                valid_test = val_test, metric = 'crps', sufix = 'cdf')

    crps_kl = compute_metrics(model= model, df_w = df,
                df_preds_ = df_aprox,
                state = state, 
                valid_test = val_test, metric = 'crps', sufix = 'kl')
    
    list_scores.append(
        pd.DataFrame([[model, state, val_test, wis, crps_cdf, crps_kl]],
                            columns = ['model', 'state', 'validation_test', 'WIS', 'CRPS_CDF', 'CRPS_KL'] 
                            )
    ) 


df_scores = pd.concat(list_scores, ignore_index = True)

df_scores.head()    

  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.log(x)
  return np.

Unnamed: 0,model,state,validation_test,WIS,CRPS_CDF,CRPS_KL
0,108,RS,1,8921.236508,5307.986849,1818.125055
1,108,RS,2,17769.92095,10238.907752,3852.876667
2,108,RS,3,94820.348453,59714.365961,21088.670323
3,133,RS,1,609.003134,673.953669,646.58563
4,133,RS,2,2872.044724,3641.527471,3407.693164


In [12]:
df_scores.isnull().sum()

model              0
state              0
validation_test    0
WIS                0
CRPS_CDF           0
CRPS_KL            0
dtype: int64

In [13]:
df_scores.loc[df_scores.WIS.isna()]

Unnamed: 0,model,state,validation_test,WIS,CRPS_CDF,CRPS_KL


Rank dos modelos basead no valor médio de cada métrica em cada teste de validação:

In [19]:
df_sc_agg = df_scores.groupby(['model', 'state'])[['WIS', 'CRPS_CDF', 'CRPS_KL']].mean().reset_index()

df_sc_agg.head()

Unnamed: 0,model,state,WIS,CRPS_CDF,CRPS_KL
0,108,AC,140.019272,122.311151,99.854603
1,108,AL,712.714739,679.482779,379.053034
2,108,AM,52.051717,57.042867,56.223123
3,108,AP,117.166705,139.856468,109.192725
4,108,BA,1480.538419,1745.255772,1469.447018


In [35]:
df_pos = pd.DataFrame()

for state in df_aprox.state.unique(): 

    df_slice_ = df_sc_agg.loc[df_sc_agg.state == state]
    
    for sc in ['WIS', 'CRPS_CDF', 'CRPS_KL']: 
    
        df_slice_[f'rank_{sc}'] = df_slice_[f'{sc}'].rank(method="dense", ascending=True)

    for pos in np.arange(1, 18):

        df_pos = pd.concat([df_pos, pd.DataFrame([[state, pos, df_slice_.loc[df_slice_[f'rank_WIS'] == pos].model.values[0],
                       df_slice_.loc[df_slice_[f'rank_CRPS_CDF'] == pos].model.values[0],
                       df_slice_.loc[df_slice_[f'rank_CRPS_KL'] == pos].model.values[0]]], 
                     columns = ['state', 'pos', 'model_wis', 'model_crps_cdf', 'model_crps_kl'])], ignore_index = True)


df_pos.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_slice_[f'rank_{sc}'] = df_slice_[f'{sc}'].rank(method="dense", ascending=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_slice_[f'rank_{sc}'] = df_slice_[f'{sc}'].rank(method="dense", ascending=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_slice_[f'rank_{sc}'] = df_slice_[f'{sc

Unnamed: 0,state,pos,model_wis,model_crps_cdf,model_crps_kl
0,RS,1,137,156,156
1,RS,2,155,155,155
2,RS,3,156,157,157
3,RS,4,157,137,137
4,RS,5,144,150,150


In [37]:
df_pos.head()

Unnamed: 0,state,pos,model_wis,model_crps_cdf,model_crps_kl
0,RS,1,137,156,156
1,RS,2,155,155,155
2,RS,3,156,157,157
3,RS,4,157,137,137
4,RS,5,144,150,150


Número de modelos diferentes em cada posição do ranking. Se os três scores concordassem o valor seria 1. No geral os CRPS concordam mais entre si do que com o WIS.

In [46]:
df_pos["n_unique_models"] = df_pos.apply(
    lambda row: len(np.unique([row["model_wis"], row["model_crps_cdf"], row["model_crps_kl"]])),
    axis=1
)

df_pos.head()

Unnamed: 0,state,pos,model_wis,model_crps_cdf,model_crps_kl,n_unique_models
0,RS,1,137,156,156,2
1,RS,2,155,155,155,1
2,RS,3,156,157,157,2
3,RS,4,157,137,137,2
4,RS,5,144,150,150,2


In [47]:
df_pos.loc[df_pos.pos == 1]

Unnamed: 0,state,pos,model_wis,model_crps_cdf,model_crps_kl,n_unique_models
0,RS,1,137,156,156,2
17,RN,1,135,139,135,2
34,RJ,1,156,156,156,1
51,PI,1,150,150,135,2
68,PE,1,150,139,150,2
85,PR,1,155,155,155,1
102,PB,1,150,150,139,2
119,PA,1,155,156,156,2
136,MG,1,156,156,156,1
153,MS,1,133,136,136,2
