In [1]:
import os
from dotenv import load_dotenv
# Read a local .env file (if one exists) into the process environment.
load_dotenv()

# Key handed to get_weights further down; None when API_KEY is not set.
api_key = os.environ.get("API_KEY")

In [2]:
import warnings
from pandas.errors import SettingWithCopyWarning

# Silence the two warning categories that would otherwise clutter cell output.
# Filters are registered in the same order as before: FutureWarning first.
for _category in (FutureWarning, SettingWithCopyWarning):
    warnings.simplefilter(action='ignore', category=_category)

In [3]:
import numpy as np
import pandas as pd
from format_data import *
from epiweeks import Week
from mosqlient.scoring import Scorer
from format_data import filter_agg_data

# Extend the import path with the parent directory before importing from the
# `methods` package (presumably it lives one level above this notebook — confirm).
import sys 
sys.path.append('../')
from methods.ensemble import Ensemble

In [4]:
# Scratch check: epidemiological week that contains 2025-04-27.
Week.fromdate(pd.to_datetime('2025-04-27'))

Week(2025, 18, CDC)

### Apply models:

In [5]:
# Forecast year: training ends at week 52 of YEAR-1 and forecasts are issued
# through week 52 of YEAR; also embedded in every output file name.
YEAR = 2023

In [6]:
# State code used to select the case data and to name all output files.
state = 'PR'

In [7]:
# Case data for the selected state (helper imported from format_data).
df_st = filter_agg_data(state)

# Modelling table derived from the case data; the preview below shows the
# columns SE, casos, diff_casos, casos_mean, casos_std and casos_slope.
df_org = org_data(df_st)

# Persist the modelling table; pandas infers gzip compression from the suffix.
df_org.to_csv(f'data/dengue_{state}.csv.gz')

df_org.head()

Unnamed: 0_level_0,SE,casos,diff_casos,casos_mean,casos_std,casos_slope
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-24,4,7.784715,0.41667,7.243636,0.364523,0.313907
2010-01-31,5,8.188735,0.40402,7.570653,0.465085,0.415953
2010-02-07,6,8.741296,0.552561,8.020698,0.507233,0.452377
2010-02-14,7,9.185855,0.444559,8.475151,0.532604,0.475598
2010-02-21,8,9.684223,0.498367,8.950027,0.551768,0.493102


#### Train the models

In [8]:
from aux_func import *

In [9]:
# Training window as ISO date strings: from the first day of epiweek 1 of 2015
# to the first day of epiweek 52 of the year before the forecast year.
first_train_week = Week(2015, 1)
last_train_week = Week(YEAR - 1, 52)

start_train_date = first_train_week.startdate().isoformat()
end_train_date = last_train_week.startdate().isoformat()

In [10]:
# Fit the models for this state on the training window. Per the log below this
# trains ARIMA, GP and LSTM in turn and writes the GP under saved_models/.
train_models(state, start_train_date, end_train_date)

PR
--------------------- Training ARIMA ---------------------
Performing stepwise search to minimize aic
 ARIMA(2,1,2)(0,0,0)[0] intercept   : AIC=inf, Time=0.41 sec
 ARIMA(0,1,0)(0,0,0)[0] intercept   : AIC=-1336.718, Time=0.03 sec
 ARIMA(1,1,0)(0,0,0)[0] intercept   : AIC=-1337.848, Time=0.05 sec
 ARIMA(0,1,1)(0,0,0)[0] intercept   : AIC=-1337.149, Time=0.06 sec
 ARIMA(0,1,0)(0,0,0)[0]             : AIC=-1338.678, Time=0.03 sec
 ARIMA(1,1,1)(0,0,0)[0] intercept   : AIC=-1332.893, Time=0.25 sec

Best model:  ARIMA(0,1,0)(0,0,0)[0]          
Total fit time: 0.834 seconds
--------------------- Training GP ---------------------
INFO:tensorflow:Assets written to: saved_models/gp_PR/assets


INFO:tensorflow:Assets written to: saved_models/gp_PR/assets


--------------------- Training LSTM ---------------------
Training fold 1...


  super().__init__(**kwargs)


Training fold 2...
Training fold 3...
Training fold 4...


In [11]:
# Last date on which forecasts are issued: first day of epiweek 52 of YEAR.
# NOTE(review): in a year with 53 epiweeks this stops one week early — confirm intended.
end_date = Week(YEAR, 52).startdate()

In [12]:
# Issue forecasts once per week (Sundays) from the last training week through
# end_date, tagging each batch with the epiweek in which it was issued.
#
# Batches are collected in a list and concatenated once at the end. Growing a
# DataFrame with pd.concat inside the loop copies all previous rows on every
# iteration, and seeding it with an empty frame relies on deprecated dtype
# inference that only stays quiet because FutureWarning is filtered above.
forecast_frames = []
for date in pd.date_range(start=end_train_date, end=end_date, freq = 'W-SUN'):

    # Forecast table returned by the project helper for this issue date.
    df_concat = apply_models(state, date.strftime('%Y-%m-%d'))

    # Label of the epiweek containing the issue date (e.g. '2022W52').
    df_concat['epiweek'] =  Week.fromdate(date).isoformat()

    forecast_frames.append(df_concat)

# pd.concat raises on an empty list, so keep the original empty-frame result
# when the date range yields no weeks.
if forecast_frames:
    df_forecast = pd.concat(forecast_frames, ignore_index = True)
else:
    df_forecast = pd.DataFrame()

In [13]:
# Persist the individual-model forecasts (no index column); this file is read
# back in the ensemble section below.
df_forecast.to_csv(f'forecast_tables/for_{YEAR}_{state}.csv', index = False)

### Generate the ensembles: 

In [14]:
# Observed case counts for the state; rows are looked up by date below and
# passed to get_weights together with the previous week's forecasts.
df_casos = filter_agg_data(state)
df_casos.head()

Unnamed: 0,date,casos,uf
0,2010-01-03,369.0,PR
1,2010-01-10,386.0,PR
2,2010-01-17,529.0,PR
3,2010-01-24,716.0,PR
4,2010-01-31,956.0,PR


In [15]:
# Reload the saved individual-model forecasts and turn the `date` column from
# text back into datetimes.
forecast_path = f'forecast_tables/for_{YEAR}_{state}.csv'
df_for = pd.read_csv(forecast_path)
df_for['date'] = pd.to_datetime(df_for['date'])
df_for.head()

Unnamed: 0,lower_95,upper_95,lower_90,upper_90,lower_80,upper_80,lower_50,upper_50,pred,date,step,model,epiweek
0,907.72402,2677.743458,981.279792,2429.435612,1075.358322,2177.173297,1258.528692,1823.268402,1509.0,2023-01-01,1,arima,2022W52
1,747.127417,3476.099876,830.527581,3006.563566,941.300659,2557.767184,1169.840738,1976.641778,1509.0,2023-01-08,2,arima,2022W52
2,647.020726,4293.043466,733.746442,3566.279678,852.111871,2906.111929,1106.93323,2105.135371,1509.0,2023-01-15,3,arima,2022W52
3,2627.952657,3825.025866,2709.140145,3712.341357,2805.652605,3586.396453,2974.658094,3384.490749,3173.296739,2023-01-01,1,gp,2022W52
4,1701.101873,2527.117102,1756.592192,2448.716216,1822.669226,2361.233417,1938.660455,2221.317868,2075.424656,2023-01-08,2,gp,2022W52


In [16]:
# Build the ensembles epiweek by epiweek: weights are obtained from the
# forecasts labelled with the previous epiweek (together with the observed
# cases) and then applied to the forecasts labelled with the current epiweek.
epiweeks = df_for.epiweek.unique()

# Each weight vector is reshaped to one row of this width (one column per
# weight, matching the three weights_* columns built in the next section).
n_models = 3

# Per-epiweek pieces are collected in lists and combined once after the loop.
# Repeated pd.concat / np.concatenate inside the loop copies everything
# accumulated so far on each iteration and seeds the result with empty frames.
ensemble_parts = {
    'lin': [], 'lin_equal': [], 'lin_crps': [],
    'log': [], 'log_equal': [], 'log_crps': [],
}
weight_rows = {'lin': [], 'log': [], 'crps': []}

for idx in np.arange(1, len(epiweeks)):

    # Forecasts (step=1) labelled with the previous epiweek.
    df_preds_last = format_pred(df_for, step =1, label_epiweek = epiweeks[idx-1])

    # Observed cases on the date of the first of those forecasts.
    # NOTE(review): `.date[0]` is a label lookup, so this assumes format_pred
    # returns a frame whose index starts at 0 — confirm.
    target_date = df_preds_last.date[0].strftime(format = '%Y-%m-%d')
    casos = df_casos.loc[df_casos.date == target_date].reset_index(drop = True)

    weights_lin, weights_log, weights_crps = get_weights(api_key, df_preds_last, casos)

    weight_rows['lin'].append(weights_lin.reshape(1, n_models))
    weight_rows['log'].append(weights_log.reshape(1, n_models))
    weight_rows['crps'].append(weights_crps.reshape(1, n_models))

    # Forecasts labelled with the current epiweek, combined with those weights.
    df_preds = format_pred(df_for, label_epiweek = epiweeks[idx])

    df_ens_lin, df_ens_lin_equal,  df_ens_lin_crps, df_ens_log, df_ens_log_equal, df_ens_log_crps = apply_ensemble(df_preds, weights_lin, weights_log, weights_crps)

    ensemble_parts['lin'].append(df_ens_lin)
    ensemble_parts['lin_equal'].append(df_ens_lin_equal)
    ensemble_parts['lin_crps'].append(df_ens_lin_crps)
    ensemble_parts['log'].append(df_ens_log)
    ensemble_parts['log_equal'].append(df_ens_log_equal)
    ensemble_parts['log_crps'].append(df_ens_log_crps)

# Combine once. pd.concat raises on an empty list, so fall back to an empty
# frame when there were fewer than two epiweeks (the loop never ran).
combined = {
    key: pd.concat(parts, ignore_index = True) if parts else pd.DataFrame()
    for key, parts in ensemble_parts.items()
}
df_ens_lin_end = combined['lin']
df_ens_lin_end_equal = combined['lin_equal']
df_ens_lin_end_crps = combined['lin_crps']
df_ens_log_end = combined['log']
df_ens_log_end_equal = combined['log_equal']
df_ens_log_end_crps = combined['log_crps']

# Stack the weight rows on top of an empty (0, n_models) float array: this
# keeps the (0, 3) shape when the loop never ran and the same dtype promotion
# as concatenating onto np.empty((0, 3)).
empty_weights = np.empty((0, n_models))
weights_crps_lin = np.vstack([empty_weights, *weight_rows['lin']])
weights_crps_log = np.vstack([empty_weights, *weight_rows['log']])
weights_crps_final = np.vstack([empty_weights, *weight_rows['crps']])



In [17]:
# Sanity check on the weight vector from the last loop iteration (length 3 in the saved run).
len(weights_lin)

3

In [18]:
# One table per weighting scheme, one row per epiweek the weights were applied to.
weight_cols = ['weights_1', 'weights_2', 'weights_3']

df_w_lin = pd.DataFrame(weights_crps_lin, columns = weight_cols)
df_w_log = pd.DataFrame(weights_crps_log, columns = weight_cols)
df_w_crps = pd.DataFrame(weights_crps_final, columns = weight_cols)

for df in (df_w_lin, df_w_log, df_w_crps):

    # The first epiweek only supplies weights, so labels start at the second.
    df['epiweek'] = epiweeks[1:]

    # First day of each labelled epiweek.
    df['date'] = df['epiweek'].map(lambda label: Week.fromstring(label).startdate())


df_w_lin.head()

Unnamed: 0,weights_1,weights_2,weights_3,epiweek,date
0,0.999998,9.999991e-07,9.999997e-07,2023W01,2023-01-01
1,1.000041e-06,0.999999,0.0,2023W02,2023-01-08
2,1.000067e-06,1e-06,0.999998,2023W03,2023-01-15
3,1.000003e-06,1.00006e-06,0.999998,2023W04,2023-01-22
4,1.304694e-15,0.999999,1e-06,2023W05,2023-01-29


In [19]:
# Write every weights/ensemble table to forecast_tables/ as gzip-compressed CSV.
# (file stem, frame) pairs are listed in the order the files are written; the
# stems are kept exactly as they were, including 'ensemble_crps_log'.
outputs = (
    ('weights_linear', df_w_lin),
    ('weights_log', df_w_log),
    ('weights_crps', df_w_crps),
    ('ensemble_linear', df_ens_lin_end),
    ('ensemble_linear_equal', df_ens_lin_end_equal),
    ('ensemble_log', df_ens_log_end),
    ('ensemble_log_equal', df_ens_log_end_equal),
    ('ensemble_linear_crps', df_ens_lin_end_crps),
    ('ensemble_crps_log', df_ens_log_end_crps),
)

for stem, frame in outputs:
    frame.to_csv(f'forecast_tables/{stem}_{state}_for_{YEAR}.csv.gz')