In [65]:
"""
Parse the forecast submissions pulled from CDC's GitHub repo.
    Output: a score card containing all models,
    i.e., one row per
    model, forecast_week, ahead, location (as region abbreviation), type, quantile, value
    e.g.
    GT-FluFNP, 202205, 1, CA, point, NaN, 843
    GT-FluFNP, 202205, 1, CA, quantile, 0.01, 338
    ....
    GT-FluFNP, 202205, 2, CA, point, NaN, 900
    GT-FluFNP, 202205, 2, CA, quantile, 0.01, 438
"""
import glob
import os
import pdb
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from epiweeks import Week

# death_target = ['1 wk ahead inc death' , '2 wk ahead inc death' , '3 wk ahead inc death' , '4 wk ahead inc death']

data_ew = Week.thisweek(system="CDC") - 1  # -1 because we have data for the previous (ending) week
DIR =  './data-forecasts/'
OUTPUT_COLUMNS = ['model', 'forecast_week', 'ahead', 'location', 'type', 'quantile', 'value']
SORT_COLUMNS = ['forecast_week', 'location', 'ahead', 'type']
ENSEMBLE_MODEL = 'Flusight-ensemble'


def _submission_week(path):
    """Return (submission datetime, forecast epiweek) for one submission file.

    The first 10 characters of the file name must be a YYYY-MM-DD date;
    `datetime.strptime` raises ValueError otherwise.
    """
    submitted = datetime.strptime(os.path.basename(path)[:10], '%Y-%m-%d')
    # epiweek ends on Saturday, but submission is until Monday.
    # Subtracting 2 days puts a Monday submission in the previous week,
    # which also aligns submission week and data.
    return submitted, Week.fromdate(submitted - timedelta(days=2))


def _latest_submission_per_week(paths):
    """Keep one file per forecast week: the one with the latest file-name date.

    Files are grouped by the same epiweek that is later written to the
    `forecast_week` column (year and week), so two files can never end up
    with the same `forecast_week`, and equal week numbers from different
    years do not overwrite each other.

    Returns a list of (forecast_week, path) tuples.
    """
    latest = {}
    for path in paths:
        try:
            submitted, week = _submission_week(path)
        except ValueError:
            print(f'skipping {path}: file name does not start with YYYY-MM-DD')
            continue
        key = (week.year, week.week)
        # `<=` keeps the later-listed file on a date tie, as before.
        if key not in latest or latest[key][0] <= submitted:
            latest[key] = (submitted, week, path)
    return [(week, path) for _, week, path in latest.values()]


def _load_submissions(model):
    """Read the de-duplicated submission CSVs of `model`, one DataFrame per file."""
    model_dir = DIR + '/' + model + '/'
    frames = []
    for week, path in _latest_submission_per_week(glob.glob(model_dir + '*.csv')):
        # Read location codes as text so leading zeros are not lost.
        frame = pd.read_csv(path, dtype={'location': str})
        frame['forecast_week'] = week
        frames.append(frame)
    return frames


def _build_model_scorecard(model, frames, location_dict):
    """Join one model's submissions and reshape them to OUTPUT_COLUMNS.

    Raises KeyError if a location code is not a key of `location_dict`.
    """
    df = pd.concat(frames, ignore_index=True, sort=False)
    df = df.rename(columns={'target': 'ahead'})

    # convert location to region abbreviation (single-character codes get a leading zero)
    codes = df['location'].astype(str).str.zfill(2)
    unknown = codes[~codes.isin(list(location_dict))].unique()
    if len(unknown) > 0:
        raise KeyError(f'{model}: unknown location code(s) {sorted(unknown)}')
    df['location'] = codes.map(location_dict)

    # The horizon is the leading integer of the target string; taking all
    # leading digits (not just the first character) keeps horizons >= 10 intact.
    horizon = df['ahead'].astype(str).str.extract(r'^\s*(\d+)', expand=False)
    df['ahead'] = pd.to_numeric(horizon).astype('Int64')

    df['model'] = model
    df = df[OUTPUT_COLUMNS]

    if model == ENSEMBLE_MODEL:
        # For the ensemble, the 0.5 quantile rows are duplicated as 'point' rows.
        medians = df[df['quantile'] == 0.5].assign(type='point', quantile=np.nan)
        df = pd.concat([df, medians], ignore_index=True)

    return df.sort_values(by=SORT_COLUMNS)


# Every sub-directory of DIR is treated as one model (plain files such as *.md are ignored).
models = [path.split(DIR)[1] for path in glob.glob(DIR + '/*') if os.path.isdir(path)]
location_df = pd.read_csv('data-locations/locations.csv', dtype={'location': str})
location_dict = dict(zip(location_df['location'], location_df['abbreviation']))

# for each model, get all submissions
df_list = []
print(models)
for model in models:
    model_frames = _load_submissions(model)
    print(len(model_frames))
    print(model)
    if not model_frames:
        # pd.concat raises on an empty list, so a model without CSV files is skipped.
        continue
    df_list.append(_build_model_scorecard(model, model_frames, location_dict))

df = pd.concat(df_list, ignore_index=True, sort=False)
df = df.sort_values(by=['model'] + SORT_COLUMNS)
df.to_csv('./predictions.csv',index=False)
print("done")

['LosAlamos_NAU-CModel_Flu', 'SigSci-TSENS', 'Flusight-baseline', 'LUcompUncertLab-VAR2K_plusCOVID', 'CU-ensemble', 'JHUAPL-Gecko', 'SGroup-RandomForest', 'VTSanghani-ExogModel', 'SigSci-CREG', 'UMass-trends_ensemble', 'CMU-TimeSeries', 'SGroup-SIkJalpha', 'CEID-Walk', 'LUcompUncertLab-humanjudgment', 'Flusight-ensemble', 'PSI-DICE', 'IEM_Health-FluProject', 'UT_FluCast-Voltaire', 'LUcompUncertLab-VAR2_plusCOVID', 'UVAFluX-Ensemble', 'LUcompUncertLab-VAR2K', 'LUcompUncertLab-TEVA', 'GH-Flusight', 'MOBS-GLEAM_FLUH', 'LUcompUncertLab-VAR2', 'GT-FluFNP-raw', 'GT-FluFNP']
9
LosAlamos_NAU-CModel_Flu
9
SigSci-TSENS
9
Flusight-baseline
9
LUcompUncertLab-VAR2K_plusCOVID
9
CU-ensemble
6
JHUAPL-Gecko
5
SGroup-RandomForest
8
VTSanghani-ExogModel
9
SigSci-CREG
7
UMass-trends_ensemble
9
CMU-TimeSeries
9
SGroup-SIkJalpha
7
CEID-Walk
6
LUcompUncertLab-humanjudgment
9
Flusight-ensemble
9
PSI-DICE
9
IEM_Health-FluProject
9
UT_FluCast-Voltaire
9
LUcompUncertLab-VAR2_plusCOVID
9
UVAFluX-Ensemble
9
LUcomp

In [60]:
# Print the name of every entry found directly under DIR
# (the part of each globbed path that follows the DIR prefix).
for entry_path in glob.glob(DIR + '/*'):
    entry_name = entry_path.split(DIR)[1]
    print(entry_name)

LosAlamos_NAU-CModel_Flu
SigSci-TSENS
Flusight-baseline
LUcompUncertLab-VAR2K_plusCOVID
CU-ensemble
JHUAPL-Gecko
SGroup-RandomForest
VTSanghani-ExogModel
SigSci-CREG
UMass-trends_ensemble
CMU-TimeSeries
SGroup-SIkJalpha
CEID-Walk
LUcompUncertLab-humanjudgment
Flusight-ensemble
PSI-DICE
IEM_Health-FluProject
README.md
UT_FluCast-Voltaire
METADATA.md
LUcompUncertLab-VAR2_plusCOVID
UVAFluX-Ensemble
LUcompUncertLab-VAR2K
LUcompUncertLab-TEVA
GH-Flusight
MOBS-GLEAM_FLUH
LUcompUncertLab-VAR2
GT-FluFNP-raw
GT-FluFNP


In [45]:
# Scratch check: displays the built-in type of the integer literal 5 (`int`).
type(5)

int

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  z = np.arange(3, dtype=np.int)


In [73]:
# Spot check: reload the exported score card, keep only the rows of one model,
# print the first 100 values at full precision, then display those rows.
df = pd.read_csv("predictions.csv").query('model == "VTSanghani-ExogModel"')
for forecast_value in df['value'].head(100):
    print(forecast_value)
df.head(100)

2.957617998123169
0.0
0.6375894010066986
1.620467758178711
2.1095148801803587
2.1444329738616945
2.2644848823547363
2.5354020595550537
2.63372004032135
2.664825999736786
2.721835231781006
2.81559591293335
2.957617998123169
3.072873091697693
3.1612223625183105
3.2074337005615234
3.251999616622925
3.4100457429885864
3.48996901512146
3.48996901512146
3.48996901512146
3.5151335477828964
4.289033865928646
5.381228952407841
3.48996901512146
1.3488049411773682
1.582910993695259
2.2530653715133666
3.4424407482147217
3.48996901512146
3.48996901512146
3.48996901512146
3.48996901512146
3.48996901512146
3.48996901512146
3.48996901512146
3.48996901512146
3.48996901512146
3.48996901512146
3.48996901512146
3.48996901512146
3.48996901512146
3.48996901512146
5.213014078140256
6.184170722961428
6.893884968757629
7.709536004066457
8.690273122787483
2.734231948852539
0.0
0.0
0.0457495808601379
0.4021731853485108
1.0403863072395323
1.3810546398162842
1.4895933866500854
1.989180445671081
2.1520814895629883


Unnamed: 0,model,forecast_week,ahead,location,type,quantile,value
1067154,VTSanghani-ExogModel,202201,1,AK,point,,2.957618
1067155,VTSanghani-ExogModel,202201,1,AK,quantile,0.010,0.000000
1067156,VTSanghani-ExogModel,202201,1,AK,quantile,0.025,0.637589
1067157,VTSanghani-ExogModel,202201,1,AK,quantile,0.050,1.620468
1067158,VTSanghani-ExogModel,202201,1,AK,quantile,0.100,2.109515
...,...,...,...,...,...,...,...
1067249,VTSanghani-ExogModel,202201,4,AK,quantile,0.990,4.299457
1067250,VTSanghani-ExogModel,202201,1,AL,point,,24.984921
1067251,VTSanghani-ExogModel,202201,1,AL,quantile,0.010,8.745167
1067252,VTSanghani-ExogModel,202201,1,AL,quantile,0.025,9.682554
