In [1]:
import sys
import datetime
import pyseir
import dill
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from scipy.stats import pearsonr

import tensorflow as tf
from tensorflow import keras

import ai_utils.metrics as ai_metrics

### Parâmetros do script

In [2]:
def _parse_bool_flag(raw):
    """Convert a command-line flag string into a real boolean.

    ``bool('False')`` evaluates to ``True`` because every non-empty string is
    truthy, so the text has to be interpreted explicitly.

    Args:
        raw: the textual flag (e.g. 'True', 'false', '1', '0').

    Returns:
        bool: the parsed value. An empty string maps to False.

    Raises:
        ValueError: if ``raw`` is not a recognised true/false spelling.
    """
    text = str(raw).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise ValueError('Cannot interpret {!r} as a boolean flag.'.format(raw))


# Positional command-line arguments:
#   1) lookback: the literal 'all', otherwise a value parsed as an integer
#   2) weighted: boolean flag selecting the weighted fit
lookback = sys.argv[1]
weighted = sys.argv[2]

if lookback != 'all':
    lookback = int(lookback)

weighted = _parse_bool_flag(weighted)

### Função para geração de DataFrame com métricas de regressão

In [3]:
def get_metrics(y_true, y_pred):
    """Build a table of regression metrics, one row per target column.

    Columns of ``y_true`` and ``y_pred`` are paired by position, and each pair
    must carry the same column name.

    Args:
        y_true: DataFrame holding the observed values.
        y_pred: DataFrame holding the predicted values.

    Returns:
        DataFrame indexed by column name with the columns 'MSE', 'RMSE',
        'MAE', 'Pearson r' and 'Fac2', followed by an 'Average' row holding
        the column-wise mean of all previous rows. If a pair of columns has
        different names, a message is printed and None is returned.
    """
    metrics = pd.DataFrame(columns=['MSE', 'RMSE', 'MAE', 'Pearson r', 'Fac2'])

    n_cols = y_true.shape[1]

    for col in range(n_cols):
        y_true_col = y_true.iloc[:, col]
        y_pred_col = y_pred.iloc[:, col]

        if y_true_col.name != y_pred_col.name:
            print('Column name error.')
            return None
        col_name = y_true_col.name

        mse = mean_squared_error(y_true_col, y_pred_col)
        # RMSE is derived from the MSE already computed: this avoids a second
        # pass over the data and the `squared=False` keyword, which newer
        # scikit-learn releases deprecated and later removed.
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_true_col, y_pred_col)
        pearson_r = pearsonr(y_true_col, y_pred_col)[0]
        fac2 = ai_metrics.fac2(y_true_col.values, y_pred_col.values, to_numpy=True)
        metrics.loc[col_name, :] = [mse, rmse, mae, pearson_r, fac2]

    # Summary row: mean of every metric over all target columns.
    metrics.loc['Average', :] = metrics.mean()
    return metrics

### Carregando datasets para avaliação

In [4]:
# All evaluation splits live in the same folder and share one file-name suffix.
eval_dir = '../data_eval/out(confirmed)in(confirmed-infection_days)/lf20_lb13'


def _read_split(prefix):
    """Read '<prefix>_lf20_lb13.csv' from the evaluation folder as a DataFrame."""
    return pd.read_csv('{}/{}_lf20_lb13.csv'.format(eval_dir, prefix))


# Input (x) splits.
x_train_scaled = _read_split('x_train')
x_val_scaled = _read_split('x_val')
x_test_scaled = _read_split('x_test')

# Target (y) splits.
y_train_scaled = _read_split('y_train')
y_val_scaled = _read_split('y_val')
y_test_scaled = _read_split('y_test')

### Carregando objeto "Scaler" para converter datasets de volta para a escala original

In [5]:
# Deserialize the scaler object that was stored with dill.
# NOTE(review): dill, like pickle, can execute arbitrary code while loading;
# only load scaler files that come from a trusted source.
scaler_path = '../data_eval/scaler.dill'
with open(scaler_path, 'rb') as serialized_scaler:
    scaler = dill.load(serialized_scaler)

### Convertendo dados para a escala original

In [6]:
# Map every split back to the original scale. Input frames are converted with
# lb=14 / lf=None, target frames with lb=None / lf=20.
x_train, x_val, x_test = [
    scaler.get_original_scale(scaled_inputs, lb=14, lf=None)
    for scaled_inputs in (x_train_scaled, x_val_scaled, x_test_scaled)
]

y_train, y_val, y_test = [
    scaler.get_original_scale(scaled_targets, lb=None, lf=20)
    for scaled_targets in (y_train_scaled, y_val_scaled, y_test_scaled)
]

### Lendo dados populacionais mundiais

In [7]:
# Columns of the population table that are discarded right after loading.
unused_pop_columns = [
    'Yearly Change', 'Net Change',
    'Density (P/Km²)', 'Land Area (Km²)',
    'Migrants (net)', 'Fert. Rate', 'World Share',
]
country_col = 'Country (or dependency)'

population_df = (
    pd.read_csv('../../kaggle_pop_data/population_by_country_2020.csv')
    .drop(columns=unused_pop_columns)
)

# Rename the Czech entry to the short form 'Czechia'
# (presumably the spelling used by the region column of the datasets; verify).
population_df[country_col] = population_df[country_col].replace(
    ['Czech Republic (Czechia)'], 'Czechia'
)

In [8]:
def _build_sample(x_region, row, region, lookback):
    """Assemble the one-row frame used to fit the model for ``row`` of a region.

    Args:
        x_region: input rows of a single region, in their original order.
        row: position (0-based) of the row being evaluated inside ``x_region``.
        region: region label written into the rebuilt sample.
        lookback: 'all', or an integer N; with an integer the columns
            'confirmed_t-(N+1)' ... 'confirmed_t-13' are dropped from the row.

    Returns:
        A DataFrame with exactly one row.
    """
    if lookback != 'all':
        window = x_region.iloc[row:row+1, :]
        extra_lags = ['confirmed_t-{}'.format(lag) for lag in range(lookback+1, 14)]
        return window.drop(columns=extra_lags)

    if row == 0:
        return x_region.iloc[row:row+1, :]

    # Full history: the confirmed_* values of the region's first row, extended
    # with one value from each later row up to ``row``.
    first_window = x_region.iloc[0:1, :].filter(regex='^confirmed', axis=1).values.flatten()
    # Second-to-last column; assumed to be the most recent confirmed count of
    # each row (TODO confirm against the dataset layout).
    later_values = x_region.iloc[1:row+1, -2].values
    history = np.concatenate([first_window, later_values])

    # Column names run from confirmed_t-(len-1) up to confirmed_t0.
    confirmed_cols = ['confirmed_t{}'.format(lag) for lag in range(-len(history)+1, 1, 1)]
    cols = ['region']
    cols.extend(confirmed_cols)
    cols.extend(['infection_days_t0'])

    # Last column of the current row supplies infection_days_t0.
    inf_days = x_region.iloc[row, -1]
    vals = [region]
    vals.extend(history)
    vals.extend([inf_days])

    return pd.DataFrame(dict(zip(cols, vals)), index=[0])


def _recency_weights(sample):
    """Return a (1, n) array of weights 1/n, ..., 1/2, 1 for the confirmed_* columns."""
    sample_vals = sample.filter(regex='^confirmed', axis=1).values.flatten()
    weights = 1 / np.arange(1, len(sample_vals)+1)[::-1]  # Recent data is more heavily weighted
    return np.reshape(weights, (1, -1))


begin = datetime.datetime.now()
print('Begin:', begin)

partitions = {'val':(x_val, y_val), 'test':(x_test, y_test)}

for part, (x_part, y_part_true) in partitions.items():

    print(part)

    # Frames are collected in lists and concatenated once per partition
    # instead of growing a DataFrame inside the innermost loop.
    pred_frames = []
    param_frames = []
    true_frames = []

    cols_to_drop = ['infection_days_t-{}'.format(x) for x in range(1, 14)]
    x_part = x_part.drop(columns=cols_to_drop)

    part_regions = x_part['region'].unique()

    for region in part_regions:

        print(region)

        x_region = x_part.loc[x_part.loc[:,'region']==region, :]
        y_region = y_part_true.loc[y_part_true.loc[:,'region']==region, :]

        # Predictions are produced region by region, so the true values are
        # gathered in the same order to keep both frames row-aligned.
        if len(x_region) != len(y_region):
            raise ValueError(
                'Region {!r} has {} input rows but {} target rows.'.format(
                    region, len(x_region), len(y_region)))
        true_frames.append(y_region)

        pop_matches = population_df.loc[(population_df['Country (or dependency)']==region),
                                        'Population (2020)']
        if pop_matches.empty:
            raise KeyError('No population entry found for region {!r}.'.format(region))
        pop = pop_matches.values[0]

        for row in range(len(x_region)):

            sample = _build_sample(x_region, row, region, lookback)
            weights = _recency_weights(sample) if weighted else None

            # A fresh model is fitted for every sample.
            model = pyseir.SEIR_model()

            model.fit(data=sample, pop=pop, weights=weights)

            y_pred = model.predict(data=sample, pop=pop, look_forward=20)
            pred_frames.append(y_pred)

            params_df = pd.DataFrame(model.params, index=[0])
            params_df.insert(0, column='region', value=region)
            param_frames.append(params_df)

    y_part_pred = pd.concat(pred_frames, ignore_index=True) if pred_frames else pd.DataFrame()
    part_params_history = pd.concat(param_frames, ignore_index=True) if param_frames else pd.DataFrame()
    if true_frames:
        y_part_true = pd.concat(true_frames, ignore_index=True)

    y_part_true_nregion = y_part_true.drop('region', axis=1)
    y_part_pred_nregion = y_part_pred.drop('region', axis=1)

    part_metrics = get_metrics(y_part_true_nregion, y_part_pred_nregion)
    if part_metrics is None:
        # get_metrics returns None when true/predicted column names differ.
        raise RuntimeError(
            'Metrics could not be computed for partition {!r}: '
            'true and predicted column names differ.'.format(part))

    # NOTE(review): index=False drops the row labels (target column names and
    # 'Average') from the metrics file; kept as-is so the output layout does
    # not change. Consider writing the index if no consumer depends on it.
    part_metrics.to_csv('seir_{}_metrics_lb_{}_weight_{}.csv'.format(part,lookback,weighted), index=False)

    y_part_pred.to_csv('seir_{}_preds_lb_{}_weight_{}.csv'.format(part,lookback,weighted), index=False)

    part_params_history.to_csv('seir_{}_params_lb_{}_weight_{}.csv'.format(part,lookback,weighted), index=False)

end = datetime.datetime.now()
print('End:', end)
print('Time:', end-begin)

Begin: 2020-05-14 21:25:23.268091
val
Spain


Process ForkPoolWorker-54:
Process ForkPoolWorker-53:
Process ForkPoolWorker-56:
Process ForkPoolWorker-55:
Traceback (most recent call last):
  File "/home/lucas/miniconda3/envs/covid/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/lucas/miniconda3/envs/covid/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/home/lucas/miniconda3/envs/covid/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/lucas/miniconda3/envs/covid/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/home/lucas/miniconda3/envs/covid/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/home/lucas/miniconda3/envs/covid/lib/python3.7/multiprocessing/pool.py", line

Traceback (most recent call last):
  File "/home/lucas/miniconda3/envs/covid/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3331, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-8-21b159ffcead>", line 64, in <module>
    model.fit(data=sample, pop=pop, weights=weights)
  File "/home/lucas/CimatecProjects/covid19/lucas/source/models_epidemic/pyseir.py", line 40, in fit
    self.params = SEIR_fit(data=data, population=pop, bounds=self.bounds, **kwargs)
  File "/home/lucas/CimatecProjects/covid19/lucas/source/models_epidemic/pyseir.py", line 272, in SEIR_fit
    updating='deferred', polish=True, workers=-1, seed=0
  File "/home/lucas/miniconda3/envs/covid/lib/python3.7/site-packages/scipy/optimize/_differentialevolution.py", line 306, in differential_evolution
    ret = solver.solve()
  File "/home/lucas/miniconda3/envs/covid/lib/python3.7/site-packages/scipy/optimize/_differentialevolution.py", line 753, in solve
    next(self

KeyboardInterrupt: 