In [1]:
import os
import sys

## Add the COnVIDa `lib` directory to the import path.
## The directory is resolved relative to the kernel's working directory:
## it is looked up as <parent of cwd>/lib.
convida_lib_path = os.path.dirname(os.getcwd())
lib = os.path.join(convida_lib_path, 'lib')
## Guard the append so that re-running this cell does not pile up duplicate
## entries in sys.path.
if lib not in sys.path:
    sys.path.append(lib)

In [2]:
from convida import COnVIDa
from regions import Regions
from datatype import DataType
import pandas as pd
import h5py
import os.path
import time

## Generation of cache

### Temporal

In [3]:
print('GENERATING CACHÉ...')
print()

# Regions reported by the library for country code 'ES'.
all_regions = Regions.get_regions('ES')
print('REGIONS: ', ', '.join(all_regions))
print()

# get_data_items_names returns a mapping whose values are collections of item
# names; flatten them into one list, preserving the mapping's order.
datasources = COnVIDa.get_data_items_names(DataType.TEMPORAL, language='internal')
all_data_items = [
    item_name
    for source_items in datasources.values()
    for item_name in source_items
]
print('DATA ITEMS: ', ', '.join(all_data_items))
print()

GENERATING CACHÉ...

REGIONS:  España, CA Andalucía, CA Aragón, CA Principado de Asturias, CA Islas Baleares, CA Canarias, CA Cantabria, CA Castilla-La Mancha, CA Castilla y León, CA Cataluña, CA Ceuta, CA Comunidad Valenciana, CA Extremadura, CA Galicia, CA Comunidad de Madrid, CA Melilla, CA Región de Murcia, CA Comunidad Foral de Navarra, CA País Vasco, CA La Rioja, Albacete, Alicante, Almería, Álava, Asturias, Ávila, Badajoz, Baleares, Barcelona, Bizkaia, Burgos, Cáceres, Cádiz, Cantabria, Castellón, Ciudad Real, Córdoba, A Coruña, Cuenca, Gipuzkoa, Girona, Granada, Guadalajara, Huelva, Huesca, Jaén, León, Lleida, Lugo, Madrid, Málaga, Murcia, Navarra, Ourense, Palencia, Las Palmas, Pontevedra, La Rioja, Salamanca, Santa Cruz de Tenerife, Segovia, Sevilla, Soria, Tarragona, Teruel, Toledo, Valencia, Valladolid, Zamora, Zaragoza, Ceuta, Melilla

DATA ITEMS:  prec, presMax, presMin, racha, sol, tmax, tmed, tmin, velmedia, altitud, dir, ia14, daily_cases_avg7, num_casos_prueba_pcr_avg

In [4]:
%%time
start = pd.to_datetime('2020-11-01', format='%Y-%m-%d')
end = pd.to_datetime('2020-12-30', format='%Y-%m-%d')

temporal_data = COnVIDa.get_data_items(regions=all_regions,
                          data_items=all_data_items,
                          start_date=start,
                          end_date=end,
                          language='internal',
                          errors='raise')

temporal_data.to_hdf(path_or_buf=f'data/cache_{str(end)[0:10]}.h5',key='temporal',mode='a')

Assumed a TEMPORAL data retrieval...


  percent = (df.loc[idx, ('España', param)] * 100) / valor
  percent = (df.loc[idx, ('España', param)] * 100) / valor


Wall time: 2min 54s


In [5]:
%%time

## Read info of generated cache
temporal_data = pd.read_hdf(path_or_buf=f'data/cache_{str(end)[0:10]}.h5',
                            key='temporal',
                            mode='r')
temporal_data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 60 entries, 2020-11-01 to 2020-12-30
Freq: D
Columns: 2383 entries, ('A Coruña', 'accumulated_lethality') to ('Ávila', 'recovered')
dtypes: float64(2364), int64(19)
memory usage: 1.1 MB
Wall time: 216 ms


### Geographical

In [6]:
print('UPDATING CACHÉ...')
print()

# Regions reported by the library for country code 'ES'.
all_regions = Regions.get_regions('ES')
print('REGIONS: ', ', '.join(all_regions))
print()

# Flatten the per-key collections of geographical item names into one list.
# Note: this rebinds `all_data_items`, replacing whatever it held before.
datasources = COnVIDa.get_data_items_names(
    DataType.GEOGRAPHICAL,
    language='internal',
)
all_data_items = [
    item_name
    for source_items in datasources.values()
    for item_name in source_items
]
print('DATA ITEMS: ', ', '.join(all_data_items))

UPDATING CACHÉ...

REGIONS:  España, CA Andalucía, CA Aragón, CA Principado de Asturias, CA Islas Baleares, CA Canarias, CA Cantabria, CA Castilla-La Mancha, CA Castilla y León, CA Cataluña, CA Ceuta, CA Comunidad Valenciana, CA Extremadura, CA Galicia, CA Comunidad de Madrid, CA Melilla, CA Región de Murcia, CA Comunidad Foral de Navarra, CA País Vasco, CA La Rioja, Albacete, Alicante, Almería, Álava, Asturias, Ávila, Badajoz, Baleares, Barcelona, Bizkaia, Burgos, Cáceres, Cádiz, Cantabria, Castellón, Ciudad Real, Córdoba, A Coruña, Cuenca, Gipuzkoa, Girona, Granada, Guadalajara, Huelva, Huesca, Jaén, León, Lleida, Lugo, Madrid, Málaga, Murcia, Navarra, Ourense, Palencia, Las Palmas, Pontevedra, La Rioja, Salamanca, Santa Cruz de Tenerife, Segovia, Sevilla, Soria, Tarragona, Teruel, Toledo, Valencia, Valladolid, Zamora, Zaragoza, Ceuta, Melilla

DATA ITEMS:  actividad_fisica, imc, tabaco, hogares_tipo_familia, hogares_densidad_ocupacion, mayores_65_solos


In [7]:
%%time
geographical_data = COnVIDa.get_data_items(regions=all_regions,
                              data_items=all_data_items,
                              language='internal')

geographical_data.to_hdf(path_or_buf=f'data/cache_{str(end)[0:10]}.h5',key='geographical',mode='a')

Assumed a GEOGRAPHICAL data retrieval...
Wall time: 4.52 s


In [8]:
%%time

## Read info of generated cache

geographical_data = pd.read_hdf(path_or_buf=f'data/cache_{str(end)[0:10]}.h5',
                            key='geographical',
                            mode='r')
geographical_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19 entries, CA Andalucía to Melilla
Data columns (total 35 columns):
 #   Column                                                               Non-Null Count  Dtype  
---  ------                                                               --------------  -----  
 0   actividad_fisica (Nivel alto)                                        19 non-null     float64
 1   actividad_fisica (Nivel bajo)                                        19 non-null     float64
 2   actividad_fisica (Nivel moderado)                                    19 non-null     float64
 3   actividad_fisica (No consta)                                         19 non-null     float64
 4   hogares_densidad_ocupacion (60 m2 o más por ocupante)                19 non-null     float64
 5   hogares_densidad_ocupacion (Entre 10 y menos de 20 m2 por ocupante)  19 non-null     float64
 6   hogares_densidad_ocupacion (Entre 20 y menos de 30 m2 por ocupante)  19 non-null     float64
 7  

## Last update info

In [9]:
# One row per data source name; 'last_update' starts out as None for each.
data_source_names = [
    'AEMETDataSource',
    'COVID19DataSource',
    'INEDataSource',
    'MobilityDataSource',
    'MoMoDataSource',
]
lu = {'last_update': [None] * len(data_source_names)}
dfu = pd.DataFrame(lu, index=data_source_names)

# Written under the 'last_updates' key of the cache file named after `end`,
# which must already be defined in the kernel.
last_updates_target = f'data/cache_{str(end)[0:10]}.h5'
dfu.to_hdf(path_or_buf=last_updates_target, key='last_updates', mode='a')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['last_update'], dtype='object')]

  encoding=encoding,
