In [1]:
import os
from datetime import datetime, timedelta

import numpy as np
import pandas as pd

In [4]:
# The project modules live under `utils.scripts` when the notebook is run from
# the repository root, and directly under the current directory otherwise.
# Only an import failure should trigger the fallback: any other exception
# raised while importing these modules is a real error and must not be hidden.
try:
    import utils.scripts.data_collection.data.peru_data_v2 as peru_data
    import utils.scripts.data_collection.data.ecuador_data_v2 as ecuador_data
    import utils.scripts.data_collection.data.cuba_data_v2 as cuba_data
    import utils.scripts.data_time_series.time_series_generator as time_series_generator
except ImportError as e:
    print('Exception fixed: ',e)
    import data_collection.data.peru_data_v2 as peru_data
    import data_collection.data.ecuador_data_v2 as ecuador_data
    import data_collection.data.cuba_data_v2 as cuba_data
    import data_time_series.time_series_generator as time_series_generator

Exception fixed:  No module named 'utils'


In [2]:
# Local directory of daily report files; it is walked by generate_list_dates
# and used as the destination when a report is missing upstream.
PATH_DSRP_DAILY_REPORTS = '../../latam_covid_19_data/daily_reports/'
# Remote CSV template that is loaded with pd.read_csv to build df_template.
DATA_TEMPLATE_URL = 'https://raw.githubusercontent.com/DataScienceResearchPeru/covid-19_latinoamerica/master/latam_covid_19_data/templates/daily_report.csv'
# Per-country directories, relative to the notebook's working directory.
# NOTE(review): presumably where the country loaders write their temporal
# CSVs -- confirm against the data_collection modules.
PATH_CUBA = 'data_collection/data/cuba_temporal/'
PATH_ECUADOR = 'data_collection/data/ecuador_temporal/'
PATH_PERU = 'data_collection/data/peru_temporal/'


In [3]:

def generate_list_dates(path, base=None):
    """Build a descending list of dates, one per report file found in `path`.

    The number of dates equals the number of files directly inside `path`
    minus one (one file is assumed to be the README). The dates are NOT
    parsed from the file names: they are generated by counting back one day
    at a time from `base`.

    Parameters
    ----------
    path : str
        Directory holding the daily report files.
    base : datetime.date or datetime.datetime, optional
        Most recent date of the list. Defaults to today's date, which is the
        original behaviour; pass an explicit value for reproducible results.

    Returns
    -------
    tuple[list[str], list[str]]
        `(date_list_csv, date_list)`: ISO dates with and without the `.csv`
        suffix, most recent first.

    Raises
    ------
    FileNotFoundError
        If `path` cannot be walked as a directory (previously this surfaced
        as a bare StopIteration).
    """
    # os.walk silently yields nothing for a missing or non-directory path.
    walked = next(os.walk(path), None)
    if walked is None:
        raise FileNotFoundError(
            'Cannot list daily reports: {!r} is not a readable directory'.format(path))
    files = walked[2]
    numero_archivos = len(files)
    # One file is the README; never go below zero for an empty directory.
    numdays = max(numero_archivos - 1, 0)
    print('There is {} files on the path and one is README. We iterate {} times...'.format(
        numero_archivos, numdays))
    # dates
    if base is None:
        base = datetime.today().date()
    elif isinstance(base, datetime):
        # Keep only the calendar date so str() yields YYYY-MM-DD.
        base = base.date()
    date_list = [str(base - timedelta(days=x)) for x in range(numdays)]
    date_list_csv = [day + '.csv' for day in date_list]
    print('Adding {} dates in a list...'.format(len(date_list_csv)))
    print("List of dates:", date_list)
    return date_list_csv, date_list



In [7]:

def load_all_data_temporal(list_date_list):
    """Run every country loader on the same list of dates.

    Calls `load_and_generatecsv(list_date_list)` on the Cuba, Peru and
    Ecuador data modules, in that order, then prints a completion banner.
    """
    for country_module in (cuba_data, peru_data, ecuador_data):
        country_module.load_and_generatecsv(list_date_list)

    print("------------------------ALL TEMPORALS CREATED----------------------------")



In [13]:
# Download the daily-report template and show missing cells as empty strings.
df_template = pd.read_csv(DATA_TEMPLATE_URL).fillna('')
df_template

Unnamed: 0,ISO 3166-2 Code,Country,Subdivision,Last Update,Confirmed,Deaths,Recovered
0,AR-B,Argentina,Buenos Aires,,,,
1,AR-K,Argentina,Catamarca,,,,
2,AR-H,Argentina,Chaco,,,,
3,AR-U,Argentina,Chubut,,,,
4,AR-C,Argentina,Ciudad Autonoma de Buenos Aires,,,,
...,...,...,...,...,...,...,...
403,VE-S,Venezuela,Tachira,,,,
404,VE-T,Venezuela,Trujillo,,,,
405,VE-X,Venezuela,Vargas,,,,
406,VE-U,Venezuela,Yaracuy,,,,


In [20]:
# One date per file in the local reports directory (minus the README),
# counted back from today; most recent date first.
date_list_csv, date_list = generate_list_dates(PATH_DSRP_DAILY_REPORTS)

There is 447 files on the path and one is README. We iterate 446 times...
Adding 446 dates in a list...
List of dates: ['2021-05-15', '2021-05-14', '2021-05-13', '2021-05-12', '2021-05-11', '2021-05-10', '2021-05-09', '2021-05-08', '2021-05-07', '2021-05-06', '2021-05-05', '2021-05-04', '2021-05-03', '2021-05-02', '2021-05-01', '2021-04-30', '2021-04-29', '2021-04-28', '2021-04-27', '2021-04-26', '2021-04-25', '2021-04-24', '2021-04-23', '2021-04-22', '2021-04-21', '2021-04-20', '2021-04-19', '2021-04-18', '2021-04-17', '2021-04-16', '2021-04-15', '2021-04-14', '2021-04-13', '2021-04-12', '2021-04-11', '2021-04-10', '2021-04-09', '2021-04-08', '2021-04-07', '2021-04-06', '2021-04-05', '2021-04-04', '2021-04-03', '2021-04-02', '2021-04-01', '2021-03-31', '2021-03-30', '2021-03-29', '2021-03-28', '2021-03-27', '2021-03-26', '2021-03-25', '2021-03-24', '2021-03-23', '2021-03-22', '2021-03-21', '2021-03-20', '2021-03-19', '2021-03-18', '2021-03-17', '2021-03-16', '2021-03-15', '2021-03-14'

In [25]:
# Save the current template frame to the working directory, without the index.
# NOTE(review): despite the file name, this notebook never fills the frame
# with zeros -- confirm the name is intended.
df_template.to_csv('withzeros.csv',index=False)

In [26]:
# Check that the ten most recent daily reports exist in the upstream
# repository; when one does not, create it locally from the empty template.
import requests  # imported once here rather than on every loop iteration

for d in date_list[0:10]:  # date_list
    URL = f"https://raw.githubusercontent.com/DataScienceResearchPeru/covid-19_latinoamerica/master/latam_covid_19_data/daily_reports/{d}.csv"

    try:
        # HEAD is enough to test existence; the timeout keeps a stalled
        # connection from hanging the notebook indefinitely.
        response = requests.head(URL, timeout=10)
    except requests.RequestException as e:
        print(f"NOT OK: {str(e)}")
    else:
        if response.status_code == 200:
            print("OK")
        else:
            print(f"NOT OK: HTTP response code {response.status_code}")
            print(f'Creating file {d}.csv')
            # Write to <reports dir>/<date>.csv. The original passed the
            # directory itself to to_csv, which cannot create the file.
            df_template.to_csv(
                os.path.join(PATH_DSRP_DAILY_REPORTS, f'{d}.csv'), index=False)


OK
OK
OK
OK
OK
OK
OK
OK
OK
OK


In [3]:

def fix_format(df):
    """Render the count columns as integer strings, leaving blanks empty.

    Missing values in the whole frame are replaced by ''. Then each value of
    'Confirmed', 'Deaths' and 'Recovered' becomes `str(int(float(value)))`
    (for example 27339.0 -> '27339'), while '' stays ''.

    The conversion is done per column, so it works with any index. The
    previous row-by-row version looked rows up with `df.loc[m]` for
    m = 0..len(df)-1 and failed on frames without a default integer index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing 'Confirmed', 'Deaths' and 'Recovered' columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.

    Raises
    ------
    KeyError
        If one of the three count columns is missing.
    ValueError
        If a non-empty value cannot be parsed as a number.
    """
    # fillna returns a new frame, so the caller's data is left untouched.
    df = df.fillna('')

    def _as_int_text(value):
        # Going through float() accepts both 486.0 and '486.0'.
        return '' if value == '' else str(int(float(value)))

    for column in ('Confirmed', 'Deaths', 'Recovered'):
        # Replace the whole column at once; astype(object) keeps the strings
        # as plain Python str regardless of the original numeric dtype.
        df[column] = df[column].map(_as_int_text).astype(object)

    return df

In [4]:
# Reload the template keyed by subdivision code so that country data can be
# aligned onto it by index.
df_template = pd.read_csv(DATA_TEMPLATE_URL).set_index('ISO 3166-2 Code')
df_template.head()

Unnamed: 0_level_0,Country,Subdivision,Last Update,Confirmed,Deaths,Recovered
ISO 3166-2 Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AR-B,Argentina,Buenos Aires,,,,
AR-K,Argentina,Catamarca,,,,
AR-H,Argentina,Chaco,,,,
AR-U,Argentina,Chubut,,,,
AR-C,Argentina,Ciudad Autonoma de Buenos Aires,,,,


In [5]:
# Inspect column dtypes and non-null counts of the indexed template.
df_template.info()

<class 'pandas.core.frame.DataFrame'>
Index: 408 entries, AR-B to VE-V
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Country      408 non-null    object 
 1   Subdivision  408 non-null    object 
 2   Last Update  0 non-null      float64
 3   Confirmed    0 non-null      float64
 4   Deaths       0 non-null      float64
 5   Recovered    0 non-null      float64
dtypes: float64(4), object(2)
memory usage: 22.3+ KB


In [7]:
# Read one Peru temporal report and key it by subdivision code, matching the
# index used by df_template.
DATA_TEMPORAL_PERU = 'data_collection/data/peru_temporal/2021-05-13.csv'
df = pd.read_csv(DATA_TEMPORAL_PERU).set_index('ISO 3166-2 Code')
df.head()

Unnamed: 0_level_0,Country,Subdivision,Last Update,Confirmed,Deaths,Recovered
ISO 3166-2 Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
PE-AMA,Peru,Amazonas,2021-05-15T22:13:38.661973,27339,486,
PE-ANC,Peru,Ancash,2021-05-15T22:13:38.661973,66090,2824,
PE-APU,Peru,Apurimac,2021-05-15T22:13:38.661973,21116,542,
PE-ARE,Peru,Arequipa,2021-05-15T22:13:38.661973,76480,2632,
PE-AYA,Peru,Ayacucho,2021-05-15T22:13:38.661973,27457,789,


In [9]:
# Inspect column dtypes and non-null counts of the Peru report.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25 entries, PE-AMA to PE-UCA
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Country      25 non-null     object 
 1   Subdivision  25 non-null     object 
 2   Last Update  25 non-null     object 
 3   Confirmed    25 non-null     int64  
 4   Deaths       25 non-null     int64  
 5   Recovered    0 non-null      float64
dtypes: float64(1), int64(2), object(3)
memory usage: 1.4+ KB


In [8]:
# Copy the non-missing Peru values into the template rows that share the same
# index label. DataFrame.update modifies df_template in place and returns None.
df_template.update(df)
df_template


Unnamed: 0_level_0,Country,Subdivision,Last Update,Confirmed,Deaths,Recovered
ISO 3166-2 Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AR-B,Argentina,Buenos Aires,,,,
AR-K,Argentina,Catamarca,,,,
AR-H,Argentina,Chaco,,,,
AR-U,Argentina,Chubut,,,,
AR-C,Argentina,Ciudad Autonoma de Buenos Aires,,,,
...,...,...,...,...,...,...
VE-S,Venezuela,Tachira,,,,
VE-T,Venezuela,Trujillo,,,,
VE-X,Venezuela,Vargas,,,,
VE-U,Venezuela,Yaracuy,,,,


In [9]:
# Move 'ISO 3166-2 Code' back from the index into a regular column.
df_template = df_template.reset_index()

In [14]:
# Display the template after the index reset (default integer index).
df_template

Unnamed: 0,ISO 3166-2 Code,Country,Subdivision,Last Update,Confirmed,Deaths,Recovered
0,AR-B,Argentina,Buenos Aires,,,,
1,AR-K,Argentina,Catamarca,,,,
2,AR-H,Argentina,Chaco,,,,
3,AR-U,Argentina,Chubut,,,,
4,AR-C,Argentina,Ciudad Autonoma de Buenos Aires,,,,
...,...,...,...,...,...,...,...
403,VE-S,Venezuela,Tachira,,,,
404,VE-T,Venezuela,Trujillo,,,,
405,VE-X,Venezuela,Vargas,,,,
406,VE-U,Venezuela,Yaracuy,,,,


In [10]:

# Show only the rows whose subdivision code contains 'PE-'.
df_template[df_template['ISO 3166-2 Code'].str.contains('PE-')]

Unnamed: 0,ISO 3166-2 Code,Country,Subdivision,Last Update,Confirmed,Deaths,Recovered
338,PE-AMA,Peru,Amazonas,2021-05-15T22:13:38.661973,27339.0,486.0,
339,PE-ANC,Peru,Ancash,2021-05-15T22:13:38.661973,66090.0,2824.0,
340,PE-APU,Peru,Apurimac,2021-05-15T22:13:38.661973,21116.0,542.0,
341,PE-ARE,Peru,Arequipa,2021-05-15T22:13:38.661973,76480.0,2632.0,
342,PE-AYA,Peru,Ayacucho,2021-05-15T22:13:38.661973,27457.0,789.0,
343,PE-CAJ,Peru,Cajamarca,2021-05-15T22:13:38.661973,54935.0,1367.0,
344,PE-CAL,Peru,Callao,2021-05-15T22:13:38.661973,93428.0,3345.0,
345,PE-CUS,Peru,Cusco,2021-05-15T22:13:38.661973,58130.0,1307.0,
346,PE-HUV,Peru,Huancavelica,2021-05-15T22:13:38.661973,13855.0,406.0,
347,PE-HUC,Peru,Huanuco,2021-05-15T22:13:38.661973,30858.0,1014.0,


In [11]:
# Rewrite the count columns as integer strings ('' where missing), then show
# the rows whose subdivision code contains 'PE-' to check the result.
df_template=fix_format(df_template)
df_template.loc[df_template['ISO 3166-2 Code'].str.contains('PE-')]

Unnamed: 0,ISO 3166-2 Code,Country,Subdivision,Last Update,Confirmed,Deaths,Recovered
338,PE-AMA,Peru,Amazonas,2021-05-15T22:13:38.661973,27339,486,
339,PE-ANC,Peru,Ancash,2021-05-15T22:13:38.661973,66090,2824,
340,PE-APU,Peru,Apurimac,2021-05-15T22:13:38.661973,21116,542,
341,PE-ARE,Peru,Arequipa,2021-05-15T22:13:38.661973,76480,2632,
342,PE-AYA,Peru,Ayacucho,2021-05-15T22:13:38.661973,27457,789,
343,PE-CAJ,Peru,Cajamarca,2021-05-15T22:13:38.661973,54935,1367,
344,PE-CAL,Peru,Callao,2021-05-15T22:13:38.661973,93428,3345,
345,PE-CUS,Peru,Cusco,2021-05-15T22:13:38.661973,58130,1307,
346,PE-HUV,Peru,Huancavelica,2021-05-15T22:13:38.661973,13855,406,
347,PE-HUC,Peru,Huanuco,2021-05-15T22:13:38.661973,30858,1014,


In [16]:
# Write the formatted frame to 'prueba.csv' in the working directory, without
# the index column.
df_template.to_csv('prueba.csv',index=False)