In [1]:
import pandas as pd
import numpy as np
import torch
import os
from dateutil.parser import parse 


In [2]:
# Country codes in the row order used by the time-series CSV files; a code's
# position in this list is the id stored in country_code2id.
country_codes = [
    'ABW', 'AFG', 'AGO', 'ALB', 'AND', 'ARE', 'ARG', 'AUS', 'AUT', 'AZE',
    'BDI', 'BEL', 'BEN', 'BFA', 'BGD', 'BGR', 'BHR', 'BHS', 'BIH', 'BLR',
    'BLZ', 'BMU', 'BOL', 'BRA', 'BRB', 'BRN', 'BTN', 'BWA', 'CAF', 'CAN',
    'CHE', 'CHL', 'CHN', 'CIV', 'CMR', 'COD', 'COG', 'COL', 'COM', 'CPV',
    'CRI', 'CUB', 'CYP', 'CZE', 'DEU', 'DJI', 'DMA', 'DNK', 'DOM', 'DZA',
    'ECU', 'EGY', 'ERI', 'ESP', 'EST', 'ETH', 'FIN', 'FJI', 'FRA', 'FRO',
    'GAB', 'GBR', 'GEO', 'GHA', 'GIN', 'GMB', 'GRC', 'GRL', 'GTM', 'GUM',
    'GUY', 'HKG', 'HND', 'HRV', 'HTI', 'HUN', 'IDN', 'IND', 'IRL', 'IRN',
    'IRQ', 'ISL', 'ISR', 'ITA', 'JAM', 'JOR', 'JPN', 'KAZ', 'KEN', 'KGZ',
    'KHM', 'KOR', 'KWT', 'LAO', 'LBN', 'LBR', 'LBY', 'LKA', 'LSO', 'LTU',
    'LUX', 'LVA', 'MAC', 'MAR', 'MCO', 'MDA', 'MDG', 'MEX', 'MLI', 'MMR',
    'MNG', 'MOZ', 'MRT', 'MUS', 'MWI', 'MYS', 'NAM', 'NER', 'NGA', 'NIC',
    'NLD', 'NOR', 'NPL', 'NZL', 'OMN', 'PAK', 'PAN', 'PER', 'PHL', 'PNG',
    'POL', 'PRI', 'PRT', 'PRY', 'PSE', 'QAT', 'RKS', 'ROU', 'RUS', 'RWA',
    'SAU', 'SDN', 'SEN', 'SGP', 'SLB', 'SLE', 'SLV', 'SMR', 'SOM', 'SRB',
    'SSD', 'SUR', 'SVK', 'SVN', 'SWE', 'SWZ', 'SYC', 'SYR', 'TCD', 'TGO',
    'THA', 'TJK', 'TKM', 'TLS', 'TTO', 'TUN', 'TUR', 'TWN', 'TZA', 'UGA',
    'UKR', 'URY', 'USA', 'UZB', 'VEN', 'VIR', 'VNM', 'VUT', 'YEM', 'ZAF',
    'ZMB', 'ZWE',
]

# Time-series CSV files read from the 'timeseries' directory: eight policy
# indicators (c1..c8) followed by the confirmed-case counts.
filenames = [
    "c1_school_closing.csv",
    "c2_workplace_closing.csv",
    "c3_cancel_public_events.csv",
    "c4_restrictions_on_gatherings.csv",
    "c5_close_public_transport.csv",
    "c6_stay_at_home_requirements.csv",
    "c7_movementrestrictions.csv",
    "c8_internationaltravel.csv",
    "confirmed_cases.csv",
]

def dateConvertor(date):
    """Return ``date`` re-formatted as a ``YYYY-MM-DD`` string.

    The input string is parsed with ``dateutil.parser.parse``; whatever
    that call raises for an unparseable string propagates to the caller.
    """
    return parse(date).strftime('%Y-%m-%d')

# Map every country code to its position in country_codes.
# The plain `for` loop (rather than a comprehension) is intentional: it leaves
# `i` bound at module level, and a later cell in this notebook reads `i`.
country_code2id = {}
for i, code in enumerate(country_codes):
    country_code2id[code] = i

# date extraction: the column labels of the first time-series file, from the
# fourth column onward, are parsed as dates and normalised by dateConvertor.
date_labels = pd.read_csv(os.path.join('timeseries', filenames[0])).keys()[3:]
npi_date = pd.DataFrame({'Date': [dateConvertor(label) for label in date_labels]})

In [34]:
# Per-indicator time series, keyed by file name without the '.csv' suffix,
# plus a derived 'growth_rate' entry (rows = dates, columns = selected countries).
dataframes = {}

# countries code for which you want data.
countries_to_extract = ['ITA','IND','USA','CHN','BRA','IRN','CAN','GBR',
                        'FRA','ESP','BEL','DEU','NLD','MEX','TUR','SWE','ECU','RUS','PER','CHE']
index = [country_code2id[code] for code in countries_to_extract]

# Row window kept from every time series (original note: removing Jan, Feb and Dec data).
WINDOW_START, WINDOW_END = 64, 335

## reading and processing the country-wise static data
static_data = pd.read_csv(os.path.join('timeseries', 'Consolidated - Consolidated.csv')).T[2:][index].T.to_numpy()
tmp = static_data[:, 4:]
tmp_min = np.min(tmp, axis=-1).reshape(-1, 1)
# final static data = [len(countries_to_extract), 6]: the first four static columns
# followed by the row-wise minimum of the remaining columns, twice.
# NOTE(review): the same minimum is appended twice, so the last two columns are
# identical. This looks like it was meant to be two different statistics
# (e.g. min and max) -- confirm before changing; the values are kept as-is here.
final_static_data = np.concatenate((static_data[:, 0:4], tmp_min, tmp_min), axis=-1).astype(np.float64)


def _load_country_timeseries(file):
    """Read one time-series CSV and return a numeric date x country frame.

    The CSV is transposed and its first three rows are dropped, the remaining
    rows are indexed by the dates in ``npi_date['Date']``, only the columns
    listed in ``index`` are kept, and every column is converted with
    ``pd.to_numeric(errors='coerce')`` (unparseable cells become NaN).
    No row slicing or interpolation is done here.
    """
    table = pd.read_csv(os.path.join('timeseries', file)).T.iloc[3:]
    # assign() returns a new frame, so no column is set on a slice of another frame.
    table = table.assign(Date=npi_date['Date'].values).set_index('Date', drop=True)
    return table[index].apply(pd.to_numeric, errors='coerce')


for file in filenames:
    numeric_df = _load_country_timeseries(file)

    # Slice first, then interpolate, so gaps are filled only from values inside the window.
    npi_df = numeric_df.iloc[WINDOW_START:WINDOW_END].interpolate(method='linear')
    dataframes[file[:-4]] = npi_df

    if file[:-4] == 'confirmed_cases':
        # Growth rate is computed on the full series before slicing:
        # interpolate gaps, smooth with a 7-row rolling mean, then take the
        # row-to-row change as a percentage of the current smoothed value.
        smoothed = numeric_df.interpolate(method='linear').rolling(7).mean()
        npi_df = (100 * smoothed.diff() / smoothed).iloc[WINDOW_START:WINDOW_END]
        dataframes['growth_rate'] = npi_df
    

In [41]:
def readData(attributes, history, date):
    """Build one (features, target) pair covering every selected country.

    Parameters
    ----------
    attributes : sequence of str
        Keys of ``dataframes`` whose history windows form the features.
    history : int
        Number of rows immediately before ``date`` to include per attribute.
    date : str
        Index label of the target row, looked up in
        ``dataframes['c1_school_closing']``.

    Returns
    -------
    x : torch.Tensor (double)
        One row per country. Each row is the history window flattened with
        the time step varying slowest and the attribute fastest, followed by
        that country's row of ``final_static_data``.
    y : torch.Tensor (double)
        The ``dataframes['growth_rate']`` row at ``date``.

    Raises
    ------
    KeyError
        If ``date`` is not in the index.
    ValueError
        If ``attributes`` is empty or fewer than ``history`` rows precede ``date``.
    """
    index = dataframes['c1_school_closing'].index.get_loc(date)
    if history > index:
        # Previously this path called sys.exit() without `sys` being imported.
        raise ValueError('Not sufficient history')
    if len(attributes) == 0:
        raise ValueError('At least one attribute is required')

    # One (history, countries) window per attribute, stacked on a new last axis
    # -> (history, countries, attributes). Also valid for a single attribute.
    windows = [dataframes[att].iloc[index - history:index].values for att in attributes]
    data = np.stack(windows, axis=-1)

    # (countries, history, attributes) flattened to (countries, history * attributes).
    # reshape() is used because the permuted tensor is not guaranteed to have a
    # memory layout that view() accepts.
    dynamic = (torch.from_numpy(data).to(dtype=torch.double)
               .permute(1, 0, 2)
               .reshape(len(countries_to_extract), -1))

    # including static data
    x = torch.cat((dynamic, torch.from_numpy(final_static_data).to(dtype=torch.double)), dim=-1)

    y = torch.from_numpy(dataframes['growth_rate'].iloc[index].values).to(dtype=torch.double)

    return x, y

In [42]:
# Keys of `dataframes` used as model inputs.
attributes = [
    "c1_school_closing",
    "c2_workplace_closing",
    "c3_cancel_public_events",
    "c4_restrictions_on_gatherings",
    "c5_close_public_transport",
    "c6_stay_at_home_requirements",
    "c7_movementrestrictions",
    "c8_internationaltravel",
    "confirmed_cases",
]
history = 21
# (total allowed range for dates) -- starts at 85 = 64 + history, i.e. the first
# row of the 64:335 window that has `history` rows before it.
dates = npi_date['Date'][85:335].values

# Choose the sample date explicitly instead of reading the loop variable `i`
# left behind by an earlier cell. The expression reproduces the value `i` has
# after that loop on a fresh top-to-bottom run; any position within `dates`
# is a valid choice here.
sample_idx = len(country_codes) - 1
x, y = readData(attributes=attributes, history=history, date=dates[sample_idx])

In [43]:
x.shape
# Expected shapes: x = [20, 195], y = [20]
#   20  = number of countries in countries_to_extract
#   195 = history * len(attributes) + static-feature columns = 21 * 9 + 6

torch.Size([20, 195])

In [31]:
# y holds one growth-rate value per country in countries_to_extract.
y.shape

torch.Size([20])