# Creation of the SIRD Table

In [1]:
import matplotlib.pyplot as plt 
%matplotlib inline
import pandas as pd
import numpy as np
from IPython import display

## 1. SIRD Raw Table

In [2]:
import os
list_files = os.listdir()  # List of files in current directory

import re
files_desired = []

# Loop for obtaining the specific files with the desired information
for file in list_files:
    result = re.findall('[0-9A-Za-z()_]*_Spain.csv$', file)
    if result:
        result = ''.join(result)
        files_desired.append(result)
        
files_desired

['Confirmed_Spain.csv',
 'Deaths_Spain.csv',
 'Recovered_Spain.csv',
 'SIRD(Raw)_Spain.csv',
 'SIRD_Spain.csv',
 'Susceptibles_Spain.csv']

In [3]:
# Labels of the columns from which I want to extract values
labels_desired = ['Accumulated Alive Confirmed in 14 days', 'Total Recovered',
                  'Total Deaths Fixed (sum New Deaths)', 'Total Susceptibles']

In [4]:
# Obtaining a list with the arrays of Confirmed, Deaths and Recovered

values = []
for i in range(len(files_desired)):
    df = pd.read_csv(files_desired[i])  # Creation of dataframe for each file
    # Try every label: only 1 is going to match with each file/dataframe
    for j in range(len(labels_desired)):
        try:
            values.append(df[labels_desired[j]].to_numpy())
        except Exception:
            pass

# First is Confirmed, Second is Deaths, Third is Recovered, Fourth is Susceptibles
len(values)

4

In [5]:
# Creation of individual arrays
I = values[0]
D = values[1]
R = values[2]
S = values[3]
len(I), I[-1], len(D), D[-1], len(R), R[-1], len(S), S[-1]

(787, 173683, 787, 103973, 787, 11185964, 787, 35868994)

In [6]:
# Arrays which will form the final dataframe
iso_code = df.values[:, 0]
continent = df.values[:, 1]
region = df.values[:, 2]
dates = df.values[:, 3]

iso_code[0:5], continent[0:5], region[0:5], dates[0:5]

(array(['ESP', 'ESP', 'ESP', 'ESP', 'ESP'], dtype=object),
 array(['Europe', 'Europe', 'Europe', 'Europe', 'Europe'], dtype=object),
 array(['Spain', 'Spain', 'Spain', 'Spain', 'Spain'], dtype=object),
 array(['2020-01-28', '2020-01-29', '2020-01-30', '2020-01-31',
        '2020-02-01'], dtype=object))

In [7]:
# Array with total population for each time (should be constant)
N = np.empty(len(S), dtype = object)

for i in range(len(S)):
    N[i] = S[i] + I[i] + R[i] + D[i]
    
len(N), N[0:20], N[100:120]

(787,
 array([47332614, 47332614, 47332614, 47332614, 47332614, 47332614,
        47332614, 47332614, 47332614, 47332614, 47332614, 47332614,
        47332614, 47332614, 47332614, 47332614, 47332614, 47332614,
        47332614, 47332614], dtype=object),
 array([47332614, 47332614, 47332614, 47332614, 47332614, 47332614,
        47332614, 47332614, 47332614, 47332614, 47332614, 47332614,
        47332614, 47332614, 47332614, 47332614, 47332614, 47332614,
        47332614, 47332614], dtype=object))

In [8]:
# Array with total population minus total deaths
N_no_deaths = np.empty(len(S), dtype = object)

for i in range(len(S)):
    N_no_deaths[i] = S[i] + I[i] + R[i]
    
len(N_no_deaths), N_no_deaths[0:20], N_no_deaths[100:120]

(787,
 array([47332614, 47332614, 47332614, 47332614, 47332614, 47332614,
        47332614, 47332614, 47332614, 47332614, 47332614, 47332614,
        47332614, 47332614, 47332614, 47332614, 47332614, 47332614,
        47332614, 47332614], dtype=object),
 array([47306544, 47306315, 47306136, 47305993, 47305870, 47305694,
        47305510, 47305293, 47305155, 47305051, 47305051, 47304905,
        47304836, 47304726, 47304674, 47303986, 47303936, 47303862,
        47303862, 47303579], dtype=object))

In [9]:
# Array concatenation for building the final dataframe

columns = 10  # Number of columns
X_total = np.empty((len(S), columns), dtype = object)

for i in range(len(S)):
    X_total[i] = [iso_code[i], continent[i], region[i], dates[i], S[i], I[i], R[i], D[i], N[i], N_no_deaths[i]]
    
X_total.shape, X_total[0:10]

((787, 10),
 array([['ESP', 'Europe', 'Spain', '2020-01-28', 47332614, 0, 0, 0,
         47332614, 47332614],
        ['ESP', 'Europe', 'Spain', '2020-01-29', 47332614, 0, 0, 0,
         47332614, 47332614],
        ['ESP', 'Europe', 'Spain', '2020-01-30', 47332614, 0, 0, 0,
         47332614, 47332614],
        ['ESP', 'Europe', 'Spain', '2020-01-31', 47332614, 0, 0, 0,
         47332614, 47332614],
        ['ESP', 'Europe', 'Spain', '2020-02-01', 47332613, 1, 0, 0,
         47332614, 47332614],
        ['ESP', 'Europe', 'Spain', '2020-02-02', 47332613, 1, 0, 0,
         47332614, 47332614],
        ['ESP', 'Europe', 'Spain', '2020-02-03', 47332613, 1, 0, 0,
         47332614, 47332614],
        ['ESP', 'Europe', 'Spain', '2020-02-04', 47332613, 1, 0, 0,
         47332614, 47332614],
        ['ESP', 'Europe', 'Spain', '2020-02-05', 47332613, 1, 0, 0,
         47332614, 47332614],
        ['ESP', 'Europe', 'Spain', '2020-02-06', 47332613, 1, 0, 0,
         47332614, 47332614]], dtype=o

In [10]:
# Labels of each column of the dataframe
labels = ['ISO_code', 'Continent', 'Country/Region', 'Date', 'Susceptibles Raw (S)', 'Infected Raw (I)',
          'Recovered Raw (R)', 'Deaths Raw (D)', 'Total Population (N)', 'Total Population Without Deahts (N Alive)']

In [11]:
# Creation of the final dataframe
df_SIRD_raw = pd.DataFrame(data=X_total, columns=labels)
df_SIRD_raw

Unnamed: 0,ISO_code,Continent,Country/Region,Date,Susceptibles Raw (S),Infected Raw (I),Recovered Raw (R),Deaths Raw (D),Total Population (N),Total Population Without Deahts (N Alive)
0,ESP,Europe,Spain,2020-01-28,47332614,0,0,0,47332614,47332614
1,ESP,Europe,Spain,2020-01-29,47332614,0,0,0,47332614,47332614
2,ESP,Europe,Spain,2020-01-30,47332614,0,0,0,47332614,47332614
3,ESP,Europe,Spain,2020-01-31,47332614,0,0,0,47332614,47332614
4,ESP,Europe,Spain,2020-02-01,47332613,1,0,0,47332614,47332614
...,...,...,...,...,...,...,...,...,...,...
782,ESP,Europe,Spain,2022-03-20,35923141,222919,11082931,103623,47332614,47228991
783,ESP,Europe,Spain,2022-03-21,35923141,187083,11118767,103623,47332614,47228991
784,ESP,Europe,Spain,2022-03-22,35868994,218016,11141631,103973,47332614,47228641
785,ESP,Europe,Spain,2022-03-23,35868994,196213,11163434,103973,47332614,47228641


In [12]:
# Export dataframe to a csv file

from pathlib import Path  
filepath = Path('SIRD(Raw)_Spain.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)  
df_SIRD_raw.to_csv(filepath, index=False)

## 2. SIRD Smooth Table

In [13]:
import os
list_files = os.listdir()  # List of files in current directory

import re
files_desired = []

# Loop for obtaining the specific files with the desired information
for file in list_files:
    result = re.findall('[0-9A-Za-z()_]*_Spain.csv$', file)
    if result:
        result = ''.join(result)
        files_desired.append(result)
        
files_desired

['Confirmed_Spain.csv',
 'Deaths_Spain.csv',
 'Recovered_Spain.csv',
 'SIRD(Raw)_Spain.csv',
 'SIRD_Spain.csv',
 'Susceptibles_Spain.csv']

In [14]:
# Labels of the columns from which I want to extract values
labels_desired = ['Accumulated Alive Confirmed Smooth in 14 days', 'Total Recovered Smooth (7 days)',
                  'Total Deaths Fixed Smooth (7 days)', 'Susceptibles Smooth (7 days average)']

In [15]:
# Obtaining a list with the arrays of Confirmed, Deaths and Recovered

values = []
for i in range(len(files_desired)):
    df = pd.read_csv(files_desired[i])  # Creation of dataframe for each file
    # Try every label: only 1 is going to match with each file/dataframe
    for j in range(len(labels_desired)):
        try:
            values.append(df[labels_desired[j]].to_numpy())
        except Exception:
            pass

# First is Confirmed, Second is Deaths, Third is Recovered, Fourth is Susceptibles
len(values)

4

In [16]:
# Creation of individual arrays
I = values[0]
D = values[1]
R = values[2]
S = values[3]

len(I), I[-1], len(D), D[-1], len(R), R[-1], len(S), S[-1]

(787,
 206250.2857142857,
 787,
 103773.0,
 787,
 11122655.57142857,
 787,
 35899935.14285715)

In [17]:
# Arrays which will form the final dataframe
iso_code = df.values[:, 0]
continent = df.values[:, 1]
region = df.values[:, 2]
dates = df.values[:, 3]

iso_code[0:5], continent[0:5], region[0:5], dates[0:5]

(array(['ESP', 'ESP', 'ESP', 'ESP', 'ESP'], dtype=object),
 array(['Europe', 'Europe', 'Europe', 'Europe', 'Europe'], dtype=object),
 array(['Spain', 'Spain', 'Spain', 'Spain', 'Spain'], dtype=object),
 array(['2020-01-28', '2020-01-29', '2020-01-30', '2020-01-31',
        '2020-02-01'], dtype=object))

In [18]:
# Array with total population for each time (should be constant)
N = np.empty(len(S), dtype = object)

for i in range(len(S)):
    N[i] = S[i] + I[i] + R[i] + D[i]
    
len(N), N[0:20], N[100:120]

(787,
 array([47332614.0, 47332614.0, 47332614.0, 47332614.0, 47332614.0,
        47332614.0, 47332614.00000001, 47332614.00000001, 47332614.0,
        47332614.0, 47332614.0, 47332614.0, 47332614.0, 47332614.0,
        47332614.00000001, 47332614.00000001, 47332614.0, 47332614.0,
        47332614.0, 47332614.0], dtype=object),
 array([47332614.00000001, 47332614.0, 47332614.00000001, 47332614.0,
        47332614.0, 47332614.0, 47332614.00000001, 47332614.0, 47332614.0,
        47332614.0, 47332614.0, 47332614.0, 47332614.0, 47332614.0,
        47332614.0, 47332614.0, 47332614.0, 47332614.0, 47332614.0,
        47332614.0], dtype=object))

In [19]:
# Array with total population minus total deaths
N_no_deaths = np.empty(len(S), dtype = object)

for i in range(len(S)):
    N_no_deaths[i] = S[i] + I[i] + R[i]
    
len(N_no_deaths), N_no_deaths[0:20], N_no_deaths[100:120]

(787,
 array([47332614.0, 47332614.0, 47332614.0, 47332614.0, 47332614.0,
        47332614.0, 47332614.00000001, 47332614.00000001, 47332614.0,
        47332614.0, 47332614.0, 47332614.0, 47332614.0, 47332614.0,
        47332614.00000001, 47332614.00000001, 47332614.0, 47332614.0,
        47332614.0, 47332614.0], dtype=object),
 array([47307203.28571429, 47306952.428571425, 47306755.57142858,
        47306561.71428572, 47306373.71428572, 47306187.0,
        47306008.857142866, 47305830.14285714, 47305664.428571425,
        47305509.428571425, 47305374.85714286, 47305237.0,
        47305114.428571425, 47305002.428571425, 47304914.0, 47304747.0,
        47304587.71428572, 47304417.85714286, 47304268.85714286,
        47304089.28571428], dtype=object))

In [20]:
# Array concatenation for building the final dataframe

columns = 10  # Number of columns
X_total = np.empty((len(S), columns), dtype = object)

for i in range(len(S)):
    X_total[i] = [iso_code[i], continent[i], region[i], dates[i], S[i], I[i], R[i], D[i], N[i], N_no_deaths[i]]
    
X_total.shape, X_total[0:10]

((787, 10),
 array([['ESP', 'Europe', 'Spain', '2020-01-28', 47332614.0, 0.0, 0.0,
         0.0, 47332614.0, 47332614.0],
        ['ESP', 'Europe', 'Spain', '2020-01-29', 47332614.0, 0.0, 0.0,
         0.0, 47332614.0, 47332614.0],
        ['ESP', 'Europe', 'Spain', '2020-01-30', 47332614.0, 0.0, 0.0,
         0.0, 47332614.0, 47332614.0],
        ['ESP', 'Europe', 'Spain', '2020-01-31', 47332614.0, 0.0, 0.0,
         0.0, 47332614.0, 47332614.0],
        ['ESP', 'Europe', 'Spain', '2020-02-01', 47332613.85714286,
         0.14285714285714285, 0.0, 0.0, 47332614.0, 47332614.0],
        ['ESP', 'Europe', 'Spain', '2020-02-02', 47332613.71428572,
         0.2857142857142857, 0.0, 0.0, 47332614.0, 47332614.0],
        ['ESP', 'Europe', 'Spain', '2020-02-03', 47332613.57142858,
         0.42857142857142855, 0.0, 0.0, 47332614.00000001,
         47332614.00000001],
        ['ESP', 'Europe', 'Spain', '2020-02-04', 47332613.42857143,
         0.5714285714285714, 0.0, 0.0, 47332614.00000001,
 

In [21]:
# Labels of each column of the dataframe
labels = ['ISO_code', 'Continent', 'Country/Region', 'Date', 'Susceptibles Smooth (S)', 'Infected Smooth (I)',
          'Recovered Smooth (R)', 'Deaths Smooth (D)', 'Total Population (N)', 'Total Population Without Deahts (N Alive)']

In [22]:
# Creation of the final dataframe
df_SIRD = pd.DataFrame(data=X_total, columns=labels)
df_SIRD

Unnamed: 0,ISO_code,Continent,Country/Region,Date,Susceptibles Smooth (S),Infected Smooth (I),Recovered Smooth (R),Deaths Smooth (D),Total Population (N),Total Population Without Deahts (N Alive)
0,ESP,Europe,Spain,2020-01-28,4.73326e+07,0,0,0,4.73326e+07,4.73326e+07
1,ESP,Europe,Spain,2020-01-29,4.73326e+07,0,0,0,4.73326e+07,4.73326e+07
2,ESP,Europe,Spain,2020-01-30,4.73326e+07,0,0,0,4.73326e+07,4.73326e+07
3,ESP,Europe,Spain,2020-01-31,4.73326e+07,0,0,0,4.73326e+07,4.73326e+07
4,ESP,Europe,Spain,2020-02-01,4.73326e+07,0.142857,0,0,4.73326e+07,4.73326e+07
...,...,...,...,...,...,...,...,...,...,...
782,ESP,Europe,Spain,2022-03-20,3.59652e+07,217216,1.10468e+07,103419,4.73326e+07,4.72292e+07
783,ESP,Europe,Spain,2022-03-21,3.59508e+07,208981,1.10693e+07,103500,4.73326e+07,4.72291e+07
784,ESP,Europe,Spain,2022-03-22,3.59339e+07,208352,1.10868e+07,103591,4.73326e+07,4.7229e+07
785,ESP,Europe,Spain,2022-03-23,3.59169e+07,207272,1.11048e+07,103682,4.73326e+07,4.72289e+07


In [23]:
# Export dataframe to a csv file

from pathlib import Path  
filepath = Path('SIRD_Spain.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)  
df_SIRD.to_csv(filepath, index=False)