# Final Features Table

In [1]:
import matplotlib.pyplot as plt 
%matplotlib inline
import pandas as pd
import numpy as np
from IPython import display

## 1. SIRD Table

In [2]:
# Reading SIRD Table
df = pd.read_csv('SIRD_Spain.csv')
df.head()

Unnamed: 0,ISO_code,Continent,Country/Region,Date,Susceptibles Smooth (S),Infected Smooth (I),Recovered Smooth (R),Deaths Smooth (D),Total Population (N),Total Population Without Deahts (N Alive)
0,ESP,Europe,Spain,2020-01-28,47332610.0,0.0,0.0,0.0,47332614.0,47332614.0
1,ESP,Europe,Spain,2020-01-29,47332610.0,0.0,0.0,0.0,47332614.0,47332614.0
2,ESP,Europe,Spain,2020-01-30,47332610.0,0.0,0.0,0.0,47332614.0,47332614.0
3,ESP,Europe,Spain,2020-01-31,47332610.0,0.0,0.0,0.0,47332614.0,47332614.0
4,ESP,Europe,Spain,2020-02-01,47332610.0,0.142857,0.0,0.0,47332614.0,47332614.0


In [3]:
# Obtaining all relevant values
iso_code = df.values[:, 0]
continent = df.values[:, 1]
region = df.values[:, 2]
dates = df.values[:, 3]
S = df.values[:, 4]
I = df.values[:, 5]
R = df.values[:, 6]
D = df.values[:, 7]

iso_code[0:5], continent[0:5], region[0:5], dates[0:5], S[0:5], I[0:5], R[0:5], D[0:5], len(dates), len(S)

(array(['ESP', 'ESP', 'ESP', 'ESP', 'ESP'], dtype=object),
 array(['Europe', 'Europe', 'Europe', 'Europe', 'Europe'], dtype=object),
 array(['Spain', 'Spain', 'Spain', 'Spain', 'Spain'], dtype=object),
 array(['2020-01-28', '2020-01-29', '2020-01-30', '2020-01-31',
        '2020-02-01'], dtype=object),
 array([47332614.0, 47332614.0, 47332614.0, 47332614.0, 47332613.85714286],
       dtype=object),
 array([0.0, 0.0, 0.0, 0.0, 0.14285714285714285], dtype=object),
 array([0.0, 0.0, 0.0, 0.0, 0.0], dtype=object),
 array([0.0, 0.0, 0.0, 0.0, 0.0], dtype=object),
 787,
 787)

## 2. Mobility Data

In [4]:
# Reading Mobility Table
df = pd.read_excel('ES_Mobility_Report_Join.xlsx')
df.head()

Unnamed: 0,country_region_code,country_region,place_id,date,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline,average_perc,average_rate
0,ES,Spain,ChIJi7xhMnjjQgwR7KNoB5Qs7KY,2020-02-15,2,-1,26,8,0,-2,5.5,0.055
1,ES,Spain,ChIJi7xhMnjjQgwR7KNoB5Qs7KY,2020-02-16,2,3,13,5,-1,-2,3.333333,0.033333
2,ES,Spain,ChIJi7xhMnjjQgwR7KNoB5Qs7KY,2020-02-17,0,1,9,3,3,-1,2.5,0.025
3,ES,Spain,ChIJi7xhMnjjQgwR7KNoB5Qs7KY,2020-02-18,-2,0,5,4,3,0,1.666667,0.016667
4,ES,Spain,ChIJi7xhMnjjQgwR7KNoB5Qs7KY,2020-02-19,0,1,11,2,3,-1,2.666667,0.026667


In [5]:
# Obtaining all relevant values
dates_mob = df.values[:, 3]
mob_rate = df.values[:, -1]

len(dates_mob), len(mob_rate), dates_mob[0:5], mob_rate[0:5]

(770,
 770,
 array(['2020-02-15', '2020-02-16', '2020-02-17', '2020-02-18',
        '2020-02-19'], dtype=object),
 array([0.055, 0.03333333333333333, 0.025, 0.01666666666666667,
        0.02666666666666666], dtype=object))

In [6]:
# Calculating mobility rate smooth
mob_rate_smooth = np.empty(len(mob_rate)-6, dtype = object)  # 6 first values are not available

for i in range(0, len(mob_rate)-6):
    sum_mob = 0
    for j in range(i, i+7):
        sum_mob = sum_mob + mob_rate[j]
    mob_rate_smooth[i] = (sum_mob/7)

mob_rate_smooth.shape, mob_rate_smooth[0:100]

((764,),
 array([0.035, 0.0419047619047619, 0.05119047619047619,
        0.05738095238095237, 0.057857142857142864, 0.05785714285714285,
        0.060000000000000005, 0.05690476190476191, 0.04833333333333334,
        0.038571428571428576, 0.02547619047619048, 0.022857142857142857,
        0.023333333333333334, 0.016666666666666666, 0.012619047619047618,
        0.011428571428571429, 0.01595238095238095, 0.023333333333333338,
        0.029761904761904764, 0.030000000000000002, 0.028095238095238093,
        0.013333333333333338, -0.036904761904761905, -0.12047619047619049,
        -0.19238095238095237, -0.2711904761904762, -0.3478571428571428,
        -0.4288095238095238, -0.48976190476190473, -0.5285714285714286,
        -0.5457142857142857, -0.5554761904761905, -0.5630952380952381,
        -0.5707142857142857, -0.5692857142857143, -0.5730952380952381,
        -0.5745238095238095, -0.5752380952380953, -0.5811904761904761,
        -0.5864285714285715, -0.5904761904761905, -0.593809523809

In [7]:
# Readjusting dates_mob
dates_mob = dates_mob[6:len(dates_mob)]

dates_mob.shape, len(mob_rate_smooth)

((764,), 764)

## 3. Testing Data

In [8]:
# Reading Testing Table
df = pd.read_excel('Spain_Testing.xlsx')
df.head()

Unnamed: 0,Entity,ISO code,Date,Source URL,Source label,Notes,Cumulative total,Daily change in cumulative total,Cumulative total per thousand,Daily change in cumulative total per thousand,7-day smoothed daily change,7-day smoothed daily change per thousand,Short-term positive rate,Short-term tests per case
0,Spain - tests performed,ESP,2020-03-15,https://www.sanidad.gob.es/profesionales/salud...,Ministry of Health,,6303,6303,0.135,0.135,,,0.0,
1,Spain - tests performed,ESP,2020-03-16,https://www.sanidad.gob.es/profesionales/salud...,Ministry of Health,,15353,9050,0.328,0.194,,,0.0,
2,Spain - tests performed,ESP,2020-03-17,https://www.sanidad.gob.es/profesionales/salud...,Ministry of Health,,27042,11689,0.578,0.25,,,0.0,
3,Spain - tests performed,ESP,2020-03-18,https://www.sanidad.gob.es/profesionales/salud...,Ministry of Health,,39814,12772,0.852,0.273,,,0.0,
4,Spain - tests performed,ESP,2020-03-19,https://www.sanidad.gob.es/profesionales/salud...,Ministry of Health,,54736,14922,1.171,0.319,,,0.0,


In [9]:
# Obtaining all relevant values
dates_test = df.values[:, 2]
test_rate = df.values[:, 12]

dates_test.shape, test_rate.shape, dates_test[0:5], test_rate[0:5]

((741,),
 (741,),
 array([Timestamp('2020-03-15 00:00:00'), Timestamp('2020-03-16 00:00:00'),
        Timestamp('2020-03-17 00:00:00'), Timestamp('2020-03-18 00:00:00'),
        Timestamp('2020-03-19 00:00:00')], dtype=object),
 array([0.0, 0.0, 0.0, 0.0, 0.0], dtype=object))

In [10]:
# Readjusting dates_test and test_rate
dates_test = dates_test[6:len(dates_test)]
test_rate = test_rate[6:len(test_rate)]

dates_test.shape, test_rate.shape, dates_test[0:5], test_rate[0:5]

((735,),
 (735,),
 array([Timestamp('2020-03-21 00:00:00'), Timestamp('2020-03-22 00:00:00'),
        Timestamp('2020-03-23 00:00:00'), Timestamp('2020-03-24 00:00:00'),
        Timestamp('2020-03-25 00:00:00')], dtype=object),
 array([0.395, 0.406, 0.416, 0.427, 0.433], dtype=object))

## 4. Parameters Data

In [11]:
# Reading Parameters Table
df = pd.read_csv('Parameters_LM.csv')
df.head()

Unnamed: 0,ISO_code,Continent,Country/Region,Date,Mu Raw,Mu Smooth (7 days average),Mu Fixed Raw,Mu Fixed Smooth,Beta,Beta Fixed Smooth,Gamma Raw,Gamma Smooth (7 days average),Gamma Fixed,Gamma Fixed Smooth,R0 Raw,R0 Fixed Smooth (Gamma = 1/14),R0 Fixed Smooth (Mean Gamma)
0,ESP,Europe,Spain,2020-02-03,0.01,0.010716,0.01,0.010716,0.045,0.039941,0.071429,0.039653,0.071429,0.042238,0.647538,0.559171,0.574838
1,ESP,Europe,Spain,2020-02-04,0.01,0.011303,0.01,0.011303,0.045,0.047611,0.071429,0.026832,0.071429,0.032033,0.647538,0.666556,0.685232
2,ESP,Europe,Spain,2020-02-05,0.01,0.011209,0.01,0.011209,0.045,0.058809,0.071429,0.015204,0.071429,0.021829,0.647538,0.823328,0.846397
3,ESP,Europe,Spain,2020-02-06,0.01,0.011512,0.01,0.011512,0.045,0.072383,0.071429,0.003901,0.071429,0.011625,0.647538,1.013361,1.041755
4,ESP,Europe,Spain,2020-02-07,0.007559,0.014458,0.007559,0.014458,0.013622,0.087533,0.004974,-0.008355,0.004974,0.001421,0.196016,1.225468,1.259804


In [12]:
# Obtaining all relevant values
dates_param = df.values[:, 3]
beta = df['Beta Fixed Smooth'].values
gamma = df['Gamma Fixed Smooth'].values

dates_param[0:5], beta[0:5], gamma[0:5]

(array(['2020-02-03', '2020-02-04', '2020-02-05', '2020-02-06',
        '2020-02-07'], dtype=object),
 array([0.03994076, 0.04761113, 0.05880914, 0.07238295, 0.08753341]),
 array([0.04223751, 0.03203343, 0.02182934, 0.01162526, 0.00142118]))

In [13]:
# Reading Parameters Table
df = pd.read_csv('Parameters_ODEs.csv')
df.head()

Unnamed: 0,ISO_code,Continent,Country/Region,Date,Mu Raw,Mu Smooth (7 days average),Beta Raw,Beta Smooth (7 days average),Beta Fixed Raw (No negative values),Beta Fixed Smooth,R0 Raw,R0 Smooth (7 days average),R0 Fixed Raw (No negative values),R0 Fixed Smooth
0,ESP,Europe,Spain,2020-02-03,0.0,0.0,0.0,0.029762,0.0,0.029762,0.0,0.428571,0.0,0.428571
1,ESP,Europe,Spain,2020-02-04,0.0,0.0,0.0,0.039683,0.0,0.039683,0.0,0.571429,0.0,0.571429
2,ESP,Europe,Spain,2020-02-05,0.0,0.0,0.0,0.049603,0.0,0.049603,0.0,0.714286,0.0,0.714286
3,ESP,Europe,Spain,2020-02-06,0.0,0.0,0.0,0.059524,0.0,0.059524,0.0,0.857143,0.0,0.857143
4,ESP,Europe,Spain,2020-02-07,0.0,0.0,0.069444,0.069444,0.069444,0.069444,1.0,1.0,1.0,1.0


In [14]:
# Obtaining all relevant values
mu = df['Mu Smooth (7 days average)'].values

# Adjusting length
mu = mu[0:len(beta)]

len(mu), len(beta), len(gamma), len(dates_param)

(775, 775, 775, 775)

## 5. Creation of the final dataframe

**Establishing the same start date to every array**

In [15]:
# First date
first = dates_test[0]

# Transform day format to string
from datetime import datetime
first_str = dates_test[0].strftime("%Y-%m-%d")  # '2020-03-21' Format

# Get the index where fisrt day of testing data registered is located in mobility data
index_mob = np.where(dates_mob == first_str)[0][0]

index_mob, dates_mob[index_mob]

(29, '2020-03-21')

In [16]:
# df_prov = pd.DataFrame(data=[dates_mob, mob_rate_smooth])
# df_prov[index_mob], mob_rate_smooth[index_mob:len(mob_rate_smooth)]

# While to make the change only once in case this cell is executed more than once
while index_mob != 0:
    # Readjust mobility rates array to have the same length as testing array
    mob_rate_smooth = mob_rate_smooth[index_mob:len(mob_rate_smooth)]
    dates_mob = dates_mob[index_mob:len(dates_mob)]
    
    # Index = 0 when the first date is changed
    index_mob = 0

len(dates_mob), len(mob_rate_smooth), dates_mob[0:5], mob_rate_smooth[0:5]

(735,
 735,
 array(['2020-03-21', '2020-03-22', '2020-03-23', '2020-03-24',
        '2020-03-25'], dtype=object),
 array([-0.5285714285714286, -0.5457142857142857, -0.5554761904761905,
        -0.5630952380952381, -0.5707142857142857], dtype=object))

In [17]:
# Transform day format to string
from datetime import datetime
first_str = dates_test[0].strftime("%Y-%m-%d")

# Get the index where fisrt day of testing data registered is located in SIRD data
index_SIRD = np.where(dates == first_str)[0][0]

index_SIRD, dates[index_SIRD]

(53, '2020-03-21')

In [18]:
# df_prov = pd.DataFrame(data=[dates, S, I, R, D])
# df_prov[index_SIRD], S[index_SIRD], I[index_SIRD], R[index_SIRD], D[index_SIRD]

# While to make the change only once in case this cell is executed more than once
while index_SIRD != 0:
    # Readjust SIRD array to have the same length as testing array
    S = S[index_SIRD:len(S)]
    I = I[index_SIRD:len(I)]
    R = R[index_SIRD:len(R)]
    D = D[index_SIRD:len(D)]
    dates = dates[index_SIRD:len(dates)]
    
    # Index = 0 when the first date is changed
    index_SIRD = 0

len(dates), len(S), len(I), len(R), len(D)

(734, 734, 734, 734, 734)

In [19]:
# Transform day format to string
from datetime import datetime
first_str = dates_test[0].strftime("%Y-%m-%d")

# Get the index where fisrt day of testing data registered is located in parameters data
index_param = np.where(dates_param == first_str)[0][0]

index_param, dates_param[index_param]

(47, '2020-03-21')

In [20]:
# df_prov = pd.DataFrame(data=[dates_param, mu, beta])
# df_prov[index_param], mu[index_param], beta[index_param]

# While to make the change only once in case this cell is executed more than once
while index_param != 0:
    mu = mu[index_param:len(mu)]
    beta = beta[index_param:len(beta)]
    dates_param = dates_param[index_param:len(dates_param)]
    
    # Index = 0 when the first date is changed
    index_param = 0

len(dates_param), len(mu), len(beta)

(728, 728, 728)

**Establishing the same final date to every array**

In [21]:
# Last date is defined by the parameters data
dates_param[-1], len(dates_param)

('2022-03-18', 728)

In [22]:
# As every array has now the same beginning, all the arrays must have the same length to be equal (dates equal)

# Readjust SIRD array to have the same length as parameters array
S = S[0:len(dates_param)]
I = I[0:len(dates_param)]
R = R[0:len(dates_param)]
D = D[0:len(dates_param)]

# Readjust mobility rates array to have the same length as parameters array
mob_rate_smooth = mob_rate_smooth[0:len(dates_param)]

# Readjust test rate array to have the same length as parameters array
test_rate = test_rate[0:len(dates_param)]

# Readjust date array
dates_test = dates_test[0:len(dates_param)]

In [23]:
# Checking that all arrays are the same length
len(S), len(I), len(R), len(D), len(mu), len(beta), len(test_rate), len(mob_rate_smooth), len(dates_test)

(728, 728, 728, 728, 728, 728, 728, 728, 728)

In [24]:
# Adjust other arrays
iso_code = iso_code[0:len(dates_test)]
continent = continent[0:len(dates_test)]
region = region[0:len(dates_test)]

len(iso_code), len(continent), len(region)

(728, 728, 728)

**Creation of the dataframe**

In [25]:
# Array concatenation for building the final dataframe

columns = 13  # Number of columns
X_total = np.empty((len(dates_test), columns), dtype = object)

for i in range(len(dates_test)):
    X_total[i] = [iso_code[i], continent[i], region[i], dates_test[i], S[i], I[i], R[i], D[i],
                  mu[i], beta[i], gamma[i], test_rate[i], mob_rate_smooth[i]]
    
X_total.shape, X_total[0:10]

((728, 13),
 array([['ESP', 'Europe', 'Spain', Timestamp('2020-03-21 00:00:00'),
         47317307.57142858, 14340.142857142857, 247.0, 719.2857142857142,
         0.014039795340392238, 0.19353179986696373, 0.042237508052353566,
         0.395, -0.5285714285714286],
        ['ESP', 'Europe', 'Spain', Timestamp('2020-03-22 00:00:00'),
         47314311.85714286, 17042.285714285714, 328.7142857142857,
         931.1428571428572, 0.015943384874961268, 0.19006097754950504,
         0.0320334264197005, 0.406, -0.5457142857142857],
        ['ESP', 'Europe', 'Spain', Timestamp('2020-03-23 00:00:00'),
         47310712.71428572, 20228.0, 460.8571428571428,
         1212.4285714285713, 0.01529925447423688, 0.1864088141528808,
         0.02182934478704744, 0.416, -0.5554761904761905],
        ['ESP', 'Europe', 'Spain', Timestamp('2020-03-24 00:00:00'),
         47306693.14285714, 23708.857142857134, 674.5714285714286,
         1537.4285714285709, 0.01751998725230702, 0.1817815366191864,
        

In [26]:
# Labels of each column of the dataframe
labels = ['ISO_code', 'Continent', 'Country/Region', 'Date', 'Susceptibles Smooth (S)', 
          'Infected Smooth (I)', 'Recovered Smooth (R)', 'Deaths Smooth (D)',
          'Mu Smooth', 'Beta Smooth', 'Gamma Smooth', 'Positive Test Rate Smooth', 'Mobility Rate Smooth']

In [27]:
# Creation of the final dataframe
df_final = pd.DataFrame(data=X_total, columns=labels)
df_final

Unnamed: 0,ISO_code,Continent,Country/Region,Date,Susceptibles Smooth (S),Infected Smooth (I),Recovered Smooth (R),Deaths Smooth (D),Mu Smooth,Beta Smooth,Gamma Smooth,Positive Test Rate Smooth,Mobility Rate Smooth
0,ESP,Europe,Spain,2020-03-21,4.73173e+07,14340.1,247,719.286,0.0140398,0.193532,0.0422375,0.395,-0.528571
1,ESP,Europe,Spain,2020-03-22,4.73143e+07,17042.3,328.714,931.143,0.0159434,0.190061,0.0320334,0.406,-0.545714
2,ESP,Europe,Spain,2020-03-23,4.73107e+07,20228,460.857,1212.43,0.0152993,0.186409,0.0218293,0.416,-0.555476
3,ESP,Europe,Spain,2020-03-24,4.73067e+07,23708.9,674.571,1537.43,0.01752,0.181782,0.0116253,0.427,-0.563095
4,ESP,Europe,Spain,2020-03-25,4.73016e+07,28077.1,960.714,1969.43,0.0176071,0.173246,0.00142118,0.433,-0.570714
...,...,...,...,...,...,...,...,...,...,...,...,...,...
723,ESP,Europe,Spain,2022-03-14,3.60419e+07,253374,1.09343e+07,102987,0.0003112,0.0689761,0.0773409,0.173,-0.062619
724,ESP,Europe,Spain,2022-03-15,3.60276e+07,245555,1.09564e+07,103066,0.000242646,0.0700354,0.0770997,0.174,-0.0690476
725,ESP,Europe,Spain,2022-03-16,3.60163e+07,236758,1.09764e+07,103127,0.000196885,0.0771561,0.0781806,0.176,-0.0754762
726,ESP,Europe,Spain,2022-03-17,3.60083e+07,226561,1.09945e+07,103175,0.000390759,0.0907183,0.0797475,0.177,-0.0804762


In [28]:
# Export dataframe to a csv file

from pathlib import Path  
filepath = Path('Final_Smooth_Features_Spain.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True)  
df_final.to_csv(filepath, index=False)

In [29]:
# Export dataframe to a xlsx file

from pathlib import Path  
filepath = Path('Final_Smooth_Features_Spain.xlsx')  
filepath.parent.mkdir(parents=True, exist_ok=True)  
df_final.to_excel(filepath, index=False)