# Calculation of the number of hospitalisations

In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython import display

%matplotlib inline

In [2]:
# Load the raw records into a DataFrame and preview the first rows.
# Original data available in: https://cnecovid.isciii.es/covid19/#documentaci%C3%B3n-y-datos
csv_path = 'casos_hosp_uci_def_sexo_edad_provres.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,provincia_iso,sexo,grupo_edad,fecha,num_casos,num_hosp,num_uci,num_def
0,A,H,0-9,2020-01-01,0,0,0,0
1,A,H,10-19,2020-01-01,0,0,0,0
2,A,H,20-29,2020-01-01,0,0,0,0
3,A,H,30-39,2020-01-01,0,0,0,0
4,A,H,40-49,2020-01-01,0,0,0,0


In [3]:
# Extract the 'fecha' (date) column as a NumPy array of date strings.
# Selecting the column by name avoids converting the whole mixed-type frame
# through df.values and does not depend on the column's position in the CSV.
# dtype=object keeps the elements as plain Python strings.
df_date = df['fecha'].to_numpy(dtype=object)
len(df_date), df_date[0:5]

(1367400,
 array(['2020-01-01', '2020-01-01', '2020-01-01', '2020-01-01',
        '2020-01-01'], dtype=object))

In [4]:
# Number of rows that belong to one day: position of the last row carrying
# the first date, plus one.
initial_date = '2020-01-01'
range_each_date = np.flatnonzero(df_date == initial_date)[-1] + 1

print("Each day involves a total of {} rows from the complete dataset:\n".format(range_each_date))

# Spot-check the first three day boundaries: the date should change exactly
# at every multiple of range_each_date.
for multiple in (1, 2, 3):
    boundary = range_each_date * multiple
    print(df_date[boundary-1], ' -> ', df_date[boundary])

Each day involves a total of 1590 rows from the complete dataset:

2020-01-01  ->  2020-01-02
2020-01-02  ->  2020-01-03
2020-01-03  ->  2020-01-04


In [5]:
# Distinct dates present in the dataset (np.unique returns them sorted,
# each date appearing once)
dates = np.unique(df_date)

# Count, plus the first and last five dates as a quick sanity check
len(dates), dates[:5], dates[-5:]

(860,
 array(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
        '2020-01-05'], dtype=object),
 array(['2022-05-05', '2022-05-06', '2022-05-07', '2022-05-08',
        '2022-05-09'], dtype=object))

In [6]:
# Obtaining the hospitalizations data
df_hosp = df['num_hosp'].values

# Daily totals: sum 'num_hosp' over all rows that share the same 'fecha'.
# Grouping on the date label (instead of slicing fixed blocks of
# range_each_date rows) stays correct even if the rows are not sorted by date
# or some date has a different number of rows, and it cannot run past the end
# of the output array. groupby sorts its keys, so the totals come out in
# ascending date order, matching the order of np.unique(df_date).
# dtype=object is kept so the array has the same dtype as before.
hosp = df.groupby('fecha', sort=True)['num_hosp'].sum().to_numpy(dtype=object)

len(hosp), hosp[0:20], hosp[-20:len(hosp)]

(860,
 array([8, 74, 8, 8, 7, 4, 7, 5, 8, 9, 3, 6, 8, 10, 10, 6, 4, 5, 5, 11],
       dtype=object),
 array([584, 567, 550, 436, 394, 579, 600, 619, 577, 533, 496, 420, 558,
        545, 551, 502, 359, 275, 177, 3], dtype=object))

In [7]:
# Obtaining the UCI data
df_UCI = df['num_uci'].values

# Daily totals: sum 'num_uci' over all rows that share the same 'fecha'.
# Grouping on the date label (instead of slicing fixed blocks of
# range_each_date rows) stays correct even if the rows are not sorted by date
# or some date has a different number of rows, and it cannot run past the end
# of the output array. groupby sorts its keys, so the totals come out in
# ascending date order, matching the order of np.unique(df_date).
# dtype=object is kept so the array has the same dtype as before.
UCI = df.groupby('fecha', sort=True)['num_uci'].sum().to_numpy(dtype=object)

len(UCI), UCI[0:20], UCI[-20:len(UCI)]

(860,
 array([0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0],
       dtype=object),
 array([30, 25, 25, 18, 24, 26, 21, 24, 33, 25, 14, 21, 21, 28, 23, 17, 8,
        4, 5, 0], dtype=object))

In [8]:
# Trailing 7-day moving average of the daily hospitalisation totals

window = 7  # number of days averaged together
hosp_smooth = np.empty(len(hosp), dtype=object)

# The first window-1 days have no complete 7-day window behind them; they are
# filled with 0 so the smoothed array keeps the same length as `hosp`.
for day in range(window - 1):
    hosp_smooth[day] = 0

# Every later day gets the mean of itself and the six days before it
for end in range(window - 1, len(hosp)):
    sum_H = 0
    for value in hosp[end - window + 1:end + 1]:
        sum_H = sum_H + value
    hosp_smooth[end] = sum_H / window

hosp_smooth.shape, hosp_smooth[:10], hosp_smooth[-5:]

((860,),
 array([0, 0, 0, 0, 0, 0, 16.571428571428573, 16.142857142857142,
        6.714285714285714, 6.857142857142857], dtype=object),
 array([515.0, 490.14285714285717, 458.57142857142856, 423.85714285714283,
        344.57142857142856], dtype=object))

In [9]:
# Trailing 7-day moving average of the daily UCI totals

window = 7  # number of days averaged together
UCI_smooth = np.empty(len(UCI), dtype=object)

# The first window-1 days have no complete 7-day window behind them; they are
# filled with 0 so the smoothed array keeps the same length as `UCI`.
for day in range(window - 1):
    UCI_smooth[day] = 0

# Every later day gets the mean of itself and the six days before it
for end in range(window - 1, len(UCI)):
    sum_UCI = 0
    for value in UCI[end - window + 1:end + 1]:
        sum_UCI = sum_UCI + value
    UCI_smooth[end] = sum_UCI / window

UCI_smooth.shape, UCI_smooth[:10], UCI_smooth[-5:]

((860,),
 array([0, 0, 0, 0, 0, 0, 0.42857142857142855, 0.42857142857142855,
        0.2857142857142857, 0.14285714285714285], dtype=object),
 array([21.285714285714285, 18.857142857142858, 17.428571428571427,
        15.142857142857142, 12.142857142857142], dtype=object))

In [10]:
# Restrict every series to the range of dates for which the final parameters
# were previously calculated (both boundary dates are included).
first_day = '2020-03-21'
last_day = '2022-03-18'

# Position of each boundary date inside `dates`
first_index = np.flatnonzero(dates == first_day)[0]
last_index = np.flatnonzero(dates == last_day)[0]

# One shared slice keeps all five arrays aligned day by day
selected = slice(first_index, last_index + 1)
final_dates = dates[selected]
final_hosp = hosp[selected]
final_UCI = UCI[selected]
final_hosp_smooth = hosp_smooth[selected]
final_UCI_smooth = UCI_smooth[selected]

(
    len(final_dates),
    len(final_hosp),
    len(final_UCI),
    len(final_hosp_smooth),
    len(final_UCI_smooth),
    final_dates[:5],
    final_dates[-5:],
)

(728,
 728,
 728,
 728,
 728,
 array(['2020-03-21', '2020-03-22', '2020-03-23', '2020-03-24',
        '2020-03-25'], dtype=object),
 array(['2022-03-14', '2022-03-15', '2022-03-16', '2022-03-17',
        '2022-03-18'], dtype=object))

In [11]:
# Country label column: the constant 'Spain', one entry per selected date
region = np.full(len(final_dates), 'Spain', dtype=object)

region.shape, region[:10]

((728,),
 array(['Spain', 'Spain', 'Spain', 'Spain', 'Spain', 'Spain', 'Spain',
        'Spain', 'Spain', 'Spain'], dtype=object))

In [12]:
# Assemble the final table as a 2-D object array, one row per selected date

columns = 6  # Number of columns
X_total = np.empty((len(final_dates), columns), dtype=object)

# Fill the table column by column; the order of this tuple is the column order
column_values = (region, final_dates, final_hosp, final_hosp_smooth, final_UCI, final_UCI_smooth)
for col, values in enumerate(column_values):
    X_total[:, col] = values

X_total.shape, X_total[:10]

((728, 6),
 array([['Spain', '2020-03-21', 3495, 2826.1428571428573, 340, 279.0],
        ['Spain', '2020-03-22', 3333, 3106.5714285714284, 380,
         309.42857142857144],
        ['Spain', '2020-03-23', 4943, 3509.5714285714284, 448,
         342.42857142857144],
        ['Spain', '2020-03-24', 5307, 3920.285714285714, 511,
         379.7142857142857],
        ['Spain', '2020-03-25', 5207, 4222.285714285715, 465,
         401.85714285714283],
        ['Spain', '2020-03-26', 5040, 4463.0, 436, 416.14285714285717],
        ['Spain', '2020-03-27', 4810, 4590.714285714285, 373,
         421.85714285714283],
        ['Spain', '2020-03-28', 3393, 4576.142857142857, 334, 421.0],
        ['Spain', '2020-03-29', 2657, 4479.571428571428, 278,
         406.42857142857144],
        ['Spain', '2020-03-30', 3933, 4335.285714285715, 335,
         390.2857142857143]], dtype=object))

In [13]:
# Column headers of the final dataframe, listed in the same order as the
# columns of X_total
labels = [
    'Country/Region',
    'Date',
    'Num Hospitalizations',
    'Num Hospitalizations Smooth (7 days)',
    'Num UCI',
    'Num UCI Smooth (7 days)',
]

In [14]:
# Wrap the assembled array in a DataFrame whose columns are named by `labels`
df_final = pd.DataFrame(X_total, columns=labels)
df_final

Unnamed: 0,Country/Region,Date,Num Hospitalizations,Num Hospitalizations Smooth (7 days),Num UCI,Num UCI Smooth (7 days)
0,Spain,2020-03-21,3495,2826.142857,340,279.0
1,Spain,2020-03-22,3333,3106.571429,380,309.428571
2,Spain,2020-03-23,4943,3509.571429,448,342.428571
3,Spain,2020-03-24,5307,3920.285714,511,379.714286
4,Spain,2020-03-25,5207,4222.285714,465,401.857143
...,...,...,...,...,...,...
723,Spain,2022-03-14,356,318.142857,19,19.142857
724,Spain,2022-03-15,375,320.857143,19,18.285714
725,Spain,2022-03-16,365,324.857143,12,16.857143
726,Spain,2022-03-17,343,328.285714,17,17.428571


In [15]:
# Export dataframe to a csv file (the row index is not written).
# `Path` is imported in the notebook's import cell at the top.
filepath = Path('Hosp_UCI_Spain.csv')
# Make sure the destination folder exists; this is a no-op for a bare file name
filepath.parent.mkdir(parents=True, exist_ok=True)
df_final.to_csv(filepath, index=False)