# Predicting Covid Cases in Sweden

A small hobby project for predicting total daily Covid-19 cases using linear regression

In [237]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot  as plt

In [238]:
# Read data from URL provided by ECDC and display it

data = pd.read_csv('https://opendata.ecdc.europa.eu/covid19/nationalcasedeath_eueea_daily_ei/csv/data.csv')
display(data)

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2020,continentExp
0,14/01/2022,14,1,2022,16193,14,Austria,AT,AUT,8901064,Europe
1,13/01/2022,13,1,2022,17231,9,Austria,AT,AUT,8901064,Europe
2,12/01/2022,12,1,2022,11953,19,Austria,AT,AUT,8901064,Europe
3,11/01/2022,11,1,2022,10441,9,Austria,AT,AUT,8901064,Europe
4,10/01/2022,10,1,2022,10860,2,Austria,AT,AUT,8901064,Europe
...,...,...,...,...,...,...,...,...,...,...,...
9265,05/03/2021,5,3,2021,4069,15,Sweden,SE,SWE,10327589,Europe
9266,04/03/2021,4,3,2021,4882,19,Sweden,SE,SWE,10327589,Europe
9267,03/03/2021,3,3,2021,4871,18,Sweden,SE,SWE,10327589,Europe
9268,02/03/2021,2,3,2021,6189,23,Sweden,SE,SWE,10327589,Europe


In [239]:
# Filter data on Sweden

sweden_data = data[data.countriesAndTerritories.eq("Sweden")]
sweden_data.reset_index(drop=True, inplace=True)
display(sweden_data)

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2020,continentExp
0,14/01/2022,14,1,2022,0,0,Sweden,SE,SWE,10327589,Europe
1,13/01/2022,13,1,2022,0,0,Sweden,SE,SWE,10327589,Europe
2,12/01/2022,12,1,2022,25215,0,Sweden,SE,SWE,10327589,Europe
3,11/01/2022,11,1,2022,22538,4,Sweden,SE,SWE,10327589,Europe
4,10/01/2022,10,1,2022,14887,6,Sweden,SE,SWE,10327589,Europe
...,...,...,...,...,...,...,...,...,...,...,...
304,05/03/2021,5,3,2021,4069,15,Sweden,SE,SWE,10327589,Europe
305,04/03/2021,4,3,2021,4882,19,Sweden,SE,SWE,10327589,Europe
306,03/03/2021,3,3,2021,4871,18,Sweden,SE,SWE,10327589,Europe
307,02/03/2021,2,3,2021,6189,23,Sweden,SE,SWE,10327589,Europe


In [240]:
# Creating new df with relevant data for the regression
# Filter creates a copy by default

date_and_cases = sweden_data.filter(["dateRep", "cases"], axis=1)

# Take most recent 50 days
date_and_cases = date_and_cases.head(40)
date_and_cases.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dateRep  40 non-null     object
 1   cases    40 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 768.0+ bytes


In [241]:
# In case of holidays, there are no reports of Covid-19 cases until the first following weekday
# This creates extreme values in the data and needs to be handled, e.g. 84k cases for jan the 3rd
# Sometimes, there is a delay in the reporting as well, leading to missing values for today or yesterday.
# I solved this by linear regression imputation

# Replace max with 0
max_cases = date_and_cases["cases"].max()
date_and_cases["cases"].replace(max_cases, np.nan, inplace=True)
date_and_cases["cases"].replace(0, np.nan, inplace=True)
display(date_and_cases)

Unnamed: 0,dateRep,cases
0,14/01/2022,
1,13/01/2022,
2,12/01/2022,25215.0
3,11/01/2022,22538.0
4,10/01/2022,14887.0
5,09/01/2022,16442.0
6,08/01/2022,19226.0
7,07/01/2022,19711.0
8,06/01/2022,17526.0
9,05/01/2022,23878.0


In [244]:
# Insert missing dates
# A bit hardcoded but will work for now. 

d = np.array(["02/01/2022", "01/01/2022", "31/12/2021", "30/12/2021", "29/12/2021","28/12/2021", 
               "27/12/2021", "26/12/2021", "25/12/2021", "24/12/2021","23/12/2021"])

v = np.array([np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan])

new_values = np.column_stack((d,v))
missing_dates = pd.DataFrame(hej, columns=["dateRep", "cases"])

first_split = date_and_cases.iloc[:12]
second_split = date_and_cases.iloc[12:]
first_part = pd.concat([first_split, missing_dates])

date_and_cases = pd.concat([first_part, second_split])
display(date_and_cases)


Unnamed: 0,level_0,index,dateRep,cases
0,0.0,0.0,14/01/2022,
1,1.0,1.0,13/01/2022,
2,2.0,2.0,12/01/2022,25215
3,3.0,3.0,11/01/2022,22538
4,4.0,4.0,10/01/2022,14887
...,...,...,...,...
46,35.0,,29/11/2021,1347
47,36.0,,28/11/2021,880
48,37.0,,27/11/2021,1759
49,38.0,,26/11/2021,2025


In [245]:
# Drop index column (where the hell did it come from?)

date_and_cases = date_and_cases.drop(["level_0", "index"], axis=1)


In [247]:
# Linear regression imputation for missing values 

date_and_cases = date_and_cases['cases'].interpolate(method='linear')
display(date_and_cases)

0       NaN
1       NaN
2     25215
3     22538
4     14887
      ...  
46     1347
47      880
48     1759
49     2025
50     2114
Name: cases, Length: 62, dtype: object

In [None]:
# Plot the relevant data

# From all (both) indexes, return the first vector (dates)  
x_axis = date_and_cases.iloc[:, 0]

# From all (both) indexes, return the second array (cases)  
y_axis = date_and_cases.iloc[:, 1]

# Reverse the y_axis for accurate plotting 
y_axis = y_axis[::-1]

# Select and plot every third date for readability
#x_axis_length = len(x_axis)
#x_axis = x_axis[0:x_axis_length:2]

plt.figure(figsize=(20,10))
plt.title("Daily corona cases in Sweden")
plt.xticks(rotation=45)
plt.tight_layout() 
plt.plot(x_axis, y_axis)
plt.show()