# Predicting Covid Cases in Sweden

A small hobby project that predicts the total number of daily Covid-19 cases in Sweden using linear regression.

In [308]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot  as plt
import statistics as st

In [330]:
# Download the daily case/death CSV published by ECDC and show the raw table

ECDC_DAILY_CSV_URL = 'https://opendata.ecdc.europa.eu/covid19/nationalcasedeath_eueea_daily_ei/csv/data.csv'

data = pd.read_csv(ECDC_DAILY_CSV_URL)
display(data)

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2020,continentExp
0,17/01/2022,17,1,2022,16190,3,Austria,AT,AUT,8901064,Europe
1,16/01/2022,16,1,2022,17169,7,Austria,AT,AUT,8901064,Europe
2,15/01/2022,15,1,2022,17658,6,Austria,AT,AUT,8901064,Europe
3,14/01/2022,14,1,2022,16193,14,Austria,AT,AUT,8901064,Europe
4,13/01/2022,13,1,2022,17231,9,Austria,AT,AUT,8901064,Europe
...,...,...,...,...,...,...,...,...,...,...,...
9355,05/03/2021,5,3,2021,4069,15,Sweden,SE,SWE,10327589,Europe
9356,04/03/2021,4,3,2021,4882,19,Sweden,SE,SWE,10327589,Europe
9357,03/03/2021,3,3,2021,4871,18,Sweden,SE,SWE,10327589,Europe
9358,02/03/2021,2,3,2021,6189,23,Sweden,SE,SWE,10327589,Europe


In [347]:
# Keep only the Swedish rows and renumber them from 0

is_sweden = data["countriesAndTerritories"] == "Sweden"
sweden_data = data.loc[is_sweden].reset_index(drop=True)
display(sweden_data)

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2020,continentExp
0,17/01/2022,17,1,2022,0,0,Sweden,SE,SWE,10327589,Europe
1,16/01/2022,16,1,2022,0,0,Sweden,SE,SWE,10327589,Europe
2,15/01/2022,15,1,2022,0,0,Sweden,SE,SWE,10327589,Europe
3,14/01/2022,14,1,2022,0,0,Sweden,SE,SWE,10327589,Europe
4,13/01/2022,13,1,2022,25079,0,Sweden,SE,SWE,10327589,Europe
...,...,...,...,...,...,...,...,...,...,...,...
307,05/03/2021,5,3,2021,4069,15,Sweden,SE,SWE,10327589,Europe
308,04/03/2021,4,3,2021,4882,19,Sweden,SE,SWE,10327589,Europe
309,03/03/2021,3,3,2021,4871,18,Sweden,SE,SWE,10327589,Europe
310,02/03/2021,2,3,2021,6189,23,Sweden,SE,SWE,10327589,Europe


In [356]:
# Build the frame used for the regression: report date and daily case count only.
# DataFrame.filter returns a new object; the explicit .copy() after head()
# additionally guarantees that later cells can modify date_and_cases without
# touching sweden_data (and without SettingWithCopyWarning).

# Number of most recent rows to keep (the data is ordered newest first)
N_RECENT_DAYS = 40

date_and_cases = sweden_data.filter(["dateRep", "cases"], axis=1)

# Take the most recent N_RECENT_DAYS rows
date_and_cases = date_and_cases.head(N_RECENT_DAYS).copy()
date_and_cases.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dateRep  40 non-null     object
 1   cases    40 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 768.0+ bytes


In [357]:
# In case of holidays, there are no reports of Covid-19 cases until the first following weekday.
# This creates extreme values in the data and needs to be handled, e.g. 84k cases for Jan 3rd.
# Sometimes there is a delay in the reporting as well, leading to missing values for today or yesterday.
# I solved this by linear regression imputation

# Replace max with 0
max_cases = date_and_cases["cases"].max()

# Build a new frame with the replaced column instead of calling
# replace(..., inplace=True) on date_and_cases["cases"]: in-place methods on a
# column selection act on a temporary object, which emits a warning in pandas 2.x
# and leaves the frame unchanged when Copy-on-Write is enabled.
date_and_cases = date_and_cases.assign(
    cases=date_and_cases["cases"].replace(max_cases, 0)
)
display(date_and_cases)

Unnamed: 0,dateRep,cases
0,17/01/2022,0
1,16/01/2022,0
2,15/01/2022,0
3,14/01/2022,0
4,13/01/2022,25079
5,12/01/2022,25705
6,11/01/2022,22546
7,10/01/2022,14894
8,09/01/2022,16441
9,08/01/2022,19226


In [358]:
# Insert missing dates
# The holiday period is still hardcoded, but the row that carried the backlog is
# now located by its date instead of by a fixed row position, so the cell does
# not silently splice at the wrong place when newer days are added to the data.

# Holiday period, newest first. d[0] is the day on which the accumulated
# backlog was reported.
d = np.array(["03/01/2022", "02/01/2022", "01/01/2022", "31/12/2021", "30/12/2021", "29/12/2021","28/12/2021",
               "27/12/2021", "26/12/2021", "25/12/2021", "24/12/2021","23/12/2021"])
length = len(d)

# Spread the backlog (max_cases) evenly over the holiday period
avg = max_cases / length
missing_dates = pd.DataFrame({"dateRep": d, "cases": [round(avg)] * length})

display(missing_dates)

# The backlog value itself was already replaced with 0 in the previous cell, so
# the row cannot be found by value any more; look it up by its report date.
is_backlog_day = date_and_cases["dateRep"].eq(d[0])
extreme_row = date_and_cases[is_backlog_day]
backlog_positions = np.flatnonzero(is_backlog_day.to_numpy())

if backlog_positions.size:
    backlog_pos = backlog_positions[0]

    # Everything newer than the backlog row, then the evenly spread rows,
    # then everything after the backlog row.
    # NOTE(review): if the source data already contains rows for the remaining
    # holiday dates, those dates now appear twice (once with the average and
    # once with their original value) - confirm this is intended.
    first_split = date_and_cases.iloc[:backlog_pos]
    second_split = date_and_cases.iloc[backlog_pos + 1:]
    date_and_cases = pd.concat(
        [first_split, missing_dates, second_split], ignore_index=True
    )
else:
    # The backlog day is outside the selected window: leave the frame unchanged
    # rather than replacing an unrelated row.
    print(f"Backlog day {d[0]} not found in date_and_cases; nothing inserted")

display(date_and_cases)


Unnamed: 0,dateRep,cases
0,03/01/2022,7049
1,02/01/2022,7049
2,01/01/2022,7049
3,31/12/2021,7049
4,30/12/2021,7049
5,29/12/2021,7049
6,28/12/2021,7049
7,27/12/2021,7049
8,26/12/2021,7049
9,25/12/2021,7049


Unnamed: 0,dateRep,cases
0,17/01/2022,0
1,16/01/2022,0
2,15/01/2022,0
3,14/01/2022,0
4,13/01/2022,25079
5,12/01/2022,25705
6,11/01/2022,22546
7,10/01/2022,14894
8,09/01/2022,16441
9,08/01/2022,19226


In [372]:
# Remove the days that still report 0 cases from a separate working frame;
# date_and_cases itself is left unchanged.
date_and_cases_df = pd.DataFrame(date_and_cases, columns=["dateRep", "cases"])
display(date_and_cases_df)

# Use a boolean mask instead of a Python loop: iterating over the column yields
# the case counts themselves, not row labels, so using them for lookup and
# drop() addresses the wrong rows or raises KeyError.
date_and_cases_df = date_and_cases_df[date_and_cases_df["cases"].ne(0)]

# Show the filtered result
date_and_cases_df.head(30)

ValueError: Shape of passed values is (4, 1), indices imply (4, 2)

In [None]:
# Plot the relevant data

# The frame is ordered newest first. Reverse the rows as a whole so that dates
# and case counts stay paired; reversing only one of the two columns would plot
# every value against the wrong date, because matplotlib pairs x and y by
# position and ignores the pandas index.
chronological = date_and_cases.iloc[::-1]

# First column: report dates (dd/mm/yyyy strings), parsed so the x axis is a
# real time axis with automatically spaced tick labels
x_axis = pd.to_datetime(chronological.iloc[:, 0], format="%d/%m/%Y")

# Second column: daily case counts
y_axis = chronological.iloc[:, 1]

plt.figure(figsize=(20,10))
plt.title("Daily corona cases in Sweden")
plt.xlabel("Report date")
plt.ylabel("Reported cases")
plt.xticks(rotation=45)
plt.plot(x_axis, y_axis)
# Adjust the layout only after everything has been drawn
plt.tight_layout()
plt.show()