In [2]:
import pandas as pd

In [73]:
# Distinct values of the `country` column in ../covid.csv; later cells use this
# Series with `isin` to keep only those countries.
europe = pd.read_csv("../covid.csv")["country"].drop_duplicates()

# Cases

In [74]:
# Load the cumulative confirmed-cases CSV and print a column/dtype summary.
df = pd.read_csv("./cumulative-confirmed-covid-19-cases.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 457250 entries, 0 to 457249
Data columns (total 3 columns):
 #   Column                             Non-Null Count   Dtype 
---  ------                             --------------   ----- 
 0   Entity                             457250 non-null  object
 1   Day                                457250 non-null  object
 2   Total confirmed cases of COVID-19  457250 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 10.5+ MB


In [75]:
# Replace the verbose source headers with short column names.
df = df.rename(
    columns={
        "Entity": "country",
        "Day": "date",
        "Total confirmed cases of COVID-19": "cases",
    }
)
df.head()

Unnamed: 0,country,date,cases
0,Afghanistan,2020-01-04,0
1,Afghanistan,2020-01-05,0
2,Afghanistan,2020-01-06,0
3,Afghanistan,2020-01-07,0
4,Afghanistan,2020-01-08,0


In [76]:
# Parse `date` once, derive year / month / day columns from it, and keep only
# the columns listed below (which leaves the original `date` column out).
case_dates = pd.to_datetime(df['date'])
df = df.assign(
    year=case_dates.dt.year,
    month=case_dates.dt.month,
    day=case_dates.dt.day,
)[['country', 'year', 'month', 'day', 'cases']]
df.head()

Unnamed: 0,country,year,month,day,cases
0,Afghanistan,2020,1,4,0
1,Afghanistan,2020,1,5,0
2,Afghanistan,2020,1,6,0
3,Afghanistan,2020,1,7,0
4,Afghanistan,2020,1,8,0


In [77]:
# Keep only the rows whose country appears in `europe`.
df = df.loc[df['country'].isin(europe)]

In [78]:
# Write the filtered cases table to disk, omitting the index column.
df.to_csv(path_or_buf="./clean/covid_cases.csv", index=False)

In [79]:
# Mean of the `cases` column for every (country, year) pair, returned as a
# flat frame with `country`, `year` and `cases` columns.
df_avg = df.groupby(['country', 'year'], as_index=False)['cases'].mean()
df_avg

Unnamed: 0,country,year,cases
0,Albania,2020,9949.071625
1,Albania,2021,139191.265753
2,Albania,2022,294772.032877
3,Albania,2023,333964.232877
4,Albania,2024,335940.379781
...,...,...,...
301,Vatican,2021,26.000000
302,Vatican,2022,26.000000
303,Vatican,2023,26.000000
304,Vatican,2024,26.000000


In [80]:
# Write the per-country yearly case averages to disk, omitting the index column.
df_avg.to_csv(path_or_buf="./clean/covid_cases_avg_year.csv", index=False)

In [None]:
# Calculate the new cases per year for each country.
#
# `cases` is a running total, so new cases are the row-to-row difference within
# each country. Two guards make that difference reliable:
#   * rows are sorted chronologically first, because `diff` only compares
#     neighbouring rows and would be wrong on unsorted input;
#   * the first row of each country has no predecessor (diff is NaN); it is
#     filled with that row's own cumulative value rather than 0, so the yearly
#     sums add up to the final cumulative total even when a series does not
#     start at zero.
# `sort_values` returns a new frame, so `df` itself is left untouched.
covid = df.sort_values(['country', 'year', 'month', 'day'])
case_increase = covid.groupby('country')['cases'].diff()
covid['new_cases'] = case_increase.fillna(covid['cases']).astype(int)
yearly_new_cases = covid.groupby(['country', 'year'])['new_cases'].sum().reset_index()
yearly_new_cases

Unnamed: 0,country,year,new_cases
0,Albania,2020,57146
1,Albania,2021,151753
2,Albania,2022,123869
3,Albania,2023,1828
4,Albania,2024,2600
...,...,...,...
301,Vatican,2021,0
302,Vatican,2022,0
303,Vatican,2023,0
304,Vatican,2024,0


In [4]:
# Write the yearly new-case totals to disk, omitting the index column.
yearly_new_cases.to_csv(path_or_buf="./clean/covid_new_cases_year.csv", index=False)

# Deaths

In [81]:
# Load the cumulative confirmed-deaths CSV and print a column/dtype summary.
df_d = pd.read_csv(
    "./cumulative-confirmed-covid-19-deaths.csv"
)
df_d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 457250 entries, 0 to 457249
Data columns (total 3 columns):
 #   Column                                  Non-Null Count   Dtype 
---  ------                                  --------------   ----- 
 0   Entity                                  457250 non-null  object
 1   Day                                     457250 non-null  object
 2   Total confirmed deaths due to COVID-19  457250 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 10.5+ MB


In [82]:
# Replace the verbose source headers with short column names.
df_d = df_d.rename(
    columns={
        "Entity": "country",
        "Day": "date",
        "Total confirmed deaths due to COVID-19": "deaths",
    }
)
df_d.head()

Unnamed: 0,country,date,deaths
0,Afghanistan,2020-01-04,0
1,Afghanistan,2020-01-05,0
2,Afghanistan,2020-01-06,0
3,Afghanistan,2020-01-07,0
4,Afghanistan,2020-01-08,0


In [83]:
# Parse `date` once, derive year / month / day columns from it, and keep only
# the columns listed below (which leaves the original `date` column out).
death_dates = pd.to_datetime(df_d['date'])
df_d = df_d.assign(
    year=death_dates.dt.year,
    month=death_dates.dt.month,
    day=death_dates.dt.day,
)[['country', 'year', 'month', 'day', 'deaths']]
df_d.head()

Unnamed: 0,country,year,month,day,deaths
0,Afghanistan,2020,1,4,0
1,Afghanistan,2020,1,5,0
2,Afghanistan,2020,1,6,0
3,Afghanistan,2020,1,7,0
4,Afghanistan,2020,1,8,0


In [84]:
# Keep only the rows whose country appears in `europe`.
df_d = df_d.loc[df_d['country'].isin(europe)]

In [85]:
# Write the filtered deaths table to disk, omitting the index column.
df_d.to_csv(path_or_buf="./clean/covid_deaths.csv", index=False)

In [86]:
# Mean of the `deaths` column for every (country, year) pair, returned as a
# flat frame with `country`, `year` and `deaths` columns.
df_d_avg = df_d.groupby(['country', 'year'], as_index=False)['deaths'].mean()
# Display the result; nothing is written to disk in this cell.
df_d_avg

Unnamed: 0,country,year,deaths
0,Albania,2020,236.964187
1,Albania,2021,2375.964384
2,Albania,2022,3508.271233
3,Albania,2023,3602.430137
4,Albania,2024,3606.106557
...,...,...,...
301,Vatican,2021,0.000000
302,Vatican,2022,0.000000
303,Vatican,2023,0.000000
304,Vatican,2024,0.000000


In [87]:
# Write the per-country yearly death averages to disk, omitting the index column.
df_d_avg.to_csv(path_or_buf="./clean/covid_deaths_avg_year.csv", index=False)

# Vaccines

In [88]:
# Load the cumulative people-vaccinated CSV and print a column/dtype summary.
df_v = pd.read_csv("./number-of-people-who-received-at-least-one-dose-of-covid-19-vaccine.csv")
df_v.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79011 entries, 0 to 79010
Data columns (total 3 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Entity                          79011 non-null  object 
 1   Day                             79011 non-null  object 
 2   People vaccinated (cumulative)  79011 non-null  float64
dtypes: float64(1), object(2)
memory usage: 1.8+ MB


In [89]:
# Replace the verbose source headers with short column names.
df_v = df_v.rename(
    columns={
        "Entity": "country",
        "Day": "date",
        "People vaccinated (cumulative)": "vaccines",
    }
)
df_v.head()

Unnamed: 0,country,date,vaccines
0,Afghanistan,2021-02-22,0.0
1,Afghanistan,2021-02-28,8200.0
2,Afghanistan,2021-03-16,54000.0
3,Afghanistan,2021-04-07,120000.0
4,Afghanistan,2021-04-22,240000.0


In [90]:
# Parse `date` once, derive year / month / day columns from it, and keep only
# the columns listed below (which leaves the original `date` column out).
vaccine_dates = pd.to_datetime(df_v['date'])
df_v = df_v.assign(
    year=vaccine_dates.dt.year,
    month=vaccine_dates.dt.month,
    day=vaccine_dates.dt.day,
)[['country', 'year', 'month', 'day', 'vaccines']]
df_v.head()

Unnamed: 0,country,year,month,day,vaccines
0,Afghanistan,2021,2,22,0.0
1,Afghanistan,2021,2,28,8200.0
2,Afghanistan,2021,3,16,54000.0
3,Afghanistan,2021,4,7,120000.0
4,Afghanistan,2021,4,22,240000.0


In [91]:
# Keep only the rows whose country appears in `europe`.
df_v = df_v.loc[df_v['country'].isin(europe)]

In [92]:
# Write the filtered vaccinations table to disk, omitting the index column.
df_v.to_csv(path_or_buf="./clean/covid_vaccines.csv", index=False)

In [93]:
# Mean of the `vaccines` column for every (country, year) pair, returned as a
# flat frame with `country`, `year` and `vaccines` columns.
df_v_avg = df_v.groupby(['country', 'year'], as_index=False)['vaccines'].mean()
df_v_avg

Unnamed: 0,country,year,vaccines
0,Albania,2021,7.122184e+05
1,Albania,2022,1.283091e+06
2,Albania,2023,1.347003e+06
3,Andorra,2021,2.932703e+04
4,Andorra,2022,5.784859e+04
...,...,...,...
156,Ukraine,2021,5.060961e+06
157,Ukraine,2022,1.527731e+07
158,Ukraine,2023,1.626720e+07
159,United Kingdom,2021,3.856614e+07


In [94]:
# Write the per-country yearly vaccination averages to disk, omitting the index
# column. The frame written must be `df_v_avg`: this cell previously exported
# `df_d_avg`, which put the deaths averages into the vaccines file.
df_v_avg.to_csv("./clean/covid_vaccines_avg_year.csv", index=False)