In [44]:
import pandas as pd

In [45]:
country_codes = pd.read_json("../../countries_ISO2_ISO3.json")

In [46]:
europe = pd.read_csv("../covid.csv")["country"]
europe.drop_duplicates(inplace=True)

In [47]:
population = pd.read_csv(
    "../covid.csv")[["country", "population"]].drop_duplicates().reset_index(drop=True)
population

Unnamed: 0,country,population
0,Albania,2827615.0
1,Andorra,79722.0
2,Austria,9064679.0
3,Belarus,9173241.0
4,Belgium,11641813.0
5,Bosnia and Herzegovina,3204805.0
6,Bulgaria,6825863.0
7,Croatia,3907031.0
8,Cyprus,1331376.0
9,Czechia,10673216.0


# Cases

In [48]:
df = pd.read_csv(
    "./cumulative-confirmed-covid-19-cases.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 457250 entries, 0 to 457249
Data columns (total 3 columns):
 #   Column                             Non-Null Count   Dtype 
---  ------                             --------------   ----- 
 0   Entity                             457250 non-null  object
 1   Day                                457250 non-null  object
 2   Total confirmed cases of COVID-19  457250 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 10.5+ MB


In [49]:
df.rename(columns={"Entity": "country", "Day": "date",
          "Total confirmed cases of COVID-19": "cases"}, inplace=True)
df.head()

Unnamed: 0,country,date,cases
0,Afghanistan,2020-01-04,0
1,Afghanistan,2020-01-05,0
2,Afghanistan,2020-01-06,0
3,Afghanistan,2020-01-07,0
4,Afghanistan,2020-01-08,0


In [50]:
df['date'] = pd.to_datetime(df['date'])

df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day

df.drop(columns=["date"], inplace=True)

df = df[['country', 'year', 'month', 'day', 'cases']]
df.head()

Unnamed: 0,country,year,month,day,cases
0,Afghanistan,2020,1,4,0
1,Afghanistan,2020,1,5,0
2,Afghanistan,2020,1,6,0
3,Afghanistan,2020,1,7,0
4,Afghanistan,2020,1,8,0


In [51]:
# keep only europe
df = df[df['country'].isin(europe)]
df

Unnamed: 0,country,year,month,day,cases
3658,Albania,2020,1,4,0
3659,Albania,2020,1,5,0
3660,Albania,2020,1,6,0
3661,Albania,2020,1,7,0
3662,Albania,2020,1,8,0
...,...,...,...,...,...
438955,Vatican,2025,1,1,26
438956,Vatican,2025,1,2,26
438957,Vatican,2025,1,3,26
438958,Vatican,2025,1,4,26


In [52]:
df = df.merge(country_codes, on = 'country', how = 'left')
df = df[['ISO2', 'ISO3', 'country', 'year', 'month', 'day', 'cases']]
df.head()

Unnamed: 0,ISO2,ISO3,country,year,month,day,cases
0,AL,ALB,Albania,2020,1,4,0
1,AL,ALB,Albania,2020,1,5,0
2,AL,ALB,Albania,2020,1,6,0
3,AL,ALB,Albania,2020,1,7,0
4,AL,ALB,Albania,2020,1,8,0


In [53]:
df.to_csv("./clean/covid_cases.csv", index=False)

In [54]:
# Take the latest value for each country
df_max = df.sort_values(['ISO2', 'ISO3', 'country', 'year', 'month', 'day']).groupby(
    'country').last().reset_index()
df_max.sort_values(["cases"], ascending=False, inplace=True)
df_max.reset_index(drop=True, inplace=True)
df_max

Unnamed: 0,country,ISO2,ISO3,year,month,day,cases
0,France,FR,FRA,2025,1,5,39013547
1,Germany,DE,DEU,2025,1,5,38437756
2,Italy,IT,ITA,2025,1,5,26958866
3,United Kingdom,GB,GBR,2025,1,5,25025238
4,Russia,RU,RUS,2025,1,5,24831847
5,Spain,ES,ESP,2025,1,5,13980340
6,Netherlands,NL,NLD,2025,1,5,8635049
7,Poland,PL,POL,2025,1,5,6770109
8,Austria,AT,AUT,2025,1,5,6083046
9,Greece,GR,GRC,2025,1,5,5755165


In [55]:
# Calculate the average cases per year for each country
df_max_year = df.groupby(['ISO2', 'ISO3', 'country', 'year'])[
    'cases'].max().reset_index()
df_max_year

Unnamed: 0,ISO2,ISO3,country,year,cases
0,AD,AND,Andorra,2020,7983
1,AD,AND,Andorra,2021,23122
2,AD,AND,Andorra,2022,47751
3,AD,AND,Andorra,2023,48015
4,AD,AND,Andorra,2024,48015
...,...,...,...,...,...
301,XK,XKX,Kosovo,2021,161399
302,XK,XKX,Kosovo,2022,272237
303,XK,XKX,Kosovo,2023,274279
304,XK,XKX,Kosovo,2024,274279


In [56]:
# Save the averaged data to a new CSV file
df_max_year.to_csv("./clean/covid_cases_max_year.csv", index=False)

In [57]:
# Calculate the new cases per year for each country
covid = pd.read_csv("./clean/covid_cases.csv")
covid['new_cases'] = covid.groupby('country')['cases'].diff().fillna(0)
covid['new_cases'] = covid['new_cases'].astype(int)
yearly_new_cases = covid.groupby(['country', 'year'])[
    'new_cases'].sum().reset_index()
yearly_new_cases

Unnamed: 0,country,year,new_cases
0,Albania,2020,57146
1,Albania,2021,151753
2,Albania,2022,123869
3,Albania,2023,1828
4,Albania,2024,2600
...,...,...,...
301,Vatican,2021,0
302,Vatican,2022,0
303,Vatican,2023,0
304,Vatican,2024,0


In [58]:
# Merge yearly_vaccines with population dataframe
yearly_new_cases = yearly_new_cases.merge(population, on='country', how='left')

# Calculate the percentage of vaccines over population
yearly_new_cases['percentage'] = (
    yearly_new_cases['new_cases'] / yearly_new_cases['population']) * 100
yearly_new_cases.drop(columns=['population'], inplace=True)
yearly_new_cases.head()

Unnamed: 0,country,year,new_cases,percentage
0,Albania,2020,57146,2.020996
1,Albania,2021,151753,5.36682
2,Albania,2022,123869,4.380688
3,Albania,2023,1828,0.064648
4,Albania,2024,2600,0.09195


In [59]:
# Save the new cases data to a new CSV file
yearly_new_cases.to_csv("./clean/covid_new_cases_year.csv", index=False)

# Deaths

In [60]:
df_d = pd.read_csv("./cumulative-confirmed-covid-19-deaths.csv")
df_d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 457250 entries, 0 to 457249
Data columns (total 3 columns):
 #   Column                                  Non-Null Count   Dtype 
---  ------                                  --------------   ----- 
 0   Entity                                  457250 non-null  object
 1   Day                                     457250 non-null  object
 2   Total confirmed deaths due to COVID-19  457250 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 10.5+ MB


In [61]:
df_d.rename(columns={"Entity": "country", "Day": "date",
                     "Total confirmed deaths due to COVID-19": "deaths"}, inplace=True)
df_d.head()

Unnamed: 0,country,date,deaths
0,Afghanistan,2020-01-04,0
1,Afghanistan,2020-01-05,0
2,Afghanistan,2020-01-06,0
3,Afghanistan,2020-01-07,0
4,Afghanistan,2020-01-08,0


In [62]:
df_d['date'] = pd.to_datetime(df_d['date'])

df_d['year'] = df_d['date'].dt.year
df_d['month'] = df_d['date'].dt.month
df_d['day'] = df_d['date'].dt.day

df_d.drop(columns=["date"], inplace=True)

df_d = df_d[['country', 'year', 'month', 'day', 'deaths']]
df_d.head()

Unnamed: 0,country,year,month,day,deaths
0,Afghanistan,2020,1,4,0
1,Afghanistan,2020,1,5,0
2,Afghanistan,2020,1,6,0
3,Afghanistan,2020,1,7,0
4,Afghanistan,2020,1,8,0


In [63]:
df_d = df_d[df_d['country'].isin(europe)]

In [64]:
df_d = df_d.merge(country_codes, on='country', how='left')
df_d = df_d[['ISO2', 'ISO3', 'country', 'year', 'month', 'day', 'deaths']]
df_d.head()

Unnamed: 0,ISO2,ISO3,country,year,month,day,deaths
0,AL,ALB,Albania,2020,1,4,0
1,AL,ALB,Albania,2020,1,5,0
2,AL,ALB,Albania,2020,1,6,0
3,AL,ALB,Albania,2020,1,7,0
4,AL,ALB,Albania,2020,1,8,0


In [65]:
df_d.to_csv("./clean/covid_deaths.csv", index=False)

In [66]:
# Calculate the average cases per year for each country
df_d_max_year = df_d.groupby(['ISO2', 'ISO3', 'country', 'year'])[
    'deaths'].max().reset_index()
# Save the averaged data to a new CSV file
df_d_max_year

Unnamed: 0,ISO2,ISO3,country,year,deaths
0,AD,AND,Andorra,2020,84
1,AD,AND,Andorra,2021,140
2,AD,AND,Andorra,2022,158
3,AD,AND,Andorra,2023,159
4,AD,AND,Andorra,2024,159
...,...,...,...,...,...
301,XK,XKX,Kosovo,2021,2980
302,XK,XKX,Kosovo,2022,3192
303,XK,XKX,Kosovo,2023,3212
304,XK,XKX,Kosovo,2024,3212


In [67]:
df_d_max_year.to_csv("./clean/covid_deaths_max_year.csv", index=False)

# Vaccines

In [68]:
df_v = pd.read_csv(
    "./number-of-people-who-received-at-least-one-dose-of-covid-19-vaccine.csv")
df_v.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79011 entries, 0 to 79010
Data columns (total 3 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Entity                          79011 non-null  object 
 1   Day                             79011 non-null  object 
 2   People vaccinated (cumulative)  79011 non-null  float64
dtypes: float64(1), object(2)
memory usage: 1.8+ MB


In [69]:
df_v.rename(columns={"Entity": "country", "Day": "date",
                     "People vaccinated (cumulative)": "vaccines"}, inplace=True)
df_v.head()

Unnamed: 0,country,date,vaccines
0,Afghanistan,2021-02-22,0.0
1,Afghanistan,2021-02-28,8200.0
2,Afghanistan,2021-03-16,54000.0
3,Afghanistan,2021-04-07,120000.0
4,Afghanistan,2021-04-22,240000.0


In [70]:
df_v['date'] = pd.to_datetime(df_v['date'])

df_v['year'] = df_v['date'].dt.year
df_v['month'] = df_v['date'].dt.month
df_v['day'] = df_v['date'].dt.day

df_v.drop(columns=["date"], inplace=True)

df_v = df_v[['country', 'year', 'month', 'day', 'vaccines']]
df_v.head()

Unnamed: 0,country,year,month,day,vaccines
0,Afghanistan,2021,2,22,0.0
1,Afghanistan,2021,2,28,8200.0
2,Afghanistan,2021,3,16,54000.0
3,Afghanistan,2021,4,7,120000.0
4,Afghanistan,2021,4,22,240000.0


In [71]:
df_v = df_v[df_v['country'].isin(europe)]

In [72]:
df_v = df_v.merge(country_codes, on='country', how='left')
df_v = df_v[['ISO2', 'ISO3', 'country', 'year', 'month', 'day', 'vaccines']]
df_v.head()

Unnamed: 0,ISO2,ISO3,country,year,month,day,vaccines
0,AL,ALB,Albania,2021,1,10,0.0
1,AL,ALB,Albania,2021,1,12,128.0
2,AL,ALB,Albania,2021,1,13,188.0
3,AL,ALB,Albania,2021,1,14,266.0
4,AL,ALB,Albania,2021,1,15,308.0


In [73]:
# Find the minimum date in each dataframe
min_year_df = df['year'].min()
min_year_df_d = df_d['year'].min()
min_year_df_v = df_v['year'].min()

max_year_df = df['year'].max()
max_year_df_d = df_d['year'].max()
max_year_df_v = df_v['year'].max()

# Find the overall minimum date
min_year = min(min_year_df, min_year_df_d, min_year_df_v)

# Find the overall maximum date
max_year = max(max_year_df, max_year_df_d, max_year_df_v)


min_month_df = df[df['year'] == min_year]['month'].min()
min_month_df_d = df_d[df_d['year'] == min_year]['month'].min()
min_month_df_v = df_v[df_v['year'] == min_year]['month'].min()

max_month_df = df[df['year'] == max_year]['month'].max()
max_month_df_d = df_d[df_d['year'] == max_year]['month'].max()
max_month_df_v = df_v[df_v['year'] == max_year]['month'].max()

# Find the overall minimum date
min_month = min(min_month_df, min_month_df_d, min_month_df_v)

# Find the overall maximum date
max_month = max(max_month_df, max_month_df_d, max_month_df_v)


min_day_df = df[(df['year'] == min_year) & (
    df['month'] == min_month)]['day'].min()
min_day_df_d = df_d[(df_d['year'] == min_year) & (
    df_d['month'] == min_month)]['day'].min()
min_day_df_v = df_v[(df_v['year'] == min_year) & (
    df_v['month'] == min_month)]['day'].min()

max_day_df = df[(df['year'] == max_year) & (
    df['month'] == max_month)]['day'].max()
max_day_df_d = df_d[(df_d['year'] == max_year) & (
    df_d['month'] == max_month)]['day'].max()
max_day_df_v = df_v[(df_v['year'] == max_year) & (
    df_v['month'] == max_month)]['day'].max()

# Find the overall minimum date
min_day = min(min_day_df, min_day_df_d, min_day_df_v)

# Find the overall maximum date
max_day = max(max_day_df, max_day_df_d, max_day_df_v)


print(min_day, min_month, min_year)
print(max_day, max_month, max_year)

4 1 2020
5 1 2025


In [74]:
def fill_missing_dates(df, min_year, min_month, min_day, max_year, max_month, max_day):
    min_date = pd.Timestamp(year=min_year, month=min_month, day=min_day)
    max_date = pd.Timestamp(year=max_year, month=max_month, day=max_day)
    date_range = pd.date_range(start=min_date, end=max_date, freq='D')

    # Create a new dataframe with all dates for each country
    countries = df['country'].unique()
    all_dates = pd.DataFrame(
        [(country, date) for country in countries for date in date_range], columns=['country', 'date'])

    # Merge the new dataframe with the original dataframe
    df['date'] = pd.to_datetime(df[['year', 'month', 'day']])
    df = all_dates.merge(df, on=['country', 'date'], how='left')

    # Map both ISO2 and ISO3 codes
    df['ISO3'] = df['country'].map(country_codes.set_index('country')['ISO3'])
    df['ISO2'] = df['country'].map(country_codes.set_index('country')['ISO2'])

    df = df.sort_values(['country', 'date'])

    df = df.groupby('country').ffill()

    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day

    # Drop the date column
    df = df.drop(columns=['date'])

    df['country'] = df['ISO3'].map(country_codes.set_index('ISO3')['country'])

    return df


df = fill_missing_dates(df, min_year, min_month, min_day,
                        max_year, max_month, max_day)
df_d = fill_missing_dates(df_d, min_year, min_month,
                          min_day, max_year, max_month, max_day)
df_v = fill_missing_dates(df_v, min_year, min_month,
                          min_day, max_year, max_month, max_day)

In [75]:
df.to_csv("./clean/covid_cases_filled.csv", index=False)
df_d.to_csv("./clean/covid_deaths_filled.csv", index=False)
df_v.to_csv("./clean/covid_vaccines_filled.csv", index=False)

In [82]:
# # Define the missing ISO codes
# missing_iso_codes = {
#   'Czechia': 'CZE',
#   'Kosovo': 'XKX',
#   'Moldova': 'MDA',
#   'North Macedonia': 'MKD',
#   'Russia': 'RUS',
#   'Vatican': 'VAT'
# }

# # Update the country_codes DataFrame with the missing ISO codes
# for country, iso in missing_iso_codes.items():
#   df.loc[df['country'] == country, 'ISO3'] = iso
#   df_d.loc[df_d['country'] == country, 'ISO3'] = iso
#   df_v.loc[df_v['country'] == country, 'ISO3'] = iso

In [79]:
df_v.to_csv("./clean/covid_vaccines.csv", index=False)

In [80]:
vaccini = pd.read_csv("./clean/covid_vaccines.csv")
vaccini['vaccines'] = vaccini.groupby('country')['vaccines'].diff().fillna(0)
vaccini['vaccines'] = vaccini['vaccines'].astype(int)
yearly_vaccines = vaccini.groupby(['country', 'year'])[
    'vaccines'].sum().reset_index()
yearly_vaccines

Unnamed: 0,country,year,vaccines
0,Albania,2020,0
1,Albania,2021,1141486
2,Albania,2022,200757
3,Albania,2023,7012
4,Albania,2024,0
...,...,...,...
295,United Kingdom,2021,49500198
296,United Kingdom,2022,2020194
297,United Kingdom,2023,0
298,United Kingdom,2024,0


In [81]:
# Merge yearly_vaccines with population dataframe
yearly_vaccines = yearly_vaccines.merge(population, on='country', how='left')

# Calculate the percentage of vaccines over population
yearly_vaccines['percentage'] = (
    yearly_vaccines['vaccines'] / yearly_vaccines['population']) * 100
yearly_vaccines.drop(columns=['population'], inplace=True)
yearly_vaccines.head()

Unnamed: 0,country,year,vaccines,percentage
0,Albania,2020,0,0.0
1,Albania,2021,1141486,40.369216
2,Albania,2022,200757,7.09987
3,Albania,2023,7012,0.247983
4,Albania,2024,0,0.0


In [64]:
yearly_vaccines.to_csv("./clean/covid_new_vaccines_year.csv", index=False)

In [65]:
# Calculate the average cases per year for each country
df_v_max_year = df_v.groupby(['ISO2', 'ISO3', 'country', 'year'])[
    'vaccines'].max().reset_index()
df_v_max_year

Unnamed: 0,ISO3,country,year,vaccines
0,ALB,Albania,2021,1141486.0
1,ALB,Albania,2022,1342243.0
2,ALB,Albania,2023,1349255.0
3,AND,Andorra,2021,57085.0
4,AND,Andorra,2022,57901.0
...,...,...,...,...
138,SWE,Sweden,2022,7772354.0
139,SWE,Sweden,2023,7775726.0
140,UKR,Ukraine,2021,14713309.0
141,UKR,Ukraine,2022,15774300.0


In [66]:
df_d_max_year.to_csv("./clean/covid_vaccines_max_year.csv", index=False)

In [67]:
merged_data = yearly_new_cases.merge(
    yearly_vaccines, on=['country', 'year'], suffixes=('_cases', '_vaccines'))
merged_data = merged_data.rename(columns={'new_cases': 'cases'})
merged_data.head()

Unnamed: 0,country,year,cases,percentage_cases,vaccines,percentage_vaccines
0,Albania,2021,151753,5.36682,1141486,40.369216
1,Albania,2022,123869,4.380688,200757,7.09987
2,Albania,2023,1828,0.064648,7012,0.247983
3,Andorra,2021,15139,18.989739,56509,70.882567
4,Andorra,2022,24629,30.893605,816,1.023557


In [68]:
merged_data.to_csv("../bubblechart/covid_cases_vacc_yearly.csv", index=False)

# Expand the dates for each dataset

In [69]:
# # Extract the date components from each dataframe
# df['date'] = pd.to_datetime(df[['year', 'month', 'day']])
# df_d['date'] = pd.to_datetime(df_d[['year', 'month', 'day']])
# df_v['date'] = pd.to_datetime(df_v[['year', 'month', 'day']])

# # Find the maximum date in each dataframe
# max_date_df = df['date'].max()
# max_date_df_d = df_d['date'].max()
# max_date_df_v = df_v['date'].max()

# # Find the overall maximum date
# max_date = max(max_date_df, max_date_df_d, max_date_df_v)
# max_date

Timestamp('2025-01-05 00:00:00')

In [70]:
# def expand_to_max_date(df, max_date):
#     # Get the last available row for each country
#     last_values = df.groupby('country').last().reset_index()

#     # Create a date range from the last date in the dataframe to max_date
#     last_date = pd.to_datetime(last_values[['year', 'month', 'day']])
#     date_range = pd.date_range(start=last_date.max(), end=max_date, freq='D')

#     # Create new rows for each country with the last available values and the new dates
#     new_rows = []
#     for _, row in last_values.iterrows():
#         for date in date_range:
#             new_row = row.copy()
#             new_row['year'] = date.year
#             new_row['month'] = date.month
#             new_row['day'] = date.day
#             new_rows.append(new_row)

#     # Append the new rows to the original dataframe, excluding the 'date' column
#     new_df = pd.concat([df.drop(columns=['date']), pd.DataFrame(
#         new_rows).drop(columns=['date'])], ignore_index=True)

#     return new_df


# # Expand df, df_d, and df_v to max_date
# df_expanded = expand_to_max_date(df, max_date)
# df_d_expanded = expand_to_max_date(df_d, max_date)
# df_v_expanded = expand_to_max_date(df_v, max_date)

In [71]:
# df_expanded.to_csv("./clean/covid_cases.csv", index=False)
# df_d_expanded.to_csv("./clean/covid_deaths.csv", index=False)
# df_v_expanded.to_csv("./clean/covid_vaccines.csv", index=False)