# **References**

In [605]:
# References for implementation
#https://towardsdatascience.com/interactive-data-visualization-for-exploring-coronavirus-spreads-f33cabc64043
#https://towardsdatascience.com/covid-19-data-processing-58aaa3663f6

# **Imports**

In [606]:
!pip install altair vega_datasets



In [607]:
# Import all required libraries
import altair as alt
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from datetime import date, timedelta
import plotly.express as px

# **Data Exploration**
Here we import and explore our data frames. The confirmed and deaths dataframes came from a different source to the vacinated dataframe so we need to bear this in mind

In [608]:
# Import the data confirmed globabl
confirmedCasesGlobalDF = pd.read_csv(
    "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv")
# Here we print the dataframe
print(confirmedCasesGlobalDF.head())

  Province/State Country/Region       Lat  ...  11/24/21  11/25/21  11/26/21
0            NaN    Afghanistan  33.93911  ...    157032    157144    157171
1            NaN        Albania  41.15330  ...    197776    198292    198732
2            NaN        Algeria  28.03390  ...    209463    209624    209817
3            NaN        Andorra  42.50630  ...     16426     16566     16712
4            NaN         Angola -11.20270  ...     65080     65105     65130

[5 rows x 679 columns]


In [609]:
# Import the data deaths globabl
confirmedDeathsGlobalDF = pd.read_csv(
    "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv")
# Here we print the dataframe
print(confirmedDeathsGlobalDF.head())


  Province/State Country/Region       Lat  ...  11/24/21  11/25/21  11/26/21
0            NaN    Afghanistan  33.93911  ...      7306      7307      7307
1            NaN        Albania  41.15330  ...      3063      3068      3077
2            NaN        Algeria  28.03390  ...      6035      6041      6046
3            NaN        Andorra  42.50630  ...       131       131       131
4            NaN         Angola -11.20270  ...      1731      1732      1733

[5 rows x 679 columns]


In [610]:
# Import the data vacinated globabl
vacinationsGlobalDF = pd.read_csv(
    "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/vaccinations/vaccinations.csv")

# Here we print the dataframe
print(vacinationsGlobalDF.head())
# Here we print the dataframe columns, as we can see there is alot of information
print(vacinationsGlobalDF.info())

      location  ... daily_people_vaccinated_per_hundred
0  Afghanistan  ...                                 NaN
1  Afghanistan  ...                               0.003
2  Afghanistan  ...                               0.003
3  Afghanistan  ...                               0.003
4  Afghanistan  ...                               0.003

[5 rows x 16 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64249 entries, 0 to 64248
Data columns (total 16 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   location                             64249 non-null  object 
 1   iso_code                             64249 non-null  object 
 2   date                                 64249 non-null  object 
 3   total_vaccinations                   36245 non-null  float64
 4   people_vaccinated                    34674 non-null  float64
 5   people_fully_vaccinated              31710 non-null  float64
 6  

In [611]:
# Some more exploration of the data
print(confirmedCasesGlobalDF.describe())

              Lat        Long  ...      11/25/21      11/26/21
count  278.000000  278.000000  ...  2.800000e+02  2.800000e+02
mean    20.156042   21.788955  ...  9.288017e+05  9.309242e+05
std     25.283318   76.200169  ...  3.969183e+06  3.973485e+06
min    -51.796300 -178.116500  ...  0.000000e+00  0.000000e+00
25%      4.643279  -37.713675  ...  3.029000e+03  3.034250e+03
50%     21.517170   20.921188  ...  4.306400e+04  4.306400e+04
75%     40.393350   84.992575  ...  4.295650e+05  4.312620e+05
max     71.706900  178.065000  ...  4.812657e+07  4.817791e+07

[8 rows x 677 columns]


In [612]:
# Some more exploration of the data
print(confirmedDeathsGlobalDF.describe())

              Lat        Long  ...       11/25/21       11/26/21
count  278.000000  278.000000  ...     280.000000     280.000000
mean    20.156042   21.788955  ...   18508.746429   18533.578571
std     25.283318   76.200169  ...   72992.658071   73058.082243
min    -51.796300 -178.116500  ...       0.000000       0.000000
25%      4.643279  -37.713675  ...      22.000000      22.000000
50%     21.517170   20.921188  ...     676.000000     677.500000
75%     40.393350   84.992575  ...    6856.500000    6867.000000
max     71.706900  178.065000  ...  775797.000000  776349.000000

[8 rows x 677 columns]


In [613]:
# Some more exploration of the data
print(vacinationsGlobalDF.describe())
# Here we notice that Vacinated is different in format to confirmed and deaths so we need to change this

       total_vaccinations  ...  daily_people_vaccinated_per_hundred
count        3.624500e+04  ...                         62719.000000
mean         1.120684e+08  ...                             0.178767
std          5.306737e+08  ...                             0.279821
min          0.000000e+00  ...                             0.000000
25%          4.300000e+05  ...                             0.031000
50%          3.063864e+06  ...                             0.096000
75%          1.748784e+07  ...                             0.238000
max          7.875897e+09  ...                            11.750000

[8 rows x 13 columns]


# **Data Preprocessing**

In [614]:
# We need to augment the dataframe to our liking, we only need location, date and people_fully_vaccinated
vacinationsGlobalDF = vacinationsGlobalDF[['location','date','people_fully_vaccinated']]
# We print the dataframe
print(vacinationsGlobalDF)

          location        date  people_fully_vaccinated
0      Afghanistan  2021-02-22                      NaN
1      Afghanistan  2021-02-23                      NaN
2      Afghanistan  2021-02-24                      NaN
3      Afghanistan  2021-02-25                      NaN
4      Afghanistan  2021-02-26                      NaN
...            ...         ...                      ...
64244     Zimbabwe  2021-11-22                2752352.0
64245     Zimbabwe  2021-11-23                2757486.0
64246     Zimbabwe  2021-11-24                2767243.0
64247     Zimbabwe  2021-11-25                2774729.0
64248     Zimbabwe  2021-11-26                2786979.0

[64249 rows x 3 columns]


In [615]:
# I need to preprocess the data, this includes remaning the columns so they match the other two dataframes I am working with
# I renmae location to Country/Region, date to Date, and people_fully_vaccinated to Vacinated
vacinationsGlobalDF = vacinationsGlobalDF.rename(columns={'location': 'Country/Region', 'date': 'Date', 'people_fully_vaccinated': 'Vacinated'})
# We also convert the Date column to the pandas datetime object
vacinationsGlobalDF['Date'] = pd.to_datetime(vacinationsGlobalDF['Date'])
# Print the dataframe
print(vacinationsGlobalDF)

      Country/Region       Date  Vacinated
0        Afghanistan 2021-02-22        NaN
1        Afghanistan 2021-02-23        NaN
2        Afghanistan 2021-02-24        NaN
3        Afghanistan 2021-02-25        NaN
4        Afghanistan 2021-02-26        NaN
...              ...        ...        ...
64244       Zimbabwe 2021-11-22  2752352.0
64245       Zimbabwe 2021-11-23  2757486.0
64246       Zimbabwe 2021-11-24  2767243.0
64247       Zimbabwe 2021-11-25  2774729.0
64248       Zimbabwe 2021-11-26  2786979.0

[64249 rows x 3 columns]


In [616]:
# We investigate this dataframe for NaN values
print(vacinationsGlobalDF.isna().sum())

Country/Region        0
Date                  0
Vacinated         32539
dtype: int64


In [617]:
# We fill the vacinated NaN values with the most previous number so if there are no stats for a day
# It takes the most recent stats for that day
vacinationsGlobalDF['Vacinated'] = vacinationsGlobalDF['Vacinated'].ffill()
# We print this again to see the results
# The NaN values left at the start of the dataframe since there is no value to take
print(vacinationsGlobalDF.isna().sum())
# Print the dataframe
print(vacinationsGlobalDF)

Country/Region     0
Date               0
Vacinated         78
dtype: int64
      Country/Region       Date  Vacinated
0        Afghanistan 2021-02-22        NaN
1        Afghanistan 2021-02-23        NaN
2        Afghanistan 2021-02-24        NaN
3        Afghanistan 2021-02-25        NaN
4        Afghanistan 2021-02-26        NaN
...              ...        ...        ...
64244       Zimbabwe 2021-11-22  2752352.0
64245       Zimbabwe 2021-11-23  2757486.0
64246       Zimbabwe 2021-11-24  2767243.0
64247       Zimbabwe 2021-11-25  2774729.0
64248       Zimbabwe 2021-11-26  2786979.0

[64249 rows x 3 columns]


In [618]:
# We need to change the way we use this table, we take all the date columns which is all columns after the 4th column
# And we need to take the columns we will use as identifiers
dates = confirmedCasesGlobalDF.columns[4:]
print(dates)
idVariables = confirmedCasesGlobalDF.columns[0:4]
print(idVariables)

Index(['1/22/20', '1/23/20', '1/24/20', '1/25/20', '1/26/20', '1/27/20',
       '1/28/20', '1/29/20', '1/30/20', '1/31/20',
       ...
       '11/17/21', '11/18/21', '11/19/21', '11/20/21', '11/21/21', '11/22/21',
       '11/23/21', '11/24/21', '11/25/21', '11/26/21'],
      dtype='object', length=675)
Index(['Province/State', 'Country/Region', 'Lat', 'Long'], dtype='object')


In [619]:
# No we use the melt command to transpose the tables using the dates
# We use idVariables as identifiers and the dats as the transposing values
# The transposed dataframes will look like Vacinated
confirmedCasesGlobalDFLong = confirmedCasesGlobalDF.melt(
    id_vars=idVariables,
    value_vars=dates,
    var_name='Date',
    value_name='Confirmed'
)
# We can see that there is now a Date column and a Confirmed Column for each country for each day
print(confirmedCasesGlobalDFLong)

       Province/State      Country/Region  ...      Date  Confirmed
0                 NaN         Afghanistan  ...   1/22/20          0
1                 NaN             Albania  ...   1/22/20          0
2                 NaN             Algeria  ...   1/22/20          0
3                 NaN             Andorra  ...   1/22/20          0
4                 NaN              Angola  ...   1/22/20          0
...               ...                 ...  ...       ...        ...
188995            NaN             Vietnam  ...  11/26/21    1181337
188996            NaN  West Bank and Gaza  ...  11/26/21     459479
188997            NaN               Yemen  ...  11/26/21       9979
188998            NaN              Zambia  ...  11/26/21     210118
188999            NaN            Zimbabwe  ...  11/26/21     133836

[189000 rows x 6 columns]


In [620]:
# Do the same as above for deaths
confirmedDeathsGlobalDFLong = confirmedDeathsGlobalDF.melt(
    id_vars=idVariables,
    value_vars=dates,
    var_name='Date',
    value_name='Deaths'
)
print(confirmedDeathsGlobalDFLong)

       Province/State      Country/Region  ...      Date  Deaths
0                 NaN         Afghanistan  ...   1/22/20       0
1                 NaN             Albania  ...   1/22/20       0
2                 NaN             Algeria  ...   1/22/20       0
3                 NaN             Andorra  ...   1/22/20       0
4                 NaN              Angola  ...   1/22/20       0
...               ...                 ...  ...       ...     ...
188995            NaN             Vietnam  ...  11/26/21   24544
188996            NaN  West Bank and Gaza  ...  11/26/21    4789
188997            NaN               Yemen  ...  11/26/21    1943
188998            NaN              Zambia  ...  11/26/21    3667
188999            NaN            Zimbabwe  ...  11/26/21    4704

[189000 rows x 6 columns]


In [621]:
# Merging confirmedCasesGlobalDFLong and confirmedDeathsGlobalDFLong on the idVariables
# We get one big table with Confirmed and Deaths
fullConfirmedCovidData = confirmedCasesGlobalDFLong.merge(
    right=confirmedDeathsGlobalDFLong,
    how='left',
    on=['Province/State', 'Country/Region', 'Date', 'Lat', 'Long']
)
print(fullConfirmedCovidData.head())

  Province/State Country/Region       Lat  ...     Date Confirmed  Deaths
0            NaN    Afghanistan  33.93911  ...  1/22/20         0       0
1            NaN        Albania  41.15330  ...  1/22/20         0       0
2            NaN        Algeria  28.03390  ...  1/22/20         0       0
3            NaN        Andorra  42.50630  ...  1/22/20         0       0
4            NaN         Angola -11.20270  ...  1/22/20         0       0

[5 rows x 7 columns]


In [622]:
# Currently, the values for Date are string, we need to convert them to a Date object
print(fullConfirmedCovidData.info())
# Convert the Date to a datatime object
fullConfirmedCovidData['Date'] = pd.to_datetime(fullConfirmedCovidData['Date'])
# Check that it has worked
print(fullConfirmedCovidData.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 189000 entries, 0 to 188999
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Province/State  58725 non-null   object 
 1   Country/Region  189000 non-null  object 
 2   Lat             187650 non-null  float64
 3   Long            187650 non-null  float64
 4   Date            189000 non-null  object 
 5   Confirmed       189000 non-null  int64  
 6   Deaths          189000 non-null  int64  
dtypes: float64(2), int64(2), object(3)
memory usage: 11.5+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 189000 entries, 0 to 188999
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   Province/State  58725 non-null   object        
 1   Country/Region  189000 non-null  object        
 2   Lat             187650 non-null  float64       
 3   Long            187650 non-n

In [623]:
# Now we need to merge fullConfirmedCovidData with the Vacinated DF
fullConfirmedCovidData = fullConfirmedCovidData.merge(
    right=vacinationsGlobalDF,
    how='left',
    on=['Country/Region', 'Date']
)
# Now we have Confirmed, Deaths, and Vacinated for each country for each day
print(fullConfirmedCovidData)

       Province/State      Country/Region  ...  Deaths  Vacinated
0                 NaN         Afghanistan  ...       0        NaN
1                 NaN             Albania  ...       0        NaN
2                 NaN             Algeria  ...       0        NaN
3                 NaN             Andorra  ...       0        NaN
4                 NaN              Angola  ...       0        NaN
...               ...                 ...  ...     ...        ...
188995            NaN             Vietnam  ...   24544        NaN
188996            NaN  West Bank and Gaza  ...    4789        NaN
188997            NaN               Yemen  ...    1943        NaN
188998            NaN              Zambia  ...    3667        NaN
188999            NaN            Zimbabwe  ...    4704  2786979.0

[189000 rows x 8 columns]


In [624]:
# We now group the by Date and Country/Region as the keys. Then we use .sum() method to sum to get the total count of the
# Confirmed, Deaths, Recovered and Active for each country for each date
# We also reset the index using Date and Country
fullConfirmedCovidData = fullConfirmedCovidData.groupby(
    ['Date', 'Country/Region'])[['Confirmed', 'Deaths', 'Vacinated']].sum().reset_index()
print(fullConfirmedCovidData)

             Date      Country/Region  Confirmed  Deaths  Vacinated
0      2020-01-22         Afghanistan          0       0        0.0
1      2020-01-22             Albania          0       0        0.0
2      2020-01-22             Algeria          0       0        0.0
3      2020-01-22             Andorra          0       0        0.0
4      2020-01-22              Angola          0       0        0.0
...           ...                 ...        ...     ...        ...
132295 2021-11-26             Vietnam    1181337   24544        0.0
132296 2021-11-26  West Bank and Gaza     459479    4789        0.0
132297 2021-11-26               Yemen       9979    1943        0.0
132298 2021-11-26              Zambia     210118    3667        0.0
132299 2021-11-26            Zimbabwe     133836    4704  2786979.0

[132300 rows x 5 columns]


In [625]:
# We need to get todays date for filtering out since, the vacinated data is one/two day behind
# This can cause the graphs to break
today = date.today() - timedelta(days=1)
today = pd.to_datetime(today)
yesterday = date.today() - timedelta(days=2)
yesterday = pd.to_datetime(yesterday)

In [626]:
# We need to filter out today and yesterday dates as they have no vacinated data
# This prevents us from breaking the graphs
fullConfirmedCovidData = fullConfirmedCovidData[fullConfirmedCovidData['Date'] != today]
fullConfirmedCovidData = fullConfirmedCovidData[fullConfirmedCovidData['Date'] != yesterday]
# Select some key countries of interest
countries = ['Ireland', 'Italy', 'Spain', 'Germany', 'Belgium', 'Norway']
# Filter the selected countries to these countries
selected_countries1 = fullConfirmedCovidData[fullConfirmedCovidData['Country/Region'].isin(countries)]
print(selected_countries1)

             Date Country/Region  Confirmed  Deaths   Vacinated
16     2020-01-22        Belgium          0       0         0.0
66     2020-01-22        Germany          0       0         0.0
83     2020-01-22        Ireland          0       0         0.0
85     2020-01-22          Italy          0       0         0.0
131    2020-01-22         Norway          0       0         0.0
...           ...            ...        ...     ...         ...
131778 2021-11-24        Germany    5595674  100123  56719485.0
131795 2021-11-24        Ireland     542146    5652   3786011.0
131797 2021-11-24          Italy    4954585  133415  43923993.0
131843 2021-11-24         Norway     251397    1049   3809887.0
131876 2021-11-24          Spain    5111842   87904  37578516.0

[4038 rows x 5 columns]


In [627]:
# We get the most recent stats here
threeDaysAgo = date.today() - timedelta(days=3)
threeDaysAgo = pd.to_datetime(threeDaysAgo)
# Filter to latest date with data
mostRecentStats = selected_countries1[selected_countries1['Date'] == threeDaysAgo]

# **Data Visualization**

In [628]:
# We compare the Confirmed and Vacinated against each other of key countries
fig = go.Figure()
fig.add_trace(go.Bar(
    x=mostRecentStats['Country/Region'],
    y=mostRecentStats['Confirmed'],
    name='Confirmed',
    marker_color='indianred'
))
fig.add_trace(go.Bar(
    x=mostRecentStats['Country/Region'],
    y=mostRecentStats['Vacinated'],
    name='Vacinated',
    marker_color='pink'
))

# Here we modify the tickangle of the xaxis, resulting in rotated labels.
fig.update_layout(barmode='group', xaxis_tickangle=-45)
# Code below hides the axis
fig.update_xaxes(showgrid=False, zeroline=False)
fig.update_yaxes(showgrid=False, zeroline=False)
fig.show()

In [629]:
# Next I want to print the total figures of the countries of the world
# I filter the date to being the date of the last data that is working
threeDaysAgo = date.today() - timedelta(days=3)
threeDaysAgo = pd.to_datetime(threeDaysAgo)
mostRecentStats = fullConfirmedCovidData[fullConfirmedCovidData['Date'] == threeDaysAgo]
print(mostRecentStats)

             Date      Country/Region  Confirmed  Deaths  Vacinated
131712 2021-11-24         Afghanistan     157032    7306        0.0
131713 2021-11-24             Albania     197776    3063   951371.0
131714 2021-11-24             Algeria     209463    6035        0.0
131715 2021-11-24             Andorra      16426     131        0.0
131716 2021-11-24              Angola      65080    1731  2760266.0
...           ...                 ...        ...     ...        ...
131903 2021-11-24             Vietnam    1155778   24243        0.0
131904 2021-11-24  West Bank and Gaza     458956    4783        0.0
131905 2021-11-24               Yemen       9972    1942        0.0
131906 2021-11-24              Zambia     210099    3667   667433.0
131907 2021-11-24            Zimbabwe     133747    4703  2767243.0

[196 rows x 5 columns]


In [630]:
# Now I want to graph the confirmed in each country
# This graph is an interactive graph
fig = px.bar(mostRecentStats, x="Country/Region", y="Confirmed")
# Code below hides the axis
fig.update_xaxes(showgrid=False, zeroline=False)
fig.update_yaxes(showgrid=False, zeroline=False)
fig.show()

In [631]:
# Same as above execpt for deaths
fig = px.bar(mostRecentStats, x="Country/Region", y="Deaths")
fig.update_xaxes(showgrid=False, zeroline=False)
fig.update_yaxes(showgrid=False, zeroline=False)
fig.show()

In [632]:
# Filter the selected countries to these countries
selected_countries = fullConfirmedCovidData[fullConfirmedCovidData['Country/Region'].isin(countries)]

In [633]:
# This creates an interactive two in one graph
interval = alt.selection_interval()
# Creates a circle graph using the country/region, date and confirmed number
circle = alt.Chart(selected_countries).mark_circle().encode(
    x='Date:O',
    y='Country/Region',
    color=alt.condition(interval, 'Country/Region', alt.value('lightgray')),
    size=alt.Size('Confirmed:Q',
        scale=alt.Scale(range=[0, 2500]),
        legend=alt.Legend(title='Total Confirmed')
    ) 
).properties(
    width=1250,
    height=400,
    selection=interval
)
# Creates a bar chart using country/region and sum of confirmed
bars = alt.Chart(selected_countries).mark_bar().encode(
    y='Country/Region',
    color='Country/Region',
    x='sum(Confirmed):Q'
).properties(
    width=1250
).transform_filter(
    interval
)
# Show both graphs
circle & bars

In [635]:
# Extra scatter plot using deaths and confirmed
fig = px.scatter(mostRecentStats, x="Confirmed", y="Deaths",
	         size="Confirmed", color="Deaths",
                 hover_name="Country/Region", log_x=True, size_max=100)

# Code below hides the axis
fig.update_xaxes(showgrid=False, zeroline=False)
fig.update_yaxes(showgrid=False, zeroline=False)
fig.show()