In [2]:
import numpy as np 
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt
import plotly.express as px

In [3]:
import warnings
# Silence every warning for the rest of the session (no category or module filter is given).
# NOTE(review): this also hides pandas deprecation / chained-assignment warnings — consider narrowing.
warnings.filterwarnings('ignore')

In [4]:
pip install pandas_bokeh

Note: you may need to restart the kernel to use updated packages.


In [5]:
# Bokeh-backed plotting for pandas.
import pandas_bokeh
from bokeh.io import show, output_notebook
from bokeh.plotting import figure

# Send Bokeh output to the notebook.
pandas_bokeh.output_notebook()
# Make pandas_bokeh the backend used by the pandas .plot API.
pd.options.plotting.backend = 'pandas_bokeh'

In [6]:
# Read the dataset from a CSV in the working directory and preview the first rows.
DATA_FILE = 'owid-covid-data.csv'
df = pd.read_csv(DATA_FILE)
df.head()

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,24/02/2020,5.0,5.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
1,AFG,Asia,Afghanistan,25/02/2020,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
2,AFG,Asia,Afghanistan,26/02/2020,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
3,AFG,Asia,Afghanistan,27/02/2020,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
4,AFG,Asia,Afghanistan,28/02/2020,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,


In [7]:
# The CSV stores dates day-first (e.g. 24/02/2020). Without dayfirst=True pandas can
# read ambiguous values such as 01/03/2020 month-first (3 January instead of 1 March).
df['date'] = pd.to_datetime(df['date'], dayfirst=True)
df.head()

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-02-24,5.0,5.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
1,AFG,Asia,Afghanistan,2020-02-25,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
2,AFG,Asia,Afghanistan,2020-02-26,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
3,AFG,Asia,Afghanistan,2020-02-27,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
4,AFG,Asia,Afghanistan,2020-02-28,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,


# Part 1: Data Cleaning


In [8]:
# Count the missing values in each column.
df.isna().sum()

iso_code                                        0
continent                                   12881
location                                        0
date                                            0
total_cases                                  9176
                                            ...  
human_development_index                     44950
excess_mortality_cumulative_absolute       215592
excess_mortality_cumulative                215592
excess_mortality                           216499
excess_mortality_cumulative_per_million    215592
Length: 67, dtype: int64

In [9]:
# Keep only country rows by dropping aggregate locations (world, unions, income groups, continents).
# NOTE(review): the cells that follow in this notebook keep operating on df, not df_new —
# confirm df_new is what downstream analysis should use.
non_country_locations = [
    "World", "European Union", "International",
    "High income", "Low income", "Lower middle income", "Upper middle income",
    "Asia", "Africa", "Europe", "North America", "Oceania", "South America",
]
is_aggregate = df['location'].isin(non_country_locations)
df_new = df[~is_aggregate]
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 211368 entries, 0 to 224248
Data columns (total 67 columns):
 #   Column                                      Non-Null Count   Dtype         
---  ------                                      --------------   -----         
 0   iso_code                                    211368 non-null  object        
 1   continent                                   211368 non-null  object        
 2   location                                    211368 non-null  object        
 3   date                                        211368 non-null  datetime64[ns]
 4   total_cases                                 202199 non-null  float64       
 5   new_cases                                   201894 non-null  float64       
 6   new_cases_smoothed                          200763 non-null  float64       
 7   total_deaths                                183216 non-null  float64       
 8   new_deaths                                  182954 non-null  float64      

In [10]:
# Remove the column limit so displayed frames are not truncated with "...".
pd.options.display.max_columns = None
df.head()

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,new_deaths_per_million,new_deaths_smoothed_per_million,reproduction_rate,icu_patients,icu_patients_per_million,hosp_patients,hosp_patients_per_million,weekly_icu_admissions,weekly_icu_admissions_per_million,weekly_hosp_admissions,weekly_hosp_admissions_per_million,total_tests,new_tests,total_tests_per_thousand,new_tests_per_thousand,new_tests_smoothed,new_tests_smoothed_per_thousand,positive_rate,tests_per_case,tests_units,total_vaccinations,people_vaccinated,people_fully_vaccinated,total_boosters,new_vaccinations,new_vaccinations_smoothed,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,total_boosters_per_hundred,new_vaccinations_smoothed_per_million,new_people_vaccinated_smoothed,new_people_vaccinated_smoothed_per_hundred,stringency_index,population,population_density,median_age,aged_65_older,aged_70_older,gdp_per_capita,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-02-24,5.0,5.0,,,,,0.125,0.125,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.33,40099462.0,54.422,18.6,2.581,1.337,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511,,,,
1,AFG,Asia,Afghanistan,2020-02-25,5.0,0.0,,,,,0.125,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.33,40099462.0,54.422,18.6,2.581,1.337,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511,,,,
2,AFG,Asia,Afghanistan,2020-02-26,5.0,0.0,,,,,0.125,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.33,40099462.0,54.422,18.6,2.581,1.337,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511,,,,
3,AFG,Asia,Afghanistan,2020-02-27,5.0,0.0,,,,,0.125,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.33,40099462.0,54.422,18.6,2.581,1.337,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511,,,,
4,AFG,Asia,Afghanistan,2020-02-28,5.0,0.0,,,,,0.125,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.33,40099462.0,54.422,18.6,2.581,1.337,1803.987,,597.029,9.59,,,37.746,0.5,64.83,0.511,,,,


In [11]:
# Replace missing location names with 0.
# NOTE(review): the null count shown earlier reports 0 missing locations, so this looks like a no-op;
# an integer 0 in a column of strings would also mix types — confirm it is needed.
df['location'] = df['location'].fillna(0)

In [12]:
# Display the location column.
df['location']

0         Afghanistan
1         Afghanistan
2         Afghanistan
3         Afghanistan
4         Afghanistan
             ...     
224244       Zimbabwe
224245       Zimbabwe
224246       Zimbabwe
224247       Zimbabwe
224248       Zimbabwe
Name: location, Length: 224249, dtype: object

In [13]:
# Impute missing total_cases with the per-location mean. Assign the result back instead of
# using chained fillna(inplace=True), which does not update df under pandas Copy-on-Write.
# NOTE(review): total_cases looks cumulative; a mean fill can break its monotonic order —
# consider a per-location forward fill instead.
df['total_cases'] = df['total_cases'].fillna(
    df.groupby('location')['total_cases'].transform('mean')
)

In [14]:
# Impute missing total_cases_per_million with the per-location mean. Assign the result back instead
# of using chained fillna(inplace=True), which does not update df under pandas Copy-on-Write.
df['total_cases_per_million'] = df['total_cases_per_million'].fillna(
    df.groupby('location')['total_cases_per_million'].transform('mean')
)

In [15]:
# Impute missing total_tests with the per-location mean. Assign the result back instead of
# using chained fillna(inplace=True), which does not update df under pandas Copy-on-Write.
df['total_tests'] = df['total_tests'].fillna(
    df.groupby('location')['total_tests'].transform('mean')
)

In [16]:
# Impute missing total_tests_per_thousand with the per-location mean. Assign the result back instead
# of using chained fillna(inplace=True), which does not update df under pandas Copy-on-Write.
df['total_tests_per_thousand'] = df['total_tests_per_thousand'].fillna(
    df.groupby('location')['total_tests_per_thousand'].transform('mean')
)

In [18]:
# Impute missing new_tests_per_thousand with the per-location mean. Assign the result back instead
# of using chained fillna(inplace=True), which does not update df under pandas Copy-on-Write.
df['new_tests_per_thousand'] = df['new_tests_per_thousand'].fillna(
    df.groupby('location')['new_tests_per_thousand'].transform('mean')
)

In [19]:
# Impute missing new_tests_per_thousand with the per-location mean. Assign the result back instead
# of using chained fillna(inplace=True), which does not update df under pandas Copy-on-Write.
# NOTE(review): this repeats the previous cell's column, so it fills nothing new — possibly a
# different column (e.g. new_tests) was intended; confirm or delete.
df['new_tests_per_thousand'] = df['new_tests_per_thousand'].fillna(
    df.groupby('location')['new_tests_per_thousand'].transform('mean')
)

In [20]:
# Impute missing positive_rate with the per-location mean. Assign the result back instead of
# using chained fillna(inplace=True), which does not update df under pandas Copy-on-Write.
df['positive_rate'] = df['positive_rate'].fillna(
    df.groupby('location')['positive_rate'].transform('mean')
)

In [21]:
# Impute missing total_vaccinations with the per-location mean. Assign the result back instead of
# using chained fillna(inplace=True), which does not update df under pandas Copy-on-Write.
df['total_vaccinations'] = df['total_vaccinations'].fillna(
    df.groupby('location')['total_vaccinations'].transform('mean')
)

In [22]:
# Impute missing total_vaccinations with the per-location mean. Assign the result back instead of
# using chained fillna(inplace=True), which does not update df under pandas Copy-on-Write.
# NOTE(review): this repeats the previous cell's column, so it fills nothing new — possibly a
# different column was intended; confirm or delete.
df['total_vaccinations'] = df['total_vaccinations'].fillna(
    df.groupby('location')['total_vaccinations'].transform('mean')
)

In [23]:
# Impute missing people_vaccinated with the per-location mean. Assign the result back instead of
# using chained fillna(inplace=True), which does not update df under pandas Copy-on-Write.
df['people_vaccinated'] = df['people_vaccinated'].fillna(
    df.groupby('location')['people_vaccinated'].transform('mean')
)

In [24]:
# Impute missing stringency_index with the per-location mean. Assign the result back instead of
# using chained fillna(inplace=True), which does not update df under pandas Copy-on-Write.
df['stringency_index'] = df['stringency_index'].fillna(
    df.groupby('location')['stringency_index'].transform('mean')
)

In [25]:
# Impute missing population with the per-location mean. Assign the result back instead of
# using chained fillna(inplace=True), which does not update df under pandas Copy-on-Write.
df['population'] = df['population'].fillna(
    df.groupby('location')['population'].transform('mean')
)

In [26]:
# Impute missing population_density with the per-location mean. Assign the result back instead of
# using chained fillna(inplace=True), which does not update df under pandas Copy-on-Write.
df['population_density'] = df['population_density'].fillna(
    df.groupby('location')['population_density'].transform('mean')
)

In [27]:
# Impute missing aged_65_older with the per-location mean. Assign the result back instead of
# using chained fillna(inplace=True), which does not update df under pandas Copy-on-Write.
df['aged_65_older'] = df['aged_65_older'].fillna(
    df.groupby('location')['aged_65_older'].transform('mean')
)

In [28]:
# Impute missing gdp_per_capita with the per-location mean. Assign the result back instead of
# using chained fillna(inplace=True), which does not update df under pandas Copy-on-Write.
df['gdp_per_capita'] = df['gdp_per_capita'].fillna(
    df.groupby('location')['gdp_per_capita'].transform('mean')
)

In [29]:
# Impute missing handwashing_facilities with the per-location mean. Assign the result back instead
# of using chained fillna(inplace=True), which does not update df under pandas Copy-on-Write.
df['handwashing_facilities'] = df['handwashing_facilities'].fillna(
    df.groupby('location')['handwashing_facilities'].transform('mean')
)

In [32]:
# Impute missing new_cases_smoothed with the per-location mean. Assign the result back instead of
# using chained fillna(inplace=True), which does not update df under pandas Copy-on-Write.
df['new_cases_smoothed'] = df['new_cases_smoothed'].fillna(
    df.groupby('location')['new_cases_smoothed'].transform('mean')
)

In [33]:
# Impute missing new_deaths_per_million with the per-location mean. Assign the result back instead
# of using chained fillna(inplace=True), which does not update df under pandas Copy-on-Write.
df['new_deaths_per_million'] = df['new_deaths_per_million'].fillna(
    df.groupby('location')['new_deaths_per_million'].transform('mean')
)

In [35]:
# Impute missing tests_per_case with the per-location mean. Assign the result back instead of
# using chained fillna(inplace=True), which does not update df under pandas Copy-on-Write.
df['tests_per_case'] = df['tests_per_case'].fillna(
    df.groupby('location')['tests_per_case'].transform('mean')
)