## Importing the required libraries

In [1]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt

### Read the data


In [2]:
# Load the per-country daily vaccination records and preview the first ten rows.
df = pd.read_csv(filepath_or_buffer="country_vaccination_stats.csv")
df.head(n=10)

Unnamed: 0,country,date,daily_vaccinations,vaccines
0,Argentina,12/29/2020,,Sputnik V
1,Argentina,12/30/2020,15656.0,Sputnik V
2,Argentina,12/31/2020,15656.0,Sputnik V
3,Argentina,1/1/2021,11070.0,Sputnik V
4,Argentina,1/2/2021,8776.0,Sputnik V
5,Argentina,1/3/2021,7400.0,Sputnik V
6,Argentina,1/4/2021,6483.0,Sputnik V
7,Argentina,1/5/2021,7984.0,Sputnik V
8,Argentina,1/6/2021,8173.0,Sputnik V
9,Argentina,1/7/2021,8363.0,Sputnik V


In [4]:
# Count the missing values in each column
df.isna().sum()

country                0
date                   0
daily_vaccinations    60
vaccines               0
dtype: int64

In [3]:
# Number of distinct countries in the dataset
df["country"].nunique()

60

In [6]:
# Number of distinct countries that have at least one missing daily_vaccinations value
df.loc[df["daily_vaccinations"].isna(), "country"].nunique()

60

### Question-4: Impute missing daily vaccinations with each country's minimum

In [109]:
# Minimum reported daily vaccination number for each country (missing values are skipped)
country_min_vaccinations = df.groupby("country")["daily_vaccinations"].agg("min")
country_min_vaccinations.head()

country
Argentina    6483.0
Austria      3368.0
Bahrain       943.0
Belgium         1.0
Brazil        112.0
Name: daily_vaccinations, dtype: float64

In [110]:
# Impute missing daily_vaccinations with the minimum daily vaccination number
# of the same country.
# Vectorised: map every row's country to its minimum once and let fillna touch
# only the missing entries, instead of re-scanning the whole frame with a
# boolean mask for each country. Assignment is positional row-for-row, so it
# cannot overwrite non-missing rows even if the index held duplicate labels.
# Rows whose country has no reported value at all stay NaN here.
df["daily_vaccinations"] = df["daily_vaccinations"].fillna(
    df["country"].map(country_min_vaccinations)
)

In [116]:
# Replace any daily_vaccinations values that are still missing with 0.
# Only this column is filled: a blanket df.fillna(0) would also write a numeric 0
# into country/date/vaccines if they ever contained gaps, hiding bad rows.
df["daily_vaccinations"] = df["daily_vaccinations"].fillna(0)

In [117]:
# Re-count the missing values in each column to confirm the imputation
df.isna().sum()

country               0
date                  0
daily_vaccinations    0
vaccines              0
dtype: int64

### Question-5: Top-3 countries by median daily vaccinations


In [15]:
# Top-3 countries ranked by median daily vaccination number
(
    df.groupby("country")["daily_vaccinations"]
    .median()
    .sort_values(ascending=False)
    .iloc[:3]
)

country
United States    378253.0
China            276786.0
India            173922.0
Name: daily_vaccinations, dtype: float64

### Question-6: Total vaccinations on 1/6/2021

In [22]:
# Total vaccinations summed over every row dated 1/6/2021 (MM/DD/YYYY)
df.loc[df["date"].eq("1/6/2021"), "daily_vaccinations"].sum()

1485255.0

### Question-7: SQL query to fill missing values with the country median

In [None]:
# SQL query to fill in the missing daily vaccination numbers with the discrete
# median of each country (0 when the country has no reported value at all).
# The statement is kept in a Python string so this cell is valid Python and does
# not raise a SyntaxError when the notebook is run; execute it with a SQL client.
# PERCENTILE_DISC(0.5) always returns a value that exists in the column, which is
# the discrete median; MEDIAN() is not portable and, in engines that provide it
# (e.g. Oracle), interpolates between the two middle values.
# NOTE(review): assumes an engine with ordered-set aggregates (PostgreSQL, Oracle,
# Snowflake, ...) -- confirm against the target database.
FILL_MISSING_WITH_MEDIAN_SQL = """
UPDATE vaccinations v1
SET daily_vaccinations = (
  SELECT COALESCE(
           PERCENTILE_DISC(0.5) WITHIN GROUP (ORDER BY v2.daily_vaccinations),
           0
         )
  FROM vaccinations v2
  WHERE v2.country = v1.country AND v2.daily_vaccinations IS NOT NULL
)
WHERE v1.daily_vaccinations IS NULL;
"""
