In [3]:
import numpy as np
import pandas as pd
import dash
from dash import dcc
from dash import html
from dash.dependencies import Input, Output
from sqlalchemy import create_engine
import psycopg2
import plotly.figure_factory as ff
import os
# Pull the password from the process environment instead of hardcoding it.
# The value is None when the variable is not set.
# NOTE(review): presumably consumed by a psycopg2/SQLAlchemy connection later on;
# no use is visible in this section - confirm.
POSTGRES_PASSWORD = os.environ.get('POSTGRES_PASSWORD')

In [5]:
# Read owid-covid-data.csv (path is relative to the current working directory)
# into a DataFrame using pandas' default parsing options.
dataset = pd.read_csv(filepath_or_buffer="owid-covid-data.csv")

In [14]:
# Preview the top of the frame; n=5 is the pandas default, spelled out here.
# NOTE(review): the saved output below lists ten rows (0-9), so it was likely
# produced by an earlier head(10) run - re-execute the cell to refresh it.
dataset.head(n=5)

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-01-03,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
1,AFG,Asia,Afghanistan,2020-01-04,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
2,AFG,Asia,Afghanistan,2020-01-05,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
3,AFG,Asia,Afghanistan,2020-01-06,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
4,AFG,Asia,Afghanistan,2020-01-07,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
5,AFG,Asia,Afghanistan,2020-01-08,,0.0,0.0,,0.0,0.0,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
6,AFG,Asia,Afghanistan,2020-01-09,,0.0,0.0,,0.0,0.0,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
7,AFG,Asia,Afghanistan,2020-01-10,,0.0,0.0,,0.0,0.0,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
8,AFG,Asia,Afghanistan,2020-01-11,,0.0,0.0,,0.0,0.0,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
9,AFG,Asia,Afghanistan,2020-01-12,,0.0,0.0,,0.0,0.0,...,,37.746,0.5,64.83,0.511,41128772.0,,,,


In [13]:
# Collect the frame's column labels and print them as plain text.
# DataFrame.keys() hands back the column Index (the same object as .columns).
column_names = dataset.keys()
print(column_names)

Index(['iso_code', 'continent', 'location', 'date', 'total_cases', 'new_cases',
       'new_cases_smoothed', 'total_deaths', 'new_deaths',
       'new_deaths_smoothed', 'total_cases_per_million',
       'new_cases_per_million', 'new_cases_smoothed_per_million',
       'total_deaths_per_million', 'new_deaths_per_million',
       'new_deaths_smoothed_per_million', 'reproduction_rate', 'icu_patients',
       'icu_patients_per_million', 'hosp_patients',
       'hosp_patients_per_million', 'weekly_icu_admissions',
       'weekly_icu_admissions_per_million', 'weekly_hosp_admissions',
       'weekly_hosp_admissions_per_million', 'total_tests', 'new_tests',
       'total_tests_per_thousand', 'new_tests_per_thousand',
       'new_tests_smoothed', 'new_tests_smoothed_per_thousand',
       'positive_rate', 'tests_per_case', 'tests_units', 'total_vaccinations',
       'people_vaccinated', 'people_fully_vaccinated', 'total_boosters',
       'new_vaccinations', 'new_vaccinations_smoothed',
       't

In [10]:
# 1. Data Summary
# DataFrame.info() writes its report (index range, per-column non-null counts
# and dtypes) directly to stdout and returns None. Wrapping it in print()
# therefore appends a stray "None" line after the report, so call it bare.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 355599 entries, 0 to 355598
Data columns (total 67 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   iso_code                                    355599 non-null  object 
 1   continent                                   338653 non-null  object 
 2   location                                    355599 non-null  object 
 3   date                                        355599 non-null  object 
 4   total_cases                                 317456 non-null  float64
 5   new_cases                                   345855 non-null  float64
 6   new_cases_smoothed                          344596 non-null  float64
 7   total_deaths                                295737 non-null  float64
 8   new_deaths                                  345907 non-null  float64
 9   new_deaths_smoothed                         344677 non-null  float64
 

In [11]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns.
# Left as the cell's last expression so Jupyter renders the result as a table;
# print() would fall back to the plain-text layout that wraps wide frames
# across several "\"-continued chunks.
dataset.describe()

        total_cases     new_cases  new_cases_smoothed  total_deaths  \
count  3.174560e+05  3.458550e+05        3.445960e+05  2.957370e+05   
mean   6.796974e+06  9.457200e+03        9.491605e+03  8.666838e+04   
std    4.123893e+07  1.094194e+05        9.374232e+04  4.426444e+05   
min    1.000000e+00  0.000000e+00        0.000000e+00  1.000000e+00   
25%    8.285000e+03  0.000000e+00        2.860000e-01  1.280000e+02   
50%    7.143600e+04  1.000000e+00        2.357100e+01  1.341000e+03   
75%    7.667340e+05  2.510000e+02        4.821430e+02  1.198800e+04   
max    7.718202e+08  8.401960e+06        6.402036e+06  6.978162e+06   

         new_deaths  new_deaths_smoothed  total_cases_per_million  \
count  345907.00000        344677.000000            317456.000000   
mean       84.23524            84.532007            103457.466834   
std       608.98649           554.835927            152731.556394   
min         0.00000             0.000000                 0.000000   
25%         0.0

In [15]:
# Per-column tally of missing cells: flag each missing entry, then add the
# flags up column by column.
missing_values_count = dataset.isna().sum()
print(missing_values_count)


iso_code                                        0
continent                                   16946
location                                        0
date                                            0
total_cases                                 38143
                                            ...  
population                                      0
excess_mortality_cumulative_absolute       343388
excess_mortality_cumulative                343388
excess_mortality                           343388
excess_mortality_cumulative_per_million    343388
Length: 67, dtype: int64


In [16]:
# Share of missing cells per column, expressed as a percentage of the total
# number of rows in the frame.
missing_values_percentage = dataset.isna().sum().div(len(dataset)).mul(100)
print(missing_values_percentage)


iso_code                                    0.000000
continent                                   4.765480
location                                    0.000000
date                                        0.000000
total_cases                                10.726408
                                             ...    
population                                  0.000000
excess_mortality_cumulative_absolute       96.566076
excess_mortality_cumulative                96.566076
excess_mortality                           96.566076
excess_mortality_cumulative_per_million    96.566076
Length: 67, dtype: float64
