In [3]:
pip install pandas numpy matplotlib seaborn plotly jupyter scikit-learn




Collecting pandas
  Using cached pandas-2.3.3-cp313-cp313-win_amd64.whl.metadata (19 kB)
Collecting numpy
  Using cached numpy-2.3.4-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting matplotlib
  Downloading matplotlib-3.10.7-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting plotly
  Downloading plotly-6.3.1-py3-none-any.whl.metadata (8.5 kB)
Collecting jupyter
  Downloading jupyter-1.1.1-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.3.3-cp313-cp313-win_amd64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib)
  Dow

In [1]:
import pandas as pd

import numpy as np

import matplotlib as mlt

import seaborn as sns

In [2]:
# Load the CSV that sits next to the notebook (path is relative to the working
# directory) and preview its first five rows.
csv_path = "owid-covid-data.csv"
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-01-05,0.0,0.0,,0.0,0.0,,...,,37.75,0.5,64.83,0.51,41128772,,,,
1,AFG,Asia,Afghanistan,2020-01-06,0.0,0.0,,0.0,0.0,,...,,37.75,0.5,64.83,0.51,41128772,,,,
2,AFG,Asia,Afghanistan,2020-01-07,0.0,0.0,,0.0,0.0,,...,,37.75,0.5,64.83,0.51,41128772,,,,
3,AFG,Asia,Afghanistan,2020-01-08,0.0,0.0,,0.0,0.0,,...,,37.75,0.5,64.83,0.51,41128772,,,,
4,AFG,Asia,Afghanistan,2020-01-09,0.0,0.0,,0.0,0.0,,...,,37.75,0.5,64.83,0.51,41128772,,,,


In [3]:
# Inspect the dtype pandas inferred for each column when reading the CSV.
data.dtypes

iso_code                                    object
continent                                   object
location                                    object
date                                        object
total_cases                                float64
                                            ...   
population                                   int64
excess_mortality_cumulative_absolute       float64
excess_mortality_cumulative                float64
excess_mortality                           float64
excess_mortality_cumulative_per_million    float64
Length: 67, dtype: object

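The columns include `float64`, `object`, and `int64` dtypes. Note that `date` is still stored as `object`, so it has not been parsed as a datetime yet.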

In [4]:
# Parse the `date` column (read from the CSV as plain strings) into datetime64,
# then list the dtypes again to confirm the conversion took effect.
parsed_dates = pd.to_datetime(data["date"])
data["date"] = parsed_dates
data.dtypes

iso_code                                           object
continent                                          object
location                                           object
date                                       datetime64[ns]
total_cases                                       float64
                                                ...      
population                                          int64
excess_mortality_cumulative_absolute              float64
excess_mortality_cumulative                       float64
excess_mortality                                  float64
excess_mortality_cumulative_per_million           float64
Length: 67, dtype: object

In [5]:
# Print a per-column overview of the frame: non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 429435 entries, 0 to 429434
Data columns (total 67 columns):
 #   Column                                      Non-Null Count   Dtype         
---  ------                                      --------------   -----         
 0   iso_code                                    429435 non-null  object        
 1   continent                                   402910 non-null  object        
 2   location                                    429435 non-null  object        
 3   date                                        429435 non-null  datetime64[ns]
 4   total_cases                                 411804 non-null  float64       
 5   new_cases                                   410159 non-null  float64       
 6   new_cases_smoothed                          408929 non-null  float64       
 7   total_deaths                                411804 non-null  float64       
 8   new_deaths                                  410608 non-null  float64      

In [6]:
# Report the frame's dimensions and the complete list of column names.
print(f"Rows and columns: {data.shape}")
print(f"Column names: {data.columns.tolist()}")


Rows and columns: (429435, 67)
Column names: ['iso_code', 'continent', 'location', 'date', 'total_cases', 'new_cases', 'new_cases_smoothed', 'total_deaths', 'new_deaths', 'new_deaths_smoothed', 'total_cases_per_million', 'new_cases_per_million', 'new_cases_smoothed_per_million', 'total_deaths_per_million', 'new_deaths_per_million', 'new_deaths_smoothed_per_million', 'reproduction_rate', 'icu_patients', 'icu_patients_per_million', 'hosp_patients', 'hosp_patients_per_million', 'weekly_icu_admissions', 'weekly_icu_admissions_per_million', 'weekly_hosp_admissions', 'weekly_hosp_admissions_per_million', 'total_tests', 'new_tests', 'total_tests_per_thousand', 'new_tests_per_thousand', 'new_tests_smoothed', 'new_tests_smoothed_per_thousand', 'positive_rate', 'tests_per_case', 'tests_units', 'total_vaccinations', 'people_vaccinated', 'people_fully_vaccinated', 'total_boosters', 'new_vaccinations', 'new_vaccinations_smoothed', 'total_vaccinations_per_hundred', 'people_vaccinated_per_hundred', '

In [7]:
# Count the missing values in every column.
data.isna().sum()

iso_code                                        0
continent                                   26525
location                                        0
date                                            0
total_cases                                 17631
                                            ...  
population                                      0
excess_mortality_cumulative_absolute       416024
excess_mortality_cumulative                416024
excess_mortality                           416024
excess_mortality_cumulative_per_million    416024
Length: 67, dtype: int64

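Several columns contain missing values: `continent` is missing in 26,525 rows, `total_cases` in 17,631 rows, and each of the four `excess_mortality*` columns in 416,024 rows (about 97% of the 429,435 rows).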

In [8]:
# Backfill missing `continent` values with the row's own `location`.
# NOTE(review): the rows lacking a continent look like aggregate entities rather
# than countries (population peaks at ~7.98 billion, i.e. a world-level row) —
# confirm, and consider excluding them before any per-continent grouping.
has_continent = data['continent'].notna()
data['continent'] = data['continent'].where(has_continent, data['location'])


In [9]:
# Remove the four excess-mortality columns: each is missing in 416,024 of the
# 429,435 rows, so they carry almost no information.
# `errors="ignore"` keeps the cell safe to re-run after the columns are gone.
excess_mortality_cols = [
    "excess_mortality_cumulative_absolute",
    "excess_mortality_cumulative",
    "excess_mortality",
    "excess_mortality_cumulative_per_million",
]
data = data.drop(columns=excess_mortality_cols, errors="ignore")


In [10]:
# Summary statistics (count, mean, min, quartiles, max, std) for the datetime
# and numeric columns of the frame.
data.describe()

Unnamed: 0,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,...,extreme_poverty,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population
count,429435,411804.0,410159.0,408929.0,411804.0,410608.0,409378.0,411804.0,410159.0,408929.0,...,211996.0,328865.0,345911.0,247165.0,243817.0,161741.0,290689.0,390299.0,319127.0,429435.0
mean,2022-04-21 01:06:25.463691008,7365292.0,8017.36,8041.026,81259.57,71.852139,72.060828,112096.19942,122.357073,122.713852,...,13.924729,264.639534,8.556055,10.772438,33.097758,50.64939,3.106895,73.702098,0.722178,152033600.0
min,2020-01-01 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.1,79.37,0.99,0.1,7.7,1.19,0.1,53.28,0.39,47.0
25%,2021-03-05 00:00:00,6280.75,0.0,0.0,43.0,0.0,0.0,1916.1,0.0,0.0,...,0.6,175.7,5.35,1.9,22.6,20.86,1.3,69.5,0.6,523798.0
50%,2022-04-20 00:00:00,63653.0,0.0,12.0,799.0,0.0,0.0,29145.48,0.0,2.79,...,2.5,245.46,7.2,6.3,33.1,49.54,2.5,75.05,0.74,6336393.0
75%,2023-06-08 00:00:00,758272.0,0.0,313.29,9574.0,0.0,3.14,156770.19,0.0,56.25,...,21.4,333.44,10.79,19.3,41.5,82.5,4.21,79.46,0.83,32969520.0
max,2024-08-14 00:00:00,775866800.0,44236230.0,6319461.0,7057132.0,103719.0,14817.0,763598.6,241758.23,34536.89,...,77.6,724.42,30.53,44.0,78.1,100.0,13.8,86.75,0.96,7975105000.0
std,,44775820.0,229664.9,86616.11,441190.1,1368.32299,513.636565,162240.412405,1508.778585,559.701663,...,20.073912,120.756698,4.934656,10.761091,13.853952,31.905236,2.549168,7.387914,0.149237,697540800.0


In [11]:
# Summary statistics restricted to the headline case, death and population
# columns. `numeric_cols` is reused by the following cell.
numeric_cols = [
    "new_cases",
    "new_deaths",
    "total_cases",
    "total_deaths",
    "population",
]
data.loc[:, numeric_cols].describe()


Unnamed: 0,new_cases,new_deaths,total_cases,total_deaths,population
count,410159.0,410608.0,411804.0,411804.0,429435.0
mean,8017.36,71.852139,7365292.0,81259.57,152033600.0
std,229664.9,1368.32299,44775820.0,441190.1,697540800.0
min,0.0,0.0,0.0,0.0,47.0
25%,0.0,0.0,6280.75,43.0,523798.0
50%,0.0,0.0,63653.0,799.0,6336393.0
75%,0.0,0.0,758272.0,9574.0,32969520.0
max,44236230.0,103719.0,775866800.0,7057132.0,7975105000.0


In [12]:
# Minimum, maximum and mean of each column listed in `numeric_cols`.
data[numeric_cols].aggregate(["min", "max", "mean"])


Unnamed: 0,new_cases,new_deaths,total_cases,total_deaths,population
min,0.0,0.0,0.0,0.0,47.0
max,44236230.0,103719.0,775866800.0,7057132.0,7975105000.0
mean,8017.36,71.852139,7365292.0,81259.57,152033600.0
