# Data Loading & Exploration

In [2]:
import pandas as pd

# Load the dataset
df = pd.read_csv('owid-covid-data.csv')

# Check the columns
print(df.columns)

# Preview the first few rows
print(df.head())

# Identify missing values
print(df.isnull().sum())


Index(['iso_code', 'continent', 'location', 'date', 'total_cases', 'new_cases',
       'new_cases_smoothed', 'total_deaths', 'new_deaths',
       'new_deaths_smoothed', 'total_cases_per_million',
       'new_cases_per_million', 'new_cases_smoothed_per_million',
       'total_deaths_per_million', 'new_deaths_per_million',
       'new_deaths_smoothed_per_million', 'reproduction_rate', 'icu_patients',
       'icu_patients_per_million', 'hosp_patients',
       'hosp_patients_per_million', 'weekly_icu_admissions',
       'weekly_icu_admissions_per_million', 'weekly_hosp_admissions',
       'weekly_hosp_admissions_per_million', 'total_tests', 'new_tests',
       'total_tests_per_thousand', 'new_tests_per_thousand',
       'new_tests_smoothed', 'new_tests_smoothed_per_thousand',
       'positive_rate', 'tests_per_case', 'tests_units', 'total_vaccinations',
       'people_vaccinated', 'people_fully_vaccinated', 'total_boosters',
       'new_vaccinations', 'new_vaccinations_smoothed',
       't

# Data Cleaning

In [3]:
# Filter countries of interest
countries_of_interest = ['Kenya', 'USA', 'India']
df_filtered = df[df['location'].isin(countries_of_interest)]

# Drop rows with missing dates or critical values
df_filtered = df_filtered.dropna(subset=['date', 'total_cases', 'total_deaths'])

# Convert 'date' column to datetime
df_filtered['date'] = pd.to_datetime(df_filtered['date'])

# Handle missing numeric values by filling them with the previous value
df_filtered['total_cases'].fillna(method='ffill', inplace=True)
df_filtered['total_deaths'].fillna(method='ffill', inplace=True)
df_filtered['total_vaccinations'].fillna(method='ffill', inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_filtered['total_cases'].fillna(method='ffill', inplace=True)
  df_filtered['total_cases'].fillna(method='ffill', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_filtered['total_deaths'].fillna(method='ffill', inplace=True)
  df_filtered['total_deaths'].fillna(

# Exploratory Data Analysis (EDA)

In [7]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plot total cases over time for selected countries
plt.figure(figsize=(10, 6))
sns.lineplot(data=df_filtered, x='date', y='total_cases', hue='location')
plt.title('Total COVID-19 Cases Over Time')
plt.xlabel('Date')
plt.ylabel('Total Cases')
plt.legend(title='Country')
plt.xticks(rotation=45)
plt.show()

# Plot total deaths over time
plt.figure(figsize=(10, 6))
sns.lineplot(data=df_filtered, x='date', y='total_deaths', hue='location')
plt.title('Total COVID-19 Deaths Over Time')
plt.xlabel('Date')
plt.ylabel('Total Deaths')
plt.legend(title='Country')
plt.xticks(rotation=45)
plt.show()

# Compare daily new cases between countries
plt.figure(figsize=(10, 6))
sns.lineplot(data=df_filtered, x='date', y='new_cases', hue='location')
plt.title('Daily New COVID-19 Cases')
plt.xlabel('Date')
plt.ylabel('New Cases')
plt.legend(title='Country')
plt.xticks(rotation=45)
plt.show()

# Calculate the death rate: total_deaths / total_cases
df_filtered['death_rate'] = df_filtered['total_deaths'] / df_filtered['total_cases']
print(df_filtered[['location', 'death_rate']].drop_duplicates())


ModuleNotFoundError: No module named 'seaborn'

# Visualizing Vaccination Progress

In [None]:
# Plot cumulative vaccinations over time for selected countries
plt.figure(figsize=(10, 6))
sns.lineplot(data=df_filtered, x='date', y='total_vaccinations', hue='location')
plt.title('Cumulative Vaccinations Over Time')
plt.xlabel('Date')
plt.ylabel('Total Vaccinations')
plt.legend(title='Country')
plt.xticks(rotation=45)
plt.show()

# Compare % vaccinated population
df_filtered['vaccinated_percentage'] = (df_filtered['total_vaccinations'] / df_filtered['total_cases']) * 100
plt.figure(figsize=(10, 6))
sns.barplot(data=df_filtered, x='location', y='vaccinated_percentage')
plt.title('Vaccinated Percentage by Country')
plt.xlabel('Country')
plt.ylabel('Vaccinated Percentage')
plt.show()
