In [4]:
# 📦 Import Required Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Optional tools
import plotly.express as px
import geopandas as gpd
import geopandas.tools
# Apply seaborn's "whitegrid" style globally so every figure drawn later in the
# notebook shares the same look.
sns.set(style="whitegrid")

In [None]:
# Read the OWID COVID-19 CSV; the path is relative to the notebook's working directory.
DATA_PATH = 'owid-covid-data.csv'
df = pd.read_csv(DATA_PATH)

# Render the first rows as a quick sanity check of the load.
df.head()

  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/allen-institute-for-ai/CORD-19-research-challenge?dataset_version_number=111...


  1%|          | 165M/18.4G [11:48<22:15:13, 244kB/s] 


KeyboardInterrupt: 

In [None]:
# Check dataset structure.
# A notebook cell only renders its final expression, so every earlier result is
# printed explicitly; otherwise it would be computed and silently discarded.
print(df.columns)

# Check for missing values (count of nulls per column)
print(df.isnull().sum())

# Check data types (last expression, rendered as the cell's output)
df.dtypes

In [None]:
# Convert 'date' to datetime so rows can be ordered and compared chronologically
df['date'] = pd.to_datetime(df['date'])

# Filter for selected countries; sort so each country's rows are in date order
# and copy so later column assignments never touch (or warn about) `df`.
countries = ['Kenya', 'United States', 'India']
df_filtered = (
    df[df['location'].isin(countries)]
    .sort_values(['location', 'date'])
    .copy()
)

# Handle missing values: forward-fill *within* each country only.
# A frame-wide ffill would carry the last values of one country into the leading
# gaps of the next one; grouping by 'location' keeps every series independent.
# (This also avoids the deprecated `fillna(method='ffill')` spelling.)
filled = df_filtered.groupby('location').ffill()
df_filtered[filled.columns] = filled

In [None]:
# Line plot: cumulative cases over time, one line per selected country.
# Uses the explicit Figure/Axes interface rather than the pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 6))
for country in countries:
    rows = df_filtered.loc[df_filtered['location'] == country]
    ax.plot(rows['date'], rows['total_cases'], label=country)
ax.set_title("Total COVID-19 Cases Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("Total Cases")
ax.legend()
plt.show()

In [None]:
# Bar chart: Total cases by country (latest date)
# Take each country's own most recent row that reports total_cases. Filtering on
# the single global maximum date would silently drop any country whose series
# ends earlier than the others.
latest = (
    df_filtered.dropna(subset=['total_cases'])
    .sort_values('date')
    .groupby('location')
    .tail(1)
)
latest = latest[['location', 'total_cases']].sort_values(by='total_cases', ascending=False)

sns.barplot(data=latest, x='location', y='total_cases')
plt.title("Total Cases by Country (Latest Date)")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Death rate analysis: cumulative deaths as a share of cumulative cases.
# Rows with zero cases are masked to NaN before dividing, so the ratio is NaN
# (undefined) there instead of +inf, which would distort plots and aggregates.
reported_cases = df_filtered['total_cases'].where(df_filtered['total_cases'] > 0)
df_filtered['death_rate'] = df_filtered['total_deaths'] / reported_cases

In [None]:
# Line plot: cumulative vaccinations over time, one line per selected country.
# Uses the explicit Figure/Axes interface rather than the pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 6))
for country in countries:
    rows = df_filtered.loc[df_filtered['location'] == country]
    ax.plot(rows['date'], rows['total_vaccinations'], label=country)
ax.set_title("Total Vaccinations Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("Total Vaccinations")
ax.legend()
plt.show()

In [None]:
# Latest data by country.
# Use each location's most recent row that actually reports total_cases.
# Filtering on the global maximum date and then dropping nulls discards every
# location that has no figure for that final day.
latest_data = (
    df.dropna(subset=['total_cases'])
    .sort_values('date')
    .groupby('location')
    .tail(1)
)
map_data = latest_data[['iso_code', 'location', 'total_cases']].dropna()

# World choropleth of cumulative cases. `px` (plotly.express) is already imported
# in the first cell, so no extra import is needed here.
# NOTE(review): assumes `iso_code` holds ISO-3 country codes, which is
# px.choropleth's default location mode; rows with other codes are not drawn.
fig = px.choropleth(map_data,
                    locations="iso_code",
                    color="total_cases",
                    hover_name="location",
                    title="Global COVID-19 Cases",
                    color_continuous_scale="OrRd")
fig.show()

### 📌 Key Insights:

1. India had a rapid surge in cases during early 2021.
2. Among the three countries compared, the United States leads in both total cases and total vaccinations administered.
3. Kenya shows lower case counts but also slower vaccine distribution.

### 🧠 Observations:

- Death rates (total deaths divided by total cases) fluctuate widely, and in some countries they correlate with total case counts.
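- Vaccination efforts appear to have significantly reduced daily new cases in some regions; this notebook does not plot daily new cases, so confirming the effect requires a dedicated analysis.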