# COVID-19 Global Data Tracker Project

This notebook analyzes global COVID-19 data using the dataset `Covid Data.csv`. We will load, clean, and explore the data, focusing on key countries and metrics such as cases, deaths, and vaccinations.

## 1. Data Loading & Exploration

In this section, we will load the dataset, inspect its structure, and identify missing values.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np

In [1]:
# Read the COVID-19 CSV (path is relative to the notebook's working directory)
csv_path = 'Covid Data.csv'
df = pd.read_csv(csv_path)

NameError: name 'pd' is not defined

In [None]:
# Display the column labels of the loaded frame. Later cells index columns by
# name ('location', 'date', 'total_cases', ...), so this is the place to confirm
# those labels actually exist in the file.
df.columns

In [None]:
# Show the top five records as a quick sanity check of the load
df.head(n=5)

In [None]:
# Per-column count of null entries in the raw frame
df.isna().sum()

## 2. Data Cleaning

We will filter for the countries of interest (Kenya, the United States, and India), handle missing values, and ensure correct data types.

In [2]:
# Restrict the analysis to the countries under study; .copy() gives an
# independent frame so later column assignments do not touch `df`.
countries_of_interest = ['Kenya', 'United States', 'India']
in_scope = df['location'].isin(countries_of_interest)
df_filtered = df.loc[in_scope].copy()

NameError: name 'df' is not defined

In [None]:
# Discard every row that lacks a date or either cumulative count
critical_columns = ['date', 'total_cases', 'total_deaths']
df_filtered = df_filtered.dropna(axis=0, how='any', subset=critical_columns)

In [None]:
# Parse the 'date' column into pandas datetime values
parsed_dates = pd.to_datetime(df_filtered['date'])
df_filtered['date'] = parsed_dates

In [None]:
# Handle missing numeric values: fill gaps by interpolation, one country at a time.
# df_filtered stacks several countries in a single frame, so interpolating a whole
# column in one pass would fill the missing values at the start of one country
# from the last values of the country stored above it. Grouping by 'location'
# keeps every fill inside its own country's series.
# NOTE(review): assumes rows are in chronological order within each location — confirm.
numeric_cols = ['total_cases', 'total_deaths', 'new_cases', 'new_deaths', 'total_vaccinations']
for col in numeric_cols:
    # Tolerate datasets that do not carry every listed column
    if col in df_filtered.columns:
        df_filtered[col] = (
            df_filtered.groupby('location')[col]
            .transform(lambda series: series.interpolate())
        )

In [None]:
# Per-column count of nulls still present after cleaning
df_filtered.isna().sum()

## 3. Exploratory Data Analysis (EDA)

Let's explore trends in cases and deaths, compare countries, and calculate the death rate.

In [None]:


import matplotlib.pyplot as plt
import seaborn as sns

# Set plot style. `sns.set` is only an alias of `set_theme`, which seaborn
# documents as the preferred interface (available from seaborn 0.11).
sns.set_theme(style="whitegrid")

In [None]:
# Cumulative cases: one curve per country on a shared axis
fig, ax = plt.subplots(figsize=(12, 6))
for name in countries_of_interest:
    rows = df_filtered.loc[df_filtered['location'] == name]
    ax.plot(rows['date'], rows['total_cases'], label=name)
ax.set_title('Total COVID-19 Cases Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Total Cases')
ax.legend()
plt.show()

In [None]:
# Cumulative deaths: every country drawn on the same figure
plt.figure(figsize=(12, 6))
for country in countries_of_interest:
    is_country = df_filtered['location'] == country
    plt.plot(
        df_filtered.loc[is_country, 'date'],
        df_filtered.loc[is_country, 'total_deaths'],
        label=country,
    )
plt.gca().set(title='Total COVID-19 Deaths Over Time', xlabel='Date', ylabel='Total Deaths')
plt.legend()
plt.show()

In [None]:
# Daily new cases, overlaid per country for direct comparison
figure, axes = plt.subplots(figsize=(12, 6))
for place in countries_of_interest:
    subset = df_filtered[df_filtered['location'].eq(place)]
    axes.plot(subset['date'], subset['new_cases'], label=place)
axes.set_title('Daily New COVID-19 Cases')
axes.set_xlabel('Date')
axes.set_ylabel('New Cases')
axes.legend()
plt.show()

In [None]:
# Death rate over time, defined here as cumulative deaths / cumulative cases
fig, ax = plt.subplots(figsize=(12, 6))
for country in countries_of_interest:
    # .assign returns a new frame, so df_filtered itself is left untouched
    country_data = df_filtered[df_filtered['location'] == country].assign(
        death_rate=lambda frame: frame['total_deaths'] / frame['total_cases']
    )
    ax.plot(country_data['date'], country_data['death_rate'], label=country)
ax.set_title('COVID-19 Death Rate Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Death Rate')
ax.legend()
plt.show()

In [None]:
# Optional: correlation heatmap for the numeric columns.
# Only the columns actually present in df_filtered are used: selecting a missing
# label raises KeyError, and the interpolation step already tolerates absent
# columns, so this cell does the same.
# NOTE(review): the correlation is computed over all selected countries pooled
# together — confirm that is the intended view.
present_cols = [name for name in numeric_cols if name in df_filtered.columns]
if present_cols:
    plt.figure(figsize=(8,6))
    sns.heatmap(df_filtered[present_cols].corr(), annot=True, cmap='coolwarm')
    plt.title('Correlation Heatmap')
    plt.show()

## 4. Visualizing Vaccination Progress

Analyze and visualize vaccination rollouts for the selected countries.

In [None]:
# Plot cumulative vaccinations over time for each country.
# The column check does not depend on the country, so it guards the whole plot:
# when the column is absent, no empty figure is drawn and legend() is never
# called without labelled artists. This matches the guards on the other
# vaccination cells.
if 'total_vaccinations' in df_filtered.columns:
    plt.figure(figsize=(12,6))
    for country in countries_of_interest:
        country_data = df_filtered[df_filtered['location'] == country]
        plt.plot(country_data['date'], country_data['total_vaccinations'], label=country)
    plt.title('Cumulative COVID-19 Vaccinations Over Time')
    plt.xlabel('Date')
    plt.ylabel('Total Vaccinations')
    plt.legend()
    plt.show()

In [None]:
# Compare % vaccinated population (if population and total_vaccinations columns exist)
# NOTE(review): if total_vaccinations counts administered doses rather than
# people, this ratio can exceed 100% — confirm against the dataset documentation.
if 'population' in df_filtered.columns and 'total_vaccinations' in df_filtered.columns:
    # Most recent row per country. The derived column is added with .assign, which
    # returns a new frame, instead of being written into the result of
    # groupby().tail() (an assignment that can raise SettingWithCopyWarning).
    latest = (
        df_filtered.sort_values('date')
        .groupby('location')
        .tail(1)
        .assign(
            percent_vaccinated=lambda rows: (rows['total_vaccinations'] / rows['population']) * 100
        )
    )
    plt.figure(figsize=(8,6))
    sns.barplot(x='location', y='percent_vaccinated', data=latest)
    plt.title('Percentage of Population Vaccinated (Latest Available Date)')
    plt.ylabel('% Vaccinated')
    plt.xlabel('Country')
    plt.show()

In [None]:
# Optional: Pie chart for vaccinated vs. unvaccinated (for one country, e.g., USA)
# NOTE(review): if total_vaccinations counts administered doses rather than
# people, it can exceed the population — confirm against the dataset documentation.
country = 'United States'
if 'population' in df_filtered.columns and 'total_vaccinations' in df_filtered.columns:
    # Latest dated row for the chosen country
    latest_usa = df_filtered[(df_filtered['location'] == country)].sort_values('date').tail(1)
    if not latest_usa.empty:
        vaccinated = latest_usa['total_vaccinations'].values[0]
        population = latest_usa['population'].values[0]
        # Skip the chart when either value is missing or the population is not
        # positive; `population > 0` is also False for NaN.
        if pd.notna(vaccinated) and population > 0:
            # plt.pie raises ValueError on negative wedge sizes, so cap the
            # vaccinated count at the population to keep the remainder >= 0.
            vaccinated = min(vaccinated, population)
            unvaccinated = population - vaccinated
            plt.figure(figsize=(6,6))
            plt.pie([vaccinated, unvaccinated], labels=['Vaccinated', 'Unvaccinated'], autopct='%1.1f%%', colors=['#4CAF50', '#FFC107'])
            plt.title(f'Vaccinated vs. Unvaccinated in {country}')
            plt.show()